diff --git a/Forums/Initialization/prepare_parser.py b/Forums/Initialization/prepare_parser.py index ca582c2..dff8168 100644 --- a/Forums/Initialization/prepare_parser.py +++ b/Forums/Initialization/prepare_parser.py @@ -13,6 +13,7 @@ from Forums.Incogsnoo.parser import * from Forums.Classifier.classify_product import predict # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi +from Translator.translate import translate # controls the log id nError = 0 @@ -166,6 +167,20 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile): return None +def get_source_language(forum): + + if forum == "BestCardingWorld": + lang = 'english' + elif forum == "CryptBB": + lang = 'english' + elif forum == "Incogsnoo": + lang = 'english' + else: + print("MISSING CALL TO GET LANGUAGE IN PREPARE_PARSER.PY!") + lang = 'auto' + return lang + + def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile): try: @@ -241,6 +256,8 @@ def new_parse(forum, url, createLog): else: logFile = None + source_lang = get_source_language(forum) + # Reading the Listing Html Pages listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')) @@ -325,8 +342,11 @@ def new_parse(forum, url, createLog): # Classify on final description page if descriptionIndex == len(descriptions) - 1: + title = translate(rec[15], source_lang) + content = translate(getPosts(posts), source_lang) + # classification for topic based on all posts from all pages - rec[19] = str(predict(rec[3], getPosts(posts), language='sup_english')) + rec[19] = str(predict(title, content, language='sup_english')) # Persisting the information in the database persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile) diff --git a/MarketPlaces/Initialization/prepare_parser.py b/MarketPlaces/Initialization/prepare_parser.py index 4cf169e..ffd5c90 100644 --- a/MarketPlaces/Initialization/prepare_parser.py +++ 
b/MarketPlaces/Initialization/prepare_parser.py @@ -29,6 +29,7 @@ from MarketPlaces.GoFish.parser import * from MarketPlaces.ZeroDay.parser import * from MarketPlaces.Classifier.classify_product import predict +from Translator.translate import translate nError = 0 @@ -241,6 +242,20 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile): return None +def get_source_language(marketPlace): + + if marketPlace == "BestCardingWorld": + lang = 'english' + elif marketPlace == "CryptBB": + lang = 'english' + elif marketPlace == "Incogsnoo": + lang = 'english' + else: + print("MISSING CALL TO GET LANGUAGE IN PREPARE_PARSER.PY!") + lang = 'auto' + return lang + + def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile): try: persist_data(url, tuple(rec), cur) @@ -313,6 +328,8 @@ def new_parse(marketPlace, url, createLog): else: logFile = None + source_lang = get_source_language(marketPlace) + # Reading the Listing Html Pages listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html')) @@ -389,8 +406,11 @@ def new_parse(marketPlace, url, createLog): # Combining the information from Listing and Description Pages rec = mergePages(rmm, rec) + title = translate(rec[4], source_lang) + content = translate(rec[5], source_lang) + # Append to the list the classification of the topic - rec.append(str(predict(rec[4], rec[5], language='sup_english'))) + rec.append(str(predict(title, content, language='sup_english'))) # Persisting the information in the database persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, diff --git a/Translator/translate.py b/Translator/translate.py new file mode 100644 index 0000000..a3a99f4 --- /dev/null +++ b/Translator/translate.py @@ -0,0 +1,60 @@ +from deep_translator import GoogleTranslator + + +# For the arguments source and target, you can use the name of the language or its abbreviation +# chinese (simplified): zh-CN, english: en, korean: ko, 
persian: fa, russian: ru +def translate(input_text: str, source: str, target='english') -> str: + + if len(input_text.strip()) == 0: + return input_text + + batch_size = 4999 + batches = [] + whitespaces = [] + start = 0 + translated_text = '' + + while not input_text[start].strip(): + translated_text += input_text[start] + start += 1 + + # A while loop that will continue as long as start is less than the total length + # of the HTML content (len(input_text)). This ensures that we process the entire content. + while start < len(input_text): + whitespace = '' + # Set the end index for the current batch + end = start + batch_size + # If end is beyond the end of the content, we don't need to adjust it. + if end < len(input_text): + # A while loop to adjust the end index so that it doesn't split a word. + # It continues as long as end is within the content (end < len(input_text)) and + # the character at the end position is not a whitespace (input_text[end].strip() + # returns True for non-whitespace characters). + while end < len(input_text) and input_text[end].strip(): + # The inner while loop, decrements the end index by 1. This moves the end index + # backwards until it reaches a whitespace character, ensuring that we don't split a word. + end -= 1 + while not input_text[end].strip(): + whitespace = input_text[end] + whitespace + end -= 1 + else: + end = len(input_text) - 1 + while not input_text[end].strip(): + whitespace = input_text[end] + whitespace + end -= 1 + + # This line extracts a substring from the HTML content, starting from the start index + # and ending at the end index. This substring is our batch. 
+ batch = input_text[start: end + 1] + # Add our batch to the batches list + batches.append(batch) + whitespaces.append(whitespace) + # Updates our start index for the next batch + start = end + 1 + len(whitespace) + + translated_batches = GoogleTranslator(source, target).translate_batch(batches) + + for batch, whitespace in zip(translated_batches, whitespaces): + translated_text += batch + whitespace + + return translated_text diff --git a/Translator/translate_test.py b/Translator/translate_test.py new file mode 100644 index 0000000..af59f44 --- /dev/null +++ b/Translator/translate_test.py @@ -0,0 +1,23 @@ +from Translator.translate import translate + +original_text = """ +Жил-был человек из Нантакета, +Который хранил все свои деньги в ведре. +Но его дочь, по имени Нан, +Сбежала с мужчиной, +А что касается ведра, то Нантакет. +""" + +english_text = """ +There once was a man from Nantucket, +Who kept all his cash in a bucket. +But his daughter, named Nan, +Ran away with a man, +And as for the bucket, Nantucket. +""" + +translated_text = translate(original_text, 'russian') + +print(translated_text) + +print(english_text == translated_text)