
integrated translator into prepare_parser.py

main · westernmeadow, 9 months ago · commit 79cb0990d1
4 changed files with 125 additions and 2 deletions
  1. +21 -1  Forums/Initialization/prepare_parser.py
  2. +21 -1  MarketPlaces/Initialization/prepare_parser.py
  3. +60 -0  Translator/translate.py
  4. +23 -0  Translator/translate_test.py

+21 -1  Forums/Initialization/prepare_parser.py

@@ -13,6 +13,7 @@ from Forums.Incogsnoo.parser import *
 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
+from Translator.translate import translate
 
 # controls the log id
 nError = 0
@@ -166,6 +167,20 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
     return None
+
+
+def get_source_language(forum):
+    if forum == "BestCardingWorld":
+        lang = 'english'
+    elif forum == "CryptBB":
+        lang = 'english'
+    elif forum == "Incogsnoo":
+        lang = 'english'
+    else:
+        print("MISSING CALL TO GET LANGUAGE IN PREPARE_PARSER.PY!")
+        lang = 'auto'
+    return lang
 
 def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):
     try:
@@ -241,6 +256,8 @@ def new_parse(forum, url, createLog):
     else:
         logFile = None
+
+    source_lang = get_source_language(forum)
 
     # Reading the Listing Html Pages
     listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
@@ -325,8 +342,11 @@ def new_parse(forum, url, createLog):
 
                 # Classify on final description page
                 if descriptionIndex == len(descriptions) - 1:
+                    title = translate(rec[15], source_lang)
+                    content = translate(getPosts(posts), source_lang)
+
                     # classification for topic based on all posts from all pages
-                    rec[19] = str(predict(rec[3], getPosts(posts), language='sup_english'))
+                    rec[19] = str(predict(title, content, language='sup_english'))
 
                 # Persisting the information in the database
                 persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile)
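
Taken together, the forum changes resolve the source language once per crawl, translate the topic title (rec[15]) and the concatenated posts, and keep feeding the classifier English text. Below is a minimal sketch of that flow, assuming the repo is on PYTHONPATH and a network connection is available; the Russian sample strings are made up, and predict_stub is a stand-in for Forums.Classifier.classify_product.predict, which needs the repo's trained model:

from Translator.translate import translate


def predict_stub(title, content, language='sup_english'):
    # Placeholder for Forums.Classifier.classify_product.predict; it returns
    # a fixed label so the sketch runs without the trained model.
    return 'Other'


source_lang = 'auto'  # what get_source_language returns for an unmapped forum
title = translate('Пример заголовка темы', source_lang)         # stands in for rec[15]
content = translate('Пример текста всех постов.', source_lang)  # stands in for getPosts(posts)
print(str(predict_stub(title, content, language='sup_english')))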


+21 -1  MarketPlaces/Initialization/prepare_parser.py

@@ -29,6 +29,7 @@ from MarketPlaces.GoFish.parser import *
 from MarketPlaces.ZeroDay.parser import *
 from MarketPlaces.Classifier.classify_product import predict
+from Translator.translate import translate
 
 nError = 0
@@ -241,6 +242,20 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
     return None
+
+
+def get_source_language(marketPlace):
+    if marketPlace == "BestCardingWorld":
+        lang = 'english'
+    elif marketPlace == "CryptBB":
+        lang = 'english'
+    elif marketPlace == "Incogsnoo":
+        lang = 'english'
+    else:
+        print("MISSING CALL TO GET LANGUAGE IN PREPARE_PARSER.PY!")
+        lang = 'auto'
+    return lang
 
 def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):
     try:
         persist_data(url, tuple(rec), cur)
@@ -313,6 +328,8 @@ def new_parse(marketPlace, url, createLog):
     else:
         logFile = None
+
+    source_lang = get_source_language(marketPlace)
 
     # Reading the Listing Html Pages
     listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
@@ -389,8 +406,11 @@ def new_parse(marketPlace, url, createLog):
 
                 # Combining the information from Listing and Description Pages
                 rec = mergePages(rmm, rec)
+
+                title = translate(rec[4], source_lang)
+                content = translate(rec[5], source_lang)
 
                 # Append to the list the classification of the topic
-                rec.append(str(predict(rec[4], rec[5], language='sup_english')))
+                rec.append(str(predict(title, content, language='sup_english')))
 
                 # Persisting the information in the database
                 persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile,
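
Both copies of get_source_language are the same if/elif chain, and every known site currently maps to 'english'. A table-driven variant is one way to keep the two files in sync as non-English sources are added; this is only a sketch, not part of the commit, and 'SomeRussianMarket' is a hypothetical entry shown for illustration:

def get_source_language(marketPlace):
    # Only the first three entries exist in the commit; the fourth is a
    # hypothetical non-English source.
    languages = {
        "BestCardingWorld": 'english',
        "CryptBB": 'english',
        "Incogsnoo": 'english',
        "SomeRussianMarket": 'russian',
    }
    if marketPlace not in languages:
        print("MISSING CALL TO GET LANGUAGE IN PREPARE_PARSER.PY!")
    return languages.get(marketPlace, 'auto')

The 'auto' fallback still works downstream because deep_translator passes it through to Google's language detection, though detection can misfire on very short titles.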


+60 -0  Translator/translate.py

@@ -0,0 +1,60 @@
+from deep_translator import GoogleTranslator
+
+
+# For the source and target arguments, you can use either the language name or
+# its abbreviation, e.g. chinese (simplified): zh-CN, english: en, korean: ko,
+# persian: fa, russian: ru.
+def translate(input_text: str, source: str, target='english') -> str:
+    if len(input_text.strip()) == 0:
+        return input_text
+
+    # Google Translate rejects payloads of 5,000 characters or more, so the
+    # input is split into batches of at most 4,999 characters.
+    batch_size = 4999
+    batches = []
+    whitespaces = []
+    start = 0
+    translated_text = ''
+
+    # Copy any leading whitespace straight into the output and advance start
+    # to the first non-whitespace character.
+    while not input_text[start].strip():
+        translated_text += input_text[start]
+        start += 1
+
+    # Continue until every character of the input has been assigned to a batch.
+    while start < len(input_text):
+        whitespace = ''
+        # Tentative end index for the current batch.
+        end = start + batch_size
+        if end < len(input_text):
+            # Move end backwards until it lands on whitespace, so the batch
+            # never splits a word.
+            while input_text[end].strip():
+                end -= 1
+            # Collect the run of whitespace before end; it is re-inserted
+            # between the translated batches at the end.
+            while not input_text[end].strip():
+                whitespace = input_text[end] + whitespace
+                end -= 1
+        else:
+            # Final batch: take everything up to the last non-whitespace
+            # character and remember the trailing whitespace.
+            end = len(input_text) - 1
+            while not input_text[end].strip():
+                whitespace = input_text[end] + whitespace
+                end -= 1
+
+        # The substring from start to end (inclusive) is the current batch.
+        batches.append(input_text[start: end + 1])
+        whitespaces.append(whitespace)
+
+        # Advance start past the batch and its trailing whitespace.
+        start = end + 1 + len(whitespace)
+
+    # Translate all batches in one call and stitch them back together with the
+    # whitespace that originally separated them.
+    translated_batches = GoogleTranslator(source, target).translate_batch(batches)
+    for batch, whitespace in zip(translated_batches, whitespaces):
+        translated_text += batch + whitespace
+
+    return translated_text
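
The splitting logic above is easier to see with a toy batch size. The sketch below reimplements just the split step with the same back-up-to-whitespace rule; split_on_whitespace is a hypothetical helper, batch_size=10 is for illustration only, and leading whitespace is assumed to have been handled already, as translate does before its main loop:

def split_on_whitespace(text: str, batch_size: int = 10):
    # Cut at most batch_size characters, then back up to the nearest
    # whitespace so no word is split; remember the skipped whitespace so the
    # caller can reassemble the text exactly.
    batches, whitespaces = [], []
    start = 0
    while start < len(text):
        whitespace = ''
        end = start + batch_size
        if end < len(text):
            while text[end].strip():
                end -= 1
            while not text[end].strip():
                whitespace = text[end] + whitespace
                end -= 1
        else:
            end = len(text) - 1
            while not text[end].strip():
                whitespace = text[end] + whitespace
                end -= 1
        batches.append(text[start:end + 1])
        whitespaces.append(whitespace)
        start = end + 1 + len(whitespace)
    return batches, whitespaces


print(split_on_whitespace("one two three four five"))
# (['one two', 'three four', 'five'], [' ', ' ', ''])

Like the original, this assumes every batch_size-wide window contains at least one whitespace character; a single unbroken token longer than batch_size would walk end back past start and break the loop invariant.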

+23 -0  Translator/translate_test.py

@@ -0,0 +1,23 @@
+from Translator.translate import translate
+
+
+# The classic "man from Nantucket" limerick, in Russian.
+original_text = """
+Жил-был человек из Нантакета,
+Который хранил все свои деньги в ведре.
+Но его дочь, по имени Нан,
+Сбежала с мужчиной,
+А что касается ведра, то Нантакет.
+"""
+
+# The expected English translation (note the untranslated 'Нантакета' in the
+# first line).
+english_text = """
+There once was a man from Нантакета,
+Who kept all his cash in a bucket.
+But his daughter, named Nan,
+Ran away with a man,
+And as for the bucket, Nantucket.
+"""
+
+# Translate the Russian original and compare it against the expected English.
+translated_text = translate(original_text, 'russian')
+print(translated_text)
+print(english_text == translated_text)
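
One property the test doesn't exercise: the guard clause at the top of translate returns whitespace-only input unchanged, before any batching or network call. A quick check, assuming the repo is on PYTHONPATH:

from Translator.translate import translate

# No batches are built and Google Translate is never contacted.
print(repr(translate("   \n\t", 'russian')))  # prints '   \n\t'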
