
integrated translator into prepare_parser.py

main · westernmeadow, 9 months ago · commit 79cb0990d1
4 changed files with 125 additions and 2 deletions
  1. +21 -1  Forums/Initialization/prepare_parser.py
  2. +21 -1  MarketPlaces/Initialization/prepare_parser.py
  3. +60 -0  Translator/translate.py
  4. +23 -0  Translator/translate_test.py

+21 -1  Forums/Initialization/prepare_parser.py

@@ -13,6 +13,7 @@ from Forums.Incogsnoo.parser import *
 from Forums.Classifier.classify_product import predict
 # from DarkWebMining_Sample.Forums.Classifier.classify_product import predict_semi
+from Translator.translate import translate
 
 # controls the log id
 nError = 0
@@ -166,6 +167,20 @@ def parse_description(forum, descriptionFile, soup, createLog, logFile):
     return None
+
+
+def get_source_language(forum):
+    if forum == "BestCardingWorld":
+        lang = 'english'
+    elif forum == "CryptBB":
+        lang = 'english'
+    elif forum == "Incogsnoo":
+        lang = 'english'
+    else:
+        print("MISSING CALL TO GET LANGUAGE IN PREPARE_PARSER.PY!")
+        lang = 'auto'
+    return lang
 
 def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):
     try:
@@ -241,6 +256,8 @@ def new_parse(forum, url, createLog):
     else:
         logFile = None
+
+    source_lang = get_source_language(forum)
 
     # Reading the Listing Html Pages
     listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
@@ -325,8 +342,11 @@ def new_parse(forum, url, createLog):
 
                 # Classify on final description page
                 if descriptionIndex == len(descriptions) - 1:
+                    title = translate(rec[15], source_lang)
+                    content = translate(getPosts(posts), source_lang)
+
                     # classification for topic based on all posts from all pages
-                    rec[19] = str(predict(rec[3], getPosts(posts), language='sup_english'))
+                    rec[19] = str(predict(title, content, language='sup_english'))
 
                 # Persisting the information in the database
                 persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile)
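
Taken together, the forum changes resolve the source language once per crawl, translate the topic title (rec[15]) and the concatenated posts, and keep feeding the classifier English text. Below is a minimal sketch of that flow, assuming the repo is on PYTHONPATH and a network connection is available; the Russian sample strings are made up, and predict_stub is a stand-in for Forums.Classifier.classify_product.predict, which needs the repo's trained model:

from Translator.translate import translate


def predict_stub(title, content, language='sup_english'):
    # Placeholder for Forums.Classifier.classify_product.predict; it returns
    # a fixed label so the sketch runs without the trained model.
    return 'Other'


source_lang = 'auto'  # what get_source_language returns for an unmapped forum
title = translate('Пример заголовка темы', source_lang)         # stands in for rec[15]
content = translate('Пример текста всех постов.', source_lang)  # stands in for getPosts(posts)
print(str(predict_stub(title, content, language='sup_english')))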


+21 -1  MarketPlaces/Initialization/prepare_parser.py

@@ -29,6 +29,7 @@ from MarketPlaces.GoFish.parser import *
 from MarketPlaces.ZeroDay.parser import *
 from MarketPlaces.Classifier.classify_product import predict
+from Translator.translate import translate
 
 nError = 0
@@ -241,6 +242,20 @@ def parse_description(marketPlace, descriptionFile, soup, createLog, logFile):
     return None
+
+
+def get_source_language(marketPlace):
+    if marketPlace == "BestCardingWorld":
+        lang = 'english'
+    elif marketPlace == "CryptBB":
+        lang = 'english'
+    elif marketPlace == "Incogsnoo":
+        lang = 'english'
+    else:
+        print("MISSING CALL TO GET LANGUAGE IN PREPARE_PARSER.PY!")
+        lang = 'auto'
+    return lang
 
 def persist_record(url, rec, cur, con, createLog, logFile, listingFile, descriptionFile):
     try:
         persist_data(url, tuple(rec), cur)
@@ -313,6 +328,8 @@ def new_parse(marketPlace, url, createLog):
     else:
         logFile = None
+
+    source_lang = get_source_language(marketPlace)
 
     # Reading the Listing Html Pages
     listings = glob.glob(os.path.join(mainDir, CURRENT_DATE + "\\Listing", '*.html'))
@@ -389,8 +406,11 @@ def new_parse(marketPlace, url, createLog):
 
                 # Combining the information from Listing and Description Pages
                 rec = mergePages(rmm, rec)
+
+                title = translate(rec[4], source_lang)
+                content = translate(rec[5], source_lang)
 
                 # Append to the list the classification of the topic
-                rec.append(str(predict(rec[4], rec[5], language='sup_english')))
+                rec.append(str(predict(title, content, language='sup_english')))
 
                 # Persisting the information in the database
                 persistSuccess = persist_record(url, rec, cur, con, createLog, logFile, listingFile,
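
Both copies of get_source_language are the same if/elif chain, and every known site currently maps to 'english'. A table-driven variant is one way to keep the two files in sync as non-English sources are added; this is only a sketch, not part of the commit, and 'SomeRussianMarket' is a hypothetical entry shown for illustration:

def get_source_language(marketPlace):
    # Only the first three entries exist in the commit; the fourth is a
    # hypothetical non-English source.
    languages = {
        "BestCardingWorld": 'english',
        "CryptBB": 'english',
        "Incogsnoo": 'english',
        "SomeRussianMarket": 'russian',
    }
    if marketPlace not in languages:
        print("MISSING CALL TO GET LANGUAGE IN PREPARE_PARSER.PY!")
    return languages.get(marketPlace, 'auto')

The 'auto' fallback still works downstream because deep_translator passes it through to Google's language detection, though detection can misfire on very short titles.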


+60 -0  Translator/translate.py

@@ -0,0 +1,60 @@
+from deep_translator import GoogleTranslator
+
+
+# For the source and target arguments, you can use either the language name or
+# its abbreviation, e.g. chinese (simplified): zh-CN, english: en, korean: ko,
+# persian: fa, russian: ru.
+def translate(input_text: str, source: str, target='english') -> str:
+    if len(input_text.strip()) == 0:
+        return input_text
+
+    # Google Translate rejects payloads of 5,000 characters or more, so the
+    # input is split into batches of at most 4,999 characters.
+    batch_size = 4999
+    batches = []
+    whitespaces = []
+    start = 0
+    translated_text = ''
+
+    # Copy any leading whitespace straight into the output and advance start
+    # to the first non-whitespace character.
+    while not input_text[start].strip():
+        translated_text += input_text[start]
+        start += 1
+
+    # Continue until every character of the input has been assigned to a batch.
+    while start < len(input_text):
+        whitespace = ''
+        # Tentative end index for the current batch.
+        end = start + batch_size
+        if end < len(input_text):
+            # Move end backwards until it lands on whitespace, so the batch
+            # never splits a word.
+            while input_text[end].strip():
+                end -= 1
+            # Collect the run of whitespace before end; it is re-inserted
+            # between the translated batches at the end.
+            while not input_text[end].strip():
+                whitespace = input_text[end] + whitespace
+                end -= 1
+        else:
+            # Final batch: take everything up to the last non-whitespace
+            # character and remember the trailing whitespace.
+            end = len(input_text) - 1
+            while not input_text[end].strip():
+                whitespace = input_text[end] + whitespace
+                end -= 1
+
+        # The substring from start to end (inclusive) is the current batch.
+        batches.append(input_text[start: end + 1])
+        whitespaces.append(whitespace)
+
+        # Advance start past the batch and its trailing whitespace.
+        start = end + 1 + len(whitespace)
+
+    # Translate all batches in one call and stitch them back together with the
+    # whitespace that originally separated them.
+    translated_batches = GoogleTranslator(source, target).translate_batch(batches)
+    for batch, whitespace in zip(translated_batches, whitespaces):
+        translated_text += batch + whitespace
+
+    return translated_text
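
The splitting logic above is easier to see with a toy batch size. The sketch below reimplements just the split step with the same back-up-to-whitespace rule; split_on_whitespace is a hypothetical helper, batch_size=10 is for illustration only, and leading whitespace is assumed to have been handled already, as translate does before its main loop:

def split_on_whitespace(text: str, batch_size: int = 10):
    # Cut at most batch_size characters, then back up to the nearest
    # whitespace so no word is split; remember the skipped whitespace so the
    # caller can reassemble the text exactly.
    batches, whitespaces = [], []
    start = 0
    while start < len(text):
        whitespace = ''
        end = start + batch_size
        if end < len(text):
            while text[end].strip():
                end -= 1
            while not text[end].strip():
                whitespace = text[end] + whitespace
                end -= 1
        else:
            end = len(text) - 1
            while not text[end].strip():
                whitespace = text[end] + whitespace
                end -= 1
        batches.append(text[start:end + 1])
        whitespaces.append(whitespace)
        start = end + 1 + len(whitespace)
    return batches, whitespaces


print(split_on_whitespace("one two three four five"))
# (['one two', 'three four', 'five'], [' ', ' ', ''])

Like the original, this assumes every batch_size-wide window contains at least one whitespace character; a single unbroken token longer than batch_size would walk end back past start and break the loop invariant.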

+23 -0  Translator/translate_test.py

@@ -0,0 +1,23 @@
+from Translator.translate import translate
+
+
+# The classic "man from Nantucket" limerick, in Russian.
+original_text = """
+Жил-был человек из Нантакета,
+Который хранил все свои деньги в ведре.
+Но его дочь, по имени Нан,
+Сбежала с мужчиной,
+А что касается ведра, то Нантакет.
+"""
+
+# The expected English translation (note the untranslated 'Нантакета' in the
+# first line).
+english_text = """
+There once was a man from Нантакета,
+Who kept all his cash in a bucket.
+But his daughter, named Nan,
+Ran away with a man,
+And as for the bucket, Nantucket.
+"""
+
+# Translate the Russian original and compare it against the expected English.
+translated_text = translate(original_text, 'russian')
+print(translated_text)
+print(english_text == translated_text)
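
One property the test doesn't exercise: the guard clause at the top of translate returns whitespace-only input unchanged, before any batching or network call. A quick check, assuming the repo is on PYTHONPATH:

from Translator.translate import translate

# No batches are built and Google Translate is never contacted.
print(repr(translate("   \n\t", 'russian')))  # prints '   \n\t'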
