khangtran
/
dark_web_forums


								from deep_translator import GoogleTranslator


								# For the arguments source and target, you can use the name of the language or its abbreviation

								# chinese (simplified): zh-CN, english: en, korean: ko, persian: fa, russian: ru

								def translate(input_text: str, source: str, target: str = 'english') -> str:
								    """Translate ``input_text`` from ``source`` to ``target`` in size-limited batches.

								    The text is cut into batches of at most 4999 characters, preferably at
								    whitespace so that words are not split. The whitespace surrounding each
								    batch (leading, between batches, trailing) is kept aside and re-inserted
								    verbatim around the translated batches, so the layout of the original
								    text survives the round trip.

								    Args:
								        input_text: Text to translate.
								        source: Source language, as a name or an abbreviation.
								        target: Target language, as a name or an abbreviation.

								    Returns:
								        ``input_text`` unchanged when it is empty / whitespace-only or when
								        ``source == target``; otherwise the translated text with the original
								        whitespace between batches restored.
								    """
								    if not input_text.strip():
								        return input_text

								    if source == target:
								        return input_text

								    # NOTE(review): 4999 presumably keeps each request under the translator's
								    # per-call character limit -- confirm against the deep_translator docs.
								    batch_size = 4999
								    length = len(input_text)

								    # Leading whitespace is copied through untouched. The text is known to
								    # contain a non-whitespace character, so ``start`` ends up on one.
								    start = length - len(input_text.lstrip())
								    leading = input_text[:start]

								    batches = []
								    separators = []

								    # Invariant at the top of each iteration: input_text[start] is not whitespace.
								    while start < length:
								        limit = start + batch_size
								        if limit >= length:
								            # Everything that is left fits into one batch.
								            end = length
								        else:
								            # Walk back from the size limit to the closest whitespace so the
								            # batch ends on a word boundary. The scan never goes below
								            # ``start``; an unbounded scan would wrap around to negative
								            # indices on text without whitespace.
								            end = limit
								            while end > start and not input_text[end].isspace():
								                end -= 1
								            if end == start:
								                # No whitespace in the whole window (e.g. Chinese text or a
								                # very long token): fall back to a hard split at the limit.
								                end = limit

								        # Drop whitespace at the end of the batch; it is restored afterwards.
								        batch = input_text[start:end].rstrip()
								        batch_end = start + len(batch)

								        # Consume the complete whitespace run that follows the batch so the
								        # next batch starts on a non-whitespace character and no whitespace
								        # is sent to the translator.
								        next_start = batch_end
								        while next_start < length and input_text[next_start].isspace():
								            next_start += 1

								        batches.append(batch)
								        separators.append(input_text[batch_end:next_start])
								        start = next_start

								    translated_batches = GoogleTranslator(source, target).translate_batch(batches)

								    # Stitch the translated batches back together with the original whitespace.
								    pieces = [leading]
								    for translated, separator in zip(translated_batches, separators):
								        pieces.append(translated + separator)

								    return ''.join(pieces)