|
from deep_translator import GoogleTranslator
|
|
|
|
|
|
# For the arguments source and target, you can use the name of the language or its abbreviation
|
|
# chinese (simplified): zh-CN, english: en, korean: ko, persian: fa, russian: ru
|
|
def translate(input_text: str, source: str, target='english') -> str:
    """Translate ``input_text`` from ``source`` to ``target`` via GoogleTranslator.

    The text is cut into batches that end on whitespace boundaries, the
    batches are translated together with ``translate_batch``, and the
    whitespace that surrounded each batch in the input is re-inserted
    verbatim around the translated pieces.

    Args:
        input_text: Text to translate.
        source: Source language, passed unchanged to ``GoogleTranslator``.
        target: Target language, passed unchanged to ``GoogleTranslator``.

    Returns:
        ``input_text`` unchanged when it is empty or whitespace-only, or when
        ``source == target``; otherwise the reassembled translated text.
    """
    # Nothing to translate: skip the translator call entirely.
    if not input_text.strip():
        return input_text

    if source == target:
        return input_text

    # Upper bound on the number of characters in a single batch.
    # NOTE(review): presumably chosen to stay below the translation service's
    # per-request length limit -- confirm against the deep_translator docs.
    batch_size = 4999

    leading, batches, separators = _split_into_batches(input_text, batch_size)

    translated_batches = GoogleTranslator(source, target).translate_batch(batches)

    # Re-attach the original whitespace after each translated batch.
    pieces = [leading]
    for translated, separator in zip(translated_batches, separators):
        pieces.append(translated)
        pieces.append(separator)
    return ''.join(pieces)


def _split_into_batches(text: str, batch_size: int) -> tuple:
    """Cut ``text`` into batches of at most ``batch_size`` characters.

    Returns a ``(leading, batches, separators)`` tuple where ``leading`` is
    the whitespace prefix of ``text``, ``batches`` are the chunks to translate
    (each starts and ends with a non-whitespace character) and
    ``separators[i]`` is the whitespace run that followed ``batches[i]`` in
    ``text`` (possibly empty).  Concatenating ``leading`` with every batch
    followed by its separator reproduces ``text`` exactly.

    Raises:
        ValueError: if ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be at least 1')

    total = len(text)

    # Leading whitespace is kept aside so it is never sent for translation.
    pos = 0
    while pos < total and text[pos].isspace():
        pos += 1
    leading = text[:pos]

    batches = []
    separators = []

    # Invariant: text[pos] is a non-whitespace character at the top of the loop.
    while pos < total:
        limit = pos + batch_size
        if limit >= total:
            # The remainder fits in a single batch.
            cut = total
        else:
            # Back off to the nearest whitespace so a word is not split, but
            # never search further back than the start of this batch.
            cut = limit
            while cut > pos and not text[cut].isspace():
                cut -= 1
            if cut == pos:
                # A single token is longer than batch_size: split it at the
                # limit, otherwise the loop could never make progress.
                cut = limit

        # Exclude trailing whitespace from the batch itself.
        stop = cut
        while stop > pos and text[stop - 1].isspace():
            stop -= 1

        # Consume the whole whitespace run after the batch so that the next
        # batch starts on a non-whitespace character and no whitespace is
        # dropped or counted twice.
        resume = cut
        while resume < total and text[resume].isspace():
            resume += 1

        batches.append(text[pos:stop])
        separators.append(text[stop:resume])
        pos = resume

    return leading, batches, separators
|