|
|
- from deep_translator import GoogleTranslator
-
-
- # For the arguments source and target, you can use the name of the language or its abbreviation
- # chinese (simplified): zh-CN, english: en, korean: ko, persian: fa, russian: ru
def translate(input_text: str, source: str, target: str = 'english') -> str:
    """Translate ``input_text`` from ``source`` to ``target`` with Google Translate.

    The text is cut into batches of fewer than 5000 characters, split on
    whitespace where possible so that words stay intact. Leading whitespace
    and the whitespace between/after batches are never sent to the translator;
    they are copied verbatim into the result so the original spacing survives.

    Args:
        input_text: Text to translate.
        source: Source language, given as a name or an abbreviation.
        target: Target language, given as a name or an abbreviation.

    Returns:
        The translated text. ``input_text`` is returned unchanged when it is
        empty or whitespace-only, or when ``source == target``.
    """
    # Nothing to translate: blank input, or both languages are the same string.
    if not input_text.strip():
        return input_text

    if source == target:
        return input_text

    # One below 5000 characters per request.
    # NOTE(review): 5000 is the Google Translate request limit as documented
    # by deep_translator -- confirm against the installed version.
    batch_size = 4999

    leading, batches, separators = _split_into_batches(input_text, batch_size)

    translated_batches = GoogleTranslator(source, target).translate_batch(batches)

    # Re-assemble: leading whitespace, then each translated batch followed by
    # the exact whitespace that followed it in the original text.
    pieces = [leading]
    for translated, separator in zip(translated_batches, separators):
        pieces.append(translated)
        pieces.append(separator)

    return ''.join(pieces)


def _split_into_batches(text: str, batch_size: int) -> tuple:
    """Split ``text`` into translator-sized batches plus the whitespace around them.

    Returns a tuple ``(leading, batches, separators)`` such that
    ``leading + ''.join(b + s for b, s in zip(batches, separators)) == text``.
    Every batch starts and ends with a non-whitespace character and is at most
    ``batch_size`` characters long.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    length = len(text)

    # Leading whitespace is kept aside rather than sent for translation.
    start = 0
    while start < length and text[start].isspace():
        start += 1
    leading = text[:start]

    batches = []
    separators = []

    # Invariant: text[start] is a non-whitespace character.
    while start < length:
        if length - start <= batch_size:
            # The remainder fits into a single batch.
            cut = length
        else:
            # Walk back from the window edge to the nearest whitespace so a
            # word is not split in two.
            cut = start + batch_size
            while cut > start and not text[cut].isspace():
                cut -= 1
            if cut == start:
                # No whitespace inside the window (e.g. a very long token or
                # text in a script written without spaces): split at the
                # window edge instead of walking past ``start``.
                cut = start + batch_size

        # Exclude trailing whitespace from the batch itself.
        batch_end = cut
        while batch_end > start and text[batch_end - 1].isspace():
            batch_end -= 1

        # Consume the whole whitespace run so the next batch starts on a
        # non-whitespace character and no spacing is handed to the translator.
        next_start = cut
        while next_start < length and text[next_start].isspace():
            next_start += 1

        batches.append(text[start:batch_end])
        separators.append(text[batch_end:next_start])
        start = next_start

    return leading, batches, separators
|