from deep_translator import GoogleTranslator

# Maximum number of characters sent to the translator in a single batch.
# NOTE(review): presumably chosen to stay below the 5000-character
# per-request limit of Google Translate -- confirm against deep_translator.
_BATCH_SIZE = 4999


def _split_into_batches(text, start, batch_size):
    """Cut ``text[start:]`` into chunks of at most ``batch_size`` characters.

    Chunks are split at whitespace so that words are not broken. Only when a
    window of ``batch_size`` characters contains no whitespace at all (one
    very long token, or text in a script written without spaces) is the chunk
    cut exactly at the size limit.

    Args:
        text: The full input string.
        start: Index of the first character to process. ``text[start]`` must
            not be whitespace.
        batch_size: Maximum length of a chunk; must be at least 1.

    Returns:
        A ``(batches, separators)`` pair of equally long lists, where
        ``separators[i]`` is the (possibly empty) whitespace run that followed
        ``batches[i]`` in ``text``. Interleaving the two lists reproduces
        ``text[start:]`` exactly.
    """
    batches = []
    separators = []
    length = len(text)

    # Invariant: text[start] is a non-whitespace character.
    while start < length:
        if length - start <= batch_size:
            # Everything that is left fits into a single batch.
            cut = length
        else:
            cut = start + batch_size
            # Move back to the closest whitespace so no word is split. The
            # scan never goes below ``start``; otherwise a window without
            # whitespace would make the loop run forever or index out of
            # range.
            while cut > start and not text[cut].isspace():
                cut -= 1
            if cut == start:
                # No whitespace in the window: fall back to a hard split.
                cut = start + batch_size

        # Drop trailing whitespace from the batch itself ...
        batch_end = cut
        while batch_end > start and text[batch_end - 1].isspace():
            batch_end -= 1

        # ... and consume the whole whitespace run after the cut, so the
        # next batch starts on a non-whitespace character again.
        next_start = cut
        while next_start < length and text[next_start].isspace():
            next_start += 1

        batches.append(text[start:batch_end])
        separators.append(text[batch_end:next_start])
        start = next_start

    return batches, separators


def translate(input_text: str, source: str, target='english') -> str:
    """Translate ``input_text`` from ``source`` to ``target``.

    For ``source`` and ``target`` you can use the name of the language or its
    abbreviation, e.g. chinese (simplified): zh-CN, english: en, korean: ko,
    persian: fa, russian: ru.

    The text is split into batches of at most 4999 characters which are
    translated with ``GoogleTranslator.translate_batch``. The leading
    whitespace of the input and the whitespace between and after the batches
    are copied to the result unchanged.

    Args:
        input_text: The text to translate.
        source: Source language (name or abbreviation).
        target: Target language (name or abbreviation).

    Returns:
        The translated text. ``input_text`` is returned unchanged when it is
        empty or consists only of whitespace, or when ``source`` equals
        ``target``.
    """
    if not input_text.strip():
        return input_text
    if source == target:
        return input_text

    # Leading whitespace is not sent to the translator; it is copied as-is.
    start = len(input_text) - len(input_text.lstrip())

    batches, separators = _split_into_batches(input_text, start, _BATCH_SIZE)
    translated_batches = GoogleTranslator(source, target).translate_batch(batches)

    parts = [input_text[:start]]
    for translated, separator in zip(translated_batches, separators):
        parts.append(translated)
        parts.append(separator)
    return ''.join(parts)