This is based on the calsyslab project.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

63 lines
2.6 KiB

  1. from deep_translator import GoogleTranslator
  2. # For the arguments source and target, you can use the name of the language or its abbreviation
  3. # chinese (simplified): zh-CN, english: en, korean: ko, persian: fa, russian: ru
  4. def translate(input_text: str, source: str, target='english') -> str:
  5. if len(input_text.strip()) == 0:
  6. return input_text
  7. if source == target:
  8. return input_text
  9. batch_size = 4999
  10. batches = []
  11. whitespaces = []
  12. start = 0
  13. translated_text = ''
  14. while not input_text[start].strip():
  15. translated_text += input_text[start]
  16. start += 1
  17. # A while loop that will continue as long as start is less than the total length
  18. # of the HTML content (len(input_text)). This ensures that we process the entire content.
  19. while start < len(input_text):
  20. whitespace = ''
  21. # Set the end index for the current batch
  22. end = start + batch_size
  23. # If end is beyond the end of the content, we don't need to adjust it.
  24. if end < len(input_text):
  25. # A while loop to adjust the end index so that it doesn't split a word.
  26. # It continues as long as end is within the content (end < len(input_text)) and
  27. # the character at the end position is not a whitespace (input_text[end].strip()
  28. # returns True for non-whitespace characters).
  29. while end < len(input_text) and input_text[end].strip():
  30. # The inner while loop, decrements the end index by 1. This moves the end index
  31. # backwards until it reaches a whitespace character, ensuring that we don't split a word.
  32. end -= 1
  33. while not input_text[end].strip():
  34. whitespace = input_text[end] + whitespace
  35. end -= 1
  36. else:
  37. end = len(input_text) - 1
  38. while not input_text[end].strip():
  39. whitespace = input_text[end] + whitespace
  40. end -= 1
  41. # This line extracts a substring from the HTML content, starting from the start index
  42. # and ending at the end index. This substring is our batch.
  43. batch = input_text[start: end + 1]
  44. # Add our batch to the batches list
  45. batches.append(batch)
  46. whitespaces.append(whitespace)
  47. # Updates our start index for the next batch
  48. start = end + 1 + len(whitespace)
  49. translated_batches = GoogleTranslator(source, target).translate_batch(batches)
  50. for batch, whitespace in zip(translated_batches, whitespaces):
  51. translated_text += batch + whitespace
  52. return translated_text