pre_processors.py 1.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455
  1. # -*- coding: utf-8 -*-
  2. from gtts.tokenizer import PreProcessorRegex, PreProcessorSub, symbols
  3. import re
  4. def tone_marks(text):
  5. """Add a space after tone-modifying punctuation.
  6. Because the `tone_marks` tokenizer case will split after a tone-modifying
  7. punctuation mark, make sure there's whitespace after.
  8. """
  9. return PreProcessorRegex(
  10. search_args=symbols.TONE_MARKS,
  11. search_func=lambda x: u"(?<={})".format(x),
  12. repl=" ",
  13. ).run(text)
  14. def end_of_line(text):
  15. """Re-form words cut by end-of-line hyphens.
  16. Remove "<hyphen><newline>".
  17. """
  18. return PreProcessorRegex(
  19. search_args="-", search_func=lambda x: u"{}\n".format(x), repl=""
  20. ).run(text)
  21. def abbreviations(text):
  22. """Remove periods after an abbreviation from a list of known
  23. abbreviations that can be spoken the same without that period. This
  24. prevents having to handle tokenization of that period.
  25. Note:
  26. Could potentially remove the ending period of a sentence.
  27. Note:
  28. Abbreviations that Google Translate can't pronounce without
  29. (or even with) a period should be added as a word substitution with a
  30. :class:`PreProcessorSub` pre-processor. Ex.: 'Esq.', 'Esquire'.
  31. """
  32. return PreProcessorRegex(
  33. search_args=symbols.ABBREVIATIONS,
  34. search_func=lambda x: r"(?<={})(?=\.).".format(x),
  35. repl="",
  36. flags=re.IGNORECASE,
  37. ).run(text)
  38. def word_sub(text):
  39. """Word-for-word substitutions."""
  40. return PreProcessorSub(sub_pairs=symbols.SUB_PAIRS).run(text)