tokenizer_cases.py 1.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869
  1. # -*- coding: utf-8 -*-
  2. from gtts.tokenizer import RegexBuilder, symbols
  3. def tone_marks():
  4. """Keep tone-modifying punctuation by matching following character.
  5. Assumes the `tone_marks` pre-processor was run for cases where there might
  6. not be any space after a tone-modifying punctuation mark.
  7. """
  8. return RegexBuilder(
  9. pattern_args=symbols.TONE_MARKS, pattern_func=lambda x: u"(?<={}).".format(x)
  10. ).regex
  11. def period_comma():
  12. """Period and comma case.
  13. Match if not preceded by ".<letter>" and only if followed by space.
  14. Won't cut in the middle/after dotted abbreviations; won't cut numbers.
  15. Note:
  16. Won't match if a dotted abbreviation ends a sentence.
  17. Note:
  18. Won't match the end of a sentence if not followed by a space.
  19. """
  20. return RegexBuilder(
  21. pattern_args=symbols.PERIOD_COMMA,
  22. pattern_func=lambda x: r"(?<!\.[a-z]){} ".format(x),
  23. ).regex
  24. def colon():
  25. """Colon case.
  26. Match a colon ":" only if not preceded by a digit.
  27. Mainly to prevent a cut in the middle of time notations e.g. 10:01
  28. """
  29. return RegexBuilder(
  30. pattern_args=symbols.COLON, pattern_func=lambda x: r"(?<!\d){}".format(x)
  31. ).regex
  32. def other_punctuation():
  33. """Match other punctuation.
  34. Match other punctuation to split on; punctuation that naturally
  35. inserts a break in speech.
  36. """
  37. punc = "".join(
  38. set(symbols.ALL_PUNC)
  39. - set(symbols.TONE_MARKS)
  40. - set(symbols.PERIOD_COMMA)
  41. - set(symbols.COLON)
  42. )
  43. return RegexBuilder(pattern_args=punc, pattern_func=lambda x: u"{}".format(x)).regex
  44. def legacy_all_punctuation(): # pragma: no cover b/c tested but Coveralls: ¯\_(ツ)_/¯
  45. """Match all punctuation.
  46. Use as only tokenizer case to mimic gTTS 1.x tokenization.
  47. """
  48. punc = symbols.ALL_PUNC
  49. return RegexBuilder(pattern_args=punc, pattern_func=lambda x: u"{}".format(x)).regex