# -*- coding: utf-8 -*-
import unittest

from gtts.tokenizer.tokenizer_cases import (
    tone_marks,
    period_comma,
    colon,
    other_punctuation,
    legacy_all_punctuation,
)
from gtts.tokenizer import Tokenizer, symbols
  11. class TestPreTokenizerCases(unittest.TestCase):
  12. def test_tone_marks(self):
  13. t = Tokenizer([tone_marks])
  14. _in = "Lorem? Ipsum!"
  15. _out = ["Lorem?", "Ipsum!"]
  16. self.assertEqual(t.run(_in), _out)
  17. def test_period_comma(self):
  18. t = Tokenizer([period_comma])
  19. _in = "Hello, it's 24.5 degrees in the U.K. today. $20,000,000."
  20. _out = ["Hello", "it's 24.5 degrees in the U.K. today", "$20,000,000."]
  21. self.assertEqual(t.run(_in), _out)
  22. def test_colon(self):
  23. t = Tokenizer([colon])
  24. _in = "It's now 6:30 which means: morning missing:space"
  25. _out = ["It's now 6:30 which means", " morning missing", "space"]
  26. self.assertEqual(t.run(_in), _out)
  27. def test_other_punctuation(self):
  28. # String of the unique 'other punctuations'
  29. other_punc_str = "".join(
  30. set(symbols.ALL_PUNC)
  31. - set(symbols.TONE_MARKS)
  32. - set(symbols.PERIOD_COMMA)
  33. - set(symbols.COLON)
  34. )
  35. t = Tokenizer([other_punctuation])
  36. self.assertEqual(len(t.run(other_punc_str)) - 1, len(other_punc_str))
  37. def test_legacy_all_punctuation(self):
  38. t = Tokenizer([legacy_all_punctuation])
  39. self.assertEqual(len(t.run(symbols.ALL_PUNC)) - 1, len(symbols.ALL_PUNC))
  40. if __name__ == "__main__":
  41. unittest.main()