utils.py 3.0 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. # -*- coding: utf-8 -*-
  2. from gtts.tokenizer.symbols import ALL_PUNC as punc
  3. from string import whitespace as ws
  4. import re
  5. _ALL_PUNC_OR_SPACE = re.compile(u"^[{}]*$".format(re.escape(punc + ws)))
  6. """Regex that matches if an entire line is only comprised
  7. of whitespace and punctuation
  8. """
  9. def _minimize(the_string, delim, max_size):
  10. """Recursively split a string in the largest chunks
  11. possible from the highest position of a delimiter all the way
  12. to a maximum size
  13. Args:
  14. the_string (string): The string to split.
  15. delim (string): The delimiter to split on.
  16. max_size (int): The maximum size of a chunk.
  17. Returns:
  18. list: the minimized string in tokens
  19. Every chunk size will be at minimum ``the_string[0:idx]`` where ``idx``
  20. is the highest index of ``delim`` found in ``the_string``; and at maximum
  21. ``the_string[0:max_size]`` if no ``delim`` was found in ``the_string``.
  22. In the latter case, the split will occur at ``the_string[max_size]``
  23. which can be any character. The function runs itself again on the rest of
  24. ``the_string`` (``the_string[idx:]``) until no chunk is larger than
  25. ``max_size``.
  26. """
  27. # Remove `delim` from start of `the_string`
  28. # i.e. prevent a recursive infinite loop on `the_string[0:0]`
  29. # if `the_string` starts with `delim` and is larger than `max_size`
  30. if the_string.startswith(delim):
  31. the_string = the_string[len(delim):]
  32. if len(the_string) > max_size:
  33. try:
  34. # Find the highest index of `delim` in `the_string[0:max_size]`
  35. # i.e. `the_string` will be cut in half on `delim` index
  36. idx = the_string.rindex(delim, 0, max_size)
  37. except ValueError:
  38. # `delim` not found in `the_string`, index becomes `max_size`
  39. # i.e. `the_string` will be cut in half arbitrarily on `max_size`
  40. idx = max_size
  41. # Call itself again for `the_string[idx:]`
  42. return [the_string[:idx]] + _minimize(the_string[idx:], delim, max_size)
  43. else:
  44. return [the_string]
  45. def _clean_tokens(tokens):
  46. """Clean a list of strings
  47. Args:
  48. tokens (list): A list of strings (tokens) to clean.
  49. Returns:
  50. list: Stripped strings ``tokens`` without the original elements
  51. that only consisted of whitespace and/or punctuation characters.
  52. """
  53. return [t.strip() for t in tokens if not _ALL_PUNC_OR_SPACE.match(t)]
  54. def _translate_url(tld="com", path=""):
  55. """Generates a Google Translate URL
  56. Args:
  57. tld (string): Top-level domain for the Google Translate host,
  58. i.e ``https://translate.google.<tld>``. Default is ``com``.
  59. path: (string): A path to append to the Google Translate host,
  60. i.e ``https://translate.google.com/<path>``. Default is ``""``.
  61. Returns:
  62. string: A Google Translate URL `https://translate.google.<tld>/path`
  63. """
  64. _GOOGLE_TTS_URL = "https://translate.google.{}/{}"
  65. return _GOOGLE_TTS_URL.format(tld, path)