# -*- coding: utf-8 -*-
import re
from string import whitespace as ws

from gtts.tokenizer.symbols import ALL_PUNC as punc

_ALL_PUNC_OR_SPACE = re.compile(u"^[{}]*$".format(re.escape(punc + ws)))
"""Regex that matches if an entire line consists only of whitespace
and/or punctuation characters (an empty line matches as well)
"""


def _minimize(the_string, delim, max_size):
    """Split a string into the largest chunks possible, cutting on the
    highest position of a delimiter, up to a maximum chunk size

    Args:
        the_string (string): The string to split.
        delim (string): The delimiter to split on.
        max_size (int): The maximum size of a chunk.

    Returns:
        list: the minimized string in tokens

    Raises:
        ValueError: If ``the_string`` has to be split but ``max_size``
            is smaller than 1 (no split could ever make progress).

    While the remaining text is longer than ``max_size``, it is cut at
    ``idx``, the highest index at which ``delim`` is found within
    ``the_string[0:max_size]``. If ``delim`` is not found there, the
    cut happens at ``max_size``, which can be any character.

    One leading ``delim`` is dropped from the remaining text before
    each cut, so chunks do not start with the delimiter they were cut
    on. Consecutive delimiters can still yield empty chunks; these are
    kept as-is.

    """
    chunks = []
    remaining = the_string

    # Iterative on purpose: one recursive call per chunk would hit the
    # interpreter's recursion limit on inputs that yield many chunks.
    while True:
        # Remove `delim` from the start of the remaining text so that a
        # cut at index 0 cannot repeat forever on the same text.
        if remaining.startswith(delim):
            remaining = remaining[len(delim):]

        if len(remaining) <= max_size:
            chunks.append(remaining)
            return chunks

        if max_size < 1:
            # A split is required but every cut would be empty.
            raise ValueError(
                "max_size must be at least 1 to split a string, "
                "got {!r}".format(max_size))

        try:
            # Highest index of `delim` in `remaining[0:max_size]`
            idx = remaining.rindex(delim, 0, max_size)
        except ValueError:
            # `delim` not found: cut arbitrarily at `max_size`
            idx = max_size

        chunks.append(remaining[:idx])
        remaining = remaining[idx:]


def _clean_tokens(tokens):
    """Clean a list of strings

    Args:
        tokens (list): A list of strings (tokens) to clean.

    Returns:
        list: The stripped strings of ``tokens``, leaving out the
        elements that only consisted of whitespace and/or punctuation
        characters (empty strings included).

    """
    return [t.strip() for t in tokens if not _ALL_PUNC_OR_SPACE.match(t)]


def _translate_url(tld="com", path=""):
    """Generates a Google Translate URL

    Args:
        tld (string): Top-level domain for the Google Translate host,
            i.e ``https://translate.google.<tld>``. Default is ``com``.
        path: (string): A path to append to the Google Translate host,
            i.e ``https://translate.google.com/<path>``. Default is ``""``.

    Returns:
        string: A Google Translate URL
        ``https://translate.google.<tld>/<path>``

    """
    _GOOGLE_TTS_URL = "https://translate.google.{}/{}"
    return _GOOGLE_TTS_URL.format(tld, path)