core.py

# -*- coding: utf-8 -*-
import re


class RegexBuilder:
    r"""Builds regex using arguments passed into a pattern template.

    Builds a regex object for which the pattern is made from an argument
    passed into a template. If more than one argument is passed (iterable),
    each pattern is joined by "|" (regex alternation 'or') to create a
    single pattern.

    Args:
        pattern_args (iterable): String element(s) to be each passed to
            ``pattern_func`` to create a regex pattern. Each element is
            ``re.escape``'d before being passed.
        pattern_func (callable): A 'template' function that should take a
            string and return a string. It should take an element of
            ``pattern_args`` and return a valid regex pattern group string.
        flags: ``re`` flag(s) to compile with the regex.

    Example:
        To create a simple regex that matches on the characters "a", "b",
        or "c", followed by a period::

            >>> rb = RegexBuilder('abc', lambda x: r"{}\.".format(x))

        Looking at ``rb.regex`` we get the following compiled regex::

            >>> print(rb.regex)
            'a\.|b\.|c\.'

        The above is fairly simple, but this class can help in writing more
        complex repetitive regex, making them more readable and easier to
        create by using existing data structures.

    Example:
        To match the character following the words "lorem", "ipsum", "meili"
        or "koda"::

            >>> words = ['lorem', 'ipsum', 'meili', 'koda']
            >>> rb = RegexBuilder(words, lambda x: "(?<={}).".format(x))

        Looking at ``rb.regex`` we get the following compiled regex::

            >>> print(rb.regex)
            '(?<=lorem).|(?<=ipsum).|(?<=meili).|(?<=koda).'
    """

    def __init__(self, pattern_args, pattern_func, flags=0):
        self.pattern_args = pattern_args
        self.pattern_func = pattern_func
        self.flags = flags

        # Compile
        self.regex = self._compile()

    def _compile(self):
        alts = []
        for arg in self.pattern_args:
            arg = re.escape(arg)
            alt = self.pattern_func(arg)
            alts.append(alt)

        pattern = "|".join(alts)
        return re.compile(pattern, self.flags)

    def __repr__(self):  # pragma: no cover
        return str(self.regex)
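

# Usage sketch (illustrative only, not part of the library API; the helper
# name `_example_regex_builder` is hypothetical): builds the regex from the
# first docstring example above and checks what it matches.
def _example_regex_builder():
    rb = RegexBuilder("abc", lambda x: r"{}\.".format(x))
    # Each character is escaped, templated, then joined with "|".
    assert rb.regex.pattern == r"a\.|b\.|c\."
    assert rb.regex.findall("a. b, c.") == ["a.", "c."]
    return rb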


class PreProcessorRegex:
    r"""Regex-based substitution text pre-processor.

    Runs a series of regex substitutions (``re.sub``) from each ``regex`` of a
    :class:`gtts.tokenizer.core.RegexBuilder` with an extra ``repl``
    replacement parameter.

    Args:
        search_args (iterable): String element(s) to be each passed to
            ``search_func`` to create a regex pattern. Each element is
            ``re.escape``'d before being passed.
        search_func (callable): A 'template' function that should take a
            string and return a string. It should take an element of
            ``search_args`` and return a valid regex search pattern string.
        repl (string): The common replacement passed to the ``sub`` method for
            each ``regex``. Can be a raw string (in the case of a regex
            backreference, for example).
        flags: ``re`` flag(s) to compile with each ``regex``.

    Example:
        Add "!" after the words "lorem" or "ipsum", while ignoring case::

            >>> import re
            >>> words = ['lorem', 'ipsum']
            >>> pp = PreProcessorRegex(words,
            ...                        lambda x: "({})".format(x), r'\1!',
            ...                        re.IGNORECASE)

        In this case, the regex is a group and the replacement uses its
        backreference ``\1`` (as a raw string). Looking at ``pp`` we get the
        following list of search/replacement pairs::

            >>> print(pp)
            (re.compile('(lorem)', re.IGNORECASE), repl='\1!'),
            (re.compile('(ipsum)', re.IGNORECASE), repl='\1!')

        It can then be run on any string of text::

            >>> pp.run("LOREM ipSuM")
            "LOREM! ipSuM!"

    See :mod:`gtts.tokenizer.pre_processors` for more examples.
    """

    def __init__(self, search_args, search_func, repl, flags=0):
        self.repl = repl

        # Create regex list
        self.regexes = []
        for arg in search_args:
            rb = RegexBuilder([arg], search_func, flags)
            self.regexes.append(rb.regex)

    def run(self, text):
        """Run each regex substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
                applied.
        """
        for regex in self.regexes:
            text = regex.sub(self.repl, text)
        return text

    def __repr__(self):  # pragma: no cover
        subs_strs = []
        for r in self.regexes:
            subs_strs.append("({}, repl='{}')".format(r, self.repl))
        return ", ".join(subs_strs)
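

# Usage sketch (illustrative only, not part of the library API; the helper
# name `_example_pre_processor_regex` is hypothetical): appends "!" after
# "lorem"/"ipsum" regardless of case, mirroring the docstring example.
def _example_pre_processor_regex():
    pp = PreProcessorRegex(
        ["lorem", "ipsum"], lambda x: "({})".format(x), r"\1!", re.IGNORECASE
    )
    # Each word gets its own compiled group regex; "\1" restores the match.
    assert pp.run("LOREM ipSuM") == "LOREM! ipSuM!"
    return pp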


class PreProcessorSub:
    r"""Simple substitution text pre-processor.

    Performs string-for-string substitution from a list of find/replace pairs.
    It abstracts :class:`gtts.tokenizer.core.PreProcessorRegex` with a default
    simple substitution regex.

    Args:
        sub_pairs (list): A list of tuples of the style
            ``(<search str>, <replace str>)``
        ignore_case (bool): Ignore case during search. Defaults to ``True``.

    Example:
        Replace all occurrences of "Mac" with "PC" and "Firefox" with
        "Chrome"::

            >>> sub_pairs = [('Mac', 'PC'), ('Firefox', 'Chrome')]
            >>> pp = PreProcessorSub(sub_pairs)

        Looking at ``pp``, we get the following list of
        search (regex)/replacement pairs::

            >>> print(pp)
            (re.compile('Mac', re.IGNORECASE), repl='PC'),
            (re.compile('Firefox', re.IGNORECASE), repl='Chrome')

        It can then be run on any string of text::

            >>> pp.run("I use firefox on my mac")
            "I use Chrome on my PC"

    See :mod:`gtts.tokenizer.pre_processors` for more examples.
    """

    def __init__(self, sub_pairs, ignore_case=True):
        def search_func(x):
            return u"{}".format(x)

        flags = re.I if ignore_case else 0

        # Create pre-processor list
        self.pre_processors = []
        for sub_pair in sub_pairs:
            pattern, repl = sub_pair
            pp = PreProcessorRegex([pattern], search_func, repl, flags)
            self.pre_processors.append(pp)

    def run(self, text):
        """Run each substitution on ``text``.

        Args:
            text (string): the input text.

        Returns:
            string: text after all substitutions have been sequentially
                applied.
        """
        for pp in self.pre_processors:
            text = pp.run(text)
        return text

    def __repr__(self):  # pragma: no cover
        return ", ".join([str(pp) for pp in self.pre_processors])
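

# Usage sketch (illustrative only, not part of the library API; the helper
# name `_example_pre_processor_sub` is hypothetical): case-insensitive word
# swaps built from plain (search, replace) tuples, mirroring the docstring
# example.
def _example_pre_processor_sub():
    pp = PreProcessorSub([("Mac", "PC"), ("Firefox", "Chrome")])
    assert pp.run("I use firefox on my mac") == "I use Chrome on my PC"
    return pp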


class Tokenizer:
    r"""An extensible but simple generic rule-based tokenizer.

    A generic and simple string tokenizer that takes a list of functions
    (called `tokenizer cases`) returning ``regex`` objects and joins them by
    "|" (regex alternation 'or') to create a single regex to use with the
    standard ``re.split()`` function.

    ``regex_funcs`` is a list of any function that can return a ``regex``
    (from ``re.compile()``) object, such as a
    :class:`gtts.tokenizer.core.RegexBuilder` instance (and its ``regex``
    attribute).

    See the :mod:`gtts.tokenizer.tokenizer_cases` module for examples.

    Args:
        regex_funcs (list): List of functions that return compiled ``regex``
            objects. Each function's pattern will be joined into a single
            pattern and compiled.
        flags: ``re`` flag(s) to compile with the final regex. Defaults to
            ``re.IGNORECASE``

    Note:
        When the ``regex`` objects obtained from ``regex_funcs`` are joined,
        their individual ``re`` flags are ignored in favour of ``flags``.

    Raises:
        TypeError: When an element of ``regex_funcs`` is not a function, or
            a function that does not return a compiled ``regex`` object.

    Warning:
        Joined ``regex`` patterns can easily interfere with one another in
        unexpected ways. It is recommended that each tokenizer case operate
        on distinct or non-overlapping characters/sets of characters
        (for example, a tokenizer case for the period (".") should also
        handle not matching/cutting on decimals, instead of making that
        a separate tokenizer case).

    Example:
        A tokenizer with two simple cases (*Note: these are bad cases to
        tokenize on, this is simply a usage example*)::

            >>> import re
            >>> from gtts.tokenizer.core import RegexBuilder
            >>>
            >>> def case1():
            ...     return re.compile(r"\,")
            >>>
            >>> def case2():
            ...     return RegexBuilder('abc', lambda x: r"{}\.".format(x)).regex
            >>>
            >>> t = Tokenizer([case1, case2])

        Looking at ``case1().pattern``, we get::

            >>> print(case1().pattern)
            '\\,'

        Looking at ``case2().pattern``, we get::

            >>> print(case2().pattern)
            'a\\.|b\\.|c\\.'

        Finally, looking at ``t``, we get them combined::

            >>> print(t)
            're.compile('\\,|a\\.|b\\.|c\\.', re.IGNORECASE)
             from: [<function case1 at 0x10bbcdd08>, <function case2 at 0x10b5c5e18>]'

        It can then be run on any string of text::

            >>> t.run("Hello, my name is Linda a. Call me Lin, b. I'm your friend")
            ['Hello', ' my name is Linda ', ' Call me Lin', ' ', " I'm your friend"]
    """

    def __init__(self, regex_funcs, flags=re.IGNORECASE):
        self.regex_funcs = regex_funcs
        self.flags = flags

        try:
            # Combine
            self.total_regex = self._combine_regex()
        except (TypeError, AttributeError) as e:  # pragma: no cover
            raise TypeError(
                "Tokenizer() expects a list of functions returning "
                "regular expression objects (i.e. re.compile). " + str(e)
            )

    def _combine_regex(self):
        alts = []
        for func in self.regex_funcs:
            alts.append(func())

        pattern = "|".join(alt.pattern for alt in alts)
        return re.compile(pattern, self.flags)

    def run(self, text):
        """Tokenize ``text``.

        Args:
            text (string): the input text to tokenize.

        Returns:
            list: A list of strings (tokens) split according to the
                tokenizer cases.
        """
        return self.total_regex.split(text)

    def __repr__(self):  # pragma: no cover
        return str(self.total_regex) + " from: " + str(self.regex_funcs)
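

# Usage sketch (illustrative only, not part of the library API; the helper
# name `_example_tokenizer` is hypothetical): combines two tokenizer cases and
# splits a sentence, mirroring the docstring example.
def _example_tokenizer():
    def case1():
        return re.compile(r"\,")

    def case2():
        return RegexBuilder("abc", lambda x: r"{}\.".format(x)).regex

    t = Tokenizer([case1, case2])
    # The case patterns are joined with "|" into one splitting regex.
    assert t.total_regex.pattern == r"\,|a\.|b\.|c\."
    return t.run("Hello, my name is Linda a. Call me Lin, b. I'm your friend")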