  1. """
  2. :func:`~pandas.eval` source string parsing functions
  3. """
  4. from __future__ import annotations
  5. from io import StringIO
  6. from keyword import iskeyword
  7. import token
  8. import tokenize
  9. from typing import (
  10. Hashable,
  11. Iterator,
  12. )
  13. # A token value Python's tokenizer probably will never use.
  14. BACKTICK_QUOTED_STRING = 100


def create_valid_python_identifier(name: str) -> str:
    """
    Create valid Python identifiers from any string.

    Check if name contains any special characters. If it contains any
    special characters, the special characters will be replaced by
    a special string and a prefix is added.

    Raises
    ------
    SyntaxError
        If the returned name is not a valid Python identifier, raise an exception.
        This can happen if there is a hashtag in the name, as the tokenizer will
        then terminate and not find the backtick.
        It also happens for characters that fall outside the range
        (U+0001..U+007F).
    """
    if name.isidentifier() and not iskeyword(name):
        return name

    # Create a dict with the special characters and their replacement string.
    # EXACT_TOKEN_TYPES contains these special characters;
    # token.tok_name contains a readable description of the replacement string.
    special_characters_replacements = {
        char: f"_{token.tok_name[tokval]}_"
        for char, tokval in tokenize.EXACT_TOKEN_TYPES.items()
    }
    special_characters_replacements.update(
        {
            " ": "_",
            "?": "_QUESTIONMARK_",
            "!": "_EXCLAMATIONMARK_",
            "$": "_DOLLARSIGN_",
            "€": "_EUROSIGN_",
            "°": "_DEGREESIGN_",
            # Including quotes works, but there are exceptions.
            "'": "_SINGLEQUOTE_",
            '"': "_DOUBLEQUOTE_",
            # Currently not possible. Terminates parser and won't find backtick.
            # "#": "_HASH_",
        }
    )

    name = "".join([special_characters_replacements.get(char, char) for char in name])
    name = f"BACKTICK_QUOTED_STRING_{name}"

    if not name.isidentifier():
        raise SyntaxError(f"Could not convert '{name}' to a valid Python identifier.")

    return name
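

# Illustrative sketch (not part of the original module): how raw names
# round-trip through create_valid_python_identifier, given the replacement
# table defined above.
#
# >>> create_valid_python_identifier("already_valid")
# 'already_valid'
# >>> create_valid_python_identifier("a + b")  # " " -> "_", "+" -> "_PLUS_"
# 'BACKTICK_QUOTED_STRING_a__PLUS__b'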


def clean_backtick_quoted_toks(tok: tuple[int, str]) -> tuple[int, str]:
    """
    Clean up a column name if surrounded by backticks.

    Backtick quoted strings are indicated by a certain tokval value. If a string
    is a backtick quoted token it will be processed by
    :func:`create_valid_python_identifier` so that the parser can find this
    string when the query is executed.
    In this case the tok will get the NAME tokval.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    tok : Tuple[int, str]
        Either the input token or the replacement values
    """
    toknum, tokval = tok
    if toknum == BACKTICK_QUOTED_STRING:
        return tokenize.NAME, create_valid_python_identifier(tokval)
    return toknum, tokval
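

# Illustrative sketch (not part of the original module): a
# BACKTICK_QUOTED_STRING token is rewritten to a NAME token
# (tokenize.NAME == 1) so the parser treats it as an identifier;
# any other token passes through unchanged.
#
# >>> clean_backtick_quoted_toks((BACKTICK_QUOTED_STRING, "a + b"))
# (1, 'BACKTICK_QUOTED_STRING_a__PLUS__b')
# >>> clean_backtick_quoted_toks((tokenize.NUMBER, "42"))
# (2, '42')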


def clean_column_name(name: Hashable) -> Hashable:
    """
    Function to emulate the cleaning of a backtick quoted name.

    The purpose of this function is to see what happens to the name of an
    identifier if it goes through the process of being parsed as Python code
    inside a backtick quoted string and then being cleaned
    (stripped of any special characters).

    Parameters
    ----------
    name : hashable
        Name to be cleaned.

    Returns
    -------
    name : hashable
        Returns the name after tokenizing and cleaning.

    Notes
    -----
    For some cases, a name cannot be converted to a valid Python identifier.
    In that case :func:`tokenize_string` raises a SyntaxError and we just
    return the name unmodified.

    If such a name was used in the query string (which makes the query call
    impossible), an error will be raised by
    :func:`tokenize_backtick_quoted_string` instead, which is not caught and
    propagates to the user level.
    """
    try:
        tokenized = tokenize_string(f"`{name}`")
        tokval = next(tokenized)[1]
        return create_valid_python_identifier(tokval)
    except SyntaxError:
        return name
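

# Illustrative sketch (not part of the original module): a space is replaced
# and the prefix added; a name containing "#" cannot be tokenized (the rest of
# the line becomes a comment), so it is returned unmodified.
#
# >>> clean_column_name("total sales")
# 'BACKTICK_QUOTED_STRING_total_sales'
# >>> clean_column_name("a#b")
# 'a#b'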


def tokenize_backtick_quoted_string(
    token_generator: Iterator[tokenize.TokenInfo], source: str, string_start: int
) -> tuple[int, str]:
    """
    Create a token from a backtick quoted string.

    Moves the token_generator forwards until right after the next backtick.

    Parameters
    ----------
    token_generator : Iterator[tokenize.TokenInfo]
        The generator that yields the tokens of the source string (Tuple[int, str]).
        The generator is at the first token after the backtick (`).
    source : str
        The Python source code string.
    string_start : int
        The start of the backtick quoted string inside the source string.

    Returns
    -------
    tok : Tuple[int, str]
        The token that represents the backtick quoted string.
        The integer is equal to BACKTICK_QUOTED_STRING (100).
    """
    for _, tokval, start, _, _ in token_generator:
        if tokval == "`":
            string_end = start[1]
            break

    return BACKTICK_QUOTED_STRING, source[string_start:string_end]
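

# Illustrative sketch (not part of the original module): drive the helper by
# hand. It assumes Python's tokenizer yields each backtick as a separate
# (error) token, which is the behavior this module relies on.
#
# >>> gen = tokenize.generate_tokens(StringIO("`a b` > 5").readline)
# >>> next(gen)[1]  # consume the opening backtick
# '`'
# >>> tokenize_backtick_quoted_string(gen, "`a b` > 5", string_start=1)
# (100, 'a b')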


def tokenize_string(source: str) -> Iterator[tuple[int, str]]:
    """
    Tokenize a Python source code string.

    Parameters
    ----------
    source : str
        The Python source code string.

    Returns
    -------
    tok_generator : Iterator[Tuple[int, str]]
        An iterator yielding all tokens with only toknum and tokval (Tuple[int, str]).
    """
    line_reader = StringIO(source).readline
    token_generator = tokenize.generate_tokens(line_reader)

    # Loop over all tokens until a backtick (`) is found.
    # Then, take all tokens until the next backtick to form a backtick quoted string.
    for toknum, tokval, start, _, _ in token_generator:
        if tokval == "`":
            try:
                yield tokenize_backtick_quoted_string(
                    token_generator, source, string_start=start[1] + 1
                )
            except Exception as err:
                raise SyntaxError(f"Failed to parse backticks in '{source}'.") from err
        else:
            yield toknum, tokval
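

# Illustrative sketch (not part of the original module): everything between a
# pair of backticks is collapsed into a single BACKTICK_QUOTED_STRING (100)
# token holding the raw column name.
#
# >>> next(tokenize_string("`a b` > 5"))
# (100, 'a b')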