eval.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413
  1. """
  2. Top level ``eval`` module.
  3. """
  4. from __future__ import annotations
  5. import tokenize
  6. from typing import TYPE_CHECKING
  7. import warnings
  8. from pandas.util._exceptions import find_stack_level
  9. from pandas.util._validators import validate_bool_kwarg
  10. from pandas.core.dtypes.common import is_extension_array_dtype
  11. from pandas.core.computation.engines import ENGINES
  12. from pandas.core.computation.expr import (
  13. PARSERS,
  14. Expr,
  15. )
  16. from pandas.core.computation.parsing import tokenize_string
  17. from pandas.core.computation.scope import ensure_scope
  18. from pandas.core.generic import NDFrame
  19. from pandas.io.formats.printing import pprint_thing
  20. if TYPE_CHECKING:
  21. from pandas.core.computation.ops import BinOp
  22. def _check_engine(engine: str | None) -> str:
  23. """
  24. Make sure a valid engine is passed.
  25. Parameters
  26. ----------
  27. engine : str
  28. String to validate.
  29. Raises
  30. ------
  31. KeyError
  32. * If an invalid engine is passed.
  33. ImportError
  34. * If numexpr was requested but doesn't exist.
  35. Returns
  36. -------
  37. str
  38. Engine name.
  39. """
  40. from pandas.core.computation.check import NUMEXPR_INSTALLED
  41. from pandas.core.computation.expressions import USE_NUMEXPR
  42. if engine is None:
  43. engine = "numexpr" if USE_NUMEXPR else "python"
  44. if engine not in ENGINES:
  45. valid_engines = list(ENGINES.keys())
  46. raise KeyError(
  47. f"Invalid engine '{engine}' passed, valid engines are {valid_engines}"
  48. )
  49. # TODO: validate this in a more general way (thinking of future engines
  50. # that won't necessarily be import-able)
  51. # Could potentially be done on engine instantiation
  52. if engine == "numexpr" and not NUMEXPR_INSTALLED:
  53. raise ImportError(
  54. "'numexpr' is not installed or an unsupported version. Cannot use "
  55. "engine='numexpr' for query/eval if 'numexpr' is not installed"
  56. )
  57. return engine
  58. def _check_parser(parser: str):
  59. """
  60. Make sure a valid parser is passed.
  61. Parameters
  62. ----------
  63. parser : str
  64. Raises
  65. ------
  66. KeyError
  67. * If an invalid parser is passed
  68. """
  69. if parser not in PARSERS:
  70. raise KeyError(
  71. f"Invalid parser '{parser}' passed, valid parsers are {PARSERS.keys()}"
  72. )
  73. def _check_resolvers(resolvers):
  74. if resolvers is not None:
  75. for resolver in resolvers:
  76. if not hasattr(resolver, "__getitem__"):
  77. name = type(resolver).__name__
  78. raise TypeError(
  79. f"Resolver of type '{name}' does not "
  80. "implement the __getitem__ method"
  81. )
  82. def _check_expression(expr):
  83. """
  84. Make sure an expression is not an empty string
  85. Parameters
  86. ----------
  87. expr : object
  88. An object that can be converted to a string
  89. Raises
  90. ------
  91. ValueError
  92. * If expr is an empty string
  93. """
  94. if not expr:
  95. raise ValueError("expr cannot be an empty string")
  96. def _convert_expression(expr) -> str:
  97. """
  98. Convert an object to an expression.
  99. This function converts an object to an expression (a unicode string) and
  100. checks to make sure it isn't empty after conversion. This is used to
  101. convert operators to their string representation for recursive calls to
  102. :func:`~pandas.eval`.
  103. Parameters
  104. ----------
  105. expr : object
  106. The object to be converted to a string.
  107. Returns
  108. -------
  109. str
  110. The string representation of an object.
  111. Raises
  112. ------
  113. ValueError
  114. * If the expression is empty.
  115. """
  116. s = pprint_thing(expr)
  117. _check_expression(s)
  118. return s
  119. def _check_for_locals(expr: str, stack_level: int, parser: str):
  120. at_top_of_stack = stack_level == 0
  121. not_pandas_parser = parser != "pandas"
  122. if not_pandas_parser:
  123. msg = "The '@' prefix is only supported by the pandas parser"
  124. elif at_top_of_stack:
  125. msg = (
  126. "The '@' prefix is not allowed in top-level eval calls.\n"
  127. "please refer to your variables by name without the '@' prefix."
  128. )
  129. if at_top_of_stack or not_pandas_parser:
  130. for toknum, tokval in tokenize_string(expr):
  131. if toknum == tokenize.OP and tokval == "@":
  132. raise SyntaxError(msg)
  133. def eval(
  134. expr: str | BinOp, # we leave BinOp out of the docstr bc it isn't for users
  135. parser: str = "pandas",
  136. engine: str | None = None,
  137. local_dict=None,
  138. global_dict=None,
  139. resolvers=(),
  140. level: int = 0,
  141. target=None,
  142. inplace: bool = False,
  143. ):
  144. """
  145. Evaluate a Python expression as a string using various backends.
  146. The following arithmetic operations are supported: ``+``, ``-``, ``*``,
  147. ``/``, ``**``, ``%``, ``//`` (python engine only) along with the following
  148. boolean operations: ``|`` (or), ``&`` (and), and ``~`` (not).
  149. Additionally, the ``'pandas'`` parser allows the use of :keyword:`and`,
  150. :keyword:`or`, and :keyword:`not` with the same semantics as the
  151. corresponding bitwise operators. :class:`~pandas.Series` and
  152. :class:`~pandas.DataFrame` objects are supported and behave as they would
  153. with plain ol' Python evaluation.
  154. Parameters
  155. ----------
  156. expr : str
  157. The expression to evaluate. This string cannot contain any Python
  158. `statements
  159. <https://docs.python.org/3/reference/simple_stmts.html#simple-statements>`__,
  160. only Python `expressions
  161. <https://docs.python.org/3/reference/simple_stmts.html#expression-statements>`__.
  162. parser : {'pandas', 'python'}, default 'pandas'
  163. The parser to use to construct the syntax tree from the expression. The
  164. default of ``'pandas'`` parses code slightly different than standard
  165. Python. Alternatively, you can parse an expression using the
  166. ``'python'`` parser to retain strict Python semantics. See the
  167. :ref:`enhancing performance <enhancingperf.eval>` documentation for
  168. more details.
  169. engine : {'python', 'numexpr'}, default 'numexpr'
  170. The engine used to evaluate the expression. Supported engines are
  171. - None : tries to use ``numexpr``, falls back to ``python``
  172. - ``'numexpr'`` : This default engine evaluates pandas objects using
  173. numexpr for large speed ups in complex expressions with large frames.
  174. - ``'python'`` : Performs operations as if you had ``eval``'d in top
  175. level python. This engine is generally not that useful.
  176. More backends may be available in the future.
  177. local_dict : dict or None, optional
  178. A dictionary of local variables, taken from locals() by default.
  179. global_dict : dict or None, optional
  180. A dictionary of global variables, taken from globals() by default.
  181. resolvers : list of dict-like or None, optional
  182. A list of objects implementing the ``__getitem__`` special method that
  183. you can use to inject an additional collection of namespaces to use for
  184. variable lookup. For example, this is used in the
  185. :meth:`~DataFrame.query` method to inject the
  186. ``DataFrame.index`` and ``DataFrame.columns``
  187. variables that refer to their respective :class:`~pandas.DataFrame`
  188. instance attributes.
  189. level : int, optional
  190. The number of prior stack frames to traverse and add to the current
  191. scope. Most users will **not** need to change this parameter.
  192. target : object, optional, default None
  193. This is the target object for assignment. It is used when there is
  194. variable assignment in the expression. If so, then `target` must
  195. support item assignment with string keys, and if a copy is being
  196. returned, it must also support `.copy()`.
  197. inplace : bool, default False
  198. If `target` is provided, and the expression mutates `target`, whether
  199. to modify `target` inplace. Otherwise, return a copy of `target` with
  200. the mutation.
  201. Returns
  202. -------
  203. ndarray, numeric scalar, DataFrame, Series, or None
  204. The completion value of evaluating the given code or None if ``inplace=True``.
  205. Raises
  206. ------
  207. ValueError
  208. There are many instances where such an error can be raised:
  209. - `target=None`, but the expression is multiline.
  210. - The expression is multiline, but not all them have item assignment.
  211. An example of such an arrangement is this:
  212. a = b + 1
  213. a + 2
  214. Here, there are expressions on different lines, making it multiline,
  215. but the last line has no variable assigned to the output of `a + 2`.
  216. - `inplace=True`, but the expression is missing item assignment.
  217. - Item assignment is provided, but the `target` does not support
  218. string item assignment.
  219. - Item assignment is provided and `inplace=False`, but the `target`
  220. does not support the `.copy()` method
  221. See Also
  222. --------
  223. DataFrame.query : Evaluates a boolean expression to query the columns
  224. of a frame.
  225. DataFrame.eval : Evaluate a string describing operations on
  226. DataFrame columns.
  227. Notes
  228. -----
  229. The ``dtype`` of any objects involved in an arithmetic ``%`` operation are
  230. recursively cast to ``float64``.
  231. See the :ref:`enhancing performance <enhancingperf.eval>` documentation for
  232. more details.
  233. Examples
  234. --------
  235. >>> df = pd.DataFrame({"animal": ["dog", "pig"], "age": [10, 20]})
  236. >>> df
  237. animal age
  238. 0 dog 10
  239. 1 pig 20
  240. We can add a new column using ``pd.eval``:
  241. >>> pd.eval("double_age = df.age * 2", target=df)
  242. animal age double_age
  243. 0 dog 10 20
  244. 1 pig 20 40
  245. """
  246. inplace = validate_bool_kwarg(inplace, "inplace")
  247. exprs: list[str | BinOp]
  248. if isinstance(expr, str):
  249. _check_expression(expr)
  250. exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""]
  251. else:
  252. # ops.BinOp; for internal compat, not intended to be passed by users
  253. exprs = [expr]
  254. multi_line = len(exprs) > 1
  255. if multi_line and target is None:
  256. raise ValueError(
  257. "multi-line expressions are only valid in the "
  258. "context of data, use DataFrame.eval"
  259. )
  260. engine = _check_engine(engine)
  261. _check_parser(parser)
  262. _check_resolvers(resolvers)
  263. ret = None
  264. first_expr = True
  265. target_modified = False
  266. for expr in exprs:
  267. expr = _convert_expression(expr)
  268. _check_for_locals(expr, level, parser)
  269. # get our (possibly passed-in) scope
  270. env = ensure_scope(
  271. level + 1,
  272. global_dict=global_dict,
  273. local_dict=local_dict,
  274. resolvers=resolvers,
  275. target=target,
  276. )
  277. parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)
  278. if engine == "numexpr" and (
  279. is_extension_array_dtype(parsed_expr.terms.return_type)
  280. or getattr(parsed_expr.terms, "operand_types", None) is not None
  281. and any(
  282. is_extension_array_dtype(elem)
  283. for elem in parsed_expr.terms.operand_types
  284. )
  285. ):
  286. warnings.warn(
  287. "Engine has switched to 'python' because numexpr does not support "
  288. "extension array dtypes. Please set your engine to python manually.",
  289. RuntimeWarning,
  290. stacklevel=find_stack_level(),
  291. )
  292. engine = "python"
  293. # construct the engine and evaluate the parsed expression
  294. eng = ENGINES[engine]
  295. eng_inst = eng(parsed_expr)
  296. ret = eng_inst.evaluate()
  297. if parsed_expr.assigner is None:
  298. if multi_line:
  299. raise ValueError(
  300. "Multi-line expressions are only valid "
  301. "if all expressions contain an assignment"
  302. )
  303. if inplace:
  304. raise ValueError("Cannot operate inplace if there is no assignment")
  305. # assign if needed
  306. assigner = parsed_expr.assigner
  307. if env.target is not None and assigner is not None:
  308. target_modified = True
  309. # if returning a copy, copy only on the first assignment
  310. if not inplace and first_expr:
  311. try:
  312. target = env.target.copy()
  313. except AttributeError as err:
  314. raise ValueError("Cannot return a copy of the target") from err
  315. else:
  316. target = env.target
  317. # TypeError is most commonly raised (e.g. int, list), but you
  318. # get IndexError if you try to do this assignment on np.ndarray.
  319. # we will ignore numpy warnings here; e.g. if trying
  320. # to use a non-numeric indexer
  321. try:
  322. with warnings.catch_warnings(record=True):
  323. # TODO: Filter the warnings we actually care about here.
  324. if inplace and isinstance(target, NDFrame):
  325. target.loc[:, assigner] = ret
  326. else:
  327. target[assigner] = ret
  328. except (TypeError, IndexError) as err:
  329. raise ValueError("Cannot assign expression output to target") from err
  330. if not resolvers:
  331. resolvers = ({assigner: ret},)
  332. else:
  333. # existing resolver needs updated to handle
  334. # case of mutating existing column in copy
  335. for resolver in resolvers:
  336. if assigner in resolver:
  337. resolver[assigner] = ret
  338. break
  339. else:
  340. resolvers += ({assigner: ret},)
  341. ret = None
  342. first_expr = False
  343. # We want to exclude `inplace=None` as being False.
  344. if inplace is False:
  345. return target if target_modified else ret