test_query_eval.py 51 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361
  1. import operator
  2. import numpy as np
  3. import pytest
  4. from pandas.errors import (
  5. NumExprClobberingError,
  6. UndefinedVariableError,
  7. )
  8. import pandas.util._test_decorators as td
  9. import pandas as pd
  10. from pandas import (
  11. DataFrame,
  12. Index,
  13. MultiIndex,
  14. Series,
  15. date_range,
  16. )
  17. import pandas._testing as tm
  18. from pandas.core.computation.check import NUMEXPR_INSTALLED
  19. @pytest.fixture(params=["python", "pandas"], ids=lambda x: x)
  20. def parser(request):
  21. return request.param
  22. @pytest.fixture(
  23. params=["python", pytest.param("numexpr", marks=td.skip_if_no_ne)], ids=lambda x: x
  24. )
  25. def engine(request):
  26. return request.param
  27. def skip_if_no_pandas_parser(parser):
  28. if parser != "pandas":
  29. pytest.skip(f"cannot evaluate with parser {repr(parser)}")
  30. class TestCompat:
  31. @pytest.fixture
  32. def df(self):
  33. return DataFrame({"A": [1, 2, 3]})
  34. @pytest.fixture
  35. def expected1(self, df):
  36. return df[df.A > 0]
  37. @pytest.fixture
  38. def expected2(self, df):
  39. return df.A + 1
  40. def test_query_default(self, df, expected1, expected2):
  41. # GH 12749
  42. # this should always work, whether NUMEXPR_INSTALLED or not
  43. result = df.query("A>0")
  44. tm.assert_frame_equal(result, expected1)
  45. result = df.eval("A+1")
  46. tm.assert_series_equal(result, expected2, check_names=False)
  47. def test_query_None(self, df, expected1, expected2):
  48. result = df.query("A>0", engine=None)
  49. tm.assert_frame_equal(result, expected1)
  50. result = df.eval("A+1", engine=None)
  51. tm.assert_series_equal(result, expected2, check_names=False)
  52. def test_query_python(self, df, expected1, expected2):
  53. result = df.query("A>0", engine="python")
  54. tm.assert_frame_equal(result, expected1)
  55. result = df.eval("A+1", engine="python")
  56. tm.assert_series_equal(result, expected2, check_names=False)
  57. def test_query_numexpr(self, df, expected1, expected2):
  58. if NUMEXPR_INSTALLED:
  59. result = df.query("A>0", engine="numexpr")
  60. tm.assert_frame_equal(result, expected1)
  61. result = df.eval("A+1", engine="numexpr")
  62. tm.assert_series_equal(result, expected2, check_names=False)
  63. else:
  64. msg = (
  65. r"'numexpr' is not installed or an unsupported version. "
  66. r"Cannot use engine='numexpr' for query/eval if 'numexpr' is "
  67. r"not installed"
  68. )
  69. with pytest.raises(ImportError, match=msg):
  70. df.query("A>0", engine="numexpr")
  71. with pytest.raises(ImportError, match=msg):
  72. df.eval("A+1", engine="numexpr")
  73. class TestDataFrameEval:
  74. # smaller hits python, larger hits numexpr
  75. @pytest.mark.parametrize("n", [4, 4000])
  76. @pytest.mark.parametrize(
  77. "op_str,op,rop",
  78. [
  79. ("+", "__add__", "__radd__"),
  80. ("-", "__sub__", "__rsub__"),
  81. ("*", "__mul__", "__rmul__"),
  82. ("/", "__truediv__", "__rtruediv__"),
  83. ],
  84. )
  85. def test_ops(self, op_str, op, rop, n):
  86. # tst ops and reversed ops in evaluation
  87. # GH7198
  88. df = DataFrame(1, index=range(n), columns=list("abcd"))
  89. df.iloc[0] = 2
  90. m = df.mean()
  91. base = DataFrame( # noqa:F841
  92. np.tile(m.values, n).reshape(n, -1), columns=list("abcd")
  93. )
  94. expected = eval(f"base {op_str} df")
  95. # ops as strings
  96. result = eval(f"m {op_str} df")
  97. tm.assert_frame_equal(result, expected)
  98. # these are commutative
  99. if op in ["+", "*"]:
  100. result = getattr(df, op)(m)
  101. tm.assert_frame_equal(result, expected)
  102. # these are not
  103. elif op in ["-", "/"]:
  104. result = getattr(df, rop)(m)
  105. tm.assert_frame_equal(result, expected)
  106. def test_dataframe_sub_numexpr_path(self):
  107. # GH7192: Note we need a large number of rows to ensure this
  108. # goes through the numexpr path
  109. df = DataFrame({"A": np.random.randn(25000)})
  110. df.iloc[0:5] = np.nan
  111. expected = 1 - np.isnan(df.iloc[0:25])
  112. result = (1 - np.isnan(df)).iloc[0:25]
  113. tm.assert_frame_equal(result, expected)
  114. def test_query_non_str(self):
  115. # GH 11485
  116. df = DataFrame({"A": [1, 2, 3], "B": ["a", "b", "b"]})
  117. msg = "expr must be a string to be evaluated"
  118. with pytest.raises(ValueError, match=msg):
  119. df.query(lambda x: x.B == "b")
  120. with pytest.raises(ValueError, match=msg):
  121. df.query(111)
  122. def test_query_empty_string(self):
  123. # GH 13139
  124. df = DataFrame({"A": [1, 2, 3]})
  125. msg = "expr cannot be an empty string"
  126. with pytest.raises(ValueError, match=msg):
  127. df.query("")
  128. def test_eval_resolvers_as_list(self):
  129. # GH 14095
  130. df = DataFrame(np.random.randn(10, 2), columns=list("ab"))
  131. dict1 = {"a": 1}
  132. dict2 = {"b": 2}
  133. assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
  134. assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
  135. def test_eval_resolvers_combined(self):
  136. # GH 34966
  137. df = DataFrame(np.random.randn(10, 2), columns=list("ab"))
  138. dict1 = {"c": 2}
  139. # Both input and default index/column resolvers should be usable
  140. result = df.eval("a + b * c", resolvers=[dict1])
  141. expected = df["a"] + df["b"] * dict1["c"]
  142. tm.assert_series_equal(result, expected)
  143. def test_eval_object_dtype_binop(self):
  144. # GH#24883
  145. df = DataFrame({"a1": ["Y", "N"]})
  146. res = df.eval("c = ((a1 == 'Y') & True)")
  147. expected = DataFrame({"a1": ["Y", "N"], "c": [True, False]})
  148. tm.assert_frame_equal(res, expected)
  149. class TestDataFrameQueryWithMultiIndex:
  150. def test_query_with_named_multiindex(self, parser, engine):
  151. skip_if_no_pandas_parser(parser)
  152. a = np.random.choice(["red", "green"], size=10)
  153. b = np.random.choice(["eggs", "ham"], size=10)
  154. index = MultiIndex.from_arrays([a, b], names=["color", "food"])
  155. df = DataFrame(np.random.randn(10, 2), index=index)
  156. ind = Series(
  157. df.index.get_level_values("color").values, index=index, name="color"
  158. )
  159. # equality
  160. res1 = df.query('color == "red"', parser=parser, engine=engine)
  161. res2 = df.query('"red" == color', parser=parser, engine=engine)
  162. exp = df[ind == "red"]
  163. tm.assert_frame_equal(res1, exp)
  164. tm.assert_frame_equal(res2, exp)
  165. # inequality
  166. res1 = df.query('color != "red"', parser=parser, engine=engine)
  167. res2 = df.query('"red" != color', parser=parser, engine=engine)
  168. exp = df[ind != "red"]
  169. tm.assert_frame_equal(res1, exp)
  170. tm.assert_frame_equal(res2, exp)
  171. # list equality (really just set membership)
  172. res1 = df.query('color == ["red"]', parser=parser, engine=engine)
  173. res2 = df.query('["red"] == color', parser=parser, engine=engine)
  174. exp = df[ind.isin(["red"])]
  175. tm.assert_frame_equal(res1, exp)
  176. tm.assert_frame_equal(res2, exp)
  177. res1 = df.query('color != ["red"]', parser=parser, engine=engine)
  178. res2 = df.query('["red"] != color', parser=parser, engine=engine)
  179. exp = df[~ind.isin(["red"])]
  180. tm.assert_frame_equal(res1, exp)
  181. tm.assert_frame_equal(res2, exp)
  182. # in/not in ops
  183. res1 = df.query('["red"] in color', parser=parser, engine=engine)
  184. res2 = df.query('"red" in color', parser=parser, engine=engine)
  185. exp = df[ind.isin(["red"])]
  186. tm.assert_frame_equal(res1, exp)
  187. tm.assert_frame_equal(res2, exp)
  188. res1 = df.query('["red"] not in color', parser=parser, engine=engine)
  189. res2 = df.query('"red" not in color', parser=parser, engine=engine)
  190. exp = df[~ind.isin(["red"])]
  191. tm.assert_frame_equal(res1, exp)
  192. tm.assert_frame_equal(res2, exp)
  193. def test_query_with_unnamed_multiindex(self, parser, engine):
  194. skip_if_no_pandas_parser(parser)
  195. a = np.random.choice(["red", "green"], size=10)
  196. b = np.random.choice(["eggs", "ham"], size=10)
  197. index = MultiIndex.from_arrays([a, b])
  198. df = DataFrame(np.random.randn(10, 2), index=index)
  199. ind = Series(df.index.get_level_values(0).values, index=index)
  200. res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
  201. res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine)
  202. exp = df[ind == "red"]
  203. tm.assert_frame_equal(res1, exp)
  204. tm.assert_frame_equal(res2, exp)
  205. # inequality
  206. res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
  207. res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine)
  208. exp = df[ind != "red"]
  209. tm.assert_frame_equal(res1, exp)
  210. tm.assert_frame_equal(res2, exp)
  211. # list equality (really just set membership)
  212. res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine)
  213. res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine)
  214. exp = df[ind.isin(["red"])]
  215. tm.assert_frame_equal(res1, exp)
  216. tm.assert_frame_equal(res2, exp)
  217. res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine)
  218. res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine)
  219. exp = df[~ind.isin(["red"])]
  220. tm.assert_frame_equal(res1, exp)
  221. tm.assert_frame_equal(res2, exp)
  222. # in/not in ops
  223. res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine)
  224. res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine)
  225. exp = df[ind.isin(["red"])]
  226. tm.assert_frame_equal(res1, exp)
  227. tm.assert_frame_equal(res2, exp)
  228. res1 = df.query('["red"] not in ilevel_0', parser=parser, engine=engine)
  229. res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine)
  230. exp = df[~ind.isin(["red"])]
  231. tm.assert_frame_equal(res1, exp)
  232. tm.assert_frame_equal(res2, exp)
  233. # ## LEVEL 1
  234. ind = Series(df.index.get_level_values(1).values, index=index)
  235. res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine)
  236. res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine)
  237. exp = df[ind == "eggs"]
  238. tm.assert_frame_equal(res1, exp)
  239. tm.assert_frame_equal(res2, exp)
  240. # inequality
  241. res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine)
  242. res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine)
  243. exp = df[ind != "eggs"]
  244. tm.assert_frame_equal(res1, exp)
  245. tm.assert_frame_equal(res2, exp)
  246. # list equality (really just set membership)
  247. res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine)
  248. res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine)
  249. exp = df[ind.isin(["eggs"])]
  250. tm.assert_frame_equal(res1, exp)
  251. tm.assert_frame_equal(res2, exp)
  252. res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine)
  253. res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine)
  254. exp = df[~ind.isin(["eggs"])]
  255. tm.assert_frame_equal(res1, exp)
  256. tm.assert_frame_equal(res2, exp)
  257. # in/not in ops
  258. res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine)
  259. res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine)
  260. exp = df[ind.isin(["eggs"])]
  261. tm.assert_frame_equal(res1, exp)
  262. tm.assert_frame_equal(res2, exp)
  263. res1 = df.query('["eggs"] not in ilevel_1', parser=parser, engine=engine)
  264. res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine)
  265. exp = df[~ind.isin(["eggs"])]
  266. tm.assert_frame_equal(res1, exp)
  267. tm.assert_frame_equal(res2, exp)
  268. def test_query_with_partially_named_multiindex(self, parser, engine):
  269. skip_if_no_pandas_parser(parser)
  270. a = np.random.choice(["red", "green"], size=10)
  271. b = np.arange(10)
  272. index = MultiIndex.from_arrays([a, b])
  273. index.names = [None, "rating"]
  274. df = DataFrame(np.random.randn(10, 2), index=index)
  275. res = df.query("rating == 1", parser=parser, engine=engine)
  276. ind = Series(
  277. df.index.get_level_values("rating").values, index=index, name="rating"
  278. )
  279. exp = df[ind == 1]
  280. tm.assert_frame_equal(res, exp)
  281. res = df.query("rating != 1", parser=parser, engine=engine)
  282. ind = Series(
  283. df.index.get_level_values("rating").values, index=index, name="rating"
  284. )
  285. exp = df[ind != 1]
  286. tm.assert_frame_equal(res, exp)
  287. res = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
  288. ind = Series(df.index.get_level_values(0).values, index=index)
  289. exp = df[ind == "red"]
  290. tm.assert_frame_equal(res, exp)
  291. res = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
  292. ind = Series(df.index.get_level_values(0).values, index=index)
  293. exp = df[ind != "red"]
  294. tm.assert_frame_equal(res, exp)
  295. def test_query_multiindex_get_index_resolvers(self):
  296. df = tm.makeCustomDataframe(
  297. 10, 3, r_idx_nlevels=2, r_idx_names=["spam", "eggs"]
  298. )
  299. resolvers = df._get_index_resolvers()
  300. def to_series(mi, level):
  301. level_values = mi.get_level_values(level)
  302. s = level_values.to_series()
  303. s.index = mi
  304. return s
  305. col_series = df.columns.to_series()
  306. expected = {
  307. "index": df.index,
  308. "columns": col_series,
  309. "spam": to_series(df.index, "spam"),
  310. "eggs": to_series(df.index, "eggs"),
  311. "C0": col_series,
  312. }
  313. for k, v in resolvers.items():
  314. if isinstance(v, Index):
  315. assert v.is_(expected[k])
  316. elif isinstance(v, Series):
  317. tm.assert_series_equal(v, expected[k])
  318. else:
  319. raise AssertionError("object must be a Series or Index")
  320. @td.skip_if_no_ne
  321. class TestDataFrameQueryNumExprPandas:
  322. @classmethod
  323. def setup_class(cls):
  324. cls.engine = "numexpr"
  325. cls.parser = "pandas"
  326. @classmethod
  327. def teardown_class(cls):
  328. del cls.engine, cls.parser
  329. def test_date_query_with_attribute_access(self):
  330. engine, parser = self.engine, self.parser
  331. skip_if_no_pandas_parser(parser)
  332. df = DataFrame(np.random.randn(5, 3))
  333. df["dates1"] = date_range("1/1/2012", periods=5)
  334. df["dates2"] = date_range("1/1/2013", periods=5)
  335. df["dates3"] = date_range("1/1/2014", periods=5)
  336. res = df.query(
  337. "@df.dates1 < 20130101 < @df.dates3", engine=engine, parser=parser
  338. )
  339. expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
  340. tm.assert_frame_equal(res, expec)
  341. def test_date_query_no_attribute_access(self):
  342. engine, parser = self.engine, self.parser
  343. df = DataFrame(np.random.randn(5, 3))
  344. df["dates1"] = date_range("1/1/2012", periods=5)
  345. df["dates2"] = date_range("1/1/2013", periods=5)
  346. df["dates3"] = date_range("1/1/2014", periods=5)
  347. res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
  348. expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
  349. tm.assert_frame_equal(res, expec)
  350. def test_date_query_with_NaT(self):
  351. engine, parser = self.engine, self.parser
  352. n = 10
  353. df = DataFrame(np.random.randn(n, 3))
  354. df["dates1"] = date_range("1/1/2012", periods=n)
  355. df["dates2"] = date_range("1/1/2013", periods=n)
  356. df["dates3"] = date_range("1/1/2014", periods=n)
  357. df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
  358. df.loc[np.random.rand(n) > 0.5, "dates3"] = pd.NaT
  359. res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
  360. expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
  361. tm.assert_frame_equal(res, expec)
  362. def test_date_index_query(self):
  363. engine, parser = self.engine, self.parser
  364. n = 10
  365. df = DataFrame(np.random.randn(n, 3))
  366. df["dates1"] = date_range("1/1/2012", periods=n)
  367. df["dates3"] = date_range("1/1/2014", periods=n)
  368. return_value = df.set_index("dates1", inplace=True, drop=True)
  369. assert return_value is None
  370. res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
  371. expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
  372. tm.assert_frame_equal(res, expec)
  373. def test_date_index_query_with_NaT(self):
  374. engine, parser = self.engine, self.parser
  375. n = 10
  376. # Cast to object to avoid implicit cast when setting entry to pd.NaT below
  377. df = DataFrame(np.random.randn(n, 3)).astype({0: object})
  378. df["dates1"] = date_range("1/1/2012", periods=n)
  379. df["dates3"] = date_range("1/1/2014", periods=n)
  380. df.iloc[0, 0] = pd.NaT
  381. return_value = df.set_index("dates1", inplace=True, drop=True)
  382. assert return_value is None
  383. res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
  384. expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
  385. tm.assert_frame_equal(res, expec)
  386. def test_date_index_query_with_NaT_duplicates(self):
  387. engine, parser = self.engine, self.parser
  388. n = 10
  389. d = {}
  390. d["dates1"] = date_range("1/1/2012", periods=n)
  391. d["dates3"] = date_range("1/1/2014", periods=n)
  392. df = DataFrame(d)
  393. df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
  394. return_value = df.set_index("dates1", inplace=True, drop=True)
  395. assert return_value is None
  396. res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
  397. expec = df[(df.index.to_series() < "20130101") & ("20130101" < df.dates3)]
  398. tm.assert_frame_equal(res, expec)
  399. def test_date_query_with_non_date(self):
  400. engine, parser = self.engine, self.parser
  401. n = 10
  402. df = DataFrame(
  403. {"dates": date_range("1/1/2012", periods=n), "nondate": np.arange(n)}
  404. )
  405. result = df.query("dates == nondate", parser=parser, engine=engine)
  406. assert len(result) == 0
  407. result = df.query("dates != nondate", parser=parser, engine=engine)
  408. tm.assert_frame_equal(result, df)
  409. msg = r"Invalid comparison between dtype=datetime64\[ns\] and ndarray"
  410. for op in ["<", ">", "<=", ">="]:
  411. with pytest.raises(TypeError, match=msg):
  412. df.query(f"dates {op} nondate", parser=parser, engine=engine)
  413. def test_query_syntax_error(self):
  414. engine, parser = self.engine, self.parser
  415. df = DataFrame({"i": range(10), "+": range(3, 13), "r": range(4, 14)})
  416. msg = "invalid syntax"
  417. with pytest.raises(SyntaxError, match=msg):
  418. df.query("i - +", engine=engine, parser=parser)
  419. def test_query_scope(self):
  420. engine, parser = self.engine, self.parser
  421. skip_if_no_pandas_parser(parser)
  422. df = DataFrame(np.random.randn(20, 2), columns=list("ab"))
  423. a, b = 1, 2 # noqa:F841
  424. res = df.query("a > b", engine=engine, parser=parser)
  425. expected = df[df.a > df.b]
  426. tm.assert_frame_equal(res, expected)
  427. res = df.query("@a > b", engine=engine, parser=parser)
  428. expected = df[a > df.b]
  429. tm.assert_frame_equal(res, expected)
  430. # no local variable c
  431. with pytest.raises(
  432. UndefinedVariableError, match="local variable 'c' is not defined"
  433. ):
  434. df.query("@a > b > @c", engine=engine, parser=parser)
  435. # no column named 'c'
  436. with pytest.raises(UndefinedVariableError, match="name 'c' is not defined"):
  437. df.query("@a > b > c", engine=engine, parser=parser)
  438. def test_query_doesnt_pickup_local(self):
  439. engine, parser = self.engine, self.parser
  440. n = m = 10
  441. df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc"))
  442. # we don't pick up the local 'sin'
  443. with pytest.raises(UndefinedVariableError, match="name 'sin' is not defined"):
  444. df.query("sin > 5", engine=engine, parser=parser)
  445. def test_query_builtin(self):
  446. engine, parser = self.engine, self.parser
  447. n = m = 10
  448. df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc"))
  449. df.index.name = "sin"
  450. msg = "Variables in expression.+"
  451. with pytest.raises(NumExprClobberingError, match=msg):
  452. df.query("sin > 5", engine=engine, parser=parser)
  453. def test_query(self):
  454. engine, parser = self.engine, self.parser
  455. df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"])
  456. tm.assert_frame_equal(
  457. df.query("a < b", engine=engine, parser=parser), df[df.a < df.b]
  458. )
  459. tm.assert_frame_equal(
  460. df.query("a + b > b * c", engine=engine, parser=parser),
  461. df[df.a + df.b > df.b * df.c],
  462. )
  463. def test_query_index_with_name(self):
  464. engine, parser = self.engine, self.parser
  465. df = DataFrame(
  466. np.random.randint(10, size=(10, 3)),
  467. index=Index(range(10), name="blob"),
  468. columns=["a", "b", "c"],
  469. )
  470. res = df.query("(blob < 5) & (a < b)", engine=engine, parser=parser)
  471. expec = df[(df.index < 5) & (df.a < df.b)]
  472. tm.assert_frame_equal(res, expec)
  473. res = df.query("blob < b", engine=engine, parser=parser)
  474. expec = df[df.index < df.b]
  475. tm.assert_frame_equal(res, expec)
  476. def test_query_index_without_name(self):
  477. engine, parser = self.engine, self.parser
  478. df = DataFrame(
  479. np.random.randint(10, size=(10, 3)),
  480. index=range(10),
  481. columns=["a", "b", "c"],
  482. )
  483. # "index" should refer to the index
  484. res = df.query("index < b", engine=engine, parser=parser)
  485. expec = df[df.index < df.b]
  486. tm.assert_frame_equal(res, expec)
  487. # test against a scalar
  488. res = df.query("index < 5", engine=engine, parser=parser)
  489. expec = df[df.index < 5]
  490. tm.assert_frame_equal(res, expec)
  491. def test_nested_scope(self):
  492. engine = self.engine
  493. parser = self.parser
  494. skip_if_no_pandas_parser(parser)
  495. df = DataFrame(np.random.randn(5, 3))
  496. df2 = DataFrame(np.random.randn(5, 3))
  497. expected = df[(df > 0) & (df2 > 0)]
  498. result = df.query("(@df > 0) & (@df2 > 0)", engine=engine, parser=parser)
  499. tm.assert_frame_equal(result, expected)
  500. result = pd.eval("df[df > 0 and df2 > 0]", engine=engine, parser=parser)
  501. tm.assert_frame_equal(result, expected)
  502. result = pd.eval(
  503. "df[df > 0 and df2 > 0 and df[df > 0] > 0]", engine=engine, parser=parser
  504. )
  505. expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
  506. tm.assert_frame_equal(result, expected)
  507. result = pd.eval("df[(df>0) & (df2>0)]", engine=engine, parser=parser)
  508. expected = df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser)
  509. tm.assert_frame_equal(result, expected)
  510. def test_nested_raises_on_local_self_reference(self):
  511. df = DataFrame(np.random.randn(5, 3))
  512. # can't reference ourself b/c we're a local so @ is necessary
  513. with pytest.raises(UndefinedVariableError, match="name 'df' is not defined"):
  514. df.query("df > 0", engine=self.engine, parser=self.parser)
  515. def test_local_syntax(self):
  516. skip_if_no_pandas_parser(self.parser)
  517. engine, parser = self.engine, self.parser
  518. df = DataFrame(np.random.randn(100, 10), columns=list("abcdefghij"))
  519. b = 1
  520. expect = df[df.a < b]
  521. result = df.query("a < @b", engine=engine, parser=parser)
  522. tm.assert_frame_equal(result, expect)
  523. expect = df[df.a < df.b]
  524. result = df.query("a < b", engine=engine, parser=parser)
  525. tm.assert_frame_equal(result, expect)
  526. def test_chained_cmp_and_in(self):
  527. skip_if_no_pandas_parser(self.parser)
  528. engine, parser = self.engine, self.parser
  529. cols = list("abc")
  530. df = DataFrame(np.random.randn(100, len(cols)), columns=cols)
  531. res = df.query(
  532. "a < b < c and a not in b not in c", engine=engine, parser=parser
  533. )
  534. ind = (df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b)
  535. expec = df[ind]
  536. tm.assert_frame_equal(res, expec)
  537. def test_local_variable_with_in(self):
  538. engine, parser = self.engine, self.parser
  539. skip_if_no_pandas_parser(parser)
  540. a = Series(np.random.randint(3, size=15), name="a")
  541. b = Series(np.random.randint(10, size=15), name="b")
  542. df = DataFrame({"a": a, "b": b})
  543. expected = df.loc[(df.b - 1).isin(a)]
  544. result = df.query("b - 1 in a", engine=engine, parser=parser)
  545. tm.assert_frame_equal(expected, result)
  546. b = Series(np.random.randint(10, size=15), name="b")
  547. expected = df.loc[(b - 1).isin(a)]
  548. result = df.query("@b - 1 in a", engine=engine, parser=parser)
  549. tm.assert_frame_equal(expected, result)
  550. def test_at_inside_string(self):
  551. engine, parser = self.engine, self.parser
  552. skip_if_no_pandas_parser(parser)
  553. c = 1 # noqa:F841
  554. df = DataFrame({"a": ["a", "a", "b", "b", "@c", "@c"]})
  555. result = df.query('a == "@c"', engine=engine, parser=parser)
  556. expected = df[df.a == "@c"]
  557. tm.assert_frame_equal(result, expected)
  558. def test_query_undefined_local(self):
  559. engine, parser = self.engine, self.parser
  560. skip_if_no_pandas_parser(parser)
  561. df = DataFrame(np.random.rand(10, 2), columns=list("ab"))
  562. with pytest.raises(
  563. UndefinedVariableError, match="local variable 'c' is not defined"
  564. ):
  565. df.query("a == @c", engine=engine, parser=parser)
  566. def test_index_resolvers_come_after_columns_with_the_same_name(self):
  567. n = 1 # noqa:F841
  568. a = np.r_[20:101:20]
  569. df = DataFrame({"index": a, "b": np.random.randn(a.size)})
  570. df.index.name = "index"
  571. result = df.query("index > 5", engine=self.engine, parser=self.parser)
  572. expected = df[df["index"] > 5]
  573. tm.assert_frame_equal(result, expected)
  574. df = DataFrame({"index": a, "b": np.random.randn(a.size)})
  575. result = df.query("ilevel_0 > 5", engine=self.engine, parser=self.parser)
  576. expected = df.loc[df.index[df.index > 5]]
  577. tm.assert_frame_equal(result, expected)
  578. df = DataFrame({"a": a, "b": np.random.randn(a.size)})
  579. df.index.name = "a"
  580. result = df.query("a > 5", engine=self.engine, parser=self.parser)
  581. expected = df[df.a > 5]
  582. tm.assert_frame_equal(result, expected)
  583. result = df.query("index > 5", engine=self.engine, parser=self.parser)
  584. expected = df.loc[df.index[df.index > 5]]
  585. tm.assert_frame_equal(result, expected)
  586. @pytest.mark.parametrize("op, f", [["==", operator.eq], ["!=", operator.ne]])
  587. def test_inf(self, op, f):
  588. n = 10
  589. df = DataFrame({"a": np.random.rand(n), "b": np.random.rand(n)})
  590. df.loc[::2, 0] = np.inf
  591. q = f"a {op} inf"
  592. expected = df[f(df.a, np.inf)]
  593. result = df.query(q, engine=self.engine, parser=self.parser)
  594. tm.assert_frame_equal(result, expected)
  595. def test_check_tz_aware_index_query(self, tz_aware_fixture):
  596. # https://github.com/pandas-dev/pandas/issues/29463
  597. tz = tz_aware_fixture
  598. df_index = date_range(
  599. start="2019-01-01", freq="1d", periods=10, tz=tz, name="time"
  600. )
  601. expected = DataFrame(index=df_index)
  602. df = DataFrame(index=df_index)
  603. result = df.query('"2018-01-03 00:00:00+00" < time')
  604. tm.assert_frame_equal(result, expected)
  605. expected = DataFrame(df_index)
  606. result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
  607. tm.assert_frame_equal(result, expected)
  608. def test_method_calls_in_query(self):
  609. # https://github.com/pandas-dev/pandas/issues/22435
  610. n = 10
  611. df = DataFrame({"a": 2 * np.random.rand(n), "b": np.random.rand(n)})
  612. expected = df[df["a"].astype("int") == 0]
  613. result = df.query(
  614. "a.astype('int') == 0", engine=self.engine, parser=self.parser
  615. )
  616. tm.assert_frame_equal(result, expected)
  617. df = DataFrame(
  618. {
  619. "a": np.where(np.random.rand(n) < 0.5, np.nan, np.random.randn(n)),
  620. "b": np.random.randn(n),
  621. }
  622. )
  623. expected = df[df["a"].notnull()]
  624. result = df.query("a.notnull()", engine=self.engine, parser=self.parser)
  625. tm.assert_frame_equal(result, expected)
  626. @td.skip_if_no_ne
  627. class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas):
  628. @classmethod
  629. def setup_class(cls):
  630. super().setup_class()
  631. cls.engine = "numexpr"
  632. cls.parser = "python"
  633. def test_date_query_no_attribute_access(self):
  634. engine, parser = self.engine, self.parser
  635. df = DataFrame(np.random.randn(5, 3))
  636. df["dates1"] = date_range("1/1/2012", periods=5)
  637. df["dates2"] = date_range("1/1/2013", periods=5)
  638. df["dates3"] = date_range("1/1/2014", periods=5)
  639. res = df.query(
  640. "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
  641. )
  642. expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
  643. tm.assert_frame_equal(res, expec)
  644. def test_date_query_with_NaT(self):
  645. engine, parser = self.engine, self.parser
  646. n = 10
  647. df = DataFrame(np.random.randn(n, 3))
  648. df["dates1"] = date_range("1/1/2012", periods=n)
  649. df["dates2"] = date_range("1/1/2013", periods=n)
  650. df["dates3"] = date_range("1/1/2014", periods=n)
  651. df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
  652. df.loc[np.random.rand(n) > 0.5, "dates3"] = pd.NaT
  653. res = df.query(
  654. "(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
  655. )
  656. expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
  657. tm.assert_frame_equal(res, expec)
  658. def test_date_index_query(self):
  659. engine, parser = self.engine, self.parser
  660. n = 10
  661. df = DataFrame(np.random.randn(n, 3))
  662. df["dates1"] = date_range("1/1/2012", periods=n)
  663. df["dates3"] = date_range("1/1/2014", periods=n)
  664. return_value = df.set_index("dates1", inplace=True, drop=True)
  665. assert return_value is None
  666. res = df.query(
  667. "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
  668. )
  669. expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
  670. tm.assert_frame_equal(res, expec)
  671. def test_date_index_query_with_NaT(self):
  672. engine, parser = self.engine, self.parser
  673. n = 10
  674. # Cast to object to avoid implicit cast when setting entry to pd.NaT below
  675. df = DataFrame(np.random.randn(n, 3)).astype({0: object})
  676. df["dates1"] = date_range("1/1/2012", periods=n)
  677. df["dates3"] = date_range("1/1/2014", periods=n)
  678. df.iloc[0, 0] = pd.NaT
  679. return_value = df.set_index("dates1", inplace=True, drop=True)
  680. assert return_value is None
  681. res = df.query(
  682. "(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
  683. )
  684. expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
  685. tm.assert_frame_equal(res, expec)
  686. def test_date_index_query_with_NaT_duplicates(self):
  687. engine, parser = self.engine, self.parser
  688. n = 10
  689. df = DataFrame(np.random.randn(n, 3))
  690. df["dates1"] = date_range("1/1/2012", periods=n)
  691. df["dates3"] = date_range("1/1/2014", periods=n)
  692. df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
  693. return_value = df.set_index("dates1", inplace=True, drop=True)
  694. assert return_value is None
  695. msg = r"'BoolOp' nodes are not implemented"
  696. with pytest.raises(NotImplementedError, match=msg):
  697. df.query("index < 20130101 < dates3", engine=engine, parser=parser)
  698. def test_nested_scope(self):
  699. engine = self.engine
  700. parser = self.parser
  701. # smoke test
  702. x = 1 # noqa:F841
  703. result = pd.eval("x + 1", engine=engine, parser=parser)
  704. assert result == 2
  705. df = DataFrame(np.random.randn(5, 3))
  706. df2 = DataFrame(np.random.randn(5, 3))
  707. # don't have the pandas parser
  708. msg = r"The '@' prefix is only supported by the pandas parser"
  709. with pytest.raises(SyntaxError, match=msg):
  710. df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser)
  711. with pytest.raises(UndefinedVariableError, match="name 'df' is not defined"):
  712. df.query("(df>0) & (df2>0)", engine=engine, parser=parser)
  713. expected = df[(df > 0) & (df2 > 0)]
  714. result = pd.eval("df[(df > 0) & (df2 > 0)]", engine=engine, parser=parser)
  715. tm.assert_frame_equal(expected, result)
  716. expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
  717. result = pd.eval(
  718. "df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]", engine=engine, parser=parser
  719. )
  720. tm.assert_frame_equal(expected, result)
  721. def test_query_numexpr_with_min_and_max_columns(self):
  722. df = DataFrame({"min": [1, 2, 3], "max": [4, 5, 6]})
  723. regex_to_match = (
  724. r"Variables in expression \"\(min\) == \(1\)\" "
  725. r"overlap with builtins: \('min'\)"
  726. )
  727. with pytest.raises(NumExprClobberingError, match=regex_to_match):
  728. df.query("min == 1")
  729. regex_to_match = (
  730. r"Variables in expression \"\(max\) == \(1\)\" "
  731. r"overlap with builtins: \('max'\)"
  732. )
  733. with pytest.raises(NumExprClobberingError, match=regex_to_match):
  734. df.query("max == 1")
  735. class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas):
  736. @classmethod
  737. def setup_class(cls):
  738. super().setup_class()
  739. cls.engine = "python"
  740. cls.parser = "pandas"
  741. def test_query_builtin(self):
  742. engine, parser = self.engine, self.parser
  743. n = m = 10
  744. df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc"))
  745. df.index.name = "sin"
  746. expected = df[df.index > 5]
  747. result = df.query("sin > 5", engine=engine, parser=parser)
  748. tm.assert_frame_equal(expected, result)
  749. class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython):
  750. @classmethod
  751. def setup_class(cls):
  752. super().setup_class()
  753. cls.engine = cls.parser = "python"
  754. def test_query_builtin(self):
  755. engine, parser = self.engine, self.parser
  756. n = m = 10
  757. df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc"))
  758. df.index.name = "sin"
  759. expected = df[df.index > 5]
  760. result = df.query("sin > 5", engine=engine, parser=parser)
  761. tm.assert_frame_equal(expected, result)
  762. class TestDataFrameQueryStrings:
  763. def test_str_query_method(self, parser, engine):
  764. df = DataFrame(np.random.randn(10, 1), columns=["b"])
  765. df["strings"] = Series(list("aabbccddee"))
  766. expect = df[df.strings == "a"]
  767. if parser != "pandas":
  768. col = "strings"
  769. lst = '"a"'
  770. lhs = [col] * 2 + [lst] * 2
  771. rhs = lhs[::-1]
  772. eq, ne = "==", "!="
  773. ops = 2 * ([eq] + [ne])
  774. msg = r"'(Not)?In' nodes are not implemented"
  775. for lhs, op, rhs in zip(lhs, ops, rhs):
  776. ex = f"{lhs} {op} {rhs}"
  777. with pytest.raises(NotImplementedError, match=msg):
  778. df.query(
  779. ex,
  780. engine=engine,
  781. parser=parser,
  782. local_dict={"strings": df.strings},
  783. )
  784. else:
  785. res = df.query('"a" == strings', engine=engine, parser=parser)
  786. tm.assert_frame_equal(res, expect)
  787. res = df.query('strings == "a"', engine=engine, parser=parser)
  788. tm.assert_frame_equal(res, expect)
  789. tm.assert_frame_equal(res, df[df.strings.isin(["a"])])
  790. expect = df[df.strings != "a"]
  791. res = df.query('strings != "a"', engine=engine, parser=parser)
  792. tm.assert_frame_equal(res, expect)
  793. res = df.query('"a" != strings', engine=engine, parser=parser)
  794. tm.assert_frame_equal(res, expect)
  795. tm.assert_frame_equal(res, df[~df.strings.isin(["a"])])
  796. def test_str_list_query_method(self, parser, engine):
  797. df = DataFrame(np.random.randn(10, 1), columns=["b"])
  798. df["strings"] = Series(list("aabbccddee"))
  799. expect = df[df.strings.isin(["a", "b"])]
  800. if parser != "pandas":
  801. col = "strings"
  802. lst = '["a", "b"]'
  803. lhs = [col] * 2 + [lst] * 2
  804. rhs = lhs[::-1]
  805. eq, ne = "==", "!="
  806. ops = 2 * ([eq] + [ne])
  807. msg = r"'(Not)?In' nodes are not implemented"
  808. for lhs, op, rhs in zip(lhs, ops, rhs):
  809. ex = f"{lhs} {op} {rhs}"
  810. with pytest.raises(NotImplementedError, match=msg):
  811. df.query(ex, engine=engine, parser=parser)
  812. else:
  813. res = df.query('strings == ["a", "b"]', engine=engine, parser=parser)
  814. tm.assert_frame_equal(res, expect)
  815. res = df.query('["a", "b"] == strings', engine=engine, parser=parser)
  816. tm.assert_frame_equal(res, expect)
  817. expect = df[~df.strings.isin(["a", "b"])]
  818. res = df.query('strings != ["a", "b"]', engine=engine, parser=parser)
  819. tm.assert_frame_equal(res, expect)
  820. res = df.query('["a", "b"] != strings', engine=engine, parser=parser)
  821. tm.assert_frame_equal(res, expect)
  822. def test_query_with_string_columns(self, parser, engine):
  823. df = DataFrame(
  824. {
  825. "a": list("aaaabbbbcccc"),
  826. "b": list("aabbccddeeff"),
  827. "c": np.random.randint(5, size=12),
  828. "d": np.random.randint(9, size=12),
  829. }
  830. )
  831. if parser == "pandas":
  832. res = df.query("a in b", parser=parser, engine=engine)
  833. expec = df[df.a.isin(df.b)]
  834. tm.assert_frame_equal(res, expec)
  835. res = df.query("a in b and c < d", parser=parser, engine=engine)
  836. expec = df[df.a.isin(df.b) & (df.c < df.d)]
  837. tm.assert_frame_equal(res, expec)
  838. else:
  839. msg = r"'(Not)?In' nodes are not implemented"
  840. with pytest.raises(NotImplementedError, match=msg):
  841. df.query("a in b", parser=parser, engine=engine)
  842. msg = r"'BoolOp' nodes are not implemented"
  843. with pytest.raises(NotImplementedError, match=msg):
  844. df.query("a in b and c < d", parser=parser, engine=engine)
  845. def test_object_array_eq_ne(self, parser, engine):
  846. df = DataFrame(
  847. {
  848. "a": list("aaaabbbbcccc"),
  849. "b": list("aabbccddeeff"),
  850. "c": np.random.randint(5, size=12),
  851. "d": np.random.randint(9, size=12),
  852. }
  853. )
  854. res = df.query("a == b", parser=parser, engine=engine)
  855. exp = df[df.a == df.b]
  856. tm.assert_frame_equal(res, exp)
  857. res = df.query("a != b", parser=parser, engine=engine)
  858. exp = df[df.a != df.b]
  859. tm.assert_frame_equal(res, exp)
  860. def test_query_with_nested_strings(self, parser, engine):
  861. skip_if_no_pandas_parser(parser)
  862. events = [
  863. f"page {n} {act}" for n in range(1, 4) for act in ["load", "exit"]
  864. ] * 2
  865. stamps1 = date_range("2014-01-01 0:00:01", freq="30s", periods=6)
  866. stamps2 = date_range("2014-02-01 1:00:01", freq="30s", periods=6)
  867. df = DataFrame(
  868. {
  869. "id": np.arange(1, 7).repeat(2),
  870. "event": events,
  871. "timestamp": stamps1.append(stamps2),
  872. }
  873. )
  874. expected = df[df.event == '"page 1 load"']
  875. res = df.query("""'"page 1 load"' in event""", parser=parser, engine=engine)
  876. tm.assert_frame_equal(expected, res)
  877. def test_query_with_nested_special_character(self, parser, engine):
  878. skip_if_no_pandas_parser(parser)
  879. df = DataFrame({"a": ["a", "b", "test & test"], "b": [1, 2, 3]})
  880. res = df.query('a == "test & test"', parser=parser, engine=engine)
  881. expec = df[df.a == "test & test"]
  882. tm.assert_frame_equal(res, expec)
  883. @pytest.mark.parametrize(
  884. "op, func",
  885. [
  886. ["<", operator.lt],
  887. [">", operator.gt],
  888. ["<=", operator.le],
  889. [">=", operator.ge],
  890. ],
  891. )
  892. def test_query_lex_compare_strings(self, parser, engine, op, func):
  893. a = Series(np.random.choice(list("abcde"), 20))
  894. b = Series(np.arange(a.size))
  895. df = DataFrame({"X": a, "Y": b})
  896. res = df.query(f'X {op} "d"', engine=engine, parser=parser)
  897. expected = df[func(df.X, "d")]
  898. tm.assert_frame_equal(res, expected)
  899. def test_query_single_element_booleans(self, parser, engine):
  900. columns = "bid", "bidsize", "ask", "asksize"
  901. data = np.random.randint(2, size=(1, len(columns))).astype(bool)
  902. df = DataFrame(data, columns=columns)
  903. res = df.query("bid & ask", engine=engine, parser=parser)
  904. expected = df[df.bid & df.ask]
  905. tm.assert_frame_equal(res, expected)
  906. def test_query_string_scalar_variable(self, parser, engine):
  907. skip_if_no_pandas_parser(parser)
  908. df = DataFrame(
  909. {
  910. "Symbol": ["BUD US", "BUD US", "IBM US", "IBM US"],
  911. "Price": [109.70, 109.72, 183.30, 183.35],
  912. }
  913. )
  914. e = df[df.Symbol == "BUD US"]
  915. symb = "BUD US" # noqa:F841
  916. r = df.query("Symbol == @symb", parser=parser, engine=engine)
  917. tm.assert_frame_equal(e, r)
  918. class TestDataFrameEvalWithFrame:
  919. @pytest.fixture
  920. def frame(self):
  921. return DataFrame(np.random.randn(10, 3), columns=list("abc"))
  922. def test_simple_expr(self, frame, parser, engine):
  923. res = frame.eval("a + b", engine=engine, parser=parser)
  924. expect = frame.a + frame.b
  925. tm.assert_series_equal(res, expect)
  926. def test_bool_arith_expr(self, frame, parser, engine):
  927. res = frame.eval("a[a < 1] + b", engine=engine, parser=parser)
  928. expect = frame.a[frame.a < 1] + frame.b
  929. tm.assert_series_equal(res, expect)
  930. @pytest.mark.parametrize("op", ["+", "-", "*", "/"])
  931. def test_invalid_type_for_operator_raises(self, parser, engine, op):
  932. df = DataFrame({"a": [1, 2], "b": ["c", "d"]})
  933. msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"
  934. with pytest.raises(TypeError, match=msg):
  935. df.eval(f"a {op} b", engine=engine, parser=parser)
  936. class TestDataFrameQueryBacktickQuoting:
  937. @pytest.fixture
  938. def df(self):
  939. """
  940. Yields a dataframe with strings that may or may not need escaping
  941. by backticks. The last two columns cannot be escaped by backticks
  942. and should raise a ValueError.
  943. """
  944. yield DataFrame(
  945. {
  946. "A": [1, 2, 3],
  947. "B B": [3, 2, 1],
  948. "C C": [4, 5, 6],
  949. "C C": [7, 4, 3],
  950. "C_C": [8, 9, 10],
  951. "D_D D": [11, 1, 101],
  952. "E.E": [6, 3, 5],
  953. "F-F": [8, 1, 10],
  954. "1e1": [2, 4, 8],
  955. "def": [10, 11, 2],
  956. "A (x)": [4, 1, 3],
  957. "B(x)": [1, 1, 5],
  958. "B (x)": [2, 7, 4],
  959. " &^ :!€$?(} > <++*'' ": [2, 5, 6],
  960. "": [10, 11, 1],
  961. " A": [4, 7, 9],
  962. " ": [1, 2, 1],
  963. "it's": [6, 3, 1],
  964. "that's": [9, 1, 8],
  965. "☺": [8, 7, 6],
  966. "foo#bar": [2, 4, 5],
  967. 1: [5, 7, 9],
  968. }
  969. )
  970. def test_single_backtick_variable_query(self, df):
  971. res = df.query("1 < `B B`")
  972. expect = df[1 < df["B B"]]
  973. tm.assert_frame_equal(res, expect)
  974. def test_two_backtick_variables_query(self, df):
  975. res = df.query("1 < `B B` and 4 < `C C`")
  976. expect = df[(1 < df["B B"]) & (4 < df["C C"])]
  977. tm.assert_frame_equal(res, expect)
  978. def test_single_backtick_variable_expr(self, df):
  979. res = df.eval("A + `B B`")
  980. expect = df["A"] + df["B B"]
  981. tm.assert_series_equal(res, expect)
  982. def test_two_backtick_variables_expr(self, df):
  983. res = df.eval("`B B` + `C C`")
  984. expect = df["B B"] + df["C C"]
  985. tm.assert_series_equal(res, expect)
  986. def test_already_underscore_variable(self, df):
  987. res = df.eval("`C_C` + A")
  988. expect = df["C_C"] + df["A"]
  989. tm.assert_series_equal(res, expect)
  990. def test_same_name_but_underscores(self, df):
  991. res = df.eval("C_C + `C C`")
  992. expect = df["C_C"] + df["C C"]
  993. tm.assert_series_equal(res, expect)
  994. def test_mixed_underscores_and_spaces(self, df):
  995. res = df.eval("A + `D_D D`")
  996. expect = df["A"] + df["D_D D"]
  997. tm.assert_series_equal(res, expect)
  998. def test_backtick_quote_name_with_no_spaces(self, df):
  999. res = df.eval("A + `C_C`")
  1000. expect = df["A"] + df["C_C"]
  1001. tm.assert_series_equal(res, expect)
  1002. def test_special_characters(self, df):
  1003. res = df.eval("`E.E` + `F-F` - A")
  1004. expect = df["E.E"] + df["F-F"] - df["A"]
  1005. tm.assert_series_equal(res, expect)
  1006. def test_start_with_digit(self, df):
  1007. res = df.eval("A + `1e1`")
  1008. expect = df["A"] + df["1e1"]
  1009. tm.assert_series_equal(res, expect)
  1010. def test_keyword(self, df):
  1011. res = df.eval("A + `def`")
  1012. expect = df["A"] + df["def"]
  1013. tm.assert_series_equal(res, expect)
  1014. def test_unneeded_quoting(self, df):
  1015. res = df.query("`A` > 2")
  1016. expect = df[df["A"] > 2]
  1017. tm.assert_frame_equal(res, expect)
  1018. def test_parenthesis(self, df):
  1019. res = df.query("`A (x)` > 2")
  1020. expect = df[df["A (x)"] > 2]
  1021. tm.assert_frame_equal(res, expect)
  1022. def test_empty_string(self, df):
  1023. res = df.query("`` > 5")
  1024. expect = df[df[""] > 5]
  1025. tm.assert_frame_equal(res, expect)
  1026. def test_multiple_spaces(self, df):
  1027. res = df.query("`C C` > 5")
  1028. expect = df[df["C C"] > 5]
  1029. tm.assert_frame_equal(res, expect)
  1030. def test_start_with_spaces(self, df):
  1031. res = df.eval("` A` + ` `")
  1032. expect = df[" A"] + df[" "]
  1033. tm.assert_series_equal(res, expect)
  1034. def test_lots_of_operators_string(self, df):
  1035. res = df.query("` &^ :!€$?(} > <++*'' ` > 4")
  1036. expect = df[df[" &^ :!€$?(} > <++*'' "] > 4]
  1037. tm.assert_frame_equal(res, expect)
  1038. def test_missing_attribute(self, df):
  1039. message = "module 'pandas' has no attribute 'thing'"
  1040. with pytest.raises(AttributeError, match=message):
  1041. df.eval("@pd.thing")
  1042. def test_failing_quote(self, df):
  1043. msg = r"(Could not convert ).*( to a valid Python identifier.)"
  1044. with pytest.raises(SyntaxError, match=msg):
  1045. df.query("`it's` > `that's`")
  1046. def test_failing_character_outside_range(self, df):
  1047. msg = r"(Could not convert ).*( to a valid Python identifier.)"
  1048. with pytest.raises(SyntaxError, match=msg):
  1049. df.query("`☺` > 4")
  1050. def test_failing_hashtag(self, df):
  1051. msg = "Failed to parse backticks"
  1052. with pytest.raises(SyntaxError, match=msg):
  1053. df.query("`foo#bar` > 4")
  1054. def test_call_non_named_expression(self, df):
  1055. """
  1056. Only attributes and variables ('named functions') can be called.
  1057. .__call__() is not an allowed attribute because that would allow
  1058. calling anything.
  1059. https://github.com/pandas-dev/pandas/pull/32460
  1060. """
  1061. def func(*_):
  1062. return 1
  1063. funcs = [func] # noqa:F841
  1064. df.eval("@func()")
  1065. with pytest.raises(TypeError, match="Only named functions are supported"):
  1066. df.eval("@funcs[0]()")
  1067. with pytest.raises(TypeError, match="Only named functions are supported"):
  1068. df.eval("@funcs[0].__call__()")
  1069. def test_ea_dtypes(self, any_numeric_ea_and_arrow_dtype):
  1070. # GH#29618
  1071. df = DataFrame(
  1072. [[1, 2], [3, 4]], columns=["a", "b"], dtype=any_numeric_ea_and_arrow_dtype
  1073. )
  1074. warning = RuntimeWarning if NUMEXPR_INSTALLED else None
  1075. with tm.assert_produces_warning(warning):
  1076. result = df.eval("c = b - a")
  1077. expected = DataFrame(
  1078. [[1, 2, 1], [3, 4, 1]],
  1079. columns=["a", "b", "c"],
  1080. dtype=any_numeric_ea_and_arrow_dtype,
  1081. )
  1082. tm.assert_frame_equal(result, expected)
  1083. def test_ea_dtypes_and_scalar(self):
  1084. # GH#29618
  1085. df = DataFrame([[1, 2], [3, 4]], columns=["a", "b"], dtype="Float64")
  1086. warning = RuntimeWarning if NUMEXPR_INSTALLED else None
  1087. with tm.assert_produces_warning(warning):
  1088. result = df.eval("c = b - 1")
  1089. expected = DataFrame(
  1090. [[1, 2, 1], [3, 4, 3]], columns=["a", "b", "c"], dtype="Float64"
  1091. )
  1092. tm.assert_frame_equal(result, expected)
  1093. def test_ea_dtypes_and_scalar_operation(self, any_numeric_ea_and_arrow_dtype):
  1094. # GH#29618
  1095. df = DataFrame(
  1096. [[1, 2], [3, 4]], columns=["a", "b"], dtype=any_numeric_ea_and_arrow_dtype
  1097. )
  1098. result = df.eval("c = 2 - 1")
  1099. expected = DataFrame(
  1100. {
  1101. "a": Series([1, 3], dtype=any_numeric_ea_and_arrow_dtype),
  1102. "b": Series([2, 4], dtype=any_numeric_ea_and_arrow_dtype),
  1103. "c": Series([1, 1], dtype=result["c"].dtype),
  1104. }
  1105. )
  1106. tm.assert_frame_equal(result, expected)
  1107. @pytest.mark.parametrize("dtype", ["int64", "Int64", "int64[pyarrow]"])
  1108. def test_query_ea_dtypes(self, dtype):
  1109. if dtype == "int64[pyarrow]":
  1110. pytest.importorskip("pyarrow")
  1111. # GH#50261
  1112. df = DataFrame({"a": Series([1, 2], dtype=dtype)})
  1113. ref = {2} # noqa:F841
  1114. warning = RuntimeWarning if dtype == "Int64" and NUMEXPR_INSTALLED else None
  1115. with tm.assert_produces_warning(warning):
  1116. result = df.query("a in @ref")
  1117. expected = DataFrame({"a": Series([2], dtype=dtype, index=[1])})
  1118. tm.assert_frame_equal(result, expected)
  1119. @pytest.mark.parametrize("engine", ["python", "numexpr"])
  1120. @pytest.mark.parametrize("dtype", ["int64", "Int64", "int64[pyarrow]"])
  1121. def test_query_ea_equality_comparison(self, dtype, engine):
  1122. # GH#50261
  1123. warning = RuntimeWarning if engine == "numexpr" else None
  1124. if engine == "numexpr" and not NUMEXPR_INSTALLED:
  1125. pytest.skip("numexpr not installed")
  1126. if dtype == "int64[pyarrow]":
  1127. pytest.importorskip("pyarrow")
  1128. df = DataFrame(
  1129. {"A": Series([1, 1, 2], dtype="Int64"), "B": Series([1, 2, 2], dtype=dtype)}
  1130. )
  1131. with tm.assert_produces_warning(warning):
  1132. result = df.query("A == B", engine=engine)
  1133. expected = DataFrame(
  1134. {
  1135. "A": Series([1, 2], dtype="Int64", index=[0, 2]),
  1136. "B": Series([1, 2], dtype=dtype, index=[0, 2]),
  1137. }
  1138. )
  1139. tm.assert_frame_equal(result, expected)