test_cython.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399
"""
test cython .agg behavior
"""
  4. import numpy as np
  5. import pytest
  6. from pandas.core.dtypes.common import (
  7. is_float_dtype,
  8. is_integer_dtype,
  9. )
  10. import pandas as pd
  11. from pandas import (
  12. DataFrame,
  13. Index,
  14. NaT,
  15. Series,
  16. Timedelta,
  17. Timestamp,
  18. bdate_range,
  19. )
  20. import pandas._testing as tm
  21. @pytest.mark.parametrize(
  22. "op_name",
  23. [
  24. "count",
  25. "sum",
  26. "std",
  27. "var",
  28. "sem",
  29. "mean",
  30. pytest.param(
  31. "median",
  32. # ignore mean of empty slice
  33. # and all-NaN
  34. marks=[pytest.mark.filterwarnings("ignore::RuntimeWarning")],
  35. ),
  36. "prod",
  37. "min",
  38. "max",
  39. ],
  40. )
  41. def test_cythonized_aggers(op_name):
  42. data = {
  43. "A": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1.0, np.nan, np.nan],
  44. "B": ["A", "B"] * 6,
  45. "C": np.random.randn(12),
  46. }
  47. df = DataFrame(data)
  48. df.loc[2:10:2, "C"] = np.nan
  49. op = lambda x: getattr(x, op_name)()
  50. # single column
  51. grouped = df.drop(["B"], axis=1).groupby("A")
  52. exp = {cat: op(group["C"]) for cat, group in grouped}
  53. exp = DataFrame({"C": exp})
  54. exp.index.name = "A"
  55. result = op(grouped)
  56. tm.assert_frame_equal(result, exp)
  57. # multiple columns
  58. grouped = df.groupby(["A", "B"])
  59. expd = {}
  60. for (cat1, cat2), group in grouped:
  61. expd.setdefault(cat1, {})[cat2] = op(group["C"])
  62. exp = DataFrame(expd).T.stack(dropna=False)
  63. exp.index.names = ["A", "B"]
  64. exp.name = "C"
  65. result = op(grouped)["C"]
  66. if op_name in ["sum", "prod"]:
  67. tm.assert_series_equal(result, exp)
  68. def test_cython_agg_boolean():
  69. frame = DataFrame(
  70. {
  71. "a": np.random.randint(0, 5, 50),
  72. "b": np.random.randint(0, 2, 50).astype("bool"),
  73. }
  74. )
  75. result = frame.groupby("a")["b"].mean()
  76. expected = frame.groupby("a")["b"].agg(np.mean)
  77. tm.assert_series_equal(result, expected)
  78. def test_cython_agg_nothing_to_agg():
  79. frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
  80. msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
  81. with pytest.raises(TypeError, match=msg):
  82. frame.groupby("a")["b"].mean(numeric_only=True)
  83. frame = DataFrame({"a": np.random.randint(0, 5, 50), "b": ["foo", "bar"] * 25})
  84. result = frame[["b"]].groupby(frame["a"]).mean(numeric_only=True)
  85. expected = DataFrame(
  86. [], index=frame["a"].sort_values().drop_duplicates(), columns=[]
  87. )
  88. tm.assert_frame_equal(result, expected)
  89. def test_cython_agg_nothing_to_agg_with_dates():
  90. frame = DataFrame(
  91. {
  92. "a": np.random.randint(0, 5, 50),
  93. "b": ["foo", "bar"] * 25,
  94. "dates": pd.date_range("now", periods=50, freq="T"),
  95. }
  96. )
  97. msg = "Cannot use numeric_only=True with SeriesGroupBy.mean and non-numeric dtypes"
  98. with pytest.raises(TypeError, match=msg):
  99. frame.groupby("b").dates.mean(numeric_only=True)
  100. def test_cython_agg_frame_columns():
  101. # #2113
  102. df = DataFrame({"x": [1, 2, 3], "y": [3, 4, 5]})
  103. df.groupby(level=0, axis="columns").mean()
  104. df.groupby(level=0, axis="columns").mean()
  105. df.groupby(level=0, axis="columns").mean()
  106. df.groupby(level=0, axis="columns").mean()
  107. def test_cython_agg_return_dict():
  108. # GH 16741
  109. df = DataFrame(
  110. {
  111. "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
  112. "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
  113. "C": np.random.randn(8),
  114. "D": np.random.randn(8),
  115. }
  116. )
  117. ts = df.groupby("A")["B"].agg(lambda x: x.value_counts().to_dict())
  118. expected = Series(
  119. [{"two": 1, "one": 1, "three": 1}, {"two": 2, "one": 2, "three": 1}],
  120. index=Index(["bar", "foo"], name="A"),
  121. name="B",
  122. )
  123. tm.assert_series_equal(ts, expected)
  124. def test_cython_fail_agg():
  125. dr = bdate_range("1/1/2000", periods=50)
  126. ts = Series(["A", "B", "C", "D", "E"] * 10, index=dr)
  127. grouped = ts.groupby(lambda x: x.month)
  128. summed = grouped.sum()
  129. expected = grouped.agg(np.sum)
  130. tm.assert_series_equal(summed, expected)
  131. @pytest.mark.parametrize(
  132. "op, targop",
  133. [
  134. ("mean", np.mean),
  135. ("median", np.median),
  136. ("var", np.var),
  137. ("sum", np.sum),
  138. ("prod", np.prod),
  139. ("min", np.min),
  140. ("max", np.max),
  141. ("first", lambda x: x.iloc[0]),
  142. ("last", lambda x: x.iloc[-1]),
  143. ],
  144. )
  145. def test__cython_agg_general(op, targop):
  146. df = DataFrame(np.random.randn(1000))
  147. labels = np.random.randint(0, 50, size=1000).astype(float)
  148. result = df.groupby(labels)._cython_agg_general(op, alt=None, numeric_only=True)
  149. expected = df.groupby(labels).agg(targop)
  150. tm.assert_frame_equal(result, expected)
  151. @pytest.mark.parametrize(
  152. "op, targop",
  153. [
  154. ("mean", np.mean),
  155. ("median", lambda x: np.median(x) if len(x) > 0 else np.nan),
  156. ("var", lambda x: np.var(x, ddof=1)),
  157. ("min", np.min),
  158. ("max", np.max),
  159. ],
  160. )
  161. def test_cython_agg_empty_buckets(op, targop, observed):
  162. df = DataFrame([11, 12, 13])
  163. grps = range(0, 55, 5)
  164. # calling _cython_agg_general directly, instead of via the user API
  165. # which sets different values for min_count, so do that here.
  166. g = df.groupby(pd.cut(df[0], grps), observed=observed)
  167. result = g._cython_agg_general(op, alt=None, numeric_only=True)
  168. g = df.groupby(pd.cut(df[0], grps), observed=observed)
  169. expected = g.agg(lambda x: targop(x))
  170. tm.assert_frame_equal(result, expected)
  171. def test_cython_agg_empty_buckets_nanops(observed):
  172. # GH-18869 can't call nanops on empty groups, so hardcode expected
  173. # for these
  174. df = DataFrame([11, 12, 13], columns=["a"])
  175. grps = np.arange(0, 25, 5, dtype=np.int_)
  176. # add / sum
  177. result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
  178. "sum", alt=None, numeric_only=True
  179. )
  180. intervals = pd.interval_range(0, 20, freq=5)
  181. expected = DataFrame(
  182. {"a": [0, 0, 36, 0]},
  183. index=pd.CategoricalIndex(intervals, name="a", ordered=True),
  184. )
  185. if observed:
  186. expected = expected[expected.a != 0]
  187. tm.assert_frame_equal(result, expected)
  188. # prod
  189. result = df.groupby(pd.cut(df["a"], grps), observed=observed)._cython_agg_general(
  190. "prod", alt=None, numeric_only=True
  191. )
  192. expected = DataFrame(
  193. {"a": [1, 1, 1716, 1]},
  194. index=pd.CategoricalIndex(intervals, name="a", ordered=True),
  195. )
  196. if observed:
  197. expected = expected[expected.a != 1]
  198. tm.assert_frame_equal(result, expected)
  199. @pytest.mark.parametrize("op", ["first", "last", "max", "min"])
  200. @pytest.mark.parametrize(
  201. "data", [Timestamp("2016-10-14 21:00:44.557"), Timedelta("17088 days 21:00:44.557")]
  202. )
  203. def test_cython_with_timestamp_and_nat(op, data):
  204. # https://github.com/pandas-dev/pandas/issues/19526
  205. df = DataFrame({"a": [0, 1], "b": [data, NaT]})
  206. index = Index([0, 1], name="a")
  207. # We will group by a and test the cython aggregations
  208. expected = DataFrame({"b": [data, NaT]}, index=index)
  209. result = df.groupby("a").aggregate(op)
  210. tm.assert_frame_equal(expected, result)
  211. @pytest.mark.parametrize(
  212. "agg",
  213. [
  214. "min",
  215. "max",
  216. "count",
  217. "sum",
  218. "prod",
  219. "var",
  220. "mean",
  221. "median",
  222. "ohlc",
  223. "cumprod",
  224. "cumsum",
  225. "shift",
  226. "any",
  227. "all",
  228. "quantile",
  229. "first",
  230. "last",
  231. "rank",
  232. "cummin",
  233. "cummax",
  234. ],
  235. )
  236. def test_read_only_buffer_source_agg(agg):
  237. # https://github.com/pandas-dev/pandas/issues/36014
  238. df = DataFrame(
  239. {
  240. "sepal_length": [5.1, 4.9, 4.7, 4.6, 5.0],
  241. "species": ["setosa", "setosa", "setosa", "setosa", "setosa"],
  242. }
  243. )
  244. df._mgr.arrays[0].flags.writeable = False
  245. result = df.groupby(["species"]).agg({"sepal_length": agg})
  246. expected = df.copy().groupby(["species"]).agg({"sepal_length": agg})
  247. tm.assert_equal(result, expected)
  248. @pytest.mark.parametrize(
  249. "op_name",
  250. [
  251. "count",
  252. "sum",
  253. "std",
  254. "var",
  255. "sem",
  256. "mean",
  257. "median",
  258. "prod",
  259. "min",
  260. "max",
  261. ],
  262. )
  263. def test_cython_agg_nullable_int(op_name):
  264. # ensure that the cython-based aggregations don't fail for nullable dtype
  265. # (eg https://github.com/pandas-dev/pandas/issues/37415)
  266. df = DataFrame(
  267. {
  268. "A": ["A", "B"] * 5,
  269. "B": pd.array([1, 2, 3, 4, 5, 6, 7, 8, 9, pd.NA], dtype="Int64"),
  270. }
  271. )
  272. result = getattr(df.groupby("A")["B"], op_name)()
  273. df2 = df.assign(B=df["B"].astype("float64"))
  274. expected = getattr(df2.groupby("A")["B"], op_name)()
  275. if op_name != "count":
  276. # the result is not yet consistently using Int64/Float64 dtype,
  277. # so for now just checking the values by casting to float
  278. result = result.astype("float64")
  279. tm.assert_series_equal(result, expected)
  280. @pytest.mark.parametrize("with_na", [True, False])
  281. @pytest.mark.parametrize(
  282. "op_name, action",
  283. [
  284. # ("count", "always_int"),
  285. ("sum", "large_int"),
  286. # ("std", "always_float"),
  287. ("var", "always_float"),
  288. # ("sem", "always_float"),
  289. ("mean", "always_float"),
  290. ("median", "always_float"),
  291. ("prod", "large_int"),
  292. ("min", "preserve"),
  293. ("max", "preserve"),
  294. ("first", "preserve"),
  295. ("last", "preserve"),
  296. ],
  297. )
  298. @pytest.mark.parametrize(
  299. "data",
  300. [
  301. pd.array([1, 2, 3, 4], dtype="Int64"),
  302. pd.array([1, 2, 3, 4], dtype="Int8"),
  303. pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float32"),
  304. pd.array([0.1, 0.2, 0.3, 0.4], dtype="Float64"),
  305. pd.array([True, True, False, False], dtype="boolean"),
  306. ],
  307. )
  308. def test_cython_agg_EA_known_dtypes(data, op_name, action, with_na):
  309. if with_na:
  310. data[3] = pd.NA
  311. df = DataFrame({"key": ["a", "a", "b", "b"], "col": data})
  312. grouped = df.groupby("key")
  313. if action == "always_int":
  314. # always Int64
  315. expected_dtype = pd.Int64Dtype()
  316. elif action == "large_int":
  317. # for any int/bool use Int64, for float preserve dtype
  318. if is_float_dtype(data.dtype):
  319. expected_dtype = data.dtype
  320. elif is_integer_dtype(data.dtype):
  321. # match the numpy dtype we'd get with the non-nullable analogue
  322. expected_dtype = data.dtype
  323. else:
  324. expected_dtype = pd.Int64Dtype()
  325. elif action == "always_float":
  326. # for any int/bool use Float64, for float preserve dtype
  327. if is_float_dtype(data.dtype):
  328. expected_dtype = data.dtype
  329. else:
  330. expected_dtype = pd.Float64Dtype()
  331. elif action == "preserve":
  332. expected_dtype = data.dtype
  333. result = getattr(grouped, op_name)()
  334. assert result["col"].dtype == expected_dtype
  335. result = grouped.aggregate(op_name)
  336. assert result["col"].dtype == expected_dtype
  337. result = getattr(grouped["col"], op_name)()
  338. assert result.dtype == expected_dtype
  339. result = grouped["col"].aggregate(op_name)
  340. assert result.dtype == expected_dtype