test_groupby_dropna.py

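"""Tests for the ``dropna`` keyword of DataFrame.groupby/Series.groupby (GH 3729 and related issues)."""
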
import numpy as np
import pytest

from pandas.compat.pyarrow import pa_version_under7p0

from pandas.core.dtypes.missing import na_value_for_dtype

import pandas as pd
import pandas._testing as tm
from pandas.tests.groupby import get_groupby_method_args


@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [13.0, 123.0], "e": [13.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [13.0, 233.0, 123.0],
                "e": [13.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_one_group(
    dropna, tuples, outputs, nulls_fixture
):
    # GH 3729 this is to test that NA is in one group
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    grouped = df.groupby(["a", "b"], dropna=dropna).sum()

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # By default, a MultiIndex created via `from_*` drops NA from its levels,
    # so we need to add NA back to the level manually afterwards.
    if not dropna:
        mi = mi.set_levels(["A", "B", np.nan], level="b")
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)
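

# Illustration of the level-dropping behavior the comment above refers to
# (a sketch, not part of the test suite):
#   mi = pd.MultiIndex.from_tuples([("A", "B"), ("A", np.nan)], names=list("ab"))
#   list(mi.levels[1])                            # ["B"] -- NA is absent from the level
#   mi = mi.set_levels(["B", np.nan], level="b")  # adds it back explicitly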


@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [12.0, 123.23], "d": [12.0, 123.0], "e": [12.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"], [np.nan, "B"]],
            {
                "c": [12.0, 13.3, 123.23, 1.0],
                "d": [12.0, 234.0, 123.0, 1.0],
                "e": [12.0, 13.0, 1.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_nan_in_two_groups(
    dropna, tuples, outputs, nulls_fixture, nulls_fixture2
):
    # GH 3729 this is to test that NAs appearing in two different groups, with
    # two different null representations, are handled consistently
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", nulls_fixture, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        [nulls_fixture2, "B", 1, 1, 1.0],
        ["A", nulls_fixture2, 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    grouped = df.groupby(["a", "b"], dropna=dropna).sum()

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # By default, a MultiIndex created via `from_*` drops NA from its levels,
    # so we need to add NA back to the levels manually afterwards.
    if not dropna:
        mi = mi.set_levels([["A", "B", np.nan], ["A", "B", np.nan]])
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)


@pytest.mark.parametrize(
    "dropna, idx, outputs",
    [
        (True, ["A", "B"], {"b": [123.23, 13.0], "c": [123.0, 13.0], "d": [1.0, 13.0]}),
        (
            False,
            ["A", "B", np.nan],
            {
                "b": [123.23, 13.0, 12.3],
                "c": [123.0, 13.0, 233.0],
                "d": [1.0, 13.0, 12.0],
            },
        ),
    ],
)
def test_groupby_dropna_normal_index_dataframe(dropna, idx, outputs):
    # GH 3729
    df_list = [
        ["B", 12, 12, 12],
        [None, 12.3, 233.0, 12],
        ["A", 123.23, 123, 1],
        ["B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d"])
    grouped = df.groupby("a", dropna=dropna).sum()
    expected = pd.DataFrame(outputs, index=pd.Index(idx, dtype="object", name="a"))

    tm.assert_frame_equal(grouped, expected)


@pytest.mark.parametrize(
    "dropna, idx, expected",
    [
        (True, ["a", "a", "b", np.nan], pd.Series([3, 3], index=["a", "b"])),
        (
            False,
            ["a", "a", "b", np.nan],
            pd.Series([3, 3, 3], index=["a", "b", np.nan]),
        ),
    ],
)
def test_groupby_dropna_series_level(dropna, idx, expected):
    ser = pd.Series([1, 2, 3, 3], index=idx)

    result = ser.groupby(level=0, dropna=dropna).sum()
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "dropna, expected",
    [
        (True, pd.Series([210.0, 350.0], index=["a", "b"], name="Max Speed")),
        (
            False,
            pd.Series([210.0, 350.0, 20.0], index=["a", "b", np.nan], name="Max Speed"),
        ),
    ],
)
def test_groupby_dropna_series_by(dropna, expected):
    ser = pd.Series(
        [390.0, 350.0, 30.0, 20.0],
        index=["Falcon", "Falcon", "Parrot", "Parrot"],
        name="Max Speed",
    )

    result = ser.groupby(["a", "b", "a", np.nan], dropna=dropna).mean()
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("dropna", (False, True))
def test_grouper_dropna_propagation(dropna):
    # GH 36604
    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]})
    gb = df.groupby("A", dropna=dropna)
    assert gb.grouper.dropna == dropna


@pytest.mark.parametrize(
    "index",
    [
        pd.RangeIndex(0, 4),
        list("abcd"),
        pd.MultiIndex.from_product([(1, 2), ("R", "B")], names=["num", "col"]),
    ],
)
def test_groupby_dataframe_slice_then_transform(dropna, index):
    # GH35014 & GH35612
    expected_data = {"B": [2, 2, 1, np.nan if dropna else 1]}

    df = pd.DataFrame({"A": [0, 0, 1, None], "B": [1, 2, 3, None]}, index=index)
    gb = df.groupby("A", dropna=dropna)

    result = gb.transform(len)
    expected = pd.DataFrame(expected_data, index=index)
    tm.assert_frame_equal(result, expected)

    result = gb[["B"]].transform(len)
    expected = pd.DataFrame(expected_data, index=index)
    tm.assert_frame_equal(result, expected)

    result = gb["B"].transform(len)
    expected = pd.Series(expected_data["B"], index=index, name="B")
    tm.assert_series_equal(result, expected)
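

# Note on the test above: with dropna=True the rows whose key is NA are
# excluded from every group, and transform leaves NaN at those positions,
# which is why the last entry of expected_data flips between np.nan and 1.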


@pytest.mark.parametrize(
    "dropna, tuples, outputs",
    [
        (
            True,
            [["A", "B"], ["B", "A"]],
            {"c": [13.0, 123.23], "d": [12.0, 123.0], "e": [1.0, 1.0]},
        ),
        (
            False,
            [["A", "B"], ["A", np.nan], ["B", "A"]],
            {
                "c": [13.0, 12.3, 123.23],
                "d": [12.0, 233.0, 123.0],
                "e": [1.0, 12.0, 1.0],
            },
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_agg(dropna, tuples, outputs):
    # GH 3729
    df_list = [
        ["A", "B", 12, 12, 12],
        ["A", None, 12.3, 233.0, 12],
        ["B", "A", 123.23, 123, 1],
        ["A", "B", 1, 1, 1.0],
    ]
    df = pd.DataFrame(df_list, columns=["a", "b", "c", "d", "e"])
    agg_dict = {"c": sum, "d": max, "e": "min"}
    grouped = df.groupby(["a", "b"], dropna=dropna).agg(agg_dict)

    mi = pd.MultiIndex.from_tuples(tuples, names=list("ab"))

    # By default, a MultiIndex created via `from_*` drops NA from its levels,
    # so we need to add NA back to the level manually afterwards.
    if not dropna:
        mi = mi.set_levels(["A", "B", np.nan], level="b")
    expected = pd.DataFrame(outputs, index=mi)

    tm.assert_frame_equal(grouped, expected)


@pytest.mark.arm_slow
@pytest.mark.parametrize(
    "datetime1, datetime2",
    [
        (pd.Timestamp("2020-01-01"), pd.Timestamp("2020-02-01")),
        (pd.Timedelta("-2 days"), pd.Timedelta("-1 days")),
        (pd.Period("2020-01-01"), pd.Period("2020-02-01")),
    ],
)
@pytest.mark.parametrize("dropna, values", [(True, [12, 3]), (False, [12, 3, 6])])
def test_groupby_dropna_datetime_like_data(
    dropna, values, datetime1, datetime2, unique_nulls_fixture, unique_nulls_fixture2
):
    # GH 3729
    df = pd.DataFrame(
        {
            "values": [1, 2, 3, 4, 5, 6],
            "dt": [
                datetime1,
                unique_nulls_fixture,
                datetime2,
                unique_nulls_fixture2,
                datetime1,
                datetime1,
            ],
        }
    )

    if dropna:
        indexes = [datetime1, datetime2]
    else:
        indexes = [datetime1, datetime2, np.nan]

    grouped = df.groupby("dt", dropna=dropna).agg({"values": sum})
    expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))

    tm.assert_frame_equal(grouped, expected)
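

# Note on the test above: unique_nulls_fixture and unique_nulls_fixture2
# inject two *different* null representations, yet groupby treats every null
# as the same missing key, so dropna=False produces a single NA group
# (rows 2 and 4, summing to 6).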


@pytest.mark.parametrize(
    "dropna, data, selected_data, levels",
    [
        pytest.param(
            False,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            ["a", "b", np.nan],
            id="dropna_false_has_nan",
        ),
        pytest.param(
            True,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0]},
            None,
            id="dropna_true_has_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should give the same result
            False,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_false_no_nan",
        ),
        pytest.param(
            # no nan in "groups"; dropna=True|False should give the same result
            True,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_true_no_nan",
        ),
    ],
)
def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
    # GH 35889
    df = pd.DataFrame(data)
    gb = df.groupby("groups", dropna=dropna)
    result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))}))

    mi_tuples = tuple(zip(data["groups"], selected_data["values"]))
    mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None])
    # By default, a MultiIndex created via `from_*` drops NA from its levels,
    # so we need to add NA back to the level manually afterwards.
    if not dropna and levels:
        mi = mi.set_levels(levels, level="groups")

    expected = pd.DataFrame(selected_data, index=mi)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("input_index", [None, ["a"], ["a", "b"]])
@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
@pytest.mark.parametrize("series", [True, False])
def test_groupby_dropna_with_multiindex_input(input_index, keys, series):
    # GH#46783
    obj = pd.DataFrame(
        {
            "a": [1, np.nan],
            "b": [1, 1],
            "c": [2, 3],
        }
    )

    expected = obj.set_index(keys)
    if series:
        expected = expected["c"]
    elif input_index == ["a", "b"] and keys == ["a"]:
        # Column b should not be aggregated
        expected = expected[["c"]]

    if input_index is not None:
        obj = obj.set_index(input_index)
    gb = obj.groupby(keys, dropna=False)
    if series:
        gb = gb["c"]
    result = gb.sum()

    tm.assert_equal(result, expected)


def test_groupby_nan_included():
    # GH 35646
    data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}
    df = pd.DataFrame(data)
    grouped = df.groupby("group", dropna=False)
    result = grouped.indices
    dtype = np.intp
    expected = {
        "g1": np.array([0, 2], dtype=dtype),
        "g2": np.array([3], dtype=dtype),
        np.nan: np.array([1, 4], dtype=dtype),
    }
    for result_values, expected_values in zip(result.values(), expected.values()):
        tm.assert_numpy_array_equal(result_values, expected_values)
    assert np.isnan(list(result.keys())[2])
    assert list(result.keys())[0:2] == ["g1", "g2"]
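

# Note on the test above: ``grouped.indices`` maps each group key to the
# integer positions of its rows, and the NaN key comes after the valid
# groups, which is what the two key-order assertions check.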


def test_groupby_drop_nan_with_multi_index():
    # GH 39895
    df = pd.DataFrame([[np.nan, 0, 1]], columns=["a", "b", "c"])
    df = df.set_index(["a", "b"])
    result = df.groupby(["a", "b"], dropna=False).first()
    expected = df
    tm.assert_frame_equal(result, expected)
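

# The test below decodes ``sequence_index`` as a base-3 number, least
# significant digit first, with digits mapped 0 -> "x", 1 -> "y", 2 -> "z";
# e.g. 5 = 2 + 1 * 3 decodes to "zyxx" and 80 = 3**4 - 1 decodes to "zzzz".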
# sequence_index enumerates all strings of length 4 made up of x, y, and z
@pytest.mark.parametrize("sequence_index", range(3**4))
@pytest.mark.parametrize(
    "dtype",
    [
        None,
        "UInt8",
        "Int8",
        "UInt16",
        "Int16",
        "UInt32",
        "Int32",
        "UInt64",
        "Int64",
        "Float32",
        "Float64",
        "category",
        "string",
        pytest.param(
            "string[pyarrow]",
            marks=pytest.mark.skipif(
                pa_version_under7p0, reason="pyarrow is not installed"
            ),
        ),
        "datetime64[ns]",
        "period[d]",
        "Sparse[float]",
    ],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_no_sort_keep_na(sequence_index, dtype, test_series, as_index):
    # GH#46584, GH#48794

    # Convert sequence_index into a string sequence, e.g. 5 becomes "zyxx".
    # This sequence is used for the grouper.
    sequence = "".join(
        [{0: "x", 1: "y", 2: "z"}[sequence_index // (3**k) % 3] for k in range(4)]
    )

    # Unique values to use for grouper, depends on dtype
    if dtype in ("string", "string[pyarrow]"):
        uniques = {"x": "x", "y": "y", "z": pd.NA}
    elif dtype in ("datetime64[ns]", "period[d]"):
        uniques = {"x": "2016-01-01", "y": "2017-01-01", "z": pd.NA}
    else:
        uniques = {"x": 1, "y": 2, "z": np.nan}

    df = pd.DataFrame(
        {
            "key": pd.Series([uniques[label] for label in sequence], dtype=dtype),
            "a": [0, 1, 2, 3],
        }
    )
    gb = df.groupby("key", dropna=False, sort=False, as_index=as_index)
    if test_series:
        gb = gb["a"]
    result = gb.sum()

    # Manually compute the groupby sum, using the labels "x", "y", and "z" to
    # avoid issues with hashing np.nan
    summed = {}
    for idx, label in enumerate(sequence):
        summed[label] = summed.get(label, 0) + idx
    if dtype == "category":
        index = pd.CategoricalIndex(
            [uniques[e] for e in summed],
            df["key"].cat.categories,
            name="key",
        )
    elif isinstance(dtype, str) and dtype.startswith("Sparse"):
        index = pd.Index(
            pd.array([uniques[label] for label in summed], dtype=dtype), name="key"
        )
    else:
        index = pd.Index([uniques[label] for label in summed], dtype=dtype, name="key")
    expected = pd.Series(summed.values(), index=index, name="a", dtype=None)
    if not test_series:
        expected = expected.to_frame()
    if not as_index:
        expected = expected.reset_index()
        if dtype is not None and dtype.startswith("Sparse"):
            expected["key"] = expected["key"].astype(dtype)

    tm.assert_equal(result, expected)


@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize("dtype", [object, None])
def test_null_is_null_for_dtype(
    sort, dtype, nulls_fixture, nulls_fixture2, test_series
):
    # GH#48506 - grouping should always use the null value appropriate for the dtype
    df = pd.DataFrame({"a": [1, 2]})
    groups = pd.Series([nulls_fixture, nulls_fixture2], dtype=dtype)
    obj = df["a"] if test_series else df
    gb = obj.groupby(groups, dropna=False, sort=sort)
    result = gb.sum()
    index = pd.Index([na_value_for_dtype(groups.dtype)])
    expected = pd.DataFrame({"a": [3]}, index=index)
    if test_series:
        tm.assert_series_equal(result, expected["a"])
    else:
        tm.assert_frame_equal(result, expected)
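

# Note on the test above: ``na_value_for_dtype`` returns the canonical missing
# value for the groups' dtype (np.nan for object and float dtypes, for
# instance), so the result has a single null key no matter which two null
# representations were passed in.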


@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
def test_categorical_reducers(
    request, reduction_func, observed, sort, as_index, index_kind
):
    # GH#36327
    if (
        reduction_func in ("idxmin", "idxmax")
        and not observed
        and index_kind != "multi"
    ):
        msg = "GH#10694 - idxmin/max broken for categorical with observed=False"
        request.node.add_marker(pytest.mark.xfail(reason=msg))

    # Ensure there is at least one null value by appending to the end
    values = np.append(np.random.choice([1, 2, None], size=19), None)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
    )

    # Strategy: Compare to dropna=True by filling null values with a new code
    df_filled = df.copy()
    df_filled["x"] = pd.Categorical(values, categories=[1, 2, 3, 4]).fillna(4)

    if index_kind == "range":
        keys = ["x"]
    elif index_kind == "single":
        keys = ["x"]
        df = df.set_index("x")
        df_filled = df_filled.set_index("x")
    else:
        keys = ["x", "x2"]
        df["x2"] = df["x"]
        df = df.set_index(["x", "x2"])
        df_filled["x2"] = df_filled["x"]
        df_filled = df_filled.set_index(["x", "x2"])
    args = get_groupby_method_args(reduction_func, df)
    args_filled = get_groupby_method_args(reduction_func, df_filled)
    if reduction_func == "corrwith" and index_kind == "range":
        # Don't include the grouping columns so we can call reset_index
        args = (args[0].drop(columns=keys),)
        args_filled = (args_filled[0].drop(columns=keys),)

    gb_filled = df_filled.groupby(keys, observed=observed, sort=sort, as_index=True)
    expected = getattr(gb_filled, reduction_func)(*args_filled).reset_index()
    expected["x"] = expected["x"].replace(4, None)
    if index_kind == "multi":
        expected["x2"] = expected["x2"].replace(4, None)
    if as_index:
        if index_kind == "multi":
            expected = expected.set_index(["x", "x2"])
        else:
            expected = expected.set_index("x")
    else:
        if index_kind != "range" and reduction_func != "size":
            # size, unlike other methods, has the desired behavior in GH#49519
            expected = expected.drop(columns="x")
            if index_kind == "multi":
                expected = expected.drop(columns="x2")
    if reduction_func in ("idxmax", "idxmin") and index_kind != "range":
        # expected was computed with a RangeIndex; need to translate to index values
        values = expected["y"].values.tolist()
        if index_kind == "single":
            values = [np.nan if e == 4 else e for e in values]
        else:
            values = [(np.nan, np.nan) if e == (4, 4) else e for e in values]
        expected["y"] = values
    if reduction_func == "size":
        # size, unlike other methods, has the desired behavior in GH#49519
        expected = expected.rename(columns={0: "size"})
        if as_index:
            expected = expected["size"].rename(None)

    gb_keepna = df.groupby(
        keys, dropna=False, observed=observed, sort=sort, as_index=as_index
    )
    result = getattr(gb_keepna, reduction_func)(*args)

    # size will return a Series, others are DataFrame
    tm.assert_equal(result, expected)
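

# Sketch of the comparison strategy used above: filling the nulls with the
# otherwise-unused category 4 turns the NA group into an ordinary group, e.g.
#   pd.Categorical([1, None], categories=[1, 2, 3, 4]).fillna(4)
# puts the null row in group 4, so a regular groupby over the filled frame
# reproduces what dropna=False should return once 4 is mapped back to NA.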


def test_categorical_transformers(
    request, transformation_func, observed, sort, as_index
):
    # GH#36327
    if transformation_func == "fillna":
        msg = "GH#49651 fillna may incorrectly reorder results when dropna=False"
        request.node.add_marker(pytest.mark.xfail(reason=msg, strict=False))

    values = np.append(np.random.choice([1, 2, None], size=19), None)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(20)}
    )
    args = get_groupby_method_args(transformation_func, df)

    # Compute result for null group
    null_group_values = df[df["x"].isnull()]["y"]
    if transformation_func == "cumcount":
        null_group_data = list(range(len(null_group_values)))
    elif transformation_func == "ngroup":
        if sort:
            if observed:
                na_group = df["x"].nunique(dropna=False) - 1
            else:
                # TODO: Should this be 3?
                na_group = df["x"].nunique(dropna=False) - 1
        else:
            na_group = df.iloc[: null_group_values.index[0]]["x"].nunique()
        null_group_data = len(null_group_values) * [na_group]
    else:
        null_group_data = getattr(null_group_values, transformation_func)(*args)
    null_group_result = pd.DataFrame({"y": null_group_data})

    gb_keepna = df.groupby(
        "x", dropna=False, observed=observed, sort=sort, as_index=as_index
    )
    gb_dropna = df.groupby("x", dropna=True, observed=observed, sort=sort)
    result = getattr(gb_keepna, transformation_func)(*args)
    expected = getattr(gb_dropna, transformation_func)(*args)
    for iloc, value in zip(
        df[df["x"].isnull()].index.tolist(), null_group_result.values.ravel()
    ):
        if expected.ndim == 1:
            expected.iloc[iloc] = value
        else:
            expected.iloc[iloc, 0] = value
    if transformation_func == "ngroup":
        # The null group was not counted by the dropna=True groupby; shift the
        # group numbers of the non-null groups that come after it
        expected[df["x"].notnull() & expected.ge(na_group)] += 1
    if transformation_func not in ("rank", "diff", "pct_change", "shift"):
        expected = expected.astype("int64")

    tm.assert_equal(result, expected)


@pytest.mark.parametrize("method", ["head", "tail"])
def test_categorical_head_tail(method, observed, sort, as_index):
    # GH#36327
    values = np.random.choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False, observed=observed, sort=sort, as_index=as_index)
    result = getattr(gb, method)()

    if method == "tail":
        values = values[::-1]
    # Take the top 5 values from each group
    mask = (
        ((values == 1) & ((values == 1).cumsum() <= 5))
        | ((values == 2) & ((values == 2).cumsum() <= 5))
        # flake8 doesn't like the vectorized check for None, thinks we should use `is`
        | ((values == None) & ((values == None).cumsum() <= 5))  # noqa: E711
    )
    if method == "tail":
        mask = mask[::-1]
    expected = df[mask]

    tm.assert_frame_equal(result, expected)


def test_categorical_agg():
    # GH#36327
    values = np.random.choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False)
    result = gb.agg(lambda x: x.sum())
    expected = gb.sum()
    tm.assert_frame_equal(result, expected)


def test_categorical_transform():
    # GH#36327
    values = np.random.choice([1, 2, None], 30)
    df = pd.DataFrame(
        {"x": pd.Categorical(values, categories=[1, 2, 3]), "y": range(len(values))}
    )
    gb = df.groupby("x", dropna=False)
    result = gb.transform(lambda x: x.sum())
    expected = gb.transform("sum")
    tm.assert_frame_equal(result, expected)