test_grouping.py 38 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077
  1. """
  2. test where we are determining what we are grouping, or getting groups
  3. """
  4. from datetime import (
  5. date,
  6. timedelta,
  7. )
  8. import numpy as np
  9. import pytest
  10. import pandas as pd
  11. from pandas import (
  12. CategoricalIndex,
  13. DataFrame,
  14. Grouper,
  15. Index,
  16. MultiIndex,
  17. Series,
  18. Timestamp,
  19. date_range,
  20. )
  21. import pandas._testing as tm
  22. from pandas.core.groupby.grouper import Grouping
# selection
# --------------------------------
class TestSelection:
    """Tests for selecting (subsetting) columns from a GroupBy object."""

    def test_select_bad_cols(self):
        # selecting a missing column from a GroupBy raises KeyError
        df = DataFrame([[1, 2]], columns=["A", "B"])
        g = df.groupby("A")
        with pytest.raises(KeyError, match="\"Columns not found: 'C'\""):
            g[["C"]]

        with pytest.raises(KeyError, match="^[^A]+$"):
            # A should not be referenced as a bad column...
            # will have to rethink regex if you change message!
            g[["A", "C"]]

    def test_groupby_duplicated_column_errormsg(self):
        # GH7511
        # grouping by a duplicated column label is ambiguous -> ValueError
        df = DataFrame(
            columns=["A", "B", "A", "C"], data=[range(4), range(2, 6), range(0, 8, 2)]
        )

        msg = "Grouper for 'A' not 1-dimensional"
        with pytest.raises(ValueError, match=msg):
            df.groupby("A")
        with pytest.raises(ValueError, match=msg):
            df.groupby(["A", "B"])

        # grouping by the non-duplicated label still works; the duplicated
        # "A" label contributes two of the three result columns
        grouped = df.groupby("B")
        c = grouped.count()
        assert c.columns.nlevels == 1
        assert c.columns.size == 3

    def test_column_select_via_attr(self, df):
        # attribute access (.C) should be equivalent to item access (["C"])
        result = df.groupby("A").C.sum()
        expected = df.groupby("A")["C"].sum()
        tm.assert_series_equal(result, expected)

        # a column named like a GroupBy method ("mean") must still show up
        # in the aggregation output
        df["mean"] = 1.5
        result = df.groupby("A").mean(numeric_only=True)
        expected = df.groupby("A")[["C", "D", "mean"]].agg(np.mean)
        tm.assert_frame_equal(result, expected)

    def test_getitem_list_of_columns(self):
        # subsetting with a list of labels or an Index slice gives the same
        # result as grouping the pre-subset frame
        df = DataFrame(
            {
                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
                "C": np.random.randn(8),
                "D": np.random.randn(8),
                "E": np.random.randn(8),
            }
        )

        result = df.groupby("A")[["C", "D"]].mean()
        result2 = df.groupby("A")[df.columns[2:4]].mean()

        expected = df.loc[:, ["A", "C", "D"]].groupby("A").mean()
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected)

    def test_getitem_numeric_column_names(self):
        # GH #13731
        # integer column labels must not be confused with positions
        df = DataFrame(
            {
                0: list("abcd") * 2,
                2: np.random.randn(8),
                4: np.random.randn(8),
                6: np.random.randn(8),
            }
        )
        result = df.groupby(0)[df.columns[1:3]].mean()
        result2 = df.groupby(0)[[2, 4]].mean()

        expected = df.loc[:, [0, 2, 4]].groupby(0).mean()
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected)

        # per GH 23566 enforced deprecation raises a ValueError
        with pytest.raises(ValueError, match="Cannot subset columns with a tuple"):
            df.groupby(0)[2, 4].mean()

    def test_getitem_single_tuple_of_columns_raises(self, df):
        # per GH 23566 enforced deprecation raises a ValueError
        with pytest.raises(ValueError, match="Cannot subset columns with a tuple"):
            df.groupby("A")["C", "D"].mean()

    def test_getitem_single_column(self):
        # selecting a single column yields a SeriesGroupBy whose result
        # matches the corresponding column of the frame-level result
        df = DataFrame(
            {
                "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
                "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
                "C": np.random.randn(8),
                "D": np.random.randn(8),
                "E": np.random.randn(8),
            }
        )

        result = df.groupby("A")["C"].mean()

        as_frame = df.loc[:, ["A", "C"]].groupby("A").mean()
        as_series = as_frame.iloc[:, 0]
        expected = as_series

        tm.assert_series_equal(result, expected)

    def test_indices_grouped_by_tuple_with_lambda(self):
        # GH 36158
        # grouping by a tuple-valued column and by a lambda extracting the
        # same tuples must produce identical indices mappings
        df = DataFrame(
            {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))}
        )

        gb = df.groupby("Tuples")
        gb_lambda = df.groupby(lambda x: df.iloc[x, 0])

        expected = gb.indices
        result = gb_lambda.indices

        tm.assert_dict_equal(result, expected)
# grouping
# --------------------------------
class TestGrouping:
    """Tests for how keys / levels / Grouper objects are resolved into Groupings."""

    @pytest.mark.parametrize(
        "index",
        [
            tm.makeFloatIndex,
            tm.makeStringIndex,
            tm.makeIntIndex,
            tm.makeDateIndex,
            tm.makePeriodIndex,
        ],
    )
    def test_grouper_index_types(self, index):
        # related GH5375
        # groupby misbehaving when using a Floatlike index
        df = DataFrame(np.arange(10).reshape(5, 2), columns=list("AB"))

        df.index = index(len(df))
        df.groupby(list("abcde"), group_keys=False).apply(lambda x: x)

        # reversing the index ordering must not break key alignment
        df.index = list(reversed(df.index.tolist()))
        df.groupby(list("abcde"), group_keys=False).apply(lambda x: x)

    def test_grouper_multilevel_freq(self):
        # GH 7885
        # with level and freq specified in a Grouper
        d0 = date.today() - timedelta(days=14)
        dates = date_range(d0, date.today())
        date_index = MultiIndex.from_product([dates, dates], names=["foo", "bar"])
        df = DataFrame(np.random.randint(0, 100, 225), index=date_index)

        # Check string level
        expected = (
            df.reset_index()
            .groupby([Grouper(key="foo", freq="W"), Grouper(key="bar", freq="W")])
            .sum()
        )
        # reset index changes columns dtype to object
        expected.columns = Index([0], dtype="int64")

        result = df.groupby(
            [Grouper(level="foo", freq="W"), Grouper(level="bar", freq="W")]
        ).sum()
        tm.assert_frame_equal(result, expected)

        # Check integer level
        result = df.groupby(
            [Grouper(level=0, freq="W"), Grouper(level=1, freq="W")]
        ).sum()
        tm.assert_frame_equal(result, expected)

    def test_grouper_creation_bug(self):
        # GH 8795
        # Grouper(key="A") must behave exactly like groupby("A")
        df = DataFrame({"A": [0, 0, 1, 1, 2, 2], "B": [1, 2, 3, 4, 5, 6]})
        g = df.groupby("A")
        expected = g.sum()

        g = df.groupby(Grouper(key="A"))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        g = df.groupby(Grouper(key="A", axis=0))
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # apply keeps the grouping column in the output
        result = g.apply(lambda x: x.sum())
        expected["A"] = [0, 2, 4]
        expected = expected.loc[:, ["A", "B"]]
        tm.assert_frame_equal(result, expected)

        # GH14334
        # Grouper(key=...) may be passed in a list
        df = DataFrame(
            {"A": [0, 0, 0, 1, 1, 1], "B": [1, 1, 2, 2, 3, 3], "C": [1, 2, 3, 4, 5, 6]}
        )
        # Group by single column
        expected = df.groupby("A").sum()
        g = df.groupby([Grouper(key="A")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group by two columns
        # using a combination of strings and Grouper objects
        expected = df.groupby(["A", "B"]).sum()

        # Group with two Grouper objects
        g = df.groupby([Grouper(key="A"), Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a string and a Grouper object
        g = df.groupby(["A", Grouper(key="B")])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # Group with a Grouper object and a string
        g = df.groupby([Grouper(key="A"), "B"])
        result = g.sum()
        tm.assert_frame_equal(result, expected)

        # GH8866
        # Grouper with level= and freq= on a MultiIndex datetime level
        s = Series(
            np.arange(8, dtype="int64"),
            index=MultiIndex.from_product(
                [list("ab"), range(2), date_range("20130101", periods=2)],
                names=["one", "two", "three"],
            ),
        )
        result = s.groupby(Grouper(level="three", freq="M")).sum()
        expected = Series(
            [28],
            index=pd.DatetimeIndex([Timestamp("2013-01-31")], freq="M", name="three"),
        )
        tm.assert_series_equal(result, expected)

        # just specifying a level breaks
        result = s.groupby(Grouper(level="one")).sum()
        expected = s.groupby(level="one").sum()
        tm.assert_series_equal(result, expected)

    def test_grouper_column_and_index(self):
        # GH 14327

        # Grouping a multi-index frame by a column and an index level should
        # be equivalent to resetting the index and grouping by two columns
        idx = MultiIndex.from_tuples(
            [("a", 1), ("a", 2), ("a", 3), ("b", 1), ("b", 2), ("b", 3)]
        )
        idx.names = ["outer", "inner"]
        df_multi = DataFrame(
            {"A": np.arange(6), "B": ["one", "one", "two", "two", "one", "one"]},
            index=idx,
        )
        result = df_multi.groupby(["B", Grouper(level="inner")]).mean(numeric_only=True)
        expected = (
            df_multi.reset_index().groupby(["B", "inner"]).mean(numeric_only=True)
        )
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_multi.groupby([Grouper(level="inner"), "B"]).mean(numeric_only=True)
        expected = (
            df_multi.reset_index().groupby(["inner", "B"]).mean(numeric_only=True)
        )
        tm.assert_frame_equal(result, expected)

        # Grouping a single-index frame by a column and the index should
        # be equivalent to resetting the index and grouping by two columns
        df_single = df_multi.reset_index("outer")
        result = df_single.groupby(["B", Grouper(level="inner")]).mean(
            numeric_only=True
        )
        expected = (
            df_single.reset_index().groupby(["B", "inner"]).mean(numeric_only=True)
        )
        tm.assert_frame_equal(result, expected)

        # Test the reverse grouping order
        result = df_single.groupby([Grouper(level="inner"), "B"]).mean(
            numeric_only=True
        )
        expected = (
            df_single.reset_index().groupby(["inner", "B"]).mean(numeric_only=True)
        )
        tm.assert_frame_equal(result, expected)

    def test_groupby_levels_and_columns(self):
        # GH9344, GH9049
        # grouping by index levels must match grouping by the equivalent
        # reset-index columns
        idx_names = ["x", "y"]
        idx = MultiIndex.from_tuples([(1, 1), (1, 2), (3, 4), (5, 6)], names=idx_names)
        df = DataFrame(np.arange(12).reshape(-1, 3), index=idx)

        by_levels = df.groupby(level=idx_names).mean()
        # reset_index changes columns dtype to object
        by_columns = df.reset_index().groupby(idx_names).mean()

        # without casting, by_columns.columns is object-dtype
        by_columns.columns = by_columns.columns.astype(np.int64)
        tm.assert_frame_equal(by_levels, by_columns)

    def test_groupby_categorical_index_and_columns(self, observed):
        # GH18432, adapted for GH25871
        # grouping over a duplicated CategoricalIndex collapses duplicate
        # labels and orders the result by category order ("B" before "A")
        columns = ["A", "B", "A", "B"]
        categories = ["B", "A"]
        data = np.array(
            [[1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2], [1, 2, 1, 2]], int
        )
        cat_columns = CategoricalIndex(columns, categories=categories, ordered=True)
        df = DataFrame(data=data, columns=cat_columns)
        result = df.groupby(axis=1, level=0, observed=observed).sum()
        expected_data = np.array([[4, 2], [4, 2], [4, 2], [4, 2], [4, 2]], int)
        expected_columns = CategoricalIndex(
            categories, categories=categories, ordered=True
        )
        expected = DataFrame(data=expected_data, columns=expected_columns)
        tm.assert_frame_equal(result, expected)

        # test transposed version
        df = DataFrame(data.T, index=cat_columns)
        result = df.groupby(axis=0, level=0, observed=observed).sum()
        expected = DataFrame(data=expected_data.T, index=expected_columns)
        tm.assert_frame_equal(result, expected)

    def test_grouper_getting_correct_binner(self):
        # GH 10063
        # using a non-time-based grouper and a time-based grouper
        # and specifying levels
        df = DataFrame(
            {"A": 1},
            index=MultiIndex.from_product(
                [list("ab"), date_range("20130101", periods=80)], names=["one", "two"]
            ),
        )
        result = df.groupby(
            [Grouper(level="one"), Grouper(level="two", freq="M")]
        ).sum()
        # 80 daily entries per outer label -> Jan(31) + Feb(28) + Mar(21)
        expected = DataFrame(
            {"A": [31, 28, 21, 31, 28, 21]},
            index=MultiIndex.from_product(
                [list("ab"), date_range("20130101", freq="M", periods=3)],
                names=["one", "two"],
            ),
        )
        tm.assert_frame_equal(result, expected)

    def test_grouper_iter(self, df):
        # iterating the grouper yields the group keys
        assert sorted(df.groupby("A").grouper) == ["bar", "foo"]

    def test_empty_groups(self, df):
        # see gh-1048
        with pytest.raises(ValueError, match="No group keys passed!"):
            df.groupby([])

    def test_groupby_grouper(self, df):
        # a BaseGrouper extracted from one groupby can be reused as a key
        grouped = df.groupby("A")

        result = df.groupby(grouped.grouper).mean(numeric_only=True)
        expected = grouped.mean(numeric_only=True)
        tm.assert_frame_equal(result, expected)

    def test_groupby_dict_mapping(self):
        # GH #679
        # a dict key maps index labels to group labels
        s = Series({"T1": 5})
        result = s.groupby({"T1": "T2"}).agg(sum)
        expected = s.groupby(["T2"]).agg(sum)
        tm.assert_series_equal(result, expected)

        s = Series([1.0, 2.0, 3.0, 4.0], index=list("abcd"))
        mapping = {"a": 0, "b": 0, "c": 1, "d": 1}

        result = s.groupby(mapping).mean()
        result2 = s.groupby(mapping).agg(np.mean)
        exp_key = np.array([0, 0, 1, 1], dtype=np.int64)
        expected = s.groupby(exp_key).mean()
        # NOTE(review): identical to `expected`; possibly `.agg(np.mean)` was
        # intended here — confirm against upstream history
        expected2 = s.groupby(exp_key).mean()
        tm.assert_series_equal(result, expected)
        tm.assert_series_equal(result, result2)
        tm.assert_series_equal(result, expected2)

    @pytest.mark.parametrize(
        "index",
        [
            [0, 1, 2, 3],
            ["a", "b", "c", "d"],
            [Timestamp(2021, 7, 28 + i) for i in range(4)],
        ],
    )
    def test_groupby_series_named_with_tuple(self, frame_or_series, index):
        # GH 42731
        # a tuple-named grouping Series must propagate its name to the
        # result's index
        obj = frame_or_series([1, 2, 3, 4], index=index)
        groups = Series([1, 0, 1, 0], index=index, name=("a", "a"))
        result = obj.groupby(groups).last()
        expected = frame_or_series([4, 3])
        expected.index.name = ("a", "a")
        tm.assert_equal(result, expected)

    def test_groupby_grouper_f_sanity_checked(self):
        dates = date_range("01-Jan-2013", periods=12, freq="MS")
        ts = Series(np.random.randn(12), index=dates)

        # GH3035
        # index.map is used to apply grouper to the index
        # if it fails on the elements, map tries it on the entire index as
        # a sequence. That can yield invalid results that cause trouble
        # down the line.
        # the surprise comes from using key[0:6] rather than str(key)[0:6]
        # when the elements are Timestamp.
        # the result is Index[0:6], very confusing.

        msg = r"Grouper result violates len\(labels\) == len\(data\)"
        with pytest.raises(AssertionError, match=msg):
            ts.groupby(lambda key: key[0:6])

    def test_grouping_error_on_multidim_input(self, df):
        # a 2-D object is not a valid grouping vector
        msg = "Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional"
        with pytest.raises(ValueError, match=msg):
            Grouping(df.index, df[["A", "A"]])

    def test_multiindex_passthru(self):
        # GH 7997
        # regression from 0.14.1
        df = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
        df.columns = MultiIndex.from_tuples([(0, 1), (1, 1), (2, 1)])

        # grouping by all column levels is an identity operation
        result = df.groupby(axis=1, level=[0, 1]).first()
        tm.assert_frame_equal(result, df)

    def test_multiindex_negative_level(self, mframe):
        # GH 13901
        # negative levels count from the end of the MultiIndex
        result = mframe.groupby(level=-1).sum()
        expected = mframe.groupby(level="second").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=-2).sum()
        expected = mframe.groupby(level="first").sum()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-2, -1]).sum()
        expected = mframe.sort_index()
        tm.assert_frame_equal(result, expected)

        result = mframe.groupby(level=[-1, "first"]).sum()
        expected = mframe.groupby(level=["second", "first"]).sum()
        tm.assert_frame_equal(result, expected)

    def test_multifunc_select_col_integer_cols(self, df):
        df.columns = np.arange(len(df.columns))

        # it works!
        df.groupby(1, as_index=False)[2].agg({"Q": np.mean})

    def test_multiindex_columns_empty_level(self):
        # an empty string in the second level must still allow selection
        # by the first-level label alone
        lst = [["count", "values"], ["to filter", ""]]
        midx = MultiIndex.from_tuples(lst)

        df = DataFrame([[1, "A"]], columns=midx)

        grouped = df.groupby("to filter").groups
        assert grouped["A"] == [0]

        grouped = df.groupby([("to filter", "")]).groups
        assert grouped["A"] == [0]

        df = DataFrame([[1, "A"], [2, "B"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        assert result == expected

        df = DataFrame([[1, "A"], [2, "A"]], columns=midx)

        expected = df.groupby("to filter").groups
        result = df.groupby([("to filter", "")]).groups
        tm.assert_dict_equal(result, expected)

    def test_groupby_multiindex_tuple(self):
        # GH 17979
        # a bare tuple key selects one MultiIndex column, same as the
        # single-element list form
        df = DataFrame(
            [[1, 2, 3, 4], [3, 4, 5, 6], [1, 4, 2, 3]],
            columns=MultiIndex.from_arrays([["a", "b", "b", "c"], [1, 1, 2, 2]]),
        )
        expected = df.groupby([("b", 1)]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        # same data under different column labels yields the same groups
        df2 = DataFrame(
            df.values,
            columns=MultiIndex.from_arrays(
                [["a", "b", "b", "c"], ["d", "d", "e", "e"]]
            ),
        )
        expected = df2.groupby([("b", "d")]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

        df3 = DataFrame(df.values, columns=[("a", "d"), ("b", "d"), ("b", "e"), "c"])
        expected = df3.groupby([("b", "d")]).groups
        result = df.groupby(("b", 1)).groups
        tm.assert_dict_equal(expected, result)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level(self, sort, mframe, df):
        # GH 17537
        frame = mframe
        deleveled = frame.reset_index()

        result0 = frame.groupby(level=0, sort=sort).sum()
        result1 = frame.groupby(level=1, sort=sort).sum()

        expected0 = frame.groupby(deleveled["first"].values, sort=sort).sum()
        expected1 = frame.groupby(deleveled["second"].values, sort=sort).sum()
        expected0.index.name = "first"
        expected1.index.name = "second"

        assert result0.index.name == "first"
        assert result1.index.name == "second"

        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)
        assert result0.index.name == frame.index.names[0]
        assert result1.index.name == frame.index.names[1]

        # groupby level name
        result0 = frame.groupby(level="first", sort=sort).sum()
        result1 = frame.groupby(level="second", sort=sort).sum()
        tm.assert_frame_equal(result0, expected0)
        tm.assert_frame_equal(result1, expected1)

        # axis=1
        result0 = frame.T.groupby(level=0, axis=1, sort=sort).sum()
        result1 = frame.T.groupby(level=1, axis=1, sort=sort).sum()
        tm.assert_frame_equal(result0, expected0.T)
        tm.assert_frame_equal(result1, expected1.T)

        # raise exception for non-MultiIndex
        msg = "level > 0 or level < -1 only valid with MultiIndex"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level=1)

    def test_groupby_level_index_names(self, axis):
        # GH4014 this used to raise ValueError since 'exp'>1 (in py2)
        df = DataFrame({"exp": ["A"] * 3 + ["B"] * 3, "var1": range(6)}).set_index(
            "exp"
        )
        if axis in (1, "columns"):
            df = df.T
        df.groupby(level="exp", axis=axis)
        msg = f"level name foo is not the name of the {df._get_axis_name(axis)}"
        with pytest.raises(ValueError, match=msg):
            df.groupby(level="foo", axis=axis)

    @pytest.mark.parametrize("sort", [True, False])
    def test_groupby_level_with_nas(self, sort):
        # GH 17537
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, 0, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 22.0], index=[0, 1])
        tm.assert_series_equal(result, expected)

        # a -1 code marks a missing label; those rows are dropped from sums
        index = MultiIndex(
            levels=[[1, 0], [0, 1, 2, 3]],
            codes=[[1, 1, 1, 1, -1, 0, 0, 0], [0, 1, 2, 3, 0, 1, 2, 3]],
        )

        # factorizing doesn't confuse things
        s = Series(np.arange(8.0), index=index)
        result = s.groupby(level=0, sort=sort).sum()
        expected = Series([6.0, 18.0], index=[0.0, 1.0])
        tm.assert_series_equal(result, expected)

    def test_groupby_args(self, mframe):
        # PR8618 and issue 8015
        frame = mframe

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby()

        msg = "You have to supply one of 'by' and 'level'"
        with pytest.raises(TypeError, match=msg):
            frame.groupby(by=None, level=None)

    @pytest.mark.parametrize(
        "sort,labels",
        [
            [True, [2, 2, 2, 0, 0, 1, 1, 3, 3, 3]],
            [False, [0, 0, 0, 1, 1, 2, 2, 3, 3, 3]],
        ],
    )
    def test_level_preserve_order(self, sort, labels, mframe):
        # GH 17537
        # sort=False must keep first-seen order in the computed codes
        grouped = mframe.groupby(level=0, sort=sort)
        exp_labels = np.array(labels, np.intp)
        tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels)

    def test_grouping_labels(self, mframe):
        grouped = mframe.groupby(mframe.index.get_level_values(0))
        exp_labels = np.array([2, 2, 2, 0, 0, 1, 1, 3, 3, 3], dtype=np.intp)
        tm.assert_almost_equal(grouped.grouper.codes[0], exp_labels)

    def test_list_grouper_with_nat(self):
        # GH 14715
        df = DataFrame({"date": date_range("1/1/2011", periods=365, freq="D")})
        df.iloc[-1] = pd.NaT
        grouper = Grouper(key="date", freq="AS")

        # Grouper in a list grouping: NaT row is excluded from the group
        result = df.groupby([grouper])
        expected = {Timestamp("2011-01-01"): Index(list(range(364)))}
        tm.assert_dict_equal(result.groups, expected)

        # Test case without a list
        result = df.groupby(grouper)
        expected = {Timestamp("2011-01-01"): 365}
        tm.assert_dict_equal(result.groups, expected)

    @pytest.mark.parametrize(
        "func,expected",
        [
            (
                "transform",
                Series(name=2, dtype=np.float64),
            ),
            (
                "agg",
                Series(
                    name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
                ),
            ),
            (
                "apply",
                Series(
                    name=2, dtype=np.float64, index=Index([], dtype=np.float64, name=1)
                ),
            ),
        ],
    )
    def test_evaluate_with_empty_groups(self, func, expected):
        # 26208
        # test transform'ing empty groups
        # (not testing other agg fns, because they return
        # different index objects.
        df = DataFrame({1: [], 2: []})
        g = df.groupby(1, group_keys=False)
        result = getattr(g[2], func)(lambda x: x)
        tm.assert_series_equal(result, expected)

    def test_groupby_empty(self):
        # https://github.com/pandas-dev/pandas/issues/27190
        s = Series([], name="name", dtype="float64")
        gr = s.groupby([])

        result = gr.mean()
        expected = s.set_axis(Index([], dtype=np.intp))
        tm.assert_series_equal(result, expected)

        # check group properties
        assert len(gr.grouper.groupings) == 1
        tm.assert_numpy_array_equal(
            gr.grouper.group_info[0], np.array([], dtype=np.dtype(np.intp))
        )

        tm.assert_numpy_array_equal(
            gr.grouper.group_info[1], np.array([], dtype=np.dtype(np.intp))
        )

        assert gr.grouper.group_info[2] == 0

        # check name
        assert s.groupby(s).grouper.names == ["name"]

    def test_groupby_level_index_value_all_na(self):
        # issue 20519
        # rows whose level values are all-NA are dropped entirely
        df = DataFrame(
            [["x", np.nan, 10], [None, np.nan, 20]], columns=["A", "B", "C"]
        ).set_index(["A", "B"])
        result = df.groupby(level=["A", "B"]).sum()
        expected = DataFrame(
            data=[],
            index=MultiIndex(
                levels=[Index(["x"], dtype="object"), Index([], dtype="float64")],
                codes=[[], []],
                names=["A", "B"],
            ),
            columns=["C"],
            dtype="int64",
        )
        tm.assert_frame_equal(result, expected)

    def test_groupby_multiindex_level_empty(self):
        # https://github.com/pandas-dev/pandas/issues/31670
        df = DataFrame(
            [[123, "a", 1.0], [123, "b", 2.0]], columns=["id", "category", "value"]
        )
        df = df.set_index(["id", "category"])
        empty = df[df.value < 0]
        result = empty.groupby("id").sum()
        expected = DataFrame(
            dtype="float64",
            columns=["value"],
            index=Index([], dtype=np.int64, name="id"),
        )
        tm.assert_frame_equal(result, expected)
# get_group
# --------------------------------
class TestGetGroup:
    """Tests for GroupBy.get_group key handling."""

    def test_get_group(self):
        # GH 5267
        # be datelike friendly
        df = DataFrame(
            {
                "DATE": pd.to_datetime(
                    [
                        "10-Oct-2013",
                        "10-Oct-2013",
                        "10-Oct-2013",
                        "11-Oct-2013",
                        "11-Oct-2013",
                        "11-Oct-2013",
                    ]
                ),
                "label": ["foo", "foo", "bar", "foo", "foo", "bar"],
                "VAL": [1, 2, 3, 4, 5, 6],
            }
        )
        g = df.groupby("DATE")
        key = list(g.groups)[0]
        # Timestamp, datetime, and string forms of the key are equivalent
        result1 = g.get_group(key)
        result2 = g.get_group(Timestamp(key).to_pydatetime())
        result3 = g.get_group(str(Timestamp(key)))
        tm.assert_frame_equal(result1, result2)
        tm.assert_frame_equal(result1, result3)

        g = df.groupby(["DATE", "label"])

        key = list(g.groups)[0]
        result1 = g.get_group(key)
        result2 = g.get_group((Timestamp(key[0]).to_pydatetime(), key[1]))
        result3 = g.get_group((str(Timestamp(key[0])), key[1]))
        tm.assert_frame_equal(result1, result2)
        tm.assert_frame_equal(result1, result3)

        # must pass a same-length tuple with multiple keys
        msg = "must supply a tuple to get_group with multiple grouping keys"
        with pytest.raises(ValueError, match=msg):
            g.get_group("foo")
        with pytest.raises(ValueError, match=msg):
            # NOTE(review): duplicate of the previous assertion — confirm
            # whether a different invalid key was intended here
            g.get_group("foo")

        msg = "must supply a same-length tuple to get_group with multiple grouping keys"
        with pytest.raises(ValueError, match=msg):
            g.get_group(("foo", "bar", "baz"))

    def test_get_group_empty_bins(self, observed):
        d = DataFrame([3, 1, 7, 6])
        bins = [0, 5, 10, 15]
        g = d.groupby(pd.cut(d[0], bins), observed=observed)

        # TODO: should prob allow a str of Interval work as well
        # IOW '(0, 5]'
        result = g.get_group(pd.Interval(0, 5))
        expected = DataFrame([3, 1], index=[0, 1])
        tm.assert_frame_equal(result, expected)

        # the (10, 15] bin is empty, so its Interval is not a valid key
        msg = r"Interval\(10, 15, closed='right'\)"
        with pytest.raises(KeyError, match=msg):
            g.get_group(pd.Interval(10, 15))

    def test_get_group_grouped_by_tuple(self):
        # GH 8121
        df = DataFrame([[(1,), (1, 2), (1,), (1, 2)]], index=["ids"]).T
        gr = df.groupby("ids")
        expected = DataFrame({"ids": [(1,), (1,)]}, index=[0, 2])
        result = gr.get_group((1,))
        tm.assert_frame_equal(result, expected)

        # datelike tuple elements may be passed as strings
        dt = pd.to_datetime(["2010-01-01", "2010-01-02", "2010-01-01", "2010-01-02"])
        df = DataFrame({"ids": [(x,) for x in dt]})
        gr = df.groupby("ids")
        result = gr.get_group(("2010-01-01",))
        expected = DataFrame({"ids": [(dt[0],), (dt[0],)]}, index=[0, 2])
        tm.assert_frame_equal(result, expected)

    def test_get_group_grouped_by_tuple_with_lambda(self):
        # GH 36158
        df = DataFrame(
            {"Tuples": ((x, y) for x in [0, 1] for y in np.random.randint(3, 5, 5))}
        )

        gb = df.groupby("Tuples")
        gb_lambda = df.groupby(lambda x: df.iloc[x, 0])

        expected = gb.get_group(list(gb.groups.keys())[0])
        result = gb_lambda.get_group(list(gb_lambda.groups.keys())[0])

        tm.assert_frame_equal(result, expected)

    def test_groupby_with_empty(self):
        # iterating a groupby over an empty series yields nothing
        index = pd.DatetimeIndex(())
        data = ()
        series = Series(data, index, dtype=object)
        grouper = Grouper(freq="D")
        grouped = series.groupby(grouper)
        assert next(iter(grouped), None) is None

    def test_groupby_with_single_column(self):
        df = DataFrame({"a": list("abssbab")})
        tm.assert_frame_equal(df.groupby("a").get_group("a"), df.iloc[[0, 5]])
        # GH 13530
        # aggregating a frame with only the grouping column gives an
        # empty-column result, not an error
        exp = DataFrame(index=Index(["a", "b", "s"], name="a"), columns=[])
        tm.assert_frame_equal(df.groupby("a").count(), exp)
        tm.assert_frame_equal(df.groupby("a").sum(), exp)

        exp = df.iloc[[3, 4, 5]]
        tm.assert_frame_equal(df.groupby("a").nth(1), exp)

    def test_gb_key_len_equal_axis_len(self):
        # GH16843
        # test ensures that index and column keys are recognized correctly
        # when number of keys equals axis length of groupby
        df = DataFrame(
            [["foo", "bar", "B", 1], ["foo", "bar", "B", 2], ["foo", "baz", "C", 3]],
            columns=["first", "second", "third", "one"],
        )
        df = df.set_index(["first", "second"])
        df = df.groupby(["first", "second", "third"]).size()
        assert df.loc[("foo", "bar", "B")] == 2
        assert df.loc[("foo", "baz", "C")] == 1
  728. # groups & iteration
  729. # --------------------------------
class TestIteration:
    """Tests for GroupBy.groups and iteration over a GroupBy object."""

    def test_groups(self, df):
        """GroupBy.groups maps each key to its row labels and is cached."""
        grouped = df.groupby(["A"])
        groups = grouped.groups
        assert groups is grouped.groups  # caching works
        for k, v in grouped.groups.items():
            # every labeled row must carry the group's key value
            assert (df.loc[v]["A"] == k).all()
        grouped = df.groupby(["A", "B"])
        groups = grouped.groups
        assert groups is grouped.groups  # caching works
        for k, v in grouped.groups.items():
            # multi-key groups use tuple keys: (A value, B value)
            assert (df.loc[v]["A"] == k[0]).all()
            assert (df.loc[v]["B"] == k[1]).all()

    def test_grouping_is_iterable(self, tsframe):
        """A Grouping object itself supports iteration."""
        # this code path isn't used anywhere else
        # not sure it's useful
        grouped = tsframe.groupby([lambda x: x.weekday(), lambda x: x.year])
        # test it works
        for g in grouped.grouper.groupings[0]:
            pass

    def test_multi_iter(self):
        """Iterating a two-key Series groupby yields ((k1, k2), subseries) sorted by key."""
        s = Series(np.arange(6))
        k1 = np.array(["a", "a", "a", "b", "b", "b"])
        k2 = np.array(["1", "2", "1", "2", "1", "2"])
        grouped = s.groupby([k1, k2])
        iterated = list(grouped)
        expected = [
            ("a", "1", s[[0, 2]]),
            ("a", "2", s[[1]]),
            ("b", "1", s[[4]]),
            ("b", "2", s[[3, 5]]),
        ]
        for i, ((one, two), three) in enumerate(iterated):
            e1, e2, e3 = expected[i]
            assert e1 == one
            assert e2 == two
            tm.assert_series_equal(three, e3)

    def test_multi_iter_frame(self, three_group):
        """Iterating a two-key frame groupby yields sorted groups and skips empty ones."""
        k1 = np.array(["b", "b", "b", "a", "a", "a"])
        k2 = np.array(["1", "2", "1", "2", "1", "2"])
        df = DataFrame(
            {"v1": np.random.randn(6), "v2": np.random.randn(6), "k1": k1, "k2": k2},
            index=["one", "two", "three", "four", "five", "six"],
        )
        grouped = df.groupby(["k1", "k2"])
        # things get sorted!
        iterated = list(grouped)
        idx = df.index
        expected = [
            ("a", "1", df.loc[idx[[4]]]),
            ("a", "2", df.loc[idx[[3, 5]]]),
            ("b", "1", df.loc[idx[[0, 2]]]),
            ("b", "2", df.loc[idx[[1]]]),
        ]
        for i, ((one, two), three) in enumerate(iterated):
            e1, e2, e3 = expected[i]
            assert e1 == one
            assert e2 == two
            tm.assert_frame_equal(three, e3)
        # don't iterate through groups with no data
        df["k1"] = np.array(["b", "b", "b", "a", "a", "a"])
        df["k2"] = np.array(["1", "1", "1", "2", "2", "2"])
        grouped = df.groupby(["k1", "k2"])
        # calling `dict` on a DataFrameGroupBy leads to a TypeError,
        # we need to use a dictionary comprehension here
        # pylint: disable-next=unnecessary-comprehension
        groups = {key: gp for key, gp in grouped}
        assert len(groups) == 2
        # axis = 1
        three_levels = three_group.groupby(["A", "B", "C"]).mean()
        grouped = three_levels.T.groupby(axis=1, level=(1, 2))
        for key, group in grouped:
            pass

    def test_dictify(self, df):
        """All groupby flavors produce iterables consumable by dict()."""
        dict(iter(df.groupby("A")))
        dict(iter(df.groupby(["A", "B"])))
        dict(iter(df["C"].groupby(df["A"])))
        dict(iter(df["C"].groupby([df["A"], df["B"]])))
        dict(iter(df.groupby("A")["C"]))
        dict(iter(df.groupby(["A", "B"])["C"]))

    def test_groupby_with_small_elem(self):
        """Time Grouper plus column key forms only observed (period, value) groups (GH 8542)."""
        # GH 8542
        # length=2
        df = DataFrame(
            {"event": ["start", "start"], "change": [1234, 5678]},
            index=pd.DatetimeIndex(["2014-09-10", "2013-10-10"]),
        )
        grouped = df.groupby([Grouper(freq="M"), "event"])
        assert len(grouped.groups) == 2
        assert grouped.ngroups == 2
        assert (Timestamp("2014-09-30"), "start") in grouped.groups
        assert (Timestamp("2013-10-31"), "start") in grouped.groups
        res = grouped.get_group((Timestamp("2014-09-30"), "start"))
        tm.assert_frame_equal(res, df.iloc[[0], :])
        res = grouped.get_group((Timestamp("2013-10-31"), "start"))
        tm.assert_frame_equal(res, df.iloc[[1], :])
        # two rows fall in the same month: still only 2 groups
        df = DataFrame(
            {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
            index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-09-15"]),
        )
        grouped = df.groupby([Grouper(freq="M"), "event"])
        assert len(grouped.groups) == 2
        assert grouped.ngroups == 2
        assert (Timestamp("2014-09-30"), "start") in grouped.groups
        assert (Timestamp("2013-10-31"), "start") in grouped.groups
        res = grouped.get_group((Timestamp("2014-09-30"), "start"))
        tm.assert_frame_equal(res, df.iloc[[0, 2], :])
        res = grouped.get_group((Timestamp("2013-10-31"), "start"))
        tm.assert_frame_equal(res, df.iloc[[1], :])
        # length=3
        df = DataFrame(
            {"event": ["start", "start", "start"], "change": [1234, 5678, 9123]},
            index=pd.DatetimeIndex(["2014-09-10", "2013-10-10", "2014-08-05"]),
        )
        grouped = df.groupby([Grouper(freq="M"), "event"])
        assert len(grouped.groups) == 3
        assert grouped.ngroups == 3
        assert (Timestamp("2014-09-30"), "start") in grouped.groups
        assert (Timestamp("2013-10-31"), "start") in grouped.groups
        assert (Timestamp("2014-08-31"), "start") in grouped.groups
        res = grouped.get_group((Timestamp("2014-09-30"), "start"))
        tm.assert_frame_equal(res, df.iloc[[0], :])
        res = grouped.get_group((Timestamp("2013-10-31"), "start"))
        tm.assert_frame_equal(res, df.iloc[[1], :])
        res = grouped.get_group((Timestamp("2014-08-31"), "start"))
        tm.assert_frame_equal(res, df.iloc[[2], :])

    def test_grouping_string_repr(self):
        """Grouping.__repr__ shows the full tuple label of a MultiIndex column (GH 13394)."""
        # GH 13394
        mi = MultiIndex.from_arrays([list("AAB"), list("aba")])
        df = DataFrame([[1, 2, 3]], columns=mi)
        gr = df.groupby(df[("A", "a")])
        result = gr.grouper.groupings[0].__repr__()
        expected = "Grouping(('A', 'a'))"
        assert result == expected
def test_grouping_by_key_is_in_axis():
    """Groupers specified by key are in-axis; level-based ones are not (GH 50413)."""
    # GH#50413 - Groupers specified by key are in-axis
    df = DataFrame({"a": [1, 1, 2], "b": [1, 1, 2], "c": [3, 4, 5]}).set_index("a")
    gb = df.groupby([Grouper(level="a"), Grouper(key="b")], as_index=False)
    assert not gb.grouper.groupings[0].in_axis
    assert gb.grouper.groupings[1].in_axis
    # Currently only in-axis groupings are including in the result when as_index=False;
    # This is likely to change in the future.
    # hence only "b" (the key-based grouping) appears in the output columns
    result = gb.sum()
    expected = DataFrame({"b": [1, 2], "c": [7, 5]})
    tm.assert_frame_equal(result, expected)
def test_grouper_groups():
    """Deprecated Grouper attributes delegate with a FutureWarning (GH 51182).

    Each legacy attribute (groups, grouper, obj, ax, indexer) must warn with
    the documented replacement message rather than raise AttributeError.
    """
    # GH#51182 check Grouper.groups does not raise AttributeError
    df = DataFrame({"a": [1, 2, 3], "b": 1})
    grper = Grouper(key="a")
    gb = df.groupby(grper)
    msg = "Use GroupBy.groups instead"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        res = grper.groups
    # the deprecated accessor must return the very same object as GroupBy
    assert res is gb.groups
    msg = "Use GroupBy.grouper instead"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        res = grper.grouper
    assert res is gb.grouper
    msg = "Grouper.obj is deprecated and will be removed"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        res = grper.obj
    assert res is gb.obj
    msg = "Use Resampler.ax instead"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        grper.ax
    msg = "Grouper.indexer is deprecated"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        grper.indexer