# test_aggregate.py (49 KB)
  1. """
  2. test .agg behavior / note that .apply is tested generally in test_groupby.py
  3. """
  4. import datetime
  5. import functools
  6. from functools import partial
  7. import re
  8. import numpy as np
  9. import pytest
  10. from pandas.errors import SpecificationError
  11. from pandas.core.dtypes.common import is_integer_dtype
  12. import pandas as pd
  13. from pandas import (
  14. DataFrame,
  15. Index,
  16. MultiIndex,
  17. Series,
  18. concat,
  19. to_datetime,
  20. )
  21. import pandas._testing as tm
  22. from pandas.core.groupby.grouper import Grouping
  23. def test_groupby_agg_no_extra_calls():
  24. # GH#31760
  25. df = DataFrame({"key": ["a", "b", "c", "c"], "value": [1, 2, 3, 4]})
  26. gb = df.groupby("key")["value"]
  27. def dummy_func(x):
  28. assert len(x) != 0
  29. return x.sum()
  30. gb.agg(dummy_func)
  31. def test_agg_regression1(tsframe):
  32. grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
  33. result = grouped.agg(np.mean)
  34. expected = grouped.mean()
  35. tm.assert_frame_equal(result, expected)
  36. def test_agg_must_agg(df):
  37. grouped = df.groupby("A")["C"]
  38. msg = "Must produce aggregated value"
  39. with pytest.raises(Exception, match=msg):
  40. grouped.agg(lambda x: x.describe())
  41. with pytest.raises(Exception, match=msg):
  42. grouped.agg(lambda x: x.index[:2])
  43. def test_agg_ser_multi_key(df):
  44. f = lambda x: x.sum()
  45. results = df.C.groupby([df.A, df.B]).aggregate(f)
  46. expected = df.groupby(["A", "B"]).sum()["C"]
  47. tm.assert_series_equal(results, expected)
  48. def test_groupby_aggregation_mixed_dtype():
  49. # GH 6212
  50. expected = DataFrame(
  51. {
  52. "v1": [5, 5, 7, np.nan, 3, 3, 4, 1],
  53. "v2": [55, 55, 77, np.nan, 33, 33, 44, 11],
  54. },
  55. index=MultiIndex.from_tuples(
  56. [
  57. (1, 95),
  58. (1, 99),
  59. (2, 95),
  60. (2, 99),
  61. ("big", "damp"),
  62. ("blue", "dry"),
  63. ("red", "red"),
  64. ("red", "wet"),
  65. ],
  66. names=["by1", "by2"],
  67. ),
  68. )
  69. df = DataFrame(
  70. {
  71. "v1": [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9],
  72. "v2": [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99],
  73. "by1": ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12],
  74. "by2": [
  75. "wet",
  76. "dry",
  77. 99,
  78. 95,
  79. np.nan,
  80. "damp",
  81. 95,
  82. 99,
  83. "red",
  84. 99,
  85. np.nan,
  86. np.nan,
  87. ],
  88. }
  89. )
  90. g = df.groupby(["by1", "by2"])
  91. result = g[["v1", "v2"]].mean()
  92. tm.assert_frame_equal(result, expected)
  93. def test_groupby_aggregation_multi_level_column():
  94. # GH 29772
  95. lst = [
  96. [True, True, True, False],
  97. [True, False, np.nan, False],
  98. [True, True, np.nan, False],
  99. [True, True, np.nan, False],
  100. ]
  101. df = DataFrame(
  102. data=lst,
  103. columns=MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 0), ("B", 1)]),
  104. )
  105. gb = df.groupby(level=1, axis=1)
  106. result = gb.sum(numeric_only=False)
  107. expected = DataFrame({0: [2.0, True, True, True], 1: [1, 0, 1, 1]})
  108. tm.assert_frame_equal(result, expected)
  109. def test_agg_apply_corner(ts, tsframe):
  110. # nothing to group, all NA
  111. grouped = ts.groupby(ts * np.nan, group_keys=False)
  112. assert ts.dtype == np.float64
  113. # groupby float64 values results in a float64 Index
  114. exp = Series([], dtype=np.float64, index=Index([], dtype=np.float64))
  115. tm.assert_series_equal(grouped.sum(), exp)
  116. tm.assert_series_equal(grouped.agg(np.sum), exp)
  117. tm.assert_series_equal(grouped.apply(np.sum), exp, check_index_type=False)
  118. # DataFrame
  119. grouped = tsframe.groupby(tsframe["A"] * np.nan, group_keys=False)
  120. exp_df = DataFrame(
  121. columns=tsframe.columns,
  122. dtype=float,
  123. index=Index([], name="A", dtype=np.float64),
  124. )
  125. tm.assert_frame_equal(grouped.sum(), exp_df)
  126. tm.assert_frame_equal(grouped.agg(np.sum), exp_df)
  127. tm.assert_frame_equal(grouped.apply(np.sum), exp_df)
  128. def test_agg_grouping_is_list_tuple(ts):
  129. df = tm.makeTimeDataFrame()
  130. grouped = df.groupby(lambda x: x.year)
  131. grouper = grouped.grouper.groupings[0].grouping_vector
  132. grouped.grouper.groupings[0] = Grouping(ts.index, list(grouper))
  133. result = grouped.agg(np.mean)
  134. expected = grouped.mean()
  135. tm.assert_frame_equal(result, expected)
  136. grouped.grouper.groupings[0] = Grouping(ts.index, tuple(grouper))
  137. result = grouped.agg(np.mean)
  138. expected = grouped.mean()
  139. tm.assert_frame_equal(result, expected)
  140. def test_agg_python_multiindex(mframe):
  141. grouped = mframe.groupby(["A", "B"])
  142. result = grouped.agg(np.mean)
  143. expected = grouped.mean()
  144. tm.assert_frame_equal(result, expected)
  145. @pytest.mark.parametrize(
  146. "groupbyfunc", [lambda x: x.weekday(), [lambda x: x.month, lambda x: x.weekday()]]
  147. )
  148. def test_aggregate_str_func(tsframe, groupbyfunc):
  149. grouped = tsframe.groupby(groupbyfunc)
  150. # single series
  151. result = grouped["A"].agg("std")
  152. expected = grouped["A"].std()
  153. tm.assert_series_equal(result, expected)
  154. # group frame by function name
  155. result = grouped.aggregate("var")
  156. expected = grouped.var()
  157. tm.assert_frame_equal(result, expected)
  158. # group frame by function dict
  159. result = grouped.agg({"A": "var", "B": "std", "C": "mean", "D": "sem"})
  160. expected = DataFrame(
  161. {
  162. "A": grouped["A"].var(),
  163. "B": grouped["B"].std(),
  164. "C": grouped["C"].mean(),
  165. "D": grouped["D"].sem(),
  166. }
  167. )
  168. tm.assert_frame_equal(result, expected)
  169. def test_std_masked_dtype(any_numeric_ea_dtype):
  170. # GH#35516
  171. df = DataFrame(
  172. {
  173. "a": [2, 1, 1, 1, 2, 2, 1],
  174. "b": Series([pd.NA, 1, 2, 1, 1, 1, 2], dtype="Float64"),
  175. }
  176. )
  177. result = df.groupby("a").std()
  178. expected = DataFrame(
  179. {"b": [0.57735, 0]}, index=Index([1, 2], name="a"), dtype="Float64"
  180. )
  181. tm.assert_frame_equal(result, expected)
  182. def test_agg_str_with_kwarg_axis_1_raises(df, reduction_func):
  183. gb = df.groupby(level=0)
  184. if reduction_func in ("idxmax", "idxmin"):
  185. error = TypeError
  186. msg = "reduction operation '.*' not allowed for this dtype"
  187. else:
  188. error = ValueError
  189. msg = f"Operation {reduction_func} does not support axis=1"
  190. with pytest.raises(error, match=msg):
  191. gb.agg(reduction_func, axis=1)
  192. @pytest.mark.parametrize(
  193. "func, expected, dtype, result_dtype_dict",
  194. [
  195. ("sum", [5, 7, 9], "int64", {}),
  196. ("std", [4.5**0.5] * 3, int, {"i": float, "j": float, "k": float}),
  197. ("var", [4.5] * 3, int, {"i": float, "j": float, "k": float}),
  198. ("sum", [5, 7, 9], "Int64", {"j": "int64"}),
  199. ("std", [4.5**0.5] * 3, "Int64", {"i": float, "j": float, "k": float}),
  200. ("var", [4.5] * 3, "Int64", {"i": "float64", "j": "float64", "k": "float64"}),
  201. ],
  202. )
  203. def test_multiindex_groupby_mixed_cols_axis1(func, expected, dtype, result_dtype_dict):
  204. # GH#43209
  205. df = DataFrame(
  206. [[1, 2, 3, 4, 5, 6]] * 3,
  207. columns=MultiIndex.from_product([["a", "b"], ["i", "j", "k"]]),
  208. ).astype({("a", "j"): dtype, ("b", "j"): dtype})
  209. result = df.groupby(level=1, axis=1).agg(func)
  210. expected = DataFrame([expected] * 3, columns=["i", "j", "k"]).astype(
  211. result_dtype_dict
  212. )
  213. tm.assert_frame_equal(result, expected)
  214. @pytest.mark.parametrize(
  215. "func, expected_data, result_dtype_dict",
  216. [
  217. ("sum", [[2, 4], [10, 12], [18, 20]], {10: "int64", 20: "int64"}),
  218. # std should ideally return Int64 / Float64 #43330
  219. ("std", [[2**0.5] * 2] * 3, "float64"),
  220. ("var", [[2] * 2] * 3, {10: "float64", 20: "float64"}),
  221. ],
  222. )
  223. def test_groupby_mixed_cols_axis1(func, expected_data, result_dtype_dict):
  224. # GH#43209
  225. df = DataFrame(
  226. np.arange(12).reshape(3, 4),
  227. index=Index([0, 1, 0], name="y"),
  228. columns=Index([10, 20, 10, 20], name="x"),
  229. dtype="int64",
  230. ).astype({10: "Int64"})
  231. result = df.groupby("x", axis=1).agg(func)
  232. expected = DataFrame(
  233. data=expected_data,
  234. index=Index([0, 1, 0], name="y"),
  235. columns=Index([10, 20], name="x"),
  236. ).astype(result_dtype_dict)
  237. tm.assert_frame_equal(result, expected)
  238. def test_aggregate_item_by_item(df):
  239. grouped = df.groupby("A")
  240. aggfun_0 = lambda ser: ser.size
  241. result = grouped.agg(aggfun_0)
  242. foosum = (df.A == "foo").sum()
  243. barsum = (df.A == "bar").sum()
  244. K = len(result.columns)
  245. # GH5782
  246. exp = Series(np.array([foosum] * K), index=list("BCD"), name="foo")
  247. tm.assert_series_equal(result.xs("foo"), exp)
  248. exp = Series(np.array([barsum] * K), index=list("BCD"), name="bar")
  249. tm.assert_almost_equal(result.xs("bar"), exp)
  250. def aggfun_1(ser):
  251. return ser.size
  252. result = DataFrame().groupby(df.A).agg(aggfun_1)
  253. assert isinstance(result, DataFrame)
  254. assert len(result) == 0
  255. def test_wrap_agg_out(three_group):
  256. grouped = three_group.groupby(["A", "B"])
  257. def func(ser):
  258. if ser.dtype == object:
  259. raise TypeError("Test error message")
  260. return ser.sum()
  261. with pytest.raises(TypeError, match="Test error message"):
  262. grouped.aggregate(func)
  263. result = grouped[[c for c in three_group if c != "C"]].aggregate(func)
  264. exp_grouped = three_group.loc[:, three_group.columns != "C"]
  265. expected = exp_grouped.groupby(["A", "B"]).aggregate(func)
  266. tm.assert_frame_equal(result, expected)
  267. def test_agg_multiple_functions_maintain_order(df):
  268. # GH #610
  269. funcs = [("mean", np.mean), ("max", np.max), ("min", np.min)]
  270. result = df.groupby("A")["C"].agg(funcs)
  271. exp_cols = Index(["mean", "max", "min"])
  272. tm.assert_index_equal(result.columns, exp_cols)
  273. def test_agg_multiple_functions_same_name():
  274. # GH 30880
  275. df = DataFrame(
  276. np.random.randn(1000, 3),
  277. index=pd.date_range("1/1/2012", freq="S", periods=1000),
  278. columns=["A", "B", "C"],
  279. )
  280. result = df.resample("3T").agg(
  281. {"A": [partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
  282. )
  283. expected_index = pd.date_range("1/1/2012", freq="3T", periods=6)
  284. expected_columns = MultiIndex.from_tuples([("A", "quantile"), ("A", "quantile")])
  285. expected_values = np.array(
  286. [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]]
  287. ).T
  288. expected = DataFrame(
  289. expected_values, columns=expected_columns, index=expected_index
  290. )
  291. tm.assert_frame_equal(result, expected)
  292. def test_agg_multiple_functions_same_name_with_ohlc_present():
  293. # GH 30880
  294. # ohlc expands dimensions, so different test to the above is required.
  295. df = DataFrame(
  296. np.random.randn(1000, 3),
  297. index=pd.date_range("1/1/2012", freq="S", periods=1000, name="dti"),
  298. columns=Index(["A", "B", "C"], name="alpha"),
  299. )
  300. result = df.resample("3T").agg(
  301. {"A": ["ohlc", partial(np.quantile, q=0.9999), partial(np.quantile, q=0.1111)]}
  302. )
  303. expected_index = pd.date_range("1/1/2012", freq="3T", periods=6, name="dti")
  304. expected_columns = MultiIndex.from_tuples(
  305. [
  306. ("A", "ohlc", "open"),
  307. ("A", "ohlc", "high"),
  308. ("A", "ohlc", "low"),
  309. ("A", "ohlc", "close"),
  310. ("A", "quantile", "A"),
  311. ("A", "quantile", "A"),
  312. ],
  313. names=["alpha", None, None],
  314. )
  315. non_ohlc_expected_values = np.array(
  316. [df.resample("3T").A.quantile(q=q).values for q in [0.9999, 0.1111]]
  317. ).T
  318. expected_values = np.hstack([df.resample("3T").A.ohlc(), non_ohlc_expected_values])
  319. expected = DataFrame(
  320. expected_values, columns=expected_columns, index=expected_index
  321. )
  322. tm.assert_frame_equal(result, expected)
  323. def test_multiple_functions_tuples_and_non_tuples(df):
  324. # #1359
  325. # Columns B and C would cause partial failure
  326. df = df.drop(columns=["B", "C"])
  327. funcs = [("foo", "mean"), "std"]
  328. ex_funcs = [("foo", "mean"), ("std", "std")]
  329. result = df.groupby("A")["D"].agg(funcs)
  330. expected = df.groupby("A")["D"].agg(ex_funcs)
  331. tm.assert_frame_equal(result, expected)
  332. result = df.groupby("A").agg(funcs)
  333. expected = df.groupby("A").agg(ex_funcs)
  334. tm.assert_frame_equal(result, expected)
  335. def test_more_flexible_frame_multi_function(df):
  336. grouped = df.groupby("A")
  337. exmean = grouped.agg({"C": np.mean, "D": np.mean})
  338. exstd = grouped.agg({"C": np.std, "D": np.std})
  339. expected = concat([exmean, exstd], keys=["mean", "std"], axis=1)
  340. expected = expected.swaplevel(0, 1, axis=1).sort_index(level=0, axis=1)
  341. d = {"C": [np.mean, np.std], "D": [np.mean, np.std]}
  342. result = grouped.aggregate(d)
  343. tm.assert_frame_equal(result, expected)
  344. # be careful
  345. result = grouped.aggregate({"C": np.mean, "D": [np.mean, np.std]})
  346. expected = grouped.aggregate({"C": np.mean, "D": [np.mean, np.std]})
  347. tm.assert_frame_equal(result, expected)
  348. def numpymean(x):
  349. return np.mean(x)
  350. def numpystd(x):
  351. return np.std(x, ddof=1)
  352. # this uses column selection & renaming
  353. msg = r"nested renamer is not supported"
  354. with pytest.raises(SpecificationError, match=msg):
  355. d = {"C": np.mean, "D": {"foo": np.mean, "bar": np.std}}
  356. grouped.aggregate(d)
  357. # But without renaming, these functions are OK
  358. d = {"C": [np.mean], "D": [numpymean, numpystd]}
  359. grouped.aggregate(d)
  360. def test_multi_function_flexible_mix(df):
  361. # GH #1268
  362. grouped = df.groupby("A")
  363. # Expected
  364. d = {"C": {"foo": "mean", "bar": "std"}, "D": {"sum": "sum"}}
  365. # this uses column selection & renaming
  366. msg = r"nested renamer is not supported"
  367. with pytest.raises(SpecificationError, match=msg):
  368. grouped.aggregate(d)
  369. # Test 1
  370. d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"}
  371. # this uses column selection & renaming
  372. with pytest.raises(SpecificationError, match=msg):
  373. grouped.aggregate(d)
  374. # Test 2
  375. d = {"C": {"foo": "mean", "bar": "std"}, "D": "sum"}
  376. # this uses column selection & renaming
  377. with pytest.raises(SpecificationError, match=msg):
  378. grouped.aggregate(d)
  379. def test_groupby_agg_coercing_bools():
  380. # issue 14873
  381. dat = DataFrame({"a": [1, 1, 2, 2], "b": [0, 1, 2, 3], "c": [None, None, 1, 1]})
  382. gp = dat.groupby("a")
  383. index = Index([1, 2], name="a")
  384. result = gp["b"].aggregate(lambda x: (x != 0).all())
  385. expected = Series([False, True], index=index, name="b")
  386. tm.assert_series_equal(result, expected)
  387. result = gp["c"].aggregate(lambda x: x.isnull().all())
  388. expected = Series([True, False], index=index, name="c")
  389. tm.assert_series_equal(result, expected)
  390. def test_groupby_agg_dict_with_getitem():
  391. # issue 25471
  392. dat = DataFrame({"A": ["A", "A", "B", "B", "B"], "B": [1, 2, 1, 1, 2]})
  393. result = dat.groupby("A")[["B"]].agg({"B": "sum"})
  394. expected = DataFrame({"B": [3, 4]}, index=["A", "B"]).rename_axis("A", axis=0)
  395. tm.assert_frame_equal(result, expected)
  396. @pytest.mark.parametrize(
  397. "op",
  398. [
  399. lambda x: x.sum(),
  400. lambda x: x.cumsum(),
  401. lambda x: x.transform("sum"),
  402. lambda x: x.transform("cumsum"),
  403. lambda x: x.agg("sum"),
  404. lambda x: x.agg("cumsum"),
  405. ],
  406. )
  407. def test_bool_agg_dtype(op):
  408. # GH 7001
  409. # Bool sum aggregations result in int
  410. df = DataFrame({"a": [1, 1], "b": [False, True]})
  411. s = df.set_index("a")["b"]
  412. result = op(df.groupby("a"))["b"].dtype
  413. assert is_integer_dtype(result)
  414. result = op(s.groupby("a")).dtype
  415. assert is_integer_dtype(result)
  416. @pytest.mark.parametrize(
  417. "keys, agg_index",
  418. [
  419. (["a"], Index([1], name="a")),
  420. (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
  421. ],
  422. )
  423. @pytest.mark.parametrize(
  424. "input_dtype", ["bool", "int32", "int64", "float32", "float64"]
  425. )
  426. @pytest.mark.parametrize(
  427. "result_dtype", ["bool", "int32", "int64", "float32", "float64"]
  428. )
  429. @pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
  430. def test_callable_result_dtype_frame(
  431. keys, agg_index, input_dtype, result_dtype, method
  432. ):
  433. # GH 21240
  434. df = DataFrame({"a": [1], "b": [2], "c": [True]})
  435. df["c"] = df["c"].astype(input_dtype)
  436. op = getattr(df.groupby(keys)[["c"]], method)
  437. result = op(lambda x: x.astype(result_dtype).iloc[0])
  438. expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
  439. expected = DataFrame({"c": [df["c"].iloc[0]]}, index=expected_index).astype(
  440. result_dtype
  441. )
  442. if method == "apply":
  443. expected.columns.names = [0]
  444. tm.assert_frame_equal(result, expected)
  445. @pytest.mark.parametrize(
  446. "keys, agg_index",
  447. [
  448. (["a"], Index([1], name="a")),
  449. (["a", "b"], MultiIndex([[1], [2]], [[0], [0]], names=["a", "b"])),
  450. ],
  451. )
  452. @pytest.mark.parametrize("input", [True, 1, 1.0])
  453. @pytest.mark.parametrize("dtype", [bool, int, float])
  454. @pytest.mark.parametrize("method", ["apply", "aggregate", "transform"])
  455. def test_callable_result_dtype_series(keys, agg_index, input, dtype, method):
  456. # GH 21240
  457. df = DataFrame({"a": [1], "b": [2], "c": [input]})
  458. op = getattr(df.groupby(keys)["c"], method)
  459. result = op(lambda x: x.astype(dtype).iloc[0])
  460. expected_index = pd.RangeIndex(0, 1) if method == "transform" else agg_index
  461. expected = Series([df["c"].iloc[0]], index=expected_index, name="c").astype(dtype)
  462. tm.assert_series_equal(result, expected)
  463. def test_order_aggregate_multiple_funcs():
  464. # GH 25692
  465. df = DataFrame({"A": [1, 1, 2, 2], "B": [1, 2, 3, 4]})
  466. res = df.groupby("A").agg(["sum", "max", "mean", "ohlc", "min"])
  467. result = res.columns.levels[1]
  468. expected = Index(["sum", "max", "mean", "ohlc", "min"])
  469. tm.assert_index_equal(result, expected)
  470. def test_ohlc_ea_dtypes(any_numeric_ea_dtype):
  471. # GH#37493
  472. df = DataFrame(
  473. {"a": [1, 1, 2, 3, 4, 4], "b": [22, 11, pd.NA, 10, 20, pd.NA]},
  474. dtype=any_numeric_ea_dtype,
  475. )
  476. gb = df.groupby("a")
  477. result = gb.ohlc()
  478. expected = DataFrame(
  479. [[22, 22, 11, 11], [pd.NA] * 4, [10] * 4, [20] * 4],
  480. columns=MultiIndex.from_product([["b"], ["open", "high", "low", "close"]]),
  481. index=Index([1, 2, 3, 4], dtype=any_numeric_ea_dtype, name="a"),
  482. dtype=any_numeric_ea_dtype,
  483. )
  484. tm.assert_frame_equal(result, expected)
  485. gb2 = df.groupby("a", as_index=False)
  486. result2 = gb2.ohlc()
  487. expected2 = expected.reset_index()
  488. tm.assert_frame_equal(result2, expected2)
  489. @pytest.mark.parametrize("dtype", [np.int64, np.uint64])
  490. @pytest.mark.parametrize("how", ["first", "last", "min", "max", "mean", "median"])
  491. def test_uint64_type_handling(dtype, how):
  492. # GH 26310
  493. df = DataFrame({"x": 6903052872240755750, "y": [1, 2]})
  494. expected = df.groupby("y").agg({"x": how})
  495. df.x = df.x.astype(dtype)
  496. result = df.groupby("y").agg({"x": how})
  497. if how not in ("mean", "median"):
  498. # mean and median always result in floats
  499. result.x = result.x.astype(np.int64)
  500. tm.assert_frame_equal(result, expected, check_exact=True)
  501. def test_func_duplicates_raises():
  502. # GH28426
  503. msg = "Function names"
  504. df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
  505. with pytest.raises(SpecificationError, match=msg):
  506. df.groupby("A").agg(["min", "min"])
  507. @pytest.mark.parametrize(
  508. "index",
  509. [
  510. pd.CategoricalIndex(list("abc")),
  511. pd.interval_range(0, 3),
  512. pd.period_range("2020", periods=3, freq="D"),
  513. MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
  514. ],
  515. )
  516. def test_agg_index_has_complex_internals(index):
  517. # GH 31223
  518. df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
  519. result = df.groupby("group").agg({"value": Series.nunique})
  520. expected = DataFrame({"group": [1, 2], "value": [2, 1]}).set_index("group")
  521. tm.assert_frame_equal(result, expected)
  522. def test_agg_split_block():
  523. # https://github.com/pandas-dev/pandas/issues/31522
  524. df = DataFrame(
  525. {
  526. "key1": ["a", "a", "b", "b", "a"],
  527. "key2": ["one", "two", "one", "two", "one"],
  528. "key3": ["three", "three", "three", "six", "six"],
  529. }
  530. )
  531. result = df.groupby("key1").min()
  532. expected = DataFrame(
  533. {"key2": ["one", "one"], "key3": ["six", "six"]},
  534. index=Index(["a", "b"], name="key1"),
  535. )
  536. tm.assert_frame_equal(result, expected)
  537. def test_agg_split_object_part_datetime():
  538. # https://github.com/pandas-dev/pandas/pull/31616
  539. df = DataFrame(
  540. {
  541. "A": pd.date_range("2000", periods=4),
  542. "B": ["a", "b", "c", "d"],
  543. "C": [1, 2, 3, 4],
  544. "D": ["b", "c", "d", "e"],
  545. "E": pd.date_range("2000", periods=4),
  546. "F": [1, 2, 3, 4],
  547. }
  548. ).astype(object)
  549. result = df.groupby([0, 0, 0, 0]).min()
  550. expected = DataFrame(
  551. {
  552. "A": [pd.Timestamp("2000")],
  553. "B": ["a"],
  554. "C": [1],
  555. "D": ["b"],
  556. "E": [pd.Timestamp("2000")],
  557. "F": [1],
  558. },
  559. index=np.array([0]),
  560. dtype=object,
  561. )
  562. tm.assert_frame_equal(result, expected)
  563. class TestNamedAggregationSeries:
  564. def test_series_named_agg(self):
  565. df = Series([1, 2, 3, 4])
  566. gr = df.groupby([0, 0, 1, 1])
  567. result = gr.agg(a="sum", b="min")
  568. expected = DataFrame(
  569. {"a": [3, 7], "b": [1, 3]}, columns=["a", "b"], index=np.array([0, 1])
  570. )
  571. tm.assert_frame_equal(result, expected)
  572. result = gr.agg(b="min", a="sum")
  573. expected = expected[["b", "a"]]
  574. tm.assert_frame_equal(result, expected)
  575. def test_no_args_raises(self):
  576. gr = Series([1, 2]).groupby([0, 1])
  577. with pytest.raises(TypeError, match="Must provide"):
  578. gr.agg()
  579. # but we do allow this
  580. result = gr.agg([])
  581. expected = DataFrame(columns=[])
  582. tm.assert_frame_equal(result, expected)
  583. def test_series_named_agg_duplicates_no_raises(self):
  584. # GH28426
  585. gr = Series([1, 2, 3]).groupby([0, 0, 1])
  586. grouped = gr.agg(a="sum", b="sum")
  587. expected = DataFrame({"a": [3, 3], "b": [3, 3]}, index=np.array([0, 1]))
  588. tm.assert_frame_equal(expected, grouped)
  589. def test_mangled(self):
  590. gr = Series([1, 2, 3]).groupby([0, 0, 1])
  591. result = gr.agg(a=lambda x: 0, b=lambda x: 1)
  592. expected = DataFrame({"a": [0, 0], "b": [1, 1]}, index=np.array([0, 1]))
  593. tm.assert_frame_equal(result, expected)
  594. @pytest.mark.parametrize(
  595. "inp",
  596. [
  597. pd.NamedAgg(column="anything", aggfunc="min"),
  598. ("anything", "min"),
  599. ["anything", "min"],
  600. ],
  601. )
  602. def test_named_agg_nametuple(self, inp):
  603. # GH34422
  604. s = Series([1, 1, 2, 2, 3, 3, 4, 5])
  605. msg = f"func is expected but received {type(inp).__name__}"
  606. with pytest.raises(TypeError, match=msg):
  607. s.groupby(s.values).agg(a=inp)
  608. class TestNamedAggregationDataFrame:
  609. def test_agg_relabel(self):
  610. df = DataFrame(
  611. {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
  612. )
  613. result = df.groupby("group").agg(a_max=("A", "max"), b_max=("B", "max"))
  614. expected = DataFrame(
  615. {"a_max": [1, 3], "b_max": [6, 8]},
  616. index=Index(["a", "b"], name="group"),
  617. columns=["a_max", "b_max"],
  618. )
  619. tm.assert_frame_equal(result, expected)
  620. # order invariance
  621. p98 = functools.partial(np.percentile, q=98)
  622. result = df.groupby("group").agg(
  623. b_min=("B", "min"),
  624. a_min=("A", min),
  625. a_mean=("A", np.mean),
  626. a_max=("A", "max"),
  627. b_max=("B", "max"),
  628. a_98=("A", p98),
  629. )
  630. expected = DataFrame(
  631. {
  632. "b_min": [5, 7],
  633. "a_min": [0, 2],
  634. "a_mean": [0.5, 2.5],
  635. "a_max": [1, 3],
  636. "b_max": [6, 8],
  637. "a_98": [0.98, 2.98],
  638. },
  639. index=Index(["a", "b"], name="group"),
  640. columns=["b_min", "a_min", "a_mean", "a_max", "b_max", "a_98"],
  641. )
  642. tm.assert_frame_equal(result, expected)
  643. def test_agg_relabel_non_identifier(self):
  644. df = DataFrame(
  645. {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
  646. )
  647. result = df.groupby("group").agg(**{"my col": ("A", "max")})
  648. expected = DataFrame({"my col": [1, 3]}, index=Index(["a", "b"], name="group"))
  649. tm.assert_frame_equal(result, expected)
  650. def test_duplicate_no_raises(self):
  651. # GH 28426, if use same input function on same column,
  652. # no error should raise
  653. df = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})
  654. grouped = df.groupby("A").agg(a=("B", "min"), b=("B", "min"))
  655. expected = DataFrame({"a": [1, 3], "b": [1, 3]}, index=Index([0, 1], name="A"))
  656. tm.assert_frame_equal(grouped, expected)
  657. quant50 = functools.partial(np.percentile, q=50)
  658. quant70 = functools.partial(np.percentile, q=70)
  659. quant50.__name__ = "quant50"
  660. quant70.__name__ = "quant70"
  661. test = DataFrame({"col1": ["a", "a", "b", "b", "b"], "col2": [1, 2, 3, 4, 5]})
  662. grouped = test.groupby("col1").agg(
  663. quantile_50=("col2", quant50), quantile_70=("col2", quant70)
  664. )
  665. expected = DataFrame(
  666. {"quantile_50": [1.5, 4.0], "quantile_70": [1.7, 4.4]},
  667. index=Index(["a", "b"], name="col1"),
  668. )
  669. tm.assert_frame_equal(grouped, expected)
  670. def test_agg_relabel_with_level(self):
  671. df = DataFrame(
  672. {"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]},
  673. index=MultiIndex.from_product([["A", "B"], ["a", "b"]]),
  674. )
  675. result = df.groupby(level=0).agg(
  676. aa=("A", "max"), bb=("A", "min"), cc=("B", "mean")
  677. )
  678. expected = DataFrame(
  679. {"aa": [0, 1], "bb": [0, 1], "cc": [1.5, 3.5]}, index=["A", "B"]
  680. )
  681. tm.assert_frame_equal(result, expected)
  def test_agg_relabel_other_raises(self):
      # Each keyword set below must be rejected with a TypeError whose
      # message matches "Must provide".
      grouped = DataFrame({"A": [0, 0, 1], "B": [1, 2, 3]}).groupby("A")
      match = "Must provide"

      invalid_kwargs = [
          {"foo": 1},  # value is not a (column, aggfunc) pair
          {},  # nothing supplied at all
          {"a": ("B", "max"), "b": (1, 2, 3)},  # second value has three items
      ]
      for kwargs in invalid_kwargs:
          with pytest.raises(TypeError, match=match):
              grouped.agg(**kwargs)
  def test_missing_raises(self):
      # Aggregating a column that is not in the frame raises KeyError.
      frame = DataFrame({"A": [0, 1], "B": [1, 2]})
      # The message contains regex metacharacters, hence re.escape.
      expected_message = re.escape("Column(s) ['C'] do not exist")
      with pytest.raises(KeyError, match=expected_message):
          frame.groupby("A").agg(c=("C", "sum"))
  def test_agg_namedtuple(self):
      # pd.NamedAgg (positional or keyword form) gives the same result as
      # the equivalent plain (column, aggfunc) tuples.
      frame = DataFrame({"A": [0, 1], "B": [1, 2]})

      sum_of_b = pd.NamedAgg("B", "sum")
      count_of_b = pd.NamedAgg(column="B", aggfunc="count")
      result = frame.groupby("A").agg(b=sum_of_b, c=count_of_b)

      expected = frame.groupby("A").agg(b=("B", "sum"), c=("B", "count"))
      tm.assert_frame_equal(result, expected)
  def test_mangled(self):
      # Two distinct lambdas used in named aggregation, one per output label.
      frame = DataFrame({"A": [0, 1], "B": [1, 2], "C": [3, 4]})

      result = frame.groupby("A").agg(
          b=("B", lambda x: 0),
          c=("C", lambda x: 1),
      )

      group_index = Index([0, 1], name="A")
      expected = DataFrame({"b": [0, 0], "c": [1, 1]}, index=group_index)
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3",
      [
          (
              (("y", "A"), "max"),
              (("y", "A"), np.min),
              (("y", "B"), "mean"),
              [1, 3],
              [0, 2],
              [5.5, 7.5],
          ),
          (
              (("y", "A"), lambda x: max(x)),
              (("y", "A"), lambda x: 1),
              (("y", "B"), "mean"),
              [1, 3],
              [1, 1],
              [5.5, 7.5],
          ),
          (
              pd.NamedAgg(("y", "A"), "max"),
              pd.NamedAgg(("y", "B"), np.mean),
              pd.NamedAgg(("y", "A"), lambda x: 1),
              [1, 3],
              [5.5, 7.5],
              [1, 1],
          ),
      ],
  )
  def test_agg_relabel_multiindex_column(
      agg_col1, agg_col2, agg_col3, agg_result1, agg_result2, agg_result3
  ):
      # GH 29422: named aggregation where the source columns are tuples of a
      # MultiIndex.
      group_key = ("x", "group")
      frame = DataFrame(
          {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
      )
      frame.columns = MultiIndex.from_tuples([group_key, ("y", "A"), ("y", "B")])
      expected_index = Index(["a", "b"], name=group_key)

      # One relabelled aggregation.
      single = frame.groupby(group_key).agg(a_max=(("y", "A"), "max"))
      expected_single = DataFrame({"a_max": [1, 3]}, index=expected_index)
      tm.assert_frame_equal(single, expected_single)

      # Three relabelled aggregations taken from the parametrization.
      combined = frame.groupby(group_key).agg(
          col_1=agg_col1, col_2=agg_col2, col_3=agg_col3
      )
      expected_combined = DataFrame(
          {"col_1": agg_result1, "col_2": agg_result2, "col_3": agg_result3},
          index=expected_index,
      )
      tm.assert_frame_equal(combined, expected_combined)
  def test_agg_relabel_multiindex_raises_not_exist():
      # GH 29422: a column tuple that is absent from the MultiIndex columns
      # must raise KeyError.
      group_key = ("x", "group")
      frame = DataFrame(
          {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
      )
      frame.columns = MultiIndex.from_tuples([group_key, ("y", "A"), ("y", "B")])

      # ("Y", "a") differs from the existing ("y", "A") only in letter case.
      missing_column = ("Y", "a")
      with pytest.raises(KeyError, match="do not exist"):
          frame.groupby(group_key).agg(a=(missing_column, "max"))
  def test_agg_relabel_multiindex_duplicates():
      # GH 29422 / GH 28426: the same (column, function) pair may appear more
      # than once as long as the output labels differ.
      group_key = ("x", "group")
      frame = DataFrame(
          {"group": ["a", "a", "b", "b"], "A": [0, 1, 2, 3], "B": [5, 6, 7, 8]}
      )
      frame.columns = MultiIndex.from_tuples([group_key, ("y", "A"), ("y", "B")])

      min_of_a = (("y", "A"), "min")
      result = frame.groupby(group_key).agg(a=min_of_a, b=min_of_a)

      expected_index = Index(["a", "b"], name=group_key)
      expected = DataFrame({"a": [0, 2], "b": [0, 2]}, index=expected_index)
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("kwargs", [{"c": ["min"]}, {"b": [], "c": ["min"]}])
  def test_groupby_aggregate_empty_key(kwargs):
      # GH: 32580 - both parametrized specs are expected to give the same
      # frame, i.e. the empty list for "b" adds no output column.
      frame = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})

      result = frame.groupby("a").agg(kwargs)

      expected_index = Index([1, 2], dtype="int64", name="a")
      expected_columns = MultiIndex.from_tuples([["c", "min"]])
      expected = DataFrame([1, 4], index=expected_index, columns=expected_columns)
      tm.assert_frame_equal(result, expected)
  def test_groupby_aggregate_empty_key_empty_return():
      # GH: 32580 - an agg spec whose only entry is an empty function list
      # must still work and give an empty frame.
      frame = DataFrame({"a": [1, 1, 2], "b": [1, 2, 3], "c": [1, 2, 4]})

      result = frame.groupby("a").agg({"b": []})

      empty_columns = MultiIndex(levels=[["b"], []], codes=[[], []])
      expected = DataFrame(columns=empty_columns)
      tm.assert_frame_equal(result, expected)
  def test_groupby_aggregate_empty_with_multiindex_frame():
      # GH 39178 - named aggregation on an empty frame grouped by two keys.
      frame = DataFrame(columns=["a", "b", "c"])
      grouped = frame.groupby(["a", "b"], group_keys=False)

      result = grouped.agg(d=("c", list))

      empty_index = MultiIndex(levels=[[], []], codes=[[], []], names=["a", "b"])
      expected = DataFrame(columns=["d"], index=empty_index)
      tm.assert_frame_equal(result, expected)
  def test_grouby_agg_loses_results_with_as_index_false_relabel():
      # GH 32240: relabelling aggregation combined with as_index=False used
      # to drop the results; both the key and the relabelled column must be
      # present in the output.
      keys = ["x", "y", "z", "x", "y", "z"]
      values = [1.0, 0.8, 2.0, 3.0, 3.6, 0.75]
      frame = DataFrame({"key": keys, "val": values})

      min_val = pd.NamedAgg(column="val", aggfunc="min")
      result = frame.groupby("key", as_index=False).agg(min_val=min_val)

      expected = DataFrame({"key": ["x", "y", "z"], "min_val": [1.0, 0.8, 0.75]})
      tm.assert_frame_equal(result, expected)
  def test_grouby_agg_loses_results_with_as_index_false_relabel_multiindex():
      # GH 32240: same scenario as the single-key case but grouping by two
      # keys; additionally checks that the key columns come back in the
      # right order.
      frame = DataFrame(
          {
              "key": ["x", "y", "x", "y", "x", "x"],
              "key1": ["a", "b", "c", "b", "a", "c"],
              "val": [1.0, 0.8, 2.0, 3.0, 3.6, 0.75],
          }
      )

      min_val = pd.NamedAgg(column="val", aggfunc="min")
      result = frame.groupby(["key", "key1"], as_index=False).agg(min_val=min_val)

      expected = DataFrame(
          {
              "key": ["x", "x", "y"],
              "key1": ["a", "c", "b"],
              "min_val": [1.0, 0.75, 0.8],
          }
      )
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "func", [lambda s: s.mean(), lambda s: np.mean(s), lambda s: np.nanmean(s)]
  )
  def test_multiindex_custom_func(func):
      # GH 31777 - a user-defined mean applied to a frame whose columns are
      # a named MultiIndex.
      column_index = MultiIndex.from_arrays(
          [[1, 1, 2], [3, 4, 3]], names=["Sisko", "Janeway"]
      )
      frame = DataFrame([[1, 4, 2], [5, 7, 1]], columns=column_index)

      # Each row is its own group, so the mean just turns it into floats.
      result = frame.groupby(np.array([0, 1])).agg(func)

      expected_dict = {
          (1, 3): {0: 1.0, 1: 5.0},
          (1, 4): {0: 4.0, 1: 7.0},
          (2, 3): {0: 2.0, 1: 1.0},
      }
      expected = DataFrame(
          expected_dict, index=np.array([0, 1]), columns=frame.columns
      )
      tm.assert_frame_equal(result, expected)
  def myfunc(s):
      # np.percentile takes q on a 0-100 scale, so q=0.90 selects the 0.9th
      # percentile (not the 90th).
      percentile_value = np.percentile(s, q=0.90)
      return percentile_value
  @pytest.mark.parametrize("func", [lambda s: np.percentile(s, q=0.90), myfunc])
  def test_lambda_named_agg(func):
      # see gh-28467 - a lambda and a named function computing the same
      # percentile must give the same named-aggregation result.
      animals = DataFrame(
          {
              "kind": ["cat", "dog", "cat", "dog"],
              "height": [9.1, 6.0, 9.5, 34.0],
              "weight": [7.9, 7.5, 9.9, 198.0],
          }
      )

      by_kind = animals.groupby("kind")
      result = by_kind.agg(mean_height=("height", "mean"), perc90=("height", func))

      expected_rows = [[9.3, 9.1036], [20.0, 6.252]]
      expected = DataFrame(
          expected_rows,
          columns=["mean_height", "perc90"],
          index=Index(["cat", "dog"], name="kind"),
      )
      tm.assert_frame_equal(result, expected)
  def test_aggregate_mixed_types():
      # GH 16916 - the grouping column mixes str and int keys.
      zeros = np.array([0] * 9).reshape(3, 3)
      frame = DataFrame(data=zeros, columns=list("XYZ"), index=list("abc"))
      frame["grouping"] = ["group 1", "group 1", 2]

      result = frame.groupby("grouping").aggregate(lambda x: x.tolist())

      # First expected row is the one-row group 2, second is "group 1".
      expected_data = [[[0], [0], [0]], [[0, 0], [0, 0], [0, 0]]]
      expected_index = Index([2, "group 1"], dtype="object", name="grouping")
      expected_columns = Index(["X", "Y", "Z"], dtype="object")
      expected = DataFrame(
          expected_data, index=expected_index, columns=expected_columns
      )
      tm.assert_frame_equal(result, expected)
  @pytest.mark.xfail(reason="Not implemented;see GH 31256")
  def test_aggregate_udf_na_extension_type():
      # https://github.com/pandas-dev/pandas/pull/31359
      # Casting the UDF result back to Int64Dtype currently fails, because a
      # returned NA causes two problems:
      # 1. NA is not an instance of Int64Dtype.type (numpy.int64).
      # 2. NA forces object dtype, so the non-NA values are Python ints
      #    rather than NumPy int64, and Python ints are not instances of
      #    numpy.int64 either.
      def aggfunc(x):
          return 1 if all(x > 2) else pd.NA

      frame = DataFrame({"A": pd.array([1, 2, 3])})
      result = frame.groupby([1, 1, 2]).agg(aggfunc)

      expected_values = pd.array([1, pd.NA], dtype="Int64")
      expected = DataFrame({"A": expected_values}, index=[1, 2])
      tm.assert_frame_equal(result, expected)
  class TestLambdaMangling:
      # Several lambdas passed to one agg call appear in the output under
      # the labels "<lambda_0>", "<lambda_1>", ... The functions below stay
      # lambdas on purpose: those labels are what is being checked.

      def test_basic(self):
          frame = DataFrame({"A": [0, 0, 1, 1], "B": [1, 2, 3, 4]})

          result = frame.groupby("A").agg({"B": [lambda x: 0, lambda x: 1]})

          expected_data = {
              ("B", "<lambda_0>"): [0, 0],
              ("B", "<lambda_1>"): [1, 1],
          }
          expected = DataFrame(expected_data, index=Index([0, 1], name="A"))
          tm.assert_frame_equal(result, expected)

      def test_mangle_series_groupby(self):
          grouped = Series([1, 2, 3, 4]).groupby([0, 0, 1, 1])

          result = grouped.agg([lambda x: 0, lambda x: 1])

          expected = DataFrame(
              {"<lambda_0>": [0, 0], "<lambda_1>": [1, 1]},
              index=np.array([0, 1]),
          )
          tm.assert_frame_equal(result, expected)

      @pytest.mark.xfail(reason="GH-26611. kwargs for multi-agg.")
      def test_with_kwargs(self):
          # Extra positional and keyword arguments given to agg alongside a
          # list of lambdas.
          f1 = lambda x, y, b=1: x.sum() + y + b
          f2 = lambda x, y, b=2: x.sum() + y * b
          funcs = [f1, f2]

          result = Series([1, 2]).groupby([0, 0]).agg(funcs, 0)
          expected = DataFrame({"<lambda_0>": [4], "<lambda_1>": [6]})
          tm.assert_frame_equal(result, expected)

          result = Series([1, 2]).groupby([0, 0]).agg(funcs, 0, b=10)
          expected = DataFrame({"<lambda_0>": [13], "<lambda_1>": [30]})
          tm.assert_frame_equal(result, expected)

      def test_agg_with_one_lambda(self):
          # GH 25719 - DataFrameGroupBy.agg with exactly one lambda among
          # the named aggregations.
          frame = DataFrame(
              {
                  "kind": ["cat", "dog", "cat", "dog"],
                  "height": [9.1, 6.0, 9.5, 34.0],
                  "weight": [7.9, 7.5, 9.9, 198.0],
              }
          )

          column_order = ["height_sqr_min", "height_max", "weight_max"]
          expected = DataFrame(
              {
                  "height_sqr_min": [82.81, 36.00],
                  "height_max": [9.5, 34.0],
                  "weight_max": [9.9, 198.0],
              },
              index=Index(["cat", "dog"], name="kind"),
              columns=column_order,
          )

          # pd.NamedAgg form
          via_namedagg = frame.groupby(by="kind").agg(
              height_sqr_min=pd.NamedAgg(
                  column="height", aggfunc=lambda x: np.min(x**2)
              ),
              height_max=pd.NamedAgg(column="height", aggfunc="max"),
              weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
          )
          tm.assert_frame_equal(via_namedagg, expected)

          # agg(key=(col, aggfunc)) form
          via_tuples = frame.groupby(by="kind").agg(
              height_sqr_min=("height", lambda x: np.min(x**2)),
              height_max=("height", "max"),
              weight_max=("weight", "max"),
          )
          tm.assert_frame_equal(via_tuples, expected)

      def test_agg_multiple_lambda(self):
          # GH25719 - DataFrameGroupBy.agg with several lambdas mixed with
          # string aggregation names.
          frame = DataFrame(
              {
                  "kind": ["cat", "dog", "cat", "dog"],
                  "height": [9.1, 6.0, 9.5, 34.0],
                  "weight": [7.9, 7.5, 9.9, 198.0],
              }
          )

          column_order = [
              "height_sqr_min",
              "height_max",
              "weight_max",
              "height_max_2",
              "weight_min",
          ]
          expected = DataFrame(
              {
                  "height_sqr_min": [82.81, 36.00],
                  "height_max": [9.5, 34.0],
                  "weight_max": [9.9, 198.0],
                  "height_max_2": [9.5, 34.0],
                  "weight_min": [7.9, 7.5],
              },
              index=Index(["cat", "dog"], name="kind"),
              columns=column_order,
          )

          # agg(key=(col, aggfunc)) form
          via_tuples = frame.groupby(by="kind").agg(
              height_sqr_min=("height", lambda x: np.min(x**2)),
              height_max=("height", "max"),
              weight_max=("weight", "max"),
              height_max_2=("height", lambda x: np.max(x)),
              weight_min=("weight", lambda x: np.min(x)),
          )
          tm.assert_frame_equal(via_tuples, expected)

          # pd.NamedAgg form
          via_namedagg = frame.groupby(by="kind").agg(
              height_sqr_min=pd.NamedAgg(
                  column="height", aggfunc=lambda x: np.min(x**2)
              ),
              height_max=pd.NamedAgg(column="height", aggfunc="max"),
              weight_max=pd.NamedAgg(column="weight", aggfunc="max"),
              height_max_2=pd.NamedAgg(
                  column="height", aggfunc=lambda x: np.max(x)
              ),
              weight_min=pd.NamedAgg(
                  column="weight", aggfunc=lambda x: np.min(x)
              ),
          )
          tm.assert_frame_equal(via_namedagg, expected)
  def test_groupby_get_by_index():
      # GH 33439 - the UDF looks up the value stored under the last index
      # label of each group via Series.get.
      frame = DataFrame({"A": ["S", "W", "W"], "B": [1.0, 1.0, 2.0]})

      def last_by_label(x):
          return x.get(x.index[-1])

      res = frame.groupby("A").agg({"B": last_by_label})

      expected = DataFrame({"A": ["S", "W"], "B": [1.0, 2.0]})
      expected = expected.set_index("A")
      tm.assert_frame_equal(res, expected)
  @pytest.mark.parametrize(
      "grp_col_dict, exp_data",
      [
          ({"nr": "min", "cat_ord": "min"}, {"nr": [1, 5], "cat_ord": ["a", "c"]}),
          ({"cat_ord": "min"}, {"cat_ord": ["a", "c"]}),
          ({"nr": "min"}, {"nr": [1, 5]}),
      ],
  )
  def test_groupby_single_agg_cat_cols(grp_col_dict, exp_data):
      # GH 27800 - single aggregations on ordered categorical columns.

      # Build the input and compute the result.
      input_df = DataFrame(
          {
              "nr": [1, 2, 3, 4, 5, 6, 7, 8],
              "cat_ord": list("aabbccdd"),
              "cat": list("aaaabbbb"),
          }
      ).astype({"cat": "category", "cat_ord": "category"})
      input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
      ordered_dtype = input_df["cat_ord"].dtype

      result_df = input_df.groupby("cat").agg(grp_col_dict)

      # Build the expected frame.
      cat_index = pd.CategoricalIndex(
          ["a", "b"],
          categories=["a", "b"],
          ordered=False,
          name="cat",
          dtype="category",
      )
      expected_df = DataFrame(data=exp_data, index=cat_index)
      if "cat_ord" in expected_df:
          # the ordered categorical dtype should be preserved
          expected_df["cat_ord"] = expected_df["cat_ord"].astype(ordered_dtype)

      tm.assert_frame_equal(result_df, expected_df)
  @pytest.mark.parametrize(
      "grp_col_dict, exp_data",
      [
          ({"nr": ["min", "max"], "cat_ord": "min"}, [(1, 4, "a"), (5, 8, "c")]),
          ({"nr": "min", "cat_ord": ["min", "max"]}, [(1, "a", "b"), (5, "c", "d")]),
          ({"cat_ord": ["min", "max"]}, [("a", "b"), ("c", "d")]),
      ],
  )
  def test_groupby_combined_aggs_cat_cols(grp_col_dict, exp_data):
      # GH27800 - combined aggregations on ordered categorical columns.

      # Build the input and compute the result.
      input_df = DataFrame(
          {
              "nr": [1, 2, 3, 4, 5, 6, 7, 8],
              "cat_ord": list("aabbccdd"),
              "cat": list("aaaabbbb"),
          }
      ).astype({"cat": "category", "cat_ord": "category"})
      input_df["cat_ord"] = input_df["cat_ord"].cat.as_ordered()
      ordered_dtype = input_df["cat_ord"].dtype

      result_df = input_df.groupby("cat").agg(grp_col_dict)

      # Build the expected frame.
      cat_index = pd.CategoricalIndex(
          ["a", "b"],
          categories=["a", "b"],
          ordered=False,
          name="cat",
          dtype="category",
      )

      # Expand grp_col_dict into one [column, function] pair per output
      # column; a scalar function spec counts as a one-element list. The
      # pairs become the expected MultiIndex columns.
      column_pairs = [
          [column, func]
          for column, funcs in grp_col_dict.items()
          for func in (funcs if isinstance(funcs, list) else [funcs])
      ]
      multi_index = MultiIndex.from_tuples(tuple(column_pairs))

      expected_df = DataFrame(data=exp_data, columns=multi_index, index=cat_index)
      for label in expected_df.columns:
          if isinstance(label, tuple) and "cat_ord" in label:
              # the ordered categorical dtype should be preserved
              expected_df[label] = expected_df[label].astype(ordered_dtype)

      tm.assert_frame_equal(result_df, expected_df)
  def test_nonagg_agg():
      # GH 35490 - a non-aggregating function gives the same result whether
      # it is passed alone or inside a one-element list.
      # TODO: agg should raise for functions that don't aggregate
      grouped = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 2, 1]}).groupby("a")

      from_list = grouped.agg(["cumsum"])
      # The list form adds a column level holding the function name; drop
      # it so that both results have the same column labels.
      from_list.columns = from_list.columns.droplevel(-1)

      from_scalar = grouped.agg("cumsum")
      tm.assert_frame_equal(from_list, from_scalar)
  def test_aggregate_datetime_objects():
      # https://github.com/pandas-dev/pandas/issues/36003
      # Out-of-bounds datetimes must not raise; object dtype is kept.
      stamps = [
          datetime.datetime(2005, 1, 1, 10, 30, 23, 540000),
          datetime.datetime(3005, 1, 1, 10, 30, 23, 540000),
      ]
      frame = DataFrame({"A": ["X", "Y"], "B": stamps})

      result = frame.groupby("A").B.max()

      # Each group has a single row, so the max is the original column.
      expected = frame.set_index("A")["B"]
      tm.assert_series_equal(result, expected)
  def test_groupby_index_object_dtype():
      # GH 40014 - a user-defined function passed to agg() must produce the
      # correct index shape when the frame has an object-dtype index.
      frame = DataFrame(
          {"c0": ["x", "x", "x"], "c1": ["x", "x", "y"], "p": [0, 1, 2]}
      )
      frame.index = frame.index.astype("O")

      def all_positive(x):
          return all(x > 0)

      res = frame.groupby(["c0", "c1"]).p.agg(all_positive)

      expected_index = MultiIndex.from_tuples(
          [("x", "x"), ("x", "y")], names=("c0", "c1")
      )
      expected = Series([False, True], index=expected_index, name="p")
      tm.assert_series_equal(res, expected)
  def test_timeseries_groupby_agg():
      # GH#43290 - a UDF that may return None, applied to a frame indexed
      # by a tz-aware timestamp.

      def func(ser):
          return None if ser.isna().all() else np.sum(ser)

      stamp = pd.Timestamp("2018-01-16 00:00:00+00:00")
      frame = DataFrame([1.0], index=[stamp])

      # The key function maps every index label to the same group, 1.
      res = frame.groupby(lambda x: 1).agg(func)

      expected = DataFrame([[1.0]], index=[1])
      tm.assert_frame_equal(res, expected)
  def test_groupby_aggregate_directory(reduction_func):
      # GH#32793 - a reduction passed to agg directly and the same reduction
      # passed through a {column: func} dict must give matching results.
      #
      # reduction_func is the name of the reduction under test, supplied by
      # a fixture defined outside this module.
      if reduction_func in ["corrwith", "nth"]:
          # Report the excluded reductions as skipped instead of returning
          # early, which would show them as passed although nothing was
          # asserted.
          pytest.skip(f"{reduction_func} is excluded from this comparison")

      obj = DataFrame([[0, 1], [0, np.nan]])

      result_reduced_series = obj.groupby(0).agg(reduction_func)
      result_reduced_frame = obj.groupby(0).agg({1: reduction_func})

      if reduction_func in ["size", "ngroup"]:
          # names are different: None / 1
          tm.assert_series_equal(
              result_reduced_series, result_reduced_frame[1], check_names=False
          )
      else:
          tm.assert_frame_equal(result_reduced_series, result_reduced_frame)
          tm.assert_series_equal(
              result_reduced_series.dtypes, result_reduced_frame.dtypes
          )
  def test_group_mean_timedelta_nat():
      # GH43132 - NaT is ignored by the grouped mean: the mean of 1 day and
      # 3 days is 2 days.
      values = ["1 day", "3 days", "NaT"]
      data = Series(values, dtype="timedelta64[ns]")

      result = data.groupby([0, 0, 0]).mean()

      expected = Series(["2 days"], dtype="timedelta64[ns]", index=np.array([0]))
      tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      "input_data, expected_output",
      [
          (  # no timezone
              ["2021-01-01T00:00", "NaT", "2021-01-01T02:00"],
              ["2021-01-01T01:00"],
          ),
          (  # timezone
              ["2021-01-01T00:00-0100", "NaT", "2021-01-01T02:00-0100"],
              ["2021-01-01T01:00-0100"],
          ),
      ],
  )
  def test_group_mean_datetime64_nat(input_data, expected_output):
      # GH43132 - NaT is ignored by the grouped mean of datetimes, with and
      # without a UTC offset in the input strings.
      data = to_datetime(Series(input_data))

      result = data.groupby([0, 0, 0]).mean()

      expected_raw = Series(expected_output, index=np.array([0]))
      expected = to_datetime(expected_raw)
      tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      "func, output", [("mean", [8 + 18j, 10 + 22j]), ("sum", [40 + 90j, 50 + 110j])]
  )
  def test_groupby_complex(func, output):
      # GH#43701 - mean and sum are supported for complex values.
      # The dot product combines each row (a, b) into the complex a + 2j*b.
      complex_values = np.arange(20).reshape(10, 2).dot([1, 2j])
      data = Series(complex_values)

      # Group by index parity.
      result = data.groupby(data.index % 2).agg(func)

      tm.assert_series_equal(result, Series(output))
  @pytest.mark.parametrize("func", ["min", "max", "var"])
  def test_groupby_complex_raises(func):
      # GH#43701 - min, max and var on complex values must raise TypeError.
      complex_values = np.arange(20).reshape(10, 2).dot([1, 2j])
      data = Series(complex_values)

      msg = "No matching signature found"
      with pytest.raises(TypeError, match=msg):
          data.groupby(data.index % 2).agg(func)
  @pytest.mark.parametrize(
      "func", [["min"], ["mean", "max"], {"b": "sum"}, {"b": "prod", "c": "median"}]
  )
  def test_multi_axis_1_raises(func):
      # GH#46995 - list-like and dict-like agg specs are not supported when
      # grouping along axis=1.
      frame = DataFrame({"a": [1, 1, 2], "b": [3, 4, 5], "c": [6, 7, 8]})
      grouped = frame.groupby("a", axis=1)

      msg = "axis other than 0 is not supported"
      with pytest.raises(NotImplementedError, match=msg):
          grouped.agg(func)
  @pytest.mark.parametrize(
      "test, constant",
      [
          ([[20, "A"], [20, "B"], [10, "C"]], {0: [10, 20], 1: ["C", ["A", "B"]]}),
          ([[20, "A"], [20, "B"], [30, "C"]], {0: [20, 30], 1: [["A", "B"], "C"]}),
          ([["a", 1], ["a", 1], ["b", 2], ["b", 3]], {0: ["a", "b"], 1: [1, [2, 3]]}),
          pytest.param(
              [["a", 1], ["a", 2], ["b", 3], ["b", 3]],
              {0: ["a", "b"], 1: [[1, 2], 3]},
              marks=pytest.mark.xfail,
          ),
      ],
  )
  def test_agg_of_mode_list(test, constant):
      # GH#25581 - mode usually returns a single value per group, but gives
      # a list of values when there is a tie.
      frame = DataFrame(test)

      result = frame.groupby(0).agg(Series.mode)

      expected = DataFrame(constant).set_index(0)
      tm.assert_frame_equal(result, expected)
  def test__dataframe_groupy_agg_list_like_func_with_args():
      # GH 50624 - extra positional and keyword arguments given to agg
      # alongside a list of functions.

      # The names foo1/foo2 appear in the error message and in the result
      # columns, so they must stay as they are.
      def foo1(x, a=1, c=0):
          return x.sum() + a + c

      def foo2(x, b=2, c=0):
          return x.sum() + b + c

      frame = DataFrame({"x": [1, 2, 3], "y": ["a", "b", "c"]})
      grouped = frame.groupby("y")
      funcs = [foo1, foo2]

      # foo1 has no parameter named "b", so passing b=3 must raise.
      msg = r"foo1\(\) got an unexpected keyword argument 'b'"
      with pytest.raises(TypeError, match=msg):
          grouped.agg(funcs, 3, b=3, c=4)

      result = grouped.agg(funcs, 3, c=4)

      expected = DataFrame(
          [[8, 8], [9, 9], [10, 10]],
          index=Index(["a", "b", "c"], name="y"),
          columns=MultiIndex.from_tuples([("x", "foo1"), ("x", "foo2")]),
      )
      tm.assert_frame_equal(result, expected)
  def test__series_groupy_agg_list_like_func_with_args():
      # GH 50624 - Series counterpart: extra positional and keyword
      # arguments given to agg alongside a list of functions.

      # The names foo1/foo2 appear in the error message and in the result
      # columns, so they must stay as they are.
      def foo1(x, a=1, c=0):
          return x.sum() + a + c

      def foo2(x, b=2, c=0):
          return x.sum() + b + c

      ser = Series([1, 2, 3])
      grouped = ser.groupby(ser)
      funcs = [foo1, foo2]

      # foo1 has no parameter named "b", so passing b=3 must raise.
      msg = r"foo1\(\) got an unexpected keyword argument 'b'"
      with pytest.raises(TypeError, match=msg):
          grouped.agg(funcs, 3, b=3, c=4)

      result = grouped.agg(funcs, 3, c=4)

      expected = DataFrame(
          [[8, 8], [9, 9], [10, 10]],
          index=Index([1, 2, 3]),
          columns=["foo1", "foo2"],
      )
      tm.assert_frame_equal(result, expected)