# test_groupby.py

from datetime import datetime
from decimal import Decimal

import numpy as np
import pytest

from pandas.compat import IS64
from pandas.errors import (
    PerformanceWarning,
    SpecificationError,
)

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Grouper,
    Index,
    MultiIndex,
    RangeIndex,
    Series,
    Timedelta,
    Timestamp,
    date_range,
    to_datetime,
)
import pandas._testing as tm
from pandas.core.arrays import BooleanArray
import pandas.core.common as com
from pandas.tests.groupby import get_groupby_method_args


def test_repr():
    # GH18203
    result = repr(Grouper(key="A", level="B"))
    expected = "Grouper(key='A', level='B', axis=0, sort=False, dropna=True)"
    assert result == expected


def test_groupby_std_datetimelike():
    # GH#48481
    tdi = pd.timedelta_range("1 Day", periods=10000)
    ser = Series(tdi)
    ser[::5] *= 2  # get different std for different groups

    df = ser.to_frame("A")
    df["B"] = ser + Timestamp(0)
    df["C"] = ser + Timestamp(0, tz="UTC")
    df.iloc[-1] = pd.NaT  # last group includes NaTs

    gb = df.groupby(list(range(5)) * 2000)
    result = gb.std()

    # Note: this does not _exactly_ match what we would get if we did
    #  [gb.get_group(i).std() for i in gb.groups]
    # but it _does_ match the floating point error we get doing the
    # same operation on int64 data xref GH#51332
    td1 = Timedelta("2887 days 11:21:02.326710176")
    td4 = Timedelta("2886 days 00:42:34.664668096")
    exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5))
    expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser})
    tm.assert_frame_equal(result, expected)
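
# Note on the grouping above: ``list(range(5)) * 2000`` labels row i with
# i % 5, so group 0 holds exactly the rows doubled by ``ser[::5] *= 2``
# (hence its std of td1 * 2), and group 4 holds the trailing NaT row
# (hence the slightly different td4).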


@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
def test_basic(dtype):
    data = Series(np.arange(9) // 3, index=np.arange(9), dtype=dtype)

    index = np.arange(9)
    np.random.shuffle(index)
    data = data.reindex(index)

    grouped = data.groupby(lambda x: x // 3, group_keys=False)

    for k, v in grouped:
        assert len(v) == 3

    agged = grouped.aggregate(np.mean)
    assert agged[1] == 1

    tm.assert_series_equal(agged, grouped.agg(np.mean))  # shorthand
    tm.assert_series_equal(agged, grouped.mean())
    tm.assert_series_equal(grouped.agg(np.sum), grouped.sum())

    expected = grouped.apply(lambda x: x * x.sum())
    transformed = grouped.transform(lambda x: x * x.sum())
    assert transformed[7] == 12
    tm.assert_series_equal(transformed, expected)

    value_grouped = data.groupby(data)
    tm.assert_series_equal(
        value_grouped.aggregate(np.mean), agged, check_index_type=False
    )

    # complex agg
    agged = grouped.aggregate([np.mean, np.std])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped.aggregate({"one": np.mean, "two": np.std})

    group_constants = {0: 10, 1: 20, 2: 30}
    agged = grouped.agg(lambda x: group_constants[x.name] + x.mean())
    assert agged[1] == 21

    # corner cases
    msg = "Must produce aggregated value"
    # exception raised is type Exception
    with pytest.raises(Exception, match=msg):
        grouped.aggregate(lambda x: x * 2)
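
# A minimal sketch (not itself a test) of the agg/transform contract
# exercised above:
#   >>> s = Series([1, 2, 3, 4])
#   >>> g = s.groupby([0, 0, 1, 1])
#   >>> g.agg("sum").tolist()        # one reduced value per group
#   [3, 7]
#   >>> g.transform("sum").tolist()  # reduced value broadcast to each row
#   [3, 3, 7, 7]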


def test_groupby_nonobject_dtype(mframe, df_mixed_floats):
    key = mframe.index.codes[0]
    grouped = mframe.groupby(key)
    result = grouped.sum()

    expected = mframe.groupby(key.astype("O")).sum()
    assert result.index.dtype == np.int8
    assert expected.index.dtype == np.int64
    tm.assert_frame_equal(result, expected, check_index_type=False)

    # GH 3911, mixed frame non-conversion
    df = df_mixed_floats.copy()
    df["value"] = range(len(df))

    def max_value(group):
        return group.loc[group["value"].idxmax()]

    applied = df.groupby("A").apply(max_value)
    result = applied.dtypes
    expected = df.dtypes
    tm.assert_series_equal(result, expected)
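
# Note: ``mframe`` here is a MultiIndexed-DataFrame fixture; its
# ``index.codes[0]`` are the int8 integer codes of the first index level,
# which is why the result's index dtype is int8 while the object-cast
# variant comes back as int64.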


def test_inconsistent_return_type():
    # GH5592
    # inconsistent return type
    df = DataFrame(
        {
            "A": ["Tiger", "Tiger", "Tiger", "Lamb", "Lamb", "Pony", "Pony"],
            "B": Series(np.arange(7), dtype="int64"),
            "C": date_range("20130101", periods=7),
        }
    )

    def f_0(grp):
        return grp.iloc[0]

    expected = df.groupby("A").first()[["B"]]
    result = df.groupby("A").apply(f_0)[["B"]]
    tm.assert_frame_equal(result, expected)

    def f_1(grp):
        if grp.name == "Tiger":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f_1)[["B"]]
    e = expected.copy()
    e.loc["Tiger"] = np.nan
    tm.assert_frame_equal(result, e)

    def f_2(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f_2)[["B"]]
    e = expected.copy()
    e.loc["Pony"] = np.nan
    tm.assert_frame_equal(result, e)

    # 5592 revisited, with datetimes
    def f_3(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0]

    result = df.groupby("A").apply(f_3)[["C"]]
    e = df.groupby("A").first()[["C"]]
    e.loc["Pony"] = pd.NaT
    tm.assert_frame_equal(result, e)

    # scalar outputs
    def f_4(grp):
        if grp.name == "Pony":
            return None
        return grp.iloc[0].loc["C"]

    result = df.groupby("A").apply(f_4)
    e = df.groupby("A").first()["C"].copy()
    e.loc["Pony"] = np.nan
    e.name = None
    tm.assert_series_equal(result, e)


def test_pass_args_kwargs(ts, tsframe):
    def f(x, q=None, axis=0):
        return np.percentile(x, q, axis=axis)

    g = lambda x: np.percentile(x, 80, axis=0)

    # Series
    ts_grouped = ts.groupby(lambda x: x.month)
    agg_result = ts_grouped.agg(np.percentile, 80, axis=0)
    apply_result = ts_grouped.apply(np.percentile, 80, axis=0)
    trans_result = ts_grouped.transform(np.percentile, 80, axis=0)

    agg_expected = ts_grouped.quantile(0.8)
    trans_expected = ts_grouped.transform(g)

    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    agg_result = ts_grouped.agg(f, q=80)
    apply_result = ts_grouped.apply(f, q=80)
    trans_result = ts_grouped.transform(f, q=80)
    tm.assert_series_equal(agg_result, agg_expected)
    tm.assert_series_equal(apply_result, agg_expected)
    tm.assert_series_equal(trans_result, trans_expected)

    # DataFrame
    for as_index in [True, False]:
        df_grouped = tsframe.groupby(lambda x: x.month, as_index=as_index)
        agg_result = df_grouped.agg(np.percentile, 80, axis=0)
        apply_result = df_grouped.apply(DataFrame.quantile, 0.8)
        expected = df_grouped.quantile(0.8)
        tm.assert_frame_equal(apply_result, expected, check_names=False)
        tm.assert_frame_equal(agg_result, expected)

        apply_result = df_grouped.apply(DataFrame.quantile, [0.4, 0.8])
        expected_seq = df_grouped.quantile([0.4, 0.8])
        tm.assert_frame_equal(apply_result, expected_seq, check_names=False)

        agg_result = df_grouped.agg(f, q=80)
        apply_result = df_grouped.apply(DataFrame.quantile, q=0.8)
        tm.assert_frame_equal(agg_result, expected)
        tm.assert_frame_equal(apply_result, expected, check_names=False)
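
# A note on the argument forwarding exercised above: positional and keyword
# arguments supplied after the function are passed through to it on each
# group, so ``ts_grouped.agg(np.percentile, 80)`` computes
# ``np.percentile(group, 80)`` group by group.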


@pytest.mark.parametrize("as_index", [True, False])
def test_pass_args_kwargs_duplicate_columns(tsframe, as_index):
    # go through _aggregate_frame with self.axis == 0 and duplicate columns
    tsframe.columns = ["A", "B", "A", "C"]
    gb = tsframe.groupby(lambda x: x.month, as_index=as_index)

    res = gb.agg(np.percentile, 80, axis=0)

    ex_data = {
        1: tsframe[tsframe.index.month == 1].quantile(0.8),
        2: tsframe[tsframe.index.month == 2].quantile(0.8),
    }
    expected = DataFrame(ex_data).T
    expected.index = expected.index.astype(np.int32)
    if not as_index:
        # TODO: try to get this more consistent?
        expected.index = Index(range(2))

    tm.assert_frame_equal(res, expected)


def test_len():
    df = tm.makeTimeDataFrame()
    grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
    assert len(grouped) == len(df)

    grouped = df.groupby([lambda x: x.year, lambda x: x.month])
    expected = len({(x.year, x.month) for x in df.index})
    assert len(grouped) == expected

    # issue 11016
    df = DataFrame({"a": [np.nan] * 3, "b": [1, 2, 3]})
    assert len(df.groupby("a")) == 0
    assert len(df.groupby("b")) == 3
    assert len(df.groupby(["a", "b"])) == 3


def test_basic_regression():
    # regression
    result = Series([1.0 * x for x in list(range(1, 10)) * 10])

    data = np.random.random(1100) * 10.0
    groupings = Series(data)

    grouped = result.groupby(groupings)
    grouped.mean()


@pytest.mark.parametrize(
    "dtype", ["float64", "float32", "int64", "int32", "int16", "int8"]
)
def test_with_na_groups(dtype):
    index = Index(np.arange(10))
    values = Series(np.ones(10), index, dtype=dtype)
    labels = Series(
        [np.nan, "foo", "bar", "bar", np.nan, np.nan, "bar", "bar", np.nan, "foo"],
        index=index,
    )

    # this SHOULD be an int
    grouped = values.groupby(labels)
    agged = grouped.agg(len)
    expected = Series([4, 2], index=["bar", "foo"])

    tm.assert_series_equal(agged, expected, check_dtype=False)
    # assert issubclass(agged.dtype.type, np.integer)

    # explicitly return a float from my function
    def f(x):
        return float(len(x))

    agged = grouped.agg(f)
    expected = Series([4.0, 2.0], index=["bar", "foo"])
    tm.assert_series_equal(agged, expected)


def test_indices_concatenation_order():
    # GH 2808
    def f1(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(levels=[[]] * 2, codes=[[]] * 2, names=["b", "c"])
            res = DataFrame(columns=["a"], index=multiindex)
            return res
        else:
            y = y.set_index(["b", "c"])
            return y

    def f2(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            return DataFrame()
        else:
            y = y.set_index(["b", "c"])
            return y

    def f3(x):
        y = x[(x.b % 2) == 1] ** 2
        if y.empty:
            multiindex = MultiIndex(
                levels=[[]] * 2, codes=[[]] * 2, names=["foo", "bar"]
            )
            res = DataFrame(columns=["a", "b"], index=multiindex)
            return res
        else:
            return y

    df = DataFrame({"a": [1, 2, 2, 2], "b": range(4), "c": range(5, 9)})
    df2 = DataFrame({"a": [3, 2, 2, 2], "b": range(4), "c": range(5, 9)})

    # correct result
    result1 = df.groupby("a").apply(f1)
    result2 = df2.groupby("a").apply(f1)
    tm.assert_frame_equal(result1, result2)

    # should fail (not the same number of levels)
    msg = "Cannot concat indices that do not have the same number of levels"
    with pytest.raises(AssertionError, match=msg):
        df.groupby("a").apply(f2)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby("a").apply(f2)

    # should fail (incorrect shape)
    with pytest.raises(AssertionError, match=msg):
        df.groupby("a").apply(f3)
    with pytest.raises(AssertionError, match=msg):
        df2.groupby("a").apply(f3)


def test_attr_wrapper(ts):
    grouped = ts.groupby(lambda x: x.weekday())

    result = grouped.std()
    expected = grouped.agg(lambda x: np.std(x, ddof=1))
    tm.assert_series_equal(result, expected)

    # this is pretty cool
    result = grouped.describe()
    expected = {name: gp.describe() for name, gp in grouped}
    expected = DataFrame(expected).T
    tm.assert_frame_equal(result, expected)

    # get attribute
    result = grouped.dtype
    expected = grouped.agg(lambda x: x.dtype)
    tm.assert_series_equal(result, expected)

    # make sure raises error
    msg = "'SeriesGroupBy' object has no attribute 'foo'"
    with pytest.raises(AttributeError, match=msg):
        getattr(grouped, "foo")
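
# On the attribute access checked above: a name that is not a GroupBy method
# is dispatched to the grouped objects themselves, so ``grouped.dtype``
# evaluates ``Series.dtype`` per group, matching the explicit
# ``grouped.agg(lambda x: x.dtype)``.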


def test_frame_groupby(tsframe):
    grouped = tsframe.groupby(lambda x: x.weekday())

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == 5
    assert len(aggregated.columns) == 4

    # by string
    tscopy = tsframe.copy()
    tscopy["weekday"] = [x.weekday() for x in tscopy.index]
    stragged = tscopy.groupby("weekday").aggregate(np.mean)
    tm.assert_frame_equal(stragged, aggregated, check_names=False)

    # transform
    grouped = tsframe.head(30).groupby(lambda x: x.weekday())
    transformed = grouped.transform(lambda x: x - x.mean())
    assert len(transformed) == 30
    assert len(transformed.columns) == 4

    # transform propagate
    transformed = grouped.transform(lambda x: x.mean())
    for name, group in grouped:
        mean = group.mean()
        for idx in group.index:
            tm.assert_series_equal(transformed.xs(idx), mean, check_names=False)

    # iterate
    for weekday, group in grouped:
        assert group.index[0].weekday() == weekday

    # groups / group_indices
    groups = grouped.groups
    indices = grouped.indices

    for k, v in groups.items():
        samething = tsframe.index.take(indices[k])
        assert (samething == v).all()


def test_frame_groupby_columns(tsframe):
    mapping = {"A": 0, "B": 0, "C": 1, "D": 1}
    grouped = tsframe.groupby(mapping, axis=1)

    # aggregate
    aggregated = grouped.aggregate(np.mean)
    assert len(aggregated) == len(tsframe)
    assert len(aggregated.columns) == 2

    # transform
    tf = lambda x: x - x.mean()
    groupedT = tsframe.T.groupby(mapping, axis=0)
    tm.assert_frame_equal(groupedT.transform(tf).T, grouped.transform(tf))

    # iterate
    for k, v in grouped:
        assert len(v.columns) == 2


def test_frame_set_name_single(df):
    grouped = df.groupby("A")

    result = grouped.mean(numeric_only=True)
    assert result.index.name == "A"

    result = df.groupby("A", as_index=False).mean(numeric_only=True)
    assert result.index.name != "A"

    result = grouped[["C", "D"]].agg(np.mean)
    assert result.index.name == "A"

    result = grouped.agg({"C": np.mean, "D": np.std})
    assert result.index.name == "A"

    result = grouped["C"].mean()
    assert result.index.name == "A"
    result = grouped["C"].agg(np.mean)
    assert result.index.name == "A"
    result = grouped["C"].agg([np.mean, np.std])
    assert result.index.name == "A"

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped["C"].agg({"foo": np.mean, "bar": np.std})


def test_multi_func(df):
    col1 = df["A"]
    col2 = df["B"]

    grouped = df.groupby([col1.get, col2.get])
    agged = grouped.mean(numeric_only=True)
    expected = df.groupby(["A", "B"]).mean()

    # TODO groupby get drops names
    tm.assert_frame_equal(
        agged.loc[:, ["C", "D"]], expected.loc[:, ["C", "D"]], check_names=False
    )

    # some "groups" with no data
    df = DataFrame(
        {
            "v1": np.random.randn(6),
            "v2": np.random.randn(6),
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
        },
        index=["one", "two", "three", "four", "five", "six"],
    )
    # only verify that it works for now
    grouped = df.groupby(["k1", "k2"])
    grouped.agg(np.sum)


def test_multi_key_multiple_functions(df):
    grouped = df.groupby(["A", "B"])["C"]

    agged = grouped.agg([np.mean, np.std])
    expected = DataFrame({"mean": grouped.agg(np.mean), "std": grouped.agg(np.std)})
    tm.assert_frame_equal(agged, expected)


def test_frame_multi_key_function_list():
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    grouped = data.groupby(["A", "B"])
    funcs = [np.mean, np.std]
    agged = grouped.agg(funcs)
    expected = pd.concat(
        [grouped["D"].agg(funcs), grouped["E"].agg(funcs), grouped["F"].agg(funcs)],
        keys=["D", "E", "F"],
        axis=1,
    )
    assert isinstance(agged.index, MultiIndex)
    assert isinstance(expected.index, MultiIndex)
    tm.assert_frame_equal(agged, expected)


def test_frame_multi_key_function_list_partial_failure():
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    grouped = data.groupby(["A", "B"])
    funcs = [np.mean, np.std]
    with pytest.raises(TypeError, match="Could not convert dullshinyshiny to numeric"):
        grouped.agg(funcs)


@pytest.mark.parametrize("op", [lambda x: x.sum(), lambda x: x.mean()])
def test_groupby_multiple_columns(df, op):
    data = df
    grouped = data.groupby(["A", "B"])

    result1 = op(grouped)

    keys = []
    values = []
    for n1, gp1 in data.groupby("A"):
        for n2, gp2 in gp1.groupby("B"):
            keys.append((n1, n2))
            values.append(op(gp2.loc[:, ["C", "D"]]))

    mi = MultiIndex.from_tuples(keys, names=["A", "B"])
    expected = pd.concat(values, axis=1).T
    expected.index = mi

    # a little bit crude
    for col in ["C", "D"]:
        result_col = op(grouped[col])
        pivoted = result1[col]
        exp = expected[col]
        tm.assert_series_equal(result_col, exp)
        tm.assert_series_equal(pivoted, exp)

    # test single series works the same
    result = data["C"].groupby([data["A"], data["B"]]).mean()
    expected = data.groupby(["A", "B"]).mean()["C"]

    tm.assert_series_equal(result, expected)
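
# Shape contract exercised above, in brief: grouping by several keys yields
# a MultiIndex result with one level per key, e.g.
#   >>> df.groupby(["A", "B"]).mean().index
#   MultiIndex([...], names=['A', 'B'])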


def test_as_index_select_column():
    # GH 5764
    df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
    result = df.groupby("A", as_index=False)["B"].get_group(1)
    expected = Series([2, 4], name="B")
    tm.assert_series_equal(result, expected)

    result = df.groupby("A", as_index=False, group_keys=True)["B"].apply(
        lambda x: x.cumsum()
    )
    expected = Series(
        [2, 6, 6], name="B", index=MultiIndex.from_tuples([(0, 0), (0, 1), (1, 2)])
    )
    tm.assert_series_equal(result, expected)


def test_groupby_as_index_select_column_sum_empty_df():
    # GH 35246
    df = DataFrame(columns=Index(["A", "B", "C"], name="alpha"))
    left = df.groupby(by="A", as_index=False)["B"].sum(numeric_only=False)

    expected = DataFrame(columns=df.columns[:2], index=range(0))
    # GH#50744 - Columns after selection shouldn't retain names
    expected.columns.names = [None]
    tm.assert_frame_equal(left, expected)


def test_groupby_as_index_agg(df):
    grouped = df.groupby("A", as_index=False)

    # single-key
    result = grouped[["C", "D"]].agg(np.mean)
    expected = grouped.mean(numeric_only=True)
    tm.assert_frame_equal(result, expected)

    result2 = grouped.agg({"C": np.mean, "D": np.sum})
    expected2 = grouped.mean(numeric_only=True)
    expected2["D"] = grouped.sum()["D"]
    tm.assert_frame_equal(result2, expected2)

    grouped = df.groupby("A", as_index=True)

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        grouped["C"].agg({"Q": np.sum})

    # multi-key
    grouped = df.groupby(["A", "B"], as_index=False)

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)

    result2 = grouped.agg({"C": np.mean, "D": np.sum})
    expected2 = grouped.mean()
    expected2["D"] = grouped.sum()["D"]
    tm.assert_frame_equal(result2, expected2)

    expected3 = grouped["C"].sum()
    expected3 = DataFrame(expected3).rename(columns={"C": "Q"})
    result3 = grouped["C"].agg({"Q": np.sum})
    tm.assert_frame_equal(result3, expected3)

    # GH7115 & GH8112 & GH8582
    df = DataFrame(np.random.randint(0, 100, (50, 3)), columns=["jim", "joe", "jolie"])
    ts = Series(np.random.randint(5, 10, 50), name="jim")

    gr = df.groupby(ts)
    gr.nth(0)  # invokes set_selection_from_grouper internally
    tm.assert_frame_equal(gr.apply(sum), df.groupby(ts).apply(sum))

    for attr in ["mean", "max", "count", "idxmax", "cumsum", "all"]:
        gr = df.groupby(ts, as_index=False)
        left = getattr(gr, attr)()
        gr = df.groupby(ts.values, as_index=True)
        right = getattr(gr, attr)().reset_index(drop=True)
        tm.assert_frame_equal(left, right)


def test_ops_not_as_index(reduction_func):
    # GH 10355, 21090
    # Using as_index=False should not modify grouped column
    if reduction_func in ("corrwith", "nth", "ngroup"):
        pytest.skip(f"GH 5755: Test not applicable for {reduction_func}")

    df = DataFrame(np.random.randint(0, 5, size=(100, 2)), columns=["a", "b"])
    expected = getattr(df.groupby("a"), reduction_func)()
    if reduction_func == "size":
        expected = expected.rename("size")
    expected = expected.reset_index()

    if reduction_func != "size":
        # 32 bit compat -> groupby preserves dtype whereas reset_index casts to int64
        expected["a"] = expected["a"].astype(df["a"].dtype)

    g = df.groupby("a", as_index=False)

    result = getattr(g, reduction_func)()
    tm.assert_frame_equal(result, expected)

    result = g.agg(reduction_func)
    tm.assert_frame_equal(result, expected)

    result = getattr(g["b"], reduction_func)()
    tm.assert_frame_equal(result, expected)

    result = g["b"].agg(reduction_func)
    tm.assert_frame_equal(result, expected)


def test_as_index_series_return_frame(df):
    grouped = df.groupby("A", as_index=False)
    grouped2 = df.groupby(["A", "B"], as_index=False)

    result = grouped["C"].agg(np.sum)
    expected = grouped.agg(np.sum).loc[:, ["A", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)

    result2 = grouped2["C"].agg(np.sum)
    expected2 = grouped2.agg(np.sum).loc[:, ["A", "B", "C"]]
    assert isinstance(result2, DataFrame)
    tm.assert_frame_equal(result2, expected2)

    result = grouped["C"].sum()
    expected = grouped.sum().loc[:, ["A", "C"]]
    assert isinstance(result, DataFrame)
    tm.assert_frame_equal(result, expected)

    result2 = grouped2["C"].sum()
    expected2 = grouped2.sum().loc[:, ["A", "B", "C"]]
    assert isinstance(result2, DataFrame)
    tm.assert_frame_equal(result2, expected2)


def test_as_index_series_column_slice_raises(df):
    # GH15072
    grouped = df.groupby("A", as_index=False)
    msg = r"Column\(s\) C already selected"

    with pytest.raises(IndexError, match=msg):
        grouped["C"].__getitem__("D")


def test_groupby_as_index_cython(df):
    data = df

    # single-key
    grouped = data.groupby("A", as_index=False)
    result = grouped.mean(numeric_only=True)
    expected = data.groupby(["A"]).mean(numeric_only=True)
    expected.insert(0, "A", expected.index)
    expected.index = RangeIndex(len(expected))
    tm.assert_frame_equal(result, expected)

    # multi-key
    grouped = data.groupby(["A", "B"], as_index=False)
    result = grouped.mean()
    expected = data.groupby(["A", "B"]).mean()

    arrays = list(zip(*expected.index.values))
    expected.insert(0, "A", arrays[0])
    expected.insert(1, "B", arrays[1])
    expected.index = RangeIndex(len(expected))
    tm.assert_frame_equal(result, expected)


def test_groupby_as_index_series_scalar(df):
    grouped = df.groupby(["A", "B"], as_index=False)

    # GH #421
    result = grouped["C"].agg(len)
    expected = grouped.agg(len).loc[:, ["A", "B", "C"]]
    tm.assert_frame_equal(result, expected)


def test_groupby_as_index_corner(df, ts):
    msg = "as_index=False only valid with DataFrame"
    with pytest.raises(TypeError, match=msg):
        ts.groupby(lambda x: x.weekday(), as_index=False)

    msg = "as_index=False only valid for axis=0"
    with pytest.raises(ValueError, match=msg):
        df.groupby(lambda x: x.lower(), as_index=False, axis=1)


def test_groupby_multiple_key():
    df = tm.makeTimeDataFrame()
    grouped = df.groupby([lambda x: x.year, lambda x: x.month, lambda x: x.day])
    agged = grouped.sum()
    tm.assert_almost_equal(df.values, agged.values)

    grouped = df.T.groupby(
        [lambda x: x.year, lambda x: x.month, lambda x: x.day], axis=1
    )

    agged = grouped.agg(lambda x: x.sum())
    tm.assert_index_equal(agged.index, df.columns)
    tm.assert_almost_equal(df.T.values, agged.values)

    agged = grouped.agg(lambda x: x.sum())
    tm.assert_almost_equal(df.T.values, agged.values)


def test_groupby_multi_corner(df):
    # test that having an all-NA column doesn't mess you up
    df = df.copy()
    df["bad"] = np.nan
    agged = df.groupby(["A", "B"]).mean()

    expected = df.groupby(["A", "B"]).mean()
    expected["bad"] = np.nan

    tm.assert_frame_equal(agged, expected)


def test_raises_on_nuisance(df):
    grouped = df.groupby("A")
    with pytest.raises(TypeError, match="Could not convert"):
        grouped.agg(np.mean)
    with pytest.raises(TypeError, match="Could not convert"):
        grouped.mean()

    df = df.loc[:, ["A", "C", "D"]]
    df["E"] = datetime.now()
    grouped = df.groupby("A")
    msg = "datetime64 type does not support sum operations"
    with pytest.raises(TypeError, match=msg):
        grouped.agg(np.sum)
    with pytest.raises(TypeError, match=msg):
        grouped.sum()

    # won't work with axis = 1
    grouped = df.groupby({"A": 0, "C": 0, "D": 1, "E": 1}, axis=1)
    msg = "does not support reduction 'sum'"
    with pytest.raises(TypeError, match=msg):
        grouped.agg(lambda x: x.sum(0, numeric_only=False))


@pytest.mark.parametrize(
    "agg_function",
    ["max", "min"],
)
def test_keep_nuisance_agg(df, agg_function):
    # GH 38815
    grouped = df.groupby("A")
    result = getattr(grouped, agg_function)()
    expected = result.copy()
    expected.loc["bar", "B"] = getattr(df.loc[df["A"] == "bar", "B"], agg_function)()
    expected.loc["foo", "B"] = getattr(df.loc[df["A"] == "foo", "B"], agg_function)()
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "agg_function",
    ["sum", "mean", "prod", "std", "var", "sem", "median"],
)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_omit_nuisance_agg(df, agg_function, numeric_only):
    # GH 38774, GH 38815
    grouped = df.groupby("A")

    no_drop_nuisance = ("var", "std", "sem", "mean", "prod", "median")
    if agg_function in no_drop_nuisance and not numeric_only:
        # Added numeric_only as part of GH#46560; these do not drop nuisance
        # columns when numeric_only is False
        klass = ValueError if agg_function in ("std", "sem") else TypeError
        msg = "|".join(["[C|c]ould not convert", "can't multiply sequence"])
        with pytest.raises(klass, match=msg):
            getattr(grouped, agg_function)(numeric_only=numeric_only)
    else:
        result = getattr(grouped, agg_function)(numeric_only=numeric_only)
        if not numeric_only and agg_function == "sum":
            # sum is successful on column B
            columns = ["A", "B", "C", "D"]
        else:
            columns = ["A", "C", "D"]
        expected = getattr(df.loc[:, columns].groupby("A"), agg_function)(
            numeric_only=numeric_only
        )
        tm.assert_frame_equal(result, expected)


def test_raise_on_nuisance_python_single(df):
    # GH 38815
    grouped = df.groupby("A")
    with pytest.raises(TypeError, match="could not convert"):
        grouped.skew()


def test_raise_on_nuisance_python_multiple(three_group):
    grouped = three_group.groupby(["A", "B"])
    with pytest.raises(TypeError, match="Could not convert"):
        grouped.agg(np.mean)
    with pytest.raises(TypeError, match="Could not convert"):
        grouped.mean()


def test_empty_groups_corner(mframe):
    # handle empty groups
    df = DataFrame(
        {
            "k1": np.array(["b", "b", "b", "a", "a", "a"]),
            "k2": np.array(["1", "1", "1", "2", "2", "2"]),
            "k3": ["foo", "bar"] * 3,
            "v1": np.random.randn(6),
            "v2": np.random.randn(6),
        }
    )

    grouped = df.groupby(["k1", "k2"])
    result = grouped[["v1", "v2"]].agg(np.mean)
    expected = grouped.mean(numeric_only=True)
    tm.assert_frame_equal(result, expected)

    grouped = mframe[3:5].groupby(level=0)
    agged = grouped.apply(lambda x: x.mean())
    agged_A = grouped["A"].apply(np.mean)
    tm.assert_series_equal(agged["A"], agged_A)
    assert agged.index.name == "first"


def test_nonsense_func():
    df = DataFrame([0])
    msg = r"unsupported operand type\(s\) for \+: 'int' and 'str'"
    with pytest.raises(TypeError, match=msg):
        df.groupby(lambda x: x + "foo")


def test_wrap_aggregated_output_multindex(mframe):
    df = mframe.T
    df["baz", "two"] = "peekaboo"

    keys = [np.array([0, 0, 1]), np.array([0, 0, 1])]
    with pytest.raises(TypeError, match="Could not convert"):
        df.groupby(keys).agg(np.mean)
    agged = df.drop(columns=("baz", "two")).groupby(keys).agg(np.mean)
    assert isinstance(agged.columns, MultiIndex)

    def aggfun(ser):
        if ser.name == ("foo", "one"):
            raise TypeError("Test error message")
        return ser.sum()

    with pytest.raises(TypeError, match="Test error message"):
        df.groupby(keys).aggregate(aggfun)


def test_groupby_level_apply(mframe):
    result = mframe.groupby(level=0).count()
    assert result.index.name == "first"
    result = mframe.groupby(level=1).count()
    assert result.index.name == "second"

    result = mframe["A"].groupby(level=0).count()
    assert result.index.name == "first"


def test_groupby_level_mapper(mframe):
    deleveled = mframe.reset_index()

    mapper0 = {"foo": 0, "bar": 0, "baz": 1, "qux": 1}
    mapper1 = {"one": 0, "two": 0, "three": 1}

    result0 = mframe.groupby(mapper0, level=0).sum()
    result1 = mframe.groupby(mapper1, level=1).sum()

    mapped_level0 = np.array(
        [mapper0.get(x) for x in deleveled["first"]], dtype=np.int64
    )
    mapped_level1 = np.array(
        [mapper1.get(x) for x in deleveled["second"]], dtype=np.int64
    )
    expected0 = mframe.groupby(mapped_level0).sum()
    expected1 = mframe.groupby(mapped_level1).sum()
    expected0.index.name, expected1.index.name = "first", "second"

    tm.assert_frame_equal(result0, expected0)
    tm.assert_frame_equal(result1, expected1)


def test_groupby_level_nonmulti():
    # GH 1313, GH 13901
    s = Series([1, 2, 3, 10, 4, 5, 20, 6], Index([1, 2, 3, 1, 4, 5, 2, 6], name="foo"))
    expected = Series([11, 22, 3, 4, 5, 6], Index(range(1, 7), name="foo"))

    result = s.groupby(level=0).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[0]).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=-1).sum()
    tm.assert_series_equal(result, expected)
    result = s.groupby(level=[-1]).sum()
    tm.assert_series_equal(result, expected)

    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=1)
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=-2)
    msg = "No group keys passed!"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[])
    msg = "multiple levels only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 0])
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[0, 1])
    msg = "level > 0 or level < -1 only valid with MultiIndex"
    with pytest.raises(ValueError, match=msg):
        s.groupby(level=[1])


def test_groupby_complex():
    # GH 12902
    a = Series(data=np.arange(4) * (1 + 2j), index=[0, 0, 1, 1])
    expected = Series((1 + 2j, 5 + 10j))

    result = a.groupby(level=0).sum()
    tm.assert_series_equal(result, expected)
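
# Worked arithmetic for the expected values above: the data are
# np.arange(4) * (1 + 2j) == [0j, 1+2j, 2+4j, 3+6j]; grouping the index
# [0, 0, 1, 1] by level 0 sums adjacent pairs, giving 1+2j and 5+10j.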


def test_groupby_complex_numbers():
    # GH 17927
    df = DataFrame(
        [
            {"a": 1, "b": 1 + 1j},
            {"a": 1, "b": 1 + 2j},
            {"a": 4, "b": 1},
        ]
    )
    expected = DataFrame(
        np.array([1, 1, 1], dtype=np.int64),
        index=Index([(1 + 1j), (1 + 2j), (1 + 0j)], name="b"),
        columns=Index(["a"], dtype="object"),
    )
    result = df.groupby("b", sort=False).count()
    tm.assert_frame_equal(result, expected)

    # Sorted by the magnitude of the complex numbers
    expected.index = Index([(1 + 0j), (1 + 1j), (1 + 2j)], name="b")
    result = df.groupby("b", sort=True).count()
    tm.assert_frame_equal(result, expected)


def test_groupby_series_indexed_differently():
    s1 = Series(
        [5.0, -9.0, 4.0, 100.0, -5.0, 55.0, 6.7],
        index=Index(["a", "b", "c", "d", "e", "f", "g"]),
    )
    s2 = Series(
        [1.0, 1.0, 4.0, 5.0, 5.0, 7.0], index=Index(["a", "b", "d", "f", "g", "h"])
    )

    grouped = s1.groupby(s2)
    agged = grouped.mean()
    exp = s1.groupby(s2.reindex(s1.index).get).mean()
    tm.assert_series_equal(agged, exp)


def test_groupby_with_hier_columns():
    tuples = list(
        zip(
            *[
                ["bar", "bar", "baz", "baz", "foo", "foo", "qux", "qux"],
                ["one", "two", "one", "two", "one", "two", "one", "two"],
            ]
        )
    )
    index = MultiIndex.from_tuples(tuples)
    columns = MultiIndex.from_tuples(
        [("A", "cat"), ("B", "dog"), ("B", "cat"), ("A", "dog")]
    )
    df = DataFrame(np.random.randn(8, 4), index=index, columns=columns)

    result = df.groupby(level=0).mean()
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0, axis=1).mean()
    tm.assert_index_equal(result.index, df.index)

    result = df.groupby(level=0).agg(np.mean)
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0).apply(lambda x: x.mean())
    tm.assert_index_equal(result.columns, columns)

    result = df.groupby(level=0, axis=1).agg(lambda x: x.mean(1))
    tm.assert_index_equal(result.columns, Index(["A", "B"]))
    tm.assert_index_equal(result.index, df.index)

    # add a nuisance column
    sorted_columns, _ = columns.sortlevel(0)
    df["A", "foo"] = "bar"
    result = df.groupby(level=0).mean(numeric_only=True)
    tm.assert_index_equal(result.columns, df.columns[:-1])


def test_grouping_ndarray(df):
    grouped = df.groupby(df["A"].values)
    result = grouped.sum()
    expected = df.groupby(df["A"].rename(None)).sum()
    tm.assert_frame_equal(result, expected)


def test_groupby_wrong_multi_labels():
    index = Index([0, 1, 2, 3, 4], name="index")
    data = DataFrame(
        {
            "foo": ["foo1", "foo1", "foo2", "foo1", "foo3"],
            "bar": ["bar1", "bar2", "bar2", "bar1", "bar1"],
            "baz": ["baz1", "baz1", "baz1", "baz2", "baz2"],
            "spam": ["spam2", "spam3", "spam2", "spam1", "spam1"],
            "data": [20, 30, 40, 50, 60],
        },
        index=index,
    )

    grouped = data.groupby(["foo", "bar", "baz", "spam"])

    result = grouped.agg(np.mean)
    expected = grouped.mean()
    tm.assert_frame_equal(result, expected)


def test_groupby_series_with_name(df):
    result = df.groupby(df["A"]).mean(numeric_only=True)
    result2 = df.groupby(df["A"], as_index=False).mean(numeric_only=True)
    assert result.index.name == "A"
    assert "A" in result2

    result = df.groupby([df["A"], df["B"]]).mean()
    result2 = df.groupby([df["A"], df["B"]], as_index=False).mean()
    assert result.index.names == ("A", "B")
    assert "A" in result2
    assert "B" in result2


def test_seriesgroupby_name_attr(df):
    # GH 6265
    result = df.groupby("A")["C"]
    assert result.count().name == "C"
    assert result.mean().name == "C"

    testFunc = lambda x: np.sum(x) * 2
    assert result.agg(testFunc).name == "C"


def test_consistency_name():
    # GH 12363
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    expected = df.groupby(["A"]).B.count()
    result = df.B.groupby(df.A).count()
    tm.assert_series_equal(result, expected)


def test_groupby_name_propagation(df):
    # GH 6124
    def summarize(df, name=None):
        return Series({"count": 1, "mean": 2, "omissions": 3}, name=name)

    def summarize_random_name(df):
        # Provide a different name for each Series. In this case, groupby
        # should not attempt to propagate the Series name since they are
        # inconsistent.
        return Series({"count": 1, "mean": 2, "omissions": 3}, name=df.iloc[0]["A"])

    metrics = df.groupby("A").apply(summarize)
    assert metrics.columns.name is None
    metrics = df.groupby("A").apply(summarize, "metrics")
    assert metrics.columns.name == "metrics"
    metrics = df.groupby("A").apply(summarize_random_name)
    assert metrics.columns.name is None


def test_groupby_nonstring_columns():
    df = DataFrame([np.arange(10) for x in range(10)])
    grouped = df.groupby(0)
    result = grouped.mean()
    expected = df.groupby(df[0]).mean()
    tm.assert_frame_equal(result, expected)


def test_groupby_mixed_type_columns():
    # GH 13432, unorderable types in py3
    df = DataFrame([[0, 1, 2]], columns=["A", "B", 0])
    expected = DataFrame([[1, 2]], columns=["B", 0], index=Index([0], name="A"))

    result = df.groupby("A").first()
    tm.assert_frame_equal(result, expected)

    result = df.groupby("A").sum()
    tm.assert_frame_equal(result, expected)


def test_cython_grouper_series_bug_noncontig():
    arr = np.empty((100, 100))
    arr.fill(np.nan)
    obj = Series(arr[:, 0])
    inds = np.tile(range(10), 10)

    result = obj.groupby(inds).agg(Series.median)
    assert result.isna().all()
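
# Note: ``arr[:, 0]`` is a non-contiguous column view of a C-ordered 2-D
# array (a full row's stride between consecutive elements), the layout the
# Cython grouper historically mishandled.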


def test_series_grouper_noncontig_index():
    index = Index(tm.rands_array(10, 100))

    values = Series(np.random.randn(50), index=index[::2])
    labels = np.random.randint(0, 5, 50)

    # it works!
    grouped = values.groupby(labels)

    # accessing the index elements causes segfault
    f = lambda x: len(set(map(id, x.index)))
    grouped.agg(f)


def test_convert_objects_leave_decimal_alone():
    s = Series(range(5))
    labels = np.array(["a", "b", "c", "d", "e"], dtype="O")

    def convert_fast(x):
        return Decimal(str(x.mean()))

    def convert_force_pure(x):
        # base will be length 0
        assert len(x.values.base) > 0
        return Decimal(str(x.mean()))

    grouped = s.groupby(labels)

    result = grouped.agg(convert_fast)
    assert result.dtype == np.object_
    assert isinstance(result[0], Decimal)

    result = grouped.agg(convert_force_pure)
    assert result.dtype == np.object_
    assert isinstance(result[0], Decimal)


def test_groupby_dtype_inference_empty():
    # GH 6733
    df = DataFrame({"x": [], "range": np.arange(0, dtype="int64")})
    assert df["x"].dtype == np.float64

    result = df.groupby("x").first()
    exp_index = Index([], name="x", dtype=np.float64)
    expected = DataFrame({"range": Series([], index=exp_index, dtype="int64")})
    tm.assert_frame_equal(result, expected, by_blocks=True)


def test_groupby_uint64_float_conversion():
    # GH 30859: groupby converts uint64 to floats sometimes
    df = DataFrame({"first": [1], "second": [1], "value": [16148277970000000000]})
    result = df.groupby(["first", "second"])["value"].max()
    expected = Series(
        [16148277970000000000],
        MultiIndex.from_product([[1], [1]], names=["first", "second"]),
        name="value",
    )
    tm.assert_series_equal(result, expected)


def test_groupby_list_infer_array_like(df):
    result = df.groupby(list(df["A"])).mean(numeric_only=True)
    expected = df.groupby(df["A"]).mean(numeric_only=True)
    tm.assert_frame_equal(result, expected, check_names=False)

    with pytest.raises(KeyError, match=r"^'foo'$"):
        df.groupby(list(df["A"][:-1]))

    # pathological case of ambiguity
    df = DataFrame({"foo": [0, 1], "bar": [3, 4], "val": np.random.randn(2)})

    result = df.groupby(["foo", "bar"]).mean()
    expected = df.groupby([df["foo"], df["bar"]]).mean()[["val"]]
    tm.assert_frame_equal(result, expected)


def test_groupby_keys_same_size_as_index():
    # GH 11185
    freq = "s"
    index = date_range(
        start=Timestamp("2015-09-29T11:34:44-0700"), periods=2, freq=freq
    )
    df = DataFrame([["A", 10], ["B", 15]], columns=["metric", "values"], index=index)
    result = df.groupby([Grouper(level=0, freq=freq), "metric"]).mean()
    expected = df.set_index([df.index, "metric"]).astype(float)

    tm.assert_frame_equal(result, expected)


def test_groupby_one_row():
    # GH 11741
    msg = r"^'Z'$"
    df1 = DataFrame(np.random.randn(1, 4), columns=list("ABCD"))
    with pytest.raises(KeyError, match=msg):
        df1.groupby("Z")
    df2 = DataFrame(np.random.randn(2, 4), columns=list("ABCD"))
    with pytest.raises(KeyError, match=msg):
        df2.groupby("Z")


def test_groupby_nat_exclude():
    # GH 6992
    df = DataFrame(
        {
            "values": np.random.randn(8),
            "dt": [
                np.nan,
                Timestamp("2013-01-01"),
                np.nan,
                Timestamp("2013-02-01"),
                np.nan,
                Timestamp("2013-02-01"),
                np.nan,
                Timestamp("2013-01-01"),
            ],
            "str": [np.nan, "a", np.nan, "a", np.nan, "a", np.nan, "b"],
        }
    )
    grouped = df.groupby("dt")

    expected = [Index([1, 7]), Index([3, 5])]
    keys = sorted(grouped.groups.keys())
    assert len(keys) == 2
    for k, e in zip(keys, expected):
        # grouped.groups keys are np.datetime64 with system tz
        # not to be affected by tz, only compare values
        tm.assert_index_equal(grouped.groups[k], e)

    # confirm obj is not filtered
    tm.assert_frame_equal(grouped.grouper.groupings[0].obj, df)
    assert grouped.ngroups == 2

    expected = {
        Timestamp("2013-01-01 00:00:00"): np.array([1, 7], dtype=np.intp),
        Timestamp("2013-02-01 00:00:00"): np.array([3, 5], dtype=np.intp),
    }

    for k in grouped.indices:
        tm.assert_numpy_array_equal(grouped.indices[k], expected[k])

    tm.assert_frame_equal(grouped.get_group(Timestamp("2013-01-01")), df.iloc[[1, 7]])
    tm.assert_frame_equal(grouped.get_group(Timestamp("2013-02-01")), df.iloc[[3, 5]])

    with pytest.raises(KeyError, match=r"^NaT$"):
        grouped.get_group(pd.NaT)

    nan_df = DataFrame(
        {"nan": [np.nan, np.nan, np.nan], "nat": [pd.NaT, pd.NaT, pd.NaT]}
    )
    assert nan_df["nan"].dtype == "float64"
    assert nan_df["nat"].dtype == "datetime64[ns]"

    for key in ["nan", "nat"]:
        grouped = nan_df.groupby(key)
        assert grouped.groups == {}
        assert grouped.ngroups == 0
        assert grouped.indices == {}
        with pytest.raises(KeyError, match=r"^nan$"):
            grouped.get_group(np.nan)
        with pytest.raises(KeyError, match=r"^NaT$"):
            grouped.get_group(pd.NaT)
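
# The behavior pinned down above: with the default dropna=True, rows whose
# group key is NaT (or NaN) are excluded from the grouping entirely, so
# they appear in no group, no indices entry, and cannot be fetched via
# get_group.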


def test_groupby_two_group_keys_all_nan():
    # GH #36842: Grouping over two group keys shouldn't raise an error
    df = DataFrame({"a": [np.nan, np.nan], "b": [np.nan, np.nan], "c": [1, 2]})
    result = df.groupby(["a", "b"]).indices
    assert result == {}


def test_groupby_2d_malformed():
    d = DataFrame(index=range(2))
    d["group"] = ["g1", "g2"]
    d["zeros"] = [0, 0]
    d["ones"] = [1, 1]
    d["label"] = ["l1", "l2"]
    tmp = d.groupby(["group"]).mean(numeric_only=True)
    res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
    tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
    tm.assert_numpy_array_equal(tmp.values, res_values)


def test_int32_overflow():
    B = np.concatenate((np.arange(10000), np.arange(10000), np.arange(5000)))
    A = np.arange(25000)
    df = DataFrame({"A": A, "B": B, "C": A, "D": B, "E": np.random.randn(25000)})

    left = df.groupby(["A", "B", "C", "D"]).sum()
    right = df.groupby(["D", "C", "B", "A"]).sum()
    assert len(left) == len(right)
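    # The sizes above are the point of the test: the flattened group index is
    # built from the product of the per-key cardinalities (four keys with
    # ~10k-25k distinct values each), which easily exceeds 2**31, so the
    # grouping machinery must take the 64-bit/compressed-labels path rather
    # than silently wrapping in int32. Only the group counts are compared here.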


def test_groupby_sort_multi():
    df = DataFrame(
        {
            "a": ["foo", "bar", "baz"],
            "b": [3, 2, 1],
            "c": [0, 1, 2],
            "d": np.random.randn(3),
        }
    )

    tups = [tuple(row) for row in df[["a", "b", "c"]].values]
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(["a", "b", "c"], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[1, 2, 0]])

    tups = [tuple(row) for row in df[["c", "a", "b"]].values]
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(["c", "a", "b"], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups)

    tups = [tuple(x) for x in df[["b", "c", "a"]].values]
    tups = com.asarray_tuplesafe(tups)
    result = df.groupby(["b", "c", "a"], sort=True).sum()
    tm.assert_numpy_array_equal(result.index.values, tups[[2, 1, 0]])

    df = DataFrame(
        {"a": [0, 1, 2, 0, 1, 2], "b": [0, 0, 0, 1, 1, 1], "d": np.random.randn(6)}
    )
    grouped = df.groupby(["a", "b"])["d"]
    result = grouped.sum()

    def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
        tups = [tuple(row) for row in df[keys].values]
        tups = com.asarray_tuplesafe(tups)
        expected = f(df.groupby(tups)[field])
        for k, v in expected.items():
            assert result[k] == v

    _check_groupby(df, result, ["a", "b"], "d")


def test_dont_clobber_name_column():
    df = DataFrame(
        {"key": ["a", "a", "a", "b", "b", "b"], "name": ["foo", "bar", "baz"] * 2}
    )

    result = df.groupby("key", group_keys=False).apply(lambda x: x)
    tm.assert_frame_equal(result, df)


def test_skip_group_keys():
    tsf = tm.makeTimeDataFrame()

    grouped = tsf.groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values(by="A")[:3])

    pieces = [group.sort_values(by="A")[:3] for key, group in grouped]

    expected = pd.concat(pieces)
    tm.assert_frame_equal(result, expected)

    grouped = tsf["A"].groupby(lambda x: x.month, group_keys=False)
    result = grouped.apply(lambda x: x.sort_values()[:3])

    pieces = [group.sort_values()[:3] for key, group in grouped]

    expected = pd.concat(pieces)
    tm.assert_series_equal(result, expected)


def test_no_nonsense_name(float_frame):
    # GH #995
    s = float_frame["C"].copy()
    s.name = None

    result = s.groupby(float_frame["A"]).agg(np.sum)
    assert result.name is None


def test_multifunc_sum_bug():
    # GH #1065
    x = DataFrame(np.arange(9).reshape(3, 3))
    x["test"] = 0
    x["fl"] = [1.3, 1.5, 1.6]

    grouped = x.groupby("test")
    result = grouped.agg({"fl": "sum", 2: "size"})
    assert result["fl"].dtype == np.float64


def test_handle_dict_return_value(df):
    def f(group):
        return {"max": group.max(), "min": group.min()}

    def g(group):
        return Series({"max": group.max(), "min": group.min()})

    result = df.groupby("A")["C"].apply(f)
    expected = df.groupby("A")["C"].apply(g)

    assert isinstance(result, Series)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("grouper", ["A", ["A", "B"]])
def test_set_group_name(df, grouper):
    def f(group):
        assert group.name is not None
        return group

    def freduce(group):
        assert group.name is not None
        return group.sum()

    def freducex(x):
        return freduce(x)

    grouped = df.groupby(grouper, group_keys=False)

    # make sure all these work
    grouped.apply(f)
    grouped.aggregate(freduce)
    grouped.aggregate({"C": freduce, "D": freduce})
    grouped.transform(f)

    grouped["C"].apply(f)
    grouped["C"].aggregate(freduce)
    grouped["C"].aggregate([freduce, freducex])
    grouped["C"].transform(f)


def test_group_name_available_in_inference_pass():
    # gh-15062
    df = DataFrame({"a": [0, 0, 1, 1, 2, 2], "b": np.arange(6)})

    names = []

    def f(group):
        names.append(group.name)
        return group.copy()

    df.groupby("a", sort=False, group_keys=False).apply(f)

    expected_names = [0, 1, 2]
    assert names == expected_names


def test_no_dummy_key_names(df):
    # see gh-1291
    result = df.groupby(df["A"].values).sum()
    assert result.index.name is None

    result = df.groupby([df["A"].values, df["B"].values]).sum()
    assert result.index.names == (None, None)


def test_groupby_sort_multiindex_series():
    # series multiindex groupby sort argument was not being passed through
    # _compress_group_index
    # GH 9444
    index = MultiIndex(
        levels=[[1, 2], [1, 2]],
        codes=[[0, 0, 0, 0, 1, 1], [1, 1, 0, 0, 0, 0]],
        names=["a", "b"],
    )
    mseries = Series([0, 1, 2, 3, 4, 5], index=index)
    index = MultiIndex(
        levels=[[1, 2], [1, 2]], codes=[[0, 0, 1], [1, 0, 0]], names=["a", "b"]
    )
    mseries_result = Series([0, 2, 4], index=index)

    result = mseries.groupby(level=["a", "b"], sort=False).first()
    tm.assert_series_equal(result, mseries_result)

    result = mseries.groupby(level=["a", "b"], sort=True).first()
    tm.assert_series_equal(result, mseries_result.sort_index())


def test_groupby_reindex_inside_function():
    periods = 1000
    ind = date_range(start="2012/1/1", freq="5min", periods=periods)
    df = DataFrame({"high": np.arange(periods), "low": np.arange(periods)}, index=ind)

    def agg_before(func, fix=False):
        """
        Run an aggregate func on the subset of data.
        """

        def _func(data):
            d = data.loc[data.index.map(lambda x: x.hour < 11)].dropna()
            if fix:
                data[data.index[0]]
            if len(d) == 0:
                return None
            return func(d)

        return _func

    grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))
    closure_bad = grouped.agg({"high": agg_before(np.max)})
    closure_good = grouped.agg({"high": agg_before(np.max, True)})

    tm.assert_frame_equal(closure_bad, closure_good)
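    # The only difference between the two closures is that the "fix" variant
    # touches data[data.index[0]] before aggregating; the assertion is that
    # this extra indexing step must not change the aggregation result.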


def test_groupby_multiindex_missing_pair():
    # GH9049
    df = DataFrame(
        {
            "group1": ["a", "a", "a", "b"],
            "group2": ["c", "c", "d", "c"],
            "value": [1, 1, 1, 5],
        }
    )
    df = df.set_index(["group1", "group2"])
    df_grouped = df.groupby(level=["group1", "group2"], sort=True)

    res = df_grouped.agg("sum")
    idx = MultiIndex.from_tuples(
        [("a", "c"), ("a", "d"), ("b", "c")], names=["group1", "group2"]
    )
    exp = DataFrame([[2], [1], [5]], index=idx, columns=["value"])

    tm.assert_frame_equal(res, exp)


def test_groupby_multiindex_not_lexsorted():
    # GH 11640

    # define the lexsorted version
    lexsorted_mi = MultiIndex.from_tuples(
        [("a", ""), ("b1", "c1"), ("b2", "c2")], names=["b", "c"]
    )
    lexsorted_df = DataFrame([[1, 3, 4]], columns=lexsorted_mi)
    assert lexsorted_df.columns._is_lexsorted()

    # define the non-lexsorted version
    not_lexsorted_df = DataFrame(
        columns=["a", "b", "c", "d"], data=[[1, "b1", "c1", 3], [1, "b2", "c2", 4]]
    )
    not_lexsorted_df = not_lexsorted_df.pivot_table(
        index="a", columns=["b", "c"], values="d"
    )
    not_lexsorted_df = not_lexsorted_df.reset_index()
    assert not not_lexsorted_df.columns._is_lexsorted()

    # compare the results
    tm.assert_frame_equal(lexsorted_df, not_lexsorted_df)

    expected = lexsorted_df.groupby("a").mean()
    with tm.assert_produces_warning(PerformanceWarning):
        result = not_lexsorted_df.groupby("a").mean()
    tm.assert_frame_equal(expected, result)

    # a transforming function should work regardless of sort
    # GH 14776
    df = DataFrame(
        {"x": ["a", "a", "b", "a"], "y": [1, 1, 2, 2], "z": [1, 2, 3, 4]}
    ).set_index(["x", "y"])
    assert not df.index._is_lexsorted()

    for level in [0, 1, [0, 1]]:
        for sort in [False, True]:
            result = df.groupby(level=level, sort=sort, group_keys=False).apply(
                DataFrame.drop_duplicates
            )
            expected = df
            tm.assert_frame_equal(expected, result)

            result = (
                df.sort_index()
                .groupby(level=level, sort=sort, group_keys=False)
                .apply(DataFrame.drop_duplicates)
            )
            expected = df.sort_index()
            tm.assert_frame_equal(expected, result)
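    # Indexing into a MultiIndex that is not lexsorted falls back to a slower
    # path, so the groupby on the non-lexsorted frame is expected to emit a
    # PerformanceWarning rather than an error; the results themselves match.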


def test_index_label_overlaps_location():
    # checking we don't have any label/location confusion in the
    # wake of GH5375
    df = DataFrame(list("ABCDE"), index=[2, 0, 2, 1, 1])
    g = df.groupby(list("ababb"))
    actual = g.filter(lambda x: len(x) > 2)
    expected = df.iloc[[1, 3, 4]]
    tm.assert_frame_equal(actual, expected)

    ser = df[0]
    g = ser.groupby(list("ababb"))
    actual = g.filter(lambda x: len(x) > 2)
    expected = ser.take([1, 3, 4])
    tm.assert_series_equal(actual, expected)

    # and again, with a generic Index of floats
    df.index = df.index.astype(float)
    g = df.groupby(list("ababb"))
    actual = g.filter(lambda x: len(x) > 2)
    expected = df.iloc[[1, 3, 4]]
    tm.assert_frame_equal(actual, expected)

    ser = df[0]
    g = ser.groupby(list("ababb"))
    actual = g.filter(lambda x: len(x) > 2)
    expected = ser.take([1, 3, 4])
    tm.assert_series_equal(actual, expected)


def test_transform_doesnt_clobber_ints():
    # GH 7972
    n = 6
    x = np.arange(n)
    df = DataFrame({"a": x // 2, "b": 2.0 * x, "c": 3.0 * x})
    df2 = DataFrame({"a": x // 2 * 1.0, "b": 2.0 * x, "c": 3.0 * x})

    gb = df.groupby("a")
    result = gb.transform("mean")

    gb2 = df2.groupby("a")
    expected = gb2.transform("mean")
    tm.assert_frame_equal(result, expected)
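    # i.e. grouping by an integer key column must give exactly the same
    # transform output as grouping by its float equivalent; the key dtype
    # should not leak into (or clobber) the transformed values.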


@pytest.mark.parametrize(
    "sort_column",
    ["ints", "floats", "strings", ["ints", "floats"], ["ints", "strings"]],
)
@pytest.mark.parametrize(
    "group_column", ["int_groups", "string_groups", ["int_groups", "string_groups"]]
)
def test_groupby_preserves_sort(sort_column, group_column):
    # Test to ensure that groupby always preserves sort order of original
    # object. Issue #8588 and #9651
    df = DataFrame(
        {
            "int_groups": [3, 1, 0, 1, 0, 3, 3, 3],
            "string_groups": ["z", "a", "z", "a", "a", "g", "g", "g"],
            "ints": [8, 7, 4, 5, 2, 9, 1, 1],
            "floats": [2.3, 5.3, 6.2, -2.4, 2.2, 1.1, 1.1, 5],
            "strings": ["z", "d", "a", "e", "word", "word2", "42", "47"],
        }
    )

    # Try sorting on different types and with different group types
    df = df.sort_values(by=sort_column)
    g = df.groupby(group_column)

    def test_sort(x):
        tm.assert_frame_equal(x, x.sort_values(by=sort_column))

    g.apply(test_sort)


def test_pivot_table_values_key_error():
    # This test is designed to replicate the error in issue #14938
    df = DataFrame(
        {
            "eventDate": date_range(datetime.today(), periods=20, freq="M").tolist(),
            "thename": range(0, 20),
        }
    )

    df["year"] = df.set_index("eventDate").index.year
    df["month"] = df.set_index("eventDate").index.month

    with pytest.raises(KeyError, match="'badname'"):
        df.reset_index().pivot_table(
            index="year", columns="month", values="badname", aggfunc="count"
        )


@pytest.mark.parametrize("columns", ["C", ["C"]])
@pytest.mark.parametrize("keys", [["A"], ["A", "B"]])
@pytest.mark.parametrize(
    "values",
    [
        [True],
        [0],
        [0.0],
        ["a"],
        Categorical([0]),
        [to_datetime(0)],
        date_range(0, 1, 1, tz="US/Eastern"),
        pd.period_range("2016-01-01", periods=3, freq="D"),
        pd.array([0], dtype="Int64"),
        pd.array([0], dtype="Float64"),
        pd.array([False], dtype="boolean"),
    ],
    ids=[
        "bool",
        "int",
        "float",
        "str",
        "cat",
        "dt64",
        "dt64tz",
        "period",
        "Int64",
        "Float64",
        "boolean",
    ],
)
@pytest.mark.parametrize("method", ["attr", "agg", "apply"])
@pytest.mark.parametrize(
    "op", ["idxmax", "idxmin", "min", "max", "sum", "prod", "skew"]
)
def test_empty_groupby(
    columns, keys, values, method, op, request, using_array_manager, dropna
):
    # GH8093 & GH26411
    override_dtype = None

    if (
        isinstance(values, Categorical)
        and len(keys) == 1
        and op in ["idxmax", "idxmin"]
    ):
        mark = pytest.mark.xfail(
            raises=ValueError, match="attempt to get arg(min|max) of an empty sequence"
        )
        request.node.add_marker(mark)

    if isinstance(values, BooleanArray) and op in ["sum", "prod"]:
        # We expect to get Int64 back for these
        override_dtype = "Int64"

    if isinstance(values[0], bool) and op in ("prod", "sum"):
        # sum/product of bools is an integer
        override_dtype = "int64"

    df = DataFrame({"A": values, "B": values, "C": values}, columns=list("ABC"))

    if hasattr(values, "dtype"):
        # check that we did the construction right
        assert (df.dtypes == values.dtype).all()

    df = df.iloc[:0]

    gb = df.groupby(keys, group_keys=False, dropna=dropna)[columns]

    def get_result(**kwargs):
        if method == "attr":
            return getattr(gb, op)(**kwargs)
        else:
            return getattr(gb, method)(op, **kwargs)

    def get_categorical_invalid_expected():
        # Categorical is special without 'observed=True'; we get a NaN entry
        # corresponding to the unobserved group. If we passed observed=True
        # to groupby, expected would just be 'df.set_index(keys)[columns]'
        # as below
        lev = Categorical([0], dtype=values.dtype)
        if len(keys) != 1:
            idx = MultiIndex.from_product([lev, lev], names=keys)
        else:
            # all columns are dropped, but we end up with one row
            idx = Index(lev, name=keys[0])

        expected = DataFrame([], columns=[], index=idx)
        return expected

    is_per = isinstance(df.dtypes[0], pd.PeriodDtype)
    is_dt64 = df.dtypes[0].kind == "M"
    is_cat = isinstance(values, Categorical)

    if isinstance(values, Categorical) and not values.ordered and op in ["min", "max"]:
        msg = f"Cannot perform {op} with non-ordered Categorical"
        with pytest.raises(TypeError, match=msg):
            get_result()

        if isinstance(columns, list):
            # i.e. DataFrameGroupBy, not SeriesGroupBy
            result = get_result(numeric_only=True)
            expected = get_categorical_invalid_expected()
            tm.assert_equal(result, expected)
        return

    if op in ["prod", "sum", "skew"]:
        # ops that require more than just ordered-ness
        if is_dt64 or is_cat or is_per:
            # GH#41291
            # datetime64 -> prod and sum are invalid
            if op == "skew":
                msg = "does not support reduction 'skew'"
            elif is_dt64:
                msg = "datetime64 type does not support"
            elif is_per:
                msg = "Period type does not support"
            else:
                msg = "category type does not support"
            with pytest.raises(TypeError, match=msg):
                get_result()

            if not isinstance(columns, list):
                # i.e. SeriesGroupBy
                return
            elif op == "skew":
                # TODO: test the numeric_only=True case
                return
            else:
                # i.e. op in ["prod", "sum"]
                # i.e. DataFrameGroupBy
                # ops that require more than just ordered-ness
                # GH#41291
                result = get_result(numeric_only=True)

                # with numeric_only=True, these are dropped, and we get
                # an empty DataFrame back
                expected = df.set_index(keys)[[]]
                if is_cat:
                    expected = get_categorical_invalid_expected()
                tm.assert_equal(result, expected)
                return

    result = get_result()
    expected = df.set_index(keys)[columns]
    if override_dtype is not None:
        expected = expected.astype(override_dtype)
    if len(keys) == 1:
        expected.index.name = keys[0]
    tm.assert_equal(result, expected)


def test_empty_groupby_apply_nonunique_columns():
    # GH#44417
    df = DataFrame(np.random.randn(0, 4))
    df[3] = df[3].astype(np.int64)
    df.columns = [0, 1, 2, 0]
    gb = df.groupby(df[1], group_keys=False)
    res = gb.apply(lambda x: x)
    assert (res.dtypes == df.dtypes).all()


def test_tuple_as_grouping():
    # https://github.com/pandas-dev/pandas/issues/18314
    df = DataFrame(
        {
            ("a", "b"): [1, 1, 1, 1],
            "a": [2, 2, 2, 2],
            "b": [2, 2, 2, 2],
            "c": [1, 1, 1, 1],
        }
    )

    with pytest.raises(KeyError, match=r"('a', 'b')"):
        df[["a", "b", "c"]].groupby(("a", "b"))

    result = df.groupby(("a", "b"))["c"].sum()
    expected = Series([4], name="c", index=Index([1], name=("a", "b")))
    tm.assert_series_equal(result, expected)


def test_tuple_correct_keyerror():
    # https://github.com/pandas-dev/pandas/issues/18798
    df = DataFrame(1, index=range(3), columns=MultiIndex.from_product([[1, 2], [3, 4]]))
    with pytest.raises(KeyError, match=r"^\(7, 8\)$"):
        df.groupby((7, 8)).mean()


def test_groupby_agg_ohlc_non_first():
    # GH 21716
    df = DataFrame(
        [[1], [1]],
        columns=Index(["foo"], name="mycols"),
        index=date_range("2018-01-01", periods=2, freq="D", name="dti"),
    )

    expected = DataFrame(
        [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]],
        columns=MultiIndex.from_tuples(
            (
                ("foo", "sum", "foo"),
                ("foo", "ohlc", "open"),
                ("foo", "ohlc", "high"),
                ("foo", "ohlc", "low"),
                ("foo", "ohlc", "close"),
            ),
            names=["mycols", None, None],
        ),
        index=date_range("2018-01-01", periods=2, freq="D", name="dti"),
    )

    result = df.groupby(Grouper(freq="D")).agg(["sum", "ohlc"])

    tm.assert_frame_equal(result, expected)
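    # agg(["sum", "ohlc"]) expands each input column into (column, func,
    # subcolumn) levels of a column MultiIndex; the regression guarded here is
    # that "ohlc" used to work only when it was the first aggregation listed.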


def test_groupby_multiindex_nat():
    # GH 9236
    values = [
        (pd.NaT, "a"),
        (datetime(2012, 1, 2), "a"),
        (datetime(2012, 1, 2), "b"),
        (datetime(2012, 1, 3), "a"),
    ]
    mi = MultiIndex.from_tuples(values, names=["date", None])
    ser = Series([3, 2, 2.5, 4], index=mi)

    result = ser.groupby(level=1).mean()
    expected = Series([3.0, 2.5], index=["a", "b"])
    tm.assert_series_equal(result, expected)


def test_groupby_empty_list_raises():
    # GH 5289
    values = zip(range(10), range(10))
    df = DataFrame(values, columns=["apple", "b"])
    msg = "Grouper and axis must be same length"
    with pytest.raises(ValueError, match=msg):
        df.groupby([[]])


def test_groupby_multiindex_series_keys_len_equal_group_axis():
    # GH 25704
    index_array = [["x", "x"], ["a", "b"], ["k", "k"]]
    index_names = ["first", "second", "third"]
    ri = MultiIndex.from_arrays(index_array, names=index_names)
    s = Series(data=[1, 2], index=ri)
    result = s.groupby(["first", "third"]).sum()

    index_array = [["x"], ["k"]]
    index_names = ["first", "third"]
    ei = MultiIndex.from_arrays(index_array, names=index_names)
    expected = Series([3], index=ei)

    tm.assert_series_equal(result, expected)


def test_groupby_groups_in_BaseGrouper():
    # GH 26326
    # Test if DataFrame grouped with a pandas.Grouper has correct groups
    mi = MultiIndex.from_product([["A", "B"], ["C", "D"]], names=["alpha", "beta"])
    df = DataFrame({"foo": [1, 2, 1, 2], "bar": [1, 2, 3, 4]}, index=mi)
    result = df.groupby([Grouper(level="alpha"), "beta"])
    expected = df.groupby(["alpha", "beta"])
    assert result.groups == expected.groups

    result = df.groupby(["beta", Grouper(level="alpha")])
    expected = df.groupby(["beta", "alpha"])
    assert result.groups == expected.groups


@pytest.mark.parametrize("group_name", ["x", ["x"]])
def test_groupby_axis_1(group_name):
    # GH 27614
    df = DataFrame(
        np.arange(12).reshape(3, 4), index=[0, 1, 0], columns=[10, 20, 10, 20]
    )
    df.index.name = "y"
    df.columns.name = "x"

    results = df.groupby(group_name, axis=1).sum()
    expected = df.T.groupby(group_name).sum().T
    tm.assert_frame_equal(results, expected)

    # test on MI column
    iterables = [["bar", "baz", "foo"], ["one", "two"]]
    mi = MultiIndex.from_product(iterables=iterables, names=["x", "x1"])
    df = DataFrame(np.arange(18).reshape(3, 6), index=[0, 1, 0], columns=mi)
    results = df.groupby(group_name, axis=1).sum()
    expected = df.T.groupby(group_name).sum().T
    tm.assert_frame_equal(results, expected)


@pytest.mark.parametrize(
    "op, expected",
    [
        (
            "shift",
            {
                "time": [
                    None,
                    None,
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    None,
                    None,
                ]
            },
        ),
        (
            "bfill",
            {
                "time": [
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                ]
            },
        ),
        (
            "ffill",
            {
                "time": [
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 12:00:00"),
                    Timestamp("2019-01-01 12:30:00"),
                    Timestamp("2019-01-01 14:00:00"),
                    Timestamp("2019-01-01 14:30:00"),
                ]
            },
        ),
    ],
)
def test_shift_bfill_ffill_tz(tz_naive_fixture, op, expected):
    # GH19995, GH27992: Check that timezone does not drop in shift, bfill, and ffill
    tz = tz_naive_fixture
    data = {
        "id": ["A", "B", "A", "B", "A", "B"],
        "time": [
            Timestamp("2019-01-01 12:00:00"),
            Timestamp("2019-01-01 12:30:00"),
            None,
            None,
            Timestamp("2019-01-01 14:00:00"),
            Timestamp("2019-01-01 14:30:00"),
        ],
    }
    df = DataFrame(data).assign(time=lambda x: x.time.dt.tz_localize(tz))

    grouped = df.groupby("id")
    result = getattr(grouped, op)()
    expected = DataFrame(expected).assign(time=lambda x: x.time.dt.tz_localize(tz))
    tm.assert_frame_equal(result, expected)
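    # shift/bfill/ffill operate within each "id" group, so values never leak
    # across groups (hence the leading Nones in the "shift" expectation), and
    # the timezone applied via tz_localize must survive each operation.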


def test_groupby_only_none_group():
    # see GH21624
    # this was crashing with "ValueError: Length of passed values is 1, index implies 0"
    df = DataFrame({"g": [None], "x": 1})
    actual = df.groupby("g")["x"].transform("sum")
    expected = Series([np.nan], name="x")

    tm.assert_series_equal(actual, expected)


def test_groupby_duplicate_index():
    # GH#29189 the groupby call here used to raise
    ser = Series([2, 5, 6, 8], index=[2.0, 4.0, 4.0, 5.0])
    gb = ser.groupby(level=0)

    result = gb.mean()
    expected = Series([2, 5.5, 8], index=[2.0, 4.0, 5.0])
    tm.assert_series_equal(result, expected)


def test_group_on_empty_multiindex(transformation_func, request):
    # GH 47787
    # With one row, those are transforms so the schema should be the same
    df = DataFrame(
        data=[[1, Timestamp("today"), 3, 4]],
        columns=["col_1", "col_2", "col_3", "col_4"],
    )
    df["col_3"] = df["col_3"].astype(int)
    df["col_4"] = df["col_4"].astype(int)
    df = df.set_index(["col_1", "col_2"])
    if transformation_func == "fillna":
        args = ("ffill",)
    else:
        args = ()
    result = df.iloc[:0].groupby(["col_1"]).transform(transformation_func, *args)
    expected = df.groupby(["col_1"]).transform(transformation_func, *args).iloc[:0]
    if transformation_func in ("diff", "shift"):
        expected = expected.astype(int)
    tm.assert_equal(result, expected)

    result = (
        df["col_3"].iloc[:0].groupby(["col_1"]).transform(transformation_func, *args)
    )
    expected = (
        df["col_3"].groupby(["col_1"]).transform(transformation_func, *args).iloc[:0]
    )
    if transformation_func in ("diff", "shift"):
        expected = expected.astype(int)
    tm.assert_equal(result, expected)


@pytest.mark.parametrize(
    "idx",
    [
        Index(["a", "a"], name="foo"),
        MultiIndex.from_tuples((("a", "a"), ("a", "a")), names=["foo", "bar"]),
    ],
)
def test_dup_labels_output_shape(groupby_func, idx):
    if groupby_func in {"size", "ngroup", "cumcount"}:
        pytest.skip(f"Not applicable for {groupby_func}")

    df = DataFrame([[1, 1]], columns=idx)
    grp_by = df.groupby([0])

    args = get_groupby_method_args(groupby_func, df)
    result = getattr(grp_by, groupby_func)(*args)

    assert result.shape == (1, 2)
    tm.assert_index_equal(result.columns, idx)


def test_groupby_crash_on_nunique(axis):
    # Fix following 30253
    dti = date_range("2016-01-01", periods=2, name="foo")
    df = DataFrame({("A", "B"): [1, 2], ("A", "C"): [1, 3], ("D", "B"): [0, 0]})
    df.columns.names = ("bar", "baz")
    df.index = dti

    axis_number = df._get_axis_number(axis)
    if not axis_number:
        df = df.T

    gb = df.groupby(axis=axis_number, level=0)
    result = gb.nunique()

    expected = DataFrame({"A": [1, 2], "D": [1, 1]}, index=dti)
    expected.columns.name = "bar"

    if not axis_number:
        expected = expected.T

    tm.assert_frame_equal(result, expected)

    if axis_number == 0:
        # same thing, but empty columns
        gb2 = df[[]].groupby(axis=axis_number, level=0)
        exp = expected[[]]
    else:
        # same thing, but empty rows
        gb2 = df.loc[[]].groupby(axis=axis_number, level=0)
        # default for empty when we can't infer a dtype is float64
        exp = expected.loc[[]].astype(np.float64)

    res = gb2.nunique()
    tm.assert_frame_equal(res, exp)


def test_groupby_list_level():
    # GH 9790
    expected = DataFrame(np.arange(0, 9).reshape(3, 3), dtype=float)
    result = expected.groupby(level=[0]).mean()
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "max_seq_items, expected",
    [
        (5, "{0: [0], 1: [1], 2: [2], 3: [3], 4: [4]}"),
        (4, "{0: [0], 1: [1], 2: [2], 3: [3], ...}"),
        (1, "{0: [0], ...}"),
    ],
)
def test_groups_repr_truncates(max_seq_items, expected):
    # GH 1135
    df = DataFrame(np.random.randn(5, 1))
    df["a"] = df.index

    with pd.option_context("display.max_seq_items", max_seq_items):
        result = df.groupby("a").groups.__repr__()
        assert result == expected

        result = df.groupby(np.array(df.a)).groups.__repr__()
        assert result == expected


def test_group_on_two_row_multiindex_returns_one_tuple_key():
    # GH 18451
    df = DataFrame([{"a": 1, "b": 2, "c": 99}, {"a": 1, "b": 2, "c": 88}])
    df = df.set_index(["a", "b"])

    grp = df.groupby(["a", "b"])
    result = grp.indices
    expected = {(1, 2): np.array([0, 1], dtype=np.int64)}

    assert len(result) == 1
    key = (1, 2)
    assert (result[key] == expected[key]).all()


@pytest.mark.parametrize(
    "klass, attr, value",
    [
        (DataFrame, "level", "a"),
        (DataFrame, "as_index", False),
        (DataFrame, "sort", False),
        (DataFrame, "group_keys", False),
        (DataFrame, "observed", True),
        (DataFrame, "dropna", False),
        (Series, "level", "a"),
        (Series, "as_index", False),
        (Series, "sort", False),
        (Series, "group_keys", False),
        (Series, "observed", True),
        (Series, "dropna", False),
    ],
)
def test_subsetting_columns_keeps_attrs(klass, attr, value):
    # GH 9959 - When subsetting columns, don't drop attributes
    df = DataFrame({"a": [1], "b": [2], "c": [3]})
    if attr != "axis":
        df = df.set_index("a")

    expected = df.groupby("a", **{attr: value})
    result = expected[["b"]] if klass is DataFrame else expected["b"]
    assert getattr(result, attr) == getattr(expected, attr)


def test_subsetting_columns_axis_1():
    # GH 37725
    g = DataFrame({"A": [1], "B": [2], "C": [3]}).groupby([0, 0, 1], axis=1)
    match = "Cannot subset columns when using axis=1"
    with pytest.raises(ValueError, match=match):
        g[["A", "B"]].sum()


@pytest.mark.parametrize("func", ["sum", "any", "shift"])
def test_groupby_column_index_name_lost(func):
    # GH: 29764 groupby loses index sometimes
    expected = Index(["a"], name="idx")
    df = DataFrame([[1]], columns=expected)
    df_grouped = df.groupby([1])
    result = getattr(df_grouped, func)().columns
    tm.assert_index_equal(result, expected)


def test_groupby_duplicate_columns():
    # GH: 31735
    df = DataFrame(
        {"A": ["f", "e", "g", "h"], "B": ["a", "b", "c", "d"], "C": [1, 2, 3, 4]}
    ).astype(object)
    df.columns = ["A", "B", "B"]
    result = df.groupby([0, 0, 0, 0]).min()
    expected = DataFrame(
        [["e", "a", 1]], index=np.array([0]), columns=["A", "B", "B"], dtype=object
    )
    tm.assert_frame_equal(result, expected)


def test_groupby_series_with_tuple_name():
    # GH 37755
    ser = Series([1, 2, 3, 4], index=[1, 1, 2, 2], name=("a", "a"))
    ser.index.name = ("b", "b")
    result = ser.groupby(level=0).last()
    expected = Series([2, 4], index=[1, 2], name=("a", "a"))
    expected.index.name = ("b", "b")
    tm.assert_series_equal(result, expected)


@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
@pytest.mark.parametrize(
    "func, values", [("sum", [97.0, 98.0]), ("mean", [24.25, 24.5])]
)
def test_groupby_numerical_stability_sum_mean(func, values):
    # GH#38778
    data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
    result = getattr(df.groupby("group"), func)()
    expected = DataFrame({"a": values, "b": values}, index=Index([1, 2], name="group"))
    tm.assert_frame_equal(result, expected)


@pytest.mark.xfail(not IS64, reason="GH#38778: fail on 32-bit system")
def test_groupby_numerical_stability_cumsum():
    # GH#38934
    data = [1e16, 1e16, 97, 98, -5e15, -5e15, -5e15, -5e15]
    df = DataFrame({"group": [1, 2] * 4, "a": data, "b": data})
    result = df.groupby("group").cumsum()
    exp_data = (
        [1e16] * 2 + [1e16 + 96, 1e16 + 98] + [5e15 + 97, 5e15 + 98] + [97.0, 98.0]
    )
    expected = DataFrame({"a": exp_data, "b": exp_data})
    tm.assert_frame_equal(result, expected, check_exact=True)
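    # Why these pass: at magnitude 1e16 the float64 spacing is 2, so a naive
    # running sum rounds 1e16 + 97 to 1e16 + 96 and the lost unit never comes
    # back (group 1 would end at 96.0, not 97.0). The grouped sum/cumsum
    # kernels use compensated (Kahan) summation (GH#38778, GH#38934), which is
    # what check_exact=True is verifying here.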


def test_groupby_cumsum_skipna_false():
    # GH#46216 don't propagate np.nan above the diagonal
    arr = np.random.randn(5, 5)
    df = DataFrame(arr)
    for i in range(5):
        df.iloc[i, i] = np.nan

    df["A"] = 1
    gb = df.groupby("A")

    res = gb.cumsum(skipna=False)

    expected = df[[0, 1, 2, 3, 4]].cumsum(skipna=False)
    tm.assert_frame_equal(res, expected)


def test_groupby_cumsum_timedelta64():
    # GH#46216 don't ignore is_datetimelike in libgroupby.group_cumsum
    dti = date_range("2016-01-01", periods=5)
    ser = Series(dti) - dti[0]
    ser[2] = pd.NaT

    df = DataFrame({"A": 1, "B": ser})
    gb = df.groupby("A")

    res = gb.cumsum(numeric_only=False, skipna=True)
    exp = DataFrame({"B": [ser[0], ser[1], pd.NaT, ser[4], ser[4] * 2]})
    tm.assert_frame_equal(res, exp)

    res = gb.cumsum(numeric_only=False, skipna=False)
    exp = DataFrame({"B": [ser[0], ser[1], pd.NaT, pd.NaT, pd.NaT]})
    tm.assert_frame_equal(res, exp)


def test_groupby_mean_duplicate_index(rand_series_with_duplicate_datetimeindex):
    dups = rand_series_with_duplicate_datetimeindex
    result = dups.groupby(level=0).mean()
    expected = dups.groupby(dups.index).mean()
    tm.assert_series_equal(result, expected)


def test_groupby_all_nan_groups_drop():
    # GH 15036
    s = Series([1, 2, 3], [np.nan, np.nan, np.nan])
    result = s.groupby(s.index).sum()
    expected = Series([], index=Index([], dtype=np.float64), dtype=np.int64)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("numeric_only", [True, False])
def test_groupby_empty_multi_column(as_index, numeric_only):
    # GH 15106 & GH 41998
    df = DataFrame(data=[], columns=["A", "B", "C"])
    gb = df.groupby(["A", "B"], as_index=as_index)
    result = gb.sum(numeric_only=numeric_only)
    if as_index:
        index = MultiIndex([[], []], [[], []], names=["A", "B"])
        columns = ["C"] if not numeric_only else []
    else:
        index = RangeIndex(0)
        columns = ["A", "B", "C"] if not numeric_only else ["A", "B"]
    expected = DataFrame([], columns=columns, index=index)
    tm.assert_frame_equal(result, expected)


def test_groupby_aggregation_non_numeric_dtype():
    # GH #43108
    df = DataFrame(
        [["M", [1]], ["M", [1]], ["W", [10]], ["W", [20]]], columns=["MW", "v"]
    )

    expected = DataFrame(
        {
            "v": [[1, 1], [10, 20]],
        },
        index=Index(["M", "W"], dtype="object", name="MW"),
    )

    gb = df.groupby(by=["MW"])
    result = gb.sum()
    tm.assert_frame_equal(result, expected)


def test_groupby_aggregation_multi_non_numeric_dtype():
    # GH #42395
    df = DataFrame(
        {
            "x": [1, 0, 1, 1, 0],
            "y": [Timedelta(i, "days") for i in range(1, 6)],
            "z": [Timedelta(i * 10, "days") for i in range(1, 6)],
        }
    )

    expected = DataFrame(
        {
            "y": [Timedelta(i, "days") for i in range(7, 9)],
            "z": [Timedelta(i * 10, "days") for i in range(7, 9)],
        },
        index=Index([0, 1], dtype="int64", name="x"),
    )

    gb = df.groupby(by=["x"])
    result = gb.sum()
    tm.assert_frame_equal(result, expected)


def test_groupby_aggregation_numeric_with_non_numeric_dtype():
    # GH #43108
    df = DataFrame(
        {
            "x": [1, 0, 1, 1, 0],
            "y": [Timedelta(i, "days") for i in range(1, 6)],
            "z": list(range(1, 6)),
        }
    )

    expected = DataFrame(
        {"y": [Timedelta(7, "days"), Timedelta(8, "days")], "z": [7, 8]},
        index=Index([0, 1], dtype="int64", name="x"),
    )

    gb = df.groupby(by=["x"])
    result = gb.sum()
    tm.assert_frame_equal(result, expected)


def test_groupby_filtered_df_std():
    # GH 16174
    dicts = [
        {"filter_col": False, "groupby_col": True, "bool_col": True, "float_col": 10.5},
        {"filter_col": True, "groupby_col": True, "bool_col": True, "float_col": 20.5},
        {"filter_col": True, "groupby_col": True, "bool_col": True, "float_col": 30.5},
    ]
    df = DataFrame(dicts)

    df_filter = df[df["filter_col"] == True]  # noqa:E712
    dfgb = df_filter.groupby("groupby_col")
    result = dfgb.std()
    expected = DataFrame(
        [[0.0, 0.0, 7.071068]],
        columns=["filter_col", "bool_col", "float_col"],
        index=Index([True], name="groupby_col"),
    )
    tm.assert_frame_equal(result, expected)


def test_datetime_categorical_multikey_groupby_indices():
    # GH 26859
    df = DataFrame(
        {
            "a": Series(list("abc")),
            "b": Series(
                to_datetime(["2018-01-01", "2018-02-01", "2018-03-01"]),
                dtype="category",
            ),
            "c": Categorical.from_codes([-1, 0, 1], categories=[0, 1]),
        }
    )
    result = df.groupby(["a", "b"]).indices
    expected = {
        ("a", Timestamp("2018-01-01 00:00:00")): np.array([0]),
        ("b", Timestamp("2018-02-01 00:00:00")): np.array([1]),
        ("c", Timestamp("2018-03-01 00:00:00")): np.array([2]),
    }
    assert result == expected


def test_rolling_wrong_param_min_period():
    # GH34037
    name_l = ["Alice"] * 5 + ["Bob"] * 5
    val_l = [np.nan, np.nan, 1, 2, 3] + [np.nan, 1, 2, 3, 4]
    test_df = DataFrame([name_l, val_l]).T
    test_df.columns = ["name", "val"]

    result_error_msg = r"__init__\(\) got an unexpected keyword argument 'min_period'"
    with pytest.raises(TypeError, match=result_error_msg):
        test_df.groupby("name")["val"].rolling(window=2, min_period=1).sum()


def test_by_column_values_with_same_starting_value():
    # GH29635
    df = DataFrame(
        {
            "Name": ["Thomas", "Thomas", "Thomas John"],
            "Credit": [1200, 1300, 900],
            "Mood": ["sad", "happy", "happy"],
        }
    )
    aggregate_details = {"Mood": Series.mode, "Credit": "sum"}
    result = df.groupby(["Name"]).agg(aggregate_details)
    expected_result = DataFrame(
        {
            "Mood": [["happy", "sad"], "happy"],
            "Credit": [2500, 900],
            "Name": ["Thomas", "Thomas John"],
        }
    ).set_index("Name")

    tm.assert_frame_equal(result, expected_result)


def test_groupby_none_in_first_mi_level():
    # GH#47348
    arr = [[None, 1, 0, 1], [2, 3, 2, 3]]
    ser = Series(1, index=MultiIndex.from_arrays(arr, names=["a", "b"]))
    result = ser.groupby(level=[0, 1]).sum()
    expected = Series(
        [1, 2], MultiIndex.from_tuples([(0.0, 2), (1.0, 3)], names=["a", "b"])
    )
    tm.assert_series_equal(result, expected)


def test_groupby_none_column_name():
    # GH#47348
    df = DataFrame({None: [1, 1, 2, 2], "b": [1, 1, 2, 3], "c": [4, 5, 6, 7]})
    result = df.groupby(by=[None]).sum()
    expected = DataFrame({"b": [2, 5], "c": [9, 13]}, index=Index([1, 2], name=None))
    tm.assert_frame_equal(result, expected)


def test_single_element_list_grouping():
    # GH 42795
    df = DataFrame({"a": [1, 2], "b": [np.nan, 5], "c": [np.nan, 2]}, index=["x", "y"])
    result = [key for key, _ in df.groupby(["a"])]
    expected = [(1,), (2,)]
    assert result == expected
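    # i.e. iterating a groupby keyed by a one-element *list* yields 1-tuples as
    # group keys, consistent with the list-of-keys contract; only a bare scalar
    # key (df.groupby("a")) yields scalar keys.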


@pytest.mark.parametrize("func", ["sum", "cumsum", "cumprod", "prod"])
def test_groupby_avoid_casting_to_float(func):
    # GH#37493
    val = 922337203685477580
    df = DataFrame({"a": 1, "b": [val]})
    result = getattr(df.groupby("a"), func)() - val
    expected = DataFrame({"b": [0]}, index=Index([1], name="a"))
    if func in ["cumsum", "cumprod"]:
        expected = expected.reset_index(drop=True)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func, val", [("sum", 3), ("prod", 2)])
def test_groupby_sum_support_mask(any_numeric_ea_dtype, func, val):
    # GH#37493
    df = DataFrame({"a": 1, "b": [1, 2, pd.NA]}, dtype=any_numeric_ea_dtype)
    result = getattr(df.groupby("a"), func)()
    expected = DataFrame(
        {"b": [val]},
        index=Index([1], name="a", dtype=any_numeric_ea_dtype),
        dtype=any_numeric_ea_dtype,
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("val, dtype", [(111, "int"), (222, "uint")])
def test_groupby_overflow(val, dtype):
    # GH#37493
    df = DataFrame({"a": 1, "b": [val, val]}, dtype=f"{dtype}8")
    result = df.groupby("a").sum()
    expected = DataFrame(
        {"b": [val * 2]},
        index=Index([1], name="a", dtype=f"{dtype}8"),
        dtype=f"{dtype}64",
    )
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a").cumsum()
    expected = DataFrame({"b": [val, val * 2]}, dtype=f"{dtype}64")
    tm.assert_frame_equal(result, expected)

    result = df.groupby("a").prod()
    expected = DataFrame(
        {"b": [val * val]},
        index=Index([1], name="a", dtype=f"{dtype}8"),
        dtype=f"{dtype}64",
    )
    tm.assert_frame_equal(result, expected)
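    # The doubled and squared values above do not fit in 8-bit storage, so the
    # expectations pin down the contract: sum/cumsum/prod results come back in
    # the 64-bit counterpart dtype, while the grouping index keeps the
    # original 8-bit dtype.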


@pytest.mark.parametrize("skipna, val", [(True, 3), (False, pd.NA)])
def test_groupby_cumsum_mask(any_numeric_ea_dtype, skipna, val):
    # GH#37493
    df = DataFrame({"a": 1, "b": [1, pd.NA, 2]}, dtype=any_numeric_ea_dtype)
    result = df.groupby("a").cumsum(skipna=skipna)
    expected = DataFrame(
        {"b": [1, pd.NA, val]},
        dtype=any_numeric_ea_dtype,
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "val_in, index, val_out",
    [
        (
            [1.0, 2.0, 3.0, 4.0, 5.0],
            ["foo", "foo", "bar", "baz", "blah"],
            [3.0, 4.0, 5.0, 3.0],
        ),
        (
            [1.0, 2.0, 3.0, 4.0, 5.0, 6.0],
            ["foo", "foo", "bar", "baz", "blah", "blah"],
            [3.0, 4.0, 11.0, 3.0],
        ),
    ],
)
def test_groupby_index_name_in_index_content(val_in, index, val_out):
    # GH 48567
    series = Series(data=val_in, name="values", index=Index(index, name="blah"))
    result = series.groupby("blah").sum()
    expected = Series(
        data=val_out,
        name="values",
        index=Index(["bar", "baz", "blah", "foo"], name="blah"),
    )
    tm.assert_series_equal(result, expected)

    result = series.to_frame().groupby("blah").sum()
    expected = expected.to_frame()
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("n", [1, 10, 32, 100, 1000])
def test_sum_of_booleans(n):
    # GH 50347
    df = DataFrame({"groupby_col": 1, "bool": [True] * n})
    df["bool"] = df["bool"].eq(True)
    result = df.groupby("groupby_col").sum()
    expected = DataFrame({"bool": [n]}, index=Index([1], name="groupby_col"))
    tm.assert_frame_equal(result, expected)
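    # A boolean column must sum to the integer count of True values regardless
    # of group size; the parametrization over n (including 32 and 1000)
    # presumably guards against size-dependent wraparound in the bool-to-int
    # accumulation reported in GH 50347.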


@pytest.mark.filterwarnings(
    "ignore:invalid value encountered in remainder:RuntimeWarning"
)
@pytest.mark.parametrize("method", ["head", "tail", "nth", "first", "last"])
def test_groupby_method_drop_na(method):
    # GH 21755
    df = DataFrame({"A": ["a", np.nan, "b", np.nan, "c"], "B": range(5)})

    if method == "nth":
        result = getattr(df.groupby("A"), method)(n=0)
    else:
        result = getattr(df.groupby("A"), method)()

    if method in ["first", "last"]:
        expected = DataFrame({"B": [0, 2, 4]}).set_index(
            Series(["a", "b", "c"], name="A")
        )
    else:
        expected = DataFrame({"A": ["a", "b", "c"], "B": [0, 2, 4]}, index=[0, 2, 4])
    tm.assert_frame_equal(result, expected)


def test_groupby_reduce_period():
    # GH#51040
    pi = pd.period_range("2016-01-01", periods=100, freq="D")
    grps = list(range(10)) * 10
    ser = pi.to_series()
    gb = ser.groupby(grps)

    with pytest.raises(TypeError, match="Period type does not support sum operations"):
        gb.sum()
    with pytest.raises(
        TypeError, match="Period type does not support cumsum operations"
    ):
        gb.cumsum()
    with pytest.raises(TypeError, match="Period type does not support prod operations"):
        gb.prod()
    with pytest.raises(
        TypeError, match="Period type does not support cumprod operations"
    ):
        gb.cumprod()

    res = gb.max()
    expected = ser[-10:]
    expected.index = Index(range(10), dtype=np.int_)
    tm.assert_series_equal(res, expected)

    res = gb.min()
    expected = ser[:10]
    expected.index = Index(range(10), dtype=np.int_)
    tm.assert_series_equal(res, expected)
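    # Period is ordered but not additive: min/max per group are well defined
    # (the first/last 10 entries of the series, as checked above), while
    # sum/prod/cumsum/cumprod must raise TypeError.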


def test_obj_with_exclusions_duplicate_columns():
    # GH#50806
    df = DataFrame([[0, 1, 2, 3]])
    df.columns = [0, 1, 2, 0]
    gb = df.groupby(df[1])
    result = gb._obj_with_exclusions
    expected = df.take([0, 2, 3], axis=1)
    tm.assert_frame_equal(result, expected)