test_categorical.py
from datetime import datetime

import numpy as np
import pytest

import pandas as pd
from pandas import (
    Categorical,
    CategoricalIndex,
    DataFrame,
    Index,
    MultiIndex,
    Series,
    qcut,
)
import pandas._testing as tm
from pandas.core.groupby.generic import SeriesGroupBy
from pandas.tests.groupby import get_groupby_method_args


def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN):
    """Reindex to a cartesian product for the groupers,
    preserving the nature (Categorical) of each grouper.
    An illustrative usage sketch follows this function.
    """

    def f(a):
        if isinstance(a, (CategoricalIndex, Categorical)):
            categories = a.categories
            a = Categorical.from_codes(
                np.arange(len(categories)), categories=categories, ordered=a.ordered
            )
        return a

    index = MultiIndex.from_product(map(f, args), names=names)
    return result.reindex(index, fill_value=fill_value).sort_index()
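

# Illustrative usage of the helper above (a sketch, not one of the original
# tests; the function name below is made up for the example): a result indexed
# only by the observed combinations is re-expanded to the full cartesian
# product of the grouper categories, with unobserved combinations filled in.
def _cartesian_product_helper_sketch():
    cat = Categorical(["a", "b"], categories=["a", "b", "c"])
    res = DataFrame(
        {"values": [1, 2]},
        index=MultiIndex.from_arrays([cat, [1, 2]], names=["A", "B"]),
    )
    full = cartesian_product_for_groupers(res, [cat, [1, 2]], ["A", "B"], fill_value=0)
    # 3 categories x 2 keys -> 6 rows; the rows for the unobserved category "c"
    # are filled with 0
    assert full.shape == (6, 1)
    assert full["values"].sum() == 3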


_results_for_groupbys_with_missing_categories = {
    # This maps the builtin groupby functions to their expected outputs for
    # missing categories when they are called on a categorical grouper with
    # observed=False. Some functions are expected to return NaN, some zero.
    # These expected values can be used across several tests (i.e. they are
    # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be
    # hardcoded in one place. A short illustrative sketch of this NaN-vs-zero
    # split follows the mapping.
    "all": np.NaN,
    "any": np.NaN,
    "count": 0,
    "corrwith": np.NaN,
    "first": np.NaN,
    "idxmax": np.NaN,
    "idxmin": np.NaN,
    "last": np.NaN,
    "max": np.NaN,
    "mean": np.NaN,
    "median": np.NaN,
    "min": np.NaN,
    "nth": np.NaN,
    "nunique": 0,
    "prod": np.NaN,
    "quantile": np.NaN,
    "sem": np.NaN,
    "size": 0,
    "skew": np.NaN,
    "std": np.NaN,
    "sum": 0,
    "var": np.NaN,
}
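

# Illustrative sketch of the NaN-vs-zero split recorded above (not one of the
# original tests; the function name is made up): with observed=False,
# count-like reductions fill unobserved categories with 0 while mean-like
# reductions yield NaN.
def _missing_category_fill_sketch():
    cat = Categorical(["a", "a"], categories=["a", "b"])
    ser = Series([1, 2])
    gb = ser.groupby(cat, observed=False)
    assert gb.sum().loc["b"] == 0  # "sum" maps to 0 in the table above
    assert gb.count().loc["b"] == 0  # "count" maps to 0
    assert np.isnan(gb.mean().loc["b"])  # "mean" maps to NaN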


def test_apply_use_categorical_name(df):
    cats = qcut(df.C, 4)

    def get_stats(group):
        return {
            "min": group.min(),
            "max": group.max(),
            "count": group.count(),
            "mean": group.mean(),
        }

    result = df.groupby(cats, observed=False).D.apply(get_stats)
    assert result.index.names[0] == "C"


def test_basic():  # TODO: split this test
    cats = Categorical(
        ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
        categories=["a", "b", "c", "d"],
        ordered=True,
    )
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True)
    expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index)
    result = data.groupby("b", observed=False).mean()
    tm.assert_frame_equal(result, expected)

    cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})

    # single grouper
    gb = df.groupby("A", observed=False)
    exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
    expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
    result = gb.sum(numeric_only=True)
    tm.assert_frame_equal(result, expected)

    # GH 8623
    x = DataFrame(
        [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
        columns=["person_id", "person_name"],
    )
    x["person_name"] = Categorical(x.person_name)

    g = x.groupby(["person_id"], observed=False)
    result = g.transform(lambda x: x)
    tm.assert_frame_equal(result, x[["person_name"]])

    result = x.drop_duplicates("person_name")
    expected = x.iloc[[0, 1]]
    tm.assert_frame_equal(result, expected)

    def f(x):
        return x.drop_duplicates("person_name").iloc[0]

    result = g.apply(f)
    expected = x.iloc[[0, 1]].copy()
    expected.index = Index([1, 2], name="person_id")
    expected["person_name"] = expected["person_name"].astype("object")
    tm.assert_frame_equal(result, expected)

    # GH 9921
    # Monotonic
    df = DataFrame({"a": [5, 15, 25]})
    c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df["a"])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]
    )
    tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]])

    gbc = df.groupby(c, observed=False)
    result = gbc.transform(lambda xs: np.max(xs, axis=0))
    tm.assert_frame_equal(result, df[["a"]])

    with tm.assert_produces_warning(None):
        result2 = gbc.transform(lambda xs: np.max(xs, axis=0))
        result3 = gbc.transform(max)
        result4 = gbc.transform(np.maximum.reduce)
        result5 = gbc.transform(lambda xs: np.maximum.reduce(xs))
    tm.assert_frame_equal(result2, df[["a"]], check_dtype=False)
    tm.assert_frame_equal(result3, df[["a"]], check_dtype=False)
    tm.assert_frame_equal(result4, df[["a"]])
    tm.assert_frame_equal(result5, df[["a"]])

    # Filter
    tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"])
    tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)

    # Non-monotonic
    df = DataFrame({"a": [5, 15, 25, -5]})
    c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])

    result = df.a.groupby(c, observed=False).transform(sum)
    tm.assert_series_equal(result, df["a"])

    tm.assert_series_equal(
        df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]
    )
    tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]])
    tm.assert_frame_equal(
        df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]]
    )

    # GH 9603
    df = DataFrame({"a": [1, 0, 0, 0]})
    c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
    result = df.groupby(c, observed=False).apply(len)

    exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
    expected = Series([1, 0, 0, 0], index=exp_index)
    expected.index.name = "a"
    tm.assert_series_equal(result, expected)

    # more basic
    levels = ["foo", "bar", "baz", "qux"]
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))
    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True)
    expected = expected.reindex(exp_idx)

    tm.assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = np.asarray(cats).take(idx)
    ord_data = data.take(idx)

    exp_cats = Categorical(
        ord_labels, ordered=True, categories=["foo", "bar", "baz", "qux"]
    )
    expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe()
    tm.assert_frame_equal(desc_result, expected)

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
    exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)


def test_level_get_group(observed):
    # GH15155
    df = DataFrame(
        data=np.arange(2, 22, 2),
        index=MultiIndex(
            levels=[CategoricalIndex(["a", "b"]), range(10)],
            codes=[[0] * 5 + [1] * 5, range(10)],
            names=["Index1", "Index2"],
        ),
    )
    g = df.groupby(level=["Index1"], observed=observed)

    # expected should equal test.loc[["a"]]
    # GH15166
    expected = DataFrame(
        data=np.arange(2, 12, 2),
        index=MultiIndex(
            levels=[CategoricalIndex(["a", "b"]), range(5)],
            codes=[[0] * 5, range(5)],
            names=["Index1", "Index2"],
        ),
    )
    result = g.get_group("a")
    tm.assert_frame_equal(result, expected)


def test_sorting_with_different_categoricals():
    # GH 24271
    df = DataFrame(
        {
            "group": ["A"] * 6 + ["B"] * 6,
            "dose": ["high", "med", "low"] * 4,
            "outcomes": np.arange(12.0),
        }
    )

    df.dose = Categorical(df.dose, categories=["low", "med", "high"], ordered=True)

    result = df.groupby("group")["dose"].value_counts()
    result = result.sort_index(level=0, sort_remaining=True)
    index = ["low", "med", "high", "low", "med", "high"]
    index = Categorical(index, categories=["low", "med", "high"], ordered=True)
    index = [["A", "A", "A", "B", "B", "B"], CategoricalIndex(index)]
    index = MultiIndex.from_arrays(index, names=["group", "dose"])
    expected = Series([2] * 6, index=index, name="count")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("ordered", [True, False])
def test_apply(ordered):
    # GH 10138

    dense = Categorical(list("abc"), ordered=ordered)

    # 'b' is in the categories but not in the list
    missing = Categorical(list("aaa"), categories=["a", "b"], ordered=ordered)
    values = np.arange(len(dense))
    df = DataFrame({"missing": missing, "dense": dense, "values": values})
    grouped = df.groupby(["missing", "dense"], observed=True)

    # missing category 'b' should still exist in the output index
    idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
    expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"])

    result = grouped.apply(lambda x: np.mean(x, axis=0))
    tm.assert_frame_equal(result, expected)

    result = grouped.mean()
    tm.assert_frame_equal(result, expected)

    result = grouped.agg(np.mean)
    tm.assert_frame_equal(result, expected)

    # but for transform we should still get back the original index
    idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
    expected = Series(1, index=idx)
    result = grouped.apply(lambda x: 1)
    tm.assert_series_equal(result, expected)


def test_observed(observed):
    # multiple groupers, don't re-expand the output space
    # of the grouper
    # gh-14942 (implement)
    # gh-10132 (back-compat)
    # gh-8138 (back-compat)
    # gh-8869

    cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
    cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
    df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
    df["C"] = ["foo", "bar"] * 2

    # multiple groupers with a non-cat
    gb = df.groupby(["A", "B", "C"], observed=observed)
    exp_index = MultiIndex.from_arrays(
        [cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"]
    )
    expected = DataFrame({"values": Series([1, 2, 3, 4], index=exp_index)}).sort_index()
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
        )

    tm.assert_frame_equal(result, expected)

    gb = df.groupby(["A", "B"], observed=observed)
    exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
    expected = DataFrame(
        {"values": [1, 2, 3, 4], "C": ["foo", "bar", "foo", "bar"]}, index=exp_index
    )
    result = gb.sum()
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [cat1, cat2], list("AB"), fill_value=0
        )

    tm.assert_frame_equal(result, expected)

    # https://github.com/pandas-dev/pandas/issues/8138
    d = {
        "cat": Categorical(
            ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
        ),
        "ints": [1, 1, 2, 2],
        "val": [10, 20, 30, 40],
    }
    df = DataFrame(d)

    # Grouping on a single column
    groups_single_key = df.groupby("cat", observed=observed)
    result = groups_single_key.mean()

    exp_index = CategoricalIndex(
        list("ab"), name="cat", categories=list("abc"), ordered=True
    )
    expected = DataFrame({"ints": [1.5, 1.5], "val": [20.0, 30]}, index=exp_index)
    if not observed:
        index = CategoricalIndex(
            list("abc"), name="cat", categories=list("abc"), ordered=True
        )
        expected = expected.reindex(index)

    tm.assert_frame_equal(result, expected)

    # Grouping on two columns
    groups_double_key = df.groupby(["cat", "ints"], observed=observed)
    result = groups_double_key.agg("mean")
    expected = DataFrame(
        {
            "val": [10.0, 30.0, 20.0, 40.0],
            "cat": Categorical(
                ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True
            ),
            "ints": [1, 2, 1, 2],
        }
    ).set_index(["cat", "ints"])
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [df.cat.values, [1, 2]], ["cat", "ints"]
        )

    tm.assert_frame_equal(result, expected)

    # GH 10132
    for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]:
        c, i = key
        result = groups_double_key.get_group(key)
        expected = df[(df.cat == c) & (df.ints == i)]
        tm.assert_frame_equal(result, expected)

    # gh-8869
    # with as_index
    d = {
        "foo": [10, 8, 4, 8, 4, 1, 1],
        "bar": [10, 20, 30, 40, 50, 60, 70],
        "baz": ["d", "c", "e", "a", "a", "d", "c"],
    }
    df = DataFrame(d)
    cat = pd.cut(df["foo"], np.linspace(0, 10, 3))
    df["range"] = cat
    groups = df.groupby(["range", "baz"], as_index=False, observed=observed)
    result = groups.agg("mean")

    groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed)
    expected = groups2.agg("mean").reset_index()
    tm.assert_frame_equal(result, expected)


def test_observed_codes_remap(observed):
    d = {"C1": [3, 3, 4, 5], "C2": [1, 2, 3, 4], "C3": [10, 100, 200, 34]}
    df = DataFrame(d)
    values = pd.cut(df["C1"], [1, 2, 3, 6])
    values.name = "cat"
    groups_double_key = df.groupby([values, "C2"], observed=observed)

    idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"])
    expected = DataFrame(
        {"C1": [3.0, 3.0, 4.0, 5.0], "C3": [10.0, 100.0, 200.0, 34.0]}, index=idx
    )
    if not observed:
        expected = cartesian_product_for_groupers(
            expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"]
        )

    result = groups_double_key.agg("mean")
    tm.assert_frame_equal(result, expected)


def test_observed_perf():
    # we create a cartesian product, so this is
    # non-performant if we don't use observed values
    # gh-14942
    df = DataFrame(
        {
            "cat": np.random.randint(0, 255, size=30000),
            "int_id": np.random.randint(0, 255, size=30000),
            "other_id": np.random.randint(0, 10000, size=30000),
            "foo": 0,
        }
    )
    df["cat"] = df.cat.astype(str).astype("category")

    grouped = df.groupby(["cat", "int_id", "other_id"], observed=True)
    result = grouped.count()
    assert result.index.levels[0].nunique() == df.cat.nunique()
    assert result.index.levels[1].nunique() == df.int_id.nunique()
    assert result.index.levels[2].nunique() == df.other_id.nunique()


def test_observed_groups(observed):
    # gh-20583
    # test that we have the appropriate groups

    cat = Categorical(["a", "c", "a"], categories=["a", "b", "c"])
    df = DataFrame({"cat": cat, "vals": [1, 2, 3]})
    g = df.groupby("cat", observed=observed)

    result = g.groups
    if observed:
        expected = {"a": Index([0, 2], dtype="int64"), "c": Index([1], dtype="int64")}
    else:
        expected = {
            "a": Index([0, 2], dtype="int64"),
            "b": Index([], dtype="int64"),
            "c": Index([1], dtype="int64"),
        }
    tm.assert_dict_equal(result, expected)


@pytest.mark.parametrize(
    "keys, expected_values, expected_index_levels",
    [
        ("a", [15, 9, 0], CategoricalIndex([1, 2, 3], name="a")),
        (
            ["a", "b"],
            [7, 8, 0, 0, 0, 9, 0, 0, 0],
            [CategoricalIndex([1, 2, 3], name="a"), Index([4, 5, 6])],
        ),
        (
            ["a", "a2"],
            [15, 0, 0, 0, 9, 0, 0, 0, 0],
            [
                CategoricalIndex([1, 2, 3], name="a"),
                CategoricalIndex([1, 2, 3], name="a"),
            ],
        ),
    ],
)
@pytest.mark.parametrize("test_series", [True, False])
def test_unobserved_in_index(keys, expected_values, expected_index_levels, test_series):
    # GH#49354 - ensure unobserved cats occur when grouping by index levels
    df = DataFrame(
        {
            "a": Categorical([1, 1, 2], categories=[1, 2, 3]),
            "a2": Categorical([1, 1, 2], categories=[1, 2, 3]),
            "b": [4, 5, 6],
            "c": [7, 8, 9],
        }
    ).set_index(["a", "a2"])
    if "b" not in keys:
        # Only keep b when it is used for grouping for consistent columns in the result
        df = df.drop(columns="b")

    gb = df.groupby(keys, observed=False)
    if test_series:
        gb = gb["c"]
    result = gb.sum()

    if len(keys) == 1:
        index = expected_index_levels
    else:
        codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], 3 * [0, 1, 2]]
        index = MultiIndex(
            expected_index_levels,
            codes=codes,
            names=keys,
        )
    expected = DataFrame({"c": expected_values}, index=index)
    if test_series:
        expected = expected["c"]
    tm.assert_equal(result, expected)


def test_observed_groups_with_nan(observed):
    # GH 24740
    df = DataFrame(
        {
            "cat": Categorical(["a", np.nan, "a"], categories=["a", "b", "d"]),
            "vals": [1, 2, 3],
        }
    )
    g = df.groupby("cat", observed=observed)
    result = g.groups
    if observed:
        expected = {"a": Index([0, 2], dtype="int64")}
    else:
        expected = {
            "a": Index([0, 2], dtype="int64"),
            "b": Index([], dtype="int64"),
            "d": Index([], dtype="int64"),
        }
    tm.assert_dict_equal(result, expected)


def test_observed_nth():
    # GH 26385
    cat = Categorical(["a", np.nan, np.nan], categories=["a", "b", "c"])
    ser = Series([1, 2, 3])
    df = DataFrame({"cat": cat, "ser": ser})

    result = df.groupby("cat", observed=False)["ser"].nth(0)
    expected = df["ser"].iloc[[0]]
    tm.assert_series_equal(result, expected)


def test_dataframe_categorical_with_nan(observed):
    # GH 21151
    s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"])
    s2 = Series([1, 2, 3, 4])
    df = DataFrame({"s1": s1, "s2": s2})
    result = df.groupby("s1", observed=observed).first().reset_index()
    if observed:
        expected = DataFrame(
            {"s1": Categorical(["a"], categories=["a", "b", "c"]), "s2": [2]}
        )
    else:
        expected = DataFrame(
            {
                "s1": Categorical(["a", "b", "c"], categories=["a", "b", "c"]),
                "s2": [2, np.nan, np.nan],
            }
        )

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("ordered", [True, False])
@pytest.mark.parametrize("observed", [True, False])
@pytest.mark.parametrize("sort", [True, False])
def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
    # GH 25871: Fix groupby sorting on ordered Categoricals
    # GH 25167: Groupby with observed=True doesn't sort

    # Build a dataframe with cat having one unobserved category ('missing'),
    # and a Series with identical values
    label = Categorical(
        ["d", "a", "b", "a", "d", "b"],
        categories=["a", "b", "missing", "d"],
        ordered=ordered,
    )
    val = Series(["d", "a", "b", "a", "d", "b"])
    df = DataFrame({"label": label, "val": val})

    # aggregate on the Categorical
    result = df.groupby("label", observed=observed, sort=sort)["val"].aggregate("first")

    # If ordering works, we expect index labels equal to aggregation results,
    # except for 'observed=False': label 'missing' has aggregation None
    label = Series(result.index.array, dtype="object")
    aggr = Series(result.array)
    if not observed:
        aggr[aggr.isna()] = "missing"
    if not all(label == aggr):
        msg = (
            "Labels and aggregation results not consistently sorted\n"
            f"for (ordered={ordered}, observed={observed}, sort={sort})\n"
            f"Result:\n{result}"
        )
        assert False, msg


def test_datetime():
    # GH9049: ensure backward compatibility
    levels = pd.date_range("2014-01-01", periods=4)
    codes = np.random.randint(0, 4, size=100)

    cats = Categorical.from_codes(codes, levels, ordered=True)

    data = DataFrame(np.random.randn(100, 4))
    result = data.groupby(cats, observed=False).mean()

    expected = data.groupby(np.asarray(cats), observed=False).mean()
    expected = expected.reindex(levels)
    expected.index = CategoricalIndex(
        expected.index, categories=expected.index, ordered=True
    )

    tm.assert_frame_equal(result, expected)

    grouped = data.groupby(cats, observed=False)
    desc_result = grouped.describe()

    idx = cats.codes.argsort()
    ord_labels = cats.take(idx)
    ord_data = data.take(idx)
    expected = ord_data.groupby(ord_labels, observed=False).describe()
    tm.assert_frame_equal(desc_result, expected)
    tm.assert_index_equal(desc_result.index, expected.index)
    tm.assert_index_equal(
        desc_result.index.get_level_values(0), expected.index.get_level_values(0)
    )

    # GH 10460
    expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
    exp = CategoricalIndex(expc)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
    exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
    tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)


def test_categorical_index():
    s = np.random.RandomState(12345)
    levels = ["foo", "bar", "baz", "qux"]
    codes = s.randint(0, 4, size=20)
    cats = Categorical.from_codes(codes, levels, ordered=True)
    df = DataFrame(np.repeat(np.arange(20), 4).reshape(-1, 4), columns=list("abcd"))
    df["cats"] = cats

    # with a cat index
    result = df.set_index("cats").groupby(level=0, observed=False).sum()
    expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
    )
    tm.assert_frame_equal(result, expected)

    # with a cat column, should produce a cat index
    result = df.groupby("cats", observed=False).sum()
    expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
    expected.index = CategoricalIndex(
        Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
    )
    tm.assert_frame_equal(result, expected)


def test_describe_categorical_columns():
    # GH 11558
    cats = CategoricalIndex(
        ["qux", "foo", "baz", "bar"],
        categories=["foo", "bar", "baz", "qux"],
        ordered=True,
    )
    df = DataFrame(np.random.randn(20, 4), columns=cats)
    result = df.groupby([1, 2, 3, 4] * 5).describe()

    tm.assert_index_equal(result.stack().columns, cats)
    tm.assert_categorical_equal(result.stack().columns.values, cats.values)


def test_unstack_categorical():
    # GH11558 (example is taken from the original issue)
    df = DataFrame(
        {"a": range(10), "medium": ["A", "B"] * 5, "artist": list("XYXXY") * 2}
    )
    df["medium"] = df["medium"].astype("category")

    gcat = df.groupby(["artist", "medium"], observed=False)["a"].count().unstack()
    result = gcat.describe()

    exp_columns = CategoricalIndex(["A", "B"], ordered=False, name="medium")
    tm.assert_index_equal(result.columns, exp_columns)
    tm.assert_categorical_equal(result.columns.values, exp_columns.values)

    result = gcat["A"] + gcat["B"]
    expected = Series([6, 4], index=Index(["X", "Y"], name="artist"))
    tm.assert_series_equal(result, expected)


def test_bins_unequal_len():
    # GH3011
    series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
    bins = pd.cut(series.dropna().values, 4)

    # len(bins) != len(series) here
    with pytest.raises(ValueError, match="Grouper and axis must be same length"):
        series.groupby(bins).mean()


@pytest.mark.parametrize(
    ["series", "data"],
    [
        # Group a series with length and index equal to those of the grouper.
        (Series(range(4)), {"A": [0, 3], "B": [1, 2]}),
        # Group a series with length equal to that of the grouper and index unequal to
        # that of the grouper.
        (Series(range(4)).rename(lambda idx: idx + 1), {"A": [2], "B": [0, 1]}),
        # GH44179: Group a series with length unequal to that of the grouper.
        (Series(range(7)), {"A": [0, 3], "B": [1, 2]}),
    ],
)
def test_categorical_series(series, data):
    # Group the given series by a series with categorical data type such that group A
    # takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in
    # the given data.
    groupby = series.groupby(Series(list("ABBA"), dtype="category"))
    result = groupby.aggregate(list)
    expected = Series(data, index=CategoricalIndex(data.keys()))
    tm.assert_series_equal(result, expected)


def test_as_index():
    # GH13204
    df = DataFrame(
        {
            "cat": Categorical([1, 2, 2], [1, 2, 3]),
            "A": [10, 11, 11],
            "B": [101, 102, 103],
        }
    )
    result = df.groupby(["cat", "A"], as_index=False, observed=True).sum()
    expected = DataFrame(
        {
            "cat": Categorical([1, 2], categories=df.cat.cat.categories),
            "A": [10, 11],
            "B": [101, 205],
        },
        columns=["cat", "A", "B"],
    )
    tm.assert_frame_equal(result, expected)

    # function grouper
    f = lambda r: df.loc[r, "A"]
    result = df.groupby(["cat", f], as_index=False, observed=True).sum()
    expected = DataFrame(
        {
            "cat": Categorical([1, 2], categories=df.cat.cat.categories),
            "A": [10, 22],
            "B": [101, 205],
        },
        columns=["cat", "A", "B"],
    )
    tm.assert_frame_equal(result, expected)

    # another not in-axis grouper (conflicting names in index)
    s = Series(["a", "b", "b"], name="cat")
    result = df.groupby(["cat", s], as_index=False, observed=True).sum()
    tm.assert_frame_equal(result, expected)

    # is original index dropped?
    group_columns = ["cat", "A"]
    expected = DataFrame(
        {
            "cat": Categorical([1, 2], categories=df.cat.cat.categories),
            "A": [10, 11],
            "B": [101, 205],
        },
        columns=["cat", "A", "B"],
    )

    for name in [None, "X", "B"]:
        df.index = Index(list("abc"), name=name)
        result = df.groupby(group_columns, as_index=False, observed=True).sum()
        tm.assert_frame_equal(result, expected)


def test_preserve_categories():
    # GH-13179
    categories = list("abc")

    # ordered=True
    df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)})
    sort_index = CategoricalIndex(categories, categories, ordered=True, name="A")
    nosort_index = CategoricalIndex(list("bac"), categories, ordered=True, name="A")
    tm.assert_index_equal(
        df.groupby("A", sort=True, observed=False).first().index, sort_index
    )
    # GH#42482 - don't sort result when sort=False, even when ordered=True
    tm.assert_index_equal(
        df.groupby("A", sort=False, observed=False).first().index, nosort_index
    )

    # ordered=False
    df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
    sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
    # GH#48749 - don't change order of categories
    # GH#42482 - don't sort result when sort=False, even when ordered=True
    nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A")
    tm.assert_index_equal(
        df.groupby("A", sort=True, observed=False).first().index, sort_index
    )
    tm.assert_index_equal(
        df.groupby("A", sort=False, observed=False).first().index, nosort_index
    )


def test_preserve_categorical_dtype():
    # GH13743, GH13854
    df = DataFrame(
        {
            "A": [1, 2, 1, 1, 2],
            "B": [10, 16, 22, 28, 34],
            "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False),
            "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True),
        }
    )
    # single grouper
    exp_full = DataFrame(
        {
            "A": [2.0, 1.0, np.nan],
            "B": [25.0, 20.0, np.nan],
            "C1": Categorical(list("bac"), categories=list("bac"), ordered=False),
            "C2": Categorical(list("bac"), categories=list("bac"), ordered=True),
        }
    )
    for col in ["C1", "C2"]:
        result1 = df.groupby(by=col, as_index=False, observed=False).mean(
            numeric_only=True
        )
        result2 = (
            df.groupby(by=col, as_index=True, observed=False)
            .mean(numeric_only=True)
            .reset_index()
        )
        expected = exp_full.reindex(columns=result1.columns)
        tm.assert_frame_equal(result1, expected)
        tm.assert_frame_equal(result2, expected)


@pytest.mark.parametrize(
    "func, values",
    [
        ("first", ["second", "first"]),
        ("last", ["fourth", "third"]),
        ("min", ["fourth", "first"]),
        ("max", ["second", "third"]),
    ],
)
def test_preserve_on_ordered_ops(func, values):
    # gh-18502
    # preserve the categoricals on ops
    c = Categorical(["first", "second", "third", "fourth"], ordered=True)
    df = DataFrame({"payload": [-1, -2, -1, -2], "col": c})
    g = df.groupby("payload")
    result = getattr(g, func)()
    expected = DataFrame(
        {"payload": [-2, -1], "col": Series(values, dtype=c.dtype)}
    ).set_index("payload")
    tm.assert_frame_equal(result, expected)

    # we should also preserve categorical for SeriesGroupBy
    sgb = df.groupby("payload")["col"]
    result = getattr(sgb, func)()
    expected = expected["col"]
    tm.assert_series_equal(result, expected)


def test_categorical_no_compress():
    data = Series(np.random.randn(9))
    codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
    cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)

    result = data.groupby(cats, observed=False).mean()
    exp = data.groupby(codes, observed=False).mean()

    exp.index = CategoricalIndex(
        exp.index, categories=cats.categories, ordered=cats.ordered
    )
    tm.assert_series_equal(result, exp)

    codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
    cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)

    result = data.groupby(cats, observed=False).mean()
    exp = data.groupby(codes, observed=False).mean().reindex(cats.categories)
    exp.index = CategoricalIndex(
        exp.index, categories=cats.categories, ordered=cats.ordered
    )
    tm.assert_series_equal(result, exp)

    cats = Categorical(
        ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
        categories=["a", "b", "c", "d"],
        ordered=True,
    )
    data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})

    result = data.groupby("b", observed=False).mean()
    result = result["a"].values
    exp = np.array([1, 2, 4, np.nan])
    tm.assert_numpy_array_equal(result, exp)


def test_groupby_empty_with_category():
    # GH-9614
    # test fix for when group by on None resulted in
    # coercion of dtype categorical -> float
    df = DataFrame({"A": [None] * 3, "B": Categorical(["train", "train", "test"])})
    result = df.groupby("A").first()["B"]
    expected = Series(
        Categorical([], categories=["test", "train"]),
        index=Series([], dtype="object", name="A"),
        name="B",
    )
    tm.assert_series_equal(result, expected)


def test_sort():
    # https://stackoverflow.com/questions/23814368/sorting-pandas-
    # categorical-labels-after-groupby
    # This should result in a properly sorted Series so that the plot
    # has a sorted x axis
    # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

    df = DataFrame({"value": np.random.randint(0, 10000, 100)})
    labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)]
    cat_labels = Categorical(labels, labels)

    df = df.sort_values(by=["value"], ascending=True)
    df["value_group"] = pd.cut(
        df.value, range(0, 10500, 500), right=False, labels=cat_labels
    )

    res = df.groupby(["value_group"], observed=False)["value_group"].count()
    exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
    exp.index = CategoricalIndex(exp.index, name=exp.index.name)
    tm.assert_series_equal(res, exp)


@pytest.mark.parametrize("ordered", [True, False])
def test_sort2(sort, ordered):
    # dataframe groupby sort was being ignored # GH 8868
    # GH#48749 - don't change order of categories
    # GH#42482 - don't sort result when sort=False, even when ordered=True
    df = DataFrame(
        [
            ["(7.5, 10]", 10, 10],
            ["(7.5, 10]", 8, 20],
            ["(2.5, 5]", 5, 30],
            ["(5, 7.5]", 6, 40],
            ["(2.5, 5]", 4, 50],
            ["(0, 2.5]", 1, 60],
            ["(5, 7.5]", 7, 70],
        ],
        columns=["range", "foo", "bar"],
    )
    df["range"] = Categorical(df["range"], ordered=ordered)
    result = df.groupby("range", sort=sort, observed=False).first()

    if sort:
        data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
        index_values = ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"]
    else:
        data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
        index_values = ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"]
    expected = DataFrame(
        data_values,
        columns=["foo", "bar"],
        index=CategoricalIndex(index_values, name="range", ordered=ordered),
    )

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("ordered", [True, False])
def test_sort_datetimelike(sort, ordered):
    # GH10505
    # GH#42482 - don't sort result when sort=False, even when ordered=True
    # use the same data as test_groupby_sort_categorical, whose categories
    # correspond to datetime.month
    df = DataFrame(
        {
            "dt": [
                datetime(2011, 7, 1),
                datetime(2011, 7, 1),
                datetime(2011, 2, 1),
                datetime(2011, 5, 1),
                datetime(2011, 2, 1),
                datetime(2011, 1, 1),
                datetime(2011, 5, 1),
            ],
            "foo": [10, 8, 5, 6, 4, 1, 7],
            "bar": [10, 20, 30, 40, 50, 60, 70],
        },
        columns=["dt", "foo", "bar"],
    )

    # ordered=True
    df["dt"] = Categorical(df["dt"], ordered=ordered)
    if sort:
        data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
        index_values = [
            datetime(2011, 1, 1),
            datetime(2011, 2, 1),
            datetime(2011, 5, 1),
            datetime(2011, 7, 1),
        ]
    else:
        data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
        index_values = [
            datetime(2011, 7, 1),
            datetime(2011, 2, 1),
            datetime(2011, 5, 1),
            datetime(2011, 1, 1),
        ]
    expected = DataFrame(
        data_values,
        columns=["foo", "bar"],
        index=CategoricalIndex(index_values, name="dt", ordered=ordered),
    )
    result = df.groupby("dt", sort=sort, observed=False).first()
    tm.assert_frame_equal(result, expected)


def test_empty_sum():
    # https://github.com/pandas-dev/pandas/issues/18678
    df = DataFrame(
        {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]}
    )
    expected_idx = CategoricalIndex(["a", "b", "c"], name="A")

    # 0 by default
    result = df.groupby("A", observed=False).B.sum()
    expected = Series([3, 1, 0], expected_idx, name="B")
    tm.assert_series_equal(result, expected)

    # min_count=0
    result = df.groupby("A", observed=False).B.sum(min_count=0)
    expected = Series([3, 1, 0], expected_idx, name="B")
    tm.assert_series_equal(result, expected)

    # min_count=1
    result = df.groupby("A", observed=False).B.sum(min_count=1)
    expected = Series([3, 1, np.nan], expected_idx, name="B")
    tm.assert_series_equal(result, expected)

    # min_count>1
    result = df.groupby("A", observed=False).B.sum(min_count=2)
    expected = Series([3, np.nan, np.nan], expected_idx, name="B")
    tm.assert_series_equal(result, expected)


def test_empty_prod():
    # https://github.com/pandas-dev/pandas/issues/18678
    df = DataFrame(
        {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]}
    )
    expected_idx = CategoricalIndex(["a", "b", "c"], name="A")

    # 1 by default
    result = df.groupby("A", observed=False).B.prod()
    expected = Series([2, 1, 1], expected_idx, name="B")
    tm.assert_series_equal(result, expected)

    # min_count=0
    result = df.groupby("A", observed=False).B.prod(min_count=0)
    expected = Series([2, 1, 1], expected_idx, name="B")
    tm.assert_series_equal(result, expected)

    # min_count=1
    result = df.groupby("A", observed=False).B.prod(min_count=1)
    expected = Series([2, 1, np.nan], expected_idx, name="B")
    tm.assert_series_equal(result, expected)


def test_groupby_multiindex_categorical_datetime():
    # https://github.com/pandas-dev/pandas/issues/21390
    df = DataFrame(
        {
            "key1": Categorical(list("abcbabcba")),
            "key2": Categorical(
                list(pd.date_range("2018-06-01 00", freq="1T", periods=3)) * 3
            ),
            "values": np.arange(9),
        }
    )
    result = df.groupby(["key1", "key2"]).mean()

    idx = MultiIndex.from_product(
        [
            Categorical(["a", "b", "c"]),
            Categorical(pd.date_range("2018-06-01 00", freq="1T", periods=3)),
        ],
        names=["key1", "key2"],
    )
    expected = DataFrame({"values": [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "as_index, expected",
    [
        (
            True,
            Series(
                index=MultiIndex.from_arrays(
                    [Series([1, 1, 2], dtype="category"), [1, 2, 2]], names=["a", "b"]
                ),
                data=[1, 2, 3],
                name="x",
            ),
        ),
        (
            False,
            DataFrame(
                {
                    "a": Series([1, 1, 2], dtype="category"),
                    "b": [1, 2, 2],
                    "x": [1, 2, 3],
                }
            ),
        ),
    ],
)
def test_groupby_agg_observed_true_single_column(as_index, expected):
    # GH-23970
    df = DataFrame(
        {"a": Series([1, 1, 2], dtype="category"), "b": [1, 2, 2], "x": [1, 2, 3]}
    )

    result = df.groupby(["a", "b"], as_index=as_index, observed=True)["x"].sum()

    tm.assert_equal(result, expected)


@pytest.mark.parametrize("fill_value", [None, np.nan, pd.NaT])
def test_shift(fill_value):
    ct = Categorical(
        ["a", "b", "c", "d"], categories=["a", "b", "c", "d"], ordered=False
    )
    expected = Categorical(
        [None, "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False
    )
    res = ct.shift(1, fill_value=fill_value)
    tm.assert_equal(res, expected)


@pytest.fixture
def df_cat(df):
    """
    DataFrame with multiple categorical columns and a column of integers.
    Shortened so as not to contain all possible combinations of categories.
    Useful for testing `observed` kwarg functionality on GroupBy objects.
    The concrete rows it produces are sketched in the comment after this fixture.

    Parameters
    ----------
    df: DataFrame
        Non-categorical, longer DataFrame from another fixture, used to derive
        this one.

    Returns
    -------
    df_cat: DataFrame
    """
    df_cat = df.copy()[:4]  # leave out some groups
    df_cat["A"] = df_cat["A"].astype("category")
    df_cat["B"] = df_cat["B"].astype("category")
    df_cat["C"] = Series([1, 2, 3, 4])
    df_cat = df_cat.drop(["D"], axis=1)
    return df_cat
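

# For reference, and assuming the standard ``df`` fixture from the groupby
# conftest (treat the exact values as an assumption rather than part of this
# file): ``df_cat`` ends up with rows
#   A = foo/bar/foo/bar, B = one/one/two/three, C = 1/2/3/4,
# with "A" and "B" categorical, so the combinations ("foo", "three") and
# ("bar", "two") are unobserved.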


@pytest.mark.parametrize("operation", ["agg", "apply"])
def test_seriesgroupby_observed_true(df_cat, operation):
    # GH#24880
    # GH#49223 - order of results was wrong when grouping by index levels
    lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
    lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
    index = MultiIndex.from_arrays([lev_a, lev_b])
    expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index()

    grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
    result = getattr(grouped, operation)(sum)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("operation", ["agg", "apply"])
@pytest.mark.parametrize("observed", [False, None])
def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
    # GH 24880
    # GH#49223 - order of results was wrong when grouping by index levels
    index, _ = MultiIndex.from_product(
        [
            CategoricalIndex(["bar", "foo"], ordered=False),
            CategoricalIndex(["one", "three", "two"], ordered=False),
        ],
        names=["A", "B"],
    ).sortlevel()
    expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
    if operation == "agg":
        expected = expected.fillna(0, downcast="infer")
    grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
    result = getattr(grouped, operation)(sum)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "observed, index, data",
    [
        (
            True,
            MultiIndex.from_arrays(
                [
                    Index(["bar"] * 4 + ["foo"] * 4, dtype="category", name="A"),
                    Index(
                        ["one", "one", "three", "three", "one", "one", "two", "two"],
                        dtype="category",
                        name="B",
                    ),
                    Index(["min", "max"] * 4),
                ]
            ),
            [2, 2, 4, 4, 1, 1, 3, 3],
        ),
        (
            False,
            MultiIndex.from_product(
                [
                    CategoricalIndex(["bar", "foo"], ordered=False),
                    CategoricalIndex(["one", "three", "two"], ordered=False),
                    Index(["min", "max"]),
                ],
                names=["A", "B", None],
            ),
            [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3],
        ),
        (
            None,
            MultiIndex.from_product(
                [
                    CategoricalIndex(["bar", "foo"], ordered=False),
                    CategoricalIndex(["one", "three", "two"], ordered=False),
                    Index(["min", "max"]),
                ],
                names=["A", "B", None],
            ),
            [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3],
        ),
    ],
)
def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data):
    # GH 24880
    expected = Series(data=data, index=index, name="C")
    result = df_cat.groupby(["A", "B"], observed=observed)["C"].apply(
        lambda x: {"min": x.min(), "max": x.max()}
    )
    tm.assert_series_equal(result, expected)


def test_groupby_categorical_series_dataframe_consistent(df_cat):
    # GH 20416
    expected = df_cat.groupby(["A", "B"])["C"].mean()
    result = df_cat.groupby(["A", "B"]).mean()["C"]
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("code", [([1, 0, 0]), ([0, 0, 0])])
def test_groupby_categorical_axis_1(code):
    # GH 13420
    df = DataFrame({"a": [1, 2, 3, 4], "b": [-1, -2, -3, -4], "c": [5, 6, 7, 8]})
    cat = Categorical.from_codes(code, categories=list("abc"))
    result = df.groupby(cat, axis=1).mean()
    expected = df.T.groupby(cat, axis=0).mean().T
    tm.assert_frame_equal(result, expected)


def test_groupby_cat_preserves_structure(observed, ordered):
    # GH 28787
    df = DataFrame(
        {"Name": Categorical(["Bob", "Greg"], ordered=ordered), "Item": [1, 2]},
        columns=["Name", "Item"],
    )
    expected = df.copy()

    result = (
        df.groupby("Name", observed=observed)
        .agg(DataFrame.sum, skipna=True)
        .reset_index()
    )

    tm.assert_frame_equal(result, expected)


def test_get_nonexistent_category():
    # Accessing a Category that is not in the dataframe
    df = DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)})
    with pytest.raises(KeyError, match="'vau'"):
        df.groupby("var").apply(
            lambda rows: DataFrame(
                {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}
            )
        )


def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed, request):
    # GH 17605
    if reduction_func == "ngroup":
        pytest.skip("ngroup is not truly a reduction")
    if reduction_func == "corrwith":  # GH 32293
        mark = pytest.mark.xfail(
            reason="TODO: implement SeriesGroupBy.corrwith. See GH 32293"
        )
        request.node.add_marker(mark)
    df = DataFrame(
        {
            "cat_1": Categorical(list("AABB"), categories=list("ABCD")),
            "cat_2": Categorical(list("AB") * 2, categories=list("ABCD")),
            "value": [0.1] * 4,
        }
    )
    args = get_groupby_method_args(reduction_func, df)

    expected_length = 4 if observed else 16

    series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"]
    agg = getattr(series_groupby, reduction_func)
    result = agg(*args)

    assert len(result) == expected_length


def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
    reduction_func, request
):
    # GH 17605
    # Tests whether the unobserved categories in the result contain 0 or NaN
    if reduction_func == "ngroup":
        pytest.skip("ngroup is not truly a reduction")
    if reduction_func == "corrwith":  # GH 32293
        mark = pytest.mark.xfail(
            reason="TODO: implement SeriesGroupBy.corrwith. See GH 32293"
        )
        request.node.add_marker(mark)
  1156. df = DataFrame(
  1157. {
  1158. "cat_1": Categorical(list("AABB"), categories=list("ABC")),
  1159. "cat_2": Categorical(list("AB") * 2, categories=list("ABC")),
  1160. "value": [0.1] * 4,
  1161. }
  1162. )
    unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")]
    args = get_groupby_method_args(reduction_func, df)

    series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
    agg = getattr(series_groupby, reduction_func)
    result = agg(*args)

    zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func]

    for idx in unobserved:
        val = result.loc[idx]
        assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)

    # If we expect unobserved values to be zero, we also expect the dtype to be int.
    # Except for .sum(). If the observed categories sum to dtype=float (i.e. their
    # sums have decimals), then the zeros for the missing categories should also be
    # floats.
    if zero_or_nan == 0 and reduction_func != "sum":
        assert np.issubdtype(result.dtype, np.integer)


def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func):
    # GH 23865
    # GH 27075
    # Ensure that df.groupby, when 'by' is two Categorical variables,
    # does not return the categories that are not in df when observed=True
    if reduction_func == "ngroup":
        pytest.skip("ngroup does not return the Categories on the index")

    df = DataFrame(
        {
            "cat_1": Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]

    df_grp = df.groupby(["cat_1", "cat_2"], observed=True)

    args = get_groupby_method_args(reduction_func, df)
    res = getattr(df_grp, reduction_func)(*args)

    for cat in unobserved_cats:
        assert cat not in res.index


@pytest.mark.parametrize("observed", [False, None])
def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
    reduction_func, observed
):
    # GH 23865
    # GH 27075
    # Ensure that df.groupby, when 'by' is two Categorical variables,
    # returns the categories that are not in df when observed=False/None
    if reduction_func == "ngroup":
        pytest.skip("ngroup does not return the Categories on the index")

    df = DataFrame(
        {
            "cat_1": Categorical(list("AABB"), categories=list("ABC")),
            "cat_2": Categorical(list("1111"), categories=list("12")),
            "value": [0.1, 0.1, 0.1, 0.1],
        }
    )
    unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]

    df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)

    args = get_groupby_method_args(reduction_func, df)
    res = getattr(df_grp, reduction_func)(*args)

    expected = _results_for_groupbys_with_missing_categories[reduction_func]

    if expected is np.nan:
        assert res.loc[unobserved_cats].isnull().all().all()
    else:
        assert (res.loc[unobserved_cats] == expected).all().all()


def test_series_groupby_categorical_aggregation_getitem():
    # GH 8870
    d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]}
    df = DataFrame(d)
    cat = pd.cut(df["foo"], np.linspace(0, 20, 5))
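    # pd.cut bins "foo" into 4 equal-width interval categories spanning 0 to 20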
    df["range"] = cat
    groups = df.groupby(["range", "baz"], as_index=True, sort=True)
    result = groups["foo"].agg("mean")
    expected = groups.agg("mean")["foo"]
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
    "func, expected_values",
    [(Series.nunique, [1, 1, 2]), (Series.count, [1, 2, 2])],
)
def test_groupby_agg_categorical_columns(func, expected_values):
    # GH 31256
    df = DataFrame(
        {
            "id": [0, 1, 2, 3, 4],
            "groups": [0, 1, 1, 2, 2],
            "value": Categorical([0, 0, 0, 0, 1]),
        }
    ).set_index("id")
    result = df.groupby("groups").agg(func)

    expected = DataFrame(
        {"value": expected_values}, index=Index([0, 1, 2], name="groups")
    )
    tm.assert_frame_equal(result, expected)


def test_groupby_agg_non_numeric():
    df = DataFrame({"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"])})
    expected = DataFrame({"A": [2, 1]}, index=np.array([1, 2]))
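    # grouping by a plain list key: group 1 holds {"a", "b"}, group 2 holds {"a"}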
    result = df.groupby([1, 2, 1]).agg(Series.nunique)
    tm.assert_frame_equal(result, expected)

    result = df.groupby([1, 2, 1]).nunique()
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", ["first", "last"])
def test_groupby_first_returned_categorical_instead_of_dataframe(func):
    # GH 28641: groupby drops index, when grouping over categorical column with
    # first/last. Returned Categorical instead of DataFrame previously.
    df = DataFrame({"A": [1997], "B": Series(["b"], dtype="category").cat.as_ordered()})
    df_grouped = df.groupby("A")["B"]
    result = getattr(df_grouped, func)()

    # ordered categorical dtype should be preserved
    expected = Series(
        ["b"], index=Index([1997], name="A"), name="B", dtype=df["B"].dtype
    )
    tm.assert_series_equal(result, expected)


def test_read_only_category_no_sort():
    # GH33410
    cats = np.array([1, 2])
    cats.flags.writeable = False
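    # the groupby below must work even though the categories buffer is read-only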
    df = DataFrame(
        {"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))}
    )
    expected = DataFrame(data={"a": [2.0, 6.0]}, index=CategoricalIndex(cats, name="b"))
    result = df.groupby("b", sort=False).mean()
    tm.assert_frame_equal(result, expected)


def test_sorted_missing_category_values():
    # GH 28597
    df = DataFrame(
        {
            "foo": [
                "small",
                "large",
                "large",
                "large",
                "medium",
                "large",
                "large",
                "medium",
            ],
            "bar": ["C", "A", "A", "C", "A", "C", "A", "C"],
        }
    )
    df["foo"] = (
        df["foo"]
        .astype("category")
        .cat.set_categories(["tiny", "small", "medium", "large"], ordered=True)
    )
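    # "tiny" never occurs, so its counts in the unstacked result are all zeros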
    expected = DataFrame(
        {
            "tiny": {"A": 0, "C": 0},
            "small": {"A": 0, "C": 1},
            "medium": {"A": 1, "C": 1},
            "large": {"A": 3, "C": 2},
        }
    )
    expected = expected.rename_axis("bar", axis="index")
    expected.columns = CategoricalIndex(
        ["tiny", "small", "medium", "large"],
        categories=["tiny", "small", "medium", "large"],
        ordered=True,
        name="foo",
        dtype="category",
    )

    result = df.groupby(["bar", "foo"]).size().unstack()

    tm.assert_frame_equal(result, expected)


def test_agg_cython_category_not_implemented_fallback():
    # https://github.com/pandas-dev/pandas/issues/31450
    df = DataFrame({"col_num": [1, 1, 2, 3]})
    df["col_cat"] = df["col_num"].astype("category")

    result = df.groupby("col_num").col_cat.first()

    # ordered categorical dtype should definitely be preserved;
    # this is unordered, so it is a less clear-cut case (if anything, it should raise)
    expected = Series(
        [1, 2, 3],
        index=Index([1, 2, 3], name="col_num"),
        name="col_cat",
        dtype=df["col_cat"].dtype,
    )
    tm.assert_series_equal(result, expected)

    result = df.groupby("col_num").agg({"col_cat": "first"})
    expected = expected.to_frame()
    tm.assert_frame_equal(result, expected)


def test_aggregate_categorical_with_isnan():
    # GH 29837
    df = DataFrame(
        {
            "A": [1, 1, 1, 1],
            "B": [1, 2, 1, 2],
            "numerical_col": [0.1, 0.2, np.nan, 0.3],
            "object_col": ["foo", "bar", "foo", "fee"],
            "categorical_col": ["foo", "bar", "foo", "fee"],
        }
    )
    df = df.astype({"categorical_col": "category"})

    result = df.groupby(["A", "B"]).agg(lambda df: df.isna().sum())
    index = MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B"))
    expected = DataFrame(
        data={
            "numerical_col": [1, 0],
            "object_col": [0, 0],
            "categorical_col": [0, 0],
        },
        index=index,
    )
    tm.assert_frame_equal(result, expected)


def test_categorical_transform():
    # GH 29037
    df = DataFrame(
        {
            "package_id": [1, 1, 1, 2, 2, 3],
            "status": [
                "Waiting",
                "OnTheWay",
                "Delivered",
                "Waiting",
                "OnTheWay",
                "Waiting",
            ],
        }
    )

    delivery_status_type = pd.CategoricalDtype(
        categories=["Waiting", "OnTheWay", "Delivered"], ordered=True
    )
    df["status"] = df["status"].astype(delivery_status_type)
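    # transform(max) broadcasts each package's maximum status back to every row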
    df["last_status"] = df.groupby("package_id")["status"].transform(max)
    result = df.copy()

    expected = DataFrame(
        {
            "package_id": [1, 1, 1, 2, 2, 3],
            "status": [
                "Waiting",
                "OnTheWay",
                "Delivered",
                "Waiting",
                "OnTheWay",
                "Waiting",
            ],
            "last_status": [
                "Delivered",
                "Delivered",
                "Delivered",
                "OnTheWay",
                "OnTheWay",
                "Waiting",
            ],
        }
    )

    expected["status"] = expected["status"].astype(delivery_status_type)

    # .transform(max) should preserve ordered categoricals
    expected["last_status"] = expected["last_status"].astype(delivery_status_type)

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", ["first", "last"])
def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals(
    func: str, observed: bool
):
    # GH 34951
    cat = Categorical([0, 0, 1, 1])
    val = [0, 1, 1, 0]
    df = DataFrame({"a": cat, "b": cat, "c": val})

    cat2 = Categorical([0, 1])
    idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"])
    expected_dict = {
        "first": Series([0, np.NaN, np.NaN, 1], idx, name="c"),
        "last": Series([1, np.NaN, np.NaN, 0], idx, name="c"),
    }

    expected = expected_dict[func]
    if observed:
        expected = expected.dropna().astype(np.int64)

    srs_grp = df.groupby(["a", "b"], observed=observed)["c"]
    result = getattr(srs_grp, func)()
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("func", ["first", "last"])
def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals(
    func: str, observed: bool
):
    # GH 34951
    cat = Categorical([0, 0, 1, 1])
    val = [0, 1, 1, 0]
    df = DataFrame({"a": cat, "b": cat, "c": val})

    cat2 = Categorical([0, 1])
    idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"])
    expected_dict = {
        "first": Series([0, np.NaN, np.NaN, 1], idx, name="c"),
        "last": Series([1, np.NaN, np.NaN, 0], idx, name="c"),
    }

    expected = expected_dict[func].to_frame()
    if observed:
        expected = expected.dropna().astype(np.int64)

    df_grp = df.groupby(["a", "b"], observed=observed)
    result = getattr(df_grp, func)()
    tm.assert_frame_equal(result, expected)


def test_groupby_categorical_indices_unused_categories():
    # GH#38642
    df = DataFrame(
        {
            "key": Categorical(["b", "b", "a"], categories=["a", "b", "c"]),
            "col": range(3),
        }
    )
    grouped = df.groupby("key", sort=False)
    result = grouped.indices
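    # the unused category "c" still gets an (empty) entry in .indices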
    expected = {
        "b": np.array([0, 1], dtype="intp"),
        "a": np.array([2], dtype="intp"),
        "c": np.array([], dtype="intp"),
    }
    assert result.keys() == expected.keys()
    for key in result.keys():
        tm.assert_numpy_array_equal(result[key], expected[key])


@pytest.mark.parametrize("func", ["first", "last"])
def test_groupby_last_first_preserve_categoricaldtype(func):
    # GH#33090
    df = DataFrame({"a": [1, 2, 3]})
    df["b"] = df["a"].astype("category")
    result = getattr(df.groupby("a")["b"], func)()
    expected = Series(
        Categorical([1, 2, 3]), name="b", index=Index([1, 2, 3], name="a")
    )
    tm.assert_series_equal(expected, result)


def test_groupby_categorical_observed_nunique():
    # GH#45128
    df = DataFrame({"a": [1, 2], "b": [1, 2], "c": [10, 11]})
    df = df.astype(dtype={"a": "category", "b": "category"})
    result = df.groupby(["a", "b"], observed=True).nunique()["c"]
    expected = Series(
        [1, 1],
        index=MultiIndex.from_arrays(
            [CategoricalIndex([1, 2], name="a"), CategoricalIndex([1, 2], name="b")]
        ),
        name="c",
    )
    tm.assert_series_equal(result, expected)


def test_groupby_categorical_aggregate_functions():
    # GH#37275
    dtype = pd.CategoricalDtype(categories=["small", "big"], ordered=True)
    df = DataFrame(
        [[1, "small"], [1, "big"], [2, "small"]], columns=["grp", "description"]
    ).astype({"description": dtype})

    result = df.groupby("grp")["description"].max()
    expected = Series(
        ["big", "small"],
        index=Index([1, 2], name="grp"),
        name="description",
        dtype=pd.CategoricalDtype(categories=["small", "big"], ordered=True),
    )

    tm.assert_series_equal(result, expected)


def test_groupby_categorical_dropna(observed, dropna):
    # GH#48645 - dropna should have no impact on the result when there are no NA values
    cat = Categorical([1, 2], categories=[1, 2, 3])
    df = DataFrame({"x": Categorical([1, 2], categories=[1, 2, 3]), "y": [3, 4]})
    gb = df.groupby("x", observed=observed, dropna=dropna)
    result = gb.sum()

    if observed:
        expected = DataFrame({"y": [3, 4]}, index=cat)
    else:
        index = CategoricalIndex([1, 2, 3], [1, 2, 3])
        expected = DataFrame({"y": [3, 4, 0]}, index=index)
    expected.index.name = "x"

    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_reducer(
    request, as_index, sort, observed, reduction_func, index_kind, ordered
):
    # GH#48749
    if (
        reduction_func in ("idxmax", "idxmin")
        and not observed
        and index_kind != "multi"
    ):
        msg = "GH#10694 - idxmax/min fail with unused categories"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
    elif reduction_func == "corrwith" and not as_index:
        msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
    elif index_kind != "range" and not as_index:
        pytest.skip(reason="Result doesn't have categories, nothing to test")
    df = DataFrame(
        {
            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
            "b": range(4),
        }
    )
    if index_kind == "range":
        keys = ["a"]
    elif index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)
    args = get_groupby_method_args(reduction_func, df)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    op_result = getattr(gb, reduction_func)(*args)
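    # regardless of sort/observed, the result should keep the original category order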
    if as_index:
        result = op_result.index.get_level_values("a").categories
    else:
        result = op_result["a"].cat.categories
    expected = Index([1, 4, 3, 2])
    tm.assert_index_equal(result, expected)
    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)


@pytest.mark.parametrize("index_kind", ["single", "multi"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_transformer(
    as_index, sort, observed, transformation_func, index_kind, ordered
):
    # GH#48749
    df = DataFrame(
        {
            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
            "b": range(4),
        }
    )
    if index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)
    args = get_groupby_method_args(transformation_func, df)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    op_result = getattr(gb, transformation_func)(*args)
    result = op_result.index.get_level_values("a").categories
    expected = Index([1, 4, 3, 2])
    tm.assert_index_equal(result, expected)
    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)


@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("method", ["head", "tail"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_head_tail(
    as_index, sort, observed, method, index_kind, ordered
):
    # GH#48749
    df = DataFrame(
        {
            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
            "b": range(4),
        }
    )
    if index_kind == "range":
        keys = ["a"]
    elif index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    op_result = getattr(gb, method)()
    if index_kind == "range":
        result = op_result["a"].cat.categories
    else:
        result = op_result.index.get_level_values("a").categories
    expected = Index([1, 4, 3, 2])
    tm.assert_index_equal(result, expected)
    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)


@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
@pytest.mark.parametrize("ordered", [True, False])
def test_category_order_apply(as_index, sort, observed, method, index_kind, ordered):
    # GH#48749
    if (method == "transform" and index_kind == "range") or (
        not as_index and index_kind != "range"
    ):
        pytest.skip("No categories in result, nothing to test")
    df = DataFrame(
        {
            "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
            "b": range(4),
        }
    )
    if index_kind == "range":
        keys = ["a"]
    elif index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
    op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True))
    if (method == "transform" or not as_index) and index_kind == "range":
        result = op_result["a"].cat.categories
    else:
        result = op_result.index.get_level_values("a").categories
    expected = Index([1, 4, 3, 2])
    tm.assert_index_equal(result, expected)
    if index_kind == "multi":
        result = op_result.index.get_level_values("a2").categories
        tm.assert_index_equal(result, expected)


@pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
def test_many_categories(as_index, sort, index_kind, ordered):
    # GH#48749 - Test when the grouper has many categories
    if index_kind != "range" and not as_index:
        pytest.skip(reason="Result doesn't have categories, nothing to test")
    categories = np.arange(9999, -1, -1)
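    # categories run from 9999 down to 0, so sorted group order is 3, 2, 1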
    grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered)
    df = DataFrame({"a": grouper, "b": range(4)})
    if index_kind == "range":
        keys = ["a"]
    elif index_kind == "single":
        keys = ["a"]
        df = df.set_index(keys)
    elif index_kind == "multi":
        keys = ["a", "a2"]
        df["a2"] = df["a"]
        df = df.set_index(keys)
    gb = df.groupby(keys, as_index=as_index, sort=sort, observed=True)
    result = gb.sum()

    # Test is set up so that data and index are the same values
    data = [3, 2, 1] if sort else [2, 1, 3]

    index = CategoricalIndex(
        data, categories=grouper.categories, ordered=ordered, name="a"
    )
    if as_index:
        expected = DataFrame({"b": data})
        if index_kind == "multi":
            expected.index = MultiIndex.from_frame(DataFrame({"a": index, "a2": index}))
        else:
            expected.index = index
    elif index_kind == "multi":
        expected = DataFrame({"a": Series(index), "a2": Series(index), "b": data})
    else:
        expected = DataFrame({"a": Series(index), "b": data})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("test_series", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_agg_list(request, as_index, observed, reduction_func, test_series, keys):
    # GH#52760
    if test_series and reduction_func == "corrwith":
        assert not hasattr(SeriesGroupBy, "corrwith")
        pytest.skip("corrwith not implemented for SeriesGroupBy")
    elif reduction_func == "corrwith":
        msg = "GH#32293: attempts to call SeriesGroupBy.corrwith"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
    elif (
        reduction_func == "nunique"
        and not test_series
        and len(keys) != 1
        and not observed
        and not as_index
    ):
        msg = "GH#52848 - raises a ValueError"
        request.node.add_marker(pytest.mark.xfail(reason=msg))

    df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
    df = df.astype({"a1": "category", "a2": "category"})
    if "a2" not in keys:
        df = df.drop(columns="a2")
    gb = df.groupby(by=keys, as_index=as_index, observed=observed)
    if test_series:
        gb = gb["b"]
    args = get_groupby_method_args(reduction_func, df)

    result = gb.agg([reduction_func], *args)
    expected = getattr(gb, reduction_func)(*args)
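    # .agg with a one-element list wraps the output (extra column level or a
    # func-named column), so reshape the plain reduction below before comparing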
    if as_index and (test_series or reduction_func == "size"):
        expected = expected.to_frame(reduction_func)
    if not test_series:
        if not as_index:
            # TODO: GH#52849 - as_index=False is not respected
            expected = expected.set_index(keys)
        expected.columns = MultiIndex(
            levels=[["b"], [reduction_func]], codes=[[0], [0]]
        )
    elif not as_index:
        expected.columns = keys + [reduction_func]
    tm.assert_equal(result, expected)