12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057 |
- from datetime import datetime
- import numpy as np
- import pytest
- import pandas as pd
- from pandas import (
- Categorical,
- CategoricalIndex,
- DataFrame,
- Index,
- MultiIndex,
- Series,
- qcut,
- )
- import pandas._testing as tm
- from pandas.core.groupby.generic import SeriesGroupBy
- from pandas.tests.groupby import get_groupby_method_args
- def cartesian_product_for_groupers(result, args, names, fill_value=np.NaN):
- """Reindex to a cartesian production for the groupers,
- preserving the nature (Categorical) of each grouper
- """
- def f(a):
- if isinstance(a, (CategoricalIndex, Categorical)):
- categories = a.categories
- a = Categorical.from_codes(
- np.arange(len(categories)), categories=categories, ordered=a.ordered
- )
- return a
- index = MultiIndex.from_product(map(f, args), names=names)
- return result.reindex(index, fill_value=fill_value).sort_index()
- _results_for_groupbys_with_missing_categories = {
- # This maps the builtin groupby functions to their expected outputs for
- # missing categories when they are called on a categorical grouper with
- # observed=False. Some functions are expected to return NaN, some zero.
- # These expected values can be used across several tests (i.e. they are
- # the same for SeriesGroupBy and DataFrameGroupBy) but they should only be
- # hardcoded in one place.
- "all": np.NaN,
- "any": np.NaN,
- "count": 0,
- "corrwith": np.NaN,
- "first": np.NaN,
- "idxmax": np.NaN,
- "idxmin": np.NaN,
- "last": np.NaN,
- "max": np.NaN,
- "mean": np.NaN,
- "median": np.NaN,
- "min": np.NaN,
- "nth": np.NaN,
- "nunique": 0,
- "prod": np.NaN,
- "quantile": np.NaN,
- "sem": np.NaN,
- "size": 0,
- "skew": np.NaN,
- "std": np.NaN,
- "sum": 0,
- "var": np.NaN,
- }
- def test_apply_use_categorical_name(df):
- cats = qcut(df.C, 4)
- def get_stats(group):
- return {
- "min": group.min(),
- "max": group.max(),
- "count": group.count(),
- "mean": group.mean(),
- }
- result = df.groupby(cats, observed=False).D.apply(get_stats)
- assert result.index.names[0] == "C"
- def test_basic(): # TODO: split this test
- cats = Categorical(
- ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
- categories=["a", "b", "c", "d"],
- ordered=True,
- )
- data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
- exp_index = CategoricalIndex(list("abcd"), name="b", ordered=True)
- expected = DataFrame({"a": [1, 2, 4, np.nan]}, index=exp_index)
- result = data.groupby("b", observed=False).mean()
- tm.assert_frame_equal(result, expected)
- cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
- cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
- df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
- # single grouper
- gb = df.groupby("A", observed=False)
- exp_idx = CategoricalIndex(["a", "b", "z"], name="A", ordered=True)
- expected = DataFrame({"values": Series([3, 7, 0], index=exp_idx)})
- result = gb.sum(numeric_only=True)
- tm.assert_frame_equal(result, expected)
- # GH 8623
- x = DataFrame(
- [[1, "John P. Doe"], [2, "Jane Dove"], [1, "John P. Doe"]],
- columns=["person_id", "person_name"],
- )
- x["person_name"] = Categorical(x.person_name)
- g = x.groupby(["person_id"], observed=False)
- result = g.transform(lambda x: x)
- tm.assert_frame_equal(result, x[["person_name"]])
- result = x.drop_duplicates("person_name")
- expected = x.iloc[[0, 1]]
- tm.assert_frame_equal(result, expected)
- def f(x):
- return x.drop_duplicates("person_name").iloc[0]
- result = g.apply(f)
- expected = x.iloc[[0, 1]].copy()
- expected.index = Index([1, 2], name="person_id")
- expected["person_name"] = expected["person_name"].astype("object")
- tm.assert_frame_equal(result, expected)
- # GH 9921
- # Monotonic
- df = DataFrame({"a": [5, 15, 25]})
- c = pd.cut(df.a, bins=[0, 10, 20, 30, 40])
- result = df.a.groupby(c, observed=False).transform(sum)
- tm.assert_series_equal(result, df["a"])
- tm.assert_series_equal(
- df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]
- )
- tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]])
- gbc = df.groupby(c, observed=False)
- result = gbc.transform(lambda xs: np.max(xs, axis=0))
- tm.assert_frame_equal(result, df[["a"]])
- with tm.assert_produces_warning(None):
- result2 = gbc.transform(lambda xs: np.max(xs, axis=0))
- result3 = gbc.transform(max)
- result4 = gbc.transform(np.maximum.reduce)
- result5 = gbc.transform(lambda xs: np.maximum.reduce(xs))
- tm.assert_frame_equal(result2, df[["a"]], check_dtype=False)
- tm.assert_frame_equal(result3, df[["a"]], check_dtype=False)
- tm.assert_frame_equal(result4, df[["a"]])
- tm.assert_frame_equal(result5, df[["a"]])
- # Filter
- tm.assert_series_equal(df.a.groupby(c, observed=False).filter(np.all), df["a"])
- tm.assert_frame_equal(df.groupby(c, observed=False).filter(np.all), df)
- # Non-monotonic
- df = DataFrame({"a": [5, 15, 25, -5]})
- c = pd.cut(df.a, bins=[-10, 0, 10, 20, 30, 40])
- result = df.a.groupby(c, observed=False).transform(sum)
- tm.assert_series_equal(result, df["a"])
- tm.assert_series_equal(
- df.a.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df["a"]
- )
- tm.assert_frame_equal(df.groupby(c, observed=False).transform(sum), df[["a"]])
- tm.assert_frame_equal(
- df.groupby(c, observed=False).transform(lambda xs: np.sum(xs)), df[["a"]]
- )
- # GH 9603
- df = DataFrame({"a": [1, 0, 0, 0]})
- c = pd.cut(df.a, [0, 1, 2, 3, 4], labels=Categorical(list("abcd")))
- result = df.groupby(c, observed=False).apply(len)
- exp_index = CategoricalIndex(c.values.categories, ordered=c.values.ordered)
- expected = Series([1, 0, 0, 0], index=exp_index)
- expected.index.name = "a"
- tm.assert_series_equal(result, expected)
- # more basic
- levels = ["foo", "bar", "baz", "qux"]
- codes = np.random.randint(0, 4, size=100)
- cats = Categorical.from_codes(codes, levels, ordered=True)
- data = DataFrame(np.random.randn(100, 4))
- result = data.groupby(cats, observed=False).mean()
- expected = data.groupby(np.asarray(cats), observed=False).mean()
- exp_idx = CategoricalIndex(levels, categories=cats.categories, ordered=True)
- expected = expected.reindex(exp_idx)
- tm.assert_frame_equal(result, expected)
- grouped = data.groupby(cats, observed=False)
- desc_result = grouped.describe()
- idx = cats.codes.argsort()
- ord_labels = np.asarray(cats).take(idx)
- ord_data = data.take(idx)
- exp_cats = Categorical(
- ord_labels, ordered=True, categories=["foo", "bar", "baz", "qux"]
- )
- expected = ord_data.groupby(exp_cats, sort=False, observed=False).describe()
- tm.assert_frame_equal(desc_result, expected)
- # GH 10460
- expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
- exp = CategoricalIndex(expc)
- tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
- exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
- tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
- def test_level_get_group(observed):
- # GH15155
- df = DataFrame(
- data=np.arange(2, 22, 2),
- index=MultiIndex(
- levels=[CategoricalIndex(["a", "b"]), range(10)],
- codes=[[0] * 5 + [1] * 5, range(10)],
- names=["Index1", "Index2"],
- ),
- )
- g = df.groupby(level=["Index1"], observed=observed)
- # expected should equal test.loc[["a"]]
- # GH15166
- expected = DataFrame(
- data=np.arange(2, 12, 2),
- index=MultiIndex(
- levels=[CategoricalIndex(["a", "b"]), range(5)],
- codes=[[0] * 5, range(5)],
- names=["Index1", "Index2"],
- ),
- )
- result = g.get_group("a")
- tm.assert_frame_equal(result, expected)
- def test_sorting_with_different_categoricals():
- # GH 24271
- df = DataFrame(
- {
- "group": ["A"] * 6 + ["B"] * 6,
- "dose": ["high", "med", "low"] * 4,
- "outcomes": np.arange(12.0),
- }
- )
- df.dose = Categorical(df.dose, categories=["low", "med", "high"], ordered=True)
- result = df.groupby("group")["dose"].value_counts()
- result = result.sort_index(level=0, sort_remaining=True)
- index = ["low", "med", "high", "low", "med", "high"]
- index = Categorical(index, categories=["low", "med", "high"], ordered=True)
- index = [["A", "A", "A", "B", "B", "B"], CategoricalIndex(index)]
- index = MultiIndex.from_arrays(index, names=["group", "dose"])
- expected = Series([2] * 6, index=index, name="count")
- tm.assert_series_equal(result, expected)
- @pytest.mark.parametrize("ordered", [True, False])
- def test_apply(ordered):
- # GH 10138
- dense = Categorical(list("abc"), ordered=ordered)
- # 'b' is in the categories but not in the list
- missing = Categorical(list("aaa"), categories=["a", "b"], ordered=ordered)
- values = np.arange(len(dense))
- df = DataFrame({"missing": missing, "dense": dense, "values": values})
- grouped = df.groupby(["missing", "dense"], observed=True)
- # missing category 'b' should still exist in the output index
- idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
- expected = DataFrame([0, 1, 2.0], index=idx, columns=["values"])
- result = grouped.apply(lambda x: np.mean(x, axis=0))
- tm.assert_frame_equal(result, expected)
- result = grouped.mean()
- tm.assert_frame_equal(result, expected)
- result = grouped.agg(np.mean)
- tm.assert_frame_equal(result, expected)
- # but for transform we should still get back the original index
- idx = MultiIndex.from_arrays([missing, dense], names=["missing", "dense"])
- expected = Series(1, index=idx)
- result = grouped.apply(lambda x: 1)
- tm.assert_series_equal(result, expected)
- def test_observed(observed):
- # multiple groupers, don't re-expand the output space
- # of the grouper
- # gh-14942 (implement)
- # gh-10132 (back-compat)
- # gh-8138 (back-compat)
- # gh-8869
- cat1 = Categorical(["a", "a", "b", "b"], categories=["a", "b", "z"], ordered=True)
- cat2 = Categorical(["c", "d", "c", "d"], categories=["c", "d", "y"], ordered=True)
- df = DataFrame({"A": cat1, "B": cat2, "values": [1, 2, 3, 4]})
- df["C"] = ["foo", "bar"] * 2
- # multiple groupers with a non-cat
- gb = df.groupby(["A", "B", "C"], observed=observed)
- exp_index = MultiIndex.from_arrays(
- [cat1, cat2, ["foo", "bar"] * 2], names=["A", "B", "C"]
- )
- expected = DataFrame({"values": Series([1, 2, 3, 4], index=exp_index)}).sort_index()
- result = gb.sum()
- if not observed:
- expected = cartesian_product_for_groupers(
- expected, [cat1, cat2, ["foo", "bar"]], list("ABC"), fill_value=0
- )
- tm.assert_frame_equal(result, expected)
- gb = df.groupby(["A", "B"], observed=observed)
- exp_index = MultiIndex.from_arrays([cat1, cat2], names=["A", "B"])
- expected = DataFrame(
- {"values": [1, 2, 3, 4], "C": ["foo", "bar", "foo", "bar"]}, index=exp_index
- )
- result = gb.sum()
- if not observed:
- expected = cartesian_product_for_groupers(
- expected, [cat1, cat2], list("AB"), fill_value=0
- )
- tm.assert_frame_equal(result, expected)
- # https://github.com/pandas-dev/pandas/issues/8138
- d = {
- "cat": Categorical(
- ["a", "b", "a", "b"], categories=["a", "b", "c"], ordered=True
- ),
- "ints": [1, 1, 2, 2],
- "val": [10, 20, 30, 40],
- }
- df = DataFrame(d)
- # Grouping on a single column
- groups_single_key = df.groupby("cat", observed=observed)
- result = groups_single_key.mean()
- exp_index = CategoricalIndex(
- list("ab"), name="cat", categories=list("abc"), ordered=True
- )
- expected = DataFrame({"ints": [1.5, 1.5], "val": [20.0, 30]}, index=exp_index)
- if not observed:
- index = CategoricalIndex(
- list("abc"), name="cat", categories=list("abc"), ordered=True
- )
- expected = expected.reindex(index)
- tm.assert_frame_equal(result, expected)
- # Grouping on two columns
- groups_double_key = df.groupby(["cat", "ints"], observed=observed)
- result = groups_double_key.agg("mean")
- expected = DataFrame(
- {
- "val": [10.0, 30.0, 20.0, 40.0],
- "cat": Categorical(
- ["a", "a", "b", "b"], categories=["a", "b", "c"], ordered=True
- ),
- "ints": [1, 2, 1, 2],
- }
- ).set_index(["cat", "ints"])
- if not observed:
- expected = cartesian_product_for_groupers(
- expected, [df.cat.values, [1, 2]], ["cat", "ints"]
- )
- tm.assert_frame_equal(result, expected)
- # GH 10132
- for key in [("a", 1), ("b", 2), ("b", 1), ("a", 2)]:
- c, i = key
- result = groups_double_key.get_group(key)
- expected = df[(df.cat == c) & (df.ints == i)]
- tm.assert_frame_equal(result, expected)
- # gh-8869
- # with as_index
- d = {
- "foo": [10, 8, 4, 8, 4, 1, 1],
- "bar": [10, 20, 30, 40, 50, 60, 70],
- "baz": ["d", "c", "e", "a", "a", "d", "c"],
- }
- df = DataFrame(d)
- cat = pd.cut(df["foo"], np.linspace(0, 10, 3))
- df["range"] = cat
- groups = df.groupby(["range", "baz"], as_index=False, observed=observed)
- result = groups.agg("mean")
- groups2 = df.groupby(["range", "baz"], as_index=True, observed=observed)
- expected = groups2.agg("mean").reset_index()
- tm.assert_frame_equal(result, expected)
- def test_observed_codes_remap(observed):
- d = {"C1": [3, 3, 4, 5], "C2": [1, 2, 3, 4], "C3": [10, 100, 200, 34]}
- df = DataFrame(d)
- values = pd.cut(df["C1"], [1, 2, 3, 6])
- values.name = "cat"
- groups_double_key = df.groupby([values, "C2"], observed=observed)
- idx = MultiIndex.from_arrays([values, [1, 2, 3, 4]], names=["cat", "C2"])
- expected = DataFrame(
- {"C1": [3.0, 3.0, 4.0, 5.0], "C3": [10.0, 100.0, 200.0, 34.0]}, index=idx
- )
- if not observed:
- expected = cartesian_product_for_groupers(
- expected, [values.values, [1, 2, 3, 4]], ["cat", "C2"]
- )
- result = groups_double_key.agg("mean")
- tm.assert_frame_equal(result, expected)
- def test_observed_perf():
- # we create a cartesian product, so this is
- # non-performant if we don't use observed values
- # gh-14942
- df = DataFrame(
- {
- "cat": np.random.randint(0, 255, size=30000),
- "int_id": np.random.randint(0, 255, size=30000),
- "other_id": np.random.randint(0, 10000, size=30000),
- "foo": 0,
- }
- )
- df["cat"] = df.cat.astype(str).astype("category")
- grouped = df.groupby(["cat", "int_id", "other_id"], observed=True)
- result = grouped.count()
- assert result.index.levels[0].nunique() == df.cat.nunique()
- assert result.index.levels[1].nunique() == df.int_id.nunique()
- assert result.index.levels[2].nunique() == df.other_id.nunique()
- def test_observed_groups(observed):
- # gh-20583
- # test that we have the appropriate groups
- cat = Categorical(["a", "c", "a"], categories=["a", "b", "c"])
- df = DataFrame({"cat": cat, "vals": [1, 2, 3]})
- g = df.groupby("cat", observed=observed)
- result = g.groups
- if observed:
- expected = {"a": Index([0, 2], dtype="int64"), "c": Index([1], dtype="int64")}
- else:
- expected = {
- "a": Index([0, 2], dtype="int64"),
- "b": Index([], dtype="int64"),
- "c": Index([1], dtype="int64"),
- }
- tm.assert_dict_equal(result, expected)
- @pytest.mark.parametrize(
- "keys, expected_values, expected_index_levels",
- [
- ("a", [15, 9, 0], CategoricalIndex([1, 2, 3], name="a")),
- (
- ["a", "b"],
- [7, 8, 0, 0, 0, 9, 0, 0, 0],
- [CategoricalIndex([1, 2, 3], name="a"), Index([4, 5, 6])],
- ),
- (
- ["a", "a2"],
- [15, 0, 0, 0, 9, 0, 0, 0, 0],
- [
- CategoricalIndex([1, 2, 3], name="a"),
- CategoricalIndex([1, 2, 3], name="a"),
- ],
- ),
- ],
- )
- @pytest.mark.parametrize("test_series", [True, False])
- def test_unobserved_in_index(keys, expected_values, expected_index_levels, test_series):
- # GH#49354 - ensure unobserved cats occur when grouping by index levels
- df = DataFrame(
- {
- "a": Categorical([1, 1, 2], categories=[1, 2, 3]),
- "a2": Categorical([1, 1, 2], categories=[1, 2, 3]),
- "b": [4, 5, 6],
- "c": [7, 8, 9],
- }
- ).set_index(["a", "a2"])
- if "b" not in keys:
- # Only keep b when it is used for grouping for consistent columns in the result
- df = df.drop(columns="b")
- gb = df.groupby(keys, observed=False)
- if test_series:
- gb = gb["c"]
- result = gb.sum()
- if len(keys) == 1:
- index = expected_index_levels
- else:
- codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2], 3 * [0, 1, 2]]
- index = MultiIndex(
- expected_index_levels,
- codes=codes,
- names=keys,
- )
- expected = DataFrame({"c": expected_values}, index=index)
- if test_series:
- expected = expected["c"]
- tm.assert_equal(result, expected)
- def test_observed_groups_with_nan(observed):
- # GH 24740
- df = DataFrame(
- {
- "cat": Categorical(["a", np.nan, "a"], categories=["a", "b", "d"]),
- "vals": [1, 2, 3],
- }
- )
- g = df.groupby("cat", observed=observed)
- result = g.groups
- if observed:
- expected = {"a": Index([0, 2], dtype="int64")}
- else:
- expected = {
- "a": Index([0, 2], dtype="int64"),
- "b": Index([], dtype="int64"),
- "d": Index([], dtype="int64"),
- }
- tm.assert_dict_equal(result, expected)
- def test_observed_nth():
- # GH 26385
- cat = Categorical(["a", np.nan, np.nan], categories=["a", "b", "c"])
- ser = Series([1, 2, 3])
- df = DataFrame({"cat": cat, "ser": ser})
- result = df.groupby("cat", observed=False)["ser"].nth(0)
- expected = df["ser"].iloc[[0]]
- tm.assert_series_equal(result, expected)
- def test_dataframe_categorical_with_nan(observed):
- # GH 21151
- s1 = Categorical([np.nan, "a", np.nan, "a"], categories=["a", "b", "c"])
- s2 = Series([1, 2, 3, 4])
- df = DataFrame({"s1": s1, "s2": s2})
- result = df.groupby("s1", observed=observed).first().reset_index()
- if observed:
- expected = DataFrame(
- {"s1": Categorical(["a"], categories=["a", "b", "c"]), "s2": [2]}
- )
- else:
- expected = DataFrame(
- {
- "s1": Categorical(["a", "b", "c"], categories=["a", "b", "c"]),
- "s2": [2, np.nan, np.nan],
- }
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("ordered", [True, False])
- @pytest.mark.parametrize("observed", [True, False])
- @pytest.mark.parametrize("sort", [True, False])
- def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
- # GH 25871: Fix groupby sorting on ordered Categoricals
- # GH 25167: Groupby with observed=True doesn't sort
- # Build a dataframe with cat having one unobserved category ('missing'),
- # and a Series with identical values
- label = Categorical(
- ["d", "a", "b", "a", "d", "b"],
- categories=["a", "b", "missing", "d"],
- ordered=ordered,
- )
- val = Series(["d", "a", "b", "a", "d", "b"])
- df = DataFrame({"label": label, "val": val})
- # aggregate on the Categorical
- result = df.groupby("label", observed=observed, sort=sort)["val"].aggregate("first")
- # If ordering works, we expect index labels equal to aggregation results,
- # except for 'observed=False': label 'missing' has aggregation None
- label = Series(result.index.array, dtype="object")
- aggr = Series(result.array)
- if not observed:
- aggr[aggr.isna()] = "missing"
- if not all(label == aggr):
- msg = (
- "Labels and aggregation results not consistently sorted\n"
- f"for (ordered={ordered}, observed={observed}, sort={sort})\n"
- f"Result:\n{result}"
- )
- assert False, msg
- def test_datetime():
- # GH9049: ensure backward compatibility
- levels = pd.date_range("2014-01-01", periods=4)
- codes = np.random.randint(0, 4, size=100)
- cats = Categorical.from_codes(codes, levels, ordered=True)
- data = DataFrame(np.random.randn(100, 4))
- result = data.groupby(cats, observed=False).mean()
- expected = data.groupby(np.asarray(cats), observed=False).mean()
- expected = expected.reindex(levels)
- expected.index = CategoricalIndex(
- expected.index, categories=expected.index, ordered=True
- )
- tm.assert_frame_equal(result, expected)
- grouped = data.groupby(cats, observed=False)
- desc_result = grouped.describe()
- idx = cats.codes.argsort()
- ord_labels = cats.take(idx)
- ord_data = data.take(idx)
- expected = ord_data.groupby(ord_labels, observed=False).describe()
- tm.assert_frame_equal(desc_result, expected)
- tm.assert_index_equal(desc_result.index, expected.index)
- tm.assert_index_equal(
- desc_result.index.get_level_values(0), expected.index.get_level_values(0)
- )
- # GH 10460
- expc = Categorical.from_codes(np.arange(4).repeat(8), levels, ordered=True)
- exp = CategoricalIndex(expc)
- tm.assert_index_equal((desc_result.stack().index.get_level_values(0)), exp)
- exp = Index(["count", "mean", "std", "min", "25%", "50%", "75%", "max"] * 4)
- tm.assert_index_equal((desc_result.stack().index.get_level_values(1)), exp)
- def test_categorical_index():
- s = np.random.RandomState(12345)
- levels = ["foo", "bar", "baz", "qux"]
- codes = s.randint(0, 4, size=20)
- cats = Categorical.from_codes(codes, levels, ordered=True)
- df = DataFrame(np.repeat(np.arange(20), 4).reshape(-1, 4), columns=list("abcd"))
- df["cats"] = cats
- # with a cat index
- result = df.set_index("cats").groupby(level=0, observed=False).sum()
- expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
- expected.index = CategoricalIndex(
- Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
- )
- tm.assert_frame_equal(result, expected)
- # with a cat column, should produce a cat index
- result = df.groupby("cats", observed=False).sum()
- expected = df[list("abcd")].groupby(cats.codes, observed=False).sum()
- expected.index = CategoricalIndex(
- Categorical.from_codes([0, 1, 2, 3], levels, ordered=True), name="cats"
- )
- tm.assert_frame_equal(result, expected)
- def test_describe_categorical_columns():
- # GH 11558
- cats = CategoricalIndex(
- ["qux", "foo", "baz", "bar"],
- categories=["foo", "bar", "baz", "qux"],
- ordered=True,
- )
- df = DataFrame(np.random.randn(20, 4), columns=cats)
- result = df.groupby([1, 2, 3, 4] * 5).describe()
- tm.assert_index_equal(result.stack().columns, cats)
- tm.assert_categorical_equal(result.stack().columns.values, cats.values)
- def test_unstack_categorical():
- # GH11558 (example is taken from the original issue)
- df = DataFrame(
- {"a": range(10), "medium": ["A", "B"] * 5, "artist": list("XYXXY") * 2}
- )
- df["medium"] = df["medium"].astype("category")
- gcat = df.groupby(["artist", "medium"], observed=False)["a"].count().unstack()
- result = gcat.describe()
- exp_columns = CategoricalIndex(["A", "B"], ordered=False, name="medium")
- tm.assert_index_equal(result.columns, exp_columns)
- tm.assert_categorical_equal(result.columns.values, exp_columns.values)
- result = gcat["A"] + gcat["B"]
- expected = Series([6, 4], index=Index(["X", "Y"], name="artist"))
- tm.assert_series_equal(result, expected)
- def test_bins_unequal_len():
- # GH3011
- series = Series([np.nan, np.nan, 1, 1, 2, 2, 3, 3, 4, 4])
- bins = pd.cut(series.dropna().values, 4)
- # len(bins) != len(series) here
- with pytest.raises(ValueError, match="Grouper and axis must be same length"):
- series.groupby(bins).mean()
- @pytest.mark.parametrize(
- ["series", "data"],
- [
- # Group a series with length and index equal to those of the grouper.
- (Series(range(4)), {"A": [0, 3], "B": [1, 2]}),
- # Group a series with length equal to that of the grouper and index unequal to
- # that of the grouper.
- (Series(range(4)).rename(lambda idx: idx + 1), {"A": [2], "B": [0, 1]}),
- # GH44179: Group a series with length unequal to that of the grouper.
- (Series(range(7)), {"A": [0, 3], "B": [1, 2]}),
- ],
- )
- def test_categorical_series(series, data):
- # Group the given series by a series with categorical data type such that group A
- # takes indices 0 and 3 and group B indices 1 and 2, obtaining the values mapped in
- # the given data.
- groupby = series.groupby(Series(list("ABBA"), dtype="category"))
- result = groupby.aggregate(list)
- expected = Series(data, index=CategoricalIndex(data.keys()))
- tm.assert_series_equal(result, expected)
- def test_as_index():
- # GH13204
- df = DataFrame(
- {
- "cat": Categorical([1, 2, 2], [1, 2, 3]),
- "A": [10, 11, 11],
- "B": [101, 102, 103],
- }
- )
- result = df.groupby(["cat", "A"], as_index=False, observed=True).sum()
- expected = DataFrame(
- {
- "cat": Categorical([1, 2], categories=df.cat.cat.categories),
- "A": [10, 11],
- "B": [101, 205],
- },
- columns=["cat", "A", "B"],
- )
- tm.assert_frame_equal(result, expected)
- # function grouper
- f = lambda r: df.loc[r, "A"]
- result = df.groupby(["cat", f], as_index=False, observed=True).sum()
- expected = DataFrame(
- {
- "cat": Categorical([1, 2], categories=df.cat.cat.categories),
- "A": [10, 22],
- "B": [101, 205],
- },
- columns=["cat", "A", "B"],
- )
- tm.assert_frame_equal(result, expected)
- # another not in-axis grouper (conflicting names in index)
- s = Series(["a", "b", "b"], name="cat")
- result = df.groupby(["cat", s], as_index=False, observed=True).sum()
- tm.assert_frame_equal(result, expected)
- # is original index dropped?
- group_columns = ["cat", "A"]
- expected = DataFrame(
- {
- "cat": Categorical([1, 2], categories=df.cat.cat.categories),
- "A": [10, 11],
- "B": [101, 205],
- },
- columns=["cat", "A", "B"],
- )
- for name in [None, "X", "B"]:
- df.index = Index(list("abc"), name=name)
- result = df.groupby(group_columns, as_index=False, observed=True).sum()
- tm.assert_frame_equal(result, expected)
- def test_preserve_categories():
- # GH-13179
- categories = list("abc")
- # ordered=True
- df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=True)})
- sort_index = CategoricalIndex(categories, categories, ordered=True, name="A")
- nosort_index = CategoricalIndex(list("bac"), categories, ordered=True, name="A")
- tm.assert_index_equal(
- df.groupby("A", sort=True, observed=False).first().index, sort_index
- )
- # GH#42482 - don't sort result when sort=False, even when ordered=True
- tm.assert_index_equal(
- df.groupby("A", sort=False, observed=False).first().index, nosort_index
- )
- # ordered=False
- df = DataFrame({"A": Categorical(list("ba"), categories=categories, ordered=False)})
- sort_index = CategoricalIndex(categories, categories, ordered=False, name="A")
- # GH#48749 - don't change order of categories
- # GH#42482 - don't sort result when sort=False, even when ordered=True
- nosort_index = CategoricalIndex(list("bac"), list("abc"), ordered=False, name="A")
- tm.assert_index_equal(
- df.groupby("A", sort=True, observed=False).first().index, sort_index
- )
- tm.assert_index_equal(
- df.groupby("A", sort=False, observed=False).first().index, nosort_index
- )
- def test_preserve_categorical_dtype():
- # GH13743, GH13854
- df = DataFrame(
- {
- "A": [1, 2, 1, 1, 2],
- "B": [10, 16, 22, 28, 34],
- "C1": Categorical(list("abaab"), categories=list("bac"), ordered=False),
- "C2": Categorical(list("abaab"), categories=list("bac"), ordered=True),
- }
- )
- # single grouper
- exp_full = DataFrame(
- {
- "A": [2.0, 1.0, np.nan],
- "B": [25.0, 20.0, np.nan],
- "C1": Categorical(list("bac"), categories=list("bac"), ordered=False),
- "C2": Categorical(list("bac"), categories=list("bac"), ordered=True),
- }
- )
- for col in ["C1", "C2"]:
- result1 = df.groupby(by=col, as_index=False, observed=False).mean(
- numeric_only=True
- )
- result2 = (
- df.groupby(by=col, as_index=True, observed=False)
- .mean(numeric_only=True)
- .reset_index()
- )
- expected = exp_full.reindex(columns=result1.columns)
- tm.assert_frame_equal(result1, expected)
- tm.assert_frame_equal(result2, expected)
- @pytest.mark.parametrize(
- "func, values",
- [
- ("first", ["second", "first"]),
- ("last", ["fourth", "third"]),
- ("min", ["fourth", "first"]),
- ("max", ["second", "third"]),
- ],
- )
- def test_preserve_on_ordered_ops(func, values):
- # gh-18502
- # preserve the categoricals on ops
- c = Categorical(["first", "second", "third", "fourth"], ordered=True)
- df = DataFrame({"payload": [-1, -2, -1, -2], "col": c})
- g = df.groupby("payload")
- result = getattr(g, func)()
- expected = DataFrame(
- {"payload": [-2, -1], "col": Series(values, dtype=c.dtype)}
- ).set_index("payload")
- tm.assert_frame_equal(result, expected)
- # we should also preserve categorical for SeriesGroupBy
- sgb = df.groupby("payload")["col"]
- result = getattr(sgb, func)()
- expected = expected["col"]
- tm.assert_series_equal(result, expected)
- def test_categorical_no_compress():
- data = Series(np.random.randn(9))
- codes = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])
- cats = Categorical.from_codes(codes, [0, 1, 2], ordered=True)
- result = data.groupby(cats, observed=False).mean()
- exp = data.groupby(codes, observed=False).mean()
- exp.index = CategoricalIndex(
- exp.index, categories=cats.categories, ordered=cats.ordered
- )
- tm.assert_series_equal(result, exp)
- codes = np.array([0, 0, 0, 1, 1, 1, 3, 3, 3])
- cats = Categorical.from_codes(codes, [0, 1, 2, 3], ordered=True)
- result = data.groupby(cats, observed=False).mean()
- exp = data.groupby(codes, observed=False).mean().reindex(cats.categories)
- exp.index = CategoricalIndex(
- exp.index, categories=cats.categories, ordered=cats.ordered
- )
- tm.assert_series_equal(result, exp)
- cats = Categorical(
- ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
- categories=["a", "b", "c", "d"],
- ordered=True,
- )
- data = DataFrame({"a": [1, 1, 1, 2, 2, 2, 3, 4, 5], "b": cats})
- result = data.groupby("b", observed=False).mean()
- result = result["a"].values
- exp = np.array([1, 2, 4, np.nan])
- tm.assert_numpy_array_equal(result, exp)
- def test_groupby_empty_with_category():
- # GH-9614
- # test fix for when group by on None resulted in
- # coercion of dtype categorical -> float
- df = DataFrame({"A": [None] * 3, "B": Categorical(["train", "train", "test"])})
- result = df.groupby("A").first()["B"]
- expected = Series(
- Categorical([], categories=["test", "train"]),
- index=Series([], dtype="object", name="A"),
- name="B",
- )
- tm.assert_series_equal(result, expected)
- def test_sort():
- # https://stackoverflow.com/questions/23814368/sorting-pandas-
- # categorical-labels-after-groupby
- # This should result in a properly sorted Series so that the plot
- # has a sorted x axis
- # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
- df = DataFrame({"value": np.random.randint(0, 10000, 100)})
- labels = [f"{i} - {i+499}" for i in range(0, 10000, 500)]
- cat_labels = Categorical(labels, labels)
- df = df.sort_values(by=["value"], ascending=True)
- df["value_group"] = pd.cut(
- df.value, range(0, 10500, 500), right=False, labels=cat_labels
- )
- res = df.groupby(["value_group"], observed=False)["value_group"].count()
- exp = res[sorted(res.index, key=lambda x: float(x.split()[0]))]
- exp.index = CategoricalIndex(exp.index, name=exp.index.name)
- tm.assert_series_equal(res, exp)
- @pytest.mark.parametrize("ordered", [True, False])
- def test_sort2(sort, ordered):
- # dataframe groupby sort was being ignored # GH 8868
- # GH#48749 - don't change order of categories
- # GH#42482 - don't sort result when sort=False, even when ordered=True
- df = DataFrame(
- [
- ["(7.5, 10]", 10, 10],
- ["(7.5, 10]", 8, 20],
- ["(2.5, 5]", 5, 30],
- ["(5, 7.5]", 6, 40],
- ["(2.5, 5]", 4, 50],
- ["(0, 2.5]", 1, 60],
- ["(5, 7.5]", 7, 70],
- ],
- columns=["range", "foo", "bar"],
- )
- df["range"] = Categorical(df["range"], ordered=ordered)
- result = df.groupby("range", sort=sort, observed=False).first()
- if sort:
- data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
- index_values = ["(0, 2.5]", "(2.5, 5]", "(5, 7.5]", "(7.5, 10]"]
- else:
- data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
- index_values = ["(7.5, 10]", "(2.5, 5]", "(5, 7.5]", "(0, 2.5]"]
- expected = DataFrame(
- data_values,
- columns=["foo", "bar"],
- index=CategoricalIndex(index_values, name="range", ordered=ordered),
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("ordered", [True, False])
- def test_sort_datetimelike(sort, ordered):
- # GH10505
- # GH#42482 - don't sort result when sort=False, even when ordered=True
- # use same data as test_groupby_sort_categorical, which category is
- # corresponding to datetime.month
- df = DataFrame(
- {
- "dt": [
- datetime(2011, 7, 1),
- datetime(2011, 7, 1),
- datetime(2011, 2, 1),
- datetime(2011, 5, 1),
- datetime(2011, 2, 1),
- datetime(2011, 1, 1),
- datetime(2011, 5, 1),
- ],
- "foo": [10, 8, 5, 6, 4, 1, 7],
- "bar": [10, 20, 30, 40, 50, 60, 70],
- },
- columns=["dt", "foo", "bar"],
- )
- # ordered=True
- df["dt"] = Categorical(df["dt"], ordered=ordered)
- if sort:
- data_values = [[1, 60], [5, 30], [6, 40], [10, 10]]
- index_values = [
- datetime(2011, 1, 1),
- datetime(2011, 2, 1),
- datetime(2011, 5, 1),
- datetime(2011, 7, 1),
- ]
- else:
- data_values = [[10, 10], [5, 30], [6, 40], [1, 60]]
- index_values = [
- datetime(2011, 7, 1),
- datetime(2011, 2, 1),
- datetime(2011, 5, 1),
- datetime(2011, 1, 1),
- ]
- expected = DataFrame(
- data_values,
- columns=["foo", "bar"],
- index=CategoricalIndex(index_values, name="dt", ordered=ordered),
- )
- result = df.groupby("dt", sort=sort, observed=False).first()
- tm.assert_frame_equal(result, expected)
- def test_empty_sum():
- # https://github.com/pandas-dev/pandas/issues/18678
- df = DataFrame(
- {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]}
- )
- expected_idx = CategoricalIndex(["a", "b", "c"], name="A")
- # 0 by default
- result = df.groupby("A", observed=False).B.sum()
- expected = Series([3, 1, 0], expected_idx, name="B")
- tm.assert_series_equal(result, expected)
- # min_count=0
- result = df.groupby("A", observed=False).B.sum(min_count=0)
- expected = Series([3, 1, 0], expected_idx, name="B")
- tm.assert_series_equal(result, expected)
- # min_count=1
- result = df.groupby("A", observed=False).B.sum(min_count=1)
- expected = Series([3, 1, np.nan], expected_idx, name="B")
- tm.assert_series_equal(result, expected)
- # min_count>1
- result = df.groupby("A", observed=False).B.sum(min_count=2)
- expected = Series([3, np.nan, np.nan], expected_idx, name="B")
- tm.assert_series_equal(result, expected)
- def test_empty_prod():
- # https://github.com/pandas-dev/pandas/issues/18678
- df = DataFrame(
- {"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"]), "B": [1, 2, 1]}
- )
- expected_idx = CategoricalIndex(["a", "b", "c"], name="A")
- # 1 by default
- result = df.groupby("A", observed=False).B.prod()
- expected = Series([2, 1, 1], expected_idx, name="B")
- tm.assert_series_equal(result, expected)
- # min_count=0
- result = df.groupby("A", observed=False).B.prod(min_count=0)
- expected = Series([2, 1, 1], expected_idx, name="B")
- tm.assert_series_equal(result, expected)
- # min_count=1
- result = df.groupby("A", observed=False).B.prod(min_count=1)
- expected = Series([2, 1, np.nan], expected_idx, name="B")
- tm.assert_series_equal(result, expected)
- def test_groupby_multiindex_categorical_datetime():
- # https://github.com/pandas-dev/pandas/issues/21390
- df = DataFrame(
- {
- "key1": Categorical(list("abcbabcba")),
- "key2": Categorical(
- list(pd.date_range("2018-06-01 00", freq="1T", periods=3)) * 3
- ),
- "values": np.arange(9),
- }
- )
- result = df.groupby(["key1", "key2"]).mean()
- idx = MultiIndex.from_product(
- [
- Categorical(["a", "b", "c"]),
- Categorical(pd.date_range("2018-06-01 00", freq="1T", periods=3)),
- ],
- names=["key1", "key2"],
- )
- expected = DataFrame({"values": [0, 4, 8, 3, 4, 5, 6, np.nan, 2]}, index=idx)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "as_index, expected",
- [
- (
- True,
- Series(
- index=MultiIndex.from_arrays(
- [Series([1, 1, 2], dtype="category"), [1, 2, 2]], names=["a", "b"]
- ),
- data=[1, 2, 3],
- name="x",
- ),
- ),
- (
- False,
- DataFrame(
- {
- "a": Series([1, 1, 2], dtype="category"),
- "b": [1, 2, 2],
- "x": [1, 2, 3],
- }
- ),
- ),
- ],
- )
- def test_groupby_agg_observed_true_single_column(as_index, expected):
- # GH-23970
- df = DataFrame(
- {"a": Series([1, 1, 2], dtype="category"), "b": [1, 2, 2], "x": [1, 2, 3]}
- )
- result = df.groupby(["a", "b"], as_index=as_index, observed=True)["x"].sum()
- tm.assert_equal(result, expected)
- @pytest.mark.parametrize("fill_value", [None, np.nan, pd.NaT])
- def test_shift(fill_value):
- ct = Categorical(
- ["a", "b", "c", "d"], categories=["a", "b", "c", "d"], ordered=False
- )
- expected = Categorical(
- [None, "a", "b", "c"], categories=["a", "b", "c", "d"], ordered=False
- )
- res = ct.shift(1, fill_value=fill_value)
- tm.assert_equal(res, expected)
- @pytest.fixture
- def df_cat(df):
- """
- DataFrame with multiple categorical columns and a column of integers.
- Shortened so as not to contain all possible combinations of categories.
- Useful for testing `observed` kwarg functionality on GroupBy objects.
- Parameters
- ----------
- df: DataFrame
- Non-categorical, longer DataFrame from another fixture, used to derive
- this one
- Returns
- -------
- df_cat: DataFrame
- """
- df_cat = df.copy()[:4] # leave out some groups
- df_cat["A"] = df_cat["A"].astype("category")
- df_cat["B"] = df_cat["B"].astype("category")
- df_cat["C"] = Series([1, 2, 3, 4])
- df_cat = df_cat.drop(["D"], axis=1)
- return df_cat
- @pytest.mark.parametrize("operation", ["agg", "apply"])
- def test_seriesgroupby_observed_true(df_cat, operation):
- # GH#24880
- # GH#49223 - order of results was wrong when grouping by index levels
- lev_a = Index(["bar", "bar", "foo", "foo"], dtype=df_cat["A"].dtype, name="A")
- lev_b = Index(["one", "three", "one", "two"], dtype=df_cat["B"].dtype, name="B")
- index = MultiIndex.from_arrays([lev_a, lev_b])
- expected = Series(data=[2, 4, 1, 3], index=index, name="C").sort_index()
- grouped = df_cat.groupby(["A", "B"], observed=True)["C"]
- result = getattr(grouped, operation)(sum)
- tm.assert_series_equal(result, expected)
- @pytest.mark.parametrize("operation", ["agg", "apply"])
- @pytest.mark.parametrize("observed", [False, None])
- def test_seriesgroupby_observed_false_or_none(df_cat, observed, operation):
- # GH 24880
- # GH#49223 - order of results was wrong when grouping by index levels
- index, _ = MultiIndex.from_product(
- [
- CategoricalIndex(["bar", "foo"], ordered=False),
- CategoricalIndex(["one", "three", "two"], ordered=False),
- ],
- names=["A", "B"],
- ).sortlevel()
- expected = Series(data=[2, 4, np.nan, 1, np.nan, 3], index=index, name="C")
- if operation == "agg":
- expected = expected.fillna(0, downcast="infer")
- grouped = df_cat.groupby(["A", "B"], observed=observed)["C"]
- result = getattr(grouped, operation)(sum)
- tm.assert_series_equal(result, expected)
- @pytest.mark.parametrize(
- "observed, index, data",
- [
- (
- True,
- MultiIndex.from_arrays(
- [
- Index(["bar"] * 4 + ["foo"] * 4, dtype="category", name="A"),
- Index(
- ["one", "one", "three", "three", "one", "one", "two", "two"],
- dtype="category",
- name="B",
- ),
- Index(["min", "max"] * 4),
- ]
- ),
- [2, 2, 4, 4, 1, 1, 3, 3],
- ),
- (
- False,
- MultiIndex.from_product(
- [
- CategoricalIndex(["bar", "foo"], ordered=False),
- CategoricalIndex(["one", "three", "two"], ordered=False),
- Index(["min", "max"]),
- ],
- names=["A", "B", None],
- ),
- [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3],
- ),
- (
- None,
- MultiIndex.from_product(
- [
- CategoricalIndex(["bar", "foo"], ordered=False),
- CategoricalIndex(["one", "three", "two"], ordered=False),
- Index(["min", "max"]),
- ],
- names=["A", "B", None],
- ),
- [2, 2, 4, 4, np.nan, np.nan, 1, 1, np.nan, np.nan, 3, 3],
- ),
- ],
- )
- def test_seriesgroupby_observed_apply_dict(df_cat, observed, index, data):
- # GH 24880
- expected = Series(data=data, index=index, name="C")
- result = df_cat.groupby(["A", "B"], observed=observed)["C"].apply(
- lambda x: {"min": x.min(), "max": x.max()}
- )
- tm.assert_series_equal(result, expected)
- def test_groupby_categorical_series_dataframe_consistent(df_cat):
- # GH 20416
- expected = df_cat.groupby(["A", "B"])["C"].mean()
- result = df_cat.groupby(["A", "B"]).mean()["C"]
- tm.assert_series_equal(result, expected)
- @pytest.mark.parametrize("code", [([1, 0, 0]), ([0, 0, 0])])
- def test_groupby_categorical_axis_1(code):
- # GH 13420
- df = DataFrame({"a": [1, 2, 3, 4], "b": [-1, -2, -3, -4], "c": [5, 6, 7, 8]})
- cat = Categorical.from_codes(code, categories=list("abc"))
- result = df.groupby(cat, axis=1).mean()
- expected = df.T.groupby(cat, axis=0).mean().T
- tm.assert_frame_equal(result, expected)
- def test_groupby_cat_preserves_structure(observed, ordered):
- # GH 28787
- df = DataFrame(
- {"Name": Categorical(["Bob", "Greg"], ordered=ordered), "Item": [1, 2]},
- columns=["Name", "Item"],
- )
- expected = df.copy()
- result = (
- df.groupby("Name", observed=observed)
- .agg(DataFrame.sum, skipna=True)
- .reset_index()
- )
- tm.assert_frame_equal(result, expected)
- def test_get_nonexistent_category():
- # Accessing a Category that is not in the dataframe
- df = DataFrame({"var": ["a", "a", "b", "b"], "val": range(4)})
- with pytest.raises(KeyError, match="'vau'"):
- df.groupby("var").apply(
- lambda rows: DataFrame(
- {"var": [rows.iloc[-1]["var"]], "val": [rows.iloc[-1]["vau"]]}
- )
- )
- def test_series_groupby_on_2_categoricals_unobserved(reduction_func, observed, request):
- # GH 17605
- if reduction_func == "ngroup":
- pytest.skip("ngroup is not truly a reduction")
- if reduction_func == "corrwith": # GH 32293
- mark = pytest.mark.xfail(
- reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
- )
- request.node.add_marker(mark)
- df = DataFrame(
- {
- "cat_1": Categorical(list("AABB"), categories=list("ABCD")),
- "cat_2": Categorical(list("AB") * 2, categories=list("ABCD")),
- "value": [0.1] * 4,
- }
- )
- args = get_groupby_method_args(reduction_func, df)
- expected_length = 4 if observed else 16
- series_groupby = df.groupby(["cat_1", "cat_2"], observed=observed)["value"]
- agg = getattr(series_groupby, reduction_func)
- result = agg(*args)
- assert len(result) == expected_length
- def test_series_groupby_on_2_categoricals_unobserved_zeroes_or_nans(
- reduction_func, request
- ):
- # GH 17605
- # Tests whether the unobserved categories in the result contain 0 or NaN
- if reduction_func == "ngroup":
- pytest.skip("ngroup is not truly a reduction")
- if reduction_func == "corrwith": # GH 32293
- mark = pytest.mark.xfail(
- reason="TODO: implemented SeriesGroupBy.corrwith. See GH 32293"
- )
- request.node.add_marker(mark)
- df = DataFrame(
- {
- "cat_1": Categorical(list("AABB"), categories=list("ABC")),
- "cat_2": Categorical(list("AB") * 2, categories=list("ABC")),
- "value": [0.1] * 4,
- }
- )
- unobserved = [tuple("AC"), tuple("BC"), tuple("CA"), tuple("CB"), tuple("CC")]
- args = get_groupby_method_args(reduction_func, df)
- series_groupby = df.groupby(["cat_1", "cat_2"], observed=False)["value"]
- agg = getattr(series_groupby, reduction_func)
- result = agg(*args)
- zero_or_nan = _results_for_groupbys_with_missing_categories[reduction_func]
- for idx in unobserved:
- val = result.loc[idx]
- assert (pd.isna(zero_or_nan) and pd.isna(val)) or (val == zero_or_nan)
- # If we expect unobserved values to be zero, we also expect the dtype to be int.
- # Except for .sum(). If the observed categories sum to dtype=float (i.e. their
- # sums have decimals), then the zeros for the missing categories should also be
- # floats.
- if zero_or_nan == 0 and reduction_func != "sum":
- assert np.issubdtype(result.dtype, np.integer)
- def test_dataframe_groupby_on_2_categoricals_when_observed_is_true(reduction_func):
- # GH 23865
- # GH 27075
- # Ensure that df.groupby, when 'by' is two Categorical variables,
- # does not return the categories that are not in df when observed=True
- if reduction_func == "ngroup":
- pytest.skip("ngroup does not return the Categories on the index")
- df = DataFrame(
- {
- "cat_1": Categorical(list("AABB"), categories=list("ABC")),
- "cat_2": Categorical(list("1111"), categories=list("12")),
- "value": [0.1, 0.1, 0.1, 0.1],
- }
- )
- unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]
- df_grp = df.groupby(["cat_1", "cat_2"], observed=True)
- args = get_groupby_method_args(reduction_func, df)
- res = getattr(df_grp, reduction_func)(*args)
- for cat in unobserved_cats:
- assert cat not in res.index
- @pytest.mark.parametrize("observed", [False, None])
- def test_dataframe_groupby_on_2_categoricals_when_observed_is_false(
- reduction_func, observed
- ):
- # GH 23865
- # GH 27075
- # Ensure that df.groupby, when 'by' is two Categorical variables,
- # returns the categories that are not in df when observed=False/None
- if reduction_func == "ngroup":
- pytest.skip("ngroup does not return the Categories on the index")
- df = DataFrame(
- {
- "cat_1": Categorical(list("AABB"), categories=list("ABC")),
- "cat_2": Categorical(list("1111"), categories=list("12")),
- "value": [0.1, 0.1, 0.1, 0.1],
- }
- )
- unobserved_cats = [("A", "2"), ("B", "2"), ("C", "1"), ("C", "2")]
- df_grp = df.groupby(["cat_1", "cat_2"], observed=observed)
- args = get_groupby_method_args(reduction_func, df)
- res = getattr(df_grp, reduction_func)(*args)
- expected = _results_for_groupbys_with_missing_categories[reduction_func]
- if expected is np.nan:
- assert res.loc[unobserved_cats].isnull().all().all()
- else:
- assert (res.loc[unobserved_cats] == expected).all().all()
- def test_series_groupby_categorical_aggregation_getitem():
- # GH 8870
- d = {"foo": [10, 8, 4, 1], "bar": [10, 20, 30, 40], "baz": ["d", "c", "d", "c"]}
- df = DataFrame(d)
- cat = pd.cut(df["foo"], np.linspace(0, 20, 5))
- df["range"] = cat
- groups = df.groupby(["range", "baz"], as_index=True, sort=True)
- result = groups["foo"].agg("mean")
- expected = groups.agg("mean")["foo"]
- tm.assert_series_equal(result, expected)
- @pytest.mark.parametrize(
- "func, expected_values",
- [(Series.nunique, [1, 1, 2]), (Series.count, [1, 2, 2])],
- )
- def test_groupby_agg_categorical_columns(func, expected_values):
- # 31256
- df = DataFrame(
- {
- "id": [0, 1, 2, 3, 4],
- "groups": [0, 1, 1, 2, 2],
- "value": Categorical([0, 0, 0, 0, 1]),
- }
- ).set_index("id")
- result = df.groupby("groups").agg(func)
- expected = DataFrame(
- {"value": expected_values}, index=Index([0, 1, 2], name="groups")
- )
- tm.assert_frame_equal(result, expected)
- def test_groupby_agg_non_numeric():
- df = DataFrame({"A": Categorical(["a", "a", "b"], categories=["a", "b", "c"])})
- expected = DataFrame({"A": [2, 1]}, index=np.array([1, 2]))
- result = df.groupby([1, 2, 1]).agg(Series.nunique)
- tm.assert_frame_equal(result, expected)
- result = df.groupby([1, 2, 1]).nunique()
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("func", ["first", "last"])
- def test_groupby_first_returned_categorical_instead_of_dataframe(func):
- # GH 28641: groupby drops index, when grouping over categorical column with
- # first/last. Renamed Categorical instead of DataFrame previously.
- df = DataFrame({"A": [1997], "B": Series(["b"], dtype="category").cat.as_ordered()})
- df_grouped = df.groupby("A")["B"]
- result = getattr(df_grouped, func)()
- # ordered categorical dtype should be preserved
- expected = Series(
- ["b"], index=Index([1997], name="A"), name="B", dtype=df["B"].dtype
- )
- tm.assert_series_equal(result, expected)
- def test_read_only_category_no_sort():
- # GH33410
- cats = np.array([1, 2])
- cats.flags.writeable = False
- df = DataFrame(
- {"a": [1, 3, 5, 7], "b": Categorical([1, 1, 2, 2], categories=Index(cats))}
- )
- expected = DataFrame(data={"a": [2.0, 6.0]}, index=CategoricalIndex(cats, name="b"))
- result = df.groupby("b", sort=False).mean()
- tm.assert_frame_equal(result, expected)
- def test_sorted_missing_category_values():
- # GH 28597
- df = DataFrame(
- {
- "foo": [
- "small",
- "large",
- "large",
- "large",
- "medium",
- "large",
- "large",
- "medium",
- ],
- "bar": ["C", "A", "A", "C", "A", "C", "A", "C"],
- }
- )
- df["foo"] = (
- df["foo"]
- .astype("category")
- .cat.set_categories(["tiny", "small", "medium", "large"], ordered=True)
- )
- expected = DataFrame(
- {
- "tiny": {"A": 0, "C": 0},
- "small": {"A": 0, "C": 1},
- "medium": {"A": 1, "C": 1},
- "large": {"A": 3, "C": 2},
- }
- )
- expected = expected.rename_axis("bar", axis="index")
- expected.columns = CategoricalIndex(
- ["tiny", "small", "medium", "large"],
- categories=["tiny", "small", "medium", "large"],
- ordered=True,
- name="foo",
- dtype="category",
- )
- result = df.groupby(["bar", "foo"]).size().unstack()
- tm.assert_frame_equal(result, expected)
- def test_agg_cython_category_not_implemented_fallback():
- # https://github.com/pandas-dev/pandas/issues/31450
- df = DataFrame({"col_num": [1, 1, 2, 3]})
- df["col_cat"] = df["col_num"].astype("category")
- result = df.groupby("col_num").col_cat.first()
- # ordered categorical dtype should definitely be preserved;
- # this is unordered, so is less-clear case (if anything, it should raise)
- expected = Series(
- [1, 2, 3],
- index=Index([1, 2, 3], name="col_num"),
- name="col_cat",
- dtype=df["col_cat"].dtype,
- )
- tm.assert_series_equal(result, expected)
- result = df.groupby("col_num").agg({"col_cat": "first"})
- expected = expected.to_frame()
- tm.assert_frame_equal(result, expected)
- def test_aggregate_categorical_with_isnan():
- # GH 29837
- df = DataFrame(
- {
- "A": [1, 1, 1, 1],
- "B": [1, 2, 1, 2],
- "numerical_col": [0.1, 0.2, np.nan, 0.3],
- "object_col": ["foo", "bar", "foo", "fee"],
- "categorical_col": ["foo", "bar", "foo", "fee"],
- }
- )
- df = df.astype({"categorical_col": "category"})
- result = df.groupby(["A", "B"]).agg(lambda df: df.isna().sum())
- index = MultiIndex.from_arrays([[1, 1], [1, 2]], names=("A", "B"))
- expected = DataFrame(
- data={
- "numerical_col": [1, 0],
- "object_col": [0, 0],
- "categorical_col": [0, 0],
- },
- index=index,
- )
- tm.assert_frame_equal(result, expected)
- def test_categorical_transform():
- # GH 29037
- df = DataFrame(
- {
- "package_id": [1, 1, 1, 2, 2, 3],
- "status": [
- "Waiting",
- "OnTheWay",
- "Delivered",
- "Waiting",
- "OnTheWay",
- "Waiting",
- ],
- }
- )
- delivery_status_type = pd.CategoricalDtype(
- categories=["Waiting", "OnTheWay", "Delivered"], ordered=True
- )
- df["status"] = df["status"].astype(delivery_status_type)
- df["last_status"] = df.groupby("package_id")["status"].transform(max)
- result = df.copy()
- expected = DataFrame(
- {
- "package_id": [1, 1, 1, 2, 2, 3],
- "status": [
- "Waiting",
- "OnTheWay",
- "Delivered",
- "Waiting",
- "OnTheWay",
- "Waiting",
- ],
- "last_status": [
- "Delivered",
- "Delivered",
- "Delivered",
- "OnTheWay",
- "OnTheWay",
- "Waiting",
- ],
- }
- )
- expected["status"] = expected["status"].astype(delivery_status_type)
- # .transform(max) should preserve ordered categoricals
- expected["last_status"] = expected["last_status"].astype(delivery_status_type)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("func", ["first", "last"])
- def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals(
- func: str, observed: bool
- ):
- # GH 34951
- cat = Categorical([0, 0, 1, 1])
- val = [0, 1, 1, 0]
- df = DataFrame({"a": cat, "b": cat, "c": val})
- cat2 = Categorical([0, 1])
- idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"])
- expected_dict = {
- "first": Series([0, np.NaN, np.NaN, 1], idx, name="c"),
- "last": Series([1, np.NaN, np.NaN, 0], idx, name="c"),
- }
- expected = expected_dict[func]
- if observed:
- expected = expected.dropna().astype(np.int64)
- srs_grp = df.groupby(["a", "b"], observed=observed)["c"]
- result = getattr(srs_grp, func)()
- tm.assert_series_equal(result, expected)
- @pytest.mark.parametrize("func", ["first", "last"])
- def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals(
- func: str, observed: bool
- ):
- # GH 34951
- cat = Categorical([0, 0, 1, 1])
- val = [0, 1, 1, 0]
- df = DataFrame({"a": cat, "b": cat, "c": val})
- cat2 = Categorical([0, 1])
- idx = MultiIndex.from_product([cat2, cat2], names=["a", "b"])
- expected_dict = {
- "first": Series([0, np.NaN, np.NaN, 1], idx, name="c"),
- "last": Series([1, np.NaN, np.NaN, 0], idx, name="c"),
- }
- expected = expected_dict[func].to_frame()
- if observed:
- expected = expected.dropna().astype(np.int64)
- df_grp = df.groupby(["a", "b"], observed=observed)
- result = getattr(df_grp, func)()
- tm.assert_frame_equal(result, expected)
- def test_groupby_categorical_indices_unused_categories():
- # GH#38642
- df = DataFrame(
- {
- "key": Categorical(["b", "b", "a"], categories=["a", "b", "c"]),
- "col": range(3),
- }
- )
- grouped = df.groupby("key", sort=False)
- result = grouped.indices
- expected = {
- "b": np.array([0, 1], dtype="intp"),
- "a": np.array([2], dtype="intp"),
- "c": np.array([], dtype="intp"),
- }
- assert result.keys() == expected.keys()
- for key in result.keys():
- tm.assert_numpy_array_equal(result[key], expected[key])
- @pytest.mark.parametrize("func", ["first", "last"])
- def test_groupby_last_first_preserve_categoricaldtype(func):
- # GH#33090
- df = DataFrame({"a": [1, 2, 3]})
- df["b"] = df["a"].astype("category")
- result = getattr(df.groupby("a")["b"], func)()
- expected = Series(
- Categorical([1, 2, 3]), name="b", index=Index([1, 2, 3], name="a")
- )
- tm.assert_series_equal(expected, result)
- def test_groupby_categorical_observed_nunique():
- # GH#45128
- df = DataFrame({"a": [1, 2], "b": [1, 2], "c": [10, 11]})
- df = df.astype(dtype={"a": "category", "b": "category"})
- result = df.groupby(["a", "b"], observed=True).nunique()["c"]
- expected = Series(
- [1, 1],
- index=MultiIndex.from_arrays(
- [CategoricalIndex([1, 2], name="a"), CategoricalIndex([1, 2], name="b")]
- ),
- name="c",
- )
- tm.assert_series_equal(result, expected)
- def test_groupby_categorical_aggregate_functions():
- # GH#37275
- dtype = pd.CategoricalDtype(categories=["small", "big"], ordered=True)
- df = DataFrame(
- [[1, "small"], [1, "big"], [2, "small"]], columns=["grp", "description"]
- ).astype({"description": dtype})
- result = df.groupby("grp")["description"].max()
- expected = Series(
- ["big", "small"],
- index=Index([1, 2], name="grp"),
- name="description",
- dtype=pd.CategoricalDtype(categories=["small", "big"], ordered=True),
- )
- tm.assert_series_equal(result, expected)
- def test_groupby_categorical_dropna(observed, dropna):
- # GH#48645 - dropna should have no impact on the result when there are no NA values
- cat = Categorical([1, 2], categories=[1, 2, 3])
- df = DataFrame({"x": Categorical([1, 2], categories=[1, 2, 3]), "y": [3, 4]})
- gb = df.groupby("x", observed=observed, dropna=dropna)
- result = gb.sum()
- if observed:
- expected = DataFrame({"y": [3, 4]}, index=cat)
- else:
- index = CategoricalIndex([1, 2, 3], [1, 2, 3])
- expected = DataFrame({"y": [3, 4, 0]}, index=index)
- expected.index.name = "x"
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
- @pytest.mark.parametrize("ordered", [True, False])
- def test_category_order_reducer(
- request, as_index, sort, observed, reduction_func, index_kind, ordered
- ):
- # GH#48749
- if (
- reduction_func in ("idxmax", "idxmin")
- and not observed
- and index_kind != "multi"
- ):
- msg = "GH#10694 - idxmax/min fail with unused categories"
- request.node.add_marker(pytest.mark.xfail(reason=msg))
- elif reduction_func == "corrwith" and not as_index:
- msg = "GH#49950 - corrwith with as_index=False may not have grouping column"
- request.node.add_marker(pytest.mark.xfail(reason=msg))
- elif index_kind != "range" and not as_index:
- pytest.skip(reason="Result doesn't have categories, nothing to test")
- df = DataFrame(
- {
- "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
- "b": range(4),
- }
- )
- if index_kind == "range":
- keys = ["a"]
- elif index_kind == "single":
- keys = ["a"]
- df = df.set_index(keys)
- elif index_kind == "multi":
- keys = ["a", "a2"]
- df["a2"] = df["a"]
- df = df.set_index(keys)
- args = get_groupby_method_args(reduction_func, df)
- gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
- op_result = getattr(gb, reduction_func)(*args)
- if as_index:
- result = op_result.index.get_level_values("a").categories
- else:
- result = op_result["a"].cat.categories
- expected = Index([1, 4, 3, 2])
- tm.assert_index_equal(result, expected)
- if index_kind == "multi":
- result = op_result.index.get_level_values("a2").categories
- tm.assert_index_equal(result, expected)
- @pytest.mark.parametrize("index_kind", ["single", "multi"])
- @pytest.mark.parametrize("ordered", [True, False])
- def test_category_order_transformer(
- as_index, sort, observed, transformation_func, index_kind, ordered
- ):
- # GH#48749
- df = DataFrame(
- {
- "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
- "b": range(4),
- }
- )
- if index_kind == "single":
- keys = ["a"]
- df = df.set_index(keys)
- elif index_kind == "multi":
- keys = ["a", "a2"]
- df["a2"] = df["a"]
- df = df.set_index(keys)
- args = get_groupby_method_args(transformation_func, df)
- gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
- op_result = getattr(gb, transformation_func)(*args)
- result = op_result.index.get_level_values("a").categories
- expected = Index([1, 4, 3, 2])
- tm.assert_index_equal(result, expected)
- if index_kind == "multi":
- result = op_result.index.get_level_values("a2").categories
- tm.assert_index_equal(result, expected)
- @pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
- @pytest.mark.parametrize("method", ["head", "tail"])
- @pytest.mark.parametrize("ordered", [True, False])
- def test_category_order_head_tail(
- as_index, sort, observed, method, index_kind, ordered
- ):
- # GH#48749
- df = DataFrame(
- {
- "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
- "b": range(4),
- }
- )
- if index_kind == "range":
- keys = ["a"]
- elif index_kind == "single":
- keys = ["a"]
- df = df.set_index(keys)
- elif index_kind == "multi":
- keys = ["a", "a2"]
- df["a2"] = df["a"]
- df = df.set_index(keys)
- gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
- op_result = getattr(gb, method)()
- if index_kind == "range":
- result = op_result["a"].cat.categories
- else:
- result = op_result.index.get_level_values("a").categories
- expected = Index([1, 4, 3, 2])
- tm.assert_index_equal(result, expected)
- if index_kind == "multi":
- result = op_result.index.get_level_values("a2").categories
- tm.assert_index_equal(result, expected)
- @pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
- @pytest.mark.parametrize("method", ["apply", "agg", "transform"])
- @pytest.mark.parametrize("ordered", [True, False])
- def test_category_order_apply(as_index, sort, observed, method, index_kind, ordered):
- # GH#48749
- if (method == "transform" and index_kind == "range") or (
- not as_index and index_kind != "range"
- ):
- pytest.skip("No categories in result, nothing to test")
- df = DataFrame(
- {
- "a": Categorical([2, 1, 2, 3], categories=[1, 4, 3, 2], ordered=ordered),
- "b": range(4),
- }
- )
- if index_kind == "range":
- keys = ["a"]
- elif index_kind == "single":
- keys = ["a"]
- df = df.set_index(keys)
- elif index_kind == "multi":
- keys = ["a", "a2"]
- df["a2"] = df["a"]
- df = df.set_index(keys)
- gb = df.groupby(keys, as_index=as_index, sort=sort, observed=observed)
- op_result = getattr(gb, method)(lambda x: x.sum(numeric_only=True))
- if (method == "transform" or not as_index) and index_kind == "range":
- result = op_result["a"].cat.categories
- else:
- result = op_result.index.get_level_values("a").categories
- expected = Index([1, 4, 3, 2])
- tm.assert_index_equal(result, expected)
- if index_kind == "multi":
- result = op_result.index.get_level_values("a2").categories
- tm.assert_index_equal(result, expected)
- @pytest.mark.parametrize("index_kind", ["range", "single", "multi"])
- def test_many_categories(as_index, sort, index_kind, ordered):
- # GH#48749 - Test when the grouper has many categories
- if index_kind != "range" and not as_index:
- pytest.skip(reason="Result doesn't have categories, nothing to test")
- categories = np.arange(9999, -1, -1)
- grouper = Categorical([2, 1, 2, 3], categories=categories, ordered=ordered)
- df = DataFrame({"a": grouper, "b": range(4)})
- if index_kind == "range":
- keys = ["a"]
- elif index_kind == "single":
- keys = ["a"]
- df = df.set_index(keys)
- elif index_kind == "multi":
- keys = ["a", "a2"]
- df["a2"] = df["a"]
- df = df.set_index(keys)
- gb = df.groupby(keys, as_index=as_index, sort=sort, observed=True)
- result = gb.sum()
- # Test is setup so that data and index are the same values
- data = [3, 2, 1] if sort else [2, 1, 3]
- index = CategoricalIndex(
- data, categories=grouper.categories, ordered=ordered, name="a"
- )
- if as_index:
- expected = DataFrame({"b": data})
- if index_kind == "multi":
- expected.index = MultiIndex.from_frame(DataFrame({"a": index, "a2": index}))
- else:
- expected.index = index
- elif index_kind == "multi":
- expected = DataFrame({"a": Series(index), "a2": Series(index), "b": data})
- else:
- expected = DataFrame({"a": Series(index), "b": data})
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("test_series", [True, False])
- @pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
- def test_agg_list(request, as_index, observed, reduction_func, test_series, keys):
- # GH#52760
- if test_series and reduction_func == "corrwith":
- assert not hasattr(SeriesGroupBy, "corrwith")
- pytest.skip("corrwith not implemented for SeriesGroupBy")
- elif reduction_func == "corrwith":
- msg = "GH#32293: attempts to call SeriesGroupBy.corrwith"
- request.node.add_marker(pytest.mark.xfail(reason=msg))
- elif (
- reduction_func == "nunique"
- and not test_series
- and len(keys) != 1
- and not observed
- and not as_index
- ):
- msg = "GH#52848 - raises a ValueError"
- request.node.add_marker(pytest.mark.xfail(reason=msg))
- df = DataFrame({"a1": [0, 0, 1], "a2": [2, 3, 3], "b": [4, 5, 6]})
- df = df.astype({"a1": "category", "a2": "category"})
- if "a2" not in keys:
- df = df.drop(columns="a2")
- gb = df.groupby(by=keys, as_index=as_index, observed=observed)
- if test_series:
- gb = gb["b"]
- args = get_groupby_method_args(reduction_func, df)
- result = gb.agg([reduction_func], *args)
- expected = getattr(gb, reduction_func)(*args)
- if as_index and (test_series or reduction_func == "size"):
- expected = expected.to_frame(reduction_func)
- if not test_series:
- if not as_index:
- # TODO: GH#52849 - as_index=False is not respected
- expected = expected.set_index(keys)
- expected.columns = MultiIndex(
- levels=[["b"], [reduction_func]], codes=[[0], [0]]
- )
- elif not as_index:
- expected.columns = keys + [reduction_func]
- tm.assert_equal(result, expected)
|