12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637 |
- import builtins
- from io import StringIO
- import numpy as np
- import pytest
- from pandas._libs import lib
- from pandas.errors import UnsupportedFunctionCall
- import pandas as pd
- from pandas import (
- DataFrame,
- Index,
- MultiIndex,
- Series,
- Timestamp,
- date_range,
- )
- import pandas._testing as tm
- from pandas.core import nanops
- from pandas.tests.groupby import get_groupby_method_args
- from pandas.util import _test_decorators as td
@pytest.fixture(
    params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"],
    ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"],
)
def dtypes_for_minmax(request):
    """
    Fixture of dtypes with min and max values used for testing
    cummin and cummax.

    Returns
    -------
    tuple
        ``(dtype, min_val, max_val)`` where ``dtype`` is the parametrized
        value and the bounds are read from ``np.iinfo`` / ``np.finfo`` of the
        corresponding NumPy type.
    """
    dtype = request.param

    # The nullable dtypes are passed as strings; use the NumPy type with the
    # same name to look up the limits.
    np_type = dtype
    if dtype == "Int64":
        np_type = np.int64
    elif dtype == "Float64":
        np_type = np.float64

    # Pick the limits object once rather than branching for min and for max.
    info = np.iinfo(np_type) if np.dtype(np_type).kind == "i" else np.finfo(np_type)
    return (dtype, info.min, info.max)
def test_intercept_builtin_sum():
    """Builtin ``sum`` given to ``agg``/``apply`` must match ``groupby.sum()``."""
    s = Series([1.0, 2.0, np.nan, 3.0])
    grouped = s.groupby([0, 1, 2, 2])

    result = grouped.agg(builtins.sum)
    result2 = grouped.apply(builtins.sum)
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)
    tm.assert_series_equal(result2, expected)
@pytest.mark.parametrize("f", [max, min, sum])
@pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]])  # Single key  # Multi-key
def test_builtins_apply(keys, f):
    """Applying builtin max/min/sum per group must agree with the NumPy equivalent."""
    # see gh-8155
    df = DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"])
    df["jolie"] = np.random.randn(1000)

    gb = df.groupby(keys)

    fname = f.__name__
    result = gb.apply(f)
    ngroups = len(df.drop_duplicates(subset=keys))

    assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
    assert result.shape == (ngroups, 3), assert_msg

    def npfunc(x):
        # numpy's equivalent function
        return getattr(np, fname)(x, axis=0)

    expected = gb.apply(npfunc)
    tm.assert_frame_equal(result, expected)

    with tm.assert_produces_warning(None):
        expected2 = gb.apply(lambda x: npfunc(x))
    tm.assert_frame_equal(result, expected2)

    if f != sum:
        expected = gb.agg(fname).reset_index()
        # Keep the key column(s) as both index and data, like the apply result.
        expected = expected.set_index(keys, drop=False)
        tm.assert_frame_equal(result, expected, check_dtype=False)

    tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0))
class TestNumericOnly:
    # make sure that we are passing thru kwargs to our agg functions

    @pytest.fixture
    def df(self):
        """Frame with one grouping column and one column per dtype under test."""
        # GH3668
        # GH5724
        return DataFrame(
            {
                "group": [1, 1, 2],
                "int": [1, 2, 3],
                "float": [4.0, 5.0, 6.0],
                "string": list("abc"),
                "category_string": Series(list("abc")).astype("category"),
                "category_int": [7, 8, 9],
                "datetime": date_range("20130101", periods=3),
                "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
                "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
            },
            columns=[
                "group",
                "int",
                "float",
                "string",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ],
        )

    @pytest.mark.parametrize("method", ["mean", "median"])
    def test_averages(self, df, method):
        # mean / median
        expected_columns_numeric = Index(["int", "float", "category_int"])

        gb = df.groupby("group")
        # The ``columns`` argument restricts the expected frame to the three
        # listed columns even though more keys are supplied in the dict.
        expected = DataFrame(
            {
                "category_int": [7.5, 9],
                "float": [4.5, 6.0],
                "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
                "int": [1.5, 3],
                "datetime": [
                    Timestamp("2013-01-01 12:00:00"),
                    Timestamp("2013-01-03 00:00:00"),
                ],
                "datetimetz": [
                    Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
                    Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
                ],
            },
            index=Index([1, 2], name="group"),
            columns=[
                "int",
                "float",
                "category_int",
            ],
        )

        result = getattr(gb, method)(numeric_only=True)
        tm.assert_frame_equal(result.reindex_like(expected), expected)

        expected_columns = expected.columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_extrema(self, df, method):
        # TODO: min, max *should* handle
        # categorical (ordered) dtype

        expected_columns = Index(
            [
                "int",
                "float",
                "string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["first", "last"])
    def test_first_last(self, df, method):
        expected_columns = Index(
            [
                "int",
                "float",
                "string",
                "category_string",
                "category_int",
                "datetime",
                "datetimetz",
                "timedelta",
            ]
        )
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["sum", "cumsum"])
    def test_sum_cumsum(self, df, method):
        expected_columns_numeric = Index(["int", "float", "category_int"])
        expected_columns = Index(
            ["int", "float", "string", "category_int", "timedelta"]
        )
        if method == "cumsum":
            # cumsum loses string
            expected_columns = Index(["int", "float", "category_int", "timedelta"])

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["prod", "cumprod"])
    def test_prod_cumprod(self, df, method):
        expected_columns = Index(["int", "float", "category_int"])
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    @pytest.mark.parametrize("method", ["cummin", "cummax"])
    def test_cummin_cummax(self, df, method):
        # like min, max, but don't include strings
        expected_columns = Index(
            ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
        )

        # GH#15561: numeric_only=False set by default like min/max
        expected_columns_numeric = expected_columns

        self._check(df, method, expected_columns, expected_columns_numeric)

    def _check(self, df, method, expected_columns, expected_columns_numeric):
        """
        Call ``method`` on ``df.groupby("group")`` with default arguments and
        with ``numeric_only=False``; each call must either raise with one of
        the known messages or produce the given expected columns.
        """
        gb = df.groupby("group")

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        exception = NotImplementedError if method.startswith("cum") else TypeError

        # First pass: call with default arguments.
        if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
            # The methods default to numeric_only=False and raise TypeError
            msg = "|".join(
                [
                    "Categorical is not ordered",
                    "function is not implemented for this dtype",
                    f"Cannot perform {method} with non-ordered Categorical",
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)()
        elif method in ("sum", "mean", "median", "prod"):
            msg = "|".join(
                [
                    "category type does not support sum operations",
                    "[Cc]ould not convert",
                    "can't multiply sequence by non-int of type 'str'",
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)()
        else:
            result = getattr(gb, method)()
            tm.assert_index_equal(result.columns, expected_columns_numeric)

        # Second pass: call with an explicit numeric_only=False.
        if method not in ("first", "last"):
            msg = "|".join(
                [
                    "[Cc]ould not convert",
                    "Categorical is not ordered",
                    "category type does not support",
                    "can't multiply sequence",
                    "function is not implemented for this dtype",
                    f"Cannot perform {method} with non-ordered Categorical",
                ]
            )
            with pytest.raises(exception, match=msg):
                getattr(gb, method)(numeric_only=False)
        else:
            result = getattr(gb, method)(numeric_only=False)
            tm.assert_index_equal(result.columns, expected_columns)
class TestGroupByNonCythonPaths:
    # GH#5610 non-cython calls should not include the grouper
    # Tests for code not expected to go through cython paths.

    @pytest.fixture
    def df(self):
        """Three-row frame with key column "A", a partly-NaN "B" and a string "C"."""
        return DataFrame(
            [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
            columns=["A", "B", "C"],
        )

    @pytest.fixture
    def gb(self, df):
        """``df`` grouped by "A" with the default ``as_index``."""
        return df.groupby("A")

    @pytest.fixture
    def gni(self, df):
        """``df`` grouped by "A" with ``as_index=False``."""
        return df.groupby("A", as_index=False)

    def test_describe(self, df, gb, gni):
        # describe
        expected_index = Index([1, 3], name="A")
        expected_col = MultiIndex(
            levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
            codes=[[0] * 8, list(range(8))],
        )
        expected = DataFrame(
            [
                [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
                [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
            ],
            index=expected_index,
            columns=expected_col,
        )
        result = gb.describe()
        tm.assert_frame_equal(result, expected)

        # With as_index=False the group key is expected as a regular column.
        expected = expected.reset_index()
        result = gni.describe()
        tm.assert_frame_equal(result, expected)
def test_cython_api2():
    """Check groupby cumsum/cumprod results, including ``as_index`` and ``axis=1``."""
    # this takes the fast apply path

    # cumsum (GH5614)
    df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
    expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
    result = df.groupby("A").cumsum()
    tm.assert_frame_equal(result, expected)

    # GH 5755 - cumsum is a transformer and should ignore as_index
    result = df.groupby("A", as_index=False).cumsum()
    tm.assert_frame_equal(result, expected)

    # GH 13994
    result = df.groupby("A").cumsum(axis=1)
    expected = df.cumsum(axis=1)
    tm.assert_frame_equal(result, expected)
    result = df.groupby("A").cumprod(axis=1)
    expected = df.cumprod(axis=1)
    tm.assert_frame_equal(result, expected)
def test_cython_median():
    """``groupby.median`` must match aggregating with ``nanmedian`` / ``np.median``."""
    arr = np.random.randn(1000)
    arr[::2] = np.nan
    df = DataFrame(arr)

    labels = np.random.randint(0, 50, size=1000).astype(float)
    labels[::17] = np.nan

    result = df.groupby(labels).median()
    exp = df.groupby(labels).agg(nanops.nanmedian)
    tm.assert_frame_equal(result, exp)

    df = DataFrame(np.random.randn(1000, 5))
    rs = df.groupby(labels).agg(np.median)
    xp = df.groupby(labels).median()
    tm.assert_frame_equal(rs, xp)
def test_median_empty_bins(observed):
    """Median over ``pd.cut`` bins must match a per-group ``x.median()`` lambda."""
    df = DataFrame(np.random.randint(0, 44, 500))

    # Bin edges run up to 50 while the data stays below 44.
    grps = range(0, 55, 5)
    bins = pd.cut(df[0], grps)

    result = df.groupby(bins, observed=observed).median()
    expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
)
@pytest.mark.parametrize(
    "method,data",
    [
        ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
        ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
        ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
        ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
        ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}),
    ],
)
def test_groupby_non_arithmetic_agg_types(dtype, method, data):
    """Non-arithmetic aggregations keep the input dtype unless ``out_type`` is given."""
    # GH9311, GH6620
    df = DataFrame(
        [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}]
    )

    df["b"] = df.b.astype(dtype)

    # ``data`` is shared between the ``dtype`` parametrizations, so read the
    # optional keys with defaults instead of writing them into the dict.
    args = data.get("args", [])
    out_type = data.get("out_type", dtype)

    df_out = DataFrame(data["df"])
    df_out["b"] = df_out.b.astype(out_type)
    df_out = df_out.set_index("a")

    grpd = df.groupby("a")
    t = getattr(grpd, method)(*args)
    tm.assert_frame_equal(t, df_out)
@pytest.mark.parametrize(
    "i",
    [
        (
            Timestamp("2011-01-15 12:50:28.502376"),
            Timestamp("2011-01-20 12:50:28.593448"),
        ),
        (24650000000000001, 24650000000000002),
    ],
)
def test_groupby_non_arithmetic_agg_int_like_precision(i):
    """Each listed aggregation must return the exact input value for the group."""
    # see gh-6620, gh-9311
    df = DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}])

    grp_exp = {
        "first": {"expected": i[0]},
        "last": {"expected": i[1]},
        "min": {"expected": i[0]},
        "max": {"expected": i[1]},
        "nth": {"expected": i[1], "args": [1]},
        "count": {"expected": 2},
    }

    # The grouping does not depend on the method, so build it once.
    grouped = df.groupby("a")
    for method, data in grp_exp.items():
        res = getattr(grouped, method)(*data.get("args", []))
        assert res.iloc[0].b == data["expected"]
@pytest.mark.parametrize(
    "func, values",
    [
        ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}),
        ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}),
    ],
)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_idxmin_idxmax_returns_int_types(func, values, numeric_only):
    """idxmin/idxmax per group must return the expected row labels per column."""
    # GH 25444
    df = DataFrame(
        {
            "name": ["A", "A", "B", "B"],
            "c_int": [1, 2, 3, 4],
            "c_float": [4.02, 3.03, 2.04, 1.05],
            "c_date": ["2019", "2018", "2016", "2017"],
        }
    )
    df["c_date"] = pd.to_datetime(df["c_date"])
    df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific")
    df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0]
    df["c_period"] = df["c_date"].dt.to_period("W")
    df["c_Integer"] = df["c_int"].astype("Int64")
    df["c_Floating"] = df["c_float"].astype("Float64")

    result = getattr(df.groupby("name"), func)(numeric_only=numeric_only)

    expected = DataFrame(values, index=Index(["A", "B"], name="name"))
    if numeric_only:
        expected = expected.drop(columns=["c_date"])
    else:
        # Columns derived from c_date are expected to share its positions.
        expected["c_date_tz"] = expected["c_date"]
        expected["c_timedelta"] = expected["c_date"]
        expected["c_period"] = expected["c_date"]
    # The masked columns mirror the NumPy-backed ones they were cast from.
    expected["c_Integer"] = expected["c_int"]
    expected["c_Floating"] = expected["c_float"]

    tm.assert_frame_equal(result, expected)
def test_idxmin_idxmax_axis1():
    """groupby ``idxmax(axis=1)`` matches the frame-level result; datetimes raise."""
    df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
    df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]

    gb = df.groupby("A")

    res = gb.idxmax(axis=1)

    # Compare against idxmax over the non-key columns, aligned on the inner
    # index level of the grouped result.
    alt = df.iloc[:, 1:].idxmax(axis=1)
    indexer = res.index.get_level_values(1)

    tm.assert_series_equal(alt[indexer], res.droplevel("A"))

    df["E"] = date_range("2016-01-01", periods=10)
    gb2 = df.groupby("A")

    msg = "reduction operation 'argmax' not allowed for this dtype"
    with pytest.raises(TypeError, match=msg):
        gb2.idxmax(axis=1)
@pytest.mark.parametrize("numeric_only", [True, False, None])
def test_axis1_numeric_only(request, groupby_func, numeric_only):
    """
    Call every groupby method with ``axis=1`` (and optionally ``numeric_only``)
    and check it either raises the expected TypeError or matches the
    transposed frame-level computation.
    """
    if groupby_func in ("idxmax", "idxmin"):
        pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1")
    if groupby_func in ("corrwith", "skew"):
        msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1"
        request.node.add_marker(pytest.mark.xfail(reason=msg))

    df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
    df["E"] = "x"
    groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]
    gb = df.groupby(groups)
    method = getattr(gb, groupby_func)
    args = get_groupby_method_args(groupby_func, df)
    kwargs = {"axis": 1}
    if numeric_only is not None:
        # when numeric_only is None we don't pass any argument
        kwargs["numeric_only"] = numeric_only

    # Functions without numeric_only and axis args
    no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift")
    # Functions with axis args
    has_axis = (
        "cumprod",
        "cumsum",
        "diff",
        "pct_change",
        "rank",
        "shift",
        "cummax",
        "cummin",
        "idxmin",
        "idxmax",
        "fillna",
    )
    if numeric_only is not None and groupby_func in no_args:
        msg = "got an unexpected keyword argument 'numeric_only'"
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
    elif groupby_func not in has_axis:
        msg = "got an unexpected keyword argument 'axis'"
        with pytest.raises(TypeError, match=msg):
            method(*args, **kwargs)
    # fillna and shift are successful even on object dtypes
    # (``not numeric_only`` is true for both None and False)
    elif not numeric_only and groupby_func not in ("fillna", "shift"):
        msgs = (
            # cummax, cummin, rank
            "not supported between instances of",
            # cumprod
            "can't multiply sequence by non-int of type 'float'",
            # cumsum, diff, pct_change
            "unsupported operand type",
        )
        with pytest.raises(TypeError, match=f"({'|'.join(msgs)})"):
            method(*args, **kwargs)
    else:
        result = method(*args, **kwargs)

        df_expected = df.drop(columns="E").T if numeric_only else df.T
        expected = getattr(df_expected, groupby_func)(*args).T
        if groupby_func == "shift" and not numeric_only:
            # shift with axis=1 leaves the leftmost column as numeric
            # but transposing for expected gives us object dtype
            expected = expected.astype(float)

        tm.assert_equal(result, expected)
def test_groupby_cumprod():
    """Grouped ``cumprod`` must equal applying ``Series.cumprod`` per group."""
    # GH 4095
    df = DataFrame({"key": ["b"] * 10, "value": 2})

    actual = df.groupby("key")["value"].cumprod()
    expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
    expected.name = "value"
    tm.assert_series_equal(actual, expected)

    # Same comparison on a longer group after casting the values to float.
    df = DataFrame({"key": ["b"] * 100, "value": 2})
    df["value"] = df["value"].astype(float)
    actual = df.groupby("key")["value"].cumprod()
    expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
    expected.name = "value"
    tm.assert_series_equal(actual, expected)
def test_groupby_cumprod_overflow():
    """Grouped ``cumprod`` that overflows must match the per-group Series result."""
    # GH#37493 if we overflow we return garbage consistent with numpy
    df = DataFrame({"key": ["b"] * 4, "value": 100_000})
    actual = df.groupby("key")["value"].cumprod()
    expected = Series(
        [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920],
        name="value",
    )
    tm.assert_series_equal(actual, expected)

    numpy_result = df.groupby("key", group_keys=False)["value"].apply(
        lambda x: x.cumprod()
    )
    numpy_result.name = "value"
    tm.assert_series_equal(actual, numpy_result)
def test_groupby_cumprod_nan_influences_other_columns():
    """With ``skipna=False`` a NaN in column "b" must not affect column "c"."""
    # GH#48064
    df = DataFrame(
        {
            "a": 1,
            "b": [1, np.nan, 2],
            "c": [1, 2, 3.0],
        }
    )
    result = df.groupby("a").cumprod(numeric_only=True, skipna=False)
    expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]})
    tm.assert_frame_equal(result, expected)
def scipy_sem(*args, **kwargs):
    """Call ``scipy.stats.sem`` with ``ddof=1``, forwarding all other arguments."""
    # Imported lazily so that importing this module does not require scipy.
    from scipy.stats import sem

    return sem(*args, ddof=1, **kwargs)
@pytest.mark.parametrize(
    "op,targop",
    [
        ("mean", np.mean),
        ("median", np.median),
        ("std", np.std),
        ("var", np.var),
        ("sum", np.sum),
        ("prod", np.prod),
        ("min", np.min),
        ("max", np.max),
        ("first", lambda x: x.iloc[0]),
        ("last", lambda x: x.iloc[-1]),
        ("count", np.size),
        pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy),
    ],
)
def test_ops_general(op, targop):
    """The named groupby method must match ``agg`` with the paired callable."""
    df = DataFrame(np.random.randn(1000))
    labels = np.random.randint(0, 50, size=1000).astype(float)

    result = getattr(df.groupby(labels), op)()
    expected = df.groupby(labels).agg(targop)
    tm.assert_frame_equal(result, expected)
def test_max_nan_bug():
    """``max`` over the string "File" column must not produce missing values."""
    raw = """,Date,app,File
-04-23,2013-04-23 00:00:00,,log080001.log
-05-06,2013-05-06 00:00:00,,log.log
-05-07,2013-05-07 00:00:00,OE,xlsx"""

    with tm.assert_produces_warning(UserWarning, match="Could not infer format"):
        df = pd.read_csv(StringIO(raw), parse_dates=[0])
    gb = df.groupby("Date")

    # Frame selection and Series selection must give the same maxima.
    r = gb[["File"]].max()
    e = gb["File"].max().to_frame()
    tm.assert_frame_equal(r, e)
    assert not r["File"].isna().any()
def test_nlargest():
    """``SeriesGroupBy.nlargest`` returns the top values per group, keyed by label."""
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list("a" * 5 + "b" * 5))
    gb = a.groupby(b)
    r = gb.nlargest(3)
    e = Series(
        [7, 5, 3, 10, 9, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
    )
    tm.assert_series_equal(r, e)

    # Data with duplicated values, selected with keep="last".
    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series(
        [3, 2, 1, 3, 3, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
    )
    tm.assert_series_equal(gb.nlargest(3, keep="last"), e)
def test_nlargest_mi_grouper():
    """``nlargest(1)`` grouped by a MultiIndex level keeps the full original index."""
    # see gh-21411
    npr = np.random.RandomState(123456789)

    dts = date_range("20180101", periods=10)
    iterables = [dts, ["one", "two"]]

    idx = MultiIndex.from_product(iterables, names=["first", "second"])
    s = Series(npr.randn(20), index=idx)

    result = s.groupby("first").nlargest(1)

    # The group key is prepended, so "first" appears twice in the result index.
    exp_idx = MultiIndex.from_tuples(
        [
            (dts[0], dts[0], "one"),
            (dts[1], dts[1], "one"),
            (dts[2], dts[2], "one"),
            (dts[3], dts[3], "two"),
            (dts[4], dts[4], "one"),
            (dts[5], dts[5], "one"),
            (dts[6], dts[6], "one"),
            (dts[7], dts[7], "one"),
            (dts[8], dts[8], "two"),
            (dts[9], dts[9], "one"),
        ],
        names=["first", "first", "second"],
    )

    exp_values = [
        2.2129019979039612,
        1.8417114045748335,
        0.858963679564603,
        1.3759151378258088,
        0.9430284594687134,
        0.5296914208183142,
        0.8318045593815487,
        -0.8476703342910327,
        0.3804446884133735,
        -0.8028845810770998,
    ]

    expected = Series(exp_values, index=exp_idx)
    tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
def test_nsmallest():
    """``SeriesGroupBy.nsmallest`` returns the bottom values per group, keyed by label."""
    a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
    b = Series(list("a" * 5 + "b" * 5))
    gb = a.groupby(b)
    r = gb.nsmallest(3)
    e = Series(
        [1, 2, 3, 0, 4, 6],
        index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
    )
    tm.assert_series_equal(r, e)

    # Data with duplicated values, selected with keep="last".
    a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
    gb = a.groupby(b)
    e = Series(
        [0, 1, 1, 0, 1, 2],
        index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
    )
    tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)
@pytest.mark.parametrize(
    "data, groups",
    [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
)
@pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
@pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
def test_nlargest_and_smallest_noop(data, groups, dtype, method):
    # GH 15272, GH 16345, GH 29129
    # Test nlargest/smallest when it results in a noop,
    # i.e. input is sorted and group size <= n
    if dtype is not None:
        data = np.array(data, dtype=dtype)
    if method == "nlargest":
        # nlargest needs descending input for the call to be a no-op.
        data = list(reversed(data))
    ser = Series(data, name="a")
    result = getattr(ser.groupby(groups), method)(n=2)
    expidx = np.array(groups, dtype=np.int_) if isinstance(groups, list) else groups
    expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a")
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("func", ["cumprod", "cumsum"])
def test_numpy_compat(func):
    """Extra positional or keyword arguments must raise UnsupportedFunctionCall."""
    # see gh-12811
    df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
    g = df.groupby("A")

    msg = "numpy operations are not valid with groupby"

    with pytest.raises(UnsupportedFunctionCall, match=msg):
        getattr(g, func)(1, 2, 3)
    with pytest.raises(UnsupportedFunctionCall, match=msg):
        getattr(g, func)(foo=1)
def test_cummin(dtypes_for_minmax):
    """Grouped ``cummin`` across dtypes, at the dtype minimum, with NaN and datetimes."""
    dtype, min_val, _ = dtypes_for_minmax

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]

    df = base_df.astype(dtype)

    expected = DataFrame({"B": expected_mins}).astype(dtype)
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test w/ min value for dtype
    df.loc[[2, 6], "B"] = min_val
    df.loc[[1, 5], "B"] = min_val + 1
    expected.loc[[2, 3, 6, 7], "B"] = min_val
    expected.loc[[1, 5], "B"] = min_val + 1  # should not be rounded to min_val
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected, check_exact=True)
    expected = (
        df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    )
    tm.assert_frame_equal(result, expected, check_exact=True)

    # Test nan in some values
    # Explicit cast to float to avoid implicit cast when setting nan
    base_df = base_df.astype({"B": "float"})
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
    result = base_df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    expected = (
        base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # GH 15561
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")

    result = df.groupby("a")["b"].cummin()
    tm.assert_series_equal(expected, result)

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
    result = df.groupby("a").b.cummin()
    expected = Series([1, 2, 1], name="b")
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"])
def test_cummin_max_all_nan_column(method, dtype):
    """An all-missing column must stay all-missing with its dtype preserved."""
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})
    base_df["B"] = base_df["B"].astype(dtype)
    grouped = base_df.groupby("A")

    expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype)
    result = getattr(grouped, method)()
    tm.assert_frame_equal(expected, result)

    # Same check through the SeriesGroupBy path.
    result = getattr(grouped["B"], method)().to_frame()
    tm.assert_frame_equal(expected, result)
def test_cummax(dtypes_for_minmax):
    """Grouped ``cummax`` across dtypes, at the dtype maximum, with NaN and datetimes."""
    dtype, _, max_val = dtypes_for_minmax

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

    df = base_df.astype(dtype)

    expected = DataFrame({"B": expected_maxs}).astype(dtype)
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test w/ max value for dtype
    df.loc[[2, 6], "B"] = max_val
    expected.loc[[2, 3, 6, 7], "B"] = max_val
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    expected = (
        df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # Test nan in some values
    # Explicit cast to float to avoid implicit cast when setting nan
    base_df = base_df.astype({"B": "float"})
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
    result = base_df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    expected = (
        base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # GH 15561
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")

    result = df.groupby("a")["b"].cummax()
    tm.assert_series_equal(expected, result)

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
    result = df.groupby("a").b.cummax()
    expected = Series([2, 1, 2], name="b")
    tm.assert_series_equal(result, expected)
def test_cummax_i8_at_implementation_bound():
    """``cummax`` on increasing values starting at ``NaT._value`` returns them unchanged."""
    # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT
    # for int64 dtype GH#46382
    ser = Series([pd.NaT._value + n for n in range(5)])
    # "C" reinterprets the same integers as datetime64[ns].
    df = DataFrame({"A": 1, "B": ser, "C": ser.view("M8[ns]")})
    gb = df.groupby("A")

    res = gb.cummax()
    exp = df[["B", "C"]]
    tm.assert_frame_equal(res, exp)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
    "groups,expected_data",
    [
        ([1, 1, 1], [1, None, None]),
        ([1, 2, 3], [1, None, 2]),
        ([1, 3, 3], [1, None, None]),
    ],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
    # GH-34047
    # With skipna=False a missing value carries through the rest of its group.
    df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
    snapshot = df.copy()

    grouped_col = df.groupby(groups)["a"]
    cumulative_op = getattr(grouped_col, method)
    result = cumulative_op(skipna=False)
    expected = Series(expected_data, dtype=dtype, name="a")

    # check we didn't accidentally alter df
    tm.assert_frame_equal(df, snapshot)

    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_max_skipna_multiple_cols(method):
    # Ensure missing value in "a" doesn't cause "b" to be nan-filled
    df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
    selected = df.groupby([1, 1, 1])[["a", "b"]]

    result = getattr(selected, method)(skipna=False)

    all_nan = [np.nan] * 3
    expected = DataFrame({"a": all_nan, "b": [2.0, 2.0, 2.0]})
    tm.assert_frame_equal(result, expected)
@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
    "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)]
)
def test_nullable_int_not_cast_as_float(method, dtype, val):
    # Neither the uint64 maximum nor 2**53 + 1 is exactly representable as
    # float64, so the values only come back unchanged if no float cast happens.
    data = [val, pd.NA]
    df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)

    result = df.groupby("grp").transform(method)

    expected = DataFrame({"b": data}, dtype=dtype)
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly increasing (T), strictly decreasing (F),
        # abs val increasing (F), non-strictly increasing (T)
        ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
            [True, False, True, False],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_increasing(in_vals, out_vals):
    # GH 17015
    # Eleven rows split into groups a, b, c (three rows each) and d (two rows).
    df = DataFrame(
        {
            "A": [str(i) for i in range(1, 12)],
            "B": list("aaabbbcccdd"),
            "C": in_vals,
        }
    )
    result = df.groupby("B").C.is_monotonic_increasing

    group_labels = Index(list("abcd"), name="B")
    expected = Series(index=group_labels, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    manual = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
    tm.assert_series_equal(result, manual)
@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly decreasing (T), strictly increasing (F),
        # abs val decreasing (F), non-strictly decreasing (T)
        ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
            [True, True, False, True],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_decreasing(in_vals, out_vals):
    # GH 17015
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }

    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_decreasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_decreasing,
    # mirroring the cross-check done for is_monotonic_increasing.
    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_decreasing)
    tm.assert_series_equal(result, expected)
- # describe
- # --------------------------------
def test_apply_describe_bug(mframe):
    # Smoke test: describe on a level-based groupby only has to run without raising.
    by_first_level = mframe.groupby(level="first")
    by_first_level.describe()  # it works!
def test_series_describe_multikey():
    # Columns of describe() must match the corresponding standalone reductions.
    ts = tm.makeTimeSeries()
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    for stat in ("mean", "std", "min"):
        reduced = getattr(grouped, stat)()
        tm.assert_series_equal(result[stat], reduced, check_names=False)
def test_series_describe_single():
    # describe() via apply must equal the stacked groupby describe().
    ts = tm.makeTimeSeries()
    by_month = ts.groupby(lambda x: x.month)

    result = by_month.apply(lambda x: x.describe())
    expected = by_month.describe().stack()

    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
def test_series_describe_as_index(as_index, keys):
    # GH#49256
    labels = ["one", "two", "two", "three", "two"]
    df = DataFrame({"key1": labels, "key2": labels, "foo2": [1, 2, 4, 4, 6]})

    result = df.groupby(keys, as_index=as_index)["foo2"].describe()

    expected = DataFrame(
        {
            "key1": ["one", "three", "two"],
            "count": [1.0, 1.0, 3.0],
            "mean": [1.0, 4.0, 4.0],
            "std": [np.nan, np.nan, 2.0],
            "min": [1.0, 4.0, 2.0],
            "25%": [1.0, 4.0, 3.0],
            "50%": [1.0, 4.0, 4.0],
            "75%": [1.0, 4.0, 5.0],
            "max": [1.0, 4.0, 6.0],
        }
    )
    if len(keys) == 2:
        # key2 holds the same labels as key1, so its expected column is a copy
        expected.insert(1, "key2", expected["key1"])
    if as_index:
        expected = expected.set_index(keys)

    tm.assert_frame_equal(result, expected)
def test_series_index_name(df):
    # Grouping by a Series names the result index after that Series.
    only_c = df.loc[:, ["C"]]
    aggregated = only_c.groupby(df["A"]).agg(lambda x: x.mean())
    assert aggregated.index.name == "A"
def test_frame_describe_multikey(tsframe):
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()

    def describe_column(col):
        # describe one column and nest its statistics under the column label
        stats = grouped[col].describe()
        n_stats = len(stats.columns)
        # GH 17464 - Remove duplicate MultiIndex levels
        nested_cols = MultiIndex(
            levels=[[col], stats.columns],
            codes=[[0] * n_stats, range(n_stats)],
        )
        return DataFrame(stats.values, columns=nested_cols, index=stats.index)

    expected = pd.concat([describe_column(col) for col in tsframe], axis=1)
    tm.assert_frame_equal(result, expected)

    # same check when grouping the columns instead of the rows
    groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
    result = groupedT.describe()
    expected = tsframe.describe().T
    # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
    expected.index = MultiIndex(
        levels=[[0, 1], expected.index],
        codes=[[0, 0, 1, 1], range(len(expected.index))],
    )
    tm.assert_frame_equal(result, expected)
def test_frame_describe_tupleindex():
    # GH 14848 - regression from 0.19.0 to 0.19.1
    df1 = DataFrame(
        {
            "x": [1, 2, 3, 4, 5] * 3,
            "y": [10, 20, 30, 40, 50] * 3,
            "z": [100, 200, 300, 400, 500] * 3,
        }
    )
    df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
    df2 = df1.rename(columns={"k": "key"})

    # the same error is expected whatever the tuple-valued key column is called
    msg = "Names should be list-like for a MultiIndex"
    for frame, key in ((df1, "k"), (df2, "key")):
        with pytest.raises(ValueError, match=msg):
            frame.groupby(key).describe()
def test_frame_describe_unstacked_format():
    # GH 4792
    stamps = [
        Timestamp("2011-01-06 10:59:05", tz=None),
        Timestamp("2011-01-06 12:43:33", tz=None),
        Timestamp("2011-01-06 12:54:09", tz=None),
    ]
    prices = dict(zip(stamps, [24990, 25499, 25499]))
    volumes = dict(zip(stamps, [1500000000, 5000000000, 100000000]))
    df = DataFrame({"PRICE": prices, "VOLUME": volumes})

    result = df.groupby("PRICE").VOLUME.describe()

    # one row per distinct price, built from a plain describe of the matching rows
    distinct_prices = [24990, 25499]
    data = [
        df[df.PRICE == price].VOLUME.describe().values.tolist()
        for price in distinct_prices
    ]
    expected = DataFrame(
        data,
        index=Index(distinct_prices, name="PRICE"),
        columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
    "ignore:"
    "indexing past lexsort depth may impact performance:"
    "pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_describe_with_duplicate_output_column_names(as_index, keys):
    # GH 35314
    df = DataFrame(
        {
            "a1": [99, 99, 99, 88, 88, 88],
            "a2": [99, 99, 99, 88, 88, 88],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [10, 20, 30, 40, 50, 60],
        },
        columns=["a1", "a2", "b", "b"],
        copy=False,
    )
    if keys == ["a1"]:
        df = df.drop(columns="a2")

    # statistics of column "b" for groups 88 and 99; listed once and repeated
    # because the frame carries two columns labelled "b"
    b_stats = [
        ("b", "count", 3.0, 3.0),
        ("b", "mean", 5.0, 2.0),
        ("b", "std", 1.0, 1.0),
        ("b", "min", 4.0, 1.0),
        ("b", "25%", 4.5, 1.5),
        ("b", "50%", 5.0, 2.0),
        ("b", "75%", 5.5, 2.5),
        ("b", "max", 6.0, 3.0),
    ]
    expected = DataFrame.from_records(b_stats * 2).set_index([0, 1]).T
    expected.columns.names = [None, None]

    if len(keys) == 2:
        expected.index = MultiIndex(
            levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
        )
    else:
        expected.index = Index([88, 99], name="a1")

    if not as_index:
        expected = expected.reset_index()

    result = df.groupby(keys, as_index=as_index).describe()

    tm.assert_frame_equal(result, expected)
def test_describe_duplicate_columns():
    # GH#50806
    df = DataFrame([[0, 1, 2, 3]])
    df.columns = [0, 1, 2, 0]
    result = df.groupby(df[1]).describe(percentiles=[])

    stat_names = ["count", "mean", "std", "min", "50%", "max"]
    n_stats = len(stat_names)

    def single_row(val):
        # one observation per described column: count 1, std NaN, rest == val
        row = [[1.0, val, np.nan, val, val, val]]
        return DataFrame(row, index=[1], columns=stat_names)

    expected = pd.concat([single_row(val) for val in (0.0, 2.0, 3.0)], axis=1)
    # described columns are labelled 0, 2, 0 in the frame
    expected.columns = MultiIndex(
        levels=[[0, 2], stat_names],
        codes=[
            n_stats * [0] + n_stats * [1] + n_stats * [0],
            3 * list(range(n_stats)),
        ],
    )
    expected.index.names = [1]
    tm.assert_frame_equal(result, expected)
def test_groupby_mean_no_overflow():
    # Regression test for (#22487)
    # The last value lies just below 2**64, above the int64 range.
    connections = [4970, 4749, 4719, 4704, 18446744073699999744]
    df = DataFrame({"user": ["A"] * 5, "connections": connections})

    mean_by_user = df.groupby("user")["connections"].mean()

    assert mean_by_user["A"] == 3689348814740003840
@pytest.mark.parametrize(
    "values",
    [
        {
            "a": [1, 1, 1, 2, 2, 2, 3, 3, 3],
            "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2],
        },
        {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]},
    ],
)
@pytest.mark.parametrize("function", ["mean", "median", "var"])
def test_apply_to_nullable_integer_returns_float(values, function):
    # https://github.com/pandas-dev/pandas/issues/32219
    # every group holds the values 1 and 2: mean/median 1.5, variance 0.5
    output = 0.5 if function == "var" else 1.5
    arr = np.array([output] * 3, dtype=float)
    idx = Index([1, 2, 3], name="a", dtype="Int64")
    expected = DataFrame({"b": arr}, index=idx).astype("Float64")

    groups = DataFrame(values, dtype="Int64").groupby("a")

    # direct method call and agg by name share the same expected frame
    direct = getattr(groups, function)()
    tm.assert_frame_equal(direct, expected)

    by_name = groups.agg(function)
    tm.assert_frame_equal(by_name, expected)

    # agg with a list adds a second column level carrying the function name
    by_list = groups.agg([function])
    expected.columns = MultiIndex.from_tuples([("b", function)])
    tm.assert_frame_equal(by_list, expected)
def test_groupby_sum_below_mincount_nullable_integer():
    # https://github.com/pandas-dev/pandas/issues/32861
    # every group has a single row, so min_count=2 is never met
    df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64")
    grouped = df.groupby("a")
    idx = Index([0, 1, 2], name="a", dtype="Int64")
    all_na = [pd.NA] * 3

    series_result = grouped["b"].sum(min_count=2)
    series_expected = Series(all_na, dtype="Int64", index=idx, name="b")
    tm.assert_series_equal(series_result, series_expected)

    frame_result = grouped.sum(min_count=2)
    frame_expected = DataFrame({"b": all_na, "c": all_na}, dtype="Int64", index=idx)
    tm.assert_frame_equal(frame_result, frame_expected)
def test_mean_on_timedelta():
    # GH 17382
    # "A" gets the even offsets 0..8 (mean 4), "B" the odd offsets 1..9 (mean 5)
    df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5})

    result = df.groupby("cat")["time"].mean()

    categories = Index(["A", "B"], name="cat")
    expected = Series(pd.to_timedelta([4, 5]), name="time", index=categories)
    tm.assert_series_equal(result, expected)
def test_groupby_sum_timedelta_with_nat():
    # GH#42659
    td3 = pd.Timedelta(days=3)
    df = DataFrame(
        {
            "a": [1, 1, 2, 2],
            "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT],
        }
    )
    gb = df.groupby("a")
    group_index = Index([1, 2], name="a")

    # both groups total three days: 1d + 2d, and 3d next to a NaT
    expected_frame = DataFrame({"b": [td3, td3]}, index=group_index)
    tm.assert_frame_equal(gb.sum(), expected_frame)

    tm.assert_series_equal(gb["b"].sum(), expected_frame["b"])

    # group 2 has only one non-NaT entry, so min_count=2 yields NaT for it
    res = gb["b"].sum(min_count=2)
    expected = Series(
        [td3, pd.NaT], dtype="m8[ns]", name="b", index=expected_frame.index
    )
    tm.assert_series_equal(res, expected)
@pytest.mark.parametrize(
    "kernel, has_arg",
    [
        ("all", False),
        ("any", False),
        ("bfill", False),
        ("corr", True),
        ("corrwith", True),
        ("cov", True),
        ("cummax", True),
        ("cummin", True),
        ("cumprod", True),
        ("cumsum", True),
        ("diff", False),
        ("ffill", False),
        ("fillna", False),
        ("first", True),
        ("idxmax", True),
        ("idxmin", True),
        ("last", True),
        ("max", True),
        ("mean", True),
        ("median", True),
        ("min", True),
        ("nth", False),
        ("nunique", False),
        ("pct_change", False),
        ("prod", True),
        ("quantile", True),
        ("sem", True),
        ("skew", True),
        ("std", True),
        ("sum", True),
        ("var", True),
    ],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_numeric_only(kernel, has_arg, numeric_only, keys):
    # GH#46072
    # has_arg: Whether the op has a numeric_only arg
    df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]})

    args = get_groupby_method_args(kernel, df)
    arg_omitted = numeric_only is lib.no_default
    kwargs = {} if arg_omitted else {"numeric_only": numeric_only}

    gb = df.groupby(keys)
    method = getattr(gb, kernel)

    # kernels that work on any dtype: either they take numeric_only
    # ("first"/"last"), or they don't and the argument was left out
    works_on_any_dtype = kernel in ("first", "last") or (
        kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique")
        and arg_omitted
    )

    if has_arg and numeric_only is True:
        # Cases where b does not appear in the result
        result = method(*args, **kwargs)
        assert "b" not in result.columns
    elif works_on_any_dtype:
        result = method(*args, **kwargs)
        assert "b" in result.columns
    elif has_arg or kernel in ("idxmax", "idxmin"):
        assert numeric_only is not True
        # kernels that are successful on any dtype were above; this will fail

        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        if kernel.startswith("cum"):
            exception = NotImplementedError
        else:
            exception = TypeError

        msg = "|".join(
            [
                "not allowed for this dtype",
                "must be a string or a number",
                "cannot be performed against 'object' dtypes",
                "must be a string or a real number",
                "unsupported operand type",
                "not supported between instances of",
                "function is not implemented for this dtype",
            ]
        )
        with pytest.raises(exception, match=msg):
            method(*args, **kwargs)
    elif not has_arg and not arg_omitted:
        with pytest.raises(
            TypeError, match="got an unexpected keyword argument 'numeric_only'"
        ):
            method(*args, **kwargs)
    else:
        assert kernel in ("diff", "pct_change")
        assert arg_omitted
        # Doesn't have numeric_only argument and fails on nuisance columns
        with pytest.raises(TypeError, match=r"unsupported operand type"):
            method(*args, **kwargs)
@pytest.mark.parametrize("dtype", [bool, int, float, object])
def test_deprecate_numeric_only_series(dtype, groupby_func, request):
    # GH#46560
    if groupby_func == "corrwith":
        msg = "corrwith is not implemented on SeriesGroupBy"
        request.node.add_marker(pytest.mark.xfail(reason=msg))

    grouper = [0, 0, 1]
    is_object = dtype is object

    ser = Series([1, 0, 0], dtype=dtype)
    gb = ser.groupby(grouper)
    method = getattr(gb, groupby_func)

    # same data with the default dtype, used as the reference for object input
    expected_ser = Series([1, 0, 0])
    expected_gb = expected_ser.groupby(grouper)
    expected_method = getattr(expected_gb, groupby_func)

    args = get_groupby_method_args(groupby_func, ser)

    fails_on_numeric_object = (
        "corr",
        "cov",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
        "idxmax",
        "idxmin",
        "quantile",
    )
    # ops that give an object result on object input
    obj_result = (
        "first",
        "last",
        "nth",
        "bfill",
        "ffill",
        "shift",
        "sum",
        "diff",
        "pct_change",
        "var",
        "mean",
        "median",
        "min",
        "max",
        "prod",
    )

    # Test default behavior; kernels that fail may be enabled in the future but kernels
    # that succeed should not be allowed to fail (without deprecation, at least)
    if groupby_func in fails_on_numeric_object and is_object:
        if groupby_func in ("idxmax", "idxmin"):
            msg = "not allowed for this dtype"
        elif groupby_func == "quantile":
            msg = "cannot be performed against 'object' dtypes"
        else:
            msg = "is not supported for object dtype"
        with pytest.raises(TypeError, match=msg):
            method(*args)
    elif is_object:
        result = method(*args)
        expected = expected_method(*args)
        if groupby_func in obj_result:
            expected = expected.astype(object)
        tm.assert_series_equal(result, expected)

    # Second half: behavior when numeric_only=True is passed explicitly
    has_numeric_only = (
        "first",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "prod",
        "quantile",
        "sem",
        "skew",
        "std",
        "sum",
        "var",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
    )
    if groupby_func not in has_numeric_only:
        msg = "got an unexpected keyword argument 'numeric_only'"
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif is_object:
        msg = "|".join(
            [
                "SeriesGroupBy.sem called with numeric_only=True and dtype object",
                "Series.skew does not allow numeric_only=True with non-numeric",
                "cum(sum|prod|min|max) is not supported for object dtype",
                r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    else:
        # for the remaining dtypes the flag must make no difference
        result = method(*args, numeric_only=True)
        expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
        {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
        {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
    ],
)
def test_groupby_empty_dataset(dtype, kwargs):
    # GH#41575
    df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
    df["B"] = df["B"].astype(int)
    df["C"] = df["C"].astype(float)
    empty = df.iloc[:0]

    # frame-level describe: the empty result must have the layout of the
    # non-empty result with its rows removed
    result = empty.groupby("A").describe(**kwargs)
    described = df.groupby("A").describe(**kwargs)
    expected = described.reset_index(drop=True).iloc[:0]
    tm.assert_frame_equal(result, expected)

    # column-level describe
    result = empty.groupby("A").B.describe(**kwargs)
    described = df.groupby("A").B.describe(**kwargs)
    expected = described.reset_index(drop=True).iloc[:0]
    expected.index = Index([])
    tm.assert_frame_equal(result, expected)
def test_corrwith_with_1_axis():
    # GH 47723
    df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})

    result = df.groupby("a").corrwith(df, axis=1)

    pairs = [(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)]
    index = Index(data=pairs, name=("a", None))
    expected = Series([np.nan] * 6, index=index)
    tm.assert_series_equal(result, expected)
def test_multiindex_group_all_columns_when_empty(groupby_func):
    # GH 32464
    levels = ["a", "b", "c"]
    df = DataFrame({"a": [], "b": [], "c": []}).set_index(levels)
    gb = df.groupby(levels, group_keys=False)

    args = get_groupby_method_args(groupby_func, df)
    outcome = getattr(gb, groupby_func)(*args)

    # the result index must match the (empty) index of the input frame
    tm.assert_index_equal(outcome.index, df.index)
def test_duplicate_columns(request, groupby_func, as_index):
    # GH#50806
    if groupby_func == "corrwith":
        msg = "GH#50845 - corrwith fails when there are duplicate columns"
        request.node.add_marker(pytest.mark.xfail(reason=msg))

    def run(frame):
        # apply the kernel under test to ``frame`` grouped by column "a"
        kernel_args = get_groupby_method_args(groupby_func, frame)
        grouped = frame.groupby("a", as_index=as_index)
        return getattr(grouped, groupby_func)(*kernel_args)

    df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
    result = run(df)

    # reference: same data with unique labels, relabelled back afterwards
    expected = run(df.set_axis(["a", "b", "c"], axis=1))
    if groupby_func not in ("size", "ngroup", "cumcount"):
        expected = expected.rename(columns={"c": "b"})
    tm.assert_equal(result, expected)
|