- """
- test all other .agg behavior
- """
- import datetime as dt
- from functools import partial
- import numpy as np
- import pytest
- from pandas.errors import SpecificationError
- import pandas as pd
- from pandas import (
- DataFrame,
- Index,
- MultiIndex,
- PeriodIndex,
- Series,
- date_range,
- period_range,
- )
- import pandas._testing as tm
- from pandas.io.formats.printing import pprint_thing


def test_agg_partial_failure_raises():
    # GH#43741
    df = DataFrame(
        {
            "data1": np.random.randn(5),
            "data2": np.random.randn(5),
            "key1": ["a", "a", "b", "b", "a"],
            "key2": ["one", "two", "one", "two", "one"],
        }
    )
    grouped = df.groupby("key1")

    def peak_to_peak(arr):
        return arr.max() - arr.min()

    with pytest.raises(TypeError, match="unsupported operand type"):
        grouped.agg([peak_to_peak])
    with pytest.raises(TypeError, match="unsupported operand type"):
        grouped.agg(peak_to_peak)
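

# Grouping on the "date" column should yield the same groups whether the dates
# are stored as strings (df1) or as datetime.date objects (df2), with None for
# missing values.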
def test_agg_datetimes_mixed():
    data = [[1, "2012-01-01", 1.0], [2, "2012-01-02", 2.0], [3, None, 3.0]]

    df1 = DataFrame(
        {
            "key": [x[0] for x in data],
            "date": [x[1] for x in data],
            "value": [x[2] for x in data],
        }
    )

    data = [
        [
            row[0],
            (dt.datetime.strptime(row[1], "%Y-%m-%d").date() if row[1] else None),
            row[2],
        ]
        for row in data
    ]

    df2 = DataFrame(
        {
            "key": [x[0] for x in data],
            "date": [x[1] for x in data],
            "value": [x[2] for x in data],
        }
    )

    df1["weights"] = df1["value"] / df1["value"].sum()
    gb1 = df1.groupby("date").aggregate(np.sum)

    df2["weights"] = df1["value"] / df1["value"].sum()
    gb2 = df2.groupby("date").aggregate(np.sum)

    assert len(gb1) == len(gb2)


def test_agg_period_index():
    prng = period_range("2012-1-1", freq="M", periods=3)
    df = DataFrame(np.random.randn(3, 2), index=prng)
    rs = df.groupby(level=0).sum()
    assert isinstance(rs.index, PeriodIndex)

    # GH 3579
    index = period_range(start="1999-01", periods=5, freq="M")
    s1 = Series(np.random.rand(len(index)), index=index)
    s2 = Series(np.random.rand(len(index)), index=index)
    df = DataFrame.from_dict({"s1": s1, "s2": s2})
    grouped = df.groupby(df.index.month)
    list(grouped)


def test_agg_dict_parameter_cast_result_dtypes():
    # GH 12821

    df = DataFrame(
        {
            "class": ["A", "A", "B", "B", "C", "C", "D", "D"],
            "time": date_range("1/1/2011", periods=8, freq="H"),
        }
    )
    df.loc[[0, 1, 2, 5], "time"] = None

    # test for `first` function
    exp = df.loc[[0, 3, 4, 6]].set_index("class")
    grouped = df.groupby("class")
    tm.assert_frame_equal(grouped.first(), exp)
    tm.assert_frame_equal(grouped.agg("first"), exp)
    tm.assert_frame_equal(grouped.agg({"time": "first"}), exp)
    tm.assert_series_equal(grouped.time.first(), exp["time"])
    tm.assert_series_equal(grouped.time.agg("first"), exp["time"])

    # test for `last` function
    exp = df.loc[[0, 3, 4, 7]].set_index("class")
    grouped = df.groupby("class")
    tm.assert_frame_equal(grouped.last(), exp)
    tm.assert_frame_equal(grouped.agg("last"), exp)
    tm.assert_frame_equal(grouped.agg({"time": "last"}), exp)
    tm.assert_series_equal(grouped.time.last(), exp["time"])
    tm.assert_series_equal(grouped.time.agg("last"), exp["time"])

    # count
    exp = Series([2, 2, 2, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.agg(len), exp)
    tm.assert_series_equal(grouped.time.size(), exp)

    exp = Series([0, 1, 1, 2], index=Index(list("ABCD"), name="class"), name="time")
    tm.assert_series_equal(grouped.time.count(), exp)


def test_agg_cast_results_dtypes():
    # similar to GH12821
    # xref #11444
    u = [dt.datetime(2015, x + 1, 1) for x in range(12)]
    v = list("aaabbbbbbccd")
    df = DataFrame({"X": v, "Y": u})

    result = df.groupby("X")["Y"].agg(len)
    expected = df.groupby("X")["Y"].count()
    tm.assert_series_equal(result, expected)


def test_aggregate_float64_no_int64():
    # see gh-11199
    df = DataFrame({"a": [1, 2, 3, 4, 5], "b": [1, 2, 2, 4, 5], "c": [1, 2, 3, 4, 5]})

    expected = DataFrame({"a": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
    expected.index.name = "b"

    result = df.groupby("b")[["a"]].mean()
    tm.assert_frame_equal(result, expected)

    expected = DataFrame({"a": [1, 2.5, 4, 5], "c": [1, 2.5, 4, 5]}, index=[1, 2, 4, 5])
    expected.index.name = "b"

    result = df.groupby("b")[["a", "c"]].mean()
    tm.assert_frame_equal(result, expected)


def test_aggregate_api_consistency():
    # GH 9052
    # make sure that the aggregates via dict
    # are consistent

    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    grouped = df.groupby(["A", "B"])
    c_mean = grouped["C"].mean()
    c_sum = grouped["C"].sum()
    d_mean = grouped["D"].mean()
    d_sum = grouped["D"].sum()

    result = grouped["D"].agg(["sum", "mean"])
    expected = pd.concat([d_sum, d_mean], axis=1)
    expected.columns = ["sum", "mean"]
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg([np.sum, np.mean])
    expected = pd.concat([c_sum, c_mean, d_sum, d_mean], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped[["D", "C"]].agg([np.sum, np.mean])
    expected = pd.concat([d_sum, d_mean, c_sum, c_mean], axis=1)
    expected.columns = MultiIndex.from_product([["D", "C"], ["sum", "mean"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": "mean", "D": "sum"})
    expected = pd.concat([d_sum, c_mean], axis=1)
    tm.assert_frame_equal(result, expected, check_like=True)

    result = grouped.agg({"C": ["mean", "sum"], "D": ["mean", "sum"]})
    expected = pd.concat([c_mean, c_sum, d_mean, d_sum], axis=1)
    expected.columns = MultiIndex.from_product([["C", "D"], ["mean", "sum"]])
    tm.assert_frame_equal(result, expected, check_like=True)

    msg = r"Column\(s\) \['r', 'r2'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        grouped[["D", "C"]].agg({"r": np.sum, "r2": np.mean})


def test_agg_dict_renaming_deprecation():
    # 15931
    df = DataFrame({"A": [1, 1, 1, 2, 2], "B": range(5), "C": range(5)})

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        df.groupby("A").agg(
            {"B": {"foo": ["sum", "max"]}, "C": {"bar": ["count", "min"]}}
        )

    msg = r"Column\(s\) \['ma'\] do not exist"
    with pytest.raises(KeyError, match=msg):
        df.groupby("A")[["B", "C"]].agg({"ma": "max"})

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        df.groupby("A").B.agg({"foo": "count"})


def test_agg_compat():
    # GH 12334
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"C": ["sum", "std"]})

    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"C": "sum", "D": "std"})


def test_agg_nested_dicts():
    # API change for disallowing these types of nested dicts
    df = DataFrame(
        {
            "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
            "B": ["one", "one", "two", "two", "two", "two", "one", "two"],
            "C": np.random.randn(8) + 1.0,
            "D": np.arange(8),
        }
    )

    g = df.groupby(["A", "B"])

    msg = r"nested renamer is not supported"
    with pytest.raises(SpecificationError, match=msg):
        g.aggregate({"r1": {"C": ["mean", "sum"]}, "r2": {"D": ["mean", "sum"]}})

    with pytest.raises(SpecificationError, match=msg):
        g.agg({"C": {"ra": ["mean", "std"]}, "D": {"rb": ["mean", "std"]}})

    # same name as the original column
    # GH9052
    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"result1": np.sum, "result2": np.mean})

    with pytest.raises(SpecificationError, match=msg):
        g["D"].agg({"D": np.sum, "result2": np.mean})
def test_agg_item_by_item_raise_typeerror():
    df = DataFrame(np.random.randint(10, size=(20, 10)))

    def raiseException(df):
        pprint_thing("----------------------------------------")
        pprint_thing(df.to_string())
        raise TypeError("test")

    with pytest.raises(TypeError, match="test"):
        df.groupby(0).agg(raiseException)
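

# agg(np.sum) on a Series grouped by multiple keys should match the built-in .sum()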
def test_series_agg_multikey():
    ts = tm.makeTimeSeries()
    grouped = ts.groupby([lambda x: x.year, lambda x: x.month])

    result = grouped.agg(np.sum)
    expected = grouped.sum()
    tm.assert_series_equal(result, expected)
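

# A pure-Python UDF returning a non-numeric constant should aggregate the same
# way as an equivalent lambda.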
def test_series_agg_multi_pure_python():
    data = DataFrame(
        {
            "A": [
                "foo",
                "foo",
                "foo",
                "foo",
                "bar",
                "bar",
                "bar",
                "bar",
                "foo",
                "foo",
                "foo",
            ],
            "B": [
                "one",
                "one",
                "one",
                "two",
                "one",
                "one",
                "one",
                "two",
                "two",
                "two",
                "one",
            ],
            "C": [
                "dull",
                "dull",
                "shiny",
                "dull",
                "dull",
                "shiny",
                "shiny",
                "dull",
                "shiny",
                "shiny",
                "shiny",
            ],
            "D": np.random.randn(11),
            "E": np.random.randn(11),
            "F": np.random.randn(11),
        }
    )

    def bad(x):
        assert len(x.values.base) > 0
        return "foo"

    result = data.groupby(["A", "B"]).agg(bad)
    expected = data.groupby(["A", "B"]).agg(lambda x: "foo")
    tm.assert_frame_equal(result, expected)


def test_agg_consistency():
    # agg with ([]) and () not consistent
    # GH 6715
    def P1(a):
        return np.percentile(a.dropna(), q=1)

    df = DataFrame(
        {
            "col1": [1, 2, 3, 4],
            "col2": [10, 25, 26, 31],
            "date": [
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 10),
                dt.date(2013, 2, 11),
                dt.date(2013, 2, 11),
            ],
        }
    )

    g = df.groupby("date")

    expected = g.agg([P1])
    expected.columns = expected.columns.levels[0]

    result = g.agg(P1)
    tm.assert_frame_equal(result, expected)


def test_agg_callables():
    # GH 7929
    df = DataFrame({"foo": [1, 2], "bar": [3, 4]}).astype(np.int64)

    class fn_class:
        def __call__(self, x):
            return sum(x)

    equiv_callables = [
        sum,
        np.sum,
        lambda x: sum(x),
        lambda x: x.sum(),
        partial(sum),
        fn_class(),
    ]

    expected = df.groupby("foo").agg(sum)
    for ecall in equiv_callables:
        result = df.groupby("foo").agg(ecall)
        tm.assert_frame_equal(result, expected)


def test_agg_over_numpy_arrays():
    # GH 3788
    df = DataFrame(
        [
            [1, np.array([10, 20, 30])],
            [1, np.array([40, 50, 60])],
            [2, np.array([20, 30, 40])],
        ],
        columns=["category", "arraydata"],
    )
    gb = df.groupby("category")

    expected_data = [[np.array([50, 70, 90])], [np.array([20, 30, 40])]]
    expected_index = Index([1, 2], name="category")
    expected_column = ["arraydata"]
    expected = DataFrame(expected_data, index=expected_index, columns=expected_column)

    alt = gb.sum(numeric_only=False)
    tm.assert_frame_equal(alt, expected)

    result = gb.agg("sum", numeric_only=False)
    tm.assert_frame_equal(result, expected)
    # FIXME: the original version of this test called `gb.agg(sum)`
    #  and that raises TypeError if `numeric_only=False` is passed
- @pytest.mark.parametrize("as_period", [True, False])
- def test_agg_tzaware_non_datetime_result(as_period):
- # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
- # with function that is not dtype-preserving
- dti = date_range("2012-01-01", periods=4, tz="UTC")
- if as_period:
- dti = dti.tz_localize(None).to_period("D")
- df = DataFrame({"a": [0, 0, 1, 1], "b": dti})
- gb = df.groupby("a")
- # Case that _does_ preserve the dtype
- result = gb["b"].agg(lambda x: x.iloc[0])
- expected = Series(dti[::2], name="b")
- expected.index.name = "a"
- tm.assert_series_equal(result, expected)
- # Cases that do _not_ preserve the dtype
- result = gb["b"].agg(lambda x: x.iloc[0].year)
- expected = Series([2012, 2012], name="b")
- expected.index.name = "a"
- tm.assert_series_equal(result, expected)
- result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
- expected = Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
- expected.index.name = "a"
- if as_period:
- expected = Series([pd.offsets.Day(1), pd.offsets.Day(1)], name="b")
- expected.index.name = "a"
- tm.assert_series_equal(result, expected)


def test_agg_timezone_round_trip():
    # GH 15426
    ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
    df = DataFrame({"a": 1, "b": [ts + dt.timedelta(minutes=nn) for nn in range(10)]})

    result1 = df.groupby("a")["b"].agg(np.min).iloc[0]
    result2 = df.groupby("a")["b"].agg(lambda x: np.min(x)).iloc[0]
    result3 = df.groupby("a")["b"].min().iloc[0]

    assert result1 == ts
    assert result2 == ts
    assert result3 == ts

    dates = [
        pd.Timestamp(f"2016-01-0{i:d} 12:00:00", tz="US/Pacific") for i in range(1, 5)
    ]
    df = DataFrame({"A": ["a", "b"] * 2, "B": dates})
    grouped = df.groupby("A")

    ts = df["B"].iloc[0]
    assert ts == grouped.nth(0)["B"].iloc[0]
    assert ts == grouped.head(1)["B"].iloc[0]
    assert ts == grouped.first()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    assert ts == grouped.apply(lambda x: x.iloc[0]).iloc[0, 1]

    ts = df["B"].iloc[2]
    assert ts == grouped.last()["B"].iloc[0]

    # GH#27110 applying iloc should return a DataFrame
    assert ts == grouped.apply(lambda x: x.iloc[-1]).iloc[0, 1]


def test_sum_uint64_overflow():
    # see gh-14758
    # Convert to uint64 and don't overflow
    df = DataFrame([[1, 2], [3, 4], [5, 6]], dtype=object)
    df = df + 9223372036854775807

    index = Index(
        [9223372036854775808, 9223372036854775810, 9223372036854775812], dtype=np.uint64
    )
    expected = DataFrame(
        {1: [9223372036854775809, 9223372036854775811, 9223372036854775813]},
        index=index,
        dtype=object,
    )
    expected.index.name = 0

    result = df.groupby(0).sum(numeric_only=False)
    tm.assert_frame_equal(result, expected)

    # out column is non-numeric, so with numeric_only=True it is dropped
    result2 = df.groupby(0).sum(numeric_only=True)
    expected2 = expected[[]]
    tm.assert_frame_equal(result2, expected2)


@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}})),
        (list, DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}})),
        (
            lambda x: tuple(x),
            DataFrame({"C": {(1, 1): (1, 1, 1), (3, 4): (3, 4, 4)}}),
        ),
        (
            lambda x: list(x),
            DataFrame({"C": {(1, 1): [1, 1, 1], (3, 4): [3, 4, 4]}}),
        ),
    ],
)
def test_agg_structs_dataframe(structure, expected):
    df = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = df.groupby(["A", "B"]).aggregate(structure)
    expected.index.names = ["A", "B"]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "structure, expected",
    [
        (tuple, Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (list, Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
        (lambda x: tuple(x), Series([(1, 1, 1), (3, 4, 4)], index=[1, 3], name="C")),
        (lambda x: list(x), Series([[1, 1, 1], [3, 4, 4]], index=[1, 3], name="C")),
    ],
)
def test_agg_structs_series(structure, expected):
    # Issue #18079
    df = DataFrame(
        {"A": [1, 1, 1, 3, 3, 3], "B": [1, 1, 1, 4, 4, 4], "C": [1, 1, 1, 3, 4, 4]}
    )

    result = df.groupby("A")["C"].aggregate(structure)
    expected.index.name = "A"
    tm.assert_series_equal(result, expected)


def test_agg_category_nansum(observed):
    categories = ["a", "b", "c"]
    df = DataFrame(
        {"A": pd.Categorical(["a", "a", "b"], categories=categories), "B": [1, 2, 3]}
    )
    result = df.groupby("A", observed=observed).B.agg(np.nansum)
    expected = Series(
        [3, 3, 0],
        index=pd.CategoricalIndex(["a", "b", "c"], categories=categories, name="A"),
        name="B",
    )
    if observed:
        expected = expected[expected != 0]
    tm.assert_series_equal(result, expected)


def test_agg_list_like_func():
    # GH 18473
    df = DataFrame({"A": [str(x) for x in range(3)], "B": [str(x) for x in range(3)]})
    grouped = df.groupby("A", as_index=False, sort=False)
    result = grouped.agg({"B": lambda x: list(x)})
    expected = DataFrame(
        {"A": [str(x) for x in range(3)], "B": [[str(x)] for x in range(3)]}
    )
    tm.assert_frame_equal(result, expected)


def test_agg_lambda_with_timezone():
    # GH 23683
    df = DataFrame(
        {
            "tag": [1, 1],
            "date": [
                pd.Timestamp("2018-01-01", tz="UTC"),
                pd.Timestamp("2018-01-02", tz="UTC"),
            ],
        }
    )
    result = df.groupby("tag").agg({"date": lambda e: e.head(1)})
    expected = DataFrame(
        [pd.Timestamp("2018-01-01", tz="UTC")],
        index=Index([1], name="tag"),
        columns=["date"],
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "err_cls",
    [
        NotImplementedError,
        RuntimeError,
        KeyError,
        IndexError,
        OSError,
        ValueError,
        ArithmeticError,
        AttributeError,
    ],
)
def test_groupby_agg_err_catching(err_cls):
    # make sure we suppress anything other than TypeError or AssertionError
    # in _python_agg_general

    # Use a non-standard EA to make sure we don't go down ndarray paths
    from pandas.tests.extension.decimal.array import (
        DecimalArray,
        make_data,
        to_decimal,
    )

    data = make_data()[:5]
    df = DataFrame(
        {"id1": [0, 0, 0, 1, 1], "id2": [0, 1, 0, 1, 1], "decimals": DecimalArray(data)}
    )
    expected = Series(to_decimal([data[0], data[3]]))

    def weird_func(x):
        # weird function that raises something other than TypeError or AssertionError
        # in _python_agg_general
        if len(x) == 0:
            raise err_cls
        return x.iloc[0]

    result = df["decimals"].groupby(df["id1"]).agg(weird_func)
    tm.assert_series_equal(result, expected, check_names=False)
|