123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249 |
- import numpy as np
- import pytest
- from pandas._libs.tslibs import iNaT
- import pandas as pd
- from pandas import (
- DataFrame,
- Index,
- Series,
- )
- import pandas._testing as tm
- def test_max_min_non_numeric():
- # #2700
- aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]})
- result = aa.groupby("nn").max()
- assert "ss" in result
- result = aa.groupby("nn").max(numeric_only=False)
- assert "ss" in result
- result = aa.groupby("nn").min()
- assert "ss" in result
- result = aa.groupby("nn").min(numeric_only=False)
- assert "ss" in result
- def test_max_min_object_multiple_columns(using_array_manager):
- # GH#41111 case where the aggregation is valid for some columns but not
- # others; we split object blocks column-wise, consistent with
- # DataFrame._reduce
- df = DataFrame(
- {
- "A": [1, 1, 2, 2, 3],
- "B": [1, "foo", 2, "bar", False],
- "C": ["a", "b", "c", "d", "e"],
- }
- )
- df._consolidate_inplace() # should already be consolidate, but double-check
- if not using_array_manager:
- assert len(df._mgr.blocks) == 2
- gb = df.groupby("A")
- result = gb[["C"]].max()
- # "max" is valid for column "C" but not for "B"
- ei = Index([1, 2, 3], name="A")
- expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
- tm.assert_frame_equal(result, expected)
- result = gb[["C"]].min()
- # "min" is valid for column "C" but not for "B"
- ei = Index([1, 2, 3], name="A")
- expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
- tm.assert_frame_equal(result, expected)
- def test_min_date_with_nans():
- # GH26321
- dates = pd.to_datetime(
- Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d"
- ).dt.date
- df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})
- result = df.groupby("b", as_index=False)["c"].min()["c"]
- expected = pd.to_datetime(
- Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d"
- ).dt.date
- tm.assert_series_equal(result, expected)
- result = df.groupby("b")["c"].min()
- expected.index.name = "b"
- tm.assert_series_equal(result, expected)
- def test_max_inat():
- # GH#40767 dont interpret iNaT as NaN
- ser = Series([1, iNaT])
- key = np.array([1, 1], dtype=np.int64)
- gb = ser.groupby(key)
- result = gb.max(min_count=2)
- expected = Series({1: 1}, dtype=np.int64)
- tm.assert_series_equal(result, expected, check_exact=True)
- result = gb.min(min_count=2)
- expected = Series({1: iNaT}, dtype=np.int64)
- tm.assert_series_equal(result, expected, check_exact=True)
- # not enough entries -> gets masked to NaN
- result = gb.min(min_count=3)
- expected = Series({1: np.nan})
- tm.assert_series_equal(result, expected, check_exact=True)
- def test_max_inat_not_all_na():
- # GH#40767 dont interpret iNaT as NaN
- # make sure we dont round iNaT+1 to iNaT
- ser = Series([1, iNaT, 2, iNaT + 1])
- gb = ser.groupby([1, 2, 3, 3])
- result = gb.min(min_count=2)
- # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
- expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
- expected.index = expected.index.astype(np.int_)
- tm.assert_series_equal(result, expected, check_exact=True)
- @pytest.mark.parametrize("func", ["min", "max"])
- def test_groupby_aggregate_period_column(func):
- # GH 31471
- groups = [1, 2]
- periods = pd.period_range("2020", periods=2, freq="Y")
- df = DataFrame({"a": groups, "b": periods})
- result = getattr(df.groupby("a")["b"], func)()
- idx = Index([1, 2], name="a")
- expected = Series(periods, index=idx, name="b")
- tm.assert_series_equal(result, expected)
- @pytest.mark.parametrize("func", ["min", "max"])
- def test_groupby_aggregate_period_frame(func):
- # GH 31471
- groups = [1, 2]
- periods = pd.period_range("2020", periods=2, freq="Y")
- df = DataFrame({"a": groups, "b": periods})
- result = getattr(df.groupby("a"), func)()
- idx = Index([1, 2], name="a")
- expected = DataFrame({"b": periods}, index=idx)
- tm.assert_frame_equal(result, expected)
- def test_aggregate_numeric_object_dtype():
- # https://github.com/pandas-dev/pandas/issues/39329
- # simplified case: multiple object columns where one is all-NaN
- # -> gets split as the all-NaN is inferred as float
- df = DataFrame(
- {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
- ).astype(object)
- result = df.groupby("key").min()
- expected = (
- DataFrame(
- {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]},
- )
- .set_index("key")
- .astype(object)
- )
- tm.assert_frame_equal(result, expected)
- # same but with numbers
- df = DataFrame(
- {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)},
- ).astype(object)
- result = df.groupby("key").min()
- expected = (
- DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]})
- .set_index("key")
- .astype(object)
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("func", ["min", "max"])
- def test_aggregate_categorical_lost_index(func: str):
- # GH: 28641 groupby drops index, when grouping over categorical column with min/max
- ds = Series(["b"], dtype="category").cat.as_ordered()
- df = DataFrame({"A": [1997], "B": ds})
- result = df.groupby("A").agg({"B": func})
- expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A"))
- # ordered categorical dtype should be preserved
- expected["B"] = expected["B"].astype(ds.dtype)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"])
- def test_groupby_min_max_nullable(dtype):
- if dtype == "Int64":
- # GH#41743 avoid precision loss
- ts = 1618556707013635762
- elif dtype == "boolean":
- ts = 0
- else:
- ts = 4.0
- df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]})
- df["ts"] = df["ts"].astype(dtype)
- gb = df.groupby("id")
- result = gb.min()
- expected = df.iloc[:1].set_index("id")
- tm.assert_frame_equal(result, expected)
- res_max = gb.max()
- expected_max = df.iloc[1:].set_index("id")
- tm.assert_frame_equal(res_max, expected_max)
- result2 = gb.min(min_count=3)
- expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype)
- tm.assert_frame_equal(result2, expected2)
- res_max2 = gb.max(min_count=3)
- tm.assert_frame_equal(res_max2, expected2)
- # Case with NA values
- df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]})
- df2["ts"] = df2["ts"].astype(dtype)
- gb2 = df2.groupby("id")
- result3 = gb2.min()
- tm.assert_frame_equal(result3, expected)
- res_max3 = gb2.max()
- tm.assert_frame_equal(res_max3, expected_max)
- result4 = gb2.min(min_count=100)
- tm.assert_frame_equal(result4, expected2)
- res_max4 = gb2.max(min_count=100)
- tm.assert_frame_equal(res_max4, expected2)
- def test_min_max_nullable_uint64_empty_group():
- # don't raise NotImplementedError from libgroupby
- cat = pd.Categorical([0] * 10, categories=[0, 1])
- df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))})
- gb = df.groupby("A")
- res = gb.min()
- idx = pd.CategoricalIndex([0, 1], dtype=cat.dtype, name="A")
- expected = DataFrame({"B": pd.array([0, pd.NA], dtype="UInt64")}, index=idx)
- tm.assert_frame_equal(res, expected)
- res = gb.max()
- expected.iloc[0, 0] = 9
- tm.assert_frame_equal(res, expected)
|