123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377 |
- from itertools import product
- from string import ascii_lowercase
- import numpy as np
- import pytest
- from pandas import (
- DataFrame,
- Index,
- MultiIndex,
- Period,
- Series,
- Timedelta,
- Timestamp,
- date_range,
- )
- import pandas._testing as tm
- class TestCounting:
- def test_cumcount(self):
- df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
- g = df.groupby("A")
- sg = g.A
- expected = Series([0, 1, 2, 0, 3])
- tm.assert_series_equal(expected, g.cumcount())
- tm.assert_series_equal(expected, sg.cumcount())
- def test_cumcount_empty(self):
- ge = DataFrame().groupby(level=0)
- se = Series(dtype=object).groupby(level=0)
- # edge case, as this is usually considered float
- e = Series(dtype="int64")
- tm.assert_series_equal(e, ge.cumcount())
- tm.assert_series_equal(e, se.cumcount())
- def test_cumcount_dupe_index(self):
- df = DataFrame(
- [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
- )
- g = df.groupby("A")
- sg = g.A
- expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
- tm.assert_series_equal(expected, g.cumcount())
- tm.assert_series_equal(expected, sg.cumcount())
- def test_cumcount_mi(self):
- mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
- df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
- g = df.groupby("A")
- sg = g.A
- expected = Series([0, 1, 2, 0, 3], index=mi)
- tm.assert_series_equal(expected, g.cumcount())
- tm.assert_series_equal(expected, sg.cumcount())
- def test_cumcount_groupby_not_col(self):
- df = DataFrame(
- [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
- )
- g = df.groupby([0, 0, 0, 1, 0])
- sg = g.A
- expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
- tm.assert_series_equal(expected, g.cumcount())
- tm.assert_series_equal(expected, sg.cumcount())
- def test_ngroup(self):
- df = DataFrame({"A": list("aaaba")})
- g = df.groupby("A")
- sg = g.A
- expected = Series([0, 0, 0, 1, 0])
- tm.assert_series_equal(expected, g.ngroup())
- tm.assert_series_equal(expected, sg.ngroup())
- def test_ngroup_distinct(self):
- df = DataFrame({"A": list("abcde")})
- g = df.groupby("A")
- sg = g.A
- expected = Series(range(5), dtype="int64")
- tm.assert_series_equal(expected, g.ngroup())
- tm.assert_series_equal(expected, sg.ngroup())
- def test_ngroup_one_group(self):
- df = DataFrame({"A": [0] * 5})
- g = df.groupby("A")
- sg = g.A
- expected = Series([0] * 5)
- tm.assert_series_equal(expected, g.ngroup())
- tm.assert_series_equal(expected, sg.ngroup())
- def test_ngroup_empty(self):
- ge = DataFrame().groupby(level=0)
- se = Series(dtype=object).groupby(level=0)
- # edge case, as this is usually considered float
- e = Series(dtype="int64")
- tm.assert_series_equal(e, ge.ngroup())
- tm.assert_series_equal(e, se.ngroup())
- def test_ngroup_series_matches_frame(self):
- df = DataFrame({"A": list("aaaba")})
- s = Series(list("aaaba"))
- tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())
- def test_ngroup_dupe_index(self):
- df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
- g = df.groupby("A")
- sg = g.A
- expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
- tm.assert_series_equal(expected, g.ngroup())
- tm.assert_series_equal(expected, sg.ngroup())
- def test_ngroup_mi(self):
- mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
- df = DataFrame({"A": list("aaaba")}, index=mi)
- g = df.groupby("A")
- sg = g.A
- expected = Series([0, 0, 0, 1, 0], index=mi)
- tm.assert_series_equal(expected, g.ngroup())
- tm.assert_series_equal(expected, sg.ngroup())
- def test_ngroup_groupby_not_col(self):
- df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
- g = df.groupby([0, 0, 0, 1, 0])
- sg = g.A
- expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
- tm.assert_series_equal(expected, g.ngroup())
- tm.assert_series_equal(expected, sg.ngroup())
- def test_ngroup_descending(self):
- df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
- g = df.groupby(["A"])
- ascending = Series([0, 0, 1, 0, 1])
- descending = Series([1, 1, 0, 1, 0])
- tm.assert_series_equal(descending, (g.ngroups - 1) - ascending)
- tm.assert_series_equal(ascending, g.ngroup(ascending=True))
- tm.assert_series_equal(descending, g.ngroup(ascending=False))
- def test_ngroup_matches_cumcount(self):
- # verify one manually-worked out case works
- df = DataFrame(
- [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
- columns=["A", "X"],
- )
- g = df.groupby(["A", "X"])
- g_ngroup = g.ngroup()
- g_cumcount = g.cumcount()
- expected_ngroup = Series([0, 1, 2, 0, 3])
- expected_cumcount = Series([0, 0, 0, 1, 0])
- tm.assert_series_equal(g_ngroup, expected_ngroup)
- tm.assert_series_equal(g_cumcount, expected_cumcount)
- def test_ngroup_cumcount_pair(self):
- # brute force comparison for all small series
- for p in product(range(3), repeat=4):
- df = DataFrame({"a": p})
- g = df.groupby(["a"])
- order = sorted(set(p))
- ngroupd = [order.index(val) for val in p]
- cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
- tm.assert_series_equal(g.ngroup(), Series(ngroupd))
- tm.assert_series_equal(g.cumcount(), Series(cumcounted))
- def test_ngroup_respects_groupby_order(self, sort):
- np.random.seed(0)
- df = DataFrame({"a": np.random.choice(list("abcdef"), 100)})
- g = df.groupby("a", sort=sort)
- df["group_id"] = -1
- df["group_index"] = -1
- for i, (_, group) in enumerate(g):
- df.loc[group.index, "group_id"] = i
- for j, ind in enumerate(group.index):
- df.loc[ind, "group_index"] = j
- tm.assert_series_equal(Series(df["group_id"].values), g.ngroup())
- tm.assert_series_equal(Series(df["group_index"].values), g.cumcount())
- @pytest.mark.parametrize(
- "datetimelike",
- [
- [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
- [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
- [Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)],
- [Timedelta(x, unit="h") for x in range(1, 4)],
- [Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
- ],
- )
- def test_count_with_datetimelike(self, datetimelike):
- # test for #13393, where DataframeGroupBy.count() fails
- # when counting a datetimelike column.
- df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
- res = df.groupby("x").count()
- expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
- expected.index.name = "x"
- tm.assert_frame_equal(expected, res)
- def test_count_with_only_nans_in_first_group(self):
- # GH21956
- df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
- result = df.groupby(["A", "B"]).C.count()
- mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
- expected = Series([], index=mi, dtype=np.int64, name="C")
- tm.assert_series_equal(result, expected, check_index_type=False)
- def test_count_groupby_column_with_nan_in_groupby_column(self):
- # https://github.com/pandas-dev/pandas/issues/32841
- df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]})
- res = df.groupby(["B"]).count()
- expected = DataFrame(
- index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
- )
- tm.assert_frame_equal(expected, res)
- def test_groupby_count_dateparseerror(self):
- dr = date_range(start="1/1/2012", freq="5min", periods=10)
- # BAD Example, datetimes first
- ser = Series(np.arange(10), index=[dr, np.arange(10)])
- grouped = ser.groupby(lambda x: x[1] % 2 == 0)
- result = grouped.count()
- ser = Series(np.arange(10), index=[np.arange(10), dr])
- grouped = ser.groupby(lambda x: x[0] % 2 == 0)
- expected = grouped.count()
- tm.assert_series_equal(result, expected)
- def test_groupby_timedelta_cython_count():
- df = DataFrame(
- {"g": list("ab" * 2), "delt": np.arange(4).astype("timedelta64[ns]")}
- )
- expected = Series([2, 2], index=Index(["a", "b"], name="g"), name="delt")
- result = df.groupby("g").delt.count()
- tm.assert_series_equal(expected, result)
- def test_count():
- n = 1 << 15
- dr = date_range("2015-08-30", periods=n // 10, freq="T")
- df = DataFrame(
- {
- "1st": np.random.choice(list(ascii_lowercase), n),
- "2nd": np.random.randint(0, 5, n),
- "3rd": np.random.randn(n).round(3),
- "4th": np.random.randint(-10, 10, n),
- "5th": np.random.choice(dr, n),
- "6th": np.random.randn(n).round(3),
- "7th": np.random.randn(n).round(3),
- "8th": np.random.choice(dr, n) - np.random.choice(dr, 1),
- "9th": np.random.choice(list(ascii_lowercase), n),
- }
- )
- for col in df.columns.drop(["1st", "2nd", "4th"]):
- df.loc[np.random.choice(n, n // 10), col] = np.nan
- df["9th"] = df["9th"].astype("category")
- for key in ["1st", "2nd", ["1st", "2nd"]]:
- left = df.groupby(key).count()
- right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
- tm.assert_frame_equal(left, right)
- def test_count_non_nulls():
- # GH#5610
- # count counts non-nulls
- df = DataFrame(
- [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]],
- columns=["A", "B", "C"],
- )
- count_as = df.groupby("A").count()
- count_not_as = df.groupby("A", as_index=False).count()
- expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3])
- expected.index.name = "A"
- tm.assert_frame_equal(count_not_as, expected.reset_index())
- tm.assert_frame_equal(count_as, expected)
- count_B = df.groupby("A")["B"].count()
- tm.assert_series_equal(count_B, expected["B"])
- def test_count_object():
- df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3})
- result = df.groupby("c").a.count()
- expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
- tm.assert_series_equal(result, expected)
- df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
- result = df.groupby("c").a.count()
- expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
- tm.assert_series_equal(result, expected)
- def test_count_cross_type():
- # GH8169
- vals = np.hstack(
- (np.random.randint(0, 5, (100, 2)), np.random.randint(0, 2, (100, 2)))
- )
- df = DataFrame(vals, columns=["a", "b", "c", "d"])
- df[df == 2] = np.nan
- expected = df.groupby(["c", "d"]).count()
- for t in ["float32", "object"]:
- df["a"] = df["a"].astype(t)
- df["b"] = df["b"].astype(t)
- result = df.groupby(["c", "d"]).count()
- tm.assert_frame_equal(result, expected)
- def test_lower_int_prec_count():
- df = DataFrame(
- {
- "a": np.array([0, 1, 2, 100], np.int8),
- "b": np.array([1, 2, 3, 6], np.uint32),
- "c": np.array([4, 5, 6, 8], np.int16),
- "grp": list("ab" * 2),
- }
- )
- result = df.groupby("grp").count()
- expected = DataFrame(
- {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=Index(list("ab"), name="grp")
- )
- tm.assert_frame_equal(result, expected)
- def test_count_uses_size_on_exception():
- class RaisingObjectException(Exception):
- pass
- class RaisingObject:
- def __init__(self, msg="I will raise inside Cython") -> None:
- super().__init__()
- self.msg = msg
- def __eq__(self, other):
- # gets called in Cython to check that raising calls the method
- raise RaisingObjectException(self.msg)
- df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)})
- result = df.groupby("grp").count()
- expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp"))
- tm.assert_frame_equal(result, expected)
|