123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824 |
- import numpy as np
- import pytest
- import pandas as pd
- from pandas import (
- DataFrame,
- Index,
- MultiIndex,
- Series,
- Timestamp,
- isna,
- )
- import pandas._testing as tm
- def test_first_last_nth(df):
- # tests for first / last / nth
- grouped = df.groupby("A")
- first = grouped.first()
- expected = df.loc[[1, 0], ["B", "C", "D"]]
- expected.index = Index(["bar", "foo"], name="A")
- expected = expected.sort_index()
- tm.assert_frame_equal(first, expected)
- nth = grouped.nth(0)
- expected = df.loc[[0, 1]]
- tm.assert_frame_equal(nth, expected)
- last = grouped.last()
- expected = df.loc[[5, 7], ["B", "C", "D"]]
- expected.index = Index(["bar", "foo"], name="A")
- tm.assert_frame_equal(last, expected)
- nth = grouped.nth(-1)
- expected = df.iloc[[5, 7]]
- tm.assert_frame_equal(nth, expected)
- nth = grouped.nth(1)
- expected = df.iloc[[2, 3]]
- tm.assert_frame_equal(nth, expected)
- # it works!
- grouped["B"].first()
- grouped["B"].last()
- grouped["B"].nth(0)
- df.loc[df["A"] == "foo", "B"] = np.nan
- assert isna(grouped["B"].first()["foo"])
- assert isna(grouped["B"].last()["foo"])
- assert isna(grouped["B"].nth(0).iloc[0])
- # v0.14.0 whatsnew
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
- g = df.groupby("A")
- result = g.first()
- expected = df.iloc[[1, 2]].set_index("A")
- tm.assert_frame_equal(result, expected)
- expected = df.iloc[[1, 2]]
- result = g.nth(0, dropna="any")
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("method", ["first", "last"])
- def test_first_last_with_na_object(method, nulls_fixture):
- # https://github.com/pandas-dev/pandas/issues/32123
- groups = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]}).groupby("a")
- result = getattr(groups, method)()
- if method == "first":
- values = [1, 3]
- else:
- values = [2, 3]
- values = np.array(values, dtype=result["b"].dtype)
- idx = Index([1, 2], name="a")
- expected = DataFrame({"b": values}, index=idx)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("index", [0, -1])
- def test_nth_with_na_object(index, nulls_fixture):
- # https://github.com/pandas-dev/pandas/issues/32123
- df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 2, 3, nulls_fixture]})
- groups = df.groupby("a")
- result = groups.nth(index)
- expected = df.iloc[[0, 2]] if index == 0 else df.iloc[[1, 3]]
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("method", ["first", "last"])
- def test_first_last_with_None(method):
- # https://github.com/pandas-dev/pandas/issues/32800
- # None should be preserved as object dtype
- df = DataFrame.from_dict({"id": ["a"], "value": [None]})
- groups = df.groupby("id", as_index=False)
- result = getattr(groups, method)()
- tm.assert_frame_equal(result, df)
- @pytest.mark.parametrize("method", ["first", "last"])
- @pytest.mark.parametrize(
- "df, expected",
- [
- (
- DataFrame({"id": "a", "value": [None, "foo", np.nan]}),
- DataFrame({"value": ["foo"]}, index=Index(["a"], name="id")),
- ),
- (
- DataFrame({"id": "a", "value": [np.nan]}, dtype=object),
- DataFrame({"value": [None]}, index=Index(["a"], name="id")),
- ),
- ],
- )
- def test_first_last_with_None_expanded(method, df, expected):
- # GH 32800, 38286
- result = getattr(df.groupby("id"), method)()
- tm.assert_frame_equal(result, expected)
- def test_first_last_nth_dtypes(df_mixed_floats):
- df = df_mixed_floats.copy()
- df["E"] = True
- df["F"] = 1
- # tests for first / last / nth
- grouped = df.groupby("A")
- first = grouped.first()
- expected = df.loc[[1, 0], ["B", "C", "D", "E", "F"]]
- expected.index = Index(["bar", "foo"], name="A")
- expected = expected.sort_index()
- tm.assert_frame_equal(first, expected)
- last = grouped.last()
- expected = df.loc[[5, 7], ["B", "C", "D", "E", "F"]]
- expected.index = Index(["bar", "foo"], name="A")
- expected = expected.sort_index()
- tm.assert_frame_equal(last, expected)
- nth = grouped.nth(1)
- expected = df.iloc[[2, 3]]
- tm.assert_frame_equal(nth, expected)
- # GH 2763, first/last shifting dtypes
- idx = list(range(10))
- idx.append(9)
- s = Series(data=range(11), index=idx, name="IntCol")
- assert s.dtype == "int64"
- f = s.groupby(level=0).first()
- assert f.dtype == "int64"
- def test_first_last_nth_nan_dtype():
- # GH 33591
- df = DataFrame({"data": ["A"], "nans": Series([np.nan], dtype=object)})
- grouped = df.groupby("data")
- expected = df.set_index("data").nans
- tm.assert_series_equal(grouped.nans.first(), expected)
- tm.assert_series_equal(grouped.nans.last(), expected)
- expected = df.nans
- tm.assert_series_equal(grouped.nans.nth(-1), expected)
- tm.assert_series_equal(grouped.nans.nth(0), expected)
- def test_first_strings_timestamps():
- # GH 11244
- test = DataFrame(
- {
- Timestamp("2012-01-01 00:00:00"): ["a", "b"],
- Timestamp("2012-01-02 00:00:00"): ["c", "d"],
- "name": ["e", "e"],
- "aaaa": ["f", "g"],
- }
- )
- result = test.groupby("name").first()
- expected = DataFrame(
- [["a", "c", "f"]],
- columns=Index([Timestamp("2012-01-01"), Timestamp("2012-01-02"), "aaaa"]),
- index=Index(["e"], name="name"),
- )
- tm.assert_frame_equal(result, expected)
- def test_nth():
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
- g = df.groupby("A")
- tm.assert_frame_equal(g.nth(0), df.iloc[[0, 2]])
- tm.assert_frame_equal(g.nth(1), df.iloc[[1]])
- tm.assert_frame_equal(g.nth(2), df.loc[[]])
- tm.assert_frame_equal(g.nth(-1), df.iloc[[1, 2]])
- tm.assert_frame_equal(g.nth(-2), df.iloc[[0]])
- tm.assert_frame_equal(g.nth(-3), df.loc[[]])
- tm.assert_series_equal(g.B.nth(0), df.B.iloc[[0, 2]])
- tm.assert_series_equal(g.B.nth(1), df.B.iloc[[1]])
- tm.assert_frame_equal(g[["B"]].nth(0), df[["B"]].iloc[[0, 2]])
- tm.assert_frame_equal(g.nth(0, dropna="any"), df.iloc[[1, 2]])
- tm.assert_frame_equal(g.nth(-1, dropna="any"), df.iloc[[1, 2]])
- tm.assert_frame_equal(g.nth(7, dropna="any"), df.iloc[:0])
- tm.assert_frame_equal(g.nth(2, dropna="any"), df.iloc[:0])
- # out of bounds, regression from 0.13.1
- # GH 6621
- df = DataFrame(
- {
- "color": {0: "green", 1: "green", 2: "red", 3: "red", 4: "red"},
- "food": {0: "ham", 1: "eggs", 2: "eggs", 3: "ham", 4: "pork"},
- "two": {
- 0: 1.5456590000000001,
- 1: -0.070345000000000005,
- 2: -2.4004539999999999,
- 3: 0.46206000000000003,
- 4: 0.52350799999999997,
- },
- "one": {
- 0: 0.56573799999999996,
- 1: -0.9742360000000001,
- 2: 1.033801,
- 3: -0.78543499999999999,
- 4: 0.70422799999999997,
- },
- }
- ).set_index(["color", "food"])
- result = df.groupby(level=0, as_index=False).nth(2)
- expected = df.iloc[[-1]]
- tm.assert_frame_equal(result, expected)
- result = df.groupby(level=0, as_index=False).nth(3)
- expected = df.loc[[]]
- tm.assert_frame_equal(result, expected)
- # GH 7559
- # from the vbench
- df = DataFrame(np.random.randint(1, 10, (100, 2)), dtype="int64")
- s = df[1]
- g = df[0]
- expected = s.groupby(g).first()
- expected2 = s.groupby(g).apply(lambda x: x.iloc[0])
- tm.assert_series_equal(expected2, expected, check_names=False)
- assert expected.name == 1
- assert expected2.name == 1
- # validate first
- v = s[g == 1].iloc[0]
- assert expected.iloc[0] == v
- assert expected2.iloc[0] == v
- with pytest.raises(ValueError, match="For a DataFrame"):
- s.groupby(g, sort=False).nth(0, dropna=True)
- # doc example
- df = DataFrame([[1, np.nan], [1, 4], [5, 6]], columns=["A", "B"])
- g = df.groupby("A")
- result = g.B.nth(0, dropna="all")
- expected = df.B.iloc[[1, 2]]
- tm.assert_series_equal(result, expected)
- # test multiple nth values
- df = DataFrame([[1, np.nan], [1, 3], [1, 4], [5, 6], [5, 7]], columns=["A", "B"])
- g = df.groupby("A")
- tm.assert_frame_equal(g.nth(0), df.iloc[[0, 3]])
- tm.assert_frame_equal(g.nth([0]), df.iloc[[0, 3]])
- tm.assert_frame_equal(g.nth([0, 1]), df.iloc[[0, 1, 3, 4]])
- tm.assert_frame_equal(g.nth([0, -1]), df.iloc[[0, 2, 3, 4]])
- tm.assert_frame_equal(g.nth([0, 1, 2]), df.iloc[[0, 1, 2, 3, 4]])
- tm.assert_frame_equal(g.nth([0, 1, -1]), df.iloc[[0, 1, 2, 3, 4]])
- tm.assert_frame_equal(g.nth([2]), df.iloc[[2]])
- tm.assert_frame_equal(g.nth([3, 4]), df.loc[[]])
- business_dates = pd.date_range(start="4/1/2014", end="6/30/2014", freq="B")
- df = DataFrame(1, index=business_dates, columns=["a", "b"])
- # get the first, fourth and last two business days for each month
- key = [df.index.year, df.index.month]
- result = df.groupby(key, as_index=False).nth([0, 3, -2, -1])
- expected_dates = pd.to_datetime(
- [
- "2014/4/1",
- "2014/4/4",
- "2014/4/29",
- "2014/4/30",
- "2014/5/1",
- "2014/5/6",
- "2014/5/29",
- "2014/5/30",
- "2014/6/2",
- "2014/6/5",
- "2014/6/27",
- "2014/6/30",
- ]
- )
- expected = DataFrame(1, columns=["a", "b"], index=expected_dates)
- tm.assert_frame_equal(result, expected)
- def test_nth_multi_grouper(three_group):
- # PR 9090, related to issue 8979
- # test nth on multiple groupers
- grouped = three_group.groupby(["A", "B"])
- result = grouped.nth(0)
- expected = three_group.iloc[[0, 3, 4, 7]]
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "data, expected_first, expected_last",
- [
- (
- {
- "id": ["A"],
- "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
- "foo": [1],
- },
- {
- "id": ["A"],
- "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
- "foo": [1],
- },
- {
- "id": ["A"],
- "time": Timestamp("2012-02-01 14:00:00", tz="US/Central"),
- "foo": [1],
- },
- ),
- (
- {
- "id": ["A", "B", "A"],
- "time": [
- Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
- Timestamp("2012-02-01 14:00:00", tz="US/Central"),
- Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
- ],
- "foo": [1, 2, 3],
- },
- {
- "id": ["A", "B"],
- "time": [
- Timestamp("2012-01-01 13:00:00", tz="America/New_York"),
- Timestamp("2012-02-01 14:00:00", tz="US/Central"),
- ],
- "foo": [1, 2],
- },
- {
- "id": ["A", "B"],
- "time": [
- Timestamp("2012-03-01 12:00:00", tz="Europe/London"),
- Timestamp("2012-02-01 14:00:00", tz="US/Central"),
- ],
- "foo": [3, 2],
- },
- ),
- ],
- )
- def test_first_last_tz(data, expected_first, expected_last):
- # GH15884
- # Test that the timezone is retained when calling first
- # or last on groupby with as_index=False
- df = DataFrame(data)
- result = df.groupby("id", as_index=False).first()
- expected = DataFrame(expected_first)
- cols = ["id", "time", "foo"]
- tm.assert_frame_equal(result[cols], expected[cols])
- result = df.groupby("id", as_index=False)["time"].first()
- tm.assert_frame_equal(result, expected[["id", "time"]])
- result = df.groupby("id", as_index=False).last()
- expected = DataFrame(expected_last)
- cols = ["id", "time", "foo"]
- tm.assert_frame_equal(result[cols], expected[cols])
- result = df.groupby("id", as_index=False)["time"].last()
- tm.assert_frame_equal(result, expected[["id", "time"]])
- @pytest.mark.parametrize(
- "method, ts, alpha",
- [
- ["first", Timestamp("2013-01-01", tz="US/Eastern"), "a"],
- ["last", Timestamp("2013-01-02", tz="US/Eastern"), "b"],
- ],
- )
- def test_first_last_tz_multi_column(method, ts, alpha):
- # GH 21603
- category_string = Series(list("abc")).astype("category")
- df = DataFrame(
- {
- "group": [1, 1, 2],
- "category_string": category_string,
- "datetimetz": pd.date_range("20130101", periods=3, tz="US/Eastern"),
- }
- )
- result = getattr(df.groupby("group"), method)()
- expected = DataFrame(
- {
- "category_string": pd.Categorical(
- [alpha, "c"], dtype=category_string.dtype
- ),
- "datetimetz": [ts, Timestamp("2013-01-03", tz="US/Eastern")],
- },
- index=Index([1, 2], name="group"),
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "values",
- [
- pd.array([True, False], dtype="boolean"),
- pd.array([1, 2], dtype="Int64"),
- pd.to_datetime(["2020-01-01", "2020-02-01"]),
- pd.to_timedelta([1, 2], unit="D"),
- ],
- )
- @pytest.mark.parametrize("function", ["first", "last", "min", "max"])
- def test_first_last_extension_array_keeps_dtype(values, function):
- # https://github.com/pandas-dev/pandas/issues/33071
- # https://github.com/pandas-dev/pandas/issues/32194
- df = DataFrame({"a": [1, 2], "b": values})
- grouped = df.groupby("a")
- idx = Index([1, 2], name="a")
- expected_series = Series(values, name="b", index=idx)
- expected_frame = DataFrame({"b": values}, index=idx)
- result_series = getattr(grouped["b"], function)()
- tm.assert_series_equal(result_series, expected_series)
- result_frame = grouped.agg({"b": function})
- tm.assert_frame_equal(result_frame, expected_frame)
- def test_nth_multi_index_as_expected():
- # PR 9090, related to issue 8979
- # test nth on MultiIndex
- three_group = DataFrame(
- {
- "A": [
- "foo",
- "foo",
- "foo",
- "foo",
- "bar",
- "bar",
- "bar",
- "bar",
- "foo",
- "foo",
- "foo",
- ],
- "B": [
- "one",
- "one",
- "one",
- "two",
- "one",
- "one",
- "one",
- "two",
- "two",
- "two",
- "one",
- ],
- "C": [
- "dull",
- "dull",
- "shiny",
- "dull",
- "dull",
- "shiny",
- "shiny",
- "dull",
- "shiny",
- "shiny",
- "shiny",
- ],
- }
- )
- grouped = three_group.groupby(["A", "B"])
- result = grouped.nth(0)
- expected = three_group.iloc[[0, 3, 4, 7]]
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "op, n, expected_rows",
- [
- ("head", -1, [0]),
- ("head", 0, []),
- ("head", 1, [0, 2]),
- ("head", 7, [0, 1, 2]),
- ("tail", -1, [1]),
- ("tail", 0, []),
- ("tail", 1, [1, 2]),
- ("tail", 7, [0, 1, 2]),
- ],
- )
- @pytest.mark.parametrize("columns", [None, [], ["A"], ["B"], ["A", "B"]])
- @pytest.mark.parametrize("as_index", [True, False])
- def test_groupby_head_tail(op, n, expected_rows, columns, as_index):
- df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
- g = df.groupby("A", as_index=as_index)
- expected = df.iloc[expected_rows]
- if columns is not None:
- g = g[columns]
- expected = expected[columns]
- result = getattr(g, op)(n)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "op, n, expected_cols",
- [
- ("head", -1, [0]),
- ("head", 0, []),
- ("head", 1, [0, 2]),
- ("head", 7, [0, 1, 2]),
- ("tail", -1, [1]),
- ("tail", 0, []),
- ("tail", 1, [1, 2]),
- ("tail", 7, [0, 1, 2]),
- ],
- )
- def test_groupby_head_tail_axis_1(op, n, expected_cols):
- # GH 9772
- df = DataFrame(
- [[1, 2, 3], [1, 4, 5], [2, 6, 7], [3, 8, 9]], columns=["A", "B", "C"]
- )
- g = df.groupby([0, 0, 1], axis=1)
- expected = df.iloc[:, expected_cols]
- result = getattr(g, op)(n)
- tm.assert_frame_equal(result, expected)
- def test_group_selection_cache():
- # GH 12839 nth, head, and tail should return same result consistently
- df = DataFrame([[1, 2], [1, 4], [5, 6]], columns=["A", "B"])
- expected = df.iloc[[0, 2]]
- g = df.groupby("A")
- result1 = g.head(n=2)
- result2 = g.nth(0)
- tm.assert_frame_equal(result1, df)
- tm.assert_frame_equal(result2, expected)
- g = df.groupby("A")
- result1 = g.tail(n=2)
- result2 = g.nth(0)
- tm.assert_frame_equal(result1, df)
- tm.assert_frame_equal(result2, expected)
- g = df.groupby("A")
- result1 = g.nth(0)
- result2 = g.head(n=2)
- tm.assert_frame_equal(result1, expected)
- tm.assert_frame_equal(result2, df)
- g = df.groupby("A")
- result1 = g.nth(0)
- result2 = g.tail(n=2)
- tm.assert_frame_equal(result1, expected)
- tm.assert_frame_equal(result2, df)
- def test_nth_empty():
- # GH 16064
- df = DataFrame(index=[0], columns=["a", "b", "c"])
- result = df.groupby("a").nth(10)
- expected = df.iloc[:0]
- tm.assert_frame_equal(result, expected)
- result = df.groupby(["a", "b"]).nth(10)
- expected = df.iloc[:0]
- tm.assert_frame_equal(result, expected)
- def test_nth_column_order():
- # GH 20760
- # Check that nth preserves column order
- df = DataFrame(
- [[1, "b", 100], [1, "a", 50], [1, "a", np.nan], [2, "c", 200], [2, "d", 150]],
- columns=["A", "C", "B"],
- )
- result = df.groupby("A").nth(0)
- expected = df.iloc[[0, 3]]
- tm.assert_frame_equal(result, expected)
- result = df.groupby("A").nth(-1, dropna="any")
- expected = df.iloc[[1, 4]]
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("dropna", [None, "any", "all"])
- def test_nth_nan_in_grouper(dropna):
- # GH 26011
- df = DataFrame(
- {
- "a": [np.nan, "a", np.nan, "b", np.nan],
- "b": [0, 2, 4, 6, 8],
- "c": [1, 3, 5, 7, 9],
- }
- )
- result = df.groupby("a").nth(0, dropna=dropna)
- expected = df.iloc[[1, 3]]
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("dropna", [None, "any", "all"])
- def test_nth_nan_in_grouper_series(dropna):
- # GH 26454
- df = DataFrame(
- {
- "a": [np.nan, "a", np.nan, "b", np.nan],
- "b": [0, 2, 4, 6, 8],
- }
- )
- result = df.groupby("a")["b"].nth(0, dropna=dropna)
- expected = df["b"].iloc[[1, 3]]
- tm.assert_series_equal(result, expected)
- def test_first_categorical_and_datetime_data_nat():
- # GH 20520
- df = DataFrame(
- {
- "group": ["first", "first", "second", "third", "third"],
- "time": 5 * [np.datetime64("NaT")],
- "categories": Series(["a", "b", "c", "a", "b"], dtype="category"),
- }
- )
- result = df.groupby("group").first()
- expected = DataFrame(
- {
- "time": 3 * [np.datetime64("NaT")],
- "categories": Series(["a", "c", "a"]).astype(
- pd.CategoricalDtype(["a", "b", "c"])
- ),
- }
- )
- expected.index = Index(["first", "second", "third"], name="group")
- tm.assert_frame_equal(result, expected)
- def test_first_multi_key_groupby_categorical():
- # GH 22512
- df = DataFrame(
- {
- "A": [1, 1, 1, 2, 2],
- "B": [100, 100, 200, 100, 100],
- "C": ["apple", "orange", "mango", "mango", "orange"],
- "D": ["jupiter", "mercury", "mars", "venus", "venus"],
- }
- )
- df = df.astype({"D": "category"})
- result = df.groupby(by=["A", "B"]).first()
- expected = DataFrame(
- {
- "C": ["apple", "mango", "mango"],
- "D": Series(["jupiter", "mars", "venus"]).astype(
- pd.CategoricalDtype(["jupiter", "mars", "mercury", "venus"])
- ),
- }
- )
- expected.index = MultiIndex.from_tuples(
- [(1, 100), (1, 200), (2, 100)], names=["A", "B"]
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("method", ["first", "last", "nth"])
- def test_groupby_last_first_nth_with_none(method, nulls_fixture):
- # GH29645
- expected = Series(["y"])
- data = Series(
- [nulls_fixture, nulls_fixture, nulls_fixture, "y", nulls_fixture],
- index=[0, 0, 0, 0, 0],
- ).groupby(level=0)
- if method == "nth":
- result = getattr(data, method)(3)
- else:
- result = getattr(data, method)()
- tm.assert_series_equal(result, expected)
- @pytest.mark.parametrize(
- "arg, expected_rows",
- [
- [slice(None, 3, 2), [0, 1, 4, 5]],
- [slice(None, -2), [0, 2, 5]],
- [[slice(None, 2), slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
- [[0, 1, slice(-2, None)], [0, 1, 2, 3, 4, 6, 7]],
- ],
- )
- def test_slice(slice_test_df, slice_test_grouped, arg, expected_rows):
- # Test slices GH #42947
- result = slice_test_grouped.nth[arg]
- equivalent = slice_test_grouped.nth(arg)
- expected = slice_test_df.iloc[expected_rows]
- tm.assert_frame_equal(result, expected)
- tm.assert_frame_equal(equivalent, expected)
- def test_nth_indexed(slice_test_df, slice_test_grouped):
- # Test index notation GH #44688
- result = slice_test_grouped.nth[0, 1, -2:]
- equivalent = slice_test_grouped.nth([0, 1, slice(-2, None)])
- expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
- tm.assert_frame_equal(result, expected)
- tm.assert_frame_equal(equivalent, expected)
- def test_invalid_argument(slice_test_grouped):
- # Test for error on invalid argument
- with pytest.raises(TypeError, match="Invalid index"):
- slice_test_grouped.nth(3.14)
- def test_negative_step(slice_test_grouped):
- # Test for error on negative slice step
- with pytest.raises(ValueError, match="Invalid step"):
- slice_test_grouped.nth(slice(None, None, -1))
- def test_np_ints(slice_test_df, slice_test_grouped):
- # Test np ints work
- result = slice_test_grouped.nth(np.array([0, 1]))
- expected = slice_test_df.iloc[[0, 1, 2, 3, 4]]
- tm.assert_frame_equal(result, expected)
- def test_groupby_nth_with_column_axis():
- # GH43926
- df = DataFrame(
- [
- [4, 5, 6],
- [8, 8, 7],
- ],
- index=["z", "y"],
- columns=["C", "B", "A"],
- )
- result = df.groupby(df.iloc[1], axis=1).nth(0)
- expected = df.iloc[:, [0, 2]]
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "start, stop, expected_values, expected_columns",
- [
- (None, None, [0, 1, 2, 3, 4], list("ABCDE")),
- (None, 1, [0, 3], list("AD")),
- (None, 9, [0, 1, 2, 3, 4], list("ABCDE")),
- (None, -1, [0, 1, 3], list("ABD")),
- (1, None, [1, 2, 4], list("BCE")),
- (1, -1, [1], list("B")),
- (-1, None, [2, 4], list("CE")),
- (-1, 2, [4], list("E")),
- ],
- )
- @pytest.mark.parametrize("method", ["call", "index"])
- def test_nth_slices_with_column_axis(
- start, stop, expected_values, expected_columns, method
- ):
- df = DataFrame([range(5)], columns=[list("ABCDE")])
- gb = df.groupby([5, 5, 5, 6, 6], axis=1)
- result = {
- "call": lambda start, stop: gb.nth(slice(start, stop)),
- "index": lambda start, stop: gb.nth[start:stop],
- }[method](start, stop)
- expected = DataFrame([expected_values], columns=[expected_columns])
- tm.assert_frame_equal(result, expected)
- @pytest.mark.filterwarnings(
- "ignore:invalid value encountered in remainder:RuntimeWarning"
- )
- def test_head_tail_dropna_true():
- # GH#45089
- df = DataFrame(
- [["a", "z"], ["b", np.nan], ["c", np.nan], ["c", np.nan]], columns=["X", "Y"]
- )
- expected = DataFrame([["a", "z"]], columns=["X", "Y"])
- result = df.groupby(["X", "Y"]).head(n=1)
- tm.assert_frame_equal(result, expected)
- result = df.groupby(["X", "Y"]).tail(n=1)
- tm.assert_frame_equal(result, expected)
- result = df.groupby(["X", "Y"]).nth(n=0)
- tm.assert_frame_equal(result, expected)
- def test_head_tail_dropna_false():
- # GH#45089
- df = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
- expected = DataFrame([["a", "z"], ["b", np.nan], ["c", np.nan]], columns=["X", "Y"])
- result = df.groupby(["X", "Y"], dropna=False).head(n=1)
- tm.assert_frame_equal(result, expected)
- result = df.groupby(["X", "Y"], dropna=False).tail(n=1)
- tm.assert_frame_equal(result, expected)
- result = df.groupby(["X", "Y"], dropna=False).nth(n=0)
- tm.assert_frame_equal(result, expected)
|