123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698 |
- from datetime import datetime
- import numpy as np
- import pytest
- import pandas as pd
- from pandas import (
- DataFrame,
- NaT,
- Series,
- concat,
- )
- import pandas._testing as tm
def test_rank_unordered_categorical_typeerror():
    """Ranking an unordered Categorical through groupby raises TypeError."""
    # GH#51034 should be TypeError, not NotImplementedError
    categorical = pd.Categorical([], ordered=False)
    series = Series(categorical)
    frame = series.to_frame()
    msg = "Cannot perform rank with non-ordered Categorical"

    # The Series-based and the DataFrame-based groupby must both raise.
    for obj in (series, frame):
        grouped = obj.groupby(categorical)
        with pytest.raises(TypeError, match=msg):
            grouped.rank()
def test_rank_apply():
    """Grouped ``rank`` must match ranking each group separately.

    Builds a 500-row frame keyed by two label columns, then compares
    ``groupby(...).value.rank`` against the concatenation of the
    per-group ``Series.rank`` results, for both plain and percentage ranks.
    """
    # A seeded generator keeps the data, and any failure, reproducible.
    rng = np.random.default_rng(2)

    # Distinct fixed-width labels; built with numpy only so the test does
    # not depend on private pandas testing helpers.
    lev1 = np.array([f"a{i:09d}" for i in range(100)], dtype=object)
    lev2 = np.array([f"b{i:09d}" for i in range(130)], dtype=object)

    lab1 = rng.integers(0, 100, size=500)
    lab2 = rng.integers(0, 130, size=500)

    df = DataFrame(
        {
            "value": rng.standard_normal(500),
            "key1": lev1.take(lab1),
            "key2": lev2.take(lab2),
        }
    )
    keys = ["key1", "key2"]

    for pct in (False, True):
        result = df.groupby(keys).value.rank(pct=pct)

        pieces = [piece.value.rank(pct=pct) for _, piece in df.groupby(keys)]
        # Group iteration order differs from row order, so realign first.
        expected = concat(pieces, axis=0).reindex(result.index)
        tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, 8, 2, 6], dtype=dtype)
        for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
    ]
    + [
        # The same 2/2/8/2/6 pattern expressed with datetime-like scalars:
        # naive Timestamp, tz-aware Timestamp, Timedelta and Period.
        [
            make(day)
            for day in [
                "2018-01-02",
                "2018-01-02",
                "2018-01-08",
                "2018-01-02",
                "2018-01-06",
            ]
        ]
        for make in [
            pd.Timestamp,
            lambda day: pd.Timestamp(day, tz="US/Pacific"),
            lambda day: pd.Timestamp(day) - pd.Timestamp(0),
            lambda day: pd.Timestamp(day).to_period("D"),
        ]
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,pct,exp",
    [
        # average
        ("average", True, False, [2.0, 2.0, 5.0, 2.0, 4.0]),
        ("average", True, True, [0.4, 0.4, 1.0, 0.4, 0.8]),
        ("average", False, False, [4.0, 4.0, 1.0, 4.0, 2.0]),
        ("average", False, True, [0.8, 0.8, 0.2, 0.8, 0.4]),
        # min
        ("min", True, False, [1.0, 1.0, 5.0, 1.0, 4.0]),
        ("min", True, True, [0.2, 0.2, 1.0, 0.2, 0.8]),
        ("min", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("min", False, True, [0.6, 0.6, 0.2, 0.6, 0.4]),
        # max
        ("max", True, False, [3.0, 3.0, 5.0, 3.0, 4.0]),
        ("max", True, True, [0.6, 0.6, 1.0, 0.6, 0.8]),
        ("max", False, False, [5.0, 5.0, 1.0, 5.0, 2.0]),
        ("max", False, True, [1.0, 1.0, 0.2, 1.0, 0.4]),
        # first
        ("first", True, False, [1.0, 2.0, 5.0, 3.0, 4.0]),
        ("first", True, True, [0.2, 0.4, 1.0, 0.6, 0.8]),
        ("first", False, False, [3.0, 4.0, 1.0, 5.0, 2.0]),
        ("first", False, True, [0.6, 0.8, 0.2, 1.0, 0.4]),
        # dense
        ("dense", True, False, [1.0, 1.0, 3.0, 1.0, 2.0]),
        ("dense", True, True, [1.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 2.0 / 3.0]),
        ("dense", False, False, [3.0, 3.0, 1.0, 3.0, 2.0]),
        ("dense", False, True, [3.0 / 3.0, 3.0 / 3.0, 1.0 / 3.0, 3.0 / 3.0, 2.0 / 3.0]),
    ],
)
def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
    """Check grouped rank for every tie method / direction / pct combination.

    ``vals`` is repeated once per group label in ``grps``, so every group
    holds the same values and is expected to produce the same ranks ``exp``.
    """
    n_groups = len(grps)
    key = np.repeat(grps, len(vals))

    tiled = list(vals) * n_groups
    if isinstance(vals, np.ndarray):
        # Going through a list drops the dtype; restore the parametrized one.
        tiled = np.array(tiled, dtype=vals.dtype)

    df = DataFrame({"key": key, "val": tiled})
    result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)

    exp_df = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals", [[-np.inf, -np.inf, np.nan, 1.0, np.nan, np.inf, np.inf]]
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,exp",
    [
        # average
        ("average", True, "keep", [1.5, 1.5, np.nan, 3, np.nan, 4.5, 4.5]),
        ("average", True, "top", [3.5, 3.5, 1.5, 5.0, 1.5, 6.5, 6.5]),
        ("average", True, "bottom", [1.5, 1.5, 6.5, 3.0, 6.5, 4.5, 4.5]),
        ("average", False, "keep", [4.5, 4.5, np.nan, 3, np.nan, 1.5, 1.5]),
        ("average", False, "top", [6.5, 6.5, 1.5, 5.0, 1.5, 3.5, 3.5]),
        ("average", False, "bottom", [4.5, 4.5, 6.5, 3.0, 6.5, 1.5, 1.5]),
        # min
        ("min", True, "keep", [1.0, 1.0, np.nan, 3.0, np.nan, 4.0, 4.0]),
        ("min", True, "top", [3.0, 3.0, 1.0, 5.0, 1.0, 6.0, 6.0]),
        ("min", True, "bottom", [1.0, 1.0, 6.0, 3.0, 6.0, 4.0, 4.0]),
        ("min", False, "keep", [4.0, 4.0, np.nan, 3.0, np.nan, 1.0, 1.0]),
        ("min", False, "top", [6.0, 6.0, 1.0, 5.0, 1.0, 3.0, 3.0]),
        ("min", False, "bottom", [4.0, 4.0, 6.0, 3.0, 6.0, 1.0, 1.0]),
        # max
        ("max", True, "keep", [2.0, 2.0, np.nan, 3.0, np.nan, 5.0, 5.0]),
        ("max", True, "top", [4.0, 4.0, 2.0, 5.0, 2.0, 7.0, 7.0]),
        ("max", True, "bottom", [2.0, 2.0, 7.0, 3.0, 7.0, 5.0, 5.0]),
        ("max", False, "keep", [5.0, 5.0, np.nan, 3.0, np.nan, 2.0, 2.0]),
        ("max", False, "top", [7.0, 7.0, 2.0, 5.0, 2.0, 4.0, 4.0]),
        ("max", False, "bottom", [5.0, 5.0, 7.0, 3.0, 7.0, 2.0, 2.0]),
        # first
        ("first", True, "keep", [1.0, 2.0, np.nan, 3.0, np.nan, 4.0, 5.0]),
        ("first", True, "top", [3.0, 4.0, 1.0, 5.0, 2.0, 6.0, 7.0]),
        ("first", True, "bottom", [1.0, 2.0, 6.0, 3.0, 7.0, 4.0, 5.0]),
        ("first", False, "keep", [4.0, 5.0, np.nan, 3.0, np.nan, 1.0, 2.0]),
        ("first", False, "top", [6.0, 7.0, 1.0, 5.0, 2.0, 3.0, 4.0]),
        ("first", False, "bottom", [4.0, 5.0, 6.0, 3.0, 7.0, 1.0, 2.0]),
        # dense
        ("dense", True, "keep", [1.0, 1.0, np.nan, 2.0, np.nan, 3.0, 3.0]),
        ("dense", True, "top", [2.0, 2.0, 1.0, 3.0, 1.0, 4.0, 4.0]),
        ("dense", True, "bottom", [1.0, 1.0, 4.0, 2.0, 4.0, 3.0, 3.0]),
        ("dense", False, "keep", [3.0, 3.0, np.nan, 2.0, np.nan, 1.0, 1.0]),
        ("dense", False, "top", [4.0, 4.0, 1.0, 3.0, 1.0, 2.0, 2.0]),
        ("dense", False, "bottom", [3.0, 3.0, 4.0, 2.0, 4.0, 1.0, 1.0]),
    ],
)
def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
    """Grouped rank of data mixing +/-inf with NaN, for each ``na_option``."""
    # GH 20561
    n_groups = len(grps)
    frame = DataFrame(
        {
            "key": np.repeat(grps, len(vals)),
            # every group receives an identical copy of ``vals``
            "val": vals * n_groups,
        }
    )

    ranked = frame.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option
    )

    tm.assert_frame_equal(ranked, DataFrame(exp * n_groups, columns=["val"]))
@pytest.mark.parametrize("grps", [["qux"], ["qux", "quux"]])
@pytest.mark.parametrize(
    "vals",
    [
        np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
        for dtype in ["f8", "f4", "f2"]
    ]
    + [
        # Same pattern with datetime-like scalars; ``None`` marks the slots
        # that hold ``np.nan`` in the generated list.
        [
            np.nan if day is None else make(day)
            for day in [
                "2018-01-02",
                "2018-01-02",
                None,
                "2018-01-08",
                "2018-01-02",
                "2018-01-06",
                None,
                None,
            ]
        ]
        for make in [
            pd.Timestamp,
            lambda day: pd.Timestamp(day, tz="US/Pacific"),
            lambda day: pd.Timestamp(day) - pd.Timestamp(0),
            lambda day: pd.Timestamp(day).to_period("D"),
        ]
    ],
    ids=lambda x: type(x[0]),
)
@pytest.mark.parametrize(
    "ties_method,ascending,na_option,pct,exp",
    [
        # ---- na_option="keep" ----
        (
            "average",
            True,
            "keep",
            False,
            [2.0, 2.0, np.nan, 5.0, 2.0, 4.0, np.nan, np.nan],
        ),
        (
            "average",
            True,
            "keep",
            True,
            [0.4, 0.4, np.nan, 1.0, 0.4, 0.8, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            False,
            [4.0, 4.0, np.nan, 1.0, 4.0, 2.0, np.nan, np.nan],
        ),
        (
            "average",
            False,
            "keep",
            True,
            [0.8, 0.8, np.nan, 0.2, 0.8, 0.4, np.nan, np.nan],
        ),
        ("min", True, "keep", False, [1.0, 1.0, np.nan, 5.0, 1.0, 4.0, np.nan, np.nan]),
        ("min", True, "keep", True, [0.2, 0.2, np.nan, 1.0, 0.2, 0.8, np.nan, np.nan]),
        (
            "min",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        ("min", False, "keep", True, [0.6, 0.6, np.nan, 0.2, 0.6, 0.4, np.nan, np.nan]),
        ("max", True, "keep", False, [3.0, 3.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan]),
        ("max", True, "keep", True, [0.6, 0.6, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan]),
        (
            "max",
            False,
            "keep",
            False,
            [5.0, 5.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        ("max", False, "keep", True, [1.0, 1.0, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan]),
        (
            "first",
            True,
            "keep",
            False,
            [1.0, 2.0, np.nan, 5.0, 3.0, 4.0, np.nan, np.nan],
        ),
        (
            "first",
            True,
            "keep",
            True,
            [0.2, 0.4, np.nan, 1.0, 0.6, 0.8, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            False,
            [3.0, 4.0, np.nan, 1.0, 5.0, 2.0, np.nan, np.nan],
        ),
        (
            "first",
            False,
            "keep",
            True,
            [0.6, 0.8, np.nan, 0.2, 1.0, 0.4, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            False,
            [1.0, 1.0, np.nan, 3.0, 1.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            True,
            "keep",
            True,
            [
                1.0 / 3.0,
                1.0 / 3.0,
                np.nan,
                3.0 / 3.0,
                1.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        (
            "dense",
            False,
            "keep",
            False,
            [3.0, 3.0, np.nan, 1.0, 3.0, 2.0, np.nan, np.nan],
        ),
        (
            "dense",
            False,
            "keep",
            True,
            [
                3.0 / 3.0,
                3.0 / 3.0,
                np.nan,
                1.0 / 3.0,
                3.0 / 3.0,
                2.0 / 3.0,
                np.nan,
                np.nan,
            ],
        ),
        # ---- na_option="bottom" ----
        ("average", True, "bottom", False, [2.0, 2.0, 7.0, 5.0, 2.0, 4.0, 7.0, 7.0]),
        (
            "average",
            True,
            "bottom",
            True,
            [0.25, 0.25, 0.875, 0.625, 0.25, 0.5, 0.875, 0.875],
        ),
        ("average", False, "bottom", False, [4.0, 4.0, 7.0, 1.0, 4.0, 2.0, 7.0, 7.0]),
        (
            "average",
            False,
            "bottom",
            True,
            [0.5, 0.5, 0.875, 0.125, 0.5, 0.25, 0.875, 0.875],
        ),
        ("min", True, "bottom", False, [1.0, 1.0, 6.0, 5.0, 1.0, 4.0, 6.0, 6.0]),
        (
            "min",
            True,
            "bottom",
            True,
            [0.125, 0.125, 0.75, 0.625, 0.125, 0.5, 0.75, 0.75],
        ),
        ("min", False, "bottom", False, [3.0, 3.0, 6.0, 1.0, 3.0, 2.0, 6.0, 6.0]),
        (
            "min",
            False,
            "bottom",
            True,
            [0.375, 0.375, 0.75, 0.125, 0.375, 0.25, 0.75, 0.75],
        ),
        ("max", True, "bottom", False, [3.0, 3.0, 8.0, 5.0, 3.0, 4.0, 8.0, 8.0]),
        ("max", True, "bottom", True, [0.375, 0.375, 1.0, 0.625, 0.375, 0.5, 1.0, 1.0]),
        ("max", False, "bottom", False, [5.0, 5.0, 8.0, 1.0, 5.0, 2.0, 8.0, 8.0]),
        (
            "max",
            False,
            "bottom",
            True,
            [0.625, 0.625, 1.0, 0.125, 0.625, 0.25, 1.0, 1.0],
        ),
        ("first", True, "bottom", False, [1.0, 2.0, 6.0, 5.0, 3.0, 4.0, 7.0, 8.0]),
        (
            "first",
            True,
            "bottom",
            True,
            [0.125, 0.25, 0.75, 0.625, 0.375, 0.5, 0.875, 1.0],
        ),
        ("first", False, "bottom", False, [3.0, 4.0, 6.0, 1.0, 5.0, 2.0, 7.0, 8.0]),
        (
            "first",
            False,
            "bottom",
            True,
            [0.375, 0.5, 0.75, 0.125, 0.625, 0.25, 0.875, 1.0],
        ),
        ("dense", True, "bottom", False, [1.0, 1.0, 4.0, 3.0, 1.0, 2.0, 4.0, 4.0]),
        ("dense", True, "bottom", True, [0.25, 0.25, 1.0, 0.75, 0.25, 0.5, 1.0, 1.0]),
        ("dense", False, "bottom", False, [3.0, 3.0, 4.0, 1.0, 3.0, 2.0, 4.0, 4.0]),
        ("dense", False, "bottom", True, [0.75, 0.75, 1.0, 0.25, 0.75, 0.5, 1.0, 1.0]),
    ],
)
def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
    """Check grouped rank on data containing missing values.

    ``vals`` is repeated once per group label in ``grps``; each group is
    expected to yield the ranks in ``exp``.
    """
    n_groups = len(grps)
    key = np.repeat(grps, len(vals))

    tiled = list(vals) * n_groups
    if isinstance(vals, np.ndarray):
        # Going through a list drops the dtype; restore the parametrized one.
        tiled = np.array(tiled, dtype=vals.dtype)

    df = DataFrame({"key": key, "val": tiled})
    result = df.groupby("key").rank(
        method=ties_method, ascending=ascending, na_option=na_option, pct=pct
    )

    exp_df = DataFrame(exp * n_groups, columns=["val"])
    tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize(
    "pct,exp", [(False, [3.0, 3.0, 3.0, 3.0, 3.0]), (True, [0.6, 0.6, 0.6, 0.6, 0.6])]
)
def test_rank_resets_each_group(pct, exp):
    """Two five-row groups of identical values each get the same ranks."""
    keys = ["a"] * 5 + ["b"] * 5
    frame = DataFrame({"key": keys, "val": [1] * 10})

    ranked = frame.groupby("key").rank(pct=pct)

    # ``exp`` describes one group; both groups must reproduce it.
    tm.assert_frame_equal(ranked, DataFrame(exp * 2, columns=["val"]))
@pytest.mark.parametrize(
    "dtype", ["int64", "int32", "uint64", "uint32", "float64", "float32"]
)
@pytest.mark.parametrize("upper", [True, False])
def test_rank_avg_even_vals(dtype, upper):
    """Four tied values in one group receive the average rank 2.5."""
    if upper:
        # use IntegerDtype/FloatingDtype
        # e.g. "uint64" -> "Uint64" -> "UInt64"
        dtype = (dtype[:1].upper() + dtype[1:]).replace("Ui", "UI")

    df = DataFrame({"key": ["a"] * 4, "val": [1] * 4})
    df["val"] = df["val"].astype(dtype)
    assert df["val"].dtype == dtype

    result = df.groupby("key").rank()

    exp_df = DataFrame({"val": [2.5] * 4})
    exp_df = exp_df.astype("Float64") if upper else exp_df
    tm.assert_frame_equal(result, exp_df)
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("na_option", ["keep", "top", "bottom"])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals", [["bar", "bar", "foo", "bar", "baz"], ["bar", np.nan, "foo", np.nan, "baz"]]
)
def test_rank_object_dtype(ties_method, ascending, na_option, pct, vals):
    """Ranking strings must agree with ranking equally-ordered numbers."""
    rank_kwargs = {
        "method": ties_method,
        "ascending": ascending,
        "na_option": na_option,
        "pct": pct,
    }

    df = DataFrame({"key": ["foo"] * 5, "val": vals})
    has_missing = df["val"].isna().any()
    res = df.groupby("key").rank(**rank_kwargs)

    # construct our expected by using numeric values with the same ordering
    stand_in = [0, np.nan, 2, np.nan, 1] if has_missing else [0, 0, 2, 0, 1]
    df2 = DataFrame({"key": ["foo"] * 5, "val": stand_in})
    alt = df2.groupby("key").rank(**rank_kwargs)

    tm.assert_frame_equal(res, alt)
@pytest.mark.parametrize("na_option", [True, "bad", 1])
@pytest.mark.parametrize("ties_method", ["average", "min", "max", "first", "dense"])
@pytest.mark.parametrize("ascending", [True, False])
@pytest.mark.parametrize("pct", [True, False])
@pytest.mark.parametrize(
    "vals",
    [
        ["bar", "bar", "foo", "bar", "baz"],
        ["bar", np.nan, "foo", np.nan, "baz"],
        [1, np.nan, 2, np.nan, 3],
    ],
)
def test_rank_naoption_raises(ties_method, ascending, na_option, pct, vals):
    """An unrecognised ``na_option`` makes grouped rank raise ValueError."""
    grouped = DataFrame({"key": ["foo"] * 5, "val": vals}).groupby("key")
    rank_kwargs = {
        "method": ties_method,
        "ascending": ascending,
        "na_option": na_option,
        "pct": pct,
    }

    msg = "na_option must be one of 'keep', 'top', or 'bottom'"
    with pytest.raises(ValueError, match=msg):
        grouped.rank(**rank_kwargs)
def test_rank_empty_group():
    """Percentage rank where one group contains only a missing value."""
    # see gh-22519
    column = "A"
    df = DataFrame({"A": [0, 1, 0], "B": [1.0, np.nan, 2.0]})
    pct_ranks = [0.5, np.nan, 1.0]

    # column selected from the groupby -> Series result
    as_series = df.groupby(column).B.rank(pct=True)
    tm.assert_series_equal(as_series, Series(pct_ranks, name="B"))

    # whole-frame groupby -> DataFrame result without the key column
    as_frame = df.groupby(column).rank(pct=True)
    tm.assert_frame_equal(as_frame, DataFrame({"B": pct_ranks}))
@pytest.mark.parametrize(
    "input_key,input_value,output_value",
    [
        ([1, 2], [1, 1], [1.0, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, 2], [0.5, 1.0, 0.5, 1.0]),
        ([1, 1, 2, 2], [1, 2, 1, np.nan], [0.5, 1.0, 1.0, np.nan]),
        ([1, 1, 2], [1, 2, np.nan], [0.5, 1.0, np.nan]),
    ],
)
def test_rank_zero_div(input_key, input_value, output_value):
    """Dense percentage rank, including groups whose values are all NaN."""
    # GH 23666
    frame = DataFrame({"A": input_key, "B": input_value})

    dense_pct = frame.groupby("A").rank(method="dense", pct=True)

    tm.assert_frame_equal(dense_pct, DataFrame({"B": output_value}))
def test_rank_min_int():
    """Grouped rank with the int64 minimum next to NaT datetimes.

    The expected frame ranks the smallest int64 as an ordinary value
    (rank 1.0), while ``NaT`` entries in the datetime column stay NaN.
    """
    # GH-32859
    int64_info = np.iinfo(np.int64)
    df = DataFrame(
        {
            "grp": [1, 1, 2],
            "int_col": [int64_info.min, int64_info.max, int64_info.min],
            "datetimelike": [NaT, datetime(2001, 1, 1), NaT],
        }
    )

    result = df.groupby("grp").rank()

    # ``np.nan`` rather than the ``np.NaN`` alias: the alias is not
    # available in NumPy 2.0+, and the rest of the file uses ``np.nan``.
    expected = DataFrame(
        {"int_col": [1.0, 2.0, 1.0], "datetimelike": [np.nan, 1.0, np.nan]}
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("use_nan", [True, False])
def test_rank_pct_equal_values_on_group_transition(use_nan):
    """Dense pct rank when equal values sit on either side of a group change."""
    # GH#40518
    fill_value = np.nan if use_nan else 3
    rows = [
        [-1, 1],
        [-1, 2],
        [1, fill_value],
        [-1, fill_value],
    ]
    df = DataFrame(rows, columns=["group", "val"])

    result = df.groupby(["group"])["val"].rank(method="dense", pct=True)

    expected_vals = [0.5, 1, np.nan, np.nan] if use_nan else [1 / 3, 2 / 3, 1, 1]
    tm.assert_series_equal(result, Series(expected_vals, name="val"))
def test_rank_multiindex():
    """Column-wise grouped rank on a frame with two-level columns."""
    # GH27721
    parts = {
        "a": DataFrame({"col1": [3, 4], "col2": [1, 2]}),
        "b": DataFrame({"col3": [5, 6], "col4": [7, 8]}),
    }
    df = concat(parts, axis=1)

    result = df.groupby(level=0, axis=1).rank(axis=1)

    # Rank each top-level column group on its own, then stitch them back
    # together under the same outer labels.
    per_group = [df[label].rank(axis=1) for label in ["a", "b"]]
    expected = concat(per_group, axis=1, keys=["a", "b"])
    tm.assert_frame_equal(result, expected)
def test_groupby_axis0_rank_axis1():
    """Row-grouped frame ranked across columns (``rank(axis=1)``)."""
    # GH#41320
    data = {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}
    df = DataFrame(data, index=["a", "a", "b", "b"])
    gb = df.groupby(level=0, axis=0)

    res = gb.rank(axis=1)

    # This should match what we get when "manually" operating group-by-group
    manual = [df.loc[label].rank(axis=1) for label in ["a", "b"]]
    expected = concat(manual, axis=0)
    tm.assert_frame_equal(res, expected)

    # check that we haven't accidentally written a case that coincidentally
    # matches rank(axis=0)
    assert not gb.rank(axis=0).equals(expected)
def test_groupby_axis0_cummax_axis1():
    """Row-grouped frame with ``cummax`` taken across columns."""
    # case where groupby axis is 0 and axis keyword in transform is 1
    # df has mixed dtype -> multiple blocks
    data = {0: [1, 3, 5, 7], 1: [2, 4, 6, 8], 2: [1.5, 3.5, 5.5, 7.5]}
    df = DataFrame(data, index=["a", "a", "b", "b"])

    cmax = df.groupby(level=0, axis=0).cummax(axis=1)

    # In every row column 1 holds the largest value, so the running
    # maximum at column 2 repeats column 1.
    expected = df[[0, 1]].astype(np.float64)
    expected[2] = expected[1]
    tm.assert_frame_equal(cmax, expected)
def test_non_unique_index():
    """Percentage rank when grouping by a repeated index plus a column."""
    # GH 16577
    stamp = pd.Timestamp("20170101", tz="US/Eastern")
    df = DataFrame({"A": [1.0, 2.0, 3.0, np.nan], "value": 1.0}, index=[stamp] * 4)

    result = df.groupby([df.index, "A"]).value.rank(ascending=True, pct=True)

    expected = Series([1.0, 1.0, 1.0, np.nan], index=[stamp] * 4, name="value")
    tm.assert_series_equal(result, expected)
def test_rank_categorical():
    """Ordered categoricals rank the same as their object-dtype values."""
    letters = pd.Categorical(["a", "a", "b", np.nan, "c", "b"], ordered=True)
    numbers = pd.Categorical([1, 2, 3, np.nan, 4, 5], ordered=True)
    df = DataFrame({"col1": [0, 1, 0, 1, 0, 1], "col2": letters, "col3": numbers})

    res = df.groupby("col1").rank()

    # Reference result: the same frame cast to object before grouping.
    expected = df.astype(object).groupby("col1").rank()
    tm.assert_frame_equal(res, expected)
|