123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556 |
- import re
- import numpy as np
- import pytest
- from pandas.core.dtypes.common import is_categorical_dtype
- import pandas as pd
- from pandas import (
- Categorical,
- CategoricalIndex,
- DataFrame,
- Index,
- Interval,
- Series,
- Timedelta,
- Timestamp,
- )
- import pandas._testing as tm
- from pandas.api.types import CategoricalDtype as CDT
@pytest.fixture
def df():
    """
    Six-row frame with one int64 column "A" holding 0..5.

    The index is a CategoricalIndex named "B" with labels a, a, b, b, c, a
    and categories listed as c, a, b.
    """
    cat_dtype = CDT(list("cab"))
    labels = CategoricalIndex(list("aabbca"), dtype=cat_dtype, name="B")
    column_a = np.arange(6, dtype="int64")
    return DataFrame({"A": column_a}, index=labels)
@pytest.fixture
def df2():
    """
    Six-row frame with one int64 column "A" holding 0..5.

    The index is a CategoricalIndex named "B" with labels a, a, b, b, c, a
    and categories listed as c, a, b, e; the category "e" does not occur
    among the index values.
    """
    cat_dtype = CDT(list("cabe"))
    labels = CategoricalIndex(list("aabbca"), dtype=cat_dtype, name="B")
    column_a = np.arange(6, dtype="int64")
    return DataFrame({"A": column_a}, index=labels)
- class TestCategoricalIndex:
- def test_loc_scalar(self, df):
- dtype = CDT(list("cab"))
- result = df.loc["a"]
- bidx = Series(list("aaa"), name="B").astype(dtype)
- assert bidx.dtype == dtype
- expected = DataFrame({"A": [0, 1, 5]}, index=Index(bidx))
- tm.assert_frame_equal(result, expected)
- df = df.copy()
- df.loc["a"] = 20
- bidx2 = Series(list("aabbca"), name="B").astype(dtype)
- assert bidx2.dtype == dtype
- expected = DataFrame(
- {
- "A": [20, 20, 2, 3, 4, 20],
- },
- index=Index(bidx2),
- )
- tm.assert_frame_equal(df, expected)
- # value not in the categories
- with pytest.raises(KeyError, match=r"^'d'$"):
- df.loc["d"]
- df2 = df.copy()
- expected = df2.copy()
- expected.index = expected.index.astype(object)
- expected.loc["d"] = 10
- df2.loc["d"] = 10
- tm.assert_frame_equal(df2, expected)
- def test_loc_setitem_with_expansion_non_category(self, df):
- # Setting-with-expansion with a new key "d" that is not among caegories
- df.loc["a"] = 20
- # Setting a new row on an existing column
- df3 = df.copy()
- df3.loc["d", "A"] = 10
- bidx3 = Index(list("aabbcad"), name="B")
- expected3 = DataFrame(
- {
- "A": [20, 20, 2, 3, 4, 20, 10.0],
- },
- index=Index(bidx3),
- )
- tm.assert_frame_equal(df3, expected3)
- # Settig a new row _and_ new column
- df4 = df.copy()
- df4.loc["d", "C"] = 10
- expected3 = DataFrame(
- {
- "A": [20, 20, 2, 3, 4, 20, np.nan],
- "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 10],
- },
- index=Index(bidx3),
- )
- tm.assert_frame_equal(df4, expected3)
- def test_loc_getitem_scalar_non_category(self, df):
- with pytest.raises(KeyError, match="^1$"):
- df.loc[1]
- def test_slicing(self):
- cat = Series(Categorical([1, 2, 3, 4]))
- reverse = cat[::-1]
- exp = np.array([4, 3, 2, 1], dtype=np.int64)
- tm.assert_numpy_array_equal(reverse.__array__(), exp)
- df = DataFrame({"value": (np.arange(100) + 1).astype("int64")})
- df["D"] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])
- expected = Series([11, Interval(0, 25)], index=["value", "D"], name=10)
- result = df.iloc[10]
- tm.assert_series_equal(result, expected)
- expected = DataFrame(
- {"value": np.arange(11, 21).astype("int64")},
- index=np.arange(10, 20).astype("int64"),
- )
- expected["D"] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
- result = df.iloc[10:20]
- tm.assert_frame_equal(result, expected)
- expected = Series([9, Interval(0, 25)], index=["value", "D"], name=8)
- result = df.loc[8]
- tm.assert_series_equal(result, expected)
- def test_slicing_and_getting_ops(self):
- # systematically test the slicing operations:
- # for all slicing ops:
- # - returning a dataframe
- # - returning a column
- # - returning a row
- # - returning a single value
- cats = Categorical(
- ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"]
- )
- idx = Index(["h", "i", "j", "k", "l", "m", "n"])
- values = [1, 2, 3, 4, 5, 6, 7]
- df = DataFrame({"cats": cats, "values": values}, index=idx)
- # the expected values
- cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
- idx2 = Index(["j", "k"])
- values2 = [3, 4]
- # 2:4,: | "j":"k",:
- exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)
- # :,"cats" | :,0
- exp_col = Series(cats, index=idx, name="cats")
- # "j",: | 2,:
- exp_row = Series(["b", 3], index=["cats", "values"], dtype="object", name="j")
- # "j","cats | 2,0
- exp_val = "b"
- # iloc
- # frame
- res_df = df.iloc[2:4, :]
- tm.assert_frame_equal(res_df, exp_df)
- assert is_categorical_dtype(res_df["cats"].dtype)
- # row
- res_row = df.iloc[2, :]
- tm.assert_series_equal(res_row, exp_row)
- assert isinstance(res_row["cats"], str)
- # col
- res_col = df.iloc[:, 0]
- tm.assert_series_equal(res_col, exp_col)
- assert is_categorical_dtype(res_col.dtype)
- # single value
- res_val = df.iloc[2, 0]
- assert res_val == exp_val
- # loc
- # frame
- res_df = df.loc["j":"k", :]
- tm.assert_frame_equal(res_df, exp_df)
- assert is_categorical_dtype(res_df["cats"].dtype)
- # row
- res_row = df.loc["j", :]
- tm.assert_series_equal(res_row, exp_row)
- assert isinstance(res_row["cats"], str)
- # col
- res_col = df.loc[:, "cats"]
- tm.assert_series_equal(res_col, exp_col)
- assert is_categorical_dtype(res_col.dtype)
- # single value
- res_val = df.loc["j", "cats"]
- assert res_val == exp_val
- # single value
- res_val = df.loc["j", df.columns[0]]
- assert res_val == exp_val
- # iat
- res_val = df.iat[2, 0]
- assert res_val == exp_val
- # at
- res_val = df.at["j", "cats"]
- assert res_val == exp_val
- # fancy indexing
- exp_fancy = df.iloc[[2]]
- res_fancy = df[df["cats"] == "b"]
- tm.assert_frame_equal(res_fancy, exp_fancy)
- res_fancy = df[df["values"] == 3]
- tm.assert_frame_equal(res_fancy, exp_fancy)
- # get_value
- res_val = df.at["j", "cats"]
- assert res_val == exp_val
- # i : int, slice, or sequence of integers
- res_row = df.iloc[2]
- tm.assert_series_equal(res_row, exp_row)
- assert isinstance(res_row["cats"], str)
- res_df = df.iloc[slice(2, 4)]
- tm.assert_frame_equal(res_df, exp_df)
- assert is_categorical_dtype(res_df["cats"].dtype)
- res_df = df.iloc[[2, 3]]
- tm.assert_frame_equal(res_df, exp_df)
- assert is_categorical_dtype(res_df["cats"].dtype)
- res_col = df.iloc[:, 0]
- tm.assert_series_equal(res_col, exp_col)
- assert is_categorical_dtype(res_col.dtype)
- res_df = df.iloc[:, slice(0, 2)]
- tm.assert_frame_equal(res_df, df)
- assert is_categorical_dtype(res_df["cats"].dtype)
- res_df = df.iloc[:, [0, 1]]
- tm.assert_frame_equal(res_df, df)
- assert is_categorical_dtype(res_df["cats"].dtype)
- def test_slicing_doc_examples(self):
- # GH 7918
- cats = Categorical(
- ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"]
- )
- idx = Index(["h", "i", "j", "k", "l", "m", "n"])
- values = [1, 2, 2, 2, 3, 4, 5]
- df = DataFrame({"cats": cats, "values": values}, index=idx)
- result = df.iloc[2:4, :]
- expected = DataFrame(
- {
- "cats": Categorical(["b", "b"], categories=["a", "b", "c"]),
- "values": [2, 2],
- },
- index=["j", "k"],
- )
- tm.assert_frame_equal(result, expected)
- result = df.iloc[2:4, :].dtypes
- expected = Series(["category", "int64"], ["cats", "values"])
- tm.assert_series_equal(result, expected)
- result = df.loc["h":"j", "cats"]
- expected = Series(
- Categorical(["a", "b", "b"], categories=["a", "b", "c"]),
- index=["h", "i", "j"],
- name="cats",
- )
- tm.assert_series_equal(result, expected)
- result = df.loc["h":"j", df.columns[0:1]]
- expected = DataFrame(
- {"cats": Categorical(["a", "b", "b"], categories=["a", "b", "c"])},
- index=["h", "i", "j"],
- )
- tm.assert_frame_equal(result, expected)
- def test_loc_getitem_listlike_labels(self, df):
- # list of labels
- result = df.loc[["c", "a"]]
- expected = df.iloc[[4, 0, 1, 5]]
- tm.assert_frame_equal(result, expected, check_index_type=True)
- def test_loc_getitem_listlike_unused_category(self, df2):
- # GH#37901 a label that is in index.categories but not in index
- # listlike containing an element in the categories but not in the values
- with pytest.raises(KeyError, match=re.escape("['e'] not in index")):
- df2.loc[["a", "b", "e"]]
- def test_loc_getitem_label_unused_category(self, df2):
- # element in the categories but not in the values
- with pytest.raises(KeyError, match=r"^'e'$"):
- df2.loc["e"]
- def test_loc_getitem_non_category(self, df2):
- # not all labels in the categories
- with pytest.raises(KeyError, match=re.escape("['d'] not in index")):
- df2.loc[["a", "d"]]
- def test_loc_setitem_expansion_label_unused_category(self, df2):
- # assigning with a label that is in the categories but not in the index
- df = df2.copy()
- df.loc["e"] = 20
- result = df.loc[["a", "b", "e"]]
- exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B")
- expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index)
- tm.assert_frame_equal(result, expected)
- def test_loc_listlike_dtypes(self):
- # GH 11586
- # unique categories and codes
- index = CategoricalIndex(["a", "b", "c"])
- df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)
- # unique slice
- res = df.loc[["a", "b"]]
- exp_index = CategoricalIndex(["a", "b"], categories=index.categories)
- exp = DataFrame({"A": [1, 2], "B": [4, 5]}, index=exp_index)
- tm.assert_frame_equal(res, exp, check_index_type=True)
- # duplicated slice
- res = df.loc[["a", "a", "b"]]
- exp_index = CategoricalIndex(["a", "a", "b"], categories=index.categories)
- exp = DataFrame({"A": [1, 1, 2], "B": [4, 4, 5]}, index=exp_index)
- tm.assert_frame_equal(res, exp, check_index_type=True)
- with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
- df.loc[["a", "x"]]
- def test_loc_listlike_dtypes_duplicated_categories_and_codes(self):
- # duplicated categories and codes
- index = CategoricalIndex(["a", "b", "a"])
- df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=index)
- # unique slice
- res = df.loc[["a", "b"]]
- exp = DataFrame(
- {"A": [1, 3, 2], "B": [4, 6, 5]}, index=CategoricalIndex(["a", "a", "b"])
- )
- tm.assert_frame_equal(res, exp, check_index_type=True)
- # duplicated slice
- res = df.loc[["a", "a", "b"]]
- exp = DataFrame(
- {"A": [1, 3, 1, 3, 2], "B": [4, 6, 4, 6, 5]},
- index=CategoricalIndex(["a", "a", "a", "a", "b"]),
- )
- tm.assert_frame_equal(res, exp, check_index_type=True)
- with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
- df.loc[["a", "x"]]
- def test_loc_listlike_dtypes_unused_category(self):
- # contains unused category
- index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
- df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)
- res = df.loc[["a", "b"]]
- exp = DataFrame(
- {"A": [1, 3, 2], "B": [5, 7, 6]},
- index=CategoricalIndex(["a", "a", "b"], categories=list("abcde")),
- )
- tm.assert_frame_equal(res, exp, check_index_type=True)
- # duplicated slice
- res = df.loc[["a", "a", "b"]]
- exp = DataFrame(
- {"A": [1, 3, 1, 3, 2], "B": [5, 7, 5, 7, 6]},
- index=CategoricalIndex(["a", "a", "a", "a", "b"], categories=list("abcde")),
- )
- tm.assert_frame_equal(res, exp, check_index_type=True)
- with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
- df.loc[["a", "x"]]
- def test_loc_getitem_listlike_unused_category_raises_keyerror(self):
- # key that is an *unused* category raises
- index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
- df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)
- with pytest.raises(KeyError, match="e"):
- # For comparison, check the scalar behavior
- df.loc["e"]
- with pytest.raises(KeyError, match=re.escape("['e'] not in index")):
- df.loc[["a", "e"]]
- def test_ix_categorical_index(self):
- # GH 12531
- df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ"))
- cdf = df.copy()
- cdf.index = CategoricalIndex(df.index)
- cdf.columns = CategoricalIndex(df.columns)
- expect = Series(df.loc["A", :], index=cdf.columns, name="A")
- tm.assert_series_equal(cdf.loc["A", :], expect)
- expect = Series(df.loc[:, "X"], index=cdf.index, name="X")
- tm.assert_series_equal(cdf.loc[:, "X"], expect)
- exp_index = CategoricalIndex(list("AB"), categories=["A", "B", "C"])
- expect = DataFrame(df.loc[["A", "B"], :], columns=cdf.columns, index=exp_index)
- tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
- exp_columns = CategoricalIndex(list("XY"), categories=["X", "Y", "Z"])
- expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns)
- tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
- def test_ix_categorical_index_non_unique(self):
- # non-unique
- df = DataFrame(np.random.randn(3, 3), index=list("ABA"), columns=list("XYX"))
- cdf = df.copy()
- cdf.index = CategoricalIndex(df.index)
- cdf.columns = CategoricalIndex(df.columns)
- exp_index = CategoricalIndex(list("AA"), categories=["A", "B"])
- expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index)
- tm.assert_frame_equal(cdf.loc["A", :], expect)
- exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"])
- expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns)
- tm.assert_frame_equal(cdf.loc[:, "X"], expect)
- expect = DataFrame(
- df.loc[["A", "B"], :],
- columns=cdf.columns,
- index=CategoricalIndex(list("AAB")),
- )
- tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
- expect = DataFrame(
- df.loc[:, ["X", "Y"]],
- index=cdf.index,
- columns=CategoricalIndex(list("XXY")),
- )
- tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
- def test_loc_slice(self, df):
- # GH9748
- msg = (
- "cannot do slice indexing on CategoricalIndex with these "
- r"indexers \[1\] of type int"
- )
- with pytest.raises(TypeError, match=msg):
- df.loc[1:5]
- result = df.loc["b":"c"]
- expected = df.iloc[[2, 3, 4]]
- tm.assert_frame_equal(result, expected)
- def test_loc_and_at_with_categorical_index(self):
- # GH 20629
- df = DataFrame(
- [[1, 2], [3, 4], [5, 6]], index=CategoricalIndex(["A", "B", "C"])
- )
- s = df[0]
- assert s.loc["A"] == 1
- assert s.at["A"] == 1
- assert df.loc["B", 1] == 4
- assert df.at["B", 1] == 4
- @pytest.mark.parametrize(
- "idx_values",
- [
- # python types
- [1, 2, 3],
- [-1, -2, -3],
- [1.5, 2.5, 3.5],
- [-1.5, -2.5, -3.5],
- # numpy int/uint
- *(np.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_NUMPY_DTYPES),
- # numpy floats
- *(np.array([1.5, 2.5, 3.5], dtype=dtyp) for dtyp in tm.FLOAT_NUMPY_DTYPES),
- # numpy object
- np.array([1, "b", 3.5], dtype=object),
- # pandas scalars
- [Interval(1, 4), Interval(4, 6), Interval(6, 9)],
- [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)],
- [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")],
- # pandas Integer arrays
- *(pd.array([1, 2, 3], dtype=dtype) for dtype in tm.ALL_INT_EA_DTYPES),
- # other pandas arrays
- pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array,
- pd.date_range("2019-01-01", periods=3).array,
- pd.timedelta_range(start="1d", periods=3).array,
- ],
- )
- def test_loc_getitem_with_non_string_categories(self, idx_values, ordered):
- # GH-17569
- cat_idx = CategoricalIndex(idx_values, ordered=ordered)
- df = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx)
- sl = slice(idx_values[0], idx_values[1])
- # scalar selection
- result = df.loc[idx_values[0]]
- expected = Series(["foo"], index=["A"], name=idx_values[0])
- tm.assert_series_equal(result, expected)
- # list selection
- result = df.loc[idx_values[:2]]
- expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
- tm.assert_frame_equal(result, expected)
- # slice selection
- result = df.loc[sl]
- expected = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
- tm.assert_frame_equal(result, expected)
- # scalar assignment
- result = df.copy()
- result.loc[idx_values[0]] = "qux"
- expected = DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx)
- tm.assert_frame_equal(result, expected)
- # list assignment
- result = df.copy()
- result.loc[idx_values[:2], "A"] = ["qux", "qux2"]
- expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
- tm.assert_frame_equal(result, expected)
- # slice assignment
- result = df.copy()
- result.loc[sl, "A"] = ["qux", "qux2"]
- expected = DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
- tm.assert_frame_equal(result, expected)
- def test_getitem_categorical_with_nan(self):
- # GH#41933
- ci = CategoricalIndex(["A", "B", np.nan])
- ser = Series(range(3), index=ci)
- assert ser[np.nan] == 2
- assert ser.loc[np.nan] == 2
- df = DataFrame(ser)
- assert df.loc[np.nan, 0] == 2
- assert df.loc[np.nan][0] == 2
|