123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422 |
- import numpy as np
- import pytest
- from pandas.errors import InvalidIndexError
- import pandas as pd
- from pandas import (
- CategoricalIndex,
- Index,
- IntervalIndex,
- Timestamp,
- )
- import pandas._testing as tm
class TestTake:
    """Exercise ``CategoricalIndex.take`` with and without fill handling."""

    def test_take_fill_value(self):
        """Check ``take`` on numeric and ordered object categories.

        Three call shapes are compared for each index: no fill arguments,
        ``fill_value=True`` and ``allow_fill=False, fill_value=True``.  The
        expectations show ``-1`` selecting the last element except when only
        ``fill_value`` is passed, where it yields a missing value instead.
        """
        # GH 12631
        positions = np.array([1, 0, -1])
        no_fill = {}
        with_fill = {"fill_value": True}
        fill_disabled = {"allow_fill": False, "fill_value": True}

        # numeric category
        idx = CategoricalIndex([1, 2, 3], name="xxx")
        numeric_cases = [
            (no_fill, CategoricalIndex([2, 1, 3], name="xxx")),
            (
                with_fill,
                CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx"),
            ),
            (fill_disabled, CategoricalIndex([2, 1, 3], name="xxx")),
        ]
        for kwargs, expected in numeric_cases:
            result = idx.take(positions, **kwargs)
            tm.assert_index_equal(result, expected)
            tm.assert_categorical_equal(result.values, expected.values)

        # object category
        def ordered_abc(values):
            # Every object-category index here shares categories, order and name.
            return CategoricalIndex(
                values, categories=list("ABC"), ordered=True, name="xxx"
            )

        idx = ordered_abc(list("CBA"))
        object_cases = [
            (no_fill, ordered_abc(list("BCA"))),
            (with_fill, ordered_abc(["B", "C", np.nan])),
            (fill_disabled, ordered_abc(list("BCA"))),
        ]
        for kwargs, expected in object_cases:
            result = idx.take(positions, **kwargs)
            tm.assert_index_equal(result, expected)
            tm.assert_categorical_equal(result.values, expected.values)

        # With fill_value given, indices below -1 are rejected outright.
        fill_msg = (
            "When allow_fill=True and fill_value is not None, "
            "all indices must be >= -1"
        )
        for bad_position in (-2, -5):
            with pytest.raises(ValueError, match=fill_msg):
                idx.take(np.array([1, 0, bad_position]), fill_value=True)

        # Without fill_value, an index past the start is a plain IndexError.
        bounds_msg = "index -5 is out of bounds for (axis 0 with )?size 3"
        with pytest.raises(IndexError, match=bounds_msg):
            idx.take(np.array([1, -5]))

    def test_take_fill_value_datetime(self):
        """Check ``take`` on a categorical built from a ``DatetimeIndex``."""
        # datetime category
        dates = ["2011-01-01", "2011-02-01", "2011-03-01"]
        reordered = ["2011-02-01", "2011-01-01", "2011-03-01"]
        positions = np.array([1, 0, -1])

        idx = CategoricalIndex(pd.DatetimeIndex(dates, name="xxx"))

        taken = idx.take(positions)
        expected = CategoricalIndex(pd.DatetimeIndex(reordered, name="xxx"))
        tm.assert_index_equal(taken, expected)

        # fill_value
        taken = idx.take(positions, fill_value=True)
        with_nat = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx")
        exp_cats = pd.DatetimeIndex(dates)
        expected = CategoricalIndex(with_nat, categories=exp_cats)
        tm.assert_index_equal(taken, expected)

        # allow_fill=False
        taken = idx.take(positions, allow_fill=False, fill_value=True)
        expected = CategoricalIndex(pd.DatetimeIndex(reordered, name="xxx"))
        tm.assert_index_equal(taken, expected)

        fill_msg = (
            "When allow_fill=True and fill_value is not None, "
            "all indices must be >= -1"
        )
        for bad_position in (-2, -5):
            with pytest.raises(ValueError, match=fill_msg):
                idx.take(np.array([1, 0, bad_position]), fill_value=True)

        bounds_msg = "index -5 is out of bounds for (axis 0 with )?size 3"
        with pytest.raises(IndexError, match=bounds_msg):
            idx.take(np.array([1, -5]))

    def test_take_invalid_kwargs(self):
        """Check that unknown or unsupported keyword arguments are rejected."""
        idx = CategoricalIndex([1, 2, 3], name="foo")
        indices = [1, 0, -1]

        # (exception type, message pattern, offending keyword arguments)
        rejected = [
            (
                TypeError,
                r"take\(\) got an unexpected keyword argument 'foo'",
                {"foo": 2},
            ),
            (ValueError, "the 'out' parameter is not supported", {"out": indices}),
            (ValueError, "the 'mode' parameter is not supported", {"mode": "clip"}),
        ]
        for exc_type, msg, kwargs in rejected:
            with pytest.raises(exc_type, match=msg):
                idx.take(indices, **kwargs)
class TestGetLoc:
    """Exercise ``CategoricalIndex.get_loc`` for each shape of result."""

    def test_get_loc(self):
        """Compare ``get_loc`` against a plain ``Index`` of the same labels.

        Covers a unique index (scalar result), a non-unique unsorted index
        (boolean mask or scalar) and a non-unique sorted index (slice), plus
        the ``KeyError`` raised for labels that are not present.
        """
        # GH 12531
        unique_labels = list("abcde")
        cat_unique = CategoricalIndex(unique_labels, categories=list("edabc"))
        plain_unique = Index(unique_labels)
        for label in ("a", "e"):
            assert cat_unique.get_loc(label) == plain_unique.get_loc(label)

        for index in (cat_unique, plain_unique):
            with pytest.raises(KeyError, match="'NOT-EXIST'"):
                index.get_loc("NOT-EXIST")

        # non-unique
        dup_labels = list("aacded")
        cat_dup = CategoricalIndex(dup_labels, categories=list("edabc"))
        plain_dup = Index(dup_labels)

        # results in bool array
        mask = cat_dup.get_loc("d")
        tm.assert_numpy_array_equal(mask, plain_dup.get_loc("d"))
        tm.assert_numpy_array_equal(
            mask, np.array([False, False, False, True, False, True])
        )

        # unique element results in scalar
        position = cat_dup.get_loc("e")
        assert position == plain_dup.get_loc("e")
        assert position == 4

        for index in (cat_dup, plain_dup):
            with pytest.raises(KeyError, match="'NOT-EXIST'"):
                index.get_loc("NOT-EXIST")

        # non-unique, sliceable
        sorted_labels = list("aabbb")
        cat_sorted = CategoricalIndex(sorted_labels, categories=list("abc"))
        plain_sorted = Index(sorted_labels)

        # results in slice
        for label, span in (("a", slice(0, 2, None)), ("b", slice(2, 5, None))):
            found = cat_sorted.get_loc(label)
            assert found == plain_sorted.get_loc(label)
            assert found == span

        # "c" is a declared category but never appears among the values.
        for index in (cat_sorted, plain_sorted):
            with pytest.raises(KeyError, match="'c'"):
                index.get_loc("c")

    def test_get_loc_unique(self):
        """A label in a unique index resolves to its integer position."""
        index = CategoricalIndex(list("abc"))
        assert index.get_loc("b") == 1

    def test_get_loc_monotonic_nonunique(self):
        """Adjacent duplicates in a sorted index resolve to a slice."""
        index = CategoricalIndex(list("abbc"))
        assert index.get_loc("b") == slice(1, 3, None)

    def test_get_loc_nonmonotonic_nonunique(self):
        """Scattered duplicates resolve to a boolean mask."""
        index = CategoricalIndex(list("abcb"))
        mask = np.array([False, True, False, True], dtype=bool)
        tm.assert_numpy_array_equal(index.get_loc("b"), mask)

    def test_get_loc_nan(self):
        """``np.nan`` can be looked up like any other label."""
        # GH#41933
        index = CategoricalIndex(["A", "B", np.nan])
        assert index.get_loc(np.nan) == 2
class TestGetIndexer:
    """Exercise ``CategoricalIndex.get_indexer`` and ``get_indexer_non_unique``."""

    def test_get_indexer_base(self):
        """Indexing an index against itself yields ``0..n-1``; bad methods raise."""
        # Determined by cat ordering.
        idx = CategoricalIndex(list("cab"), categories=list("cab"))
        expected = np.arange(len(idx), dtype=np.intp)

        actual = idx.get_indexer(idx)
        tm.assert_numpy_array_equal(expected, actual)

        with pytest.raises(ValueError, match="Invalid fill method"):
            idx.get_indexer(idx, method="invalid")

    def test_get_indexer_requires_unique(self):
        """``get_indexer`` on an index with duplicates raises for any target."""
        # Use a local generator rather than np.random.seed so this test does
        # not reseed NumPy's process-wide RNG and affect whatever runs next.
        rng = np.random.default_rng(123456789)
        ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False)
        oidx = Index(np.array(ci))

        msg = "Reindexing only valid with uniquely valued Index objects"

        for n in [1, 2, 5, len(ci)]:
            finder = oidx[rng.integers(0, len(ci), size=n)]

            with pytest.raises(InvalidIndexError, match=msg):
                ci.get_indexer(finder)

        # see gh-17323
        #
        # Even when indexer is equal to the
        # members in the index, we should
        # respect duplicates instead of taking
        # the fast-track path.
        for finder in [list("aabbca"), list("aababca")]:
            with pytest.raises(InvalidIndexError, match=msg):
                ci.get_indexer(finder)

    def test_get_indexer_non_unique(self):
        """``get_indexer_non_unique`` works where ``get_indexer`` refuses."""
        idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
        idx2 = CategoricalIndex(list("abf"))
        msg = "Reindexing only valid with uniquely valued Index objects"

        # The same target is supplied as a CategoricalIndex, a list and an Index.
        for indexer in [idx2, list("abf"), Index(list("abf"))]:
            with pytest.raises(InvalidIndexError, match=msg):
                idx1.get_indexer(indexer)

            r1, _ = idx1.get_indexer_non_unique(indexer)
            # "a" matches positions 0 and 1, "b" matches 2, "f" is absent (-1).
            expected = np.array([0, 1, 2, -1], dtype=np.intp)
            tm.assert_almost_equal(r1, expected)

    def test_get_indexer_method(self):
        """Fill methods are rejected as not implemented for CategoricalIndex."""
        idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
        idx2 = CategoricalIndex(list("abf"))

        for method in ("pad", "backfill", "nearest"):
            msg = f"method {method} not yet implemented for CategoricalIndex"
            with pytest.raises(NotImplementedError, match=msg):
                idx2.get_indexer(idx1, method=method)

    def test_get_indexer_array(self):
        """An object ndarray of Timestamps is matched against Timestamp categories."""
        arr = np.array(
            [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")],
            dtype=object,
        )
        cats = [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")]
        ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype="category")
        result = ci.get_indexer(arr)
        expected = np.array([0, 1], dtype="intp")
        tm.assert_numpy_array_equal(result, expected)

    def test_get_indexer_same_categories_same_order(self):
        """Target shares the index's categories in the same order."""
        ci = CategoricalIndex(["a", "b"], categories=["a", "b"])

        result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"]))
        expected = np.array([1, 1], dtype="intp")
        tm.assert_numpy_array_equal(result, expected)

    def test_get_indexer_same_categories_different_order(self):
        """Target lists the same categories in a different order."""
        # https://github.com/pandas-dev/pandas/issues/19551
        ci = CategoricalIndex(["a", "b"], categories=["a", "b"])

        result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["b", "a"]))
        expected = np.array([1, 1], dtype="intp")
        tm.assert_numpy_array_equal(result, expected)

    def test_get_indexer_nans_in_index_and_target(self):
        """NaN in the target maps to the NaN position; unknown values map to -1."""
        # GH 45361
        ci = CategoricalIndex([1, 2, np.nan, 3])
        other1 = [2, 3, 4, np.nan]
        res1 = ci.get_indexer(other1)
        expected1 = np.array([1, 3, -1, 2], dtype=np.intp)
        tm.assert_numpy_array_equal(res1, expected1)
        other2 = [1, 4, 2, 3]
        res2 = ci.get_indexer(other2)
        expected2 = np.array([0, -1, 1, 3], dtype=np.intp)
        tm.assert_numpy_array_equal(res2, expected2)
class TestWhere:
    """Exercise ``CategoricalIndex.where``."""

    def test_where(self, listlike_box):
        """``where`` keeps entries under a true mask and blanks the rest.

        ``listlike_box`` is a fixture supplying the container type used to
        wrap the boolean condition.
        """
        ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False)
        size = len(ci)

        # An all-true condition leaves the index unchanged.
        keep_all = [True] * size
        unchanged = ci.where(listlike_box(keep_all))
        tm.assert_index_equal(unchanged, ci)

        # Masking only the first position replaces it with NaN.
        drop_first = [False] + [True] * (size - 1)
        expected = CategoricalIndex(
            [np.nan] + ci[1:].tolist(), categories=ci.categories
        )
        masked = ci.where(listlike_box(drop_first))
        tm.assert_index_equal(masked, expected)

    def test_where_non_categories(self):
        """Replacing with a value outside the categories gives an object Index."""
        ci = CategoricalIndex(["a", "b", "c", "d"])
        mask = np.array([True, False, True, False])

        replaced = ci.where(mask, 2)
        tm.assert_index_equal(replaced, Index(["a", 2, "c", 2], dtype=object))

        msg = "Cannot setitem on a Categorical with a new category"
        with pytest.raises(TypeError, match=msg):
            # Test the Categorical method directly
            ci._data._where(mask, 2)
class TestContains:
    """Exercise the ``in`` operator on ``CategoricalIndex``."""

    def test_contains(self):
        """Only values actually present in the index are members."""
        ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=False)

        assert "a" in ci

        # "z" is not a category; "e" is a category with no matching value;
        # NaN does not occur among the values.
        for absent in ("z", "e", np.nan):
            assert absent not in ci

        # assert codes NOT in index
        for code in (0, 1):
            assert code not in ci

    def test_contains_nan(self):
        """NaN is a member once it appears among the values."""
        ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef"))
        assert np.nan in ci

    @pytest.mark.parametrize("unwrap", [True, False])
    def test_contains_na_dtype(self, unwrap):
        """Check which NA sentinels are members for datetime-like categories.

        ``unwrap`` selects whether membership is tested on the index itself
        or on its underlying ``_data`` object.
        """
        dti = pd.date_range("2016-01-01", periods=100).insert(0, pd.NaT)
        pi = dti.to_period("D")
        tdi = dti - dti[-1]

        # (source values, datetime64 NaT is a member, timedelta64 NaT is a member)
        scenarios = [
            (dti, True, False),
            (tdi, False, True),
            (pi, False, False),
        ]
        for values, has_dt64_nat, has_td64_nat in scenarios:
            obj = CategoricalIndex(values)
            if unwrap:
                obj = obj._data

            # The generic NA markers are members in every scenario.
            assert np.nan in obj
            assert None in obj
            assert pd.NaT in obj
            assert (np.datetime64("NaT") in obj) == has_dt64_nat
            assert (np.timedelta64("NaT") in obj) == has_td64_nat

    @pytest.mark.parametrize(
        "item, expected",
        [
            (pd.Interval(0, 1), True),
            (1.5, True),
            (pd.Interval(0.5, 1.5), False),
            ("a", False),
            (Timestamp(1), False),
            (pd.Timedelta(1), False),
        ],
        ids=str,
    )
    def test_contains_interval(self, item, expected):
        """Membership on an interval-backed categorical returns an exact bool."""
        # GH 23705
        ci = CategoricalIndex(IntervalIndex.from_breaks(range(3)))
        is_member = item in ci
        assert is_member is expected

    def test_contains_list(self):
        """Testing membership of a list raises instead of returning False."""
        # GH#21729
        idx = CategoricalIndex([1, 2, 3])

        assert "a" not in idx

        for unhashable in (["a"], ["a", "b"]):
            with pytest.raises(TypeError, match="unhashable type"):
                unhashable in idx
|