123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212 |
- import collections
- import numpy as np
- import pytest
- from pandas.core.dtypes.dtypes import CategoricalDtype
- import pandas as pd
- from pandas import (
- Categorical,
- DataFrame,
- Index,
- Series,
- isna,
- )
- import pandas._testing as tm
class TestCategoricalMissing:
    """Tests for how ``Categorical`` represents, detects and fills missing values."""

    def test_isna(self):
        """``Categorical.isna`` flags only the NaN entry."""
        exp = np.array([False, False, True])
        cat = Categorical(["a", "b", np.nan])
        res = cat.isna()

        tm.assert_numpy_array_equal(res, exp)

    def test_na_flags_int_categories(self):
        """Entries whose code is -1 are reported as missing by ``isna``."""
        # #1457
        categories = list(range(10))
        # Seeded generator: the data is identical on every run, so any
        # failure is reproducible and the global NumPy RNG is left untouched.
        labels = np.random.default_rng(2).integers(0, 10, 20)
        labels[::5] = -1

        # ``labels`` are already codes, so build from codes directly instead
        # of the ``fastpath=True`` constructor keyword (deprecated upstream).
        cat = Categorical.from_codes(labels, categories)
        repr(cat)  # smoke check: repr must not raise when -1 codes are present

        tm.assert_numpy_array_equal(isna(cat), labels == -1)

    def test_nan_handling(self):
        """NaN is stored as code -1 and never becomes a category."""
        # Nans are represented as -1 in codes
        c = Categorical(["a", "b", np.nan, "a"])
        tm.assert_index_equal(c.categories, Index(["a", "b"]))
        tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))

        # Assigning NaN changes only the code at that position; the
        # categories stay the same.
        c[1] = np.nan
        tm.assert_index_equal(c.categories, Index(["a", "b"]))
        tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0], dtype=np.int8))

        # A freshly constructed Categorical from the same values again has
        # NaN encoded as -1 and only "a"/"b" as categories.
        c = Categorical(["a", "b", np.nan, "a"])
        tm.assert_index_equal(c.categories, Index(["a", "b"]))
        tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0], dtype=np.int8))

    def test_set_dtype_nans(self):
        """``_set_dtype`` maps values missing from the new categories to -1."""
        c = Categorical(["a", "b", np.nan])
        result = c._set_dtype(CategoricalDtype(["a", "c"]))

        # "a" keeps code 0; "b" is not in the new categories and NaN was
        # already missing, so both end up as -1.
        tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1], dtype="int8"))

    def test_set_item_nan(self):
        """Setting an element to NaN keeps the original categories."""
        cat = Categorical([1, 2, 3])
        cat[1] = np.nan

        exp = Categorical([1, np.nan, 3], categories=[1, 2, 3])
        tm.assert_categorical_equal(cat, exp)

    @pytest.mark.parametrize(
        "fillna_kwargs, msg",
        [
            (
                {"value": 1, "method": "ffill"},
                "Cannot specify both 'value' and 'method'.",
            ),
            ({}, "Must specify a fill 'value' or 'method'."),
            ({"method": "bad"}, "Invalid fill method. Expecting .* bad"),
            (
                {"value": Series([1, 2, 3, 4, "a"])},
                "Cannot setitem on a Categorical with a new category",
            ),
        ],
    )
    def test_fillna_raises(self, fillna_kwargs, msg):
        """Invalid ``fillna`` arguments raise with the expected message.

        Parameters
        ----------
        fillna_kwargs : dict
            Keyword arguments forwarded to ``Categorical.fillna``.
        msg : str
            Regular expression the error message must match.
        """
        # https://github.com/pandas-dev/pandas/issues/19682
        # https://github.com/pandas-dev/pandas/issues/13628
        cat = Categorical([1, 2, 3, None, None])

        # A lone ``value`` that holds a new category is expected to raise
        # TypeError; every other parametrized case expects ValueError.
        only_value_given = len(fillna_kwargs) == 1 and "value" in fillna_kwargs
        err = TypeError if only_value_given else ValueError

        with pytest.raises(err, match=msg):
            cat.fillna(**fillna_kwargs)

    @pytest.mark.parametrize("named", [True, False])
    def test_fillna_iterable_category(self, named):
        """``fillna`` accepts a tuple-like fill value that is itself a category.

        Parameters
        ----------
        named : bool
            If True the categories are namedtuples, otherwise plain tuples.
        """
        # https://github.com/pandas-dev/pandas/issues/21097
        if named:
            Point = collections.namedtuple("Point", "x y")
        else:
            Point = lambda *args: args  # tuple

        cat = Categorical(np.array([Point(0, 0), Point(0, 1), None], dtype=object))
        result = cat.fillna(Point(0, 0))
        expected = Categorical([Point(0, 0), Point(0, 1), Point(0, 0)])

        tm.assert_categorical_equal(result, expected)

        # Case where the Point is not among our categories; we want TypeError,
        # not NotImplementedError GH#41914
        cat = Categorical(np.array([Point(1, 0), Point(0, 1), None], dtype=object))
        msg = "Cannot setitem on a Categorical with a new category"
        with pytest.raises(TypeError, match=msg):
            cat.fillna(Point(0, 0))

    def test_fillna_array(self):
        """``fillna`` accepts a Categorical or ndarray and does not mutate ``cat``."""
        # accept Categorical or ndarray value if it holds appropriate values
        cat = Categorical(["A", "B", "C", None, None])

        other = cat.fillna("C")
        result = cat.fillna(other)
        tm.assert_categorical_equal(result, other)
        assert isna(cat[-1])  # didn't modify original inplace

        other = np.array(["A", "B", "C", "B", "A"])
        result = cat.fillna(other)
        expected = Categorical(["A", "B", "C", "B", "A"], dtype=cat.dtype)
        tm.assert_categorical_equal(result, expected)
        assert isna(cat[-1])  # didn't modify original inplace

    @pytest.mark.parametrize(
        "values, expected",
        [
            ([1, 2, 3], np.array([False, False, False])),
            ([1, 2, np.nan], np.array([False, False, True])),
            ([1, 2, np.inf], np.array([False, False, True])),
            ([1, 2, pd.NA], np.array([False, False, True])),
        ],
    )
    def test_use_inf_as_na(self, values, expected):
        """With ``mode.use_inf_as_na`` set, ``.isna()`` treats inf as missing.

        The Categorical is constructed inside the option context and checked
        as a bare array, wrapped in a Series, and wrapped in a DataFrame.

        Parameters
        ----------
        values : list
            Values used to construct the Categorical.
        expected : numpy.ndarray
            Expected boolean missing-value mask.
        """
        # https://github.com/pandas-dev/pandas/issues/33594
        with pd.option_context("mode.use_inf_as_na", True):
            cat = Categorical(values)
            result = cat.isna()
            tm.assert_numpy_array_equal(result, expected)

            result = Series(cat).isna()
            expected_ser = Series(expected)
            tm.assert_series_equal(result, expected_ser)

            result = DataFrame(cat).isna()
            expected_df = DataFrame(expected_ser)
            tm.assert_frame_equal(result, expected_df)

    @pytest.mark.parametrize(
        "values, expected",
        [
            ([1, 2, 3], np.array([False, False, False])),
            ([1, 2, np.nan], np.array([False, False, True])),
            ([1, 2, np.inf], np.array([False, False, True])),
            ([1, 2, pd.NA], np.array([False, False, True])),
        ],
    )
    def test_use_inf_as_na_outside_context(self, values, expected):
        """As ``test_use_inf_as_na``, but the Categorical predates the option.

        The Categorical is built before entering the option context and the
        top-level ``isna`` function is used rather than the ``.isna()`` method.

        Parameters
        ----------
        values : list
            Values used to construct the Categorical.
        expected : numpy.ndarray
            Expected boolean missing-value mask.
        """
        # https://github.com/pandas-dev/pandas/issues/33594
        # Using isna directly for Categorical will fail in general here
        cat = Categorical(values)

        with pd.option_context("mode.use_inf_as_na", True):
            result = isna(cat)
            tm.assert_numpy_array_equal(result, expected)

            result = isna(Series(cat))
            expected_ser = Series(expected)
            tm.assert_series_equal(result, expected_ser)

            result = isna(DataFrame(cat))
            expected_df = DataFrame(expected_ser)
            tm.assert_frame_equal(result, expected_df)

    @pytest.mark.parametrize(
        "a1, a2, categories",
        [
            (["a", "b", "c"], [np.nan, "a", "b"], ["a", "b", "c"]),
            ([1, 2, 3], [np.nan, 1, 2], [1, 2, 3]),
        ],
    )
    def test_compare_categorical_with_missing(self, a1, a2, categories):
        """``==`` / ``!=`` on categorical Series match the non-categorical result.

        Parameters
        ----------
        a1, a2 : list
            Values of the left and right operands; ``a2`` contains a NaN.
        categories : list
            Categories shared by both categorical operands.
        """
        # GH 28384
        cat_type = CategoricalDtype(categories)

        # !=
        result = Series(a1, dtype=cat_type) != Series(a2, dtype=cat_type)
        expected = Series(a1) != Series(a2)
        tm.assert_series_equal(result, expected)

        # ==
        result = Series(a1, dtype=cat_type) == Series(a2, dtype=cat_type)
        expected = Series(a1) == Series(a2)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "na_value, dtype",
        [
            (pd.NaT, "datetime64[ns]"),
            (None, "float64"),
            (np.nan, "float64"),
            (pd.NA, "float64"),
        ],
    )
    def test_categorical_only_missing_values_no_cast(self, na_value, dtype):
        """An all-missing Categorical has empty categories of the given dtype.

        Parameters
        ----------
        na_value : object
            Missing-value sentinel used for every element.
        dtype : str
            Expected dtype of the (empty) categories Index.
        """
        # GH#44900
        result = Categorical([na_value, na_value])
        tm.assert_index_equal(result.categories, Index([], dtype=dtype))
|