12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759 |
- from datetime import (
- date,
- datetime,
- )
- import numpy as np
- import pytest
- from pandas.core.dtypes.common import (
- is_float_dtype,
- is_integer_dtype,
- )
- from pandas.core.dtypes.dtypes import CategoricalDtype
- import pandas as pd
- from pandas import (
- Categorical,
- CategoricalIndex,
- DatetimeIndex,
- Index,
- Interval,
- IntervalIndex,
- MultiIndex,
- NaT,
- Series,
- Timestamp,
- date_range,
- period_range,
- timedelta_range,
- )
- import pandas._testing as tm
class TestCategoricalConstructors:
    """Tests for building ``Categorical`` objects.

    Covers the main constructor (values / ``categories`` / ``ordered`` /
    ``dtype`` combinations) as well as the alternate constructors
    ``Categorical.from_codes``, ``Categorical._from_inferred_categories`` and
    ``Categorical._from_sequence``.
    """

    def test_categorical_from_cat_and_dtype_str_preserve_ordered(self):
        """``dtype="category"`` must not drop ``ordered=True`` from the input."""
        # GH#49309 we should preserve orderedness in `res`
        cat = Categorical([3, 1], categories=[3, 2, 1], ordered=True)

        res = Categorical(cat, dtype="category")
        assert res.dtype.ordered

    def test_categorical_disallows_scalar(self):
        """A scalar passed as values raises ``TypeError``."""
        # GH#38433
        with pytest.raises(TypeError, match="Categorical input must be list-like"):
            Categorical("A", categories=["A", "B"])

    def test_categorical_1d_only(self):
        """Values with more than one dimension raise ``NotImplementedError``."""
        # ndim > 1
        msg = "> 1 ndim Categorical are not supported at this time"
        with pytest.raises(NotImplementedError, match=msg):
            Categorical(np.array([list("abcd")]))

    def test_validate_ordered(self):
        """A non-boolean ``ordered`` is rejected by both constructors."""
        # see gh-14058
        exp_msg = "'ordered' must either be 'True' or 'False'"
        exp_err = TypeError

        # This should be a boolean.
        ordered = np.array([0, 1, 2])

        with pytest.raises(exp_err, match=exp_msg):
            Categorical([1, 2, 3], ordered=ordered)

        with pytest.raises(exp_err, match=exp_msg):
            Categorical.from_codes(
                [0, 0, 1], categories=["a", "b", "c"], ordered=ordered
            )

    def test_constructor_empty(self):
        """Empty values give empty categories unless categories are passed."""
        # GH 17248
        c = Categorical([])
        expected = Index([])
        tm.assert_index_equal(c.categories, expected)

        c = Categorical([], categories=[1, 2, 3])
        expected = Index([1, 2, 3], dtype=np.int64)
        tm.assert_index_equal(c.categories, expected)

    def test_constructor_empty_boolean(self):
        """Boolean categories survive construction from empty values."""
        # see gh-22702
        cat = Categorical([], categories=[True, False])
        categories = sorted(cat.categories.tolist())
        assert categories == [False, True]

    def test_constructor_tuples(self):
        """Tuples of differing length are kept as individual categories."""
        values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
        result = Categorical(values)
        expected = Index([(1,), (1, 2)], tupleize_cols=False)
        tm.assert_index_equal(result.categories, expected)
        assert result.ordered is False

    def test_constructor_tuples_datetimes(self):
        """One-element tuples holding Timestamps are kept as tuple categories."""
        # numpy will auto reshape when all of the tuples are the
        # same len, so add an extra one with 2 items and slice it off
        values = np.array(
            [
                (Timestamp("2010-01-01"),),
                (Timestamp("2010-01-02"),),
                (Timestamp("2010-01-01"),),
                (Timestamp("2010-01-02"),),
                ("a", "b"),
            ],
            dtype=object,
        )[:-1]
        result = Categorical(values)
        expected = Index(
            [(Timestamp("2010-01-01"),), (Timestamp("2010-01-02"),)],
            tupleize_cols=False,
        )
        tm.assert_index_equal(result.categories, expected)

    def test_constructor_unsortable(self):
        """Mixed-type values work unordered but raise when ``ordered=True``."""
        # it works!
        arr = np.array([1, 2, 3, datetime.now()], dtype="O")
        factor = Categorical(arr, ordered=False)
        assert not factor.ordered

        # this however will raise as cannot be sorted
        msg = (
            "'values' is not ordered, please explicitly specify the "
            "categories order by passing in a categories argument."
        )
        with pytest.raises(TypeError, match=msg):
            Categorical(arr, ordered=True)

    def test_constructor_interval(self):
        """A list of Intervals gives the same result as an IntervalIndex."""
        result = Categorical(
            [Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True
        )
        ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)])
        exp = Categorical(ii, ordered=True)
        tm.assert_categorical_equal(result, exp)
        tm.assert_index_equal(result.categories, ii)

    def test_constructor(self):
        """Broad checks of the main constructor across many input kinds."""
        exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_)
        c1 = Categorical(exp_arr)
        tm.assert_numpy_array_equal(c1.__array__(), exp_arr)
        c2 = Categorical(exp_arr, categories=["a", "b", "c"])
        tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
        c2 = Categorical(exp_arr, categories=["c", "b", "a"])
        tm.assert_numpy_array_equal(c2.__array__(), exp_arr)

        # categories must be unique
        msg = "Categorical categories must be unique"
        with pytest.raises(ValueError, match=msg):
            Categorical([1, 2], [1, 2, 2])

        with pytest.raises(ValueError, match=msg):
            Categorical(["a", "b"], ["a", "b", "b"])

        # The default should be unordered
        c1 = Categorical(["a", "b", "c", "a"])
        assert not c1.ordered

        # Categorical as input
        c1 = Categorical(["a", "b", "c", "a"])
        c2 = Categorical(c1)
        tm.assert_categorical_equal(c1, c2)

        c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
        c2 = Categorical(c1)
        tm.assert_categorical_equal(c1, c2)

        c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
        c2 = Categorical(c1)
        tm.assert_categorical_equal(c1, c2)

        c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
        c2 = Categorical(c1, categories=["a", "b", "c"])
        tm.assert_numpy_array_equal(c1.__array__(), c2.__array__())
        tm.assert_index_equal(c2.categories, Index(["a", "b", "c"]))

        # Series of dtype category
        c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
        c2 = Categorical(Series(c1))
        tm.assert_categorical_equal(c1, c2)

        c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
        c2 = Categorical(Series(c1))
        tm.assert_categorical_equal(c1, c2)

        # Series
        c1 = Categorical(["a", "b", "c", "a"])
        c2 = Categorical(Series(["a", "b", "c", "a"]))
        tm.assert_categorical_equal(c1, c2)

        c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
        c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"])
        tm.assert_categorical_equal(c1, c2)

        # This should result in integer categories, not float!
        cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
        assert is_integer_dtype(cat.categories)

        # https://github.com/pandas-dev/pandas/issues/3678
        cat = Categorical([np.nan, 1, 2, 3])
        assert is_integer_dtype(cat.categories)

        # this should result in floats
        cat = Categorical([np.nan, 1, 2.0, 3])
        assert is_float_dtype(cat.categories)

        cat = Categorical([np.nan, 1.0, 2.0, 3.0])
        assert is_float_dtype(cat.categories)

        # This doesn't work -> this would probably need some kind of "remember
        # the original type" feature to try to cast the array interface result
        # to...

        # vals = np.asarray(cat[cat.notna()])
        # assert is_integer_dtype(vals)

        # corner cases
        cat = Categorical([1])
        assert len(cat.categories) == 1
        assert cat.categories[0] == 1
        assert len(cat.codes) == 1
        assert cat.codes[0] == 0

        cat = Categorical(["a"])
        assert len(cat.categories) == 1
        assert cat.categories[0] == "a"
        assert len(cat.codes) == 1
        assert cat.codes[0] == 0

        # None of the constructions below may emit a warning:
        #  - integer values combined with non-integer categories
        #  - values of which none is found in the categories (all codes -1)
        with tm.assert_produces_warning(None):
            Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"])

        with tm.assert_produces_warning(None):
            Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5])

        # the next one are from the old docs
        with tm.assert_produces_warning(None):
            Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
            # Smoke check only: the result is intentionally not bound.
            Categorical([1, 2], categories=[1, 2, 3])

        # this is a legitimate constructor
        with tm.assert_produces_warning(None):
            Categorical(np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True)

    def test_constructor_with_existing_categories(self):
        """Explicit ``categories`` recode Categorical and Series inputs alike."""
        # GH25318: constructing with pd.Series used to bogusly skip recoding
        # categories
        c0 = Categorical(["a", "b", "c", "a"])
        c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])

        c2 = Categorical(c0, categories=c1.categories)
        tm.assert_categorical_equal(c1, c2)

        c3 = Categorical(Series(c0), categories=c1.categories)
        tm.assert_categorical_equal(c1, c3)

    def test_constructor_not_sequence(self):
        """A scalar ``categories`` argument raises ``TypeError``."""
        # https://github.com/pandas-dev/pandas/issues/16022
        msg = r"^Parameter 'categories' must be list-like, was"
        with pytest.raises(TypeError, match=msg):
            Categorical(["a", "b"], categories="a")

    def test_constructor_with_null(self):
        """Null entries (NaN, None, NaT) are rejected inside ``categories``."""
        # Cannot have NaN in categories
        msg = "Categorical categories cannot be null"
        with pytest.raises(ValueError, match=msg):
            Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"])

        with pytest.raises(ValueError, match=msg):
            Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"])

        with pytest.raises(ValueError, match=msg):
            Categorical(
                DatetimeIndex(["nat", "20160101"]),
                categories=[NaT, Timestamp("20160101")],
            )

    def test_constructor_with_index(self):
        """A CategoricalIndex round-trips through the constructor."""
        ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
        tm.assert_categorical_equal(ci.values, Categorical(ci))

        # Same index, but passed as object values plus explicit categories.
        tm.assert_categorical_equal(
            ci.values, Categorical(ci.astype(object), categories=ci.categories)
        )

    def test_constructor_with_generator(self):
        """Generators and ranges are accepted for values and for categories."""
        # This was raising an Error in isna(single_val).any() because isna
        # returned a scalar for a generator

        exp = Categorical([0, 1, 2])
        cat = Categorical(x for x in [0, 1, 2])
        tm.assert_categorical_equal(cat, exp)
        cat = Categorical(range(3))
        tm.assert_categorical_equal(cat, exp)

        # Smoke check: building a MultiIndex from a range level must not raise.
        MultiIndex.from_product([range(5), ["a", "b", "c"]])

        # check that categories accept generators and sequences
        cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
        tm.assert_categorical_equal(cat, exp)
        cat = Categorical([0, 1, 2], categories=range(3))
        tm.assert_categorical_equal(cat, exp)

    def test_constructor_with_rangeindex(self):
        """An ``Index`` built from a range is kept exactly as the categories."""
        # RangeIndex is preserved in Categories
        rng = Index(range(3))

        cat = Categorical(rng)
        tm.assert_index_equal(cat.categories, rng, exact=True)

        cat = Categorical([1, 2, 0], categories=rng)
        tm.assert_index_equal(cat.categories, rng, exact=True)

    @pytest.mark.parametrize(
        "dtl",
        [
            date_range("1995-01-01 00:00:00", periods=5, freq="s"),
            date_range("1995-01-01 00:00:00", periods=5, freq="s", tz="US/Eastern"),
            timedelta_range("1 day", periods=5, freq="s"),
        ],
    )
    def test_constructor_with_datetimelike(self, dtl):
        """Datetime-like Series, with and without NaT, give the expected codes."""
        # see gh-12077
        # constructor with a datetimelike and NaT

        s = Series(dtl)
        c = Categorical(s)

        expected = type(dtl)(s)
        # The comparison below is made against an index without a freq.
        expected._data.freq = None

        tm.assert_index_equal(c.categories, expected)
        tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8"))

        # with NaT
        s2 = s.copy()
        s2.iloc[-1] = NaT
        c = Categorical(s2)

        expected = type(dtl)(s2.dropna())
        expected._data.freq = None

        tm.assert_index_equal(c.categories, expected)

        # The NaT entry is encoded as the missing-value code -1.
        exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
        tm.assert_numpy_array_equal(c.codes, exp)

        result = repr(c)
        assert "NaT" in result

    def test_constructor_from_index_series_datetimetz(self):
        """tz-aware datetimes keep their values as categories (Index/Series)."""
        idx = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern")
        idx = idx._with_freq(None)  # freq not preserved in result.categories
        result = Categorical(idx)
        tm.assert_index_equal(result.categories, idx)

        result = Categorical(Series(idx))
        tm.assert_index_equal(result.categories, idx)

    def test_constructor_date_objects(self):
        """``datetime.date`` values stay ``date`` objects in object categories."""
        # we dont cast date objects to timestamps, matching Index constructor
        v = date.today()

        cat = Categorical([v, v])
        assert cat.categories.dtype == object
        # Exact type check on purpose: `datetime` is a subclass of `date`.
        assert type(cat.categories[0]) is date

    def test_constructor_from_index_series_timedelta(self):
        """Timedeltas keep their values as categories (Index/Series)."""
        idx = timedelta_range("1 days", freq="D", periods=3)
        idx = idx._with_freq(None)  # freq not preserved in result.categories
        result = Categorical(idx)
        tm.assert_index_equal(result.categories, idx)

        result = Categorical(Series(idx))
        tm.assert_index_equal(result.categories, idx)

    def test_constructor_from_index_series_period(self):
        """Periods keep their values as categories (Index/Series)."""
        idx = period_range("2015-01-01", freq="D", periods=3)
        result = Categorical(idx)
        tm.assert_index_equal(result.categories, idx)

        result = Categorical(Series(idx))
        tm.assert_index_equal(result.categories, idx)

    @pytest.mark.parametrize(
        "values",
        [
            np.array([1.0, 1.2, 1.8, np.nan]),
            np.array([1, 2, 3], dtype="int64"),
            ["a", "b", "c", np.nan],
            [pd.Period("2014-01"), pd.Period("2014-02"), NaT],
            [Timestamp("2014-01-01"), Timestamp("2014-01-02"), NaT],
            [
                Timestamp("2014-01-01", tz="US/Eastern"),
                Timestamp("2014-01-02", tz="US/Eastern"),
                NaT,
            ],
        ],
    )
    def test_constructor_invariant(self, values):
        """``Categorical(Categorical(values))`` equals ``Categorical(values)``."""
        # GH 14190
        c = Categorical(values)
        c2 = Categorical(c)
        tm.assert_categorical_equal(c, c2)

    @pytest.mark.parametrize("ordered", [True, False])
    def test_constructor_with_dtype(self, ordered):
        """A ``CategoricalDtype`` is equivalent to ``categories`` + ``ordered``."""
        categories = ["b", "a", "c"]
        dtype = CategoricalDtype(categories, ordered=ordered)
        result = Categorical(["a", "b", "a", "c"], dtype=dtype)
        expected = Categorical(
            ["a", "b", "a", "c"], categories=categories, ordered=ordered
        )
        tm.assert_categorical_equal(result, expected)
        assert result.ordered is ordered

    def test_constructor_dtype_and_others_raises(self):
        """A ``CategoricalDtype`` cannot be mixed with categories/ordered."""
        dtype = CategoricalDtype(["a", "b"], ordered=True)
        msg = "Cannot specify `categories` or `ordered` together with `dtype`."
        with pytest.raises(ValueError, match=msg):
            Categorical(["a", "b"], categories=["a", "b"], dtype=dtype)

        with pytest.raises(ValueError, match=msg):
            Categorical(["a", "b"], ordered=True, dtype=dtype)

        with pytest.raises(ValueError, match=msg):
            Categorical(["a", "b"], ordered=False, dtype=dtype)

    @pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]])
    @pytest.mark.parametrize("ordered", [True, False])
    def test_constructor_str_category(self, categories, ordered):
        """``dtype="category"`` may be combined with categories/ordered."""
        result = Categorical(
            ["a", "b"], categories=categories, ordered=ordered, dtype="category"
        )
        expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
        tm.assert_categorical_equal(result, expected)

    def test_constructor_str_unknown(self):
        """An unrecognised dtype string raises ``ValueError``."""
        with pytest.raises(ValueError, match="Unknown dtype"):
            Categorical([1, 2], dtype="foo")

    def test_constructor_np_strs(self):
        """``np.str_`` categories keep their type after construction."""
        # GH#31499 Hashtable.map_locations needs to work on np.str_ objects
        cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
        assert all(isinstance(x, np.str_) for x in cat.categories)

    def test_constructor_from_categorical_with_dtype(self):
        """The dtype's categories win over those of a Categorical input."""
        dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
        values = Categorical(["a", "b", "d"])
        result = Categorical(values, dtype=dtype)
        # We use dtype.categories, not values.categories
        expected = Categorical(
            ["a", "b", "d"], categories=["a", "b", "c"], ordered=True
        )
        tm.assert_categorical_equal(result, expected)

    def test_constructor_from_categorical_with_unknown_dtype(self):
        """A dtype without categories falls back to the input's categories."""
        dtype = CategoricalDtype(None, ordered=True)
        values = Categorical(["a", "b", "d"])
        result = Categorical(values, dtype=dtype)
        # We use values.categories, not dtype.categories
        expected = Categorical(
            ["a", "b", "d"], categories=["a", "b", "d"], ordered=True
        )
        tm.assert_categorical_equal(result, expected)

    def test_constructor_from_categorical_string(self):
        """Explicit categories/ordered apply with or without ``dtype="category"``."""
        values = Categorical(["a", "b", "d"])
        # use categories, ordered
        result = Categorical(
            values, categories=["a", "b", "c"], ordered=True, dtype="category"
        )
        expected = Categorical(
            ["a", "b", "d"], categories=["a", "b", "c"], ordered=True
        )
        tm.assert_categorical_equal(result, expected)

        # No string
        result = Categorical(values, categories=["a", "b", "c"], ordered=True)
        tm.assert_categorical_equal(result, expected)

    def test_constructor_with_categorical_categories(self):
        """``categories`` may itself be a Categorical or CategoricalIndex."""
        # GH17884
        expected = Categorical(["a", "b"], categories=["a", "b", "c"])

        result = Categorical(["a", "b"], categories=Categorical(["a", "b", "c"]))
        tm.assert_categorical_equal(result, expected)

        result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"]))
        tm.assert_categorical_equal(result, expected)

    @pytest.mark.parametrize("klass", [lambda x: np.array(x, dtype=object), list])
    def test_construction_with_null(self, klass, nulls_fixture):
        """A null value among the values is encoded as code -1."""
        # https://github.com/pandas-dev/pandas/issues/31927
        values = klass(["a", nulls_fixture, "b"])
        result = Categorical(values)

        dtype = CategoricalDtype(["a", "b"])
        codes = [0, -1, 1]
        expected = Categorical.from_codes(codes=codes, dtype=dtype)

        tm.assert_categorical_equal(result, expected)

    def test_from_codes_nullable_int_categories(self, any_numeric_ea_dtype):
        """``from_codes`` keeps the extension dtype of the categories."""
        # GH#39649
        cats = pd.array(range(5), dtype=any_numeric_ea_dtype)
        # Seeded generator so the codes are identical on every run.
        codes = np.random.default_rng(2).integers(5, size=3)
        dtype = CategoricalDtype(cats)
        arr = Categorical.from_codes(codes, dtype=dtype)
        assert arr.categories.dtype == cats.dtype
        tm.assert_index_equal(arr.categories, Index(cats))

    def test_from_codes_empty(self):
        """Empty codes give an empty Categorical with the given categories."""
        cat = ["a", "b", "c"]
        result = Categorical.from_codes([], categories=cat)
        expected = Categorical([], categories=cat)

        tm.assert_categorical_equal(result, expected)

    def test_from_codes_too_few_categories(self):
        """Codes beyond the last category position raise ``ValueError``."""
        dtype = CategoricalDtype(categories=[1, 2])
        msg = "codes need to be between "
        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes([1, 2], categories=dtype.categories)
        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes([1, 2], dtype=dtype)

    def test_from_codes_non_int_codes(self):
        """String codes raise ``ValueError``."""
        dtype = CategoricalDtype(categories=[1, 2])
        msg = "codes need to be array-like integers"
        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes(["a"], categories=dtype.categories)
        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes(["a"], dtype=dtype)

    def test_from_codes_non_unique_categories(self):
        """Duplicate categories raise ``ValueError``."""
        with pytest.raises(ValueError, match="Categorical categories must be unique"):
            Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])

    def test_from_codes_nan_cat_included(self):
        """NaN among the categories raises ``ValueError``."""
        with pytest.raises(ValueError, match="Categorical categories cannot be null"):
            Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])

    def test_from_codes_too_negative(self):
        """Codes below -1 raise ``ValueError``."""
        dtype = CategoricalDtype(categories=["a", "b", "c"])
        msg = r"codes need to be between -1 and len\(categories\)-1"
        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes([-2, 1, 2], dtype=dtype)

    def test_from_codes(self):
        """Valid codes give the same result via ``categories`` or ``dtype``."""
        dtype = CategoricalDtype(categories=["a", "b", "c"])
        exp = Categorical(["a", "b", "c"], ordered=False)
        res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
        tm.assert_categorical_equal(exp, res)

        res = Categorical.from_codes([0, 1, 2], dtype=dtype)
        tm.assert_categorical_equal(exp, res)

    @pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
    def test_from_codes_with_categorical_categories(self, klass):
        """``from_codes`` accepts categorical-typed ``categories``."""
        # GH17884
        expected = Categorical(["a", "b"], categories=["a", "b", "c"])

        result = Categorical.from_codes([0, 1], categories=klass(["a", "b", "c"]))
        tm.assert_categorical_equal(result, expected)

    @pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
    def test_from_codes_with_non_unique_categorical_categories(self, klass):
        """Categorical-typed categories with duplicate values are rejected."""
        with pytest.raises(ValueError, match="Categorical categories must be unique"):
            Categorical.from_codes([0, 1], klass(["a", "b", "a"]))

    def test_from_codes_with_nan_code(self):
        """NaN among the codes raises ``ValueError``."""
        # GH21767
        codes = [1, 2, np.nan]
        dtype = CategoricalDtype(categories=["a", "b", "c"])
        with pytest.raises(ValueError, match="codes need to be array-like integers"):
            Categorical.from_codes(codes, categories=dtype.categories)
        with pytest.raises(ValueError, match="codes need to be array-like integers"):
            Categorical.from_codes(codes, dtype=dtype)

    @pytest.mark.parametrize("codes", [[1.0, 2.0, 0], [1.1, 2.0, 0]])
    def test_from_codes_with_float(self, codes):
        """Float codes raise ``ValueError``, integral-valued or not."""
        # GH21767
        # float codes should raise even if values are equal to integers
        dtype = CategoricalDtype(categories=["a", "b", "c"])

        msg = "codes need to be array-like integers"
        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes(codes, dtype.categories)
        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes(codes, dtype=dtype)

    def test_from_codes_with_dtype_raises(self):
        """``dtype`` cannot be combined with ``categories`` or ``ordered``."""
        msg = "Cannot specify"
        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes(
                [0, 1], categories=["a", "b"], dtype=CategoricalDtype(["a", "b"])
            )

        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes(
                [0, 1], ordered=True, dtype=CategoricalDtype(["a", "b"])
            )

    def test_from_codes_neither(self):
        """Omitting both ``categories`` and ``dtype`` raises ``ValueError``."""
        msg = "Both were None"
        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes([0, 1])

    def test_from_codes_with_nullable_int(self):
        """Nullable-integer codes without NA behave like plain integer codes."""
        codes = pd.array([0, 1], dtype="Int64")
        categories = ["a", "b"]

        result = Categorical.from_codes(codes, categories=categories)
        expected = Categorical.from_codes(codes.to_numpy(int), categories=categories)

        tm.assert_categorical_equal(result, expected)

    def test_from_codes_with_nullable_int_na_raises(self):
        """Nullable-integer codes containing NA raise ``ValueError``."""
        codes = pd.array([0, None], dtype="Int64")
        categories = ["a", "b"]

        msg = "codes cannot contain NA values"
        with pytest.raises(ValueError, match=msg):
            Categorical.from_codes(codes, categories=categories)

    @pytest.mark.parametrize("dtype", [None, "category"])
    def test_from_inferred_categories(self, dtype):
        """Already-sorted inferred categories match a plain ``from_codes``."""
        cats = ["a", "b"]
        codes = np.array([0, 0, 1, 1], dtype="i8")
        result = Categorical._from_inferred_categories(cats, codes, dtype)
        expected = Categorical.from_codes(codes, cats)
        tm.assert_categorical_equal(result, expected)

    @pytest.mark.parametrize("dtype", [None, "category"])
    def test_from_inferred_categories_sorts(self, dtype):
        """Unsorted inferred categories are sorted and the codes remapped."""
        cats = ["b", "a"]
        codes = np.array([0, 1, 1, 1], dtype="i8")
        result = Categorical._from_inferred_categories(cats, codes, dtype)
        expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"])
        tm.assert_categorical_equal(result, expected)

    def test_from_inferred_categories_dtype(self):
        """An explicit ``CategoricalDtype`` sets the categories and their order."""
        cats = ["a", "b", "d"]
        codes = np.array([0, 1, 0, 2], dtype="i8")
        dtype = CategoricalDtype(["c", "b", "a"], ordered=True)
        result = Categorical._from_inferred_categories(cats, codes, dtype)
        expected = Categorical(
            ["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True
        )
        tm.assert_categorical_equal(result, expected)

    def test_from_inferred_categories_coerces(self):
        """String categories are coerced to the dtype's numeric categories."""
        cats = ["1", "2", "bad"]
        codes = np.array([0, 0, 1, 2], dtype="i8")
        dtype = CategoricalDtype([1, 2])
        result = Categorical._from_inferred_categories(cats, codes, dtype)
        # "bad" cannot be coerced, so its position is expected to be missing.
        expected = Categorical([1, 1, 2, np.nan])
        tm.assert_categorical_equal(result, expected)

    @pytest.mark.parametrize("ordered", [None, True, False])
    def test_construction_with_ordered(self, ordered):
        """``ordered=None`` is treated the same as ``ordered=False``."""
        # GH 9347, 9190
        cat = Categorical([0, 1, 2], ordered=ordered)
        assert cat.ordered == bool(ordered)

    def test_constructor_imaginary(self):
        """Complex values are accepted and round-trip through ``np.array``."""
        values = [1, 2, 3 + 1j]
        c1 = Categorical(values)
        tm.assert_index_equal(c1.categories, Index(values))
        tm.assert_numpy_array_equal(np.array(c1), np.array(values))

    def test_constructor_string_and_tuples(self):
        """Strings mixed with tuples produce the expected categories."""
        # GH 21416
        c = Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
        expected_index = Index([("a", "b"), ("b", "a"), "c"])
        assert c.categories.equals(expected_index)

    def test_interval(self):
        """Interval values and categories produce the expected codes."""
        idx = pd.interval_range(0, 10, periods=10)
        cat = Categorical(idx, categories=idx)
        expected_codes = np.arange(10, dtype="int8")
        tm.assert_numpy_array_equal(cat.codes, expected_codes)
        tm.assert_index_equal(cat.categories, idx)

        # infer categories
        cat = Categorical(idx)
        tm.assert_numpy_array_equal(cat.codes, expected_codes)
        tm.assert_index_equal(cat.categories, idx)

        # list values
        cat = Categorical(list(idx))
        tm.assert_numpy_array_equal(cat.codes, expected_codes)
        tm.assert_index_equal(cat.categories, idx)

        # list values, categories
        cat = Categorical(list(idx), categories=list(idx))
        tm.assert_numpy_array_equal(cat.codes, expected_codes)
        tm.assert_index_equal(cat.categories, idx)

        # shuffled
        values = idx.take([1, 2, 0])
        cat = Categorical(values, categories=idx)
        tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8"))
        tm.assert_index_equal(cat.categories, idx)

        # extra
        values = pd.interval_range(8, 11, periods=3)
        cat = Categorical(values, categories=idx)
        # The last interval is not among the categories, hence code -1.
        expected_codes = np.array([8, 9, -1], dtype="int8")
        tm.assert_numpy_array_equal(cat.codes, expected_codes)
        tm.assert_index_equal(cat.categories, idx)

        # overlapping
        idx = IntervalIndex([Interval(0, 2), Interval(0, 1)])
        cat = Categorical(idx, categories=idx)
        expected_codes = np.array([0, 1], dtype="int8")
        tm.assert_numpy_array_equal(cat.codes, expected_codes)
        tm.assert_index_equal(cat.categories, idx)

    def test_categorical_extension_array_nullable(self, nulls_fixture):
        """An all-null ``StringArray`` keeps its dtype for the categories."""
        # GH:
        # NOTE(review): the issue number is missing in the original comment.
        arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2)
        result = Categorical(arr)
        assert arr.dtype == result.categories.dtype
        expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))
        tm.assert_categorical_equal(result, expected)

    def test_from_sequence_copy(self):
        """``_from_sequence`` shares codes for ``copy=False``, not for ``copy=True``."""
        cat = Categorical(np.arange(5).repeat(2))
        result = Categorical._from_sequence(cat, dtype=None, copy=False)

        # more generally, we'd be OK with a view
        assert result._codes is cat._codes

        result = Categorical._from_sequence(cat, dtype=None, copy=True)

        assert not tm.shares_memory(result, cat)

    def test_constructor_datetime64_non_nano(self):
        """Day-resolution datetime64 values compare equal after construction."""
        categories = np.arange(10).view("M8[D]")
        values = categories[::2].copy()

        cat = Categorical(values, categories=categories)
        assert (cat == values).all()

    def test_constructor_preserves_freq(self):
        """The ``freq`` of a DatetimeIndex is retained on the categories."""
        # GH33830 freq retention in categorical
        dti = date_range("2016-01-01", periods=5)

        expected = dti.freq

        cat = Categorical(dti)
        result = cat.categories.freq

        assert expected == result
|