# test_constructors.py (29 KB)
from datetime import (
    date,
    datetime,
)

import numpy as np
import pytest

from pandas.core.dtypes.common import (
    is_float_dtype,
    is_integer_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype

import pandas as pd
from pandas import (
    Categorical,
    CategoricalIndex,
    DatetimeIndex,
    Index,
    Interval,
    IntervalIndex,
    MultiIndex,
    NaT,
    Series,
    Timestamp,
    date_range,
    period_range,
    timedelta_range,
)
import pandas._testing as tm
  29. class TestCategoricalConstructors:
  30. def test_categorical_from_cat_and_dtype_str_preserve_ordered(self):
  31. # GH#49309 we should preserve orderedness in `res`
  32. cat = Categorical([3, 1], categories=[3, 2, 1], ordered=True)
  33. res = Categorical(cat, dtype="category")
  34. assert res.dtype.ordered
  35. def test_categorical_disallows_scalar(self):
  36. # GH#38433
  37. with pytest.raises(TypeError, match="Categorical input must be list-like"):
  38. Categorical("A", categories=["A", "B"])
  39. def test_categorical_1d_only(self):
  40. # ndim > 1
  41. msg = "> 1 ndim Categorical are not supported at this time"
  42. with pytest.raises(NotImplementedError, match=msg):
  43. Categorical(np.array([list("abcd")]))
  44. def test_validate_ordered(self):
  45. # see gh-14058
  46. exp_msg = "'ordered' must either be 'True' or 'False'"
  47. exp_err = TypeError
  48. # This should be a boolean.
  49. ordered = np.array([0, 1, 2])
  50. with pytest.raises(exp_err, match=exp_msg):
  51. Categorical([1, 2, 3], ordered=ordered)
  52. with pytest.raises(exp_err, match=exp_msg):
  53. Categorical.from_codes(
  54. [0, 0, 1], categories=["a", "b", "c"], ordered=ordered
  55. )
  56. def test_constructor_empty(self):
  57. # GH 17248
  58. c = Categorical([])
  59. expected = Index([])
  60. tm.assert_index_equal(c.categories, expected)
  61. c = Categorical([], categories=[1, 2, 3])
  62. expected = Index([1, 2, 3], dtype=np.int64)
  63. tm.assert_index_equal(c.categories, expected)
  64. def test_constructor_empty_boolean(self):
  65. # see gh-22702
  66. cat = Categorical([], categories=[True, False])
  67. categories = sorted(cat.categories.tolist())
  68. assert categories == [False, True]
  69. def test_constructor_tuples(self):
  70. values = np.array([(1,), (1, 2), (1,), (1, 2)], dtype=object)
  71. result = Categorical(values)
  72. expected = Index([(1,), (1, 2)], tupleize_cols=False)
  73. tm.assert_index_equal(result.categories, expected)
  74. assert result.ordered is False
  75. def test_constructor_tuples_datetimes(self):
  76. # numpy will auto reshape when all of the tuples are the
  77. # same len, so add an extra one with 2 items and slice it off
  78. values = np.array(
  79. [
  80. (Timestamp("2010-01-01"),),
  81. (Timestamp("2010-01-02"),),
  82. (Timestamp("2010-01-01"),),
  83. (Timestamp("2010-01-02"),),
  84. ("a", "b"),
  85. ],
  86. dtype=object,
  87. )[:-1]
  88. result = Categorical(values)
  89. expected = Index(
  90. [(Timestamp("2010-01-01"),), (Timestamp("2010-01-02"),)],
  91. tupleize_cols=False,
  92. )
  93. tm.assert_index_equal(result.categories, expected)
  94. def test_constructor_unsortable(self):
  95. # it works!
  96. arr = np.array([1, 2, 3, datetime.now()], dtype="O")
  97. factor = Categorical(arr, ordered=False)
  98. assert not factor.ordered
  99. # this however will raise as cannot be sorted
  100. msg = (
  101. "'values' is not ordered, please explicitly specify the "
  102. "categories order by passing in a categories argument."
  103. )
  104. with pytest.raises(TypeError, match=msg):
  105. Categorical(arr, ordered=True)
  106. def test_constructor_interval(self):
  107. result = Categorical(
  108. [Interval(1, 2), Interval(2, 3), Interval(3, 6)], ordered=True
  109. )
  110. ii = IntervalIndex([Interval(1, 2), Interval(2, 3), Interval(3, 6)])
  111. exp = Categorical(ii, ordered=True)
  112. tm.assert_categorical_equal(result, exp)
  113. tm.assert_index_equal(result.categories, ii)
  114. def test_constructor(self):
  115. exp_arr = np.array(["a", "b", "c", "a", "b", "c"], dtype=np.object_)
  116. c1 = Categorical(exp_arr)
  117. tm.assert_numpy_array_equal(c1.__array__(), exp_arr)
  118. c2 = Categorical(exp_arr, categories=["a", "b", "c"])
  119. tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
  120. c2 = Categorical(exp_arr, categories=["c", "b", "a"])
  121. tm.assert_numpy_array_equal(c2.__array__(), exp_arr)
  122. # categories must be unique
  123. msg = "Categorical categories must be unique"
  124. with pytest.raises(ValueError, match=msg):
  125. Categorical([1, 2], [1, 2, 2])
  126. with pytest.raises(ValueError, match=msg):
  127. Categorical(["a", "b"], ["a", "b", "b"])
  128. # The default should be unordered
  129. c1 = Categorical(["a", "b", "c", "a"])
  130. assert not c1.ordered
  131. # Categorical as input
  132. c1 = Categorical(["a", "b", "c", "a"])
  133. c2 = Categorical(c1)
  134. tm.assert_categorical_equal(c1, c2)
  135. c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
  136. c2 = Categorical(c1)
  137. tm.assert_categorical_equal(c1, c2)
  138. c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
  139. c2 = Categorical(c1)
  140. tm.assert_categorical_equal(c1, c2)
  141. c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
  142. c2 = Categorical(c1, categories=["a", "b", "c"])
  143. tm.assert_numpy_array_equal(c1.__array__(), c2.__array__())
  144. tm.assert_index_equal(c2.categories, Index(["a", "b", "c"]))
  145. # Series of dtype category
  146. c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
  147. c2 = Categorical(Series(c1))
  148. tm.assert_categorical_equal(c1, c2)
  149. c1 = Categorical(["a", "b", "c", "a"], categories=["a", "c", "b"])
  150. c2 = Categorical(Series(c1))
  151. tm.assert_categorical_equal(c1, c2)
  152. # Series
  153. c1 = Categorical(["a", "b", "c", "a"])
  154. c2 = Categorical(Series(["a", "b", "c", "a"]))
  155. tm.assert_categorical_equal(c1, c2)
  156. c1 = Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
  157. c2 = Categorical(Series(["a", "b", "c", "a"]), categories=["a", "b", "c", "d"])
  158. tm.assert_categorical_equal(c1, c2)
  159. # This should result in integer categories, not float!
  160. cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
  161. assert is_integer_dtype(cat.categories)
  162. # https://github.com/pandas-dev/pandas/issues/3678
  163. cat = Categorical([np.nan, 1, 2, 3])
  164. assert is_integer_dtype(cat.categories)
  165. # this should result in floats
  166. cat = Categorical([np.nan, 1, 2.0, 3])
  167. assert is_float_dtype(cat.categories)
  168. cat = Categorical([np.nan, 1.0, 2.0, 3.0])
  169. assert is_float_dtype(cat.categories)
  170. # This doesn't work -> this would probably need some kind of "remember
  171. # the original type" feature to try to cast the array interface result
  172. # to...
  173. # vals = np.asarray(cat[cat.notna()])
  174. # assert is_integer_dtype(vals)
  175. # corner cases
  176. cat = Categorical([1])
  177. assert len(cat.categories) == 1
  178. assert cat.categories[0] == 1
  179. assert len(cat.codes) == 1
  180. assert cat.codes[0] == 0
  181. cat = Categorical(["a"])
  182. assert len(cat.categories) == 1
  183. assert cat.categories[0] == "a"
  184. assert len(cat.codes) == 1
  185. assert cat.codes[0] == 0
  186. # two arrays
  187. # - when the first is an integer dtype and the second is not
  188. # - when the resulting codes are all -1/NaN
  189. with tm.assert_produces_warning(None):
  190. Categorical([0, 1, 2, 0, 1, 2], categories=["a", "b", "c"])
  191. with tm.assert_produces_warning(None):
  192. Categorical([0, 1, 2, 0, 1, 2], categories=[3, 4, 5])
  193. # the next one are from the old docs
  194. with tm.assert_produces_warning(None):
  195. Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3])
  196. cat = Categorical([1, 2], categories=[1, 2, 3])
  197. # this is a legitimate constructor
  198. with tm.assert_produces_warning(None):
  199. Categorical(np.array([], dtype="int64"), categories=[3, 2, 1], ordered=True)
  200. def test_constructor_with_existing_categories(self):
  201. # GH25318: constructing with pd.Series used to bogusly skip recoding
  202. # categories
  203. c0 = Categorical(["a", "b", "c", "a"])
  204. c1 = Categorical(["a", "b", "c", "a"], categories=["b", "c"])
  205. c2 = Categorical(c0, categories=c1.categories)
  206. tm.assert_categorical_equal(c1, c2)
  207. c3 = Categorical(Series(c0), categories=c1.categories)
  208. tm.assert_categorical_equal(c1, c3)
  209. def test_constructor_not_sequence(self):
  210. # https://github.com/pandas-dev/pandas/issues/16022
  211. msg = r"^Parameter 'categories' must be list-like, was"
  212. with pytest.raises(TypeError, match=msg):
  213. Categorical(["a", "b"], categories="a")
  214. def test_constructor_with_null(self):
  215. # Cannot have NaN in categories
  216. msg = "Categorical categories cannot be null"
  217. with pytest.raises(ValueError, match=msg):
  218. Categorical([np.nan, "a", "b", "c"], categories=[np.nan, "a", "b", "c"])
  219. with pytest.raises(ValueError, match=msg):
  220. Categorical([None, "a", "b", "c"], categories=[None, "a", "b", "c"])
  221. with pytest.raises(ValueError, match=msg):
  222. Categorical(
  223. DatetimeIndex(["nat", "20160101"]),
  224. categories=[NaT, Timestamp("20160101")],
  225. )
  226. def test_constructor_with_index(self):
  227. ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
  228. tm.assert_categorical_equal(ci.values, Categorical(ci))
  229. ci = CategoricalIndex(list("aabbca"), categories=list("cab"))
  230. tm.assert_categorical_equal(
  231. ci.values, Categorical(ci.astype(object), categories=ci.categories)
  232. )
  233. def test_constructor_with_generator(self):
  234. # This was raising an Error in isna(single_val).any() because isna
  235. # returned a scalar for a generator
  236. exp = Categorical([0, 1, 2])
  237. cat = Categorical(x for x in [0, 1, 2])
  238. tm.assert_categorical_equal(cat, exp)
  239. cat = Categorical(range(3))
  240. tm.assert_categorical_equal(cat, exp)
  241. MultiIndex.from_product([range(5), ["a", "b", "c"]])
  242. # check that categories accept generators and sequences
  243. cat = Categorical([0, 1, 2], categories=(x for x in [0, 1, 2]))
  244. tm.assert_categorical_equal(cat, exp)
  245. cat = Categorical([0, 1, 2], categories=range(3))
  246. tm.assert_categorical_equal(cat, exp)
  247. def test_constructor_with_rangeindex(self):
  248. # RangeIndex is preserved in Categories
  249. rng = Index(range(3))
  250. cat = Categorical(rng)
  251. tm.assert_index_equal(cat.categories, rng, exact=True)
  252. cat = Categorical([1, 2, 0], categories=rng)
  253. tm.assert_index_equal(cat.categories, rng, exact=True)
  254. @pytest.mark.parametrize(
  255. "dtl",
  256. [
  257. date_range("1995-01-01 00:00:00", periods=5, freq="s"),
  258. date_range("1995-01-01 00:00:00", periods=5, freq="s", tz="US/Eastern"),
  259. timedelta_range("1 day", periods=5, freq="s"),
  260. ],
  261. )
  262. def test_constructor_with_datetimelike(self, dtl):
  263. # see gh-12077
  264. # constructor with a datetimelike and NaT
  265. s = Series(dtl)
  266. c = Categorical(s)
  267. expected = type(dtl)(s)
  268. expected._data.freq = None
  269. tm.assert_index_equal(c.categories, expected)
  270. tm.assert_numpy_array_equal(c.codes, np.arange(5, dtype="int8"))
  271. # with NaT
  272. s2 = s.copy()
  273. s2.iloc[-1] = NaT
  274. c = Categorical(s2)
  275. expected = type(dtl)(s2.dropna())
  276. expected._data.freq = None
  277. tm.assert_index_equal(c.categories, expected)
  278. exp = np.array([0, 1, 2, 3, -1], dtype=np.int8)
  279. tm.assert_numpy_array_equal(c.codes, exp)
  280. result = repr(c)
  281. assert "NaT" in result
  282. def test_constructor_from_index_series_datetimetz(self):
  283. idx = date_range("2015-01-01 10:00", freq="D", periods=3, tz="US/Eastern")
  284. idx = idx._with_freq(None) # freq not preserved in result.categories
  285. result = Categorical(idx)
  286. tm.assert_index_equal(result.categories, idx)
  287. result = Categorical(Series(idx))
  288. tm.assert_index_equal(result.categories, idx)
  289. def test_constructor_date_objects(self):
  290. # we dont cast date objects to timestamps, matching Index constructor
  291. v = date.today()
  292. cat = Categorical([v, v])
  293. assert cat.categories.dtype == object
  294. assert type(cat.categories[0]) is date
  295. def test_constructor_from_index_series_timedelta(self):
  296. idx = timedelta_range("1 days", freq="D", periods=3)
  297. idx = idx._with_freq(None) # freq not preserved in result.categories
  298. result = Categorical(idx)
  299. tm.assert_index_equal(result.categories, idx)
  300. result = Categorical(Series(idx))
  301. tm.assert_index_equal(result.categories, idx)
  302. def test_constructor_from_index_series_period(self):
  303. idx = period_range("2015-01-01", freq="D", periods=3)
  304. result = Categorical(idx)
  305. tm.assert_index_equal(result.categories, idx)
  306. result = Categorical(Series(idx))
  307. tm.assert_index_equal(result.categories, idx)
  308. @pytest.mark.parametrize(
  309. "values",
  310. [
  311. np.array([1.0, 1.2, 1.8, np.nan]),
  312. np.array([1, 2, 3], dtype="int64"),
  313. ["a", "b", "c", np.nan],
  314. [pd.Period("2014-01"), pd.Period("2014-02"), NaT],
  315. [Timestamp("2014-01-01"), Timestamp("2014-01-02"), NaT],
  316. [
  317. Timestamp("2014-01-01", tz="US/Eastern"),
  318. Timestamp("2014-01-02", tz="US/Eastern"),
  319. NaT,
  320. ],
  321. ],
  322. )
  323. def test_constructor_invariant(self, values):
  324. # GH 14190
  325. c = Categorical(values)
  326. c2 = Categorical(c)
  327. tm.assert_categorical_equal(c, c2)
  328. @pytest.mark.parametrize("ordered", [True, False])
  329. def test_constructor_with_dtype(self, ordered):
  330. categories = ["b", "a", "c"]
  331. dtype = CategoricalDtype(categories, ordered=ordered)
  332. result = Categorical(["a", "b", "a", "c"], dtype=dtype)
  333. expected = Categorical(
  334. ["a", "b", "a", "c"], categories=categories, ordered=ordered
  335. )
  336. tm.assert_categorical_equal(result, expected)
  337. assert result.ordered is ordered
  338. def test_constructor_dtype_and_others_raises(self):
  339. dtype = CategoricalDtype(["a", "b"], ordered=True)
  340. msg = "Cannot specify `categories` or `ordered` together with `dtype`."
  341. with pytest.raises(ValueError, match=msg):
  342. Categorical(["a", "b"], categories=["a", "b"], dtype=dtype)
  343. with pytest.raises(ValueError, match=msg):
  344. Categorical(["a", "b"], ordered=True, dtype=dtype)
  345. with pytest.raises(ValueError, match=msg):
  346. Categorical(["a", "b"], ordered=False, dtype=dtype)
  347. @pytest.mark.parametrize("categories", [None, ["a", "b"], ["a", "c"]])
  348. @pytest.mark.parametrize("ordered", [True, False])
  349. def test_constructor_str_category(self, categories, ordered):
  350. result = Categorical(
  351. ["a", "b"], categories=categories, ordered=ordered, dtype="category"
  352. )
  353. expected = Categorical(["a", "b"], categories=categories, ordered=ordered)
  354. tm.assert_categorical_equal(result, expected)
  355. def test_constructor_str_unknown(self):
  356. with pytest.raises(ValueError, match="Unknown dtype"):
  357. Categorical([1, 2], dtype="foo")
  358. def test_constructor_np_strs(self):
  359. # GH#31499 Hashtable.map_locations needs to work on np.str_ objects
  360. cat = Categorical(["1", "0", "1"], [np.str_("0"), np.str_("1")])
  361. assert all(isinstance(x, np.str_) for x in cat.categories)
  362. def test_constructor_from_categorical_with_dtype(self):
  363. dtype = CategoricalDtype(["a", "b", "c"], ordered=True)
  364. values = Categorical(["a", "b", "d"])
  365. result = Categorical(values, dtype=dtype)
  366. # We use dtype.categories, not values.categories
  367. expected = Categorical(
  368. ["a", "b", "d"], categories=["a", "b", "c"], ordered=True
  369. )
  370. tm.assert_categorical_equal(result, expected)
  371. def test_constructor_from_categorical_with_unknown_dtype(self):
  372. dtype = CategoricalDtype(None, ordered=True)
  373. values = Categorical(["a", "b", "d"])
  374. result = Categorical(values, dtype=dtype)
  375. # We use values.categories, not dtype.categories
  376. expected = Categorical(
  377. ["a", "b", "d"], categories=["a", "b", "d"], ordered=True
  378. )
  379. tm.assert_categorical_equal(result, expected)
  380. def test_constructor_from_categorical_string(self):
  381. values = Categorical(["a", "b", "d"])
  382. # use categories, ordered
  383. result = Categorical(
  384. values, categories=["a", "b", "c"], ordered=True, dtype="category"
  385. )
  386. expected = Categorical(
  387. ["a", "b", "d"], categories=["a", "b", "c"], ordered=True
  388. )
  389. tm.assert_categorical_equal(result, expected)
  390. # No string
  391. result = Categorical(values, categories=["a", "b", "c"], ordered=True)
  392. tm.assert_categorical_equal(result, expected)
  393. def test_constructor_with_categorical_categories(self):
  394. # GH17884
  395. expected = Categorical(["a", "b"], categories=["a", "b", "c"])
  396. result = Categorical(["a", "b"], categories=Categorical(["a", "b", "c"]))
  397. tm.assert_categorical_equal(result, expected)
  398. result = Categorical(["a", "b"], categories=CategoricalIndex(["a", "b", "c"]))
  399. tm.assert_categorical_equal(result, expected)
  400. @pytest.mark.parametrize("klass", [lambda x: np.array(x, dtype=object), list])
  401. def test_construction_with_null(self, klass, nulls_fixture):
  402. # https://github.com/pandas-dev/pandas/issues/31927
  403. values = klass(["a", nulls_fixture, "b"])
  404. result = Categorical(values)
  405. dtype = CategoricalDtype(["a", "b"])
  406. codes = [0, -1, 1]
  407. expected = Categorical.from_codes(codes=codes, dtype=dtype)
  408. tm.assert_categorical_equal(result, expected)
  409. def test_from_codes_nullable_int_categories(self, any_numeric_ea_dtype):
  410. # GH#39649
  411. cats = pd.array(range(5), dtype=any_numeric_ea_dtype)
  412. codes = np.random.randint(5, size=3)
  413. dtype = CategoricalDtype(cats)
  414. arr = Categorical.from_codes(codes, dtype=dtype)
  415. assert arr.categories.dtype == cats.dtype
  416. tm.assert_index_equal(arr.categories, Index(cats))
  417. def test_from_codes_empty(self):
  418. cat = ["a", "b", "c"]
  419. result = Categorical.from_codes([], categories=cat)
  420. expected = Categorical([], categories=cat)
  421. tm.assert_categorical_equal(result, expected)
  422. def test_from_codes_too_few_categories(self):
  423. dtype = CategoricalDtype(categories=[1, 2])
  424. msg = "codes need to be between "
  425. with pytest.raises(ValueError, match=msg):
  426. Categorical.from_codes([1, 2], categories=dtype.categories)
  427. with pytest.raises(ValueError, match=msg):
  428. Categorical.from_codes([1, 2], dtype=dtype)
  429. def test_from_codes_non_int_codes(self):
  430. dtype = CategoricalDtype(categories=[1, 2])
  431. msg = "codes need to be array-like integers"
  432. with pytest.raises(ValueError, match=msg):
  433. Categorical.from_codes(["a"], categories=dtype.categories)
  434. with pytest.raises(ValueError, match=msg):
  435. Categorical.from_codes(["a"], dtype=dtype)
  436. def test_from_codes_non_unique_categories(self):
  437. with pytest.raises(ValueError, match="Categorical categories must be unique"):
  438. Categorical.from_codes([0, 1, 2], categories=["a", "a", "b"])
  439. def test_from_codes_nan_cat_included(self):
  440. with pytest.raises(ValueError, match="Categorical categories cannot be null"):
  441. Categorical.from_codes([0, 1, 2], categories=["a", "b", np.nan])
  442. def test_from_codes_too_negative(self):
  443. dtype = CategoricalDtype(categories=["a", "b", "c"])
  444. msg = r"codes need to be between -1 and len\(categories\)-1"
  445. with pytest.raises(ValueError, match=msg):
  446. Categorical.from_codes([-2, 1, 2], categories=dtype.categories)
  447. with pytest.raises(ValueError, match=msg):
  448. Categorical.from_codes([-2, 1, 2], dtype=dtype)
  449. def test_from_codes(self):
  450. dtype = CategoricalDtype(categories=["a", "b", "c"])
  451. exp = Categorical(["a", "b", "c"], ordered=False)
  452. res = Categorical.from_codes([0, 1, 2], categories=dtype.categories)
  453. tm.assert_categorical_equal(exp, res)
  454. res = Categorical.from_codes([0, 1, 2], dtype=dtype)
  455. tm.assert_categorical_equal(exp, res)
  456. @pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
  457. def test_from_codes_with_categorical_categories(self, klass):
  458. # GH17884
  459. expected = Categorical(["a", "b"], categories=["a", "b", "c"])
  460. result = Categorical.from_codes([0, 1], categories=klass(["a", "b", "c"]))
  461. tm.assert_categorical_equal(result, expected)
  462. @pytest.mark.parametrize("klass", [Categorical, CategoricalIndex])
  463. def test_from_codes_with_non_unique_categorical_categories(self, klass):
  464. with pytest.raises(ValueError, match="Categorical categories must be unique"):
  465. Categorical.from_codes([0, 1], klass(["a", "b", "a"]))
  466. def test_from_codes_with_nan_code(self):
  467. # GH21767
  468. codes = [1, 2, np.nan]
  469. dtype = CategoricalDtype(categories=["a", "b", "c"])
  470. with pytest.raises(ValueError, match="codes need to be array-like integers"):
  471. Categorical.from_codes(codes, categories=dtype.categories)
  472. with pytest.raises(ValueError, match="codes need to be array-like integers"):
  473. Categorical.from_codes(codes, dtype=dtype)
  474. @pytest.mark.parametrize("codes", [[1.0, 2.0, 0], [1.1, 2.0, 0]])
  475. def test_from_codes_with_float(self, codes):
  476. # GH21767
  477. # float codes should raise even if values are equal to integers
  478. dtype = CategoricalDtype(categories=["a", "b", "c"])
  479. msg = "codes need to be array-like integers"
  480. with pytest.raises(ValueError, match=msg):
  481. Categorical.from_codes(codes, dtype.categories)
  482. with pytest.raises(ValueError, match=msg):
  483. Categorical.from_codes(codes, dtype=dtype)
  484. def test_from_codes_with_dtype_raises(self):
  485. msg = "Cannot specify"
  486. with pytest.raises(ValueError, match=msg):
  487. Categorical.from_codes(
  488. [0, 1], categories=["a", "b"], dtype=CategoricalDtype(["a", "b"])
  489. )
  490. with pytest.raises(ValueError, match=msg):
  491. Categorical.from_codes(
  492. [0, 1], ordered=True, dtype=CategoricalDtype(["a", "b"])
  493. )
  494. def test_from_codes_neither(self):
  495. msg = "Both were None"
  496. with pytest.raises(ValueError, match=msg):
  497. Categorical.from_codes([0, 1])
  498. def test_from_codes_with_nullable_int(self):
  499. codes = pd.array([0, 1], dtype="Int64")
  500. categories = ["a", "b"]
  501. result = Categorical.from_codes(codes, categories=categories)
  502. expected = Categorical.from_codes(codes.to_numpy(int), categories=categories)
  503. tm.assert_categorical_equal(result, expected)
  504. def test_from_codes_with_nullable_int_na_raises(self):
  505. codes = pd.array([0, None], dtype="Int64")
  506. categories = ["a", "b"]
  507. msg = "codes cannot contain NA values"
  508. with pytest.raises(ValueError, match=msg):
  509. Categorical.from_codes(codes, categories=categories)
  510. @pytest.mark.parametrize("dtype", [None, "category"])
  511. def test_from_inferred_categories(self, dtype):
  512. cats = ["a", "b"]
  513. codes = np.array([0, 0, 1, 1], dtype="i8")
  514. result = Categorical._from_inferred_categories(cats, codes, dtype)
  515. expected = Categorical.from_codes(codes, cats)
  516. tm.assert_categorical_equal(result, expected)
  517. @pytest.mark.parametrize("dtype", [None, "category"])
  518. def test_from_inferred_categories_sorts(self, dtype):
  519. cats = ["b", "a"]
  520. codes = np.array([0, 1, 1, 1], dtype="i8")
  521. result = Categorical._from_inferred_categories(cats, codes, dtype)
  522. expected = Categorical.from_codes([1, 0, 0, 0], ["a", "b"])
  523. tm.assert_categorical_equal(result, expected)
  524. def test_from_inferred_categories_dtype(self):
  525. cats = ["a", "b", "d"]
  526. codes = np.array([0, 1, 0, 2], dtype="i8")
  527. dtype = CategoricalDtype(["c", "b", "a"], ordered=True)
  528. result = Categorical._from_inferred_categories(cats, codes, dtype)
  529. expected = Categorical(
  530. ["a", "b", "a", "d"], categories=["c", "b", "a"], ordered=True
  531. )
  532. tm.assert_categorical_equal(result, expected)
  533. def test_from_inferred_categories_coerces(self):
  534. cats = ["1", "2", "bad"]
  535. codes = np.array([0, 0, 1, 2], dtype="i8")
  536. dtype = CategoricalDtype([1, 2])
  537. result = Categorical._from_inferred_categories(cats, codes, dtype)
  538. expected = Categorical([1, 1, 2, np.nan])
  539. tm.assert_categorical_equal(result, expected)
  540. @pytest.mark.parametrize("ordered", [None, True, False])
  541. def test_construction_with_ordered(self, ordered):
  542. # GH 9347, 9190
  543. cat = Categorical([0, 1, 2], ordered=ordered)
  544. assert cat.ordered == bool(ordered)
  545. def test_constructor_imaginary(self):
  546. values = [1, 2, 3 + 1j]
  547. c1 = Categorical(values)
  548. tm.assert_index_equal(c1.categories, Index(values))
  549. tm.assert_numpy_array_equal(np.array(c1), np.array(values))
  550. def test_constructor_string_and_tuples(self):
  551. # GH 21416
  552. c = Categorical(np.array(["c", ("a", "b"), ("b", "a"), "c"], dtype=object))
  553. expected_index = Index([("a", "b"), ("b", "a"), "c"])
  554. assert c.categories.equals(expected_index)
  555. def test_interval(self):
  556. idx = pd.interval_range(0, 10, periods=10)
  557. cat = Categorical(idx, categories=idx)
  558. expected_codes = np.arange(10, dtype="int8")
  559. tm.assert_numpy_array_equal(cat.codes, expected_codes)
  560. tm.assert_index_equal(cat.categories, idx)
  561. # infer categories
  562. cat = Categorical(idx)
  563. tm.assert_numpy_array_equal(cat.codes, expected_codes)
  564. tm.assert_index_equal(cat.categories, idx)
  565. # list values
  566. cat = Categorical(list(idx))
  567. tm.assert_numpy_array_equal(cat.codes, expected_codes)
  568. tm.assert_index_equal(cat.categories, idx)
  569. # list values, categories
  570. cat = Categorical(list(idx), categories=list(idx))
  571. tm.assert_numpy_array_equal(cat.codes, expected_codes)
  572. tm.assert_index_equal(cat.categories, idx)
  573. # shuffled
  574. values = idx.take([1, 2, 0])
  575. cat = Categorical(values, categories=idx)
  576. tm.assert_numpy_array_equal(cat.codes, np.array([1, 2, 0], dtype="int8"))
  577. tm.assert_index_equal(cat.categories, idx)
  578. # extra
  579. values = pd.interval_range(8, 11, periods=3)
  580. cat = Categorical(values, categories=idx)
  581. expected_codes = np.array([8, 9, -1], dtype="int8")
  582. tm.assert_numpy_array_equal(cat.codes, expected_codes)
  583. tm.assert_index_equal(cat.categories, idx)
  584. # overlapping
  585. idx = IntervalIndex([Interval(0, 2), Interval(0, 1)])
  586. cat = Categorical(idx, categories=idx)
  587. expected_codes = np.array([0, 1], dtype="int8")
  588. tm.assert_numpy_array_equal(cat.codes, expected_codes)
  589. tm.assert_index_equal(cat.categories, idx)
  590. def test_categorical_extension_array_nullable(self, nulls_fixture):
  591. # GH:
  592. arr = pd.arrays.StringArray._from_sequence([nulls_fixture] * 2)
  593. result = Categorical(arr)
  594. assert arr.dtype == result.categories.dtype
  595. expected = Categorical(Series([pd.NA, pd.NA], dtype=arr.dtype))
  596. tm.assert_categorical_equal(result, expected)
  597. def test_from_sequence_copy(self):
  598. cat = Categorical(np.arange(5).repeat(2))
  599. result = Categorical._from_sequence(cat, dtype=None, copy=False)
  600. # more generally, we'd be OK with a view
  601. assert result._codes is cat._codes
  602. result = Categorical._from_sequence(cat, dtype=None, copy=True)
  603. assert not tm.shares_memory(result, cat)
  604. def test_constructor_datetime64_non_nano(self):
  605. categories = np.arange(10).view("M8[D]")
  606. values = categories[::2].copy()
  607. cat = Categorical(values, categories=categories)
  608. assert (cat == values).all()
  609. def test_constructor_preserves_freq(self):
  610. # GH33830 freq retention in categorical
  611. dti = date_range("2016-01-01", periods=5)
  612. expected = dti.freq
  613. cat = Categorical(dti)
  614. result = cat.categories.freq
  615. assert expected == result