test_api.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493
  1. import re
  2. import numpy as np
  3. import pytest
  4. from pandas.compat import PY311
  5. from pandas import (
  6. Categorical,
  7. CategoricalIndex,
  8. DataFrame,
  9. Index,
  10. Series,
  11. StringDtype,
  12. )
  13. import pandas._testing as tm
  14. from pandas.core.arrays.categorical import recode_for_categories
  15. class TestCategoricalAPI:
  16. def test_ordered_api(self):
  17. # GH 9347
  18. cat1 = Categorical(list("acb"), ordered=False)
  19. tm.assert_index_equal(cat1.categories, Index(["a", "b", "c"]))
  20. assert not cat1.ordered
  21. cat2 = Categorical(list("acb"), categories=list("bca"), ordered=False)
  22. tm.assert_index_equal(cat2.categories, Index(["b", "c", "a"]))
  23. assert not cat2.ordered
  24. cat3 = Categorical(list("acb"), ordered=True)
  25. tm.assert_index_equal(cat3.categories, Index(["a", "b", "c"]))
  26. assert cat3.ordered
  27. cat4 = Categorical(list("acb"), categories=list("bca"), ordered=True)
  28. tm.assert_index_equal(cat4.categories, Index(["b", "c", "a"]))
  29. assert cat4.ordered
  30. def test_set_ordered(self):
  31. cat = Categorical(["a", "b", "c", "a"], ordered=True)
  32. cat2 = cat.as_unordered()
  33. assert not cat2.ordered
  34. cat2 = cat.as_ordered()
  35. assert cat2.ordered
  36. assert cat2.set_ordered(True).ordered
  37. assert not cat2.set_ordered(False).ordered
  38. # removed in 0.19.0
  39. msg = (
  40. "property 'ordered' of 'Categorical' object has no setter"
  41. if PY311
  42. else "can't set attribute"
  43. )
  44. with pytest.raises(AttributeError, match=msg):
  45. cat.ordered = True
  46. with pytest.raises(AttributeError, match=msg):
  47. cat.ordered = False
  48. def test_rename_categories(self):
  49. cat = Categorical(["a", "b", "c", "a"])
  50. # inplace=False: the old one must not be changed
  51. res = cat.rename_categories([1, 2, 3])
  52. tm.assert_numpy_array_equal(
  53. res.__array__(), np.array([1, 2, 3, 1], dtype=np.int64)
  54. )
  55. tm.assert_index_equal(res.categories, Index([1, 2, 3]))
  56. exp_cat = np.array(["a", "b", "c", "a"], dtype=np.object_)
  57. tm.assert_numpy_array_equal(cat.__array__(), exp_cat)
  58. exp_cat = Index(["a", "b", "c"])
  59. tm.assert_index_equal(cat.categories, exp_cat)
  60. # GH18862 (let rename_categories take callables)
  61. result = cat.rename_categories(lambda x: x.upper())
  62. expected = Categorical(["A", "B", "C", "A"])
  63. tm.assert_categorical_equal(result, expected)
  64. @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
  65. def test_rename_categories_wrong_length_raises(self, new_categories):
  66. cat = Categorical(["a", "b", "c", "a"])
  67. msg = (
  68. "new categories need to have the same number of items as the "
  69. "old categories!"
  70. )
  71. with pytest.raises(ValueError, match=msg):
  72. cat.rename_categories(new_categories)
  73. def test_rename_categories_series(self):
  74. # https://github.com/pandas-dev/pandas/issues/17981
  75. c = Categorical(["a", "b"])
  76. result = c.rename_categories(Series([0, 1], index=["a", "b"]))
  77. expected = Categorical([0, 1])
  78. tm.assert_categorical_equal(result, expected)
  79. def test_rename_categories_dict(self):
  80. # GH 17336
  81. cat = Categorical(["a", "b", "c", "d"])
  82. res = cat.rename_categories({"a": 4, "b": 3, "c": 2, "d": 1})
  83. expected = Index([4, 3, 2, 1])
  84. tm.assert_index_equal(res.categories, expected)
  85. # Test for dicts of smaller length
  86. cat = Categorical(["a", "b", "c", "d"])
  87. res = cat.rename_categories({"a": 1, "c": 3})
  88. expected = Index([1, "b", 3, "d"])
  89. tm.assert_index_equal(res.categories, expected)
  90. # Test for dicts with bigger length
  91. cat = Categorical(["a", "b", "c", "d"])
  92. res = cat.rename_categories({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5, "f": 6})
  93. expected = Index([1, 2, 3, 4])
  94. tm.assert_index_equal(res.categories, expected)
  95. # Test for dicts with no items from old categories
  96. cat = Categorical(["a", "b", "c", "d"])
  97. res = cat.rename_categories({"f": 1, "g": 3})
  98. expected = Index(["a", "b", "c", "d"])
  99. tm.assert_index_equal(res.categories, expected)
  100. def test_reorder_categories(self):
  101. cat = Categorical(["a", "b", "c", "a"], ordered=True)
  102. old = cat.copy()
  103. new = Categorical(
  104. ["a", "b", "c", "a"], categories=["c", "b", "a"], ordered=True
  105. )
  106. res = cat.reorder_categories(["c", "b", "a"])
  107. # cat must be the same as before
  108. tm.assert_categorical_equal(cat, old)
  109. # only res is changed
  110. tm.assert_categorical_equal(res, new)
  111. @pytest.mark.parametrize(
  112. "new_categories",
  113. [
  114. ["a"], # not all "old" included in "new"
  115. ["a", "b", "d"], # still not all "old" in "new"
  116. ["a", "b", "c", "d"], # all "old" included in "new", but too long
  117. ],
  118. )
  119. def test_reorder_categories_raises(self, new_categories):
  120. cat = Categorical(["a", "b", "c", "a"], ordered=True)
  121. msg = "items in new_categories are not the same as in old categories"
  122. with pytest.raises(ValueError, match=msg):
  123. cat.reorder_categories(new_categories)
  124. def test_add_categories(self):
  125. cat = Categorical(["a", "b", "c", "a"], ordered=True)
  126. old = cat.copy()
  127. new = Categorical(
  128. ["a", "b", "c", "a"], categories=["a", "b", "c", "d"], ordered=True
  129. )
  130. res = cat.add_categories("d")
  131. tm.assert_categorical_equal(cat, old)
  132. tm.assert_categorical_equal(res, new)
  133. res = cat.add_categories(["d"])
  134. tm.assert_categorical_equal(cat, old)
  135. tm.assert_categorical_equal(res, new)
  136. # GH 9927
  137. cat = Categorical(list("abc"), ordered=True)
  138. expected = Categorical(list("abc"), categories=list("abcde"), ordered=True)
  139. # test with Series, np.array, index, list
  140. res = cat.add_categories(Series(["d", "e"]))
  141. tm.assert_categorical_equal(res, expected)
  142. res = cat.add_categories(np.array(["d", "e"]))
  143. tm.assert_categorical_equal(res, expected)
  144. res = cat.add_categories(Index(["d", "e"]))
  145. tm.assert_categorical_equal(res, expected)
  146. res = cat.add_categories(["d", "e"])
  147. tm.assert_categorical_equal(res, expected)
  148. def test_add_categories_existing_raises(self):
  149. # new is in old categories
  150. cat = Categorical(["a", "b", "c", "d"], ordered=True)
  151. msg = re.escape("new categories must not include old categories: {'d'}")
  152. with pytest.raises(ValueError, match=msg):
  153. cat.add_categories(["d"])
  154. def test_add_categories_losing_dtype_information(self):
  155. # GH#48812
  156. cat = Categorical(Series([1, 2], dtype="Int64"))
  157. ser = Series([4], dtype="Int64")
  158. result = cat.add_categories(ser)
  159. expected = Categorical(
  160. Series([1, 2], dtype="Int64"), categories=Series([1, 2, 4], dtype="Int64")
  161. )
  162. tm.assert_categorical_equal(result, expected)
  163. cat = Categorical(Series(["a", "b", "a"], dtype=StringDtype()))
  164. ser = Series(["d"], dtype=StringDtype())
  165. result = cat.add_categories(ser)
  166. expected = Categorical(
  167. Series(["a", "b", "a"], dtype=StringDtype()),
  168. categories=Series(["a", "b", "d"], dtype=StringDtype()),
  169. )
  170. tm.assert_categorical_equal(result, expected)
  171. def test_set_categories(self):
  172. cat = Categorical(["a", "b", "c", "a"], ordered=True)
  173. exp_categories = Index(["c", "b", "a"])
  174. exp_values = np.array(["a", "b", "c", "a"], dtype=np.object_)
  175. cat = cat.set_categories(["c", "b", "a"])
  176. res = cat.set_categories(["a", "b", "c"])
  177. # cat must be the same as before
  178. tm.assert_index_equal(cat.categories, exp_categories)
  179. tm.assert_numpy_array_equal(cat.__array__(), exp_values)
  180. # only res is changed
  181. exp_categories_back = Index(["a", "b", "c"])
  182. tm.assert_index_equal(res.categories, exp_categories_back)
  183. tm.assert_numpy_array_equal(res.__array__(), exp_values)
  184. # not all "old" included in "new" -> all not included ones are now
  185. # np.nan
  186. cat = Categorical(["a", "b", "c", "a"], ordered=True)
  187. res = cat.set_categories(["a"])
  188. tm.assert_numpy_array_equal(res.codes, np.array([0, -1, -1, 0], dtype=np.int8))
  189. # still not all "old" in "new"
  190. res = cat.set_categories(["a", "b", "d"])
  191. tm.assert_numpy_array_equal(res.codes, np.array([0, 1, -1, 0], dtype=np.int8))
  192. tm.assert_index_equal(res.categories, Index(["a", "b", "d"]))
  193. # all "old" included in "new"
  194. cat = cat.set_categories(["a", "b", "c", "d"])
  195. exp_categories = Index(["a", "b", "c", "d"])
  196. tm.assert_index_equal(cat.categories, exp_categories)
  197. # internals...
  198. c = Categorical([1, 2, 3, 4, 1], categories=[1, 2, 3, 4], ordered=True)
  199. tm.assert_numpy_array_equal(c._codes, np.array([0, 1, 2, 3, 0], dtype=np.int8))
  200. tm.assert_index_equal(c.categories, Index([1, 2, 3, 4]))
  201. exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
  202. tm.assert_numpy_array_equal(np.asarray(c), exp)
  203. # all "pointers" to '4' must be changed from 3 to 0,...
  204. c = c.set_categories([4, 3, 2, 1])
  205. # positions are changed
  206. tm.assert_numpy_array_equal(c._codes, np.array([3, 2, 1, 0, 3], dtype=np.int8))
  207. # categories are now in new order
  208. tm.assert_index_equal(c.categories, Index([4, 3, 2, 1]))
  209. # output is the same
  210. exp = np.array([1, 2, 3, 4, 1], dtype=np.int64)
  211. tm.assert_numpy_array_equal(np.asarray(c), exp)
  212. assert c.min() == 4
  213. assert c.max() == 1
  214. # set_categories should set the ordering if specified
  215. c2 = c.set_categories([4, 3, 2, 1], ordered=False)
  216. assert not c2.ordered
  217. tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))
  218. # set_categories should pass thru the ordering
  219. c2 = c.set_ordered(False).set_categories([4, 3, 2, 1])
  220. assert not c2.ordered
  221. tm.assert_numpy_array_equal(np.asarray(c), np.asarray(c2))
  222. @pytest.mark.parametrize(
  223. "values, categories, new_categories",
  224. [
  225. # No NaNs, same cats, same order
  226. (["a", "b", "a"], ["a", "b"], ["a", "b"]),
  227. # No NaNs, same cats, different order
  228. (["a", "b", "a"], ["a", "b"], ["b", "a"]),
  229. # Same, unsorted
  230. (["b", "a", "a"], ["a", "b"], ["a", "b"]),
  231. # No NaNs, same cats, different order
  232. (["b", "a", "a"], ["a", "b"], ["b", "a"]),
  233. # NaNs
  234. (["a", "b", "c"], ["a", "b"], ["a", "b"]),
  235. (["a", "b", "c"], ["a", "b"], ["b", "a"]),
  236. (["b", "a", "c"], ["a", "b"], ["a", "b"]),
  237. (["b", "a", "c"], ["a", "b"], ["a", "b"]),
  238. # Introduce NaNs
  239. (["a", "b", "c"], ["a", "b"], ["a"]),
  240. (["a", "b", "c"], ["a", "b"], ["b"]),
  241. (["b", "a", "c"], ["a", "b"], ["a"]),
  242. (["b", "a", "c"], ["a", "b"], ["a"]),
  243. # No overlap
  244. (["a", "b", "c"], ["a", "b"], ["d", "e"]),
  245. ],
  246. )
  247. @pytest.mark.parametrize("ordered", [True, False])
  248. def test_set_categories_many(self, values, categories, new_categories, ordered):
  249. c = Categorical(values, categories)
  250. expected = Categorical(values, new_categories, ordered)
  251. result = c.set_categories(new_categories, ordered=ordered)
  252. tm.assert_categorical_equal(result, expected)
  253. def test_set_categories_rename_less(self):
  254. # GH 24675
  255. cat = Categorical(["A", "B"])
  256. result = cat.set_categories(["A"], rename=True)
  257. expected = Categorical(["A", np.nan])
  258. tm.assert_categorical_equal(result, expected)
  259. def test_set_categories_private(self):
  260. cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
  261. cat._set_categories(["a", "c", "d", "e"])
  262. expected = Categorical(["a", "c", "d"], categories=list("acde"))
  263. tm.assert_categorical_equal(cat, expected)
  264. # fastpath
  265. cat = Categorical(["a", "b", "c"], categories=["a", "b", "c", "d"])
  266. cat._set_categories(["a", "c", "d", "e"], fastpath=True)
  267. expected = Categorical(["a", "c", "d"], categories=list("acde"))
  268. tm.assert_categorical_equal(cat, expected)
  269. def test_remove_categories(self):
  270. cat = Categorical(["a", "b", "c", "a"], ordered=True)
  271. old = cat.copy()
  272. new = Categorical(["a", "b", np.nan, "a"], categories=["a", "b"], ordered=True)
  273. res = cat.remove_categories("c")
  274. tm.assert_categorical_equal(cat, old)
  275. tm.assert_categorical_equal(res, new)
  276. res = cat.remove_categories(["c"])
  277. tm.assert_categorical_equal(cat, old)
  278. tm.assert_categorical_equal(res, new)
  279. @pytest.mark.parametrize("removals", [["c"], ["c", np.nan], "c", ["c", "c"]])
  280. def test_remove_categories_raises(self, removals):
  281. cat = Categorical(["a", "b", "a"])
  282. message = re.escape("removals must all be in old categories: {'c'}")
  283. with pytest.raises(ValueError, match=message):
  284. cat.remove_categories(removals)
  285. def test_remove_unused_categories(self):
  286. c = Categorical(["a", "b", "c", "d", "a"], categories=["a", "b", "c", "d", "e"])
  287. exp_categories_all = Index(["a", "b", "c", "d", "e"])
  288. exp_categories_dropped = Index(["a", "b", "c", "d"])
  289. tm.assert_index_equal(c.categories, exp_categories_all)
  290. res = c.remove_unused_categories()
  291. tm.assert_index_equal(res.categories, exp_categories_dropped)
  292. tm.assert_index_equal(c.categories, exp_categories_all)
  293. # with NaN values (GH11599)
  294. c = Categorical(["a", "b", "c", np.nan], categories=["a", "b", "c", "d", "e"])
  295. res = c.remove_unused_categories()
  296. tm.assert_index_equal(res.categories, Index(np.array(["a", "b", "c"])))
  297. exp_codes = np.array([0, 1, 2, -1], dtype=np.int8)
  298. tm.assert_numpy_array_equal(res.codes, exp_codes)
  299. tm.assert_index_equal(c.categories, exp_categories_all)
  300. val = ["F", np.nan, "D", "B", "D", "F", np.nan]
  301. cat = Categorical(values=val, categories=list("ABCDEFG"))
  302. out = cat.remove_unused_categories()
  303. tm.assert_index_equal(out.categories, Index(["B", "D", "F"]))
  304. exp_codes = np.array([2, -1, 1, 0, 1, 2, -1], dtype=np.int8)
  305. tm.assert_numpy_array_equal(out.codes, exp_codes)
  306. assert out.tolist() == val
  307. alpha = list("abcdefghijklmnopqrstuvwxyz")
  308. val = np.random.choice(alpha[::2], 10000).astype("object")
  309. val[np.random.choice(len(val), 100)] = np.nan
  310. cat = Categorical(values=val, categories=alpha)
  311. out = cat.remove_unused_categories()
  312. assert out.tolist() == val.tolist()
  313. class TestCategoricalAPIWithFactor:
  314. def test_describe(self, factor):
  315. # string type
  316. desc = factor.describe()
  317. assert factor.ordered
  318. exp_index = CategoricalIndex(
  319. ["a", "b", "c"], name="categories", ordered=factor.ordered
  320. )
  321. expected = DataFrame(
  322. {"counts": [3, 2, 3], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0]}, index=exp_index
  323. )
  324. tm.assert_frame_equal(desc, expected)
  325. # check unused categories
  326. cat = factor.copy()
  327. cat = cat.set_categories(["a", "b", "c", "d"])
  328. desc = cat.describe()
  329. exp_index = CategoricalIndex(
  330. list("abcd"), ordered=factor.ordered, name="categories"
  331. )
  332. expected = DataFrame(
  333. {"counts": [3, 2, 3, 0], "freqs": [3 / 8.0, 2 / 8.0, 3 / 8.0, 0]},
  334. index=exp_index,
  335. )
  336. tm.assert_frame_equal(desc, expected)
  337. # check an integer one
  338. cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
  339. desc = cat.describe()
  340. exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered, name="categories")
  341. expected = DataFrame(
  342. {"counts": [5, 3, 3], "freqs": [5 / 11.0, 3 / 11.0, 3 / 11.0]},
  343. index=exp_index,
  344. )
  345. tm.assert_frame_equal(desc, expected)
  346. # https://github.com/pandas-dev/pandas/issues/3678
  347. # describe should work with NaN
  348. cat = Categorical([np.nan, 1, 2, 2])
  349. desc = cat.describe()
  350. expected = DataFrame(
  351. {"counts": [1, 2, 1], "freqs": [1 / 4.0, 2 / 4.0, 1 / 4.0]},
  352. index=CategoricalIndex(
  353. [1, 2, np.nan], categories=[1, 2], name="categories"
  354. ),
  355. )
  356. tm.assert_frame_equal(desc, expected)
  357. class TestPrivateCategoricalAPI:
  358. def test_codes_immutable(self):
  359. # Codes should be read only
  360. c = Categorical(["a", "b", "c", "a", np.nan])
  361. exp = np.array([0, 1, 2, 0, -1], dtype="int8")
  362. tm.assert_numpy_array_equal(c.codes, exp)
  363. # Assignments to codes should raise
  364. msg = (
  365. "property 'codes' of 'Categorical' object has no setter"
  366. if PY311
  367. else "can't set attribute"
  368. )
  369. with pytest.raises(AttributeError, match=msg):
  370. c.codes = np.array([0, 1, 2, 0, 1], dtype="int8")
  371. # changes in the codes array should raise
  372. codes = c.codes
  373. with pytest.raises(ValueError, match="assignment destination is read-only"):
  374. codes[4] = 1
  375. # But even after getting the codes, the original array should still be
  376. # writeable!
  377. c[4] = "a"
  378. exp = np.array([0, 1, 2, 0, 0], dtype="int8")
  379. tm.assert_numpy_array_equal(c.codes, exp)
  380. c._codes[4] = 2
  381. exp = np.array([0, 1, 2, 0, 2], dtype="int8")
  382. tm.assert_numpy_array_equal(c.codes, exp)
  383. @pytest.mark.parametrize(
  384. "codes, old, new, expected",
  385. [
  386. ([0, 1], ["a", "b"], ["a", "b"], [0, 1]),
  387. ([0, 1], ["b", "a"], ["b", "a"], [0, 1]),
  388. ([0, 1], ["a", "b"], ["b", "a"], [1, 0]),
  389. ([0, 1], ["b", "a"], ["a", "b"], [1, 0]),
  390. ([0, 1, 0, 1], ["a", "b"], ["a", "b", "c"], [0, 1, 0, 1]),
  391. ([0, 1, 2, 2], ["a", "b", "c"], ["a", "b"], [0, 1, -1, -1]),
  392. ([0, 1, -1], ["a", "b", "c"], ["a", "b", "c"], [0, 1, -1]),
  393. ([0, 1, -1], ["a", "b", "c"], ["b"], [-1, 0, -1]),
  394. ([0, 1, -1], ["a", "b", "c"], ["d"], [-1, -1, -1]),
  395. ([0, 1, -1], ["a", "b", "c"], [], [-1, -1, -1]),
  396. ([-1, -1], [], ["a", "b"], [-1, -1]),
  397. ([1, 0], ["b", "a"], ["a", "b"], [0, 1]),
  398. ],
  399. )
  400. def test_recode_to_categories(self, codes, old, new, expected):
  401. codes = np.asanyarray(codes, dtype=np.int8)
  402. expected = np.asanyarray(expected, dtype=np.int8)
  403. old = Index(old)
  404. new = Index(new)
  405. result = recode_for_categories(codes, old, new)
  406. tm.assert_numpy_array_equal(result, expected)
  407. def test_recode_to_categories_large(self):
  408. N = 1000
  409. codes = np.arange(N)
  410. old = Index(codes)
  411. expected = np.arange(N - 1, -1, -1, dtype=np.int16)
  412. new = Index(expected)
  413. result = recode_for_categories(codes, old, new)
  414. tm.assert_numpy_array_equal(result, expected)