test_categorical.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556
  1. import re
  2. import numpy as np
  3. import pytest
  4. from pandas.core.dtypes.common import is_categorical_dtype
  5. import pandas as pd
  6. from pandas import (
  7. Categorical,
  8. CategoricalIndex,
  9. DataFrame,
  10. Index,
  11. Interval,
  12. Series,
  13. Timedelta,
  14. Timestamp,
  15. )
  16. import pandas._testing as tm
  17. from pandas.api.types import CategoricalDtype as CDT
  @pytest.fixture
  def df():
      """Return a six-row frame indexed by a CategoricalIndex named "B".

      Column "A" holds 0..5 as int64.  The index labels spell "aabbca" and the
      categories are "c", "a", "b", so every category is in use.
      """
      labels = CategoricalIndex(list("aabbca"), dtype=CDT(list("cab")), name="B")
      columns = {"A": np.arange(6, dtype="int64")}
      return DataFrame(columns, index=labels)
  @pytest.fixture
  def df2():
      """Return a six-row frame whose CategoricalIndex has an unused category.

      Column "A" holds 0..5 as int64.  The index labels spell "aabbca" while the
      categories are "c", "a", "b", "e", so "e" never occurs as a label.
      """
      labels = CategoricalIndex(list("aabbca"), dtype=CDT(list("cabe")), name="B")
      columns = {"A": np.arange(6, dtype="int64")}
      return DataFrame(columns, index=labels)
  34. class TestCategoricalIndex:
  def test_loc_scalar(self, df):
      """Check scalar ``.loc`` reads and writes against a CategoricalIndex."""
      cat_dtype = CDT(list("cab"))

      # Reading a label that occurs three times yields three rows.
      selected = df.loc["a"]
      labels_a = Series(list("aaa"), name="B").astype(cat_dtype)
      assert labels_a.dtype == cat_dtype
      tm.assert_frame_equal(
          selected, DataFrame({"A": [0, 1, 5]}, index=Index(labels_a))
      )

      # Writing through the same label overwrites those rows (on a copy).
      frame = df.copy()
      frame.loc["a"] = 20
      labels_all = Series(list("aabbca"), name="B").astype(cat_dtype)
      assert labels_all.dtype == cat_dtype
      overwritten = DataFrame(
          {"A": [20, 20, 2, 3, 4, 20]},
          index=Index(labels_all),
      )
      tm.assert_frame_equal(frame, overwritten)

      # value not in the categories
      with pytest.raises(KeyError, match=r"^'d'$"):
          frame.loc["d"]

      # Enlarging with that key is compared against the same assignment made
      # after the index has been cast to object.
      enlarged = frame.copy()
      reference = enlarged.copy()
      reference.index = reference.index.astype(object)
      reference.loc["d"] = 10
      enlarged.loc["d"] = 10
      tm.assert_frame_equal(enlarged, reference)
  62. def test_loc_setitem_with_expansion_non_category(self, df):
  63. # Setting-with-expansion with a new key "d" that is not among caegories
  64. df.loc["a"] = 20
  65. # Setting a new row on an existing column
  66. df3 = df.copy()
  67. df3.loc["d", "A"] = 10
  68. bidx3 = Index(list("aabbcad"), name="B")
  69. expected3 = DataFrame(
  70. {
  71. "A": [20, 20, 2, 3, 4, 20, 10.0],
  72. },
  73. index=Index(bidx3),
  74. )
  75. tm.assert_frame_equal(df3, expected3)
  76. # Settig a new row _and_ new column
  77. df4 = df.copy()
  78. df4.loc["d", "C"] = 10
  79. expected3 = DataFrame(
  80. {
  81. "A": [20, 20, 2, 3, 4, 20, np.nan],
  82. "C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 10],
  83. },
  84. index=Index(bidx3),
  85. )
  86. tm.assert_frame_equal(df4, expected3)
  def test_loc_getitem_scalar_non_category(self, df):
      """An integer key that is not a label of the index raises ``KeyError``."""
      expect_key_error = pytest.raises(KeyError, match="^1$")
      with expect_key_error:
          df.loc[1]
  90. def test_slicing(self):
  91. cat = Series(Categorical([1, 2, 3, 4]))
  92. reverse = cat[::-1]
  93. exp = np.array([4, 3, 2, 1], dtype=np.int64)
  94. tm.assert_numpy_array_equal(reverse.__array__(), exp)
  95. df = DataFrame({"value": (np.arange(100) + 1).astype("int64")})
  96. df["D"] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])
  97. expected = Series([11, Interval(0, 25)], index=["value", "D"], name=10)
  98. result = df.iloc[10]
  99. tm.assert_series_equal(result, expected)
  100. expected = DataFrame(
  101. {"value": np.arange(11, 21).astype("int64")},
  102. index=np.arange(10, 20).astype("int64"),
  103. )
  104. expected["D"] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
  105. result = df.iloc[10:20]
  106. tm.assert_frame_equal(result, expected)
  107. expected = Series([9, Interval(0, 25)], index=["value", "D"], name=8)
  108. result = df.loc[8]
  109. tm.assert_series_equal(result, expected)
  110. def test_slicing_and_getting_ops(self):
  111. # systematically test the slicing operations:
  112. # for all slicing ops:
  113. # - returning a dataframe
  114. # - returning a column
  115. # - returning a row
  116. # - returning a single value
  117. cats = Categorical(
  118. ["a", "c", "b", "c", "c", "c", "c"], categories=["a", "b", "c"]
  119. )
  120. idx = Index(["h", "i", "j", "k", "l", "m", "n"])
  121. values = [1, 2, 3, 4, 5, 6, 7]
  122. df = DataFrame({"cats": cats, "values": values}, index=idx)
  123. # the expected values
  124. cats2 = Categorical(["b", "c"], categories=["a", "b", "c"])
  125. idx2 = Index(["j", "k"])
  126. values2 = [3, 4]
  127. # 2:4,: | "j":"k",:
  128. exp_df = DataFrame({"cats": cats2, "values": values2}, index=idx2)
  129. # :,"cats" | :,0
  130. exp_col = Series(cats, index=idx, name="cats")
  131. # "j",: | 2,:
  132. exp_row = Series(["b", 3], index=["cats", "values"], dtype="object", name="j")
  133. # "j","cats | 2,0
  134. exp_val = "b"
  135. # iloc
  136. # frame
  137. res_df = df.iloc[2:4, :]
  138. tm.assert_frame_equal(res_df, exp_df)
  139. assert is_categorical_dtype(res_df["cats"].dtype)
  140. # row
  141. res_row = df.iloc[2, :]
  142. tm.assert_series_equal(res_row, exp_row)
  143. assert isinstance(res_row["cats"], str)
  144. # col
  145. res_col = df.iloc[:, 0]
  146. tm.assert_series_equal(res_col, exp_col)
  147. assert is_categorical_dtype(res_col.dtype)
  148. # single value
  149. res_val = df.iloc[2, 0]
  150. assert res_val == exp_val
  151. # loc
  152. # frame
  153. res_df = df.loc["j":"k", :]
  154. tm.assert_frame_equal(res_df, exp_df)
  155. assert is_categorical_dtype(res_df["cats"].dtype)
  156. # row
  157. res_row = df.loc["j", :]
  158. tm.assert_series_equal(res_row, exp_row)
  159. assert isinstance(res_row["cats"], str)
  160. # col
  161. res_col = df.loc[:, "cats"]
  162. tm.assert_series_equal(res_col, exp_col)
  163. assert is_categorical_dtype(res_col.dtype)
  164. # single value
  165. res_val = df.loc["j", "cats"]
  166. assert res_val == exp_val
  167. # single value
  168. res_val = df.loc["j", df.columns[0]]
  169. assert res_val == exp_val
  170. # iat
  171. res_val = df.iat[2, 0]
  172. assert res_val == exp_val
  173. # at
  174. res_val = df.at["j", "cats"]
  175. assert res_val == exp_val
  176. # fancy indexing
  177. exp_fancy = df.iloc[[2]]
  178. res_fancy = df[df["cats"] == "b"]
  179. tm.assert_frame_equal(res_fancy, exp_fancy)
  180. res_fancy = df[df["values"] == 3]
  181. tm.assert_frame_equal(res_fancy, exp_fancy)
  182. # get_value
  183. res_val = df.at["j", "cats"]
  184. assert res_val == exp_val
  185. # i : int, slice, or sequence of integers
  186. res_row = df.iloc[2]
  187. tm.assert_series_equal(res_row, exp_row)
  188. assert isinstance(res_row["cats"], str)
  189. res_df = df.iloc[slice(2, 4)]
  190. tm.assert_frame_equal(res_df, exp_df)
  191. assert is_categorical_dtype(res_df["cats"].dtype)
  192. res_df = df.iloc[[2, 3]]
  193. tm.assert_frame_equal(res_df, exp_df)
  194. assert is_categorical_dtype(res_df["cats"].dtype)
  195. res_col = df.iloc[:, 0]
  196. tm.assert_series_equal(res_col, exp_col)
  197. assert is_categorical_dtype(res_col.dtype)
  198. res_df = df.iloc[:, slice(0, 2)]
  199. tm.assert_frame_equal(res_df, df)
  200. assert is_categorical_dtype(res_df["cats"].dtype)
  201. res_df = df.iloc[:, [0, 1]]
  202. tm.assert_frame_equal(res_df, df)
  203. assert is_categorical_dtype(res_df["cats"].dtype)
  204. def test_slicing_doc_examples(self):
  205. # GH 7918
  206. cats = Categorical(
  207. ["a", "b", "b", "b", "c", "c", "c"], categories=["a", "b", "c"]
  208. )
  209. idx = Index(["h", "i", "j", "k", "l", "m", "n"])
  210. values = [1, 2, 2, 2, 3, 4, 5]
  211. df = DataFrame({"cats": cats, "values": values}, index=idx)
  212. result = df.iloc[2:4, :]
  213. expected = DataFrame(
  214. {
  215. "cats": Categorical(["b", "b"], categories=["a", "b", "c"]),
  216. "values": [2, 2],
  217. },
  218. index=["j", "k"],
  219. )
  220. tm.assert_frame_equal(result, expected)
  221. result = df.iloc[2:4, :].dtypes
  222. expected = Series(["category", "int64"], ["cats", "values"])
  223. tm.assert_series_equal(result, expected)
  224. result = df.loc["h":"j", "cats"]
  225. expected = Series(
  226. Categorical(["a", "b", "b"], categories=["a", "b", "c"]),
  227. index=["h", "i", "j"],
  228. name="cats",
  229. )
  230. tm.assert_series_equal(result, expected)
  231. result = df.loc["h":"j", df.columns[0:1]]
  232. expected = DataFrame(
  233. {"cats": Categorical(["a", "b", "b"], categories=["a", "b", "c"])},
  234. index=["h", "i", "j"],
  235. )
  236. tm.assert_frame_equal(result, expected)
  237. def test_loc_getitem_listlike_labels(self, df):
  238. # list of labels
  239. result = df.loc[["c", "a"]]
  240. expected = df.iloc[[4, 0, 1, 5]]
  241. tm.assert_frame_equal(result, expected, check_index_type=True)
  def test_loc_getitem_listlike_unused_category(self, df2):
      """A list key holding a category that is never used as a label raises."""
      # GH#37901 a label that is in index.categories but not in index
      expected_message = re.escape("['e'] not in index")
      with pytest.raises(KeyError, match=expected_message):
          df2.loc[["a", "b", "e"]]
  def test_loc_getitem_label_unused_category(self, df2):
      """A scalar key that is a category but not a label raises ``KeyError``."""
      expect_key_error = pytest.raises(KeyError, match=r"^'e'$")
      with expect_key_error:
          df2.loc["e"]
  def test_loc_getitem_non_category(self, df2):
      """A list key with an entry outside the categories raises ``KeyError``."""
      expected_message = re.escape("['d'] not in index")
      with pytest.raises(KeyError, match=expected_message):
          df2.loc[["a", "d"]]
  255. def test_loc_setitem_expansion_label_unused_category(self, df2):
  256. # assigning with a label that is in the categories but not in the index
  257. df = df2.copy()
  258. df.loc["e"] = 20
  259. result = df.loc[["a", "b", "e"]]
  260. exp_index = CategoricalIndex(list("aaabbe"), categories=list("cabe"), name="B")
  261. expected = DataFrame({"A": [0, 1, 5, 2, 3, 20]}, index=exp_index)
  262. tm.assert_frame_equal(result, expected)
  def test_loc_listlike_dtypes(self):
      """List-like ``.loc`` on unique categories keeps the full category set."""
      # GH 11586
      # unique categories and codes
      cat_index = CategoricalIndex(["a", "b", "c"])
      frame = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=cat_index)

      cases = [
          # unique slice
          (["a", "b"], {"A": [1, 2], "B": [4, 5]}),
          # duplicated slice
          (["a", "a", "b"], {"A": [1, 1, 2], "B": [4, 4, 5]}),
      ]
      for keys, data in cases:
          selected = frame.loc[keys]
          # here the result labels are the requested keys themselves
          exp_index = CategoricalIndex(keys, categories=cat_index.categories)
          expected = DataFrame(data, index=exp_index)
          tm.assert_frame_equal(selected, expected, check_index_type=True)

      with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
          frame.loc[["a", "x"]]
  def test_loc_listlike_dtypes_duplicated_categories_and_codes(self):
      """List-like ``.loc`` when the index repeats a label ("a" occurs twice)."""
      # duplicated categories and codes
      cat_index = CategoricalIndex(["a", "b", "a"])
      frame = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=cat_index)

      cases = [
          # unique slice
          (
              ["a", "b"],
              ["a", "a", "b"],
              {"A": [1, 3, 2], "B": [4, 6, 5]},
          ),
          # duplicated slice
          (
              ["a", "a", "b"],
              ["a", "a", "a", "a", "b"],
              {"A": [1, 3, 1, 3, 2], "B": [4, 6, 4, 6, 5]},
          ),
      ]
      for keys, result_labels, data in cases:
          selected = frame.loc[keys]
          expected = DataFrame(data, index=CategoricalIndex(result_labels))
          tm.assert_frame_equal(selected, expected, check_index_type=True)

      with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
          frame.loc[["a", "x"]]
  def test_loc_listlike_dtypes_unused_category(self):
      """List-like ``.loc`` when the index carries categories it never uses."""
      # contains unused category
      all_categories = list("abcde")
      cat_index = CategoricalIndex(["a", "b", "a", "c"], categories=all_categories)
      frame = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=cat_index)

      cases = [
          (
              ["a", "b"],
              ["a", "a", "b"],
              {"A": [1, 3, 2], "B": [5, 7, 6]},
          ),
          # duplicated slice
          (
              ["a", "a", "b"],
              ["a", "a", "a", "a", "b"],
              {"A": [1, 3, 1, 3, 2], "B": [5, 7, 5, 7, 6]},
          ),
      ]
      for keys, result_labels, data in cases:
          selected = frame.loc[keys]
          expected = DataFrame(
              data,
              index=CategoricalIndex(result_labels, categories=list("abcde")),
          )
          tm.assert_frame_equal(selected, expected, check_index_type=True)

      with pytest.raises(KeyError, match=re.escape("['x'] not in index")):
          frame.loc[["a", "x"]]
  def test_loc_getitem_listlike_unused_category_raises_keyerror(self):
      """A key that is only an *unused* category raises ``KeyError``.

      "e" is listed in the categories but never occurs as a label, so both
      the scalar lookup and the list-like lookup must fail.
      """
      # key that is an *unused* category raises
      index = CategoricalIndex(["a", "b", "a", "c"], categories=list("abcde"))
      df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]}, index=index)

      # For comparison, check the scalar behavior.  The pattern is anchored
      # because ``match`` is applied with ``re.search``: a bare "e" would accept
      # any message that merely contains that letter.
      with pytest.raises(KeyError, match=r"^'e'$"):
          df.loc["e"]

      with pytest.raises(KeyError, match=re.escape("['e'] not in index")):
          df.loc[["a", "e"]]
  327. def test_ix_categorical_index(self):
  328. # GH 12531
  329. df = DataFrame(np.random.randn(3, 3), index=list("ABC"), columns=list("XYZ"))
  330. cdf = df.copy()
  331. cdf.index = CategoricalIndex(df.index)
  332. cdf.columns = CategoricalIndex(df.columns)
  333. expect = Series(df.loc["A", :], index=cdf.columns, name="A")
  334. tm.assert_series_equal(cdf.loc["A", :], expect)
  335. expect = Series(df.loc[:, "X"], index=cdf.index, name="X")
  336. tm.assert_series_equal(cdf.loc[:, "X"], expect)
  337. exp_index = CategoricalIndex(list("AB"), categories=["A", "B", "C"])
  338. expect = DataFrame(df.loc[["A", "B"], :], columns=cdf.columns, index=exp_index)
  339. tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
  340. exp_columns = CategoricalIndex(list("XY"), categories=["X", "Y", "Z"])
  341. expect = DataFrame(df.loc[:, ["X", "Y"]], index=cdf.index, columns=exp_columns)
  342. tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
  343. def test_ix_categorical_index_non_unique(self):
  344. # non-unique
  345. df = DataFrame(np.random.randn(3, 3), index=list("ABA"), columns=list("XYX"))
  346. cdf = df.copy()
  347. cdf.index = CategoricalIndex(df.index)
  348. cdf.columns = CategoricalIndex(df.columns)
  349. exp_index = CategoricalIndex(list("AA"), categories=["A", "B"])
  350. expect = DataFrame(df.loc["A", :], columns=cdf.columns, index=exp_index)
  351. tm.assert_frame_equal(cdf.loc["A", :], expect)
  352. exp_columns = CategoricalIndex(list("XX"), categories=["X", "Y"])
  353. expect = DataFrame(df.loc[:, "X"], index=cdf.index, columns=exp_columns)
  354. tm.assert_frame_equal(cdf.loc[:, "X"], expect)
  355. expect = DataFrame(
  356. df.loc[["A", "B"], :],
  357. columns=cdf.columns,
  358. index=CategoricalIndex(list("AAB")),
  359. )
  360. tm.assert_frame_equal(cdf.loc[["A", "B"], :], expect)
  361. expect = DataFrame(
  362. df.loc[:, ["X", "Y"]],
  363. index=cdf.index,
  364. columns=CategoricalIndex(list("XXY")),
  365. )
  366. tm.assert_frame_equal(cdf.loc[:, ["X", "Y"]], expect)
  def test_loc_slice(self, df):
      """Integer slice bounds are rejected; label slice bounds select rows."""
      # GH9748
      pattern = (
          "cannot do slice indexing on CategoricalIndex with these "
          r"indexers \[1\] of type int"
      )
      with pytest.raises(TypeError, match=pattern):
          df.loc[1:5]

      # "b" through "c" covers positions 2, 3 and 4 of the fixture
      by_label = df.loc["b":"c"]
      by_position = df.iloc[[2, 3, 4]]
      tm.assert_frame_equal(by_label, by_position)
  378. def test_loc_and_at_with_categorical_index(self):
  379. # GH 20629
  380. df = DataFrame(
  381. [[1, 2], [3, 4], [5, 6]], index=CategoricalIndex(["A", "B", "C"])
  382. )
  383. s = df[0]
  384. assert s.loc["A"] == 1
  385. assert s.at["A"] == 1
  386. assert df.loc["B", 1] == 4
  387. assert df.at["B", 1] == 4
  @pytest.mark.parametrize(
      "idx_values",
      [
          # python types
          [1, 2, 3],
          [-1, -2, -3],
          [1.5, 2.5, 3.5],
          [-1.5, -2.5, -3.5],
          # numpy int/uint
          *(np.array([1, 2, 3], dtype=int_dt) for int_dt in tm.ALL_INT_NUMPY_DTYPES),
          # numpy floats
          *(
              np.array([1.5, 2.5, 3.5], dtype=float_dt)
              for float_dt in tm.FLOAT_NUMPY_DTYPES
          ),
          # numpy object
          np.array([1, "b", 3.5], dtype=object),
          # pandas scalars
          [Interval(1, 4), Interval(4, 6), Interval(6, 9)],
          [Timestamp(2019, 1, 1), Timestamp(2019, 2, 1), Timestamp(2019, 3, 1)],
          [Timedelta(1, "d"), Timedelta(2, "d"), Timedelta(3, "D")],
          # pandas Integer arrays
          *(pd.array([1, 2, 3], dtype=ea_dt) for ea_dt in tm.ALL_INT_EA_DTYPES),
          # other pandas arrays
          pd.IntervalIndex.from_breaks([1, 4, 6, 9]).array,
          pd.date_range("2019-01-01", periods=3).array,
          pd.timedelta_range(start="1d", periods=3).array,
      ],
  )
  def test_loc_getitem_with_non_string_categories(self, idx_values, ordered):
      """Get and set through ``.loc`` when the categories are not strings.

      For every parametrized set of three index values, scalar, list and
      slice keys are used first to select and then to assign.
      """
      # GH-17569
      cat_idx = CategoricalIndex(idx_values, ordered=ordered)
      frame = DataFrame({"A": ["foo", "bar", "baz"]}, index=cat_idx)

      first_key = idx_values[0]
      first_two = idx_values[:2]
      first_to_second = slice(idx_values[0], idx_values[1])

      # scalar selection
      selected = frame.loc[first_key]
      tm.assert_series_equal(selected, Series(["foo"], index=["A"], name=first_key))

      # list selection
      selected = frame.loc[first_two]
      head = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
      tm.assert_frame_equal(selected, head)

      # slice selection
      selected = frame.loc[first_to_second]
      head = DataFrame(["foo", "bar"], index=cat_idx[:2], columns=["A"])
      tm.assert_frame_equal(selected, head)

      # scalar assignment
      target = frame.copy()
      target.loc[first_key] = "qux"
      tm.assert_frame_equal(
          target, DataFrame({"A": ["qux", "bar", "baz"]}, index=cat_idx)
      )

      # list assignment
      target = frame.copy()
      target.loc[first_two, "A"] = ["qux", "qux2"]
      tm.assert_frame_equal(
          target, DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
      )

      # slice assignment
      target = frame.copy()
      target.loc[first_to_second, "A"] = ["qux", "qux2"]
      tm.assert_frame_equal(
          target, DataFrame({"A": ["qux", "qux2", "baz"]}, index=cat_idx)
      )
  446. def test_getitem_categorical_with_nan(self):
  447. # GH#41933
  448. ci = CategoricalIndex(["A", "B", np.nan])
  449. ser = Series(range(3), index=ci)
  450. assert ser[np.nan] == 2
  451. assert ser.loc[np.nan] == 2
  452. df = DataFrame(ser)
  453. assert df.loc[np.nan, 0] == 2
  454. assert df.loc[np.nan][0] == 2