# test_indexing.py -- tests for CategoricalIndex take, get_loc, get_indexer, where and containment.
import numpy as np
import pytest

from pandas.errors import InvalidIndexError

import pandas as pd
from pandas import (
    CategoricalIndex,
    Index,
    IntervalIndex,
    Timestamp,
)
import pandas._testing as tm
  12. class TestTake:
  13. def test_take_fill_value(self):
  14. # GH 12631
  15. # numeric category
  16. idx = CategoricalIndex([1, 2, 3], name="xxx")
  17. result = idx.take(np.array([1, 0, -1]))
  18. expected = CategoricalIndex([2, 1, 3], name="xxx")
  19. tm.assert_index_equal(result, expected)
  20. tm.assert_categorical_equal(result.values, expected.values)
  21. # fill_value
  22. result = idx.take(np.array([1, 0, -1]), fill_value=True)
  23. expected = CategoricalIndex([2, 1, np.nan], categories=[1, 2, 3], name="xxx")
  24. tm.assert_index_equal(result, expected)
  25. tm.assert_categorical_equal(result.values, expected.values)
  26. # allow_fill=False
  27. result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
  28. expected = CategoricalIndex([2, 1, 3], name="xxx")
  29. tm.assert_index_equal(result, expected)
  30. tm.assert_categorical_equal(result.values, expected.values)
  31. # object category
  32. idx = CategoricalIndex(
  33. list("CBA"), categories=list("ABC"), ordered=True, name="xxx"
  34. )
  35. result = idx.take(np.array([1, 0, -1]))
  36. expected = CategoricalIndex(
  37. list("BCA"), categories=list("ABC"), ordered=True, name="xxx"
  38. )
  39. tm.assert_index_equal(result, expected)
  40. tm.assert_categorical_equal(result.values, expected.values)
  41. # fill_value
  42. result = idx.take(np.array([1, 0, -1]), fill_value=True)
  43. expected = CategoricalIndex(
  44. ["B", "C", np.nan], categories=list("ABC"), ordered=True, name="xxx"
  45. )
  46. tm.assert_index_equal(result, expected)
  47. tm.assert_categorical_equal(result.values, expected.values)
  48. # allow_fill=False
  49. result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
  50. expected = CategoricalIndex(
  51. list("BCA"), categories=list("ABC"), ordered=True, name="xxx"
  52. )
  53. tm.assert_index_equal(result, expected)
  54. tm.assert_categorical_equal(result.values, expected.values)
  55. msg = (
  56. "When allow_fill=True and fill_value is not None, "
  57. "all indices must be >= -1"
  58. )
  59. with pytest.raises(ValueError, match=msg):
  60. idx.take(np.array([1, 0, -2]), fill_value=True)
  61. with pytest.raises(ValueError, match=msg):
  62. idx.take(np.array([1, 0, -5]), fill_value=True)
  63. msg = "index -5 is out of bounds for (axis 0 with )?size 3"
  64. with pytest.raises(IndexError, match=msg):
  65. idx.take(np.array([1, -5]))
  66. def test_take_fill_value_datetime(self):
  67. # datetime category
  68. idx = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"], name="xxx")
  69. idx = CategoricalIndex(idx)
  70. result = idx.take(np.array([1, 0, -1]))
  71. expected = pd.DatetimeIndex(
  72. ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx"
  73. )
  74. expected = CategoricalIndex(expected)
  75. tm.assert_index_equal(result, expected)
  76. # fill_value
  77. result = idx.take(np.array([1, 0, -1]), fill_value=True)
  78. expected = pd.DatetimeIndex(["2011-02-01", "2011-01-01", "NaT"], name="xxx")
  79. exp_cats = pd.DatetimeIndex(["2011-01-01", "2011-02-01", "2011-03-01"])
  80. expected = CategoricalIndex(expected, categories=exp_cats)
  81. tm.assert_index_equal(result, expected)
  82. # allow_fill=False
  83. result = idx.take(np.array([1, 0, -1]), allow_fill=False, fill_value=True)
  84. expected = pd.DatetimeIndex(
  85. ["2011-02-01", "2011-01-01", "2011-03-01"], name="xxx"
  86. )
  87. expected = CategoricalIndex(expected)
  88. tm.assert_index_equal(result, expected)
  89. msg = (
  90. "When allow_fill=True and fill_value is not None, "
  91. "all indices must be >= -1"
  92. )
  93. with pytest.raises(ValueError, match=msg):
  94. idx.take(np.array([1, 0, -2]), fill_value=True)
  95. with pytest.raises(ValueError, match=msg):
  96. idx.take(np.array([1, 0, -5]), fill_value=True)
  97. msg = "index -5 is out of bounds for (axis 0 with )?size 3"
  98. with pytest.raises(IndexError, match=msg):
  99. idx.take(np.array([1, -5]))
  100. def test_take_invalid_kwargs(self):
  101. idx = CategoricalIndex([1, 2, 3], name="foo")
  102. indices = [1, 0, -1]
  103. msg = r"take\(\) got an unexpected keyword argument 'foo'"
  104. with pytest.raises(TypeError, match=msg):
  105. idx.take(indices, foo=2)
  106. msg = "the 'out' parameter is not supported"
  107. with pytest.raises(ValueError, match=msg):
  108. idx.take(indices, out=indices)
  109. msg = "the 'mode' parameter is not supported"
  110. with pytest.raises(ValueError, match=msg):
  111. idx.take(indices, mode="clip")
  112. class TestGetLoc:
  113. def test_get_loc(self):
  114. # GH 12531
  115. cidx1 = CategoricalIndex(list("abcde"), categories=list("edabc"))
  116. idx1 = Index(list("abcde"))
  117. assert cidx1.get_loc("a") == idx1.get_loc("a")
  118. assert cidx1.get_loc("e") == idx1.get_loc("e")
  119. for i in [cidx1, idx1]:
  120. with pytest.raises(KeyError, match="'NOT-EXIST'"):
  121. i.get_loc("NOT-EXIST")
  122. # non-unique
  123. cidx2 = CategoricalIndex(list("aacded"), categories=list("edabc"))
  124. idx2 = Index(list("aacded"))
  125. # results in bool array
  126. res = cidx2.get_loc("d")
  127. tm.assert_numpy_array_equal(res, idx2.get_loc("d"))
  128. tm.assert_numpy_array_equal(
  129. res, np.array([False, False, False, True, False, True])
  130. )
  131. # unique element results in scalar
  132. res = cidx2.get_loc("e")
  133. assert res == idx2.get_loc("e")
  134. assert res == 4
  135. for i in [cidx2, idx2]:
  136. with pytest.raises(KeyError, match="'NOT-EXIST'"):
  137. i.get_loc("NOT-EXIST")
  138. # non-unique, sliceable
  139. cidx3 = CategoricalIndex(list("aabbb"), categories=list("abc"))
  140. idx3 = Index(list("aabbb"))
  141. # results in slice
  142. res = cidx3.get_loc("a")
  143. assert res == idx3.get_loc("a")
  144. assert res == slice(0, 2, None)
  145. res = cidx3.get_loc("b")
  146. assert res == idx3.get_loc("b")
  147. assert res == slice(2, 5, None)
  148. for i in [cidx3, idx3]:
  149. with pytest.raises(KeyError, match="'c'"):
  150. i.get_loc("c")
  151. def test_get_loc_unique(self):
  152. cidx = CategoricalIndex(list("abc"))
  153. result = cidx.get_loc("b")
  154. assert result == 1
  155. def test_get_loc_monotonic_nonunique(self):
  156. cidx = CategoricalIndex(list("abbc"))
  157. result = cidx.get_loc("b")
  158. expected = slice(1, 3, None)
  159. assert result == expected
  160. def test_get_loc_nonmonotonic_nonunique(self):
  161. cidx = CategoricalIndex(list("abcb"))
  162. result = cidx.get_loc("b")
  163. expected = np.array([False, True, False, True], dtype=bool)
  164. tm.assert_numpy_array_equal(result, expected)
  165. def test_get_loc_nan(self):
  166. # GH#41933
  167. ci = CategoricalIndex(["A", "B", np.nan])
  168. res = ci.get_loc(np.nan)
  169. assert res == 2
  170. class TestGetIndexer:
  171. def test_get_indexer_base(self):
  172. # Determined by cat ordering.
  173. idx = CategoricalIndex(list("cab"), categories=list("cab"))
  174. expected = np.arange(len(idx), dtype=np.intp)
  175. actual = idx.get_indexer(idx)
  176. tm.assert_numpy_array_equal(expected, actual)
  177. with pytest.raises(ValueError, match="Invalid fill method"):
  178. idx.get_indexer(idx, method="invalid")
  179. def test_get_indexer_requires_unique(self):
  180. np.random.seed(123456789)
  181. ci = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False)
  182. oidx = Index(np.array(ci))
  183. msg = "Reindexing only valid with uniquely valued Index objects"
  184. for n in [1, 2, 5, len(ci)]:
  185. finder = oidx[np.random.randint(0, len(ci), size=n)]
  186. with pytest.raises(InvalidIndexError, match=msg):
  187. ci.get_indexer(finder)
  188. # see gh-17323
  189. #
  190. # Even when indexer is equal to the
  191. # members in the index, we should
  192. # respect duplicates instead of taking
  193. # the fast-track path.
  194. for finder in [list("aabbca"), list("aababca")]:
  195. with pytest.raises(InvalidIndexError, match=msg):
  196. ci.get_indexer(finder)
  197. def test_get_indexer_non_unique(self):
  198. idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
  199. idx2 = CategoricalIndex(list("abf"))
  200. for indexer in [idx2, list("abf"), Index(list("abf"))]:
  201. msg = "Reindexing only valid with uniquely valued Index objects"
  202. with pytest.raises(InvalidIndexError, match=msg):
  203. idx1.get_indexer(indexer)
  204. r1, _ = idx1.get_indexer_non_unique(indexer)
  205. expected = np.array([0, 1, 2, -1], dtype=np.intp)
  206. tm.assert_almost_equal(r1, expected)
  207. def test_get_indexer_method(self):
  208. idx1 = CategoricalIndex(list("aabcde"), categories=list("edabc"))
  209. idx2 = CategoricalIndex(list("abf"))
  210. msg = "method pad not yet implemented for CategoricalIndex"
  211. with pytest.raises(NotImplementedError, match=msg):
  212. idx2.get_indexer(idx1, method="pad")
  213. msg = "method backfill not yet implemented for CategoricalIndex"
  214. with pytest.raises(NotImplementedError, match=msg):
  215. idx2.get_indexer(idx1, method="backfill")
  216. msg = "method nearest not yet implemented for CategoricalIndex"
  217. with pytest.raises(NotImplementedError, match=msg):
  218. idx2.get_indexer(idx1, method="nearest")
  219. def test_get_indexer_array(self):
  220. arr = np.array(
  221. [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")],
  222. dtype=object,
  223. )
  224. cats = [Timestamp("1999-12-31 00:00:00"), Timestamp("2000-12-31 00:00:00")]
  225. ci = CategoricalIndex(cats, categories=cats, ordered=False, dtype="category")
  226. result = ci.get_indexer(arr)
  227. expected = np.array([0, 1], dtype="intp")
  228. tm.assert_numpy_array_equal(result, expected)
  229. def test_get_indexer_same_categories_same_order(self):
  230. ci = CategoricalIndex(["a", "b"], categories=["a", "b"])
  231. result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["a", "b"]))
  232. expected = np.array([1, 1], dtype="intp")
  233. tm.assert_numpy_array_equal(result, expected)
  234. def test_get_indexer_same_categories_different_order(self):
  235. # https://github.com/pandas-dev/pandas/issues/19551
  236. ci = CategoricalIndex(["a", "b"], categories=["a", "b"])
  237. result = ci.get_indexer(CategoricalIndex(["b", "b"], categories=["b", "a"]))
  238. expected = np.array([1, 1], dtype="intp")
  239. tm.assert_numpy_array_equal(result, expected)
  240. def test_get_indexer_nans_in_index_and_target(self):
  241. # GH 45361
  242. ci = CategoricalIndex([1, 2, np.nan, 3])
  243. other1 = [2, 3, 4, np.nan]
  244. res1 = ci.get_indexer(other1)
  245. expected1 = np.array([1, 3, -1, 2], dtype=np.intp)
  246. tm.assert_numpy_array_equal(res1, expected1)
  247. other2 = [1, 4, 2, 3]
  248. res2 = ci.get_indexer(other2)
  249. expected2 = np.array([0, -1, 1, 3], dtype=np.intp)
  250. tm.assert_numpy_array_equal(res2, expected2)
  251. class TestWhere:
  252. def test_where(self, listlike_box):
  253. klass = listlike_box
  254. i = CategoricalIndex(list("aabbca"), categories=list("cab"), ordered=False)
  255. cond = [True] * len(i)
  256. expected = i
  257. result = i.where(klass(cond))
  258. tm.assert_index_equal(result, expected)
  259. cond = [False] + [True] * (len(i) - 1)
  260. expected = CategoricalIndex([np.nan] + i[1:].tolist(), categories=i.categories)
  261. result = i.where(klass(cond))
  262. tm.assert_index_equal(result, expected)
  263. def test_where_non_categories(self):
  264. ci = CategoricalIndex(["a", "b", "c", "d"])
  265. mask = np.array([True, False, True, False])
  266. result = ci.where(mask, 2)
  267. expected = Index(["a", 2, "c", 2], dtype=object)
  268. tm.assert_index_equal(result, expected)
  269. msg = "Cannot setitem on a Categorical with a new category"
  270. with pytest.raises(TypeError, match=msg):
  271. # Test the Categorical method directly
  272. ci._data._where(mask, 2)
  273. class TestContains:
  274. def test_contains(self):
  275. ci = CategoricalIndex(list("aabbca"), categories=list("cabdef"), ordered=False)
  276. assert "a" in ci
  277. assert "z" not in ci
  278. assert "e" not in ci
  279. assert np.nan not in ci
  280. # assert codes NOT in index
  281. assert 0 not in ci
  282. assert 1 not in ci
  283. def test_contains_nan(self):
  284. ci = CategoricalIndex(list("aabbca") + [np.nan], categories=list("cabdef"))
  285. assert np.nan in ci
  286. @pytest.mark.parametrize("unwrap", [True, False])
  287. def test_contains_na_dtype(self, unwrap):
  288. dti = pd.date_range("2016-01-01", periods=100).insert(0, pd.NaT)
  289. pi = dti.to_period("D")
  290. tdi = dti - dti[-1]
  291. ci = CategoricalIndex(dti)
  292. obj = ci
  293. if unwrap:
  294. obj = ci._data
  295. assert np.nan in obj
  296. assert None in obj
  297. assert pd.NaT in obj
  298. assert np.datetime64("NaT") in obj
  299. assert np.timedelta64("NaT") not in obj
  300. obj2 = CategoricalIndex(tdi)
  301. if unwrap:
  302. obj2 = obj2._data
  303. assert np.nan in obj2
  304. assert None in obj2
  305. assert pd.NaT in obj2
  306. assert np.datetime64("NaT") not in obj2
  307. assert np.timedelta64("NaT") in obj2
  308. obj3 = CategoricalIndex(pi)
  309. if unwrap:
  310. obj3 = obj3._data
  311. assert np.nan in obj3
  312. assert None in obj3
  313. assert pd.NaT in obj3
  314. assert np.datetime64("NaT") not in obj3
  315. assert np.timedelta64("NaT") not in obj3
  316. @pytest.mark.parametrize(
  317. "item, expected",
  318. [
  319. (pd.Interval(0, 1), True),
  320. (1.5, True),
  321. (pd.Interval(0.5, 1.5), False),
  322. ("a", False),
  323. (Timestamp(1), False),
  324. (pd.Timedelta(1), False),
  325. ],
  326. ids=str,
  327. )
  328. def test_contains_interval(self, item, expected):
  329. # GH 23705
  330. ci = CategoricalIndex(IntervalIndex.from_breaks(range(3)))
  331. result = item in ci
  332. assert result is expected
  333. def test_contains_list(self):
  334. # GH#21729
  335. idx = CategoricalIndex([1, 2, 3])
  336. assert "a" not in idx
  337. with pytest.raises(TypeError, match="unhashable type"):
  338. ["a"] in idx
  339. with pytest.raises(TypeError, match="unhashable type"):
  340. ["a", "b"] in idx