test_indexing.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382
  1. import math
  2. import numpy as np
  3. import pytest
  4. from pandas import (
  5. NA,
  6. Categorical,
  7. CategoricalIndex,
  8. Index,
  9. Interval,
  10. IntervalIndex,
  11. NaT,
  12. PeriodIndex,
  13. Series,
  14. Timedelta,
  15. Timestamp,
  16. )
  17. import pandas._testing as tm
  18. import pandas.core.common as com
  19. class TestCategoricalIndexingWithFactor:
  20. def test_getitem(self, factor):
  21. assert factor[0] == "a"
  22. assert factor[-1] == "c"
  23. subf = factor[[0, 1, 2]]
  24. tm.assert_numpy_array_equal(subf._codes, np.array([0, 1, 1], dtype=np.int8))
  25. subf = factor[np.asarray(factor) == "c"]
  26. tm.assert_numpy_array_equal(subf._codes, np.array([2, 2, 2], dtype=np.int8))
  27. def test_setitem(self, factor):
  28. # int/positional
  29. c = factor.copy()
  30. c[0] = "b"
  31. assert c[0] == "b"
  32. c[-1] = "a"
  33. assert c[-1] == "a"
  34. # boolean
  35. c = factor.copy()
  36. indexer = np.zeros(len(c), dtype="bool")
  37. indexer[0] = True
  38. indexer[-1] = True
  39. c[indexer] = "c"
  40. expected = Categorical(["c", "b", "b", "a", "a", "c", "c", "c"], ordered=True)
  41. tm.assert_categorical_equal(c, expected)
  42. @pytest.mark.parametrize(
  43. "other",
  44. [Categorical(["b", "a"]), Categorical(["b", "a"], categories=["b", "a"])],
  45. )
  46. def test_setitem_same_but_unordered(self, other):
  47. # GH-24142
  48. target = Categorical(["a", "b"], categories=["a", "b"])
  49. mask = np.array([True, False])
  50. target[mask] = other[mask]
  51. expected = Categorical(["b", "b"], categories=["a", "b"])
  52. tm.assert_categorical_equal(target, expected)
  53. @pytest.mark.parametrize(
  54. "other",
  55. [
  56. Categorical(["b", "a"], categories=["b", "a", "c"]),
  57. Categorical(["b", "a"], categories=["a", "b", "c"]),
  58. Categorical(["a", "a"], categories=["a"]),
  59. Categorical(["b", "b"], categories=["b"]),
  60. ],
  61. )
  62. def test_setitem_different_unordered_raises(self, other):
  63. # GH-24142
  64. target = Categorical(["a", "b"], categories=["a", "b"])
  65. mask = np.array([True, False])
  66. msg = "Cannot set a Categorical with another, without identical categories"
  67. with pytest.raises(TypeError, match=msg):
  68. target[mask] = other[mask]
  69. @pytest.mark.parametrize(
  70. "other",
  71. [
  72. Categorical(["b", "a"]),
  73. Categorical(["b", "a"], categories=["b", "a"], ordered=True),
  74. Categorical(["b", "a"], categories=["a", "b", "c"], ordered=True),
  75. ],
  76. )
  77. def test_setitem_same_ordered_raises(self, other):
  78. # Gh-24142
  79. target = Categorical(["a", "b"], categories=["a", "b"], ordered=True)
  80. mask = np.array([True, False])
  81. msg = "Cannot set a Categorical with another, without identical categories"
  82. with pytest.raises(TypeError, match=msg):
  83. target[mask] = other[mask]
  84. def test_setitem_tuple(self):
  85. # GH#20439
  86. cat = Categorical([(0, 1), (0, 2), (0, 1)])
  87. # This should not raise
  88. cat[1] = cat[0]
  89. assert cat[1] == (0, 1)
  90. def test_setitem_listlike(self):
  91. # GH#9469
  92. # properly coerce the input indexers
  93. np.random.seed(1)
  94. cat = Categorical(
  95. np.random.randint(0, 5, size=150000).astype(np.int8)
  96. ).add_categories([-1000])
  97. indexer = np.array([100000]).astype(np.int64)
  98. cat[indexer] = -1000
  99. # we are asserting the code result here
  100. # which maps to the -1000 category
  101. result = cat.codes[np.array([100000]).astype(np.int64)]
  102. tm.assert_numpy_array_equal(result, np.array([5], dtype="int8"))
  103. class TestCategoricalIndexing:
  104. def test_getitem_slice(self):
  105. cat = Categorical(["a", "b", "c", "d", "a", "b", "c"])
  106. sliced = cat[3]
  107. assert sliced == "d"
  108. sliced = cat[3:5]
  109. expected = Categorical(["d", "a"], categories=["a", "b", "c", "d"])
  110. tm.assert_categorical_equal(sliced, expected)
  111. def test_getitem_listlike(self):
  112. # GH 9469
  113. # properly coerce the input indexers
  114. np.random.seed(1)
  115. c = Categorical(np.random.randint(0, 5, size=150000).astype(np.int8))
  116. result = c.codes[np.array([100000]).astype(np.int64)]
  117. expected = c[np.array([100000]).astype(np.int64)].codes
  118. tm.assert_numpy_array_equal(result, expected)
  119. def test_periodindex(self):
  120. idx1 = PeriodIndex(
  121. ["2014-01", "2014-01", "2014-02", "2014-02", "2014-03", "2014-03"], freq="M"
  122. )
  123. cat1 = Categorical(idx1)
  124. str(cat1)
  125. exp_arr = np.array([0, 0, 1, 1, 2, 2], dtype=np.int8)
  126. exp_idx = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
  127. tm.assert_numpy_array_equal(cat1._codes, exp_arr)
  128. tm.assert_index_equal(cat1.categories, exp_idx)
  129. idx2 = PeriodIndex(
  130. ["2014-03", "2014-03", "2014-02", "2014-01", "2014-03", "2014-01"], freq="M"
  131. )
  132. cat2 = Categorical(idx2, ordered=True)
  133. str(cat2)
  134. exp_arr = np.array([2, 2, 1, 0, 2, 0], dtype=np.int8)
  135. exp_idx2 = PeriodIndex(["2014-01", "2014-02", "2014-03"], freq="M")
  136. tm.assert_numpy_array_equal(cat2._codes, exp_arr)
  137. tm.assert_index_equal(cat2.categories, exp_idx2)
  138. idx3 = PeriodIndex(
  139. [
  140. "2013-12",
  141. "2013-11",
  142. "2013-10",
  143. "2013-09",
  144. "2013-08",
  145. "2013-07",
  146. "2013-05",
  147. ],
  148. freq="M",
  149. )
  150. cat3 = Categorical(idx3, ordered=True)
  151. exp_arr = np.array([6, 5, 4, 3, 2, 1, 0], dtype=np.int8)
  152. exp_idx = PeriodIndex(
  153. [
  154. "2013-05",
  155. "2013-07",
  156. "2013-08",
  157. "2013-09",
  158. "2013-10",
  159. "2013-11",
  160. "2013-12",
  161. ],
  162. freq="M",
  163. )
  164. tm.assert_numpy_array_equal(cat3._codes, exp_arr)
  165. tm.assert_index_equal(cat3.categories, exp_idx)
  166. @pytest.mark.parametrize(
  167. "null_val",
  168. [None, np.nan, NaT, NA, math.nan, "NaT", "nat", "NAT", "nan", "NaN", "NAN"],
  169. )
  170. def test_periodindex_on_null_types(self, null_val):
  171. # GH 46673
  172. result = PeriodIndex(["2022-04-06", "2022-04-07", null_val], freq="D")
  173. expected = PeriodIndex(["2022-04-06", "2022-04-07", "NaT"], dtype="period[D]")
  174. assert result[2] is NaT
  175. tm.assert_index_equal(result, expected)
  176. @pytest.mark.parametrize("new_categories", [[1, 2, 3, 4], [1, 2]])
  177. def test_categories_assignments_wrong_length_raises(self, new_categories):
  178. cat = Categorical(["a", "b", "c", "a"])
  179. msg = (
  180. "new categories need to have the same number of items "
  181. "as the old categories!"
  182. )
  183. with pytest.raises(ValueError, match=msg):
  184. cat.rename_categories(new_categories)
  185. # Combinations of sorted/unique:
  186. @pytest.mark.parametrize(
  187. "idx_values", [[1, 2, 3, 4], [1, 3, 2, 4], [1, 3, 3, 4], [1, 2, 2, 4]]
  188. )
  189. # Combinations of missing/unique
  190. @pytest.mark.parametrize("key_values", [[1, 2], [1, 5], [1, 1], [5, 5]])
  191. @pytest.mark.parametrize("key_class", [Categorical, CategoricalIndex])
  192. @pytest.mark.parametrize("dtype", [None, "category", "key"])
  193. def test_get_indexer_non_unique(self, idx_values, key_values, key_class, dtype):
  194. # GH 21448
  195. key = key_class(key_values, categories=range(1, 5))
  196. if dtype == "key":
  197. dtype = key.dtype
  198. # Test for flat index and CategoricalIndex with same/different cats:
  199. idx = Index(idx_values, dtype=dtype)
  200. expected, exp_miss = idx.get_indexer_non_unique(key_values)
  201. result, res_miss = idx.get_indexer_non_unique(key)
  202. tm.assert_numpy_array_equal(expected, result)
  203. tm.assert_numpy_array_equal(exp_miss, res_miss)
  204. exp_unique = idx.unique().get_indexer(key_values)
  205. res_unique = idx.unique().get_indexer(key)
  206. tm.assert_numpy_array_equal(res_unique, exp_unique)
  207. def test_where_unobserved_nan(self):
  208. ser = Series(Categorical(["a", "b"]))
  209. result = ser.where([True, False])
  210. expected = Series(Categorical(["a", None], categories=["a", "b"]))
  211. tm.assert_series_equal(result, expected)
  212. # all NA
  213. ser = Series(Categorical(["a", "b"]))
  214. result = ser.where([False, False])
  215. expected = Series(Categorical([None, None], categories=["a", "b"]))
  216. tm.assert_series_equal(result, expected)
  217. def test_where_unobserved_categories(self):
  218. ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
  219. result = ser.where([True, True, False], other="b")
  220. expected = Series(Categorical(["a", "b", "b"], categories=ser.cat.categories))
  221. tm.assert_series_equal(result, expected)
  222. def test_where_other_categorical(self):
  223. ser = Series(Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"]))
  224. other = Categorical(["b", "c", "a"], categories=["a", "c", "b", "d"])
  225. result = ser.where([True, False, True], other)
  226. expected = Series(Categorical(["a", "c", "c"], dtype=ser.dtype))
  227. tm.assert_series_equal(result, expected)
  228. def test_where_new_category_raises(self):
  229. ser = Series(Categorical(["a", "b", "c"]))
  230. msg = "Cannot setitem on a Categorical with a new category"
  231. with pytest.raises(TypeError, match=msg):
  232. ser.where([True, False, True], "d")
  233. def test_where_ordered_differs_rasies(self):
  234. ser = Series(
  235. Categorical(["a", "b", "c"], categories=["d", "c", "b", "a"], ordered=True)
  236. )
  237. other = Categorical(
  238. ["b", "c", "a"], categories=["a", "c", "b", "d"], ordered=True
  239. )
  240. with pytest.raises(TypeError, match="without identical categories"):
  241. ser.where([True, False, True], other)
  242. class TestContains:
  243. def test_contains(self):
  244. # GH#21508
  245. cat = Categorical(list("aabbca"), categories=list("cab"))
  246. assert "b" in cat
  247. assert "z" not in cat
  248. assert np.nan not in cat
  249. with pytest.raises(TypeError, match="unhashable type: 'list'"):
  250. assert [1] in cat
  251. # assert codes NOT in index
  252. assert 0 not in cat
  253. assert 1 not in cat
  254. cat = Categorical(list("aabbca") + [np.nan], categories=list("cab"))
  255. assert np.nan in cat
  256. @pytest.mark.parametrize(
  257. "item, expected",
  258. [
  259. (Interval(0, 1), True),
  260. (1.5, True),
  261. (Interval(0.5, 1.5), False),
  262. ("a", False),
  263. (Timestamp(1), False),
  264. (Timedelta(1), False),
  265. ],
  266. ids=str,
  267. )
  268. def test_contains_interval(self, item, expected):
  269. # GH#23705
  270. cat = Categorical(IntervalIndex.from_breaks(range(3)))
  271. result = item in cat
  272. assert result is expected
  273. def test_contains_list(self):
  274. # GH#21729
  275. cat = Categorical([1, 2, 3])
  276. assert "a" not in cat
  277. with pytest.raises(TypeError, match="unhashable type"):
  278. ["a"] in cat
  279. with pytest.raises(TypeError, match="unhashable type"):
  280. ["a", "b"] in cat
  281. @pytest.mark.parametrize("index", [True, False])
  282. def test_mask_with_boolean(index):
  283. ser = Series(range(3))
  284. idx = Categorical([True, False, True])
  285. if index:
  286. idx = CategoricalIndex(idx)
  287. assert com.is_bool_indexer(idx)
  288. result = ser[idx]
  289. expected = ser[idx.astype("object")]
  290. tm.assert_series_equal(result, expected)
  291. @pytest.mark.parametrize("index", [True, False])
  292. def test_mask_with_boolean_na_treated_as_false(index):
  293. # https://github.com/pandas-dev/pandas/issues/31503
  294. ser = Series(range(3))
  295. idx = Categorical([True, False, None])
  296. if index:
  297. idx = CategoricalIndex(idx)
  298. result = ser[idx]
  299. expected = ser[idx.fillna(False)]
  300. tm.assert_series_equal(result, expected)
  301. @pytest.fixture
  302. def non_coercible_categorical(monkeypatch):
  303. """
  304. Monkeypatch Categorical.__array__ to ensure no implicit conversion.
  305. Raises
  306. ------
  307. ValueError
  308. When Categorical.__array__ is called.
  309. """
  310. # TODO(Categorical): identify other places where this may be
  311. # useful and move to a conftest.py
  312. def array(self, dtype=None):
  313. raise ValueError("I cannot be converted.")
  314. with monkeypatch.context() as m:
  315. m.setattr(Categorical, "__array__", array)
  316. yield
  317. def test_series_at():
  318. arr = Categorical(["a", "b", "c"])
  319. ser = Series(arr)
  320. result = ser.at[0]
  321. assert result == "a"