test_analytics.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335
  1. import re
  2. import sys
  3. import numpy as np
  4. import pytest
  5. from pandas.compat import PYPY
  6. from pandas import (
  7. Categorical,
  8. CategoricalDtype,
  9. Index,
  10. NaT,
  11. Series,
  12. date_range,
  13. )
  14. import pandas._testing as tm
  15. from pandas.api.types import is_scalar
  16. class TestCategoricalAnalytics:
  17. @pytest.mark.parametrize("aggregation", ["min", "max"])
  18. def test_min_max_not_ordered_raises(self, aggregation):
  19. # unordered cats have no min/max
  20. cat = Categorical(["a", "b", "c", "d"], ordered=False)
  21. msg = f"Categorical is not ordered for operation {aggregation}"
  22. agg_func = getattr(cat, aggregation)
  23. with pytest.raises(TypeError, match=msg):
  24. agg_func()
  25. ufunc = np.minimum if aggregation == "min" else np.maximum
  26. with pytest.raises(TypeError, match=msg):
  27. ufunc.reduce(cat)
  28. def test_min_max_ordered(self, index_or_series_or_array):
  29. cat = Categorical(["a", "b", "c", "d"], ordered=True)
  30. obj = index_or_series_or_array(cat)
  31. _min = obj.min()
  32. _max = obj.max()
  33. assert _min == "a"
  34. assert _max == "d"
  35. assert np.minimum.reduce(obj) == "a"
  36. assert np.maximum.reduce(obj) == "d"
  37. # TODO: raises if we pass axis=0 (on Index and Categorical, not Series)
  38. cat = Categorical(
  39. ["a", "b", "c", "d"], categories=["d", "c", "b", "a"], ordered=True
  40. )
  41. obj = index_or_series_or_array(cat)
  42. _min = obj.min()
  43. _max = obj.max()
  44. assert _min == "d"
  45. assert _max == "a"
  46. assert np.minimum.reduce(obj) == "d"
  47. assert np.maximum.reduce(obj) == "a"
  48. @pytest.mark.parametrize(
  49. "categories,expected",
  50. [
  51. (list("ABC"), np.NaN),
  52. ([1, 2, 3], np.NaN),
  53. pytest.param(
  54. Series(date_range("2020-01-01", periods=3), dtype="category"),
  55. NaT,
  56. marks=pytest.mark.xfail(
  57. reason="https://github.com/pandas-dev/pandas/issues/29962"
  58. ),
  59. ),
  60. ],
  61. )
  62. @pytest.mark.parametrize("aggregation", ["min", "max"])
  63. def test_min_max_ordered_empty(self, categories, expected, aggregation):
  64. # GH 30227
  65. cat = Categorical([], categories=categories, ordered=True)
  66. agg_func = getattr(cat, aggregation)
  67. result = agg_func()
  68. assert result is expected
  69. @pytest.mark.parametrize(
  70. "values, categories",
  71. [(["a", "b", "c", np.nan], list("cba")), ([1, 2, 3, np.nan], [3, 2, 1])],
  72. )
  73. @pytest.mark.parametrize("skipna", [True, False])
  74. @pytest.mark.parametrize("function", ["min", "max"])
  75. def test_min_max_with_nan(self, values, categories, function, skipna):
  76. # GH 25303
  77. cat = Categorical(values, categories=categories, ordered=True)
  78. result = getattr(cat, function)(skipna=skipna)
  79. if skipna is False:
  80. assert result is np.nan
  81. else:
  82. expected = categories[0] if function == "min" else categories[2]
  83. assert result == expected
  84. @pytest.mark.parametrize("function", ["min", "max"])
  85. @pytest.mark.parametrize("skipna", [True, False])
  86. def test_min_max_only_nan(self, function, skipna):
  87. # https://github.com/pandas-dev/pandas/issues/33450
  88. cat = Categorical([np.nan], categories=[1, 2], ordered=True)
  89. result = getattr(cat, function)(skipna=skipna)
  90. assert result is np.nan
  91. @pytest.mark.parametrize("method", ["min", "max"])
  92. def test_numeric_only_min_max_raises(self, method):
  93. # GH 25303
  94. cat = Categorical(
  95. [np.nan, 1, 2, np.nan], categories=[5, 4, 3, 2, 1], ordered=True
  96. )
  97. with pytest.raises(TypeError, match=".* got an unexpected keyword"):
  98. getattr(cat, method)(numeric_only=True)
  99. @pytest.mark.parametrize("method", ["min", "max"])
  100. def test_numpy_min_max_raises(self, method):
  101. cat = Categorical(["a", "b", "c", "b"], ordered=False)
  102. msg = (
  103. f"Categorical is not ordered for operation {method}\n"
  104. "you can use .as_ordered() to change the Categorical to an ordered one"
  105. )
  106. method = getattr(np, method)
  107. with pytest.raises(TypeError, match=re.escape(msg)):
  108. method(cat)
  109. @pytest.mark.parametrize("kwarg", ["axis", "out", "keepdims"])
  110. @pytest.mark.parametrize("method", ["min", "max"])
  111. def test_numpy_min_max_unsupported_kwargs_raises(self, method, kwarg):
  112. cat = Categorical(["a", "b", "c", "b"], ordered=True)
  113. msg = (
  114. f"the '{kwarg}' parameter is not supported in the pandas implementation "
  115. f"of {method}"
  116. )
  117. if kwarg == "axis":
  118. msg = r"`axis` must be fewer than the number of dimensions \(1\)"
  119. kwargs = {kwarg: 42}
  120. method = getattr(np, method)
  121. with pytest.raises(ValueError, match=msg):
  122. method(cat, **kwargs)
  123. @pytest.mark.parametrize("method, expected", [("min", "a"), ("max", "c")])
  124. def test_numpy_min_max_axis_equals_none(self, method, expected):
  125. cat = Categorical(["a", "b", "c", "b"], ordered=True)
  126. method = getattr(np, method)
  127. result = method(cat, axis=None)
  128. assert result == expected
  129. @pytest.mark.parametrize(
  130. "values,categories,exp_mode",
  131. [
  132. ([1, 1, 2, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5]),
  133. ([1, 1, 1, 4, 5, 5, 5], [5, 4, 3, 2, 1], [5, 1]),
  134. ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1], [5, 4, 3, 2, 1]),
  135. ([np.nan, np.nan, np.nan, 4, 5], [5, 4, 3, 2, 1], [5, 4]),
  136. ([np.nan, np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
  137. ([np.nan, np.nan, 4, 5, 4], [5, 4, 3, 2, 1], [4]),
  138. ],
  139. )
  140. def test_mode(self, values, categories, exp_mode):
  141. cat = Categorical(values, categories=categories, ordered=True)
  142. res = Series(cat).mode()._values
  143. exp = Categorical(exp_mode, categories=categories, ordered=True)
  144. tm.assert_categorical_equal(res, exp)
  145. def test_searchsorted(self, ordered):
  146. # https://github.com/pandas-dev/pandas/issues/8420
  147. # https://github.com/pandas-dev/pandas/issues/14522
  148. cat = Categorical(
  149. ["cheese", "milk", "apple", "bread", "bread"],
  150. categories=["cheese", "milk", "apple", "bread"],
  151. ordered=ordered,
  152. )
  153. ser = Series(cat)
  154. # Searching for single item argument, side='left' (default)
  155. res_cat = cat.searchsorted("apple")
  156. assert res_cat == 2
  157. assert is_scalar(res_cat)
  158. res_ser = ser.searchsorted("apple")
  159. assert res_ser == 2
  160. assert is_scalar(res_ser)
  161. # Searching for single item array, side='left' (default)
  162. res_cat = cat.searchsorted(["bread"])
  163. res_ser = ser.searchsorted(["bread"])
  164. exp = np.array([3], dtype=np.intp)
  165. tm.assert_numpy_array_equal(res_cat, exp)
  166. tm.assert_numpy_array_equal(res_ser, exp)
  167. # Searching for several items array, side='right'
  168. res_cat = cat.searchsorted(["apple", "bread"], side="right")
  169. res_ser = ser.searchsorted(["apple", "bread"], side="right")
  170. exp = np.array([3, 5], dtype=np.intp)
  171. tm.assert_numpy_array_equal(res_cat, exp)
  172. tm.assert_numpy_array_equal(res_ser, exp)
  173. # Searching for a single value that is not from the Categorical
  174. with pytest.raises(TypeError, match="cucumber"):
  175. cat.searchsorted("cucumber")
  176. with pytest.raises(TypeError, match="cucumber"):
  177. ser.searchsorted("cucumber")
  178. # Searching for multiple values one of each is not from the Categorical
  179. msg = (
  180. "Cannot setitem on a Categorical with a new category, "
  181. "set the categories first"
  182. )
  183. with pytest.raises(TypeError, match=msg):
  184. cat.searchsorted(["bread", "cucumber"])
  185. with pytest.raises(TypeError, match=msg):
  186. ser.searchsorted(["bread", "cucumber"])
  187. def test_unique(self, ordered):
  188. # GH38140
  189. dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered)
  190. # categories are reordered based on value when ordered=False
  191. cat = Categorical(["a", "b", "c"], dtype=dtype)
  192. res = cat.unique()
  193. tm.assert_categorical_equal(res, cat)
  194. cat = Categorical(["a", "b", "a", "a"], dtype=dtype)
  195. res = cat.unique()
  196. tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype))
  197. cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype)
  198. res = cat.unique()
  199. exp_cat = Categorical(["c", "a", "b"], dtype=dtype)
  200. tm.assert_categorical_equal(res, exp_cat)
  201. # nan must be removed
  202. cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype)
  203. res = cat.unique()
  204. exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype)
  205. tm.assert_categorical_equal(res, exp_cat)
  206. def test_unique_index_series(self, ordered):
  207. # GH38140
  208. dtype = CategoricalDtype([3, 2, 1], ordered=ordered)
  209. c = Categorical([3, 1, 2, 2, 1], dtype=dtype)
  210. # Categorical.unique sorts categories by appearance order
  211. # if ordered=False
  212. exp = Categorical([3, 1, 2], dtype=dtype)
  213. tm.assert_categorical_equal(c.unique(), exp)
  214. tm.assert_index_equal(Index(c).unique(), Index(exp))
  215. tm.assert_categorical_equal(Series(c).unique(), exp)
  216. c = Categorical([1, 1, 2, 2], dtype=dtype)
  217. exp = Categorical([1, 2], dtype=dtype)
  218. tm.assert_categorical_equal(c.unique(), exp)
  219. tm.assert_index_equal(Index(c).unique(), Index(exp))
  220. tm.assert_categorical_equal(Series(c).unique(), exp)
  221. def test_shift(self):
  222. # GH 9416
  223. cat = Categorical(["a", "b", "c", "d", "a"])
  224. # shift forward
  225. sp1 = cat.shift(1)
  226. xp1 = Categorical([np.nan, "a", "b", "c", "d"])
  227. tm.assert_categorical_equal(sp1, xp1)
  228. tm.assert_categorical_equal(cat[:-1], sp1[1:])
  229. # shift back
  230. sn2 = cat.shift(-2)
  231. xp2 = Categorical(
  232. ["c", "d", "a", np.nan, np.nan], categories=["a", "b", "c", "d"]
  233. )
  234. tm.assert_categorical_equal(sn2, xp2)
  235. tm.assert_categorical_equal(cat[2:], sn2[:-2])
  236. # shift by zero
  237. tm.assert_categorical_equal(cat, cat.shift(0))
  238. def test_nbytes(self):
  239. cat = Categorical([1, 2, 3])
  240. exp = 3 + 3 * 8 # 3 int8s for values + 3 int64s for categories
  241. assert cat.nbytes == exp
  242. def test_memory_usage(self):
  243. cat = Categorical([1, 2, 3])
  244. # .categories is an index, so we include the hashtable
  245. assert 0 < cat.nbytes <= cat.memory_usage()
  246. assert 0 < cat.nbytes <= cat.memory_usage(deep=True)
  247. cat = Categorical(["foo", "foo", "bar"])
  248. assert cat.memory_usage(deep=True) > cat.nbytes
  249. if not PYPY:
  250. # sys.getsizeof will call the .memory_usage with
  251. # deep=True, and add on some GC overhead
  252. diff = cat.memory_usage(deep=True) - sys.getsizeof(cat)
  253. assert abs(diff) < 100
  254. def test_map(self):
  255. c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
  256. result = c.map(lambda x: x.lower())
  257. exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
  258. tm.assert_categorical_equal(result, exp)
  259. c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False)
  260. result = c.map(lambda x: x.lower())
  261. exp = Categorical(list("ababc"), categories=list("abc"), ordered=False)
  262. tm.assert_categorical_equal(result, exp)
  263. result = c.map(lambda x: 1)
  264. # GH 12766: Return an index not an array
  265. tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
  266. @pytest.mark.parametrize("value", [1, "True", [1, 2, 3], 5.0])
  267. def test_validate_inplace_raises(self, value):
  268. cat = Categorical(["A", "B", "B", "C", "A"])
  269. msg = (
  270. 'For argument "inplace" expected type bool, '
  271. f"received type {type(value).__name__}"
  272. )
  273. with pytest.raises(ValueError, match=msg):
  274. cat.sort_values(inplace=value)
  275. def test_quantile_empty(self):
  276. # make sure we have correct itemsize on resulting codes
  277. cat = Categorical(["A", "B"])
  278. idx = Index([0.0, 0.5])
  279. result = cat[:0]._quantile(idx, interpolation="linear")
  280. assert result._codes.dtype == np.int8
  281. expected = cat.take([-1, -1], allow_fill=True)
  282. tm.assert_extension_array_equal(result, expected)