# test_value_counts.py
  1. import collections
  2. from datetime import timedelta
  3. import numpy as np
  4. import pytest
  5. import pandas as pd
  6. from pandas import (
  7. DatetimeIndex,
  8. Index,
  9. Interval,
  10. IntervalIndex,
  11. MultiIndex,
  12. Series,
  13. Timedelta,
  14. TimedeltaIndex,
  15. )
  16. import pandas._testing as tm
  17. from pandas.tests.base.common import allow_na_ops
  18. def test_value_counts(index_or_series_obj):
  19. obj = index_or_series_obj
  20. obj = np.repeat(obj, range(1, len(obj) + 1))
  21. result = obj.value_counts()
  22. counter = collections.Counter(obj)
  23. expected = Series(dict(counter.most_common()), dtype=np.int64, name="count")
  24. if obj.dtype != np.float16:
  25. expected.index = expected.index.astype(obj.dtype)
  26. else:
  27. with pytest.raises(NotImplementedError, match="float16 indexes are not "):
  28. expected.index.astype(obj.dtype)
  29. return
  30. if isinstance(expected.index, MultiIndex):
  31. expected.index.names = obj.names
  32. else:
  33. expected.index.name = obj.name
  34. if not isinstance(result.dtype, np.dtype):
  35. if getattr(obj.dtype, "storage", "") == "pyarrow":
  36. expected = expected.astype("int64[pyarrow]")
  37. else:
  38. # i.e IntegerDtype
  39. expected = expected.astype("Int64")
  40. # TODO(GH#32514): Order of entries with the same count is inconsistent
  41. # on CI (gh-32449)
  42. if obj.duplicated().any():
  43. result = result.sort_index()
  44. expected = expected.sort_index()
  45. tm.assert_series_equal(result, expected)
  46. @pytest.mark.parametrize("null_obj", [np.nan, None])
  47. def test_value_counts_null(null_obj, index_or_series_obj):
  48. orig = index_or_series_obj
  49. obj = orig.copy()
  50. if not allow_na_ops(obj):
  51. pytest.skip("type doesn't allow for NA operations")
  52. elif len(obj) < 1:
  53. pytest.skip("Test doesn't make sense on empty data")
  54. elif isinstance(orig, MultiIndex):
  55. pytest.skip(f"MultiIndex can't hold '{null_obj}'")
  56. values = obj._values
  57. values[0:2] = null_obj
  58. klass = type(obj)
  59. repeated_values = np.repeat(values, range(1, len(values) + 1))
  60. obj = klass(repeated_values, dtype=obj.dtype)
  61. # because np.nan == np.nan is False, but None == None is True
  62. # np.nan would be duplicated, whereas None wouldn't
  63. counter = collections.Counter(obj.dropna())
  64. expected = Series(dict(counter.most_common()), dtype=np.int64, name="count")
  65. if obj.dtype != np.float16:
  66. expected.index = expected.index.astype(obj.dtype)
  67. else:
  68. with pytest.raises(NotImplementedError, match="float16 indexes are not "):
  69. expected.index.astype(obj.dtype)
  70. return
  71. expected.index.name = obj.name
  72. result = obj.value_counts()
  73. if obj.duplicated().any():
  74. # TODO(GH#32514):
  75. # Order of entries with the same count is inconsistent on CI (gh-32449)
  76. expected = expected.sort_index()
  77. result = result.sort_index()
  78. if not isinstance(result.dtype, np.dtype):
  79. if getattr(obj.dtype, "storage", "") == "pyarrow":
  80. expected = expected.astype("int64[pyarrow]")
  81. else:
  82. # i.e IntegerDtype
  83. expected = expected.astype("Int64")
  84. tm.assert_series_equal(result, expected)
  85. expected[null_obj] = 3
  86. result = obj.value_counts(dropna=False)
  87. if obj.duplicated().any():
  88. # TODO(GH#32514):
  89. # Order of entries with the same count is inconsistent on CI (gh-32449)
  90. expected = expected.sort_index()
  91. result = result.sort_index()
  92. tm.assert_series_equal(result, expected)
  93. def test_value_counts_inferred(index_or_series):
  94. klass = index_or_series
  95. s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
  96. s = klass(s_values)
  97. expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"], name="count")
  98. tm.assert_series_equal(s.value_counts(), expected)
  99. if isinstance(s, Index):
  100. exp = Index(np.unique(np.array(s_values, dtype=np.object_)))
  101. tm.assert_index_equal(s.unique(), exp)
  102. else:
  103. exp = np.unique(np.array(s_values, dtype=np.object_))
  104. tm.assert_numpy_array_equal(s.unique(), exp)
  105. assert s.nunique() == 4
  106. # don't sort, have to sort after the fact as not sorting is
  107. # platform-dep
  108. hist = s.value_counts(sort=False).sort_values()
  109. expected = Series([3, 1, 4, 2], index=list("acbd"), name="count").sort_values()
  110. tm.assert_series_equal(hist, expected)
  111. # sort ascending
  112. hist = s.value_counts(ascending=True)
  113. expected = Series([1, 2, 3, 4], index=list("cdab"), name="count")
  114. tm.assert_series_equal(hist, expected)
  115. # relative histogram.
  116. hist = s.value_counts(normalize=True)
  117. expected = Series(
  118. [0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"], name="proportion"
  119. )
  120. tm.assert_series_equal(hist, expected)
  121. def test_value_counts_bins(index_or_series):
  122. klass = index_or_series
  123. s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
  124. s = klass(s_values)
  125. # bins
  126. msg = "bins argument only works with numeric data"
  127. with pytest.raises(TypeError, match=msg):
  128. s.value_counts(bins=1)
  129. s1 = Series([1, 1, 2, 3])
  130. res1 = s1.value_counts(bins=1)
  131. exp1 = Series({Interval(0.997, 3.0): 4}, name="count")
  132. tm.assert_series_equal(res1, exp1)
  133. res1n = s1.value_counts(bins=1, normalize=True)
  134. exp1n = Series({Interval(0.997, 3.0): 1.0}, name="proportion")
  135. tm.assert_series_equal(res1n, exp1n)
  136. if isinstance(s1, Index):
  137. tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
  138. else:
  139. exp = np.array([1, 2, 3], dtype=np.int64)
  140. tm.assert_numpy_array_equal(s1.unique(), exp)
  141. assert s1.nunique() == 3
  142. # these return the same
  143. res4 = s1.value_counts(bins=4, dropna=True)
  144. intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
  145. exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count")
  146. tm.assert_series_equal(res4, exp4)
  147. res4 = s1.value_counts(bins=4, dropna=False)
  148. intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
  149. exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 1, 3, 2]), name="count")
  150. tm.assert_series_equal(res4, exp4)
  151. res4n = s1.value_counts(bins=4, normalize=True)
  152. exp4n = Series(
  153. [0.5, 0.25, 0.25, 0], index=intervals.take([0, 1, 3, 2]), name="proportion"
  154. )
  155. tm.assert_series_equal(res4n, exp4n)
  156. # handle NA's properly
  157. s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
  158. s = klass(s_values)
  159. expected = Series([4, 3, 2], index=["b", "a", "d"], name="count")
  160. tm.assert_series_equal(s.value_counts(), expected)
  161. if isinstance(s, Index):
  162. exp = Index(["a", "b", np.nan, "d"])
  163. tm.assert_index_equal(s.unique(), exp)
  164. else:
  165. exp = np.array(["a", "b", np.nan, "d"], dtype=object)
  166. tm.assert_numpy_array_equal(s.unique(), exp)
  167. assert s.nunique() == 3
  168. s = klass({}) if klass is dict else klass({}, dtype=object)
  169. expected = Series([], dtype=np.int64, name="count")
  170. tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
  171. # returned dtype differs depending on original
  172. if isinstance(s, Index):
  173. tm.assert_index_equal(s.unique(), Index([]), exact=False)
  174. else:
  175. tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False)
  176. assert s.nunique() == 0
  177. def test_value_counts_datetime64(index_or_series):
  178. klass = index_or_series
  179. # GH 3002, datetime64[ns]
  180. # don't test names though
  181. df = pd.DataFrame(
  182. {
  183. "person_id": ["xxyyzz", "xxyyzz", "xxyyzz", "xxyyww", "foofoo", "foofoo"],
  184. "dt": pd.to_datetime(
  185. [
  186. "2010-01-01",
  187. "2010-01-01",
  188. "2010-01-01",
  189. "2009-01-01",
  190. "2008-09-09",
  191. "2008-09-09",
  192. ]
  193. ),
  194. "food": ["PIE", "GUM", "EGG", "EGG", "PIE", "GUM"],
  195. }
  196. )
  197. s = klass(df["dt"].copy())
  198. s.name = None
  199. idx = pd.to_datetime(
  200. ["2010-01-01 00:00:00", "2008-09-09 00:00:00", "2009-01-01 00:00:00"]
  201. )
  202. expected_s = Series([3, 2, 1], index=idx, name="count")
  203. tm.assert_series_equal(s.value_counts(), expected_s)
  204. expected = pd.array(
  205. np.array(
  206. ["2010-01-01 00:00:00", "2009-01-01 00:00:00", "2008-09-09 00:00:00"],
  207. dtype="datetime64[ns]",
  208. )
  209. )
  210. if isinstance(s, Index):
  211. tm.assert_index_equal(s.unique(), DatetimeIndex(expected))
  212. else:
  213. tm.assert_extension_array_equal(s.unique(), expected)
  214. assert s.nunique() == 3
  215. # with NaT
  216. s = df["dt"].copy()
  217. s = klass(list(s.values) + [pd.NaT] * 4)
  218. result = s.value_counts()
  219. assert result.index.dtype == "datetime64[ns]"
  220. tm.assert_series_equal(result, expected_s)
  221. result = s.value_counts(dropna=False)
  222. expected_s = pd.concat(
  223. [Series([4], index=DatetimeIndex([pd.NaT]), name="count"), expected_s]
  224. )
  225. tm.assert_series_equal(result, expected_s)
  226. assert s.dtype == "datetime64[ns]"
  227. unique = s.unique()
  228. assert unique.dtype == "datetime64[ns]"
  229. # numpy_array_equal cannot compare pd.NaT
  230. if isinstance(s, Index):
  231. exp_idx = DatetimeIndex(expected.tolist() + [pd.NaT])
  232. tm.assert_index_equal(unique, exp_idx)
  233. else:
  234. tm.assert_extension_array_equal(unique[:3], expected)
  235. assert pd.isna(unique[3])
  236. assert s.nunique() == 3
  237. assert s.nunique(dropna=False) == 4
  238. # timedelta64[ns]
  239. td = df.dt - df.dt + timedelta(1)
  240. td = klass(td, name="dt")
  241. result = td.value_counts()
  242. expected_s = Series([6], index=Index([Timedelta("1day")], name="dt"), name="count")
  243. tm.assert_series_equal(result, expected_s)
  244. expected = TimedeltaIndex(["1 days"], name="dt")
  245. if isinstance(td, Index):
  246. tm.assert_index_equal(td.unique(), expected)
  247. else:
  248. tm.assert_extension_array_equal(td.unique(), expected._values)
  249. td2 = timedelta(1) + (df.dt - df.dt)
  250. td2 = klass(td2, name="dt")
  251. result2 = td2.value_counts()
  252. tm.assert_series_equal(result2, expected_s)
  253. @pytest.mark.parametrize("dropna", [True, False])
  254. def test_value_counts_with_nan(dropna, index_or_series):
  255. # GH31944
  256. klass = index_or_series
  257. values = [True, pd.NA, np.nan]
  258. obj = klass(values)
  259. res = obj.value_counts(dropna=dropna)
  260. if dropna is True:
  261. expected = Series([1], index=Index([True], dtype=obj.dtype), name="count")
  262. else:
  263. expected = Series([1, 1, 1], index=[True, pd.NA, np.nan], name="count")
  264. tm.assert_series_equal(res, expected)