test_value_counts.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. Categorical,
  6. CategoricalIndex,
  7. Index,
  8. Series,
  9. )
  10. import pandas._testing as tm
  11. class TestSeriesValueCounts:
  12. def test_value_counts_datetime(self):
  13. # most dtypes are tested in tests/base
  14. values = [
  15. pd.Timestamp("2011-01-01 09:00"),
  16. pd.Timestamp("2011-01-01 10:00"),
  17. pd.Timestamp("2011-01-01 11:00"),
  18. pd.Timestamp("2011-01-01 09:00"),
  19. pd.Timestamp("2011-01-01 09:00"),
  20. pd.Timestamp("2011-01-01 11:00"),
  21. ]
  22. exp_idx = pd.DatetimeIndex(
  23. ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"],
  24. name="xxx",
  25. )
  26. exp = Series([3, 2, 1], index=exp_idx, name="count")
  27. ser = Series(values, name="xxx")
  28. tm.assert_series_equal(ser.value_counts(), exp)
  29. # check DatetimeIndex outputs the same result
  30. idx = pd.DatetimeIndex(values, name="xxx")
  31. tm.assert_series_equal(idx.value_counts(), exp)
  32. # normalize
  33. exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion")
  34. tm.assert_series_equal(ser.value_counts(normalize=True), exp)
  35. tm.assert_series_equal(idx.value_counts(normalize=True), exp)
  36. def test_value_counts_datetime_tz(self):
  37. values = [
  38. pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"),
  39. pd.Timestamp("2011-01-01 10:00", tz="US/Eastern"),
  40. pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"),
  41. pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"),
  42. pd.Timestamp("2011-01-01 09:00", tz="US/Eastern"),
  43. pd.Timestamp("2011-01-01 11:00", tz="US/Eastern"),
  44. ]
  45. exp_idx = pd.DatetimeIndex(
  46. ["2011-01-01 09:00", "2011-01-01 11:00", "2011-01-01 10:00"],
  47. tz="US/Eastern",
  48. name="xxx",
  49. )
  50. exp = Series([3, 2, 1], index=exp_idx, name="count")
  51. ser = Series(values, name="xxx")
  52. tm.assert_series_equal(ser.value_counts(), exp)
  53. idx = pd.DatetimeIndex(values, name="xxx")
  54. tm.assert_series_equal(idx.value_counts(), exp)
  55. exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion")
  56. tm.assert_series_equal(ser.value_counts(normalize=True), exp)
  57. tm.assert_series_equal(idx.value_counts(normalize=True), exp)
  58. def test_value_counts_period(self):
  59. values = [
  60. pd.Period("2011-01", freq="M"),
  61. pd.Period("2011-02", freq="M"),
  62. pd.Period("2011-03", freq="M"),
  63. pd.Period("2011-01", freq="M"),
  64. pd.Period("2011-01", freq="M"),
  65. pd.Period("2011-03", freq="M"),
  66. ]
  67. exp_idx = pd.PeriodIndex(
  68. ["2011-01", "2011-03", "2011-02"], freq="M", name="xxx"
  69. )
  70. exp = Series([3, 2, 1], index=exp_idx, name="count")
  71. ser = Series(values, name="xxx")
  72. tm.assert_series_equal(ser.value_counts(), exp)
  73. # check DatetimeIndex outputs the same result
  74. idx = pd.PeriodIndex(values, name="xxx")
  75. tm.assert_series_equal(idx.value_counts(), exp)
  76. # normalize
  77. exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion")
  78. tm.assert_series_equal(ser.value_counts(normalize=True), exp)
  79. tm.assert_series_equal(idx.value_counts(normalize=True), exp)
  80. def test_value_counts_categorical_ordered(self):
  81. # most dtypes are tested in tests/base
  82. values = Categorical([1, 2, 3, 1, 1, 3], ordered=True)
  83. exp_idx = CategoricalIndex(
  84. [1, 3, 2], categories=[1, 2, 3], ordered=True, name="xxx"
  85. )
  86. exp = Series([3, 2, 1], index=exp_idx, name="count")
  87. ser = Series(values, name="xxx")
  88. tm.assert_series_equal(ser.value_counts(), exp)
  89. # check CategoricalIndex outputs the same result
  90. idx = CategoricalIndex(values, name="xxx")
  91. tm.assert_series_equal(idx.value_counts(), exp)
  92. # normalize
  93. exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion")
  94. tm.assert_series_equal(ser.value_counts(normalize=True), exp)
  95. tm.assert_series_equal(idx.value_counts(normalize=True), exp)
  96. def test_value_counts_categorical_not_ordered(self):
  97. values = Categorical([1, 2, 3, 1, 1, 3], ordered=False)
  98. exp_idx = CategoricalIndex(
  99. [1, 3, 2], categories=[1, 2, 3], ordered=False, name="xxx"
  100. )
  101. exp = Series([3, 2, 1], index=exp_idx, name="count")
  102. ser = Series(values, name="xxx")
  103. tm.assert_series_equal(ser.value_counts(), exp)
  104. # check CategoricalIndex outputs the same result
  105. idx = CategoricalIndex(values, name="xxx")
  106. tm.assert_series_equal(idx.value_counts(), exp)
  107. # normalize
  108. exp = Series(np.array([3.0, 2.0, 1]) / 6.0, index=exp_idx, name="proportion")
  109. tm.assert_series_equal(ser.value_counts(normalize=True), exp)
  110. tm.assert_series_equal(idx.value_counts(normalize=True), exp)
  111. def test_value_counts_categorical(self):
  112. # GH#12835
  113. cats = Categorical(list("abcccb"), categories=list("cabd"))
  114. ser = Series(cats, name="xxx")
  115. res = ser.value_counts(sort=False)
  116. exp_index = CategoricalIndex(
  117. list("cabd"), categories=cats.categories, name="xxx"
  118. )
  119. exp = Series([3, 1, 2, 0], name="count", index=exp_index)
  120. tm.assert_series_equal(res, exp)
  121. res = ser.value_counts(sort=True)
  122. exp_index = CategoricalIndex(
  123. list("cbad"), categories=cats.categories, name="xxx"
  124. )
  125. exp = Series([3, 2, 1, 0], name="count", index=exp_index)
  126. tm.assert_series_equal(res, exp)
  127. # check object dtype handles the Series.name as the same
  128. # (tested in tests/base)
  129. ser = Series(["a", "b", "c", "c", "c", "b"], name="xxx")
  130. res = ser.value_counts()
  131. exp = Series([3, 2, 1], name="count", index=Index(["c", "b", "a"], name="xxx"))
  132. tm.assert_series_equal(res, exp)
  133. def test_value_counts_categorical_with_nan(self):
  134. # see GH#9443
  135. # sanity check
  136. ser = Series(["a", "b", "a"], dtype="category")
  137. exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), name="count")
  138. res = ser.value_counts(dropna=True)
  139. tm.assert_series_equal(res, exp)
  140. res = ser.value_counts(dropna=True)
  141. tm.assert_series_equal(res, exp)
  142. # same Series via two different constructions --> same behaviour
  143. series = [
  144. Series(["a", "b", None, "a", None, None], dtype="category"),
  145. Series(
  146. Categorical(["a", "b", None, "a", None, None], categories=["a", "b"])
  147. ),
  148. ]
  149. for ser in series:
  150. # None is a NaN value, so we exclude its count here
  151. exp = Series([2, 1], index=CategoricalIndex(["a", "b"]), name="count")
  152. res = ser.value_counts(dropna=True)
  153. tm.assert_series_equal(res, exp)
  154. # we don't exclude the count of None and sort by counts
  155. exp = Series(
  156. [3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]), name="count"
  157. )
  158. res = ser.value_counts(dropna=False)
  159. tm.assert_series_equal(res, exp)
  160. # When we aren't sorting by counts, and np.nan isn't a
  161. # category, it should be last.
  162. exp = Series(
  163. [2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]), name="count"
  164. )
  165. res = ser.value_counts(dropna=False, sort=False)
  166. tm.assert_series_equal(res, exp)
  167. @pytest.mark.parametrize(
  168. "ser, dropna, exp",
  169. [
  170. (
  171. Series([False, True, True, pd.NA]),
  172. False,
  173. Series([2, 1, 1], index=[True, False, pd.NA], name="count"),
  174. ),
  175. (
  176. Series([False, True, True, pd.NA]),
  177. True,
  178. Series([2, 1], index=Index([True, False], dtype=object), name="count"),
  179. ),
  180. (
  181. Series(range(3), index=[True, False, np.nan]).index,
  182. False,
  183. Series([1, 1, 1], index=[True, False, np.nan], name="count"),
  184. ),
  185. ],
  186. )
  187. def test_value_counts_bool_with_nan(self, ser, dropna, exp):
  188. # GH32146
  189. out = ser.value_counts(dropna=dropna)
  190. tm.assert_series_equal(out, exp)
  191. @pytest.mark.parametrize(
  192. "input_array,expected",
  193. [
  194. (
  195. [1 + 1j, 1 + 1j, 1, 3j, 3j, 3j],
  196. Series(
  197. [3, 2, 1],
  198. index=Index([3j, 1 + 1j, 1], dtype=np.complex128),
  199. name="count",
  200. ),
  201. ),
  202. (
  203. np.array([1 + 1j, 1 + 1j, 1, 3j, 3j, 3j], dtype=np.complex64),
  204. Series(
  205. [3, 2, 1],
  206. index=Index([3j, 1 + 1j, 1], dtype=np.complex64),
  207. name="count",
  208. ),
  209. ),
  210. ],
  211. )
  212. def test_value_counts_complex_numbers(self, input_array, expected):
  213. # GH 17927
  214. result = Series(input_array).value_counts()
  215. tm.assert_series_equal(result, expected)