# test_value_counts.py (4.6 KB)

import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm
  5. def test_data_frame_value_counts_unsorted():
  6. df = pd.DataFrame(
  7. {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
  8. index=["falcon", "dog", "cat", "ant"],
  9. )
  10. result = df.value_counts(sort=False)
  11. expected = pd.Series(
  12. data=[1, 2, 1],
  13. index=pd.MultiIndex.from_arrays(
  14. [(2, 4, 6), (2, 0, 0)], names=["num_legs", "num_wings"]
  15. ),
  16. name="count",
  17. )
  18. tm.assert_series_equal(result, expected)
  19. def test_data_frame_value_counts_ascending():
  20. df = pd.DataFrame(
  21. {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
  22. index=["falcon", "dog", "cat", "ant"],
  23. )
  24. result = df.value_counts(ascending=True)
  25. expected = pd.Series(
  26. data=[1, 1, 2],
  27. index=pd.MultiIndex.from_arrays(
  28. [(2, 6, 4), (2, 0, 0)], names=["num_legs", "num_wings"]
  29. ),
  30. name="count",
  31. )
  32. tm.assert_series_equal(result, expected)
  33. def test_data_frame_value_counts_default():
  34. df = pd.DataFrame(
  35. {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
  36. index=["falcon", "dog", "cat", "ant"],
  37. )
  38. result = df.value_counts()
  39. expected = pd.Series(
  40. data=[2, 1, 1],
  41. index=pd.MultiIndex.from_arrays(
  42. [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
  43. ),
  44. name="count",
  45. )
  46. tm.assert_series_equal(result, expected)
  47. def test_data_frame_value_counts_normalize():
  48. df = pd.DataFrame(
  49. {"num_legs": [2, 4, 4, 6], "num_wings": [2, 0, 0, 0]},
  50. index=["falcon", "dog", "cat", "ant"],
  51. )
  52. result = df.value_counts(normalize=True)
  53. expected = pd.Series(
  54. data=[0.5, 0.25, 0.25],
  55. index=pd.MultiIndex.from_arrays(
  56. [(4, 2, 6), (0, 2, 0)], names=["num_legs", "num_wings"]
  57. ),
  58. name="proportion",
  59. )
  60. tm.assert_series_equal(result, expected)
  61. def test_data_frame_value_counts_single_col_default():
  62. df = pd.DataFrame({"num_legs": [2, 4, 4, 6]})
  63. result = df.value_counts()
  64. expected = pd.Series(
  65. data=[2, 1, 1],
  66. index=pd.MultiIndex.from_arrays([[4, 2, 6]], names=["num_legs"]),
  67. name="count",
  68. )
  69. tm.assert_series_equal(result, expected)
  70. def test_data_frame_value_counts_empty():
  71. df_no_cols = pd.DataFrame()
  72. result = df_no_cols.value_counts()
  73. expected = pd.Series(
  74. [], dtype=np.int64, name="count", index=np.array([], dtype=np.intp)
  75. )
  76. tm.assert_series_equal(result, expected)
  77. def test_data_frame_value_counts_empty_normalize():
  78. df_no_cols = pd.DataFrame()
  79. result = df_no_cols.value_counts(normalize=True)
  80. expected = pd.Series(
  81. [], dtype=np.float64, name="proportion", index=np.array([], dtype=np.intp)
  82. )
  83. tm.assert_series_equal(result, expected)
  84. def test_data_frame_value_counts_dropna_true(nulls_fixture):
  85. # GH 41334
  86. df = pd.DataFrame(
  87. {
  88. "first_name": ["John", "Anne", "John", "Beth"],
  89. "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
  90. },
  91. )
  92. result = df.value_counts()
  93. expected = pd.Series(
  94. data=[1, 1],
  95. index=pd.MultiIndex.from_arrays(
  96. [("Beth", "John"), ("Louise", "Smith")], names=["first_name", "middle_name"]
  97. ),
  98. name="count",
  99. )
  100. tm.assert_series_equal(result, expected)
  101. def test_data_frame_value_counts_dropna_false(nulls_fixture):
  102. # GH 41334
  103. df = pd.DataFrame(
  104. {
  105. "first_name": ["John", "Anne", "John", "Beth"],
  106. "middle_name": ["Smith", nulls_fixture, nulls_fixture, "Louise"],
  107. },
  108. )
  109. result = df.value_counts(dropna=False)
  110. expected = pd.Series(
  111. data=[1, 1, 1, 1],
  112. index=pd.MultiIndex(
  113. levels=[
  114. pd.Index(["Anne", "Beth", "John"]),
  115. pd.Index(["Louise", "Smith", nulls_fixture]),
  116. ],
  117. codes=[[0, 1, 2, 2], [2, 0, 1, 2]],
  118. names=["first_name", "middle_name"],
  119. ),
  120. name="count",
  121. )
  122. tm.assert_series_equal(result, expected)
  123. @pytest.mark.parametrize("columns", (["first_name", "middle_name"], [0, 1]))
  124. def test_data_frame_value_counts_subset(nulls_fixture, columns):
  125. # GH 50829
  126. df = pd.DataFrame(
  127. {
  128. columns[0]: ["John", "Anne", "John", "Beth"],
  129. columns[1]: ["Smith", nulls_fixture, nulls_fixture, "Louise"],
  130. },
  131. )
  132. result = df.value_counts(columns[0])
  133. expected = pd.Series(
  134. data=[2, 1, 1],
  135. index=pd.Index(["John", "Anne", "Beth"], name=columns[0]),
  136. name="count",
  137. )
  138. tm.assert_series_equal(result, expected)