# test_nunique.py (6.0 KB)
import datetime as dt
from string import ascii_lowercase

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    MultiIndex,
    NaT,
    Series,
    Timestamp,
    date_range,
)
import pandas._testing as tm
  15. @pytest.mark.slow
  16. @pytest.mark.parametrize("n", 10 ** np.arange(2, 6))
  17. @pytest.mark.parametrize("m", [10, 100, 1000])
  18. @pytest.mark.parametrize("sort", [False, True])
  19. @pytest.mark.parametrize("dropna", [False, True])
  20. def test_series_groupby_nunique(n, m, sort, dropna):
  21. def check_nunique(df, keys, as_index=True):
  22. original_df = df.copy()
  23. gr = df.groupby(keys, as_index=as_index, sort=sort)
  24. left = gr["julie"].nunique(dropna=dropna)
  25. gr = df.groupby(keys, as_index=as_index, sort=sort)
  26. right = gr["julie"].apply(Series.nunique, dropna=dropna)
  27. if not as_index:
  28. right = right.reset_index(drop=True)
  29. if as_index:
  30. tm.assert_series_equal(left, right, check_names=False)
  31. else:
  32. tm.assert_frame_equal(left, right, check_names=False)
  33. tm.assert_frame_equal(df, original_df)
  34. days = date_range("2015-08-23", periods=10)
  35. frame = DataFrame(
  36. {
  37. "jim": np.random.choice(list(ascii_lowercase), n),
  38. "joe": np.random.choice(days, n),
  39. "julie": np.random.randint(0, m, n),
  40. }
  41. )
  42. check_nunique(frame, ["jim"])
  43. check_nunique(frame, ["jim", "joe"])
  44. frame.loc[1::17, "jim"] = None
  45. frame.loc[3::37, "joe"] = None
  46. frame.loc[7::19, "julie"] = None
  47. frame.loc[8::19, "julie"] = None
  48. frame.loc[9::19, "julie"] = None
  49. check_nunique(frame, ["jim"])
  50. check_nunique(frame, ["jim", "joe"])
  51. check_nunique(frame, ["jim"], as_index=False)
  52. check_nunique(frame, ["jim", "joe"], as_index=False)
  53. def test_nunique():
  54. df = DataFrame({"A": list("abbacc"), "B": list("abxacc"), "C": list("abbacx")})
  55. expected = DataFrame({"A": list("abc"), "B": [1, 2, 1], "C": [1, 1, 2]})
  56. result = df.groupby("A", as_index=False).nunique()
  57. tm.assert_frame_equal(result, expected)
  58. # as_index
  59. expected.index = list("abc")
  60. expected.index.name = "A"
  61. expected = expected.drop(columns="A")
  62. result = df.groupby("A").nunique()
  63. tm.assert_frame_equal(result, expected)
  64. # with na
  65. result = df.replace({"x": None}).groupby("A").nunique(dropna=False)
  66. tm.assert_frame_equal(result, expected)
  67. # dropna
  68. expected = DataFrame({"B": [1] * 3, "C": [1] * 3}, index=list("abc"))
  69. expected.index.name = "A"
  70. result = df.replace({"x": None}).groupby("A").nunique()
  71. tm.assert_frame_equal(result, expected)
  72. def test_nunique_with_object():
  73. # GH 11077
  74. data = DataFrame(
  75. [
  76. [100, 1, "Alice"],
  77. [200, 2, "Bob"],
  78. [300, 3, "Charlie"],
  79. [-400, 4, "Dan"],
  80. [500, 5, "Edith"],
  81. ],
  82. columns=["amount", "id", "name"],
  83. )
  84. result = data.groupby(["id", "amount"])["name"].nunique()
  85. index = MultiIndex.from_arrays([data.id, data.amount])
  86. expected = Series([1] * 5, name="name", index=index)
  87. tm.assert_series_equal(result, expected)
  88. def test_nunique_with_empty_series():
  89. # GH 12553
  90. data = Series(name="name", dtype=object)
  91. result = data.groupby(level=0).nunique()
  92. expected = Series(name="name", dtype="int64")
  93. tm.assert_series_equal(result, expected)
  94. def test_nunique_with_timegrouper():
  95. # GH 13453
  96. test = DataFrame(
  97. {
  98. "time": [
  99. Timestamp("2016-06-28 09:35:35"),
  100. Timestamp("2016-06-28 16:09:30"),
  101. Timestamp("2016-06-28 16:46:28"),
  102. ],
  103. "data": ["1", "2", "3"],
  104. }
  105. ).set_index("time")
  106. result = test.groupby(pd.Grouper(freq="h"))["data"].nunique()
  107. expected = test.groupby(pd.Grouper(freq="h"))["data"].apply(Series.nunique)
  108. tm.assert_series_equal(result, expected)
  109. @pytest.mark.parametrize(
  110. "key, data, dropna, expected",
  111. [
  112. (
  113. ["x", "x", "x"],
  114. [Timestamp("2019-01-01"), NaT, Timestamp("2019-01-01")],
  115. True,
  116. Series([1], index=pd.Index(["x"], name="key"), name="data"),
  117. ),
  118. (
  119. ["x", "x", "x"],
  120. [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
  121. True,
  122. Series([1], index=pd.Index(["x"], name="key"), name="data"),
  123. ),
  124. (
  125. ["x", "x", "x", "y", "y"],
  126. [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
  127. False,
  128. Series([2, 2], index=pd.Index(["x", "y"], name="key"), name="data"),
  129. ),
  130. (
  131. ["x", "x", "x", "x", "y"],
  132. [dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1), NaT, dt.date(2019, 1, 1)],
  133. False,
  134. Series([2, 1], index=pd.Index(["x", "y"], name="key"), name="data"),
  135. ),
  136. ],
  137. )
  138. def test_nunique_with_NaT(key, data, dropna, expected):
  139. # GH 27951
  140. df = DataFrame({"key": key, "data": data})
  141. result = df.groupby(["key"])["data"].nunique(dropna=dropna)
  142. tm.assert_series_equal(result, expected)
  143. def test_nunique_preserves_column_level_names():
  144. # GH 23222
  145. test = DataFrame([1, 2, 2], columns=pd.Index(["A"], name="level_0"))
  146. result = test.groupby([0, 0, 0]).nunique()
  147. expected = DataFrame([2], index=np.array([0]), columns=test.columns)
  148. tm.assert_frame_equal(result, expected)
  149. def test_nunique_transform_with_datetime():
  150. # GH 35109 - transform with nunique on datetimes results in integers
  151. df = DataFrame(date_range("2008-12-31", "2009-01-02"), columns=["date"])
  152. result = df.groupby([0, 0, 1])["date"].transform("nunique")
  153. expected = Series([2, 2, 1], name="date")
  154. tm.assert_series_equal(result, expected)
  155. def test_empty_categorical(observed):
  156. # GH#21334
  157. cat = Series([1]).astype("category")
  158. ser = cat[:0]
  159. gb = ser.groupby(ser, observed=observed)
  160. result = gb.nunique()
  161. if observed:
  162. expected = Series([], index=cat[:0], dtype="int64")
  163. else:
  164. expected = Series([0], index=cat, dtype="int64")
  165. tm.assert_series_equal(result, expected)