test_counting.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. from itertools import product
  2. from string import ascii_lowercase
  3. import numpy as np
  4. import pytest
  5. from pandas import (
  6. DataFrame,
  7. Index,
  8. MultiIndex,
  9. Period,
  10. Series,
  11. Timedelta,
  12. Timestamp,
  13. date_range,
  14. )
  15. import pandas._testing as tm
  16. class TestCounting:
  17. def test_cumcount(self):
  18. df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"])
  19. g = df.groupby("A")
  20. sg = g.A
  21. expected = Series([0, 1, 2, 0, 3])
  22. tm.assert_series_equal(expected, g.cumcount())
  23. tm.assert_series_equal(expected, sg.cumcount())
  24. def test_cumcount_empty(self):
  25. ge = DataFrame().groupby(level=0)
  26. se = Series(dtype=object).groupby(level=0)
  27. # edge case, as this is usually considered float
  28. e = Series(dtype="int64")
  29. tm.assert_series_equal(e, ge.cumcount())
  30. tm.assert_series_equal(e, se.cumcount())
  31. def test_cumcount_dupe_index(self):
  32. df = DataFrame(
  33. [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
  34. )
  35. g = df.groupby("A")
  36. sg = g.A
  37. expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
  38. tm.assert_series_equal(expected, g.cumcount())
  39. tm.assert_series_equal(expected, sg.cumcount())
  40. def test_cumcount_mi(self):
  41. mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
  42. df = DataFrame([["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=mi)
  43. g = df.groupby("A")
  44. sg = g.A
  45. expected = Series([0, 1, 2, 0, 3], index=mi)
  46. tm.assert_series_equal(expected, g.cumcount())
  47. tm.assert_series_equal(expected, sg.cumcount())
  48. def test_cumcount_groupby_not_col(self):
  49. df = DataFrame(
  50. [["a"], ["a"], ["a"], ["b"], ["a"]], columns=["A"], index=[0] * 5
  51. )
  52. g = df.groupby([0, 0, 0, 1, 0])
  53. sg = g.A
  54. expected = Series([0, 1, 2, 0, 3], index=[0] * 5)
  55. tm.assert_series_equal(expected, g.cumcount())
  56. tm.assert_series_equal(expected, sg.cumcount())
  57. def test_ngroup(self):
  58. df = DataFrame({"A": list("aaaba")})
  59. g = df.groupby("A")
  60. sg = g.A
  61. expected = Series([0, 0, 0, 1, 0])
  62. tm.assert_series_equal(expected, g.ngroup())
  63. tm.assert_series_equal(expected, sg.ngroup())
  64. def test_ngroup_distinct(self):
  65. df = DataFrame({"A": list("abcde")})
  66. g = df.groupby("A")
  67. sg = g.A
  68. expected = Series(range(5), dtype="int64")
  69. tm.assert_series_equal(expected, g.ngroup())
  70. tm.assert_series_equal(expected, sg.ngroup())
  71. def test_ngroup_one_group(self):
  72. df = DataFrame({"A": [0] * 5})
  73. g = df.groupby("A")
  74. sg = g.A
  75. expected = Series([0] * 5)
  76. tm.assert_series_equal(expected, g.ngroup())
  77. tm.assert_series_equal(expected, sg.ngroup())
  78. def test_ngroup_empty(self):
  79. ge = DataFrame().groupby(level=0)
  80. se = Series(dtype=object).groupby(level=0)
  81. # edge case, as this is usually considered float
  82. e = Series(dtype="int64")
  83. tm.assert_series_equal(e, ge.ngroup())
  84. tm.assert_series_equal(e, se.ngroup())
  85. def test_ngroup_series_matches_frame(self):
  86. df = DataFrame({"A": list("aaaba")})
  87. s = Series(list("aaaba"))
  88. tm.assert_series_equal(df.groupby(s).ngroup(), s.groupby(s).ngroup())
  89. def test_ngroup_dupe_index(self):
  90. df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
  91. g = df.groupby("A")
  92. sg = g.A
  93. expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
  94. tm.assert_series_equal(expected, g.ngroup())
  95. tm.assert_series_equal(expected, sg.ngroup())
  96. def test_ngroup_mi(self):
  97. mi = MultiIndex.from_tuples([[0, 1], [1, 2], [2, 2], [2, 2], [1, 0]])
  98. df = DataFrame({"A": list("aaaba")}, index=mi)
  99. g = df.groupby("A")
  100. sg = g.A
  101. expected = Series([0, 0, 0, 1, 0], index=mi)
  102. tm.assert_series_equal(expected, g.ngroup())
  103. tm.assert_series_equal(expected, sg.ngroup())
  104. def test_ngroup_groupby_not_col(self):
  105. df = DataFrame({"A": list("aaaba")}, index=[0] * 5)
  106. g = df.groupby([0, 0, 0, 1, 0])
  107. sg = g.A
  108. expected = Series([0, 0, 0, 1, 0], index=[0] * 5)
  109. tm.assert_series_equal(expected, g.ngroup())
  110. tm.assert_series_equal(expected, sg.ngroup())
  111. def test_ngroup_descending(self):
  112. df = DataFrame(["a", "a", "b", "a", "b"], columns=["A"])
  113. g = df.groupby(["A"])
  114. ascending = Series([0, 0, 1, 0, 1])
  115. descending = Series([1, 1, 0, 1, 0])
  116. tm.assert_series_equal(descending, (g.ngroups - 1) - ascending)
  117. tm.assert_series_equal(ascending, g.ngroup(ascending=True))
  118. tm.assert_series_equal(descending, g.ngroup(ascending=False))
  119. def test_ngroup_matches_cumcount(self):
  120. # verify one manually-worked out case works
  121. df = DataFrame(
  122. [["a", "x"], ["a", "y"], ["b", "x"], ["a", "x"], ["b", "y"]],
  123. columns=["A", "X"],
  124. )
  125. g = df.groupby(["A", "X"])
  126. g_ngroup = g.ngroup()
  127. g_cumcount = g.cumcount()
  128. expected_ngroup = Series([0, 1, 2, 0, 3])
  129. expected_cumcount = Series([0, 0, 0, 1, 0])
  130. tm.assert_series_equal(g_ngroup, expected_ngroup)
  131. tm.assert_series_equal(g_cumcount, expected_cumcount)
  132. def test_ngroup_cumcount_pair(self):
  133. # brute force comparison for all small series
  134. for p in product(range(3), repeat=4):
  135. df = DataFrame({"a": p})
  136. g = df.groupby(["a"])
  137. order = sorted(set(p))
  138. ngroupd = [order.index(val) for val in p]
  139. cumcounted = [p[:i].count(val) for i, val in enumerate(p)]
  140. tm.assert_series_equal(g.ngroup(), Series(ngroupd))
  141. tm.assert_series_equal(g.cumcount(), Series(cumcounted))
  142. def test_ngroup_respects_groupby_order(self, sort):
  143. np.random.seed(0)
  144. df = DataFrame({"a": np.random.choice(list("abcdef"), 100)})
  145. g = df.groupby("a", sort=sort)
  146. df["group_id"] = -1
  147. df["group_index"] = -1
  148. for i, (_, group) in enumerate(g):
  149. df.loc[group.index, "group_id"] = i
  150. for j, ind in enumerate(group.index):
  151. df.loc[ind, "group_index"] = j
  152. tm.assert_series_equal(Series(df["group_id"].values), g.ngroup())
  153. tm.assert_series_equal(Series(df["group_index"].values), g.cumcount())
  154. @pytest.mark.parametrize(
  155. "datetimelike",
  156. [
  157. [Timestamp(f"2016-05-{i:02d} 20:09:25+00:00") for i in range(1, 4)],
  158. [Timestamp(f"2016-05-{i:02d} 20:09:25") for i in range(1, 4)],
  159. [Timestamp(f"2016-05-{i:02d} 20:09:25", tz="UTC") for i in range(1, 4)],
  160. [Timedelta(x, unit="h") for x in range(1, 4)],
  161. [Period(freq="2W", year=2017, month=x) for x in range(1, 4)],
  162. ],
  163. )
  164. def test_count_with_datetimelike(self, datetimelike):
  165. # test for #13393, where DataframeGroupBy.count() fails
  166. # when counting a datetimelike column.
  167. df = DataFrame({"x": ["a", "a", "b"], "y": datetimelike})
  168. res = df.groupby("x").count()
  169. expected = DataFrame({"y": [2, 1]}, index=["a", "b"])
  170. expected.index.name = "x"
  171. tm.assert_frame_equal(expected, res)
  172. def test_count_with_only_nans_in_first_group(self):
  173. # GH21956
  174. df = DataFrame({"A": [np.nan, np.nan], "B": ["a", "b"], "C": [1, 2]})
  175. result = df.groupby(["A", "B"]).C.count()
  176. mi = MultiIndex(levels=[[], ["a", "b"]], codes=[[], []], names=["A", "B"])
  177. expected = Series([], index=mi, dtype=np.int64, name="C")
  178. tm.assert_series_equal(result, expected, check_index_type=False)
  179. def test_count_groupby_column_with_nan_in_groupby_column(self):
  180. # https://github.com/pandas-dev/pandas/issues/32841
  181. df = DataFrame({"A": [1, 1, 1, 1, 1], "B": [5, 4, np.NaN, 3, 0]})
  182. res = df.groupby(["B"]).count()
  183. expected = DataFrame(
  184. index=Index([0.0, 3.0, 4.0, 5.0], name="B"), data={"A": [1, 1, 1, 1]}
  185. )
  186. tm.assert_frame_equal(expected, res)
  187. def test_groupby_count_dateparseerror(self):
  188. dr = date_range(start="1/1/2012", freq="5min", periods=10)
  189. # BAD Example, datetimes first
  190. ser = Series(np.arange(10), index=[dr, np.arange(10)])
  191. grouped = ser.groupby(lambda x: x[1] % 2 == 0)
  192. result = grouped.count()
  193. ser = Series(np.arange(10), index=[np.arange(10), dr])
  194. grouped = ser.groupby(lambda x: x[0] % 2 == 0)
  195. expected = grouped.count()
  196. tm.assert_series_equal(result, expected)
  197. def test_groupby_timedelta_cython_count():
  198. df = DataFrame(
  199. {"g": list("ab" * 2), "delt": np.arange(4).astype("timedelta64[ns]")}
  200. )
  201. expected = Series([2, 2], index=Index(["a", "b"], name="g"), name="delt")
  202. result = df.groupby("g").delt.count()
  203. tm.assert_series_equal(expected, result)
  204. def test_count():
  205. n = 1 << 15
  206. dr = date_range("2015-08-30", periods=n // 10, freq="T")
  207. df = DataFrame(
  208. {
  209. "1st": np.random.choice(list(ascii_lowercase), n),
  210. "2nd": np.random.randint(0, 5, n),
  211. "3rd": np.random.randn(n).round(3),
  212. "4th": np.random.randint(-10, 10, n),
  213. "5th": np.random.choice(dr, n),
  214. "6th": np.random.randn(n).round(3),
  215. "7th": np.random.randn(n).round(3),
  216. "8th": np.random.choice(dr, n) - np.random.choice(dr, 1),
  217. "9th": np.random.choice(list(ascii_lowercase), n),
  218. }
  219. )
  220. for col in df.columns.drop(["1st", "2nd", "4th"]):
  221. df.loc[np.random.choice(n, n // 10), col] = np.nan
  222. df["9th"] = df["9th"].astype("category")
  223. for key in ["1st", "2nd", ["1st", "2nd"]]:
  224. left = df.groupby(key).count()
  225. right = df.groupby(key).apply(DataFrame.count).drop(key, axis=1)
  226. tm.assert_frame_equal(left, right)
  227. def test_count_non_nulls():
  228. # GH#5610
  229. # count counts non-nulls
  230. df = DataFrame(
  231. [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, np.nan]],
  232. columns=["A", "B", "C"],
  233. )
  234. count_as = df.groupby("A").count()
  235. count_not_as = df.groupby("A", as_index=False).count()
  236. expected = DataFrame([[1, 2], [0, 0]], columns=["B", "C"], index=[1, 3])
  237. expected.index.name = "A"
  238. tm.assert_frame_equal(count_not_as, expected.reset_index())
  239. tm.assert_frame_equal(count_as, expected)
  240. count_B = df.groupby("A")["B"].count()
  241. tm.assert_series_equal(count_B, expected["B"])
  242. def test_count_object():
  243. df = DataFrame({"a": ["a"] * 3 + ["b"] * 3, "c": [2] * 3 + [3] * 3})
  244. result = df.groupby("c").a.count()
  245. expected = Series([3, 3], index=Index([2, 3], name="c"), name="a")
  246. tm.assert_series_equal(result, expected)
  247. df = DataFrame({"a": ["a", np.nan, np.nan] + ["b"] * 3, "c": [2] * 3 + [3] * 3})
  248. result = df.groupby("c").a.count()
  249. expected = Series([1, 3], index=Index([2, 3], name="c"), name="a")
  250. tm.assert_series_equal(result, expected)
  251. def test_count_cross_type():
  252. # GH8169
  253. vals = np.hstack(
  254. (np.random.randint(0, 5, (100, 2)), np.random.randint(0, 2, (100, 2)))
  255. )
  256. df = DataFrame(vals, columns=["a", "b", "c", "d"])
  257. df[df == 2] = np.nan
  258. expected = df.groupby(["c", "d"]).count()
  259. for t in ["float32", "object"]:
  260. df["a"] = df["a"].astype(t)
  261. df["b"] = df["b"].astype(t)
  262. result = df.groupby(["c", "d"]).count()
  263. tm.assert_frame_equal(result, expected)
  264. def test_lower_int_prec_count():
  265. df = DataFrame(
  266. {
  267. "a": np.array([0, 1, 2, 100], np.int8),
  268. "b": np.array([1, 2, 3, 6], np.uint32),
  269. "c": np.array([4, 5, 6, 8], np.int16),
  270. "grp": list("ab" * 2),
  271. }
  272. )
  273. result = df.groupby("grp").count()
  274. expected = DataFrame(
  275. {"a": [2, 2], "b": [2, 2], "c": [2, 2]}, index=Index(list("ab"), name="grp")
  276. )
  277. tm.assert_frame_equal(result, expected)
  278. def test_count_uses_size_on_exception():
  279. class RaisingObjectException(Exception):
  280. pass
  281. class RaisingObject:
  282. def __init__(self, msg="I will raise inside Cython") -> None:
  283. super().__init__()
  284. self.msg = msg
  285. def __eq__(self, other):
  286. # gets called in Cython to check that raising calls the method
  287. raise RaisingObjectException(self.msg)
  288. df = DataFrame({"a": [RaisingObject() for _ in range(4)], "grp": list("ab" * 2)})
  289. result = df.groupby("grp").count()
  290. expected = DataFrame({"a": [2, 2]}, index=Index(list("ab"), name="grp"))
  291. tm.assert_frame_equal(result, expected)