groupby.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147
  1. import pytest
  2. from pandas.core.dtypes.common import (
  3. is_bool_dtype,
  4. is_numeric_dtype,
  5. is_object_dtype,
  6. is_string_dtype,
  7. )
  8. import pandas as pd
  9. import pandas._testing as tm
  10. from pandas.tests.extension.base.base import BaseExtensionTests
  11. class BaseGroupbyTests(BaseExtensionTests):
  12. """Groupby-specific tests."""
  13. def test_grouping_grouper(self, data_for_grouping):
  14. df = pd.DataFrame(
  15. {"A": ["B", "B", None, None, "A", "A", "B", "C"], "B": data_for_grouping}
  16. )
  17. gr1 = df.groupby("A").grouper.groupings[0]
  18. gr2 = df.groupby("B").grouper.groupings[0]
  19. tm.assert_numpy_array_equal(gr1.grouping_vector, df.A.values)
  20. tm.assert_extension_array_equal(gr2.grouping_vector, data_for_grouping)
  21. @pytest.mark.parametrize("as_index", [True, False])
  22. def test_groupby_extension_agg(self, as_index, data_for_grouping):
  23. df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
  24. result = df.groupby("B", as_index=as_index).A.mean()
  25. _, uniques = pd.factorize(data_for_grouping, sort=True)
  26. if as_index:
  27. index = pd.Index(uniques, name="B")
  28. expected = pd.Series([3.0, 1.0, 4.0], index=index, name="A")
  29. self.assert_series_equal(result, expected)
  30. else:
  31. expected = pd.DataFrame({"B": uniques, "A": [3.0, 1.0, 4.0]})
  32. self.assert_frame_equal(result, expected)
  33. def test_groupby_agg_extension(self, data_for_grouping):
  34. # GH#38980 groupby agg on extension type fails for non-numeric types
  35. df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
  36. expected = df.iloc[[0, 2, 4, 7]]
  37. expected = expected.set_index("A")
  38. result = df.groupby("A").agg({"B": "first"})
  39. self.assert_frame_equal(result, expected)
  40. result = df.groupby("A").agg("first")
  41. self.assert_frame_equal(result, expected)
  42. result = df.groupby("A").first()
  43. self.assert_frame_equal(result, expected)
  44. def test_groupby_agg_extension_timedelta_cumsum_with_named_aggregation(self):
  45. # GH#41720
  46. expected = pd.DataFrame(
  47. {
  48. "td": {
  49. 0: pd.Timedelta("0 days 01:00:00"),
  50. 1: pd.Timedelta("0 days 01:15:00"),
  51. 2: pd.Timedelta("0 days 01:15:00"),
  52. }
  53. }
  54. )
  55. df = pd.DataFrame(
  56. {
  57. "td": pd.Series(
  58. ["0 days 01:00:00", "0 days 00:15:00", "0 days 01:15:00"],
  59. dtype="timedelta64[ns]",
  60. ),
  61. "grps": ["a", "a", "b"],
  62. }
  63. )
  64. gb = df.groupby("grps")
  65. result = gb.agg(td=("td", "cumsum"))
  66. self.assert_frame_equal(result, expected)
  67. def test_groupby_extension_no_sort(self, data_for_grouping):
  68. df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
  69. result = df.groupby("B", sort=False).A.mean()
  70. _, index = pd.factorize(data_for_grouping, sort=False)
  71. index = pd.Index(index, name="B")
  72. expected = pd.Series([1.0, 3.0, 4.0], index=index, name="A")
  73. self.assert_series_equal(result, expected)
  74. def test_groupby_extension_transform(self, data_for_grouping):
  75. valid = data_for_grouping[~data_for_grouping.isna()]
  76. df = pd.DataFrame({"A": [1, 1, 3, 3, 1, 4], "B": valid})
  77. result = df.groupby("B").A.transform(len)
  78. expected = pd.Series([3, 3, 2, 2, 3, 1], name="A")
  79. self.assert_series_equal(result, expected)
  80. def test_groupby_extension_apply(self, data_for_grouping, groupby_apply_op):
  81. df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
  82. df.groupby("B", group_keys=False).apply(groupby_apply_op)
  83. df.groupby("B", group_keys=False).A.apply(groupby_apply_op)
  84. df.groupby("A", group_keys=False).apply(groupby_apply_op)
  85. df.groupby("A", group_keys=False).B.apply(groupby_apply_op)
  86. def test_groupby_apply_identity(self, data_for_grouping):
  87. df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping})
  88. result = df.groupby("A").B.apply(lambda x: x.array)
  89. expected = pd.Series(
  90. [
  91. df.B.iloc[[0, 1, 6]].array,
  92. df.B.iloc[[2, 3]].array,
  93. df.B.iloc[[4, 5]].array,
  94. df.B.iloc[[7]].array,
  95. ],
  96. index=pd.Index([1, 2, 3, 4], name="A"),
  97. name="B",
  98. )
  99. self.assert_series_equal(result, expected)
  100. def test_in_numeric_groupby(self, data_for_grouping):
  101. df = pd.DataFrame(
  102. {
  103. "A": [1, 1, 2, 2, 3, 3, 1, 4],
  104. "B": data_for_grouping,
  105. "C": [1, 1, 1, 1, 1, 1, 1, 1],
  106. }
  107. )
  108. dtype = data_for_grouping.dtype
  109. if (
  110. is_numeric_dtype(dtype)
  111. or is_bool_dtype(dtype)
  112. or dtype.name == "decimal"
  113. or is_string_dtype(dtype)
  114. or is_object_dtype(dtype)
  115. or dtype.kind == "m" # in particular duration[*][pyarrow]
  116. ):
  117. expected = pd.Index(["B", "C"])
  118. result = df.groupby("A").sum().columns
  119. else:
  120. expected = pd.Index(["C"])
  121. with pytest.raises(TypeError, match="does not support"):
  122. df.groupby("A").sum().columns
  123. result = df.groupby("A").sum(numeric_only=True).columns
  124. tm.assert_index_equal(result, expected)