test_min_max.py

import numpy as np
import pytest

from pandas._libs.tslibs import iNaT

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    Series,
)
import pandas._testing as tm
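
# Tests for DataFrameGroupBy and SeriesGroupBy min/max aggregations.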


def test_max_min_non_numeric():
    # GH#2700
    aa = DataFrame({"nn": [11, 11, 22, 22], "ii": [1, 2, 3, 4], "ss": 4 * ["mama"]})

    result = aa.groupby("nn").max()
    assert "ss" in result

    result = aa.groupby("nn").max(numeric_only=False)
    assert "ss" in result

    result = aa.groupby("nn").min()
    assert "ss" in result

    result = aa.groupby("nn").min(numeric_only=False)
    assert "ss" in result
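

# `using_array_manager` is a fixture provided by pandas' test-suite conftest;
# it is True when the tests run against the (experimental) ArrayManager rather
# than the default BlockManager, in which case there are no consolidated
# blocks to count.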
def test_max_min_object_multiple_columns(using_array_manager):
    # GH#41111 case where the aggregation is valid for some columns but not
    # others; we split object blocks column-wise, consistent with
    # DataFrame._reduce

    df = DataFrame(
        {
            "A": [1, 1, 2, 2, 3],
            "B": [1, "foo", 2, "bar", False],
            "C": ["a", "b", "c", "d", "e"],
        }
    )
    df._consolidate_inplace()  # should already be consolidated, but double-check
    if not using_array_manager:
        assert len(df._mgr.blocks) == 2

    gb = df.groupby("A")

    result = gb[["C"]].max()
    # "max" is valid for column "C" but not for "B"
    ei = Index([1, 2, 3], name="A")
    expected = DataFrame({"C": ["b", "d", "e"]}, index=ei)
    tm.assert_frame_equal(result, expected)

    result = gb[["C"]].min()
    # "min" is valid for column "C" but not for "B"
    ei = Index([1, 2, 3], name="A")
    expected = DataFrame({"C": ["a", "c", "e"]}, index=ei)
    tm.assert_frame_equal(result, expected)
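

# Series.dt.date returns an object-dtype Series of datetime.date objects, so
# the groupby min below exercises the object-dtype path, not datetime64.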
def test_min_date_with_nans():
    # GH#26321
    dates = pd.to_datetime(
        Series(["2019-05-09", "2019-05-09", "2019-05-09"]), format="%Y-%m-%d"
    ).dt.date
    df = DataFrame({"a": [np.nan, "1", np.nan], "b": [0, 1, 1], "c": dates})

    result = df.groupby("b", as_index=False)["c"].min()["c"]
    expected = pd.to_datetime(
        Series(["2019-05-09", "2019-05-09"], name="c"), format="%Y-%m-%d"
    ).dt.date
    tm.assert_series_equal(result, expected)

    result = df.groupby("b")["c"].min()
    expected.index.name = "b"
    tm.assert_series_equal(result, expected)
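

# iNaT is the int64 sentinel pandas uses for NaT (np.iinfo(np.int64).min).
# The Series below is plain int64, so iNaT must be treated as an ordinary
# (very negative) integer, not as a missing value.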
def test_max_inat():
    # GH#40767 don't interpret iNaT as NaN
    ser = Series([1, iNaT])
    key = np.array([1, 1], dtype=np.int64)
    gb = ser.groupby(key)

    result = gb.max(min_count=2)
    expected = Series({1: 1}, dtype=np.int64)
    tm.assert_series_equal(result, expected, check_exact=True)

    result = gb.min(min_count=2)
    expected = Series({1: iNaT}, dtype=np.int64)
    tm.assert_series_equal(result, expected, check_exact=True)

    # not enough entries -> gets masked to NaN
    result = gb.min(min_count=3)
    expected = Series({1: np.nan})
    tm.assert_series_equal(result, expected, check_exact=True)


def test_max_inat_not_all_na():
    # GH#40767 don't interpret iNaT as NaN
    # make sure we don't round iNaT + 1 down to iNaT
    ser = Series([1, iNaT, 2, iNaT + 1])
    gb = ser.groupby([1, 2, 3, 3])
    result = gb.min(min_count=2)

    # Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
    expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
    expected.index = expected.index.astype(np.int_)
    tm.assert_series_equal(result, expected, check_exact=True)


@pytest.mark.parametrize("func", ["min", "max"])
def test_groupby_aggregate_period_column(func):
    # GH#31471
    groups = [1, 2]
    periods = pd.period_range("2020", periods=2, freq="Y")
    df = DataFrame({"a": groups, "b": periods})

    result = getattr(df.groupby("a")["b"], func)()

    idx = Index([1, 2], name="a")
    expected = Series(periods, index=idx, name="b")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("func", ["min", "max"])
def test_groupby_aggregate_period_frame(func):
    # GH#31471
    groups = [1, 2]
    periods = pd.period_range("2020", periods=2, freq="Y")
    df = DataFrame({"a": groups, "b": periods})

    result = getattr(df.groupby("a"), func)()

    idx = Index([1, 2], name="a")
    expected = DataFrame({"b": periods}, index=idx)
    tm.assert_frame_equal(result, expected)


def test_aggregate_numeric_object_dtype():
    # https://github.com/pandas-dev/pandas/issues/39329
    # simplified case: multiple object columns where one is all-NaN
    # -> gets split as the all-NaN is inferred as float
    df = DataFrame(
        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": [np.nan] * 4},
    ).astype(object)
    result = df.groupby("key").min()
    expected = (
        DataFrame(
            {"key": ["A", "B"], "col1": ["a", "c"], "col2": [np.nan, np.nan]},
        )
        .set_index("key")
        .astype(object)
    )
    tm.assert_frame_equal(result, expected)

    # same but with numbers
    df = DataFrame(
        {"key": ["A", "A", "B", "B"], "col1": list("abcd"), "col2": range(4)},
    ).astype(object)
    result = df.groupby("key").min()
    expected = (
        DataFrame({"key": ["A", "B"], "col1": ["a", "c"], "col2": [0, 2]})
        .set_index("key")
        .astype(object)
    )
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", ["min", "max"])
def test_aggregate_categorical_lost_index(func: str):
    # GH#28641: groupby drops the index when grouping over a categorical
    # column with min/max
    ds = Series(["b"], dtype="category").cat.as_ordered()
    df = DataFrame({"A": [1997], "B": ds})

    result = df.groupby("A").agg({"B": func})
    expected = DataFrame({"B": ["b"]}, index=Index([1997], name="A"))

    # ordered categorical dtype should be preserved
    expected["B"] = expected["B"].astype(ds.dtype)

    tm.assert_frame_equal(result, expected)
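

# min_count is the number of non-NA values a group must contain for the
# aggregation to produce a value; groups with fewer valid values yield NA.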
@pytest.mark.parametrize("dtype", ["Int64", "Int32", "Float64", "Float32", "boolean"])
def test_groupby_min_max_nullable(dtype):
    if dtype == "Int64":
        # GH#41743 avoid precision loss
        ts = 1618556707013635762
    elif dtype == "boolean":
        ts = 0
    else:
        ts = 4.0
    df = DataFrame({"id": [2, 2], "ts": [ts, ts + 1]})
    df["ts"] = df["ts"].astype(dtype)

    gb = df.groupby("id")

    result = gb.min()
    expected = df.iloc[:1].set_index("id")
    tm.assert_frame_equal(result, expected)

    res_max = gb.max()
    expected_max = df.iloc[1:].set_index("id")
    tm.assert_frame_equal(res_max, expected_max)

    result2 = gb.min(min_count=3)
    expected2 = DataFrame({"ts": [pd.NA]}, index=expected.index, dtype=dtype)
    tm.assert_frame_equal(result2, expected2)

    res_max2 = gb.max(min_count=3)
    tm.assert_frame_equal(res_max2, expected2)

    # Case with NA values
    df2 = DataFrame({"id": [2, 2, 2], "ts": [ts, pd.NA, ts + 1]})
    df2["ts"] = df2["ts"].astype(dtype)
    gb2 = df2.groupby("id")

    result3 = gb2.min()
    tm.assert_frame_equal(result3, expected)

    res_max3 = gb2.max()
    tm.assert_frame_equal(res_max3, expected_max)

    result4 = gb2.min(min_count=100)
    tm.assert_frame_equal(result4, expected2)

    res_max4 = gb2.max(min_count=100)
    tm.assert_frame_equal(res_max4, expected2)
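

# The unobserved category 1 produces an empty group; min/max must return
# pd.NA for it (rather than raising) even for UInt64, an unsigned dtype with
# no natural NaN representation.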
def test_min_max_nullable_uint64_empty_group():
    # don't raise NotImplementedError from libgroupby
    cat = pd.Categorical([0] * 10, categories=[0, 1])
    df = DataFrame({"A": cat, "B": pd.array(np.arange(10, dtype=np.uint64))})
    gb = df.groupby("A")

    res = gb.min()
    idx = pd.CategoricalIndex([0, 1], dtype=cat.dtype, name="A")
    expected = DataFrame({"B": pd.array([0, pd.NA], dtype="UInt64")}, index=idx)
    tm.assert_frame_equal(res, expected)

    res = gb.max()
    expected.iloc[0, 0] = 9
    tm.assert_frame_equal(res, expected)