# test_any_all.py (5.7 KB)
import builtins

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    Series,
    isna,
)
import pandas._testing as tm
  12. @pytest.mark.parametrize("agg_func", ["any", "all"])
  13. @pytest.mark.parametrize("skipna", [True, False])
  14. @pytest.mark.parametrize(
  15. "vals",
  16. [
  17. ["foo", "bar", "baz"],
  18. ["foo", "", ""],
  19. ["", "", ""],
  20. [1, 2, 3],
  21. [1, 0, 0],
  22. [0, 0, 0],
  23. [1.0, 2.0, 3.0],
  24. [1.0, 0.0, 0.0],
  25. [0.0, 0.0, 0.0],
  26. [True, True, True],
  27. [True, False, False],
  28. [False, False, False],
  29. [np.nan, np.nan, np.nan],
  30. ],
  31. )
  32. def test_groupby_bool_aggs(agg_func, skipna, vals):
  33. df = DataFrame({"key": ["a"] * 3 + ["b"] * 3, "val": vals * 2})
  34. # Figure out expectation using Python builtin
  35. exp = getattr(builtins, agg_func)(vals)
  36. # edge case for missing data with skipna and 'any'
  37. if skipna and all(isna(vals)) and agg_func == "any":
  38. exp = False
  39. exp_df = DataFrame([exp] * 2, columns=["val"], index=Index(["a", "b"], name="key"))
  40. result = getattr(df.groupby("key"), agg_func)(skipna=skipna)
  41. tm.assert_frame_equal(result, exp_df)
  42. def test_any():
  43. df = DataFrame(
  44. [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
  45. columns=["A", "B", "C"],
  46. )
  47. expected = DataFrame(
  48. [[True, True], [False, True]], columns=["B", "C"], index=[1, 3]
  49. )
  50. expected.index.name = "A"
  51. result = df.groupby("A").any()
  52. tm.assert_frame_equal(result, expected)
  53. @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
  54. def test_bool_aggs_dup_column_labels(bool_agg_func):
  55. # 21668
  56. df = DataFrame([[True, True]], columns=["a", "a"])
  57. grp_by = df.groupby([0])
  58. result = getattr(grp_by, bool_agg_func)()
  59. expected = df.set_axis(np.array([0]))
  60. tm.assert_frame_equal(result, expected)
  61. @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
  62. @pytest.mark.parametrize("skipna", [True, False])
  63. @pytest.mark.parametrize(
  64. "data",
  65. [
  66. [False, False, False],
  67. [True, True, True],
  68. [pd.NA, pd.NA, pd.NA],
  69. [False, pd.NA, False],
  70. [True, pd.NA, True],
  71. [True, pd.NA, False],
  72. ],
  73. )
  74. def test_masked_kleene_logic(bool_agg_func, skipna, data):
  75. # GH#37506
  76. ser = Series(data, dtype="boolean")
  77. # The result should match aggregating on the whole series. Correctness
  78. # there is verified in test_reductions.py::test_any_all_boolean_kleene_logic
  79. expected_data = getattr(ser, bool_agg_func)(skipna=skipna)
  80. expected = Series(expected_data, index=np.array([0]), dtype="boolean")
  81. result = ser.groupby([0, 0, 0]).agg(bool_agg_func, skipna=skipna)
  82. tm.assert_series_equal(result, expected)
  83. @pytest.mark.parametrize(
  84. "dtype1,dtype2,exp_col1,exp_col2",
  85. [
  86. (
  87. "float",
  88. "Float64",
  89. np.array([True], dtype=bool),
  90. pd.array([pd.NA], dtype="boolean"),
  91. ),
  92. (
  93. "Int64",
  94. "float",
  95. pd.array([pd.NA], dtype="boolean"),
  96. np.array([True], dtype=bool),
  97. ),
  98. (
  99. "Int64",
  100. "Int64",
  101. pd.array([pd.NA], dtype="boolean"),
  102. pd.array([pd.NA], dtype="boolean"),
  103. ),
  104. (
  105. "Float64",
  106. "boolean",
  107. pd.array([pd.NA], dtype="boolean"),
  108. pd.array([pd.NA], dtype="boolean"),
  109. ),
  110. ],
  111. )
  112. def test_masked_mixed_types(dtype1, dtype2, exp_col1, exp_col2):
  113. # GH#37506
  114. data = [1.0, np.nan]
  115. df = DataFrame(
  116. {"col1": pd.array(data, dtype=dtype1), "col2": pd.array(data, dtype=dtype2)}
  117. )
  118. result = df.groupby([1, 1]).agg("all", skipna=False)
  119. expected = DataFrame({"col1": exp_col1, "col2": exp_col2}, index=np.array([1]))
  120. tm.assert_frame_equal(result, expected)
  121. @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
  122. @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
  123. @pytest.mark.parametrize("skipna", [True, False])
  124. def test_masked_bool_aggs_skipna(bool_agg_func, dtype, skipna, frame_or_series):
  125. # GH#40585
  126. obj = frame_or_series([pd.NA, 1], dtype=dtype)
  127. expected_res = True
  128. if not skipna and bool_agg_func == "all":
  129. expected_res = pd.NA
  130. expected = frame_or_series([expected_res], index=np.array([1]), dtype="boolean")
  131. result = obj.groupby([1, 1]).agg(bool_agg_func, skipna=skipna)
  132. tm.assert_equal(result, expected)
  133. @pytest.mark.parametrize(
  134. "bool_agg_func,data,expected_res",
  135. [
  136. ("any", [pd.NA, np.nan], False),
  137. ("any", [pd.NA, 1, np.nan], True),
  138. ("all", [pd.NA, pd.NaT], True),
  139. ("all", [pd.NA, False, pd.NaT], False),
  140. ],
  141. )
  142. def test_object_type_missing_vals(bool_agg_func, data, expected_res, frame_or_series):
  143. # GH#37501
  144. obj = frame_or_series(data, dtype=object)
  145. result = obj.groupby([1] * len(data)).agg(bool_agg_func)
  146. expected = frame_or_series([expected_res], index=np.array([1]), dtype="bool")
  147. tm.assert_equal(result, expected)
  148. @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
  149. def test_object_NA_raises_with_skipna_false(bool_agg_func):
  150. # GH#37501
  151. ser = Series([pd.NA], dtype=object)
  152. with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
  153. ser.groupby([1]).agg(bool_agg_func, skipna=False)
  154. @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
  155. def test_empty(frame_or_series, bool_agg_func):
  156. # GH 45231
  157. kwargs = {"columns": ["a"]} if frame_or_series is DataFrame else {"name": "a"}
  158. obj = frame_or_series(**kwargs, dtype=object)
  159. result = getattr(obj.groupby(obj.index), bool_agg_func)()
  160. expected = frame_or_series(**kwargs, dtype=bool)
  161. tm.assert_equal(result, expected)