test_allowlist.py 7.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326
"""
Test methods relating to generic function evaluation:
the so-called allow/block lists.
"""
  5. from string import ascii_lowercase
  6. import numpy as np
  7. import pytest
  8. from pandas import (
  9. DataFrame,
  10. Series,
  11. date_range,
  12. )
  13. import pandas._testing as tm
  14. from pandas.core.groupby.base import (
  15. groupby_other_methods,
  16. reduction_kernels,
  17. transformation_kernels,
  18. )
  19. AGG_FUNCTIONS = [
  20. "sum",
  21. "prod",
  22. "min",
  23. "max",
  24. "median",
  25. "mean",
  26. "skew",
  27. "std",
  28. "var",
  29. "sem",
  30. ]
  31. AGG_FUNCTIONS_WITH_SKIPNA = ["skew"]
  32. @pytest.fixture
  33. def df():
  34. return DataFrame(
  35. {
  36. "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
  37. "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
  38. "C": np.random.randn(8),
  39. "D": np.random.randn(8),
  40. }
  41. )
  42. @pytest.fixture
  43. def df_letters():
  44. letters = np.array(list(ascii_lowercase))
  45. N = 10
  46. random_letters = letters.take(np.random.randint(0, 26, N))
  47. df = DataFrame(
  48. {
  49. "floats": N / 10 * Series(np.random.random(N)),
  50. "letters": Series(random_letters),
  51. }
  52. )
  53. return df
  54. @pytest.fixture
  55. def raw_frame():
  56. return DataFrame([0])
  57. @pytest.mark.parametrize("op", AGG_FUNCTIONS)
  58. @pytest.mark.parametrize("axis", [0, 1])
  59. @pytest.mark.parametrize("skipna", [True, False])
  60. @pytest.mark.parametrize("sort", [True, False])
  61. def test_regression_allowlist_methods(raw_frame, op, axis, skipna, sort):
  62. # GH6944
  63. # GH 17537
  64. # explicitly test the allowlist methods
  65. if axis == 0:
  66. frame = raw_frame
  67. else:
  68. frame = raw_frame.T
  69. if op in AGG_FUNCTIONS_WITH_SKIPNA:
  70. grouped = frame.groupby(level=0, axis=axis, sort=sort)
  71. result = getattr(grouped, op)(skipna=skipna)
  72. expected = frame.groupby(level=0).apply(
  73. lambda h: getattr(h, op)(axis=axis, skipna=skipna)
  74. )
  75. if sort:
  76. expected = expected.sort_index(axis=axis)
  77. tm.assert_frame_equal(result, expected)
  78. else:
  79. grouped = frame.groupby(level=0, axis=axis, sort=sort)
  80. result = getattr(grouped, op)()
  81. expected = frame.groupby(level=0).apply(lambda h: getattr(h, op)(axis=axis))
  82. if sort:
  83. expected = expected.sort_index(axis=axis)
  84. tm.assert_frame_equal(result, expected)
  85. def test_groupby_blocklist(df_letters):
  86. df = df_letters
  87. s = df_letters.floats
  88. blocklist = [
  89. "eval",
  90. "query",
  91. "abs",
  92. "where",
  93. "mask",
  94. "align",
  95. "groupby",
  96. "clip",
  97. "astype",
  98. "at",
  99. "combine",
  100. "consolidate",
  101. "convert_objects",
  102. ]
  103. to_methods = [method for method in dir(df) if method.startswith("to_")]
  104. blocklist.extend(to_methods)
  105. for bl in blocklist:
  106. for obj in (df, s):
  107. gb = obj.groupby(df.letters)
  108. # e.g., to_csv
  109. defined_but_not_allowed = (
  110. f"(?:^Cannot.+{repr(bl)}.+'{type(gb).__name__}'.+try "
  111. f"using the 'apply' method$)"
  112. )
  113. # e.g., query, eval
  114. not_defined = (
  115. f"(?:^'{type(gb).__name__}' object has no attribute {repr(bl)}$)"
  116. )
  117. msg = f"{defined_but_not_allowed}|{not_defined}"
  118. with pytest.raises(AttributeError, match=msg):
  119. getattr(gb, bl)
  120. def test_tab_completion(mframe):
  121. grp = mframe.groupby(level="second")
  122. results = {v for v in dir(grp) if not v.startswith("_")}
  123. expected = {
  124. "A",
  125. "B",
  126. "C",
  127. "agg",
  128. "aggregate",
  129. "apply",
  130. "boxplot",
  131. "filter",
  132. "first",
  133. "get_group",
  134. "groups",
  135. "hist",
  136. "indices",
  137. "last",
  138. "max",
  139. "mean",
  140. "median",
  141. "min",
  142. "ngroups",
  143. "nth",
  144. "ohlc",
  145. "plot",
  146. "prod",
  147. "size",
  148. "std",
  149. "sum",
  150. "transform",
  151. "var",
  152. "sem",
  153. "count",
  154. "nunique",
  155. "head",
  156. "describe",
  157. "cummax",
  158. "quantile",
  159. "rank",
  160. "cumprod",
  161. "tail",
  162. "resample",
  163. "cummin",
  164. "fillna",
  165. "cumsum",
  166. "cumcount",
  167. "ngroup",
  168. "all",
  169. "shift",
  170. "skew",
  171. "take",
  172. "pct_change",
  173. "any",
  174. "corr",
  175. "corrwith",
  176. "cov",
  177. "dtypes",
  178. "ndim",
  179. "diff",
  180. "idxmax",
  181. "idxmin",
  182. "ffill",
  183. "bfill",
  184. "rolling",
  185. "expanding",
  186. "pipe",
  187. "sample",
  188. "ewm",
  189. "value_counts",
  190. }
  191. assert results == expected
  192. def test_groupby_function_rename(mframe):
  193. grp = mframe.groupby(level="second")
  194. for name in ["sum", "prod", "min", "max", "first", "last"]:
  195. f = getattr(grp, name)
  196. assert f.__name__ == name
  197. @pytest.mark.parametrize(
  198. "method",
  199. [
  200. "count",
  201. "corr",
  202. "cummax",
  203. "cummin",
  204. "cumprod",
  205. "describe",
  206. "rank",
  207. "quantile",
  208. "diff",
  209. "shift",
  210. "all",
  211. "any",
  212. "idxmin",
  213. "idxmax",
  214. "ffill",
  215. "bfill",
  216. "pct_change",
  217. ],
  218. )
  219. def test_groupby_selection_with_methods(df, method):
  220. # some methods which require DatetimeIndex
  221. rng = date_range("2014", periods=len(df))
  222. df.index = rng
  223. g = df.groupby(["A"])[["C"]]
  224. g_exp = df[["C"]].groupby(df["A"])
  225. # TODO check groupby with > 1 col ?
  226. res = getattr(g, method)()
  227. exp = getattr(g_exp, method)()
  228. # should always be frames!
  229. tm.assert_frame_equal(res, exp)
  230. def test_groupby_selection_other_methods(df):
  231. # some methods which require DatetimeIndex
  232. rng = date_range("2014", periods=len(df))
  233. df.columns.name = "foo"
  234. df.index = rng
  235. g = df.groupby(["A"])[["C"]]
  236. g_exp = df[["C"]].groupby(df["A"])
  237. # methods which aren't just .foo()
  238. tm.assert_frame_equal(g.fillna(0), g_exp.fillna(0))
  239. tm.assert_frame_equal(g.dtypes, g_exp.dtypes)
  240. tm.assert_frame_equal(g.apply(lambda x: x.sum()), g_exp.apply(lambda x: x.sum()))
  241. tm.assert_frame_equal(g.resample("D").mean(), g_exp.resample("D").mean())
  242. tm.assert_frame_equal(g.resample("D").ohlc(), g_exp.resample("D").ohlc())
  243. tm.assert_frame_equal(
  244. g.filter(lambda x: len(x) == 3), g_exp.filter(lambda x: len(x) == 3)
  245. )
  246. def test_all_methods_categorized(mframe):
  247. grp = mframe.groupby(mframe.iloc[:, 0])
  248. names = {_ for _ in dir(grp) if not _.startswith("_")} - set(mframe.columns)
  249. new_names = set(names)
  250. new_names -= reduction_kernels
  251. new_names -= transformation_kernels
  252. new_names -= groupby_other_methods
  253. assert not reduction_kernels & transformation_kernels
  254. assert not reduction_kernels & groupby_other_methods
  255. assert not transformation_kernels & groupby_other_methods
  256. # new public method?
  257. if new_names:
  258. msg = f"""
  259. There are uncategorized methods defined on the Grouper class:
  260. {new_names}.
  261. Was a new method recently added?
  262. Every public method On Grouper must appear in exactly one the
  263. following three lists defined in pandas.core.groupby.base:
  264. - `reduction_kernels`
  265. - `transformation_kernels`
  266. - `groupby_other_methods`
  267. see the comments in pandas/core/groupby/base.py for guidance on
  268. how to fix this test.
  269. """
  270. raise AssertionError(msg)
  271. # removed a public method?
  272. all_categorized = reduction_kernels | transformation_kernels | groupby_other_methods
  273. if names != all_categorized:
  274. msg = f"""
  275. Some methods which are supposed to be on the Grouper class
  276. are missing:
  277. {all_categorized - names}.
  278. They're still defined in one of the lists that live in pandas/core/groupby/base.py.
  279. If you removed a method, you should update them
  280. """
  281. raise AssertionError(msg)