from datetime import timedelta
from decimal import Decimal
import re

from dateutil.tz import tzlocal
import numpy as np
import pytest

from pandas.compat import is_platform_windows
import pandas.util._test_decorators as td

from pandas.core.dtypes.common import is_categorical_dtype

import pandas as pd
from pandas import (
    Categorical,
    DataFrame,
    Index,
    Series,
    Timestamp,
    date_range,
    isna,
    notna,
    to_datetime,
    to_timedelta,
)
import pandas._testing as tm
from pandas.core import (
    algorithms,
    nanops,
)


def assert_stat_op_calc(
    opname,
    alternative,
    frame,
    has_skipna=True,
    check_dtype=True,
    check_dates=False,
    rtol=1e-5,
    atol=1e-8,
    skipna_alternative=None,
):
    """
    Check that operator opname works as advertised on frame

    Parameters
    ----------
    opname : str
        Name of the operator to test on frame
    alternative : function
        Function that opname is tested against; i.e. "frame.opname()" should
        equal "alternative(frame)".
    frame : DataFrame
        The object that the tests are executed on
    has_skipna : bool, default True
        Whether the method "opname" has the kwarg "skipna"
    check_dtype : bool, default True
        Whether the dtypes of the result of "frame.opname()" and
        "alternative(frame)" should be checked.
    check_dates : bool, default False
        Whether opname should be tested on a Datetime Series
    rtol : float, default 1e-5
        Relative tolerance.
    atol : float, default 1e-8
        Absolute tolerance.
    skipna_alternative : function, default None
        NaN-safe version of alternative
    """
    f = getattr(frame, opname)

    if check_dates:
        df = DataFrame({"b": date_range("1/1/2001", periods=2)})
        with tm.assert_produces_warning(None):
            result = getattr(df, opname)()
        assert isinstance(result, Series)

        df["a"] = range(len(df))
        with tm.assert_produces_warning(None):
            result = getattr(df, opname)()
        assert isinstance(result, Series)
        assert len(result)

    if has_skipna:

        def wrapper(x):
            return alternative(x.values)

        skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative)
        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)
        tm.assert_series_equal(
            result0, frame.apply(wrapper), check_dtype=check_dtype, rtol=rtol, atol=atol
        )
        tm.assert_series_equal(
            result1,
            frame.apply(wrapper, axis=1),
            rtol=rtol,
            atol=atol,
        )
    else:
        skipna_wrapper = alternative

    result0 = f(axis=0)
    result1 = f(axis=1)
    tm.assert_series_equal(
        result0,
        frame.apply(skipna_wrapper),
        check_dtype=check_dtype,
        rtol=rtol,
        atol=atol,
    )

    if opname in ["sum", "prod"]:
        expected = frame.apply(skipna_wrapper, axis=1)
        tm.assert_series_equal(
            result1, expected, check_dtype=False, rtol=rtol, atol=atol
        )

    # check dtypes
    if check_dtype:
        lcd_dtype = frame.values.dtype
        assert lcd_dtype == result0.dtype
        assert lcd_dtype == result1.dtype

    # bad axis
    with pytest.raises(ValueError, match="No axis named 2"):
        f(axis=2)

    # all NA case
    if has_skipna:
        all_na = frame * np.NaN
        r0 = getattr(all_na, opname)(axis=0)
        r1 = getattr(all_na, opname)(axis=1)
        if opname in ["sum", "prod"]:
            unit = 1 if opname == "prod" else 0  # result for empty sum/prod
            expected = Series(unit, index=r0.index, dtype=r0.dtype)
            tm.assert_series_equal(r0, expected)
            expected = Series(unit, index=r1.index, dtype=r1.dtype)
            tm.assert_series_equal(r1, expected)
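
# A minimal usage sketch of the helper above (illustrative only, not one of the
# suite's tests): it checks a DataFrame reduction against a NumPy alternative,
# e.g.
#
#     df = DataFrame({"a": [1.0, np.nan, 3.0], "b": [2.0, 4.0, 6.0]})
#     assert_stat_op_calc("sum", np.sum, df, skipna_alternative=np.nansum)
#
# i.e. df.sum() should match np.nansum applied column- and row-wise, while
# df.sum(skipna=False) should match plain np.sum, with result dtypes, a bad
# axis, and the all-NaN case also verified.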


class TestDataFrameAnalytics:
    # ---------------------------------------------------------------------
    # Reductions

    @pytest.mark.parametrize("axis", [0, 1])
    @pytest.mark.parametrize(
        "opname",
        [
            "count",
            "sum",
            "mean",
            "product",
            "median",
            "min",
            "max",
            "nunique",
            "var",
            "std",
            "sem",
            pytest.param("skew", marks=td.skip_if_no_scipy),
            pytest.param("kurt", marks=td.skip_if_no_scipy),
        ],
    )
    def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname):
        if (opname in ("sum", "min", "max") and axis == 0) or opname in (
            "count",
            "nunique",
        ):
            getattr(float_string_frame, opname)(axis=axis)
        else:
            msg = "|".join(
                [
                    "Could not convert",
                    "could not convert",
                    "can't multiply sequence by non-int",
                    "unsupported operand type",
                    "not supported between instances of",
                ]
            )
            with pytest.raises(TypeError, match=msg):
                getattr(float_string_frame, opname)(axis=axis)

        if opname != "nunique":
            getattr(float_string_frame, opname)(axis=axis, numeric_only=True)
    @pytest.mark.parametrize("axis", [0, 1])
    @pytest.mark.parametrize(
        "opname",
        [
            "count",
            "sum",
            "mean",
            "product",
            "median",
            "min",
            "max",
            "var",
            "std",
            "sem",
            pytest.param("skew", marks=td.skip_if_no_scipy),
            pytest.param("kurt", marks=td.skip_if_no_scipy),
        ],
    )
    def test_stat_op_api_float_frame(self, float_frame, axis, opname):
        getattr(float_frame, opname)(axis=axis, numeric_only=False)

    def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame):
        def count(s):
            return notna(s).sum()

        def nunique(s):
            return len(algorithms.unique1d(s.dropna()))

        def var(x):
            return np.var(x, ddof=1)

        def std(x):
            return np.std(x, ddof=1)

        def sem(x):
            return np.std(x, ddof=1) / np.sqrt(len(x))

        assert_stat_op_calc(
            "nunique",
            nunique,
            float_frame_with_na,
            has_skipna=False,
            check_dtype=False,
            check_dates=True,
        )

        # GH#32571: rtol needed for flaky CI builds
        # mixed types (with upcasting happening)
        assert_stat_op_calc(
            "sum",
            np.sum,
            mixed_float_frame.astype("float32"),
            check_dtype=False,
            rtol=1e-3,
        )

        assert_stat_op_calc(
            "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum
        )
        assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True)
        assert_stat_op_calc(
            "product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod
        )

        assert_stat_op_calc("var", var, float_frame_with_na)
        assert_stat_op_calc("std", std, float_frame_with_na)
        assert_stat_op_calc("sem", sem, float_frame_with_na)

        assert_stat_op_calc(
            "count",
            count,
            float_frame_with_na,
            has_skipna=False,
            check_dtype=False,
            check_dates=True,
        )

    @td.skip_if_no_scipy
    def test_stat_op_calc_skew_kurtosis(self, float_frame_with_na):
        def skewness(x):
            from scipy.stats import skew

            if len(x) < 3:
                return np.nan
            return skew(x, bias=False)

        def kurt(x):
            from scipy.stats import kurtosis

            if len(x) < 4:
                return np.nan
            return kurtosis(x, bias=False)

        assert_stat_op_calc("skew", skewness, float_frame_with_na)
        assert_stat_op_calc("kurt", kurt, float_frame_with_na)

    def test_median(self, float_frame_with_na, int_frame):
        def wrapper(x):
            if isna(x).any():
                return np.nan
            return np.median(x)

        assert_stat_op_calc("median", wrapper, float_frame_with_na, check_dates=True)
        assert_stat_op_calc(
            "median", wrapper, int_frame, check_dtype=False, check_dates=True
        )
    @pytest.mark.parametrize(
        "method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"]
    )
    @pytest.mark.parametrize(
        "df",
        [
            DataFrame(
                {
                    "a": [
                        -0.00049987540199591344,
                        -0.0016467257772919831,
                        0.00067695870775883013,
                    ],
                    "b": [-0, -0, 0.0],
                    "c": [
                        0.00031111847529610595,
                        0.0014902627951905339,
                        -0.00094099200035979691,
                    ],
                },
                index=["foo", "bar", "baz"],
                dtype="O",
            ),
            DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object),
        ],
    )
    def test_stat_operators_attempt_obj_array(self, method, df, axis):
        # GH#676
        assert df.values.dtype == np.object_

        result = getattr(df, method)(axis=axis)
        expected = getattr(df.astype("f8"), method)(axis=axis).astype(object)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
    def test_mixed_ops(self, op):
        # GH#16116
        df = DataFrame(
            {
                "int": [1, 2, 3, 4],
                "float": [1.0, 2.0, 3.0, 4.0],
                "str": ["a", "b", "c", "d"],
            }
        )
        msg = "|".join(
            [
                "Could not convert",
                "could not convert",
                "can't multiply sequence by non-int",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            getattr(df, op)()

        with pd.option_context("use_bottleneck", False):
            msg = "|".join(
                [
                    "Could not convert",
                    "could not convert",
                    "can't multiply sequence by non-int",
                ]
            )
            with pytest.raises(TypeError, match=msg):
                getattr(df, op)()

    def test_reduce_mixed_frame(self):
        # GH 6806
        df = DataFrame(
            {
                "bool_data": [True, True, False, False, False],
                "int_data": [10, 20, 30, 40, 50],
                "string_data": ["a", "b", "c", "d", "e"],
            }
        )
        df.reindex(columns=["bool_data", "int_data", "string_data"])
        test = df.sum(axis=0)
        tm.assert_numpy_array_equal(
            test.values, np.array([2, 150, "abcde"], dtype=object)
        )
        alt = df.T.sum(axis=1)
        tm.assert_series_equal(test, alt)
    def test_nunique(self):
        df = DataFrame({"A": [1, 1, 1], "B": [1, 2, 3], "C": [1, np.nan, 3]})
        tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2}))
        tm.assert_series_equal(
            df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3})
        )
        tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2}))
        tm.assert_series_equal(
            df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2})
        )

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_mean_mixed_datetime_numeric(self, tz):
        # https://github.com/pandas-dev/pandas/issues/24752
        df = DataFrame({"A": [1, 1], "B": [Timestamp("2000", tz=tz)] * 2})
        result = df.mean()
        expected = Series([1.0, Timestamp("2000", tz=tz)], index=["A", "B"])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("tz", [None, "UTC"])
    def test_mean_includes_datetimes(self, tz):
        # https://github.com/pandas-dev/pandas/issues/24752
        # Behavior in 0.24.0rc1 was buggy.
        # As of 2.0 with numeric_only=None we do *not* drop datetime columns
        df = DataFrame({"A": [Timestamp("2000", tz=tz)] * 2})
        result = df.mean()

        expected = Series([Timestamp("2000", tz=tz)], index=["A"])
        tm.assert_series_equal(result, expected)

    def test_mean_mixed_string_decimal(self):
        # GH 11670
        # possible bug when calculating mean of DataFrame?
        d = [
            {"A": 2, "B": None, "C": Decimal("628.00")},
            {"A": 1, "B": None, "C": Decimal("383.00")},
            {"A": 3, "B": None, "C": Decimal("651.00")},
            {"A": 2, "B": None, "C": Decimal("575.00")},
            {"A": 4, "B": None, "C": Decimal("1114.00")},
            {"A": 1, "B": "TEST", "C": Decimal("241.00")},
            {"A": 2, "B": None, "C": Decimal("572.00")},
            {"A": 4, "B": None, "C": Decimal("609.00")},
            {"A": 3, "B": None, "C": Decimal("820.00")},
            {"A": 5, "B": None, "C": Decimal("1223.00")},
        ]

        df = DataFrame(d)

        with pytest.raises(TypeError, match="unsupported operand type"):
            df.mean()
        result = df[["A", "C"]].mean()
        expected = Series([2.7, 681.6], index=["A", "C"], dtype=object)
        tm.assert_series_equal(result, expected)

    def test_var_std(self, datetime_frame):
        result = datetime_frame.std(ddof=4)
        expected = datetime_frame.apply(lambda x: x.std(ddof=4))
        tm.assert_almost_equal(result, expected)

        result = datetime_frame.var(ddof=4)
        expected = datetime_frame.apply(lambda x: x.var(ddof=4))
        tm.assert_almost_equal(result, expected)

        arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
        result = nanops.nanvar(arr, axis=0)
        assert not (result < 0).any()

        with pd.option_context("use_bottleneck", False):
            result = nanops.nanvar(arr, axis=0)
            assert not (result < 0).any()

    @pytest.mark.parametrize("meth", ["sem", "var", "std"])
    def test_numeric_only_flag(self, meth):
        # GH 9201
        df1 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"])
        # Cast to object to avoid implicit cast when setting entry to "100" below
        df1 = df1.astype({"foo": object})
        # set one entry to a number in str format
        df1.loc[0, "foo"] = "100"

        df2 = DataFrame(np.random.randn(5, 3), columns=["foo", "bar", "baz"])
        # Cast to object to avoid implicit cast when setting entry to "a" below
        df2 = df2.astype({"foo": object})
        # set one entry to a non-number str
        df2.loc[0, "foo"] = "a"

        result = getattr(df1, meth)(axis=1, numeric_only=True)
        expected = getattr(df1[["bar", "baz"]], meth)(axis=1)
        tm.assert_series_equal(expected, result)

        result = getattr(df2, meth)(axis=1, numeric_only=True)
        expected = getattr(df2[["bar", "baz"]], meth)(axis=1)
        tm.assert_series_equal(expected, result)

        # df1 has all numbers, df2 has a letter inside
        msg = r"unsupported operand type\(s\) for -: 'float' and 'str'"
        with pytest.raises(TypeError, match=msg):
            getattr(df1, meth)(axis=1, numeric_only=False)
        msg = "could not convert string to float: 'a'"
        with pytest.raises(TypeError, match=msg):
            getattr(df2, meth)(axis=1, numeric_only=False)

    def test_sem(self, datetime_frame):
        result = datetime_frame.sem(ddof=4)
        expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x)))
        tm.assert_almost_equal(result, expected)

        arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
        result = nanops.nansem(arr, axis=0)
        assert not (result < 0).any()

        with pd.option_context("use_bottleneck", False):
            result = nanops.nansem(arr, axis=0)
            assert not (result < 0).any()
    @pytest.mark.parametrize(
        "dropna, expected",
        [
            (
                True,
                {
                    "A": [12],
                    "B": [10.0],
                    "C": [1.0],
                    "D": ["a"],
                    "E": Categorical(["a"], categories=["a"]),
                    "F": to_datetime(["2000-1-2"]),
                    "G": to_timedelta(["1 days"]),
                },
            ),
            (
                False,
                {
                    "A": [12],
                    "B": [10.0],
                    "C": [np.nan],
                    "D": np.array([np.nan], dtype=object),
                    "E": Categorical([np.nan], categories=["a"]),
                    "F": [pd.NaT],
                    "G": to_timedelta([pd.NaT]),
                },
            ),
            (
                True,
                {
                    "H": [8, 9, np.nan, np.nan],
                    "I": [8, 9, np.nan, np.nan],
                    "J": [1, np.nan, np.nan, np.nan],
                    "K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]),
                    "L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]),
                    "M": to_timedelta(["1 days", "nan", "nan", "nan"]),
                    "N": [0, 1, 2, 3],
                },
            ),
            (
                False,
                {
                    "H": [8, 9, np.nan, np.nan],
                    "I": [8, 9, np.nan, np.nan],
                    "J": [1, np.nan, np.nan, np.nan],
                    "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]),
                    "L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]),
                    "M": to_timedelta(["nan", "1 days", "nan", "nan"]),
                    "N": [0, 1, 2, 3],
                },
            ),
        ],
    )
    def test_mode_dropna(self, dropna, expected):
        df = DataFrame(
            {
                "A": [12, 12, 19, 11],
                "B": [10, 10, np.nan, 3],
                "C": [1, np.nan, np.nan, np.nan],
                "D": [np.nan, np.nan, "a", np.nan],
                "E": Categorical([np.nan, np.nan, "a", np.nan]),
                "F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]),
                "G": to_timedelta(["1 days", "nan", "nan", "nan"]),
                "H": [8, 8, 9, 9],
                "I": [9, 9, 8, 8],
                "J": [1, 1, np.nan, np.nan],
                "K": Categorical(["a", np.nan, "a", np.nan]),
                "L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]),
                "M": to_timedelta(["1 days", "nan", "1 days", "nan"]),
                "N": np.arange(4, dtype="int64"),
            }
        )

        result = df[sorted(expected.keys())].mode(dropna=dropna)
        expected = DataFrame(expected)
        tm.assert_frame_equal(result, expected)

    def test_mode_sortwarning(self):
        # Check for the warning that is raised when the mode
        # results cannot be sorted
        df = DataFrame({"A": [np.nan, np.nan, "a", "a"]})
        expected = DataFrame({"A": ["a", np.nan]})

        with tm.assert_produces_warning(UserWarning):
            result = df.mode(dropna=False)
            result = result.sort_values(by="A").reset_index(drop=True)

        tm.assert_frame_equal(result, expected)

    def test_mode_empty_df(self):
        df = DataFrame([], columns=["a", "b"])
        result = df.mode()
        expected = DataFrame([], columns=["a", "b"], index=Index([], dtype=np.int64))
        tm.assert_frame_equal(result, expected)
    def test_operators_timedelta64(self):
        df = DataFrame(
            {
                "A": date_range("2012-1-1", periods=3, freq="D"),
                "B": date_range("2012-1-2", periods=3, freq="D"),
                "C": Timestamp("20120101") - timedelta(minutes=5, seconds=5),
            }
        )

        diffs = DataFrame({"A": df["A"] - df["C"], "B": df["A"] - df["B"]})

        # min
        result = diffs.min()
        assert result[0] == diffs.loc[0, "A"]
        assert result[1] == diffs.loc[0, "B"]

        result = diffs.min(axis=1)
        assert (result == diffs.loc[0, "B"]).all()

        # max
        result = diffs.max()
        assert result[0] == diffs.loc[2, "A"]
        assert result[1] == diffs.loc[2, "B"]

        result = diffs.max(axis=1)
        assert (result == diffs["A"]).all()

        # abs
        result = diffs.abs()
        result2 = abs(diffs)
        expected = DataFrame({"A": df["A"] - df["C"], "B": df["B"] - df["A"]})
        tm.assert_frame_equal(result, expected)
        tm.assert_frame_equal(result2, expected)

        # mixed frame
        mixed = diffs.copy()
        mixed["C"] = "foo"
        mixed["D"] = 1
        mixed["E"] = 1.0
        mixed["F"] = Timestamp("20130101")

        # results in an object array
        result = mixed.min()
        expected = Series(
            [
                pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
                pd.Timedelta(timedelta(days=-1)),
                "foo",
                1,
                1.0,
                Timestamp("20130101"),
            ],
            index=mixed.columns,
        )
        tm.assert_series_equal(result, expected)

        # excludes non-numeric
        result = mixed.min(axis=1, numeric_only=True)
        expected = Series([1, 1, 1.0], index=[0, 1, 2])
        tm.assert_series_equal(result, expected)

        # works when only those columns are selected
        result = mixed[["A", "B"]].min(1)
        expected = Series([timedelta(days=-1)] * 3)
        tm.assert_series_equal(result, expected)

        result = mixed[["A", "B"]].min()
        expected = Series(
            [timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=["A", "B"]
        )
        tm.assert_series_equal(result, expected)

        # GH 3106
        df = DataFrame(
            {
                "time": date_range("20130102", periods=5),
                "time2": date_range("20130105", periods=5),
            }
        )
        df["off1"] = df["time2"] - df["time"]
        assert df["off1"].dtype == "timedelta64[ns]"

        df["off2"] = df["time"] - df["time2"]
        df._consolidate_inplace()
        assert df["off1"].dtype == "timedelta64[ns]"
        assert df["off2"].dtype == "timedelta64[ns]"

    def test_std_timedelta64_skipna_false(self):
        # GH#37392
        tdi = pd.timedelta_range("1 Day", periods=10)
        df = DataFrame({"A": tdi, "B": tdi}, copy=True)
        df.iloc[-2, -1] = pd.NaT

        result = df.std(skipna=False)
        expected = Series(
            [df["A"].std(), pd.NaT], index=["A", "B"], dtype="timedelta64[ns]"
        )
        tm.assert_series_equal(result, expected)

        result = df.std(axis=1, skipna=False)
        expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]]
    )
    def test_std_datetime64_with_nat(
        self, values, skipna, using_array_manager, request
    ):
        # GH#51335
        if using_array_manager and (
            not skipna or all(value is pd.NaT for value in values)
        ):
            mark = pytest.mark.xfail(
                reason="GH#51446: Incorrect type inference on NaT in reduction result"
            )
            request.node.add_marker(mark)
        df = DataFrame({"a": to_datetime(values)})
        result = df.std(skipna=skipna)
        if not skipna or all(value is pd.NaT for value in values):
            expected = Series({"a": pd.NaT}, dtype="timedelta64[ns]")
        else:
            # 86400000000000ns == 1 day
            expected = Series({"a": 86400000000000}, dtype="timedelta64[ns]")
        tm.assert_series_equal(result, expected)
    def test_sum_corner(self):
        empty_frame = DataFrame()

        axis0 = empty_frame.sum(0)
        axis1 = empty_frame.sum(1)
        assert isinstance(axis0, Series)
        assert isinstance(axis1, Series)
        assert len(axis0) == 0
        assert len(axis1) == 0

    @pytest.mark.parametrize(
        "index",
        [
            tm.makeRangeIndex(0),
            tm.makeDateIndex(0),
            tm.makeNumericIndex(0, dtype=int),
            tm.makeNumericIndex(0, dtype=float),
            tm.makeDateIndex(0, freq="M"),
            tm.makePeriodIndex(0),
        ],
    )
    def test_axis_1_empty(self, all_reductions, index, using_array_manager):
        df = DataFrame(columns=["a"], index=index)
        result = getattr(df, all_reductions)(axis=1)
        if all_reductions in ("any", "all"):
            expected_dtype = "bool"
        elif all_reductions == "count":
            expected_dtype = "int64"
        else:
            expected_dtype = "object"
        expected = Series([], index=index, dtype=expected_dtype)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
    @pytest.mark.parametrize("numeric_only", [None, True, False])
    def test_sum_prod_nanops(self, method, unit, numeric_only):
        idx = ["a", "b", "c"]
        df = DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]})
        # The default
        result = getattr(df, method)(numeric_only=numeric_only)
        expected = Series([unit, unit, unit], index=idx, dtype="float64")
        tm.assert_series_equal(result, expected)

        # min_count=1
        result = getattr(df, method)(numeric_only=numeric_only, min_count=1)
        expected = Series([unit, unit, np.nan], index=idx)
        tm.assert_series_equal(result, expected)

        # min_count=0
        result = getattr(df, method)(numeric_only=numeric_only, min_count=0)
        expected = Series([unit, unit, unit], index=idx, dtype="float64")
        tm.assert_series_equal(result, expected)

        result = getattr(df.iloc[1:], method)(numeric_only=numeric_only, min_count=1)
        expected = Series([unit, np.nan, np.nan], index=idx)
        tm.assert_series_equal(result, expected)

        # min_count > 1
        df = DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})
        result = getattr(df, method)(numeric_only=numeric_only, min_count=5)
        expected = Series(result, index=["A", "B"])
        tm.assert_series_equal(result, expected)

        result = getattr(df, method)(numeric_only=numeric_only, min_count=6)
        expected = Series(result, index=["A", "B"])
        tm.assert_series_equal(result, expected)

    def test_sum_nanops_timedelta(self):
        # prod isn't defined on timedeltas
        idx = ["a", "b", "c"]
        df = DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]})

        df2 = df.apply(to_timedelta)

        # 0 by default
        result = df2.sum()
        expected = Series([0, 0, 0], dtype="m8[ns]", index=idx)
        tm.assert_series_equal(result, expected)

        # min_count=0
        result = df2.sum(min_count=0)
        tm.assert_series_equal(result, expected)

        # min_count=1
        result = df2.sum(min_count=1)
        expected = Series([0, 0, np.nan], dtype="m8[ns]", index=idx)
        tm.assert_series_equal(result, expected)

    def test_sum_nanops_min_count(self):
        # https://github.com/pandas-dev/pandas/issues/39738
        df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
        result = df.sum(min_count=10)
        expected = Series([np.nan, np.nan], index=["x", "y"])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("float_type", ["float16", "float32", "float64"])
    @pytest.mark.parametrize(
        "kwargs, expected_result",
        [
            ({"axis": 1, "min_count": 2}, [3.2, 5.3, np.NaN]),
            ({"axis": 1, "min_count": 3}, [np.NaN, np.NaN, np.NaN]),
            ({"axis": 1, "skipna": False}, [3.2, 5.3, np.NaN]),
        ],
    )
    def test_sum_nanops_dtype_min_count(self, float_type, kwargs, expected_result):
        # GH#46947
        df = DataFrame({"a": [1.0, 2.3, 4.4], "b": [2.2, 3, np.nan]}, dtype=float_type)
        result = df.sum(**kwargs)
        expected = Series(expected_result).astype(float_type)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("float_type", ["float16", "float32", "float64"])
    @pytest.mark.parametrize(
        "kwargs, expected_result",
        [
            ({"axis": 1, "min_count": 2}, [2.0, 4.0, np.NaN]),
            ({"axis": 1, "min_count": 3}, [np.NaN, np.NaN, np.NaN]),
            ({"axis": 1, "skipna": False}, [2.0, 4.0, np.NaN]),
        ],
    )
    def test_prod_nanops_dtype_min_count(self, float_type, kwargs, expected_result):
        # GH#46947
        df = DataFrame(
            {"a": [1.0, 2.0, 4.4], "b": [2.0, 2.0, np.nan]}, dtype=float_type
        )
        result = df.prod(**kwargs)
        expected = Series(expected_result).astype(float_type)
        tm.assert_series_equal(result, expected)
    def test_sum_object(self, float_frame):
        values = float_frame.values.astype(int)
        frame = DataFrame(values, index=float_frame.index, columns=float_frame.columns)
        deltas = frame * timedelta(1)
        deltas.sum()

    def test_sum_bool(self, float_frame):
        # ensure this works, bug report
        bools = np.isnan(float_frame)
        bools.sum(1)
        bools.sum(0)

    def test_sum_mixed_datetime(self):
        # GH#30886
        df = DataFrame({"A": date_range("2000", periods=4), "B": [1, 2, 3, 4]}).reindex(
            [2, 3, 4]
        )
        with pytest.raises(TypeError, match="does not support reduction 'sum'"):
            df.sum()

    def test_mean_corner(self, float_frame, float_string_frame):
        # unit test when have object data
        with pytest.raises(TypeError, match="Could not convert"):
            float_string_frame.mean(axis=0)

        # xs sum mixed type, just want to know it works...
        with pytest.raises(TypeError, match="unsupported operand type"):
            float_string_frame.mean(axis=1)

        # take mean of boolean column
        float_frame["bool"] = float_frame["A"] > 0
        means = float_frame.mean(0)
        assert means["bool"] == float_frame["bool"].values.mean()

    def test_mean_datetimelike(self):
        # GH#24757 check that datetimelike are excluded by default, handled
        # correctly with numeric_only=True
        # As of 2.0, datetimelike are *not* excluded with numeric_only=None
        df = DataFrame(
            {
                "A": np.arange(3),
                "B": date_range("2016-01-01", periods=3),
                "C": pd.timedelta_range("1D", periods=3),
                "D": pd.period_range("2016", periods=3, freq="A"),
            }
        )
        result = df.mean(numeric_only=True)
        expected = Series({"A": 1.0})
        tm.assert_series_equal(result, expected)

        with pytest.raises(TypeError, match="mean is not implemented for PeriodArray"):
            df.mean()

    def test_mean_datetimelike_numeric_only_false(self):
        df = DataFrame(
            {
                "A": np.arange(3),
                "B": date_range("2016-01-01", periods=3),
                "C": pd.timedelta_range("1D", periods=3),
            }
        )

        # datetime(tz) and timedelta work
        result = df.mean(numeric_only=False)
        expected = Series({"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"]})
        tm.assert_series_equal(result, expected)

        # mean of period is not allowed
        df["D"] = pd.period_range("2016", periods=3, freq="A")

        with pytest.raises(TypeError, match="mean is not implemented for Period"):
            df.mean(numeric_only=False)

    def test_mean_extensionarray_numeric_only_true(self):
        # https://github.com/pandas-dev/pandas/issues/33256
        arr = np.random.randint(1000, size=(10, 5))
        df = DataFrame(arr, dtype="Int64")
        result = df.mean(numeric_only=True)
        expected = DataFrame(arr).mean()
        tm.assert_series_equal(result, expected)

    def test_stats_mixed_type(self, float_string_frame):
        with pytest.raises(TypeError, match="could not convert"):
            float_string_frame.std(1)
        with pytest.raises(TypeError, match="could not convert"):
            float_string_frame.var(1)
        with pytest.raises(TypeError, match="unsupported operand type"):
            float_string_frame.mean(1)
        with pytest.raises(TypeError, match="could not convert"):
            float_string_frame.skew(1)

    def test_sum_bools(self):
        df = DataFrame(index=range(1), columns=range(10))
        bools = isna(df)
        assert bools.sum(axis=1)[0] == 10
    # ----------------------------------------------------------------------
    # Index of max / min

    @pytest.mark.parametrize("skipna", [True, False])
    @pytest.mark.parametrize("axis", [0, 1])
    def test_idxmin(self, float_frame, int_frame, skipna, axis):
        frame = float_frame
        frame.iloc[5:10] = np.nan
        frame.iloc[15:20, -2:] = np.nan
        for df in [frame, int_frame]:
            result = df.idxmin(axis=axis, skipna=skipna)
            expected = df.apply(Series.idxmin, axis=axis, skipna=skipna)
            tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_idxmin_numeric_only(self, numeric_only):
        df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
        if numeric_only:
            result = df.idxmin(numeric_only=numeric_only)
            expected = Series([2, 1], index=["a", "b"])
            tm.assert_series_equal(result, expected)
        else:
            with pytest.raises(TypeError, match="not allowed for this dtype"):
                df.idxmin(numeric_only=numeric_only)

    def test_idxmin_axis_2(self, float_frame):
        frame = float_frame
        msg = "No axis named 2 for object type DataFrame"
        with pytest.raises(ValueError, match=msg):
            frame.idxmin(axis=2)

    @pytest.mark.parametrize("skipna", [True, False])
    @pytest.mark.parametrize("axis", [0, 1])
    def test_idxmax(self, float_frame, int_frame, skipna, axis):
        frame = float_frame
        frame.iloc[5:10] = np.nan
        frame.iloc[15:20, -2:] = np.nan
        for df in [frame, int_frame]:
            result = df.idxmax(axis=axis, skipna=skipna)
            expected = df.apply(Series.idxmax, axis=axis, skipna=skipna)
            tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("numeric_only", [True, False])
    def test_idxmax_numeric_only(self, numeric_only):
        df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
        if numeric_only:
            result = df.idxmax(numeric_only=numeric_only)
            expected = Series([1, 0], index=["a", "b"])
            tm.assert_series_equal(result, expected)
        else:
            with pytest.raises(TypeError, match="not allowed for this dtype"):
                df.idxmax(numeric_only=numeric_only)
    def test_idxmax_axis_2(self, float_frame):
        frame = float_frame
        msg = "No axis named 2 for object type DataFrame"
        with pytest.raises(ValueError, match=msg):
            frame.idxmax(axis=2)

    def test_idxmax_mixed_dtype(self):
        # don't cast to object, which would raise in nanops
        dti = date_range("2016-01-01", periods=3)

        # Copying dti is needed for ArrayManager otherwise when we set
        # df.loc[0, 3] = pd.NaT below it edits dti
        df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti.copy(deep=True)})

        result = df.idxmax()
        expected = Series([1, 0, 2], index=[1, 2, 3])
        tm.assert_series_equal(result, expected)

        result = df.idxmin()
        expected = Series([0, 2, 0], index=[1, 2, 3])
        tm.assert_series_equal(result, expected)

        # with NaTs
        df.loc[0, 3] = pd.NaT
        result = df.idxmax()
        expected = Series([1, 0, 2], index=[1, 2, 3])
        tm.assert_series_equal(result, expected)

        result = df.idxmin()
        expected = Series([0, 2, 1], index=[1, 2, 3])
        tm.assert_series_equal(result, expected)

        # with multi-column dt64 block
        df[4] = dti[::-1]
        df._consolidate_inplace()

        result = df.idxmax()
        expected = Series([1, 0, 2, 0], index=[1, 2, 3, 4])
        tm.assert_series_equal(result, expected)

        result = df.idxmin()
        expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4])
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize(
        "op, expected_value",
        [("idxmax", [0, 4]), ("idxmin", [0, 5])],
    )
    def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):
        # GH 40346
        df = DataFrame(
            {
                "ID": [100, 100, 100, 200, 200, 200],
                "value": [0, 0, 0, 1, 2, 0],
            },
            dtype="Int64",
        )
        df = df.groupby("ID")

        result = getattr(df, op)()
        expected = DataFrame(
            {"value": expected_value},
            index=Index([100, 200], name="ID", dtype="Int64"),
        )
        tm.assert_frame_equal(result, expected)

    def test_idxmax_dt64_multicolumn_axis1(self):
        dti = date_range("2016-01-01", periods=3)
        df = DataFrame({3: dti, 4: dti[::-1]}, copy=True)
        df.iloc[0, 0] = pd.NaT

        df._consolidate_inplace()

        result = df.idxmax(axis=1)
        expected = Series([4, 3, 3])
        tm.assert_series_equal(result, expected)

        result = df.idxmin(axis=1)
        expected = Series([4, 3, 4])
        tm.assert_series_equal(result, expected)

    # ----------------------------------------------------------------------
    # Logical reductions

    @pytest.mark.parametrize("opname", ["any", "all"])
    @pytest.mark.parametrize("axis", [0, 1])
    @pytest.mark.parametrize("bool_only", [False, True])
    def test_any_all_mixed_float(self, opname, axis, bool_only, float_string_frame):
        # make sure op works on mixed-type frame
        mixed = float_string_frame
        mixed["_bool_"] = np.random.randn(len(mixed)) > 0.5

        getattr(mixed, opname)(axis=axis, bool_only=bool_only)

    @pytest.mark.parametrize("opname", ["any", "all"])
    @pytest.mark.parametrize("axis", [0, 1])
    def test_any_all_bool_with_na(self, opname, axis, bool_frame_with_na):
        getattr(bool_frame_with_na, opname)(axis=axis, bool_only=False)

    @pytest.mark.parametrize("opname", ["any", "all"])
    def test_any_all_bool_frame(self, opname, bool_frame_with_na):
        # GH#12863: numpy gives back non-boolean data for object type
        # so fill NaNs to compare with pandas behavior
        frame = bool_frame_with_na.fillna(True)
        alternative = getattr(np, opname)
        f = getattr(frame, opname)

        def skipna_wrapper(x):
            nona = x.dropna().values
            return alternative(nona)

        def wrapper(x):
            return alternative(x.values)

        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)

        tm.assert_series_equal(result0, frame.apply(wrapper))
        tm.assert_series_equal(result1, frame.apply(wrapper, axis=1))

        result0 = f(axis=0)
        result1 = f(axis=1)

        tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
        tm.assert_series_equal(
            result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False
        )

        # bad axis
        with pytest.raises(ValueError, match="No axis named 2"):
            f(axis=2)

        # all NA case
        all_na = frame * np.NaN
        r0 = getattr(all_na, opname)(axis=0)
        r1 = getattr(all_na, opname)(axis=1)
        if opname == "any":
            assert not r0.any()
            assert not r1.any()
        else:
            assert r0.all()
            assert r1.all()
    def test_any_all_extra(self):
        df = DataFrame(
            {
                "A": [True, False, False],
                "B": [True, True, False],
                "C": [True, True, True],
            },
            index=["a", "b", "c"],
        )
        result = df[["A", "B"]].any(axis=1)
        expected = Series([True, True, False], index=["a", "b", "c"])
        tm.assert_series_equal(result, expected)

        result = df[["A", "B"]].any(axis=1, bool_only=True)
        tm.assert_series_equal(result, expected)

        result = df.all(1)
        expected = Series([True, False, False], index=["a", "b", "c"])
        tm.assert_series_equal(result, expected)

        result = df.all(1, bool_only=True)
        tm.assert_series_equal(result, expected)

        # Axis is None
        result = df.all(axis=None).item()
        assert result is False

        result = df.any(axis=None).item()
        assert result is True

        result = df[["C"]].all(axis=None).item()
        assert result is True

    @pytest.mark.parametrize("axis", [0, 1])
    @pytest.mark.parametrize("bool_agg_func", ["any", "all"])
    @pytest.mark.parametrize("skipna", [True, False])
    def test_any_all_object_dtype(self, axis, bool_agg_func, skipna):
        # GH#35450
        df = DataFrame(
            data=[
                [1, np.nan, np.nan, True],
                [np.nan, 2, np.nan, True],
                [np.nan, np.nan, np.nan, True],
                [np.nan, np.nan, "5", np.nan],
            ]
        )
        result = getattr(df, bool_agg_func)(axis=axis, skipna=skipna)
        expected = Series([True, True, True, True])
        tm.assert_series_equal(result, expected)

    # GH#50947 deprecates this but it is not emitting a warning in some builds.
    @pytest.mark.filterwarnings(
        "ignore:'any' with datetime64 dtypes is deprecated.*:FutureWarning"
    )
    def test_any_datetime(self):
        # GH 23070
        float_data = [1, np.nan, 3, np.nan]
        datetime_data = [
            Timestamp("1960-02-15"),
            Timestamp("1960-02-16"),
            pd.NaT,
            pd.NaT,
        ]
        df = DataFrame({"A": float_data, "B": datetime_data})

        result = df.any(axis=1)
        expected = Series([True, True, True, False])
        tm.assert_series_equal(result, expected)

    def test_any_all_bool_only(self):
        # GH 25101
        df = DataFrame(
            {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]}
        )

        result = df.all(bool_only=True)
        expected = Series(dtype=np.bool_, index=[])
        tm.assert_series_equal(result, expected)

        df = DataFrame(
            {
                "col1": [1, 2, 3],
                "col2": [4, 5, 6],
                "col3": [None, None, None],
                "col4": [False, False, True],
            }
        )
        result = df.all(bool_only=True)
        expected = Series({"col4": False})
        tm.assert_series_equal(result, expected)
    @pytest.mark.parametrize(
        "func, data, expected",
        [
            (np.any, {}, False),
            (np.all, {}, True),
            (np.any, {"A": []}, False),
            (np.all, {"A": []}, True),
            (np.any, {"A": [False, False]}, False),
            (np.all, {"A": [False, False]}, False),
            (np.any, {"A": [True, False]}, True),
            (np.all, {"A": [True, False]}, False),
            (np.any, {"A": [True, True]}, True),
            (np.all, {"A": [True, True]}, True),
            (np.any, {"A": [False], "B": [False]}, False),
            (np.all, {"A": [False], "B": [False]}, False),
            (np.any, {"A": [False, False], "B": [False, True]}, True),
            (np.all, {"A": [False, False], "B": [False, True]}, False),
            # other types
            (np.all, {"A": Series([0.0, 1.0], dtype="float")}, False),
            (np.any, {"A": Series([0.0, 1.0], dtype="float")}, True),
            (np.all, {"A": Series([0, 1], dtype=int)}, False),
            (np.any, {"A": Series([0, 1], dtype=int)}, True),
            pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns]")}, False),
            pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, False),
            pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns]")}, True),
            pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, True),
            pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns]")}, True),
            pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True),
            pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns]")}, True),
            pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True),
            pytest.param(np.all, {"A": Series([0, 1], dtype="m8[ns]")}, False),
            pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True),
            pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True),
            pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True),
            # np.all on Categorical raises, so the reduction drops the
            # column, so all is being done on an empty Series, so is True
            (np.all, {"A": Series([0, 1], dtype="category")}, True),
            (np.any, {"A": Series([0, 1], dtype="category")}, False),
            (np.all, {"A": Series([1, 2], dtype="category")}, True),
            (np.any, {"A": Series([1, 2], dtype="category")}, False),
            # Mix GH#21484
            pytest.param(
                np.all,
                {
                    "A": Series([10, 20], dtype="M8[ns]"),
                    "B": Series([10, 20], dtype="m8[ns]"),
                },
                True,
            ),
        ],
    )
    def test_any_all_np_func(self, func, data, expected):
        # GH 19976
        data = DataFrame(data)

        if any(is_categorical_dtype(x) for x in data.dtypes):
            with pytest.raises(
                TypeError, match="dtype category does not support reduction"
            ):
                func(data)

            # method version
            with pytest.raises(
                TypeError, match="dtype category does not support reduction"
            ):
                getattr(DataFrame(data), func.__name__)(axis=None)
        else:
            msg = "'(any|all)' with datetime64 dtypes is deprecated"
            if data.dtypes.apply(lambda x: x.kind == "M").any():
                warn = FutureWarning
            else:
                warn = None

            with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
                # GH#34479
                result = func(data)
            assert isinstance(result, np.bool_)
            assert result.item() is expected

            # method version
            with tm.assert_produces_warning(warn, match=msg):
                # GH#34479
                result = getattr(DataFrame(data), func.__name__)(axis=None)
            assert isinstance(result, np.bool_)
            assert result.item() is expected
    def test_any_all_object(self):
        # GH 19976
        result = np.all(DataFrame(columns=["a", "b"])).item()
        assert result is True

        result = np.any(DataFrame(columns=["a", "b"])).item()
        assert result is False

    def test_any_all_object_bool_only(self):
        df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object)
        df._consolidate_inplace()
        df["C"] = Series([True, True])

        # Categorical of bools is _not_ considered booly
        df["D"] = df["C"].astype("category")

        # The underlying bug is in DataFrame._get_bool_data, so we check
        # that while we're here
        res = df._get_bool_data()
        expected = df[["C"]]
        tm.assert_frame_equal(res, expected)

        res = df.all(bool_only=True, axis=0)
        expected = Series([True], index=["C"])
        tm.assert_series_equal(res, expected)

        # operating on a subset of columns should not produce a _larger_ Series
        res = df[["B", "C"]].all(bool_only=True, axis=0)
        tm.assert_series_equal(res, expected)

        assert df.all(bool_only=True, axis=None)

        res = df.any(bool_only=True, axis=0)
        expected = Series([True], index=["C"])
        tm.assert_series_equal(res, expected)

        # operating on a subset of columns should not produce a _larger_ Series
        res = df[["C"]].any(bool_only=True, axis=0)
        tm.assert_series_equal(res, expected)

        assert df.any(bool_only=True, axis=None)

    # ---------------------------------------------------------------------
    # Unsorted

    def test_series_broadcasting(self):
        # smoke test for numpy warnings
        # GH 16378, GH 16306
        df = DataFrame([1.0, 1.0, 1.0])
        df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]})
        s = Series([1, 1, 1])
        s_nan = Series([np.nan, np.nan, 1])

        with tm.assert_produces_warning(None):
            df_nan.clip(lower=s, axis=0)
            for op in ["lt", "le", "gt", "ge", "eq", "ne"]:
                getattr(df, op)(s_nan, axis=0)


class TestDataFrameReductions:
    def test_min_max_dt64_with_NaT(self):
        # Both NaT and Timestamp are in DataFrame.
        df = DataFrame({"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01")]})

        res = df.min()
        exp = Series([Timestamp("2012-05-01")], index=["foo"])
        tm.assert_series_equal(res, exp)

        res = df.max()
        exp = Series([Timestamp("2012-05-01")], index=["foo"])
        tm.assert_series_equal(res, exp)

        # GH12941, only NaTs are in DataFrame.
        df = DataFrame({"foo": [pd.NaT, pd.NaT]})

        res = df.min()
        exp = Series([pd.NaT], index=["foo"])
        tm.assert_series_equal(res, exp)

        res = df.max()
        exp = Series([pd.NaT], index=["foo"])
        tm.assert_series_equal(res, exp)

    def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture):
        # GH#36907
        tz = tz_naive_fixture
        if isinstance(tz, tzlocal) and is_platform_windows():
            pytest.skip(
                "GH#37659 OSError raised within tzlocal bc Windows "
                "chokes in times before 1970-01-01"
            )

        df = DataFrame(
            {
                "a": [
                    Timestamp("2020-01-01 08:00:00", tz=tz),
                    Timestamp("1920-02-01 09:00:00", tz=tz),
                ],
                "b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT],
            }
        )

        res = df.min(axis=1, skipna=False)
        expected = Series([df.loc[0, "a"], pd.NaT])
        assert expected.dtype == df["a"].dtype
        tm.assert_series_equal(res, expected)

        res = df.max(axis=1, skipna=False)
        expected = Series([df.loc[0, "b"], pd.NaT])
        assert expected.dtype == df["a"].dtype
        tm.assert_series_equal(res, expected)

    def test_min_max_dt64_api_consistency_with_NaT(self):
        # Calling the following sum functions returned an error for dataframes but
        # returned NaT for series. These tests check that the API is consistent in
        # min/max calls on empty Series/DataFrames. See GH:33704 for more
        # information
        df = DataFrame({"x": to_datetime([])})
        expected_dt_series = Series(to_datetime([]))

        # check axis 0
        assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT)
        assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT)

        # check axis 1
        tm.assert_series_equal(df.min(axis=1), expected_dt_series)
        tm.assert_series_equal(df.max(axis=1), expected_dt_series)
    def test_min_max_dt64_api_consistency_empty_df(self):
        # check DataFrame/Series api consistency when calling min/max on an empty
        # DataFrame/Series.
        df = DataFrame({"x": []})
        expected_float_series = Series([], dtype=float)

        # check axis 0
        assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min())
        assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max())

        # check axis 1
        tm.assert_series_equal(df.min(axis=1), expected_float_series)
        tm.assert_series_equal(df.max(axis=1), expected_float_series)
  1250. @pytest.mark.parametrize(
  1251. "initial",
  1252. ["2018-10-08 13:36:45+00:00", "2018-10-08 13:36:45+03:00"], # Non-UTC timezone
  1253. )
  1254. @pytest.mark.parametrize("method", ["min", "max"])
  1255. def test_preserve_timezone(self, initial: str, method):
  1256. # GH 28552
  1257. initial_dt = to_datetime(initial)
  1258. expected = Series([initial_dt])
  1259. df = DataFrame([expected])
  1260. result = getattr(df, method)(axis=1)
  1261. tm.assert_series_equal(result, expected)
  1262. @pytest.mark.parametrize("method", ["min", "max"])
  1263. def test_minmax_tzaware_skipna_axis_1(self, method, skipna):
  1264. # GH#51242
  1265. val = to_datetime("1900-01-01", utc=True)
  1266. df = DataFrame(
  1267. {"a": Series([pd.NaT, pd.NaT, val]), "b": Series([pd.NaT, val, val])}
  1268. )
  1269. op = getattr(df, method)
  1270. result = op(axis=1, skipna=skipna)
  1271. if skipna:
  1272. expected = Series([pd.NaT, val, val])
  1273. else:
  1274. expected = Series([pd.NaT, pd.NaT, val])
  1275. tm.assert_series_equal(result, expected)
  1276. def test_frame_any_with_timedelta(self):
  1277. # GH#17667
        df = DataFrame(
            {
                "a": Series([0, 0]),
                "t": Series([to_timedelta(0, "s"), to_timedelta(1, "ms")]),
            }
        )

        result = df.any(axis=0)
        expected = Series(data=[False, True], index=["a", "t"])
        tm.assert_series_equal(result, expected)

        result = df.any(axis=1)
        expected = Series(data=[False, True])
        tm.assert_series_equal(result, expected)

    def test_reductions_skipna_none_raises(
        self, request, frame_or_series, all_reductions
    ):
        if all_reductions == "count":
            request.node.add_marker(
                pytest.mark.xfail(reason="Count does not accept skipna")
            )
        obj = frame_or_series([1, 2, 3])
        msg = 'For argument "skipna" expected type bool, received type NoneType.'
        with pytest.raises(ValueError, match=msg):
            getattr(obj, all_reductions)(skipna=None)

    @td.skip_array_manager_invalid_test
    def test_reduction_timestamp_smallest_unit(self):
        # GH#52524
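        # Columns with mixed datetime64 resolutions ("s" and "ms") should be
        # reduced into the finer of the two units, so the expected result below
        # is datetime64[ms].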
        df = DataFrame(
            {
                "a": Series([Timestamp("2019-12-31")], dtype="datetime64[s]"),
                "b": Series(
                    [Timestamp("2019-12-31 00:00:00.123")], dtype="datetime64[ms]"
                ),
            }
        )
        result = df.max()
        expected = Series(
            [Timestamp("2019-12-31"), Timestamp("2019-12-31 00:00:00.123")],
            dtype="datetime64[ms]",
            index=["a", "b"],
        )
        tm.assert_series_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented
    def test_reduction_timedelta_smallest_unit(self):
        # GH#52524
        df = DataFrame(
            {
                "a": Series([pd.Timedelta("1 days")], dtype="timedelta64[s]"),
                "b": Series([pd.Timedelta("1 days")], dtype="timedelta64[ms]"),
            }
        )
        result = df.max()
        expected = Series(
            [pd.Timedelta("1 days"), pd.Timedelta("1 days")],
            dtype="timedelta64[ms]",
            index=["a", "b"],
        )
        tm.assert_series_equal(result, expected)


class TestNuisanceColumns:
    @pytest.mark.parametrize("method", ["any", "all"])
    def test_any_all_categorical_dtype_nuisance_column(self, method):
        # GH#36076 DataFrame should match Series behavior
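        # any()/all() is not a supported reduction for categorical dtype; the
        # DataFrame call should raise just like the Series call instead of
        # silently skipping the column as a nuisance column.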
        ser = Series([0, 1], dtype="category", name="A")
        df = ser.to_frame()

        # Double-check the Series behavior is to raise
        with pytest.raises(TypeError, match="does not support reduction"):
            getattr(ser, method)()

        with pytest.raises(TypeError, match="does not support reduction"):
            getattr(np, method)(ser)

        with pytest.raises(TypeError, match="does not support reduction"):
            getattr(df, method)(bool_only=False)

        with pytest.raises(TypeError, match="does not support reduction"):
            getattr(df, method)(bool_only=None)

        with pytest.raises(TypeError, match="does not support reduction"):
            getattr(np, method)(df, axis=0)

    def test_median_categorical_dtype_nuisance_column(self):
        # GH#21020 DataFrame.median should match Series.median
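        # median() is likewise unsupported for categorical data, both with and
        # without an additional numeric column in the frame.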
        df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])})
        ser = df["A"]

        # Double-check the Series behavior is to raise
        with pytest.raises(TypeError, match="does not support reduction"):
            ser.median()

        with pytest.raises(TypeError, match="does not support reduction"):
            df.median(numeric_only=False)

        with pytest.raises(TypeError, match="does not support reduction"):
            df.median()

        # same thing, but with an additional non-categorical column
        df["B"] = df["A"].astype(int)

        with pytest.raises(TypeError, match="does not support reduction"):
            df.median(numeric_only=False)

        with pytest.raises(TypeError, match="does not support reduction"):
            df.median()

        # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
        #  of expected.values

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
        # GH#28949 DataFrame.min should behave like Series.min
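        # min/max require an ordered categorical; on an unordered one both the
        # Series and the DataFrame should raise rather than drop the column.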
        cat = Categorical(["a", "b", "c", "b"], ordered=False)
        ser = Series(cat)
        df = ser.to_frame("A")

        # Double-check the Series behavior
        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(ser, method)()

        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(np, method)(ser)

        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(df, method)(numeric_only=False)

        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(df, method)()

        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(np, method)(df, axis=0)

        # same thing, but with an additional non-categorical column
        df["B"] = df["A"].astype(object)

        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(df, method)()

        with pytest.raises(TypeError, match="is not ordered for operation"):
            getattr(np, method)(df, axis=0)


def test_sum_timedelta64_skipna_false(using_array_manager, request):
    # GH#17235
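    # With skipna=False, a NaT in a column (or row) should propagate into the
    # corresponding sum instead of being treated as zero.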
    if using_array_manager:
        mark = pytest.mark.xfail(
            reason="Incorrect type inference on NaT in reduction result"
        )
        request.node.add_marker(mark)

    arr = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
    arr[-1, -1] = "Nat"

    df = DataFrame(arr)
    assert (df.dtypes == arr.dtype).all()

    result = df.sum(skipna=False)
    expected = Series([pd.Timedelta(seconds=12), pd.NaT], dtype="m8[s]")
    tm.assert_series_equal(result, expected)

    result = df.sum(axis=0, skipna=False)
    tm.assert_series_equal(result, expected)

    result = df.sum(axis=1, skipna=False)
    expected = Series(
        [
            pd.Timedelta(seconds=1),
            pd.Timedelta(seconds=5),
            pd.Timedelta(seconds=9),
            pd.NaT,
        ],
        dtype="m8[s]",
    )
    tm.assert_series_equal(result, expected)


def test_mixed_frame_with_integer_sum():
    # https://github.com/pandas-dev/pandas/issues/34520
    df = DataFrame([["a", 1]], columns=list("ab"))
    df = df.astype({"b": "Int64"})
    result = df.sum()
    expected = Series(["a", 1], index=["a", "b"])
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("numeric_only", [True, False, None])
@pytest.mark.parametrize("method", ["min", "max"])
def test_minmax_extensionarray(method, numeric_only):
    # https://github.com/pandas-dev/pandas/issues/32651
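    # min/max on a nullable Int64 column should skip the missing value and
    # return the int64 extreme, regardless of the numeric_only setting.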
    int64_info = np.iinfo("int64")
    ser = Series([int64_info.max, None, int64_info.min], dtype=pd.Int64Dtype())
    df = DataFrame({"Int64": ser})
    result = getattr(df, method)(numeric_only=numeric_only)
    expected = Series(
        [getattr(int64_info, method)], index=Index(["Int64"], dtype="object")
    )
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("ts_value", [Timestamp("2000-01-01"), pd.NaT])
def test_frame_mixed_numeric_object_with_timestamp(ts_value):
    # GH 13912
    df = DataFrame({"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]})
    with pytest.raises(TypeError, match="does not support reduction"):
        df.sum()


def test_prod_sum_min_count_mixed_object():
    # https://github.com/pandas-dev/pandas/issues/41074
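    # prod over the object column succeeds because 1 * "a" * True evaluates to
    # "a" in Python, whereas sum hits int + str and is expected to raise.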
    df = DataFrame([1, "a", True])

    result = df.prod(axis=0, min_count=1, numeric_only=False)
    expected = Series(["a"])
    tm.assert_series_equal(result, expected)

    msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'")
    with pytest.raises(TypeError, match=msg):
        df.sum(axis=0, min_count=1, numeric_only=False)


@pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_reduction_axis_none_returns_scalar(method, numeric_only):
    # GH#21597 As of 2.0, axis=None reduces over all axes.
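    # The scalar result is checked against the equivalent NumPy reduction over
    # the flattened array; for skew/kurt, scipy.stats is used with bias=False so
    # the comparison lines up with pandas' estimator.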
    df = DataFrame(np.random.randn(4, 4))
    result = getattr(df, method)(axis=None, numeric_only=numeric_only)

    np_arr = df.to_numpy()
    if method in {"skew", "kurt"}:
        comp_mod = pytest.importorskip("scipy.stats")
        if method == "kurt":
            method = "kurtosis"
        expected = getattr(comp_mod, method)(np_arr, bias=False, axis=None)
        tm.assert_almost_equal(result, expected)
    else:
        expected = getattr(np, method)(np_arr, axis=None)
        assert result == expected


@pytest.mark.parametrize(
    "kernel",
    [
        "corr",
        "corrwith",
        "cov",
        "idxmax",
        "idxmin",
        "kurt",
        "max",
        "mean",
        "median",
        "min",
        "prod",
        "quantile",
        "sem",
        "skew",
        "std",
        "sum",
        "var",
    ],
)
def test_fails_on_non_numeric(kernel):
    # GH#46852
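    # Every kernel should raise TypeError on the non-numeric object column "b";
    # the exact message differs per kernel, hence the alternation built into
    # `msg`.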
    df = DataFrame({"a": [1, 2, 3], "b": object})
    args = (df,) if kernel == "corrwith" else ()
    msg = "|".join(
        [
            "not allowed for this dtype",
            "argument must be a string or a number",
            "not supported between instances of",
            "unsupported operand type",
            "argument must be a string or a real number",
        ]
    )
    with pytest.raises(TypeError, match=msg):
        getattr(df, kernel)(*args)