12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764 |
- from datetime import timedelta
- from decimal import Decimal
- import re
- from dateutil.tz import tzlocal
- import numpy as np
- import pytest
- from pandas.compat import is_platform_windows
- import pandas.util._test_decorators as td
- from pandas.core.dtypes.common import is_categorical_dtype
- import pandas as pd
- from pandas import (
- Categorical,
- DataFrame,
- Index,
- Series,
- Timestamp,
- date_range,
- isna,
- notna,
- to_datetime,
- to_timedelta,
- )
- import pandas._testing as tm
- from pandas.core import (
- algorithms,
- nanops,
- )
def assert_stat_op_calc(
    opname,
    alternative,
    frame,
    has_skipna=True,
    check_dtype=True,
    check_dates=False,
    rtol=1e-5,
    atol=1e-8,
    skipna_alternative=None,
):
    """
    Check that operator opname works as advertised on frame

    Parameters
    ----------
    opname : str
        Name of the operator to test on frame
    alternative : function
        Function that opname is tested against; i.e. "frame.opname()" should
        equal "alternative(frame)".
    frame : DataFrame
        The object that the tests are executed on
    has_skipna : bool, default True
        Whether the method "opname" has the kwarg "skipna"
    check_dtype : bool, default True
        Whether the dtypes of the result of "frame.opname()" and
        "alternative(frame)" should be checked.
    check_dates : bool, default False
        Whether opname should be tested on a Datetime Series
    rtol : float, default 1e-5
        Relative tolerance.
    atol : float, default 1e-8
        Absolute tolerance.
    skipna_alternative : function, default None
        NaN-safe version of alternative
    """
    # Callers in this file pass "product" rather than "prod"; treat both
    # spellings alike so the extra sum/prod checks below actually run.
    is_prod = opname in ("prod", "product")
    is_sum_or_prod = opname == "sum" or is_prod

    f = getattr(frame, opname)

    if check_dates:
        df = DataFrame({"b": date_range("1/1/2001", periods=2)})
        with tm.assert_produces_warning(None):
            result = getattr(df, opname)()
        assert isinstance(result, Series)

        df["a"] = range(len(df))
        with tm.assert_produces_warning(None):
            result = getattr(df, opname)()
        assert isinstance(result, Series)
        assert len(result)

    if has_skipna:

        def wrapper(x):
            return alternative(x.values)

        skipna_wrapper = tm._make_skipna_wrapper(alternative, skipna_alternative)
        result0 = f(axis=0, skipna=False)
        result1 = f(axis=1, skipna=False)
        tm.assert_series_equal(
            result0, frame.apply(wrapper), check_dtype=check_dtype, rtol=rtol, atol=atol
        )
        tm.assert_series_equal(
            result1,
            frame.apply(wrapper, axis=1),
            rtol=rtol,
            atol=atol,
        )
    else:
        skipna_wrapper = alternative

    # default call (NaNs skipped when the operator supports skipna)
    result0 = f(axis=0)
    result1 = f(axis=1)
    tm.assert_series_equal(
        result0,
        frame.apply(skipna_wrapper),
        check_dtype=check_dtype,
        rtol=rtol,
        atol=atol,
    )

    if is_sum_or_prod:
        expected = frame.apply(skipna_wrapper, axis=1)
        tm.assert_series_equal(
            result1, expected, check_dtype=False, rtol=rtol, atol=atol
        )

    # check dtypes
    if check_dtype:
        lcd_dtype = frame.values.dtype
        assert lcd_dtype == result0.dtype
        assert lcd_dtype == result1.dtype

    # bad axis
    with pytest.raises(ValueError, match="No axis named 2"):
        f(axis=2)

    # all NA case
    if has_skipna:
        # np.nan, not np.NaN: the capitalised alias no longer exists in NumPy 2.0
        all_na = frame * np.nan
        r0 = getattr(all_na, opname)(axis=0)
        r1 = getattr(all_na, opname)(axis=1)
        if is_sum_or_prod:
            unit = 1 if is_prod else 0  # result for empty sum/prod
            expected = Series(unit, index=r0.index, dtype=r0.dtype)
            tm.assert_series_equal(r0, expected)
            expected = Series(unit, index=r1.index, dtype=r1.dtype)
            tm.assert_series_equal(r1, expected)
- class TestDataFrameAnalytics:
- # ---------------------------------------------------------------------
- # Reductions
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
    "opname",
    [
        "count",
        "sum",
        "mean",
        "product",
        "median",
        "min",
        "max",
        "nunique",
        "var",
        "std",
        "sem",
        pytest.param("skew", marks=td.skip_if_no_scipy),
        pytest.param("kurt", marks=td.skip_if_no_scipy),
    ],
)
def test_stat_op_api_float_string_frame(self, float_string_frame, axis, opname):
    """
    Call each reduction on ``float_string_frame``.

    ``count``/``nunique`` (any axis) and ``sum``/``min``/``max`` on axis 0 are
    expected to succeed; every other combination must raise ``TypeError``.
    All operators except ``nunique`` are then called with
    ``numeric_only=True``.
    """
    reduction = getattr(float_string_frame, opname)

    expected_to_work = opname in ("count", "nunique") or (
        axis == 0 and opname in ("sum", "min", "max")
    )
    if expected_to_work:
        reduction(axis=axis)
    else:
        patterns = [
            "Could not convert",
            "could not convert",
            "can't multiply sequence by non-int",
            "unsupported operand type",
            "not supported between instances of",
        ]
        with pytest.raises(TypeError, match="|".join(patterns)):
            reduction(axis=axis)

    if opname == "nunique":
        return
    reduction(axis=axis, numeric_only=True)
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize(
    "opname",
    [
        "count",
        "sum",
        "mean",
        "product",
        "median",
        "min",
        "max",
        "var",
        "std",
        "sem",
        pytest.param("skew", marks=td.skip_if_no_scipy),
        pytest.param("kurt", marks=td.skip_if_no_scipy),
    ],
)
def test_stat_op_api_float_frame(self, float_frame, axis, opname):
    """Smoke test: each reduction accepts ``axis`` and ``numeric_only=False``."""
    reduction = getattr(float_frame, opname)
    reduction(axis=axis, numeric_only=False)
def test_stat_op_calc(self, float_frame_with_na, mixed_float_frame):
    """
    Compare DataFrame reductions with reference implementations through
    ``assert_stat_op_calc``.
    """

    def n_valid(values):
        return notna(values).sum()

    def n_distinct(values):
        return len(algorithms.unique1d(values.dropna()))

    def sample_var(values):
        return np.var(values, ddof=1)

    def sample_std(values):
        return np.std(values, ddof=1)

    def std_error(values):
        return np.std(values, ddof=1) / np.sqrt(len(values))

    # keyword set shared by the two operators called without ``skipna``
    no_skipna = {"has_skipna": False, "check_dtype": False, "check_dates": True}

    assert_stat_op_calc("nunique", n_distinct, float_frame_with_na, **no_skipna)

    # GH#32571: rtol needed for flaky CI builds
    # mixed types (with upcasting happening)
    as_float32 = mixed_float_frame.astype("float32")
    assert_stat_op_calc("sum", np.sum, as_float32, check_dtype=False, rtol=1e-3)

    assert_stat_op_calc(
        "sum", np.sum, float_frame_with_na, skipna_alternative=np.nansum
    )
    assert_stat_op_calc("mean", np.mean, float_frame_with_na, check_dates=True)
    assert_stat_op_calc(
        "product", np.prod, float_frame_with_na, skipna_alternative=np.nanprod
    )

    assert_stat_op_calc("var", sample_var, float_frame_with_na)
    assert_stat_op_calc("std", sample_std, float_frame_with_na)
    assert_stat_op_calc("sem", std_error, float_frame_with_na)

    assert_stat_op_calc("count", n_valid, float_frame_with_na, **no_skipna)
@td.skip_if_no_scipy
def test_stat_op_calc_skew_kurtosis(self, float_frame_with_na):
    """
    Compare ``skew`` and ``kurt`` with the unbiased scipy estimators; the
    reference functions return NaN for fewer than 3 (skew) or 4 (kurtosis)
    observations.
    """

    def skewness(x):
        from scipy.stats import skew

        return skew(x, bias=False) if len(x) >= 3 else np.nan

    def kurt(x):
        from scipy.stats import kurtosis

        return kurtosis(x, bias=False) if len(x) >= 4 else np.nan

    for opname, reference in (("skew", skewness), ("kurt", kurt)):
        assert_stat_op_calc(opname, reference, float_frame_with_na)
def test_median(self, float_frame_with_na, int_frame):
    """Compare ``median`` with ``np.median`` on a float frame and an int frame."""

    def nan_propagating_median(values):
        # any missing value makes the reference result NaN
        return np.nan if isna(values).any() else np.median(values)

    assert_stat_op_calc(
        "median", nan_propagating_median, float_frame_with_na, check_dates=True
    )
    assert_stat_op_calc(
        "median",
        nan_propagating_median,
        int_frame,
        check_dtype=False,
        check_dates=True,
    )
@pytest.mark.parametrize(
    "method", ["sum", "mean", "prod", "var", "std", "skew", "min", "max"]
)
@pytest.mark.parametrize(
    "df",
    [
        DataFrame(
            {
                "a": [
                    -0.00049987540199591344,
                    -0.0016467257772919831,
                    0.00067695870775883013,
                ],
                "b": [-0, -0, 0.0],
                "c": [
                    0.00031111847529610595,
                    0.0014902627951905339,
                    -0.00094099200035979691,
                ],
            },
            index=["foo", "bar", "baz"],
            dtype="O",
        ),
        DataFrame({0: [np.nan, 2], 1: [np.nan, 3], 2: [np.nan, 4]}, dtype=object),
    ],
)
def test_stat_operators_attempt_obj_array(self, method, df, axis):
    """
    GH#676: a reduction on an object-dtype frame holding numbers must equal
    the float64 result cast back to object.
    """
    assert df.values.dtype == np.object_

    as_float = df.astype("f8")
    expected = getattr(as_float, method)(axis=axis).astype(object)

    result = getattr(df, method)(axis=axis)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("op", ["mean", "std", "var", "skew", "kurt", "sem"])
def test_mixed_ops(self, op):
    """
    GH#16116: the reduction raises ``TypeError`` on a frame with a string
    column, both by default and with the ``use_bottleneck`` option disabled.
    """
    df = DataFrame(
        {
            "int": [1, 2, 3, 4],
            "float": [1.0, 2.0, 3.0, 4.0],
            "str": ["a", "b", "c", "d"],
        }
    )
    # the same message pattern is expected on both code paths
    msg = "|".join(
        [
            "Could not convert",
            "could not convert",
            "can't multiply sequence by non-int",
        ]
    )

    with pytest.raises(TypeError, match=msg):
        getattr(df, op)()

    with pd.option_context("use_bottleneck", False):
        with pytest.raises(TypeError, match=msg):
            getattr(df, op)()
- def test_reduce_mixed_frame(self):
- # GH 6806
- df = DataFrame(
- {
- "bool_data": [True, True, False, False, False],
- "int_data": [10, 20, 30, 40, 50],
- "string_data": ["a", "b", "c", "d", "e"],
- }
- )
- df.reindex(columns=["bool_data", "int_data", "string_data"])
- test = df.sum(axis=0)
- tm.assert_numpy_array_equal(
- test.values, np.array([2, 150, "abcde"], dtype=object)
- )
- alt = df.T.sum(axis=1)
- tm.assert_series_equal(test, alt)
- def test_nunique(self):
- df = DataFrame({"A": [1, 1, 1], "B": [1, 2, 3], "C": [1, np.nan, 3]})
- tm.assert_series_equal(df.nunique(), Series({"A": 1, "B": 3, "C": 2}))
- tm.assert_series_equal(
- df.nunique(dropna=False), Series({"A": 1, "B": 3, "C": 3})
- )
- tm.assert_series_equal(df.nunique(axis=1), Series({0: 1, 1: 2, 2: 2}))
- tm.assert_series_equal(
- df.nunique(axis=1, dropna=False), Series({0: 1, 1: 3, 2: 2})
- )
@pytest.mark.parametrize("tz", [None, "UTC"])
def test_mean_mixed_datetime_numeric(self, tz):
    """``mean`` over one integer and one datetime column keeps both columns."""
    # https://github.com/pandas-dev/pandas/issues/24752
    stamp = Timestamp("2000", tz=tz)
    df = DataFrame({"A": [1, 1], "B": [stamp] * 2})

    expected = Series([1.0, Timestamp("2000", tz=tz)], index=["A", "B"])
    tm.assert_series_equal(df.mean(), expected)
@pytest.mark.parametrize("tz", [None, "UTC"])
def test_mean_includes_datetimes(self, tz):
    """``mean`` of a frame holding only a datetime column returns that column's mean."""
    # https://github.com/pandas-dev/pandas/issues/24752
    # Behavior in 0.24.0rc1 was buggy.
    # As of 2.0 with numeric_only=None we do *not* drop datetime columns
    stamp = Timestamp("2000", tz=tz)
    df = DataFrame({"A": [stamp] * 2})

    expected = Series([Timestamp("2000", tz=tz)], index=["A"])
    tm.assert_series_equal(df.mean(), expected)
def test_mean_mixed_string_decimal(self):
    """
    GH 11670: ``mean`` raises on a frame containing a None/str column, but
    works once only the integer and Decimal columns are selected.
    """
    # (A, B, C) triples; C is converted to Decimal when the rows are built
    rows = [
        (2, None, "628.00"),
        (1, None, "383.00"),
        (3, None, "651.00"),
        (2, None, "575.00"),
        (4, None, "1114.00"),
        (1, "TEST", "241.00"),
        (2, None, "572.00"),
        (4, None, "609.00"),
        (3, None, "820.00"),
        (5, None, "1223.00"),
    ]
    df = DataFrame([{"A": a, "B": b, "C": Decimal(c)} for a, b, c in rows])

    with pytest.raises(TypeError, match="unsupported operand type"):
        df.mean()

    result = df[["A", "C"]].mean()
    expected = Series([2.7, 681.6], index=["A", "C"], dtype=object)
    tm.assert_series_equal(result, expected)
- def test_var_std(self, datetime_frame):
- result = datetime_frame.std(ddof=4)
- expected = datetime_frame.apply(lambda x: x.std(ddof=4))
- tm.assert_almost_equal(result, expected)
- result = datetime_frame.var(ddof=4)
- expected = datetime_frame.apply(lambda x: x.var(ddof=4))
- tm.assert_almost_equal(result, expected)
- arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
- result = nanops.nanvar(arr, axis=0)
- assert not (result < 0).any()
- with pd.option_context("use_bottleneck", False):
- result = nanops.nanvar(arr, axis=0)
- assert not (result < 0).any()
@pytest.mark.parametrize("meth", ["sem", "var", "std"])
def test_numeric_only_flag(self, meth):
    """
    GH 9201: ``numeric_only=True`` ignores an object column along axis 1,
    while ``numeric_only=False`` raises ``TypeError``.
    """
    columns = ["foo", "bar", "baz"]
    numeric_columns = ["bar", "baz"]

    # "foo" is cast to object first to avoid an implicit cast when a string
    # is stored in it below
    df1 = DataFrame(np.random.randn(5, 3), columns=columns)
    df1 = df1.astype({"foo": object})
    df1.loc[0, "foo"] = "100"  # a number in str format

    df2 = DataFrame(np.random.randn(5, 3), columns=columns)
    df2 = df2.astype({"foo": object})
    df2.loc[0, "foo"] = "a"  # a non-number str

    for frame in (df1, df2):
        result = getattr(frame, meth)(axis=1, numeric_only=True)
        expected = getattr(frame[numeric_columns], meth)(axis=1)
        tm.assert_series_equal(expected, result)

    # df1 has all numbers, df2 has a letter inside
    failures = [
        (df1, r"unsupported operand type\(s\) for -: 'float' and 'str'"),
        (df2, "could not convert string to float: 'a'"),
    ]
    for frame, msg in failures:
        with pytest.raises(TypeError, match=msg):
            getattr(frame, meth)(axis=1, numeric_only=False)
- def test_sem(self, datetime_frame):
- result = datetime_frame.sem(ddof=4)
- expected = datetime_frame.apply(lambda x: x.std(ddof=4) / np.sqrt(len(x)))
- tm.assert_almost_equal(result, expected)
- arr = np.repeat(np.random.random((1, 1000)), 1000, 0)
- result = nanops.nansem(arr, axis=0)
- assert not (result < 0).any()
- with pd.option_context("use_bottleneck", False):
- result = nanops.nansem(arr, axis=0)
- assert not (result < 0).any()
@pytest.mark.parametrize(
    "dropna, expected",
    [
        (
            True,
            {
                "A": [12],
                "B": [10.0],
                "C": [1.0],
                "D": ["a"],
                "E": Categorical(["a"], categories=["a"]),
                "F": to_datetime(["2000-1-2"]),
                "G": to_timedelta(["1 days"]),
            },
        ),
        (
            False,
            {
                "A": [12],
                "B": [10.0],
                "C": [np.nan],
                "D": np.array([np.nan], dtype=object),
                "E": Categorical([np.nan], categories=["a"]),
                "F": [pd.NaT],
                "G": to_timedelta([pd.NaT]),
            },
        ),
        (
            True,
            {
                "H": [8, 9, np.nan, np.nan],
                "I": [8, 9, np.nan, np.nan],
                "J": [1, np.nan, np.nan, np.nan],
                "K": Categorical(["a", np.nan, np.nan, np.nan], categories=["a"]),
                "L": to_datetime(["2000-1-2", "NaT", "NaT", "NaT"]),
                "M": to_timedelta(["1 days", "nan", "nan", "nan"]),
                "N": [0, 1, 2, 3],
            },
        ),
        (
            False,
            {
                "H": [8, 9, np.nan, np.nan],
                "I": [8, 9, np.nan, np.nan],
                "J": [1, np.nan, np.nan, np.nan],
                "K": Categorical([np.nan, "a", np.nan, np.nan], categories=["a"]),
                "L": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]),
                "M": to_timedelta(["nan", "1 days", "nan", "nan"]),
                "N": [0, 1, 2, 3],
            },
        ),
    ],
)
def test_mode_dropna(self, dropna, expected):
    """
    ``DataFrame.mode`` honours ``dropna`` for numeric, object, categorical,
    datetime and timedelta columns; only the columns named in ``expected``
    are selected from the source frame.
    """
    source = DataFrame(
        {
            "A": [12, 12, 19, 11],
            "B": [10, 10, np.nan, 3],
            "C": [1, np.nan, np.nan, np.nan],
            "D": [np.nan, np.nan, "a", np.nan],
            "E": Categorical([np.nan, np.nan, "a", np.nan]),
            "F": to_datetime(["NaT", "2000-1-2", "NaT", "NaT"]),
            "G": to_timedelta(["1 days", "nan", "nan", "nan"]),
            "H": [8, 8, 9, 9],
            "I": [9, 9, 8, 8],
            "J": [1, 1, np.nan, np.nan],
            "K": Categorical(["a", np.nan, "a", np.nan]),
            "L": to_datetime(["2000-1-2", "2000-1-2", "NaT", "NaT"]),
            "M": to_timedelta(["1 days", "nan", "1 days", "nan"]),
            "N": np.arange(4, dtype="int64"),
        }
    )
    selected = sorted(expected)
    result = source[selected].mode(dropna=dropna)
    tm.assert_frame_equal(result, DataFrame(expected))
- def test_mode_sortwarning(self):
- # Check for the warning that is raised when the mode
- # results cannot be sorted
- df = DataFrame({"A": [np.nan, np.nan, "a", "a"]})
- expected = DataFrame({"A": ["a", np.nan]})
- with tm.assert_produces_warning(UserWarning):
- result = df.mode(dropna=False)
- result = result.sort_values(by="A").reset_index(drop=True)
- tm.assert_frame_equal(result, expected)
- def test_mode_empty_df(self):
- df = DataFrame([], columns=["a", "b"])
- result = df.mode()
- expected = DataFrame([], columns=["a", "b"], index=Index([], dtype=np.int64))
- tm.assert_frame_equal(result, expected)
- def test_operators_timedelta64(self):
- df = DataFrame(
- {
- "A": date_range("2012-1-1", periods=3, freq="D"),
- "B": date_range("2012-1-2", periods=3, freq="D"),
- "C": Timestamp("20120101") - timedelta(minutes=5, seconds=5),
- }
- )
- diffs = DataFrame({"A": df["A"] - df["C"], "B": df["A"] - df["B"]})
- # min
- result = diffs.min()
- assert result[0] == diffs.loc[0, "A"]
- assert result[1] == diffs.loc[0, "B"]
- result = diffs.min(axis=1)
- assert (result == diffs.loc[0, "B"]).all()
- # max
- result = diffs.max()
- assert result[0] == diffs.loc[2, "A"]
- assert result[1] == diffs.loc[2, "B"]
- result = diffs.max(axis=1)
- assert (result == diffs["A"]).all()
- # abs
- result = diffs.abs()
- result2 = abs(diffs)
- expected = DataFrame({"A": df["A"] - df["C"], "B": df["B"] - df["A"]})
- tm.assert_frame_equal(result, expected)
- tm.assert_frame_equal(result2, expected)
- # mixed frame
- mixed = diffs.copy()
- mixed["C"] = "foo"
- mixed["D"] = 1
- mixed["E"] = 1.0
- mixed["F"] = Timestamp("20130101")
- # results in an object array
- result = mixed.min()
- expected = Series(
- [
- pd.Timedelta(timedelta(seconds=5 * 60 + 5)),
- pd.Timedelta(timedelta(days=-1)),
- "foo",
- 1,
- 1.0,
- Timestamp("20130101"),
- ],
- index=mixed.columns,
- )
- tm.assert_series_equal(result, expected)
- # excludes non-numeric
- result = mixed.min(axis=1, numeric_only=True)
- expected = Series([1, 1, 1.0], index=[0, 1, 2])
- tm.assert_series_equal(result, expected)
- # works when only those columns are selected
- result = mixed[["A", "B"]].min(1)
- expected = Series([timedelta(days=-1)] * 3)
- tm.assert_series_equal(result, expected)
- result = mixed[["A", "B"]].min()
- expected = Series(
- [timedelta(seconds=5 * 60 + 5), timedelta(days=-1)], index=["A", "B"]
- )
- tm.assert_series_equal(result, expected)
- # GH 3106
- df = DataFrame(
- {
- "time": date_range("20130102", periods=5),
- "time2": date_range("20130105", periods=5),
- }
- )
- df["off1"] = df["time2"] - df["time"]
- assert df["off1"].dtype == "timedelta64[ns]"
- df["off2"] = df["time"] - df["time2"]
- df._consolidate_inplace()
- assert df["off1"].dtype == "timedelta64[ns]"
- assert df["off2"].dtype == "timedelta64[ns]"
- def test_std_timedelta64_skipna_false(self):
- # GH#37392
- tdi = pd.timedelta_range("1 Day", periods=10)
- df = DataFrame({"A": tdi, "B": tdi}, copy=True)
- df.iloc[-2, -1] = pd.NaT
- result = df.std(skipna=False)
- expected = Series(
- [df["A"].std(), pd.NaT], index=["A", "B"], dtype="timedelta64[ns]"
- )
- tm.assert_series_equal(result, expected)
- result = df.std(axis=1, skipna=False)
- expected = Series([pd.Timedelta(0)] * 8 + [pd.NaT, pd.Timedelta(0)])
- tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "values", [["2022-01-01", "2022-01-02", pd.NaT, "2022-01-03"], 4 * [pd.NaT]]
)
def test_std_datetime64_with_nat(
    self, values, skipna, using_array_manager, request
):
    """
    GH#51335: ``std`` of a datetime64 column containing NaT returns a
    timedelta64[ns] result, NaT when NaT is not skipped or all values are NaT.
    """
    expect_nat = not skipna or all(value is pd.NaT for value in values)

    if using_array_manager and expect_nat:
        request.node.add_marker(
            pytest.mark.xfail(
                reason="GH#51446: Incorrect type inference on NaT in reduction result"
            )
        )

    df = DataFrame({"a": to_datetime(values)})
    result = df.std(skipna=skipna)

    # 86400000000000ns == 1 day
    expected_value = pd.NaT if expect_nat else 86400000000000
    expected = Series({"a": expected_value}, dtype="timedelta64[ns]")
    tm.assert_series_equal(result, expected)
- def test_sum_corner(self):
- empty_frame = DataFrame()
- axis0 = empty_frame.sum(0)
- axis1 = empty_frame.sum(1)
- assert isinstance(axis0, Series)
- assert isinstance(axis1, Series)
- assert len(axis0) == 0
- assert len(axis1) == 0
@pytest.mark.parametrize(
    "index",
    [
        tm.makeRangeIndex(0),
        tm.makeDateIndex(0),
        tm.makeNumericIndex(0, dtype=int),
        tm.makeNumericIndex(0, dtype=float),
        tm.makeDateIndex(0, freq="M"),
        tm.makePeriodIndex(0),
    ],
)
def test_axis_1_empty(self, all_reductions, index, using_array_manager):
    """
    Row-wise reductions on a frame with an empty index return an empty Series
    on that index: bool for any/all, int64 for count, object otherwise.
    """
    special_dtypes = {"any": "bool", "all": "bool", "count": "int64"}
    expected_dtype = special_dtypes.get(all_reductions, "object")

    df = DataFrame(columns=["a"], index=index)
    result = getattr(df, all_reductions)(axis=1)

    expected = Series([], index=index, dtype=expected_dtype)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("method, unit", [("sum", 0), ("prod", 1)])
@pytest.mark.parametrize("numeric_only", [None, True, False])
def test_sum_prod_nanops(self, method, unit, numeric_only):
    """
    Check how ``sum``/``prod`` treat missing values for several ``min_count``
    settings; ``unit`` is the identity element of the operation.
    """
    idx = ["a", "b", "c"]
    df = DataFrame({"a": [unit, unit], "b": [unit, np.nan], "c": [np.nan, np.nan]})
    # The default
    result = getattr(df, method)(numeric_only=numeric_only)
    expected = Series([unit, unit, unit], index=idx, dtype="float64")
    tm.assert_series_equal(result, expected)

    # min_count=1
    result = getattr(df, method)(numeric_only=numeric_only, min_count=1)
    expected = Series([unit, unit, np.nan], index=idx)
    tm.assert_series_equal(result, expected)

    # min_count=0
    result = getattr(df, method)(numeric_only=numeric_only, min_count=0)
    expected = Series([unit, unit, unit], index=idx, dtype="float64")
    tm.assert_series_equal(result, expected)

    result = getattr(df.iloc[1:], method)(numeric_only=numeric_only, min_count=1)
    expected = Series([unit, np.nan, np.nan], index=idx)
    tm.assert_series_equal(result, expected)

    # min_count > 1
    # "A" holds 10 valid values, "B" holds 5 valid values followed by 5 NaN.
    df = DataFrame({"A": [unit] * 10, "B": [unit] * 5 + [np.nan] * 5})

    # Both columns reach 5 valid values, so neither is nulled out.
    # The expectation is spelled out explicitly: deriving it from ``result``
    # would make the assertion vacuous.
    result = getattr(df, method)(numeric_only=numeric_only, min_count=5)
    expected = Series([unit, unit], index=["A", "B"], dtype="float64")
    tm.assert_series_equal(result, expected)

    # Only "A" reaches 6 valid values; "B" becomes NaN.
    result = getattr(df, method)(numeric_only=numeric_only, min_count=6)
    expected = Series([unit, np.nan], index=["A", "B"])
    tm.assert_series_equal(result, expected)
- def test_sum_nanops_timedelta(self):
- # prod isn't defined on timedeltas
- idx = ["a", "b", "c"]
- df = DataFrame({"a": [0, 0], "b": [0, np.nan], "c": [np.nan, np.nan]})
- df2 = df.apply(to_timedelta)
- # 0 by default
- result = df2.sum()
- expected = Series([0, 0, 0], dtype="m8[ns]", index=idx)
- tm.assert_series_equal(result, expected)
- # min_count=0
- result = df2.sum(min_count=0)
- tm.assert_series_equal(result, expected)
- # min_count=1
- result = df2.sum(min_count=1)
- expected = Series([0, 0, np.nan], dtype="m8[ns]", index=idx)
- tm.assert_series_equal(result, expected)
- def test_sum_nanops_min_count(self):
- # https://github.com/pandas-dev/pandas/issues/39738
- df = DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
- result = df.sum(min_count=10)
- expected = Series([np.nan, np.nan], index=["x", "y"])
- tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("float_type", ["float16", "float32", "float64"])
@pytest.mark.parametrize(
    "kwargs, expected_result",
    [
        # np.nan is used throughout: the np.NaN alias is gone in NumPy 2.0
        ({"axis": 1, "min_count": 2}, [3.2, 5.3, np.nan]),
        ({"axis": 1, "min_count": 3}, [np.nan, np.nan, np.nan]),
        ({"axis": 1, "skipna": False}, [3.2, 5.3, np.nan]),
    ],
)
def test_sum_nanops_dtype_min_count(self, float_type, kwargs, expected_result):
    """
    GH#46947: row-wise ``sum`` with ``min_count``/``skipna`` on a frame of the
    given float dtype returns a result of that same dtype.
    """
    df = DataFrame({"a": [1.0, 2.3, 4.4], "b": [2.2, 3, np.nan]}, dtype=float_type)
    result = df.sum(**kwargs)
    expected = Series(expected_result).astype(float_type)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("float_type", ["float16", "float32", "float64"])
@pytest.mark.parametrize(
    "kwargs, expected_result",
    [
        # np.nan is used throughout: the np.NaN alias is gone in NumPy 2.0
        ({"axis": 1, "min_count": 2}, [2.0, 4.0, np.nan]),
        ({"axis": 1, "min_count": 3}, [np.nan, np.nan, np.nan]),
        ({"axis": 1, "skipna": False}, [2.0, 4.0, np.nan]),
    ],
)
def test_prod_nanops_dtype_min_count(self, float_type, kwargs, expected_result):
    """
    GH#46947: row-wise ``prod`` with ``min_count``/``skipna`` on a frame of
    the given float dtype returns a result of that same dtype.
    """
    df = DataFrame(
        {"a": [1.0, 2.0, 4.4], "b": [2.0, 2.0, np.nan]}, dtype=float_type
    )
    result = df.prod(**kwargs)
    expected = Series(expected_result).astype(float_type)
    tm.assert_series_equal(result, expected)
- def test_sum_object(self, float_frame):
- values = float_frame.values.astype(int)
- frame = DataFrame(values, index=float_frame.index, columns=float_frame.columns)
- deltas = frame * timedelta(1)
- deltas.sum()
- def test_sum_bool(self, float_frame):
- # ensure this works, bug report
- bools = np.isnan(float_frame)
- bools.sum(1)
- bools.sum(0)
def test_sum_mixed_datetime(self):
    """GH#30886: ``sum`` raises on a reindexed frame that contains a datetime column."""
    base = DataFrame({"A": date_range("2000", periods=4), "B": [1, 2, 3, 4]})
    df = base.reindex([2, 3, 4])

    with pytest.raises(TypeError, match="does not support reduction 'sum'"):
        df.sum()
def test_mean_corner(self, float_frame, float_string_frame):
    """
    ``mean`` raises on a frame with object data along either axis, and a
    boolean column is averaged like its underlying values.
    """
    # object data: both axes must raise, each with its own message
    failures = [(0, "Could not convert"), (1, "unsupported operand type")]
    for axis, msg in failures:
        with pytest.raises(TypeError, match=msg):
            float_string_frame.mean(axis=axis)

    # take mean of boolean column
    float_frame["bool"] = float_frame["A"] > 0
    column_means = float_frame.mean(0)
    assert column_means["bool"] == float_frame["bool"].values.mean()
def test_mean_datetimelike(self):
    """
    GH#24757: ``numeric_only=True`` keeps only the integer column, while the
    default call raises because of the period column.
    """
    # As of 2.0, datetimelike are *not* excluded with numeric_only=None
    columns = {
        "A": np.arange(3),
        "B": date_range("2016-01-01", periods=3),
        "C": pd.timedelta_range("1D", periods=3),
        "D": pd.period_range("2016", periods=3, freq="A"),
    }
    df = DataFrame(columns)

    numeric_mean = df.mean(numeric_only=True)
    tm.assert_series_equal(numeric_mean, Series({"A": 1.0}))

    with pytest.raises(TypeError, match="mean is not implemented for PeriodArray"):
        df.mean()
def test_mean_datetimelike_numeric_only_false(self):
    """
    With ``numeric_only=False`` datetime and timedelta columns are averaged,
    whereas a period column makes ``mean`` raise.
    """
    df = DataFrame(
        {
            "A": np.arange(3),
            "B": date_range("2016-01-01", periods=3),
            "C": pd.timedelta_range("1D", periods=3),
        }
    )

    # datetime(tz) and timedelta work
    middle_row = {"A": 1, "B": df.loc[1, "B"], "C": df.loc[1, "C"]}
    tm.assert_series_equal(df.mean(numeric_only=False), Series(middle_row))

    # mean of period is not allowed
    df["D"] = pd.period_range("2016", periods=3, freq="A")
    with pytest.raises(TypeError, match="mean is not implemented for Period"):
        df.mean(numeric_only=False)
def test_mean_extensionarray_numeric_only_true(self):
    """Compare the mean of an ``Int64`` frame with its NumPy-backed twin."""
    # https://github.com/pandas-dev/pandas/issues/33256
    values = np.random.randint(1000, size=(10, 5))
    masked = DataFrame(values, dtype="Int64")
    observed = masked.mean(numeric_only=True)
    reference = DataFrame(values).mean()
    tm.assert_series_equal(observed, reference)
def test_stats_mixed_type(self, float_string_frame):
    """Row-wise statistics on a frame holding strings must raise TypeError."""
    # (method name, regex the error message has to match), in call order
    expected_errors = [
        ("std", "could not convert"),
        ("var", "could not convert"),
        ("mean", "unsupported operand type"),
        ("skew", "could not convert"),
    ]
    for method, msg in expected_errors:
        with pytest.raises(TypeError, match=msg):
            getattr(float_string_frame, method)(1)
- def test_sum_bools(self):
- df = DataFrame(index=range(1), columns=range(10))
- bools = isna(df)
- assert bools.sum(axis=1)[0] == 10
- # ----------------------------------------------------------------------
- # Index of max / min
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("axis", [0, 1])
def test_idxmin(self, float_frame, int_frame, skipna, axis):
    """``DataFrame.idxmin`` must agree with applying ``Series.idxmin``."""
    # Blank out a band of rows and a patch of the last two columns.
    float_frame.iloc[5:10] = np.nan
    float_frame.iloc[15:20, -2:] = np.nan

    for candidate in (float_frame, int_frame):
        direct = candidate.idxmin(axis=axis, skipna=skipna)
        via_apply = candidate.apply(Series.idxmin, axis=axis, skipna=skipna)
        tm.assert_series_equal(direct, via_apply)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_idxmin_numeric_only(self, numeric_only):
    """Check ``DataFrame.idxmin`` with and without ``numeric_only``."""
    frame = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})

    if not numeric_only:
        # With the string column "c" included, a TypeError is expected.
        with pytest.raises(TypeError, match="not allowed for this dtype"):
            frame.idxmin(numeric_only=numeric_only)
        return

    observed = frame.idxmin(numeric_only=numeric_only)
    tm.assert_series_equal(observed, Series([2, 1], index=["a", "b"]))
def test_idxmin_axis_2(self, float_frame):
    """``idxmin`` with an axis number a DataFrame does not have must raise."""
    with pytest.raises(
        ValueError, match="No axis named 2 for object type DataFrame"
    ):
        float_frame.idxmin(axis=2)
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize("axis", [0, 1])
def test_idxmax(self, float_frame, int_frame, skipna, axis):
    """``DataFrame.idxmax`` must agree with applying ``Series.idxmax``."""
    # Blank out a band of rows and a patch of the last two columns.
    float_frame.iloc[5:10] = np.nan
    float_frame.iloc[15:20, -2:] = np.nan

    for candidate in (float_frame, int_frame):
        direct = candidate.idxmax(axis=axis, skipna=skipna)
        via_apply = candidate.apply(Series.idxmax, axis=axis, skipna=skipna)
        tm.assert_series_equal(direct, via_apply)
@pytest.mark.parametrize("numeric_only", [True, False])
def test_idxmax_numeric_only(self, numeric_only):
    """Check ``DataFrame.idxmax`` with and without ``numeric_only``.

    With ``numeric_only=True`` the expected result covers only columns
    "a" and "b"; with ``numeric_only=False`` the string column "c" is
    included and a ``TypeError`` is expected.
    """
    df = DataFrame({"a": [2, 3, 1], "b": [2, 1, 1], "c": list("xyx")})
    if numeric_only:
        result = df.idxmax(numeric_only=numeric_only)
        expected = Series([1, 0], index=["a", "b"])
        tm.assert_series_equal(result, expected)
    else:
        with pytest.raises(TypeError, match="not allowed for this dtype"):
            # This is the idxmax test, so the error path must call idxmax;
            # calling idxmin here would only duplicate the idxmin test.
            df.idxmax(numeric_only=numeric_only)
def test_idxmax_axis_2(self, float_frame):
    """``idxmax`` with an axis number a DataFrame does not have must raise."""
    with pytest.raises(
        ValueError, match="No axis named 2 for object type DataFrame"
    ):
        float_frame.idxmax(axis=2)
- def test_idxmax_mixed_dtype(self):
- # don't cast to object, which would raise in nanops
- dti = date_range("2016-01-01", periods=3)
- # Copying dti is needed for ArrayManager otherwise when we set
- # df.loc[0, 3] = pd.NaT below it edits dti
- df = DataFrame({1: [0, 2, 1], 2: range(3)[::-1], 3: dti.copy(deep=True)})
- result = df.idxmax()
- expected = Series([1, 0, 2], index=[1, 2, 3])
- tm.assert_series_equal(result, expected)
- result = df.idxmin()
- expected = Series([0, 2, 0], index=[1, 2, 3])
- tm.assert_series_equal(result, expected)
- # with NaTs
- df.loc[0, 3] = pd.NaT
- result = df.idxmax()
- expected = Series([1, 0, 2], index=[1, 2, 3])
- tm.assert_series_equal(result, expected)
- result = df.idxmin()
- expected = Series([0, 2, 1], index=[1, 2, 3])
- tm.assert_series_equal(result, expected)
- # with multi-column dt64 block
- df[4] = dti[::-1]
- df._consolidate_inplace()
- result = df.idxmax()
- expected = Series([1, 0, 2, 0], index=[1, 2, 3, 4])
- tm.assert_series_equal(result, expected)
- result = df.idxmin()
- expected = Series([0, 2, 1, 2], index=[1, 2, 3, 4])
- tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "op, expected_value",
    [("idxmax", [0, 4]), ("idxmin", [0, 5])],
)
def test_idxmax_idxmin_convert_dtypes(self, op, expected_value):
    """GH 40346: groupby idxmax/idxmin on a frame built with ``Int64`` dtype."""
    raw = {
        "ID": [100, 100, 100, 200, 200, 200],
        "value": [0, 0, 0, 1, 2, 0],
    }
    grouped = DataFrame(raw, dtype="Int64").groupby("ID")
    result = getattr(grouped, op)()

    group_keys = Index([100, 200], name="ID", dtype="Int64")
    expected = DataFrame({"value": expected_value}, index=group_keys)
    tm.assert_frame_equal(result, expected)
- def test_idxmax_dt64_multicolumn_axis1(self):
- dti = date_range("2016-01-01", periods=3)
- df = DataFrame({3: dti, 4: dti[::-1]}, copy=True)
- df.iloc[0, 0] = pd.NaT
- df._consolidate_inplace()
- result = df.idxmax(axis=1)
- expected = Series([4, 3, 3])
- tm.assert_series_equal(result, expected)
- result = df.idxmin(axis=1)
- expected = Series([4, 3, 4])
- tm.assert_series_equal(result, expected)
- # ----------------------------------------------------------------------
- # Logical reductions
@pytest.mark.parametrize("opname", ["any", "all"])
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("bool_only", [False, True])
def test_any_all_mixed_float(self, opname, axis, bool_only, float_string_frame):
    """Smoke test: any/all must run on a mixed-type frame (no result check)."""
    n_rows = len(float_string_frame)
    float_string_frame["_bool_"] = np.random.randn(n_rows) > 0.5
    reducer = getattr(float_string_frame, opname)
    reducer(axis=axis, bool_only=bool_only)
@pytest.mark.parametrize("opname", ["any", "all"])
@pytest.mark.parametrize("axis", [0, 1])
def test_any_all_bool_with_na(self, opname, axis, bool_frame_with_na):
    """Smoke test: any/all with ``bool_only=False`` runs on the NA frame."""
    reducer = getattr(bool_frame_with_na, opname)
    reducer(axis=axis, bool_only=False)
@pytest.mark.parametrize("opname", ["any", "all"])
def test_any_all_bool_frame(self, opname, bool_frame_with_na):
    """Compare ``DataFrame.any``/``all`` with the same-named NumPy function.

    Checks both axes with and without ``skipna``, an invalid axis, and a
    frame produced by multiplying every value by NaN.
    """
    # GH#12863: numpy gives back non-boolean data for object type
    # so fill NaNs to compare with pandas behavior
    frame = bool_frame_with_na.fillna(True)
    alternative = getattr(np, opname)
    f = getattr(frame, opname)

    def skipna_wrapper(x):
        # Reference for skipna=True: drop missing values, then reduce.
        nona = x.dropna().values
        return alternative(nona)

    def wrapper(x):
        # Reference for skipna=False: reduce the raw values.
        return alternative(x.values)

    result0 = f(axis=0, skipna=False)
    result1 = f(axis=1, skipna=False)

    tm.assert_series_equal(result0, frame.apply(wrapper))
    tm.assert_series_equal(result1, frame.apply(wrapper, axis=1))

    result0 = f(axis=0)
    result1 = f(axis=1)

    tm.assert_series_equal(result0, frame.apply(skipna_wrapper))
    tm.assert_series_equal(
        result1, frame.apply(skipna_wrapper, axis=1), check_dtype=False
    )

    # bad axis
    with pytest.raises(ValueError, match="No axis named 2"):
        f(axis=2)

    # all NA case
    # np.nan rather than np.NaN: the capitalised alias was removed in
    # NumPy 2.0, and the rest of this file already spells it np.nan.
    all_na = frame * np.nan
    r0 = getattr(all_na, opname)(axis=0)
    r1 = getattr(all_na, opname)(axis=1)
    if opname == "any":
        assert not r0.any()
        assert not r1.any()
    else:
        assert r0.all()
        assert r1.all()
- def test_any_all_extra(self):
- df = DataFrame(
- {
- "A": [True, False, False],
- "B": [True, True, False],
- "C": [True, True, True],
- },
- index=["a", "b", "c"],
- )
- result = df[["A", "B"]].any(axis=1)
- expected = Series([True, True, False], index=["a", "b", "c"])
- tm.assert_series_equal(result, expected)
- result = df[["A", "B"]].any(axis=1, bool_only=True)
- tm.assert_series_equal(result, expected)
- result = df.all(1)
- expected = Series([True, False, False], index=["a", "b", "c"])
- tm.assert_series_equal(result, expected)
- result = df.all(1, bool_only=True)
- tm.assert_series_equal(result, expected)
- # Axis is None
- result = df.all(axis=None).item()
- assert result is False
- result = df.any(axis=None).item()
- assert result is True
- result = df[["C"]].all(axis=None).item()
- assert result is True
@pytest.mark.parametrize("axis", [0, 1])
@pytest.mark.parametrize("bool_agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
def test_any_all_object_dtype(self, axis, bool_agg_func, skipna):
    """GH#35450: any/all on a 4x4 frame mixing numbers, a string, bools, NaN."""
    nan = np.nan
    rows = [
        [1, nan, nan, True],
        [nan, 2, nan, True],
        [nan, nan, nan, True],
        [nan, nan, "5", nan],
    ]
    frame = DataFrame(data=rows)

    reducer = getattr(frame, bool_agg_func)
    observed = reducer(axis=axis, skipna=skipna)
    tm.assert_series_equal(observed, Series([True, True, True, True]))
# GH#50947 deprecates this but it is not emitting a warning in some builds.
@pytest.mark.filterwarnings(
    "ignore:'any' with datetime64 dtypes is deprecated.*:FutureWarning"
)
def test_any_datetime(self):
    """GH 23070: row-wise ``any`` over a float column and a datetime column."""
    floats = [1, np.nan, 3, np.nan]
    stamps = [
        Timestamp("1960-02-15"),
        Timestamp("1960-02-16"),
        pd.NaT,
        pd.NaT,
    ]
    frame = DataFrame({"A": floats, "B": stamps})

    # Only the last row, where both columns are missing, is expected False.
    observed = frame.any(axis=1)
    tm.assert_series_equal(observed, Series([True, True, True, False]))
- def test_any_all_bool_only(self):
- # GH 25101
- df = DataFrame(
- {"col1": [1, 2, 3], "col2": [4, 5, 6], "col3": [None, None, None]}
- )
- result = df.all(bool_only=True)
- expected = Series(dtype=np.bool_, index=[])
- tm.assert_series_equal(result, expected)
- df = DataFrame(
- {
- "col1": [1, 2, 3],
- "col2": [4, 5, 6],
- "col3": [None, None, None],
- "col4": [False, False, True],
- }
- )
- result = df.all(bool_only=True)
- expected = Series({"col4": False})
- tm.assert_series_equal(result, expected)
@pytest.mark.parametrize(
    "func, data, expected",
    [
        (np.any, {}, False),
        (np.all, {}, True),
        (np.any, {"A": []}, False),
        (np.all, {"A": []}, True),
        (np.any, {"A": [False, False]}, False),
        (np.all, {"A": [False, False]}, False),
        (np.any, {"A": [True, False]}, True),
        (np.all, {"A": [True, False]}, False),
        (np.any, {"A": [True, True]}, True),
        (np.all, {"A": [True, True]}, True),
        (np.any, {"A": [False], "B": [False]}, False),
        (np.all, {"A": [False], "B": [False]}, False),
        (np.any, {"A": [False, False], "B": [False, True]}, True),
        (np.all, {"A": [False, False], "B": [False, True]}, False),
        # other types
        (np.all, {"A": Series([0.0, 1.0], dtype="float")}, False),
        (np.any, {"A": Series([0.0, 1.0], dtype="float")}, True),
        (np.all, {"A": Series([0, 1], dtype=int)}, False),
        (np.any, {"A": Series([0, 1], dtype=int)}, True),
        pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns]")}, False),
        pytest.param(np.all, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, False),
        pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns]")}, True),
        pytest.param(np.any, {"A": Series([0, 1], dtype="M8[ns, UTC]")}, True),
        pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns]")}, True),
        pytest.param(np.all, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True),
        pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns]")}, True),
        pytest.param(np.any, {"A": Series([1, 2], dtype="M8[ns, UTC]")}, True),
        pytest.param(np.all, {"A": Series([0, 1], dtype="m8[ns]")}, False),
        pytest.param(np.any, {"A": Series([0, 1], dtype="m8[ns]")}, True),
        pytest.param(np.all, {"A": Series([1, 2], dtype="m8[ns]")}, True),
        pytest.param(np.any, {"A": Series([1, 2], dtype="m8[ns]")}, True),
        # Categorical cases: the test body asserts a TypeError for these,
        # so the ``expected`` value below is never compared.
        (np.all, {"A": Series([0, 1], dtype="category")}, True),
        (np.any, {"A": Series([0, 1], dtype="category")}, False),
        (np.all, {"A": Series([1, 2], dtype="category")}, True),
        (np.any, {"A": Series([1, 2], dtype="category")}, False),
        # Mix GH#21484
        pytest.param(
            np.all,
            {
                "A": Series([10, 20], dtype="M8[ns]"),
                "B": Series([10, 20], dtype="m8[ns]"),
            },
            True,
        ),
    ],
)
def test_any_all_np_func(self, func, data, expected):
    """GH 19976: ``np.any``/``np.all`` and the matching ``axis=None`` methods.

    Each case is run twice: through the NumPy function and through the
    DataFrame method of the same name.
    """
    frame = DataFrame(data)
    method_name = func.__name__

    if any(is_categorical_dtype(dtype) for dtype in frame.dtypes):
        err = "dtype category does not support reduction"
        with pytest.raises(TypeError, match=err):
            func(frame)

        # method version
        with pytest.raises(TypeError, match=err):
            getattr(DataFrame(frame), method_name)(axis=None)
        return

    msg = "'(any|all)' with datetime64 dtypes is deprecated"
    # A FutureWarning is expected only when a datetime64 column is present.
    has_dt64 = frame.dtypes.apply(lambda dtype: dtype.kind == "M").any()
    warn = FutureWarning if has_dt64 else None

    with tm.assert_produces_warning(warn, match=msg, check_stacklevel=False):
        # GH#34479
        result = func(frame)
    assert isinstance(result, np.bool_)
    assert result.item() is expected

    # method version
    with tm.assert_produces_warning(warn, match=msg):
        # GH#34479
        result = getattr(DataFrame(frame), method_name)(axis=None)
    assert isinstance(result, np.bool_)
    assert result.item() is expected
- def test_any_all_object(self):
- # GH 19976
- result = np.all(DataFrame(columns=["a", "b"])).item()
- assert result is True
- result = np.any(DataFrame(columns=["a", "b"])).item()
- assert result is False
- def test_any_all_object_bool_only(self):
- df = DataFrame({"A": ["foo", 2], "B": [True, False]}).astype(object)
- df._consolidate_inplace()
- df["C"] = Series([True, True])
- # Categorical of bools is _not_ considered booly
- df["D"] = df["C"].astype("category")
- # The underlying bug is in DataFrame._get_bool_data, so we check
- # that while we're here
- res = df._get_bool_data()
- expected = df[["C"]]
- tm.assert_frame_equal(res, expected)
- res = df.all(bool_only=True, axis=0)
- expected = Series([True], index=["C"])
- tm.assert_series_equal(res, expected)
- # operating on a subset of columns should not produce a _larger_ Series
- res = df[["B", "C"]].all(bool_only=True, axis=0)
- tm.assert_series_equal(res, expected)
- assert df.all(bool_only=True, axis=None)
- res = df.any(bool_only=True, axis=0)
- expected = Series([True], index=["C"])
- tm.assert_series_equal(res, expected)
- # operating on a subset of columns should not produce a _larger_ Series
- res = df[["C"]].any(bool_only=True, axis=0)
- tm.assert_series_equal(res, expected)
- assert df.any(bool_only=True, axis=None)
- # ---------------------------------------------------------------------
- # Unsorted
- def test_series_broadcasting(self):
- # smoke test for numpy warnings
- # GH 16378, GH 16306
- df = DataFrame([1.0, 1.0, 1.0])
- df_nan = DataFrame({"A": [np.nan, 2.0, np.nan]})
- s = Series([1, 1, 1])
- s_nan = Series([np.nan, np.nan, 1])
- with tm.assert_produces_warning(None):
- df_nan.clip(lower=s, axis=0)
- for op in ["lt", "le", "gt", "ge", "eq", "ne"]:
- getattr(df, op)(s_nan, axis=0)
class TestDataFrameReductions:
    """Reduction tests focused on datetime64/timedelta64 data and ``skipna``."""

    def test_min_max_dt64_with_NaT(self):
        """min/max of a datetime column that holds NaT, or only NaT."""
        # Both NaT and Timestamp are in DataFrame.
        df = DataFrame({"foo": [pd.NaT, pd.NaT, Timestamp("2012-05-01")]})

        res = df.min()
        exp = Series([Timestamp("2012-05-01")], index=["foo"])
        tm.assert_series_equal(res, exp)

        res = df.max()
        exp = Series([Timestamp("2012-05-01")], index=["foo"])
        tm.assert_series_equal(res, exp)

        # GH12941, only NaTs are in DataFrame.
        df = DataFrame({"foo": [pd.NaT, pd.NaT]})

        res = df.min()
        exp = Series([pd.NaT], index=["foo"])
        tm.assert_series_equal(res, exp)

        res = df.max()
        exp = Series([pd.NaT], index=["foo"])
        tm.assert_series_equal(res, exp)

    def test_min_max_dt64_with_NaT_skipna_false(self, request, tz_naive_fixture):
        """GH#36907: row-wise min/max with ``skipna=False`` yields NaT for NaT rows."""
        tz = tz_naive_fixture
        if isinstance(tz, tzlocal) and is_platform_windows():
            pytest.skip(
                "GH#37659 OSError raised within tzlocal bc Windows "
                "chokes in times before 1970-01-01"
            )

        df = DataFrame(
            {
                "a": [
                    Timestamp("2020-01-01 08:00:00", tz=tz),
                    Timestamp("1920-02-01 09:00:00", tz=tz),
                ],
                "b": [Timestamp("2020-02-01 08:00:00", tz=tz), pd.NaT],
            }
        )
        res = df.min(axis=1, skipna=False)
        expected = Series([df.loc[0, "a"], pd.NaT])
        # Guard that the expected Series was built with the column's dtype.
        assert expected.dtype == df["a"].dtype

        tm.assert_series_equal(res, expected)

        res = df.max(axis=1, skipna=False)
        expected = Series([df.loc[0, "b"], pd.NaT])
        assert expected.dtype == df["a"].dtype

        tm.assert_series_equal(res, expected)

    def test_min_max_dt64_api_consistency_with_NaT(self):
        """Empty datetime frame: min/max must agree with the Series result."""
        # Calling the following min/max functions returned an error for
        # dataframes but returned NaT for series. These tests check that the
        # API is consistent in min/max calls on empty Series/DataFrames.
        # See GH:33704 for more information
        df = DataFrame({"x": to_datetime([])})
        expected_dt_series = Series(to_datetime([]))
        # check axis 0
        assert (df.min(axis=0).x is pd.NaT) == (expected_dt_series.min() is pd.NaT)
        assert (df.max(axis=0).x is pd.NaT) == (expected_dt_series.max() is pd.NaT)

        # check axis 1
        tm.assert_series_equal(df.min(axis=1), expected_dt_series)
        tm.assert_series_equal(df.max(axis=1), expected_dt_series)

    def test_min_max_dt64_api_consistency_empty_df(self):
        """Empty float frame: min/max must agree with the Series result."""
        # check DataFrame/Series api consistency when calling min/max on an empty
        # DataFrame/Series.
        df = DataFrame({"x": []})
        expected_float_series = Series([], dtype=float)
        # check axis 0
        assert np.isnan(df.min(axis=0).x) == np.isnan(expected_float_series.min())
        assert np.isnan(df.max(axis=0).x) == np.isnan(expected_float_series.max())
        # check axis 1
        tm.assert_series_equal(df.min(axis=1), expected_float_series)
        # max, not a second min: otherwise max(axis=1) is never verified here
        tm.assert_series_equal(df.max(axis=1), expected_float_series)

    @pytest.mark.parametrize(
        "initial",
        ["2018-10-08 13:36:45+00:00", "2018-10-08 13:36:45+03:00"],  # Non-UTC timezone
    )
    @pytest.mark.parametrize("method", ["min", "max"])
    def test_preserve_timezone(self, initial: str, method):
        """GH 28552: row-wise min/max of one tz-aware value returns it unchanged."""
        initial_dt = to_datetime(initial)
        expected = Series([initial_dt])
        df = DataFrame([expected])
        result = getattr(df, method)(axis=1)
        tm.assert_series_equal(result, expected)

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_minmax_tzaware_skipna_axis_1(self, method, skipna):
        """GH#51242: row-wise min/max of tz-aware columns with either skipna."""
        # NOTE(review): ``skipna`` is not parametrized here; presumably it
        # comes from a shared conftest fixture -- confirm.
        val = to_datetime("1900-01-01", utc=True)
        df = DataFrame(
            {"a": Series([pd.NaT, pd.NaT, val]), "b": Series([pd.NaT, val, val])}
        )
        op = getattr(df, method)
        result = op(axis=1, skipna=skipna)
        if skipna:
            expected = Series([pd.NaT, val, val])
        else:
            expected = Series([pd.NaT, pd.NaT, val])
        tm.assert_series_equal(result, expected)

    def test_frame_any_with_timedelta(self):
        """GH#17667: ``any`` with a timedelta column, along both axes."""
        df = DataFrame(
            {
                "a": Series([0, 0]),
                "t": Series([to_timedelta(0, "s"), to_timedelta(1, "ms")]),
            }
        )

        result = df.any(axis=0)
        expected = Series(data=[False, True], index=["a", "t"])
        tm.assert_series_equal(result, expected)

        result = df.any(axis=1)
        expected = Series(data=[False, True])
        tm.assert_series_equal(result, expected)

    def test_reductions_skipna_none_raises(
        self, request, frame_or_series, all_reductions
    ):
        """Passing ``skipna=None`` to a reduction must raise ValueError."""
        if all_reductions == "count":
            request.node.add_marker(
                pytest.mark.xfail(reason="Count does not accept skipna")
            )
        obj = frame_or_series([1, 2, 3])
        msg = 'For argument "skipna" expected type bool, received type NoneType.'
        with pytest.raises(ValueError, match=msg):
            getattr(obj, all_reductions)(skipna=None)

    @td.skip_array_manager_invalid_test
    def test_reduction_timestamp_smallest_unit(self):
        """GH#52524: max over s/ms datetime columns is expected in ms."""
        df = DataFrame(
            {
                "a": Series([Timestamp("2019-12-31")], dtype="datetime64[s]"),
                "b": Series(
                    [Timestamp("2019-12-31 00:00:00.123")], dtype="datetime64[ms]"
                ),
            }
        )
        result = df.max()
        expected = Series(
            [Timestamp("2019-12-31"), Timestamp("2019-12-31 00:00:00.123")],
            dtype="datetime64[ms]",
            index=["a", "b"],
        )
        tm.assert_series_equal(result, expected)

    @td.skip_array_manager_not_yet_implemented
    def test_reduction_timedelta_smallest_unit(self):
        """GH#52524: max over s/ms timedelta columns is expected in ms."""
        df = DataFrame(
            {
                "a": Series([pd.Timedelta("1 days")], dtype="timedelta64[s]"),
                "b": Series([pd.Timedelta("1 days")], dtype="timedelta64[ms]"),
            }
        )
        result = df.max()
        expected = Series(
            [pd.Timedelta("1 days"), pd.Timedelta("1 days")],
            dtype="timedelta64[ms]",
            index=["a", "b"],
        )
        tm.assert_series_equal(result, expected)
class TestNuisanceColumns:
    """Reductions over categorical columns must raise, for Series and frames."""

    @pytest.mark.parametrize("method", ["any", "all"])
    def test_any_all_categorical_dtype_nuisance_column(self, method):
        """GH#36076: DataFrame any/all should match the raising Series behavior."""
        ser = Series([0, 1], dtype="category", name="A")
        df = ser.to_frame()
        msg = "does not support reduction"

        # The Series calls come first to double-check that the Series raises.
        calls = [
            lambda: getattr(ser, method)(),
            lambda: getattr(np, method)(ser),
            lambda: getattr(df, method)(bool_only=False),
            lambda: getattr(df, method)(bool_only=None),
            lambda: getattr(np, method)(df, axis=0),
        ]
        for call in calls:
            with pytest.raises(TypeError, match=msg):
                call()

    def test_median_categorical_dtype_nuisance_column(self):
        """GH#21020: ``DataFrame.median`` should match ``Series.median``."""
        df = DataFrame({"A": Categorical([1, 2, 2, 2, 3])})
        ser = df["A"]
        msg = "does not support reduction"

        def assert_frame_median_raises():
            # Both the explicit numeric_only=False call and the default raise.
            with pytest.raises(TypeError, match=msg):
                df.median(numeric_only=False)
            with pytest.raises(TypeError, match=msg):
                df.median()

        # Double-check the Series behavior is to raise
        with pytest.raises(TypeError, match=msg):
            ser.median()

        assert_frame_median_raises()

        # same thing, but with an additional non-categorical column
        df["B"] = df["A"].astype(int)
        assert_frame_median_raises()

        # TODO: np.median(df, axis=0) gives np.array([2.0, 2.0]) instead
        #  of expected.values

    @pytest.mark.parametrize("method", ["min", "max"])
    def test_min_max_categorical_dtype_non_ordered_nuisance_column(self, method):
        """GH#28949: ``DataFrame.min``/``max`` should behave like the Series."""
        cat = Categorical(["a", "b", "c", "b"], ordered=False)
        ser = Series(cat)
        df = ser.to_frame("A")
        msg = "is not ordered for operation"

        def assert_raises(call):
            with pytest.raises(TypeError, match=msg):
                call()

        # Double-check the Series behavior
        assert_raises(lambda: getattr(ser, method)())
        assert_raises(lambda: getattr(np, method)(ser))

        assert_raises(lambda: getattr(df, method)(numeric_only=False))
        assert_raises(lambda: getattr(df, method)())
        assert_raises(lambda: getattr(np, method)(df, axis=0))

        # same thing, but with an additional non-categorical column
        df["B"] = df["A"].astype(object)
        assert_raises(lambda: getattr(df, method)())
        assert_raises(lambda: getattr(np, method)(df, axis=0))
def test_sum_timedelta64_skipna_false(using_array_manager, request):
    """GH#17235: ``sum(skipna=False)`` on timedelta64[s] data holding one NaT."""
    if using_array_manager:
        request.node.add_marker(
            pytest.mark.xfail(
                reason="Incorrect type inference on NaT in reduction result"
            )
        )

    # 4x2 block of second-resolution timedeltas; the last cell becomes NaT.
    values = np.arange(8).astype(np.int64).view("m8[s]").reshape(4, 2)
    values[-1, -1] = "Nat"

    frame = DataFrame(values)
    assert (frame.dtypes == values.dtype).all()

    # Column sums: the default axis and an explicit axis=0 must agree.
    by_column = Series([pd.Timedelta(seconds=12), pd.NaT], dtype="m8[s]")
    tm.assert_series_equal(frame.sum(skipna=False), by_column)
    tm.assert_series_equal(frame.sum(axis=0, skipna=False), by_column)

    # Row sums: only the row holding NaT is expected to be NaT.
    observed = frame.sum(axis=1, skipna=False)
    row_values = [pd.Timedelta(seconds=secs) for secs in (1, 5, 9)] + [pd.NaT]
    by_row = Series(row_values, dtype="m8[s]")
    tm.assert_series_equal(observed, by_row)
def test_mixed_frame_with_integer_sum():
    """``sum`` over a frame with a string column and an ``Int64`` column."""
    # https://github.com/pandas-dev/pandas/issues/34520
    frame = DataFrame([["a", 1]], columns=list("ab")).astype({"b": "Int64"})
    observed = frame.sum()
    tm.assert_series_equal(observed, Series(["a", 1], index=["a", "b"]))
@pytest.mark.parametrize("numeric_only", [True, False, None])
@pytest.mark.parametrize("method", ["min", "max"])
def test_minmax_extensionarray(method, numeric_only):
    """min/max of an ``Int64`` column holding the int64 extremes and a None."""
    # https://github.com/pandas-dev/pandas/issues/32651
    limits = np.iinfo("int64")
    values = Series([limits.max, None, limits.min], dtype=pd.Int64Dtype())
    frame = DataFrame({"Int64": values})

    result = getattr(frame, method)(numeric_only=numeric_only)

    # ``np.iinfo`` exposes .min/.max, so ``method`` doubles as the attribute.
    labels = Index(["Int64"], dtype="object")
    expected = Series([getattr(limits, method)], index=labels)
    tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("ts_value", [Timestamp("2000-01-01"), pd.NaT])
def test_frame_mixed_numeric_object_with_timestamp(ts_value):
    """GH 13912: ``sum`` over int, float, string and timestamp columns raises."""
    single_row = {"a": [1], "b": [1.1], "c": ["foo"], "d": [ts_value]}
    frame = DataFrame(single_row)
    msg = "does not support reduction"
    with pytest.raises(TypeError, match=msg):
        frame.sum()
def test_prod_sum_min_count_mixed_object():
    """prod/sum with ``min_count=1`` on an object column of mixed values."""
    # https://github.com/pandas-dev/pandas/issues/41074
    frame = DataFrame([1, "a", True])

    # The product is expected to succeed and equal "a".
    product = frame.prod(axis=0, min_count=1, numeric_only=False)
    tm.assert_series_equal(product, Series(["a"]))

    # The sum is expected to fail when adding an int and a str.
    msg = re.escape("unsupported operand type(s) for +: 'int' and 'str'")
    with pytest.raises(TypeError, match=msg):
        frame.sum(axis=0, min_count=1, numeric_only=False)
@pytest.mark.parametrize("method", ["min", "max", "mean", "median", "skew", "kurt"])
@pytest.mark.parametrize("numeric_only", [True, False])
def test_reduction_axis_none_returns_scalar(method, numeric_only):
    """GH#21597: as of 2.0, ``axis=None`` reduces over all axes."""
    frame = DataFrame(np.random.randn(4, 4))
    result = getattr(frame, method)(axis=None, numeric_only=numeric_only)
    values = frame.to_numpy()

    if method not in {"skew", "kurt"}:
        # NumPy has a same-named function; compare exactly.
        expected = getattr(np, method)(values, axis=None)
        assert result == expected
        return

    # skew/kurt are compared with scipy.stats (skipped when unavailable);
    # scipy calls the latter "kurtosis".
    stats = pytest.importorskip("scipy.stats")
    scipy_name = "kurtosis" if method == "kurt" else method
    expected = getattr(stats, scipy_name)(values, bias=False, axis=None)
    tm.assert_almost_equal(result, expected)
@pytest.mark.parametrize(
    "kernel",
    [
        "corr",
        "corrwith",
        "cov",
        "idxmax",
        "idxmin",
        "kurt",
        "max",
        "mean",
        "median",
        "min",
        "prod",
        "quantile",
        "sem",
        "skew",
        "std",
        "sum",
        "var",
    ],
)
def test_fails_on_non_numeric(kernel):
    """GH#46852: each kernel must raise TypeError on a frame with object data."""
    frame = DataFrame({"a": [1, 2, 3], "b": object})

    # corrwith is the only kernel here that is called with an argument.
    if kernel == "corrwith":
        call_args = (frame,)
    else:
        call_args = ()

    # Any one of these messages is accepted.
    accepted_messages = (
        "not allowed for this dtype",
        "argument must be a string or a number",
        "not supported between instances of",
        "unsupported operand type",
        "argument must be a string or a real number",
    )
    pattern = "|".join(accepted_messages)
    with pytest.raises(TypeError, match=pattern):
        getattr(frame, kernel)(*call_args)
|