- """
- Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ...
- """
- import inspect
- import numpy as np
- import pytest
- import pandas.util._test_decorators as td
- import pandas as pd
- from pandas import (
- DataFrame,
- Series,
- )
- import pandas._testing as tm
- from pandas.core.arrays import (
- DatetimeArray,
- PeriodArray,
- TimedeltaArray,
- )


class TestDatetimeLikeStatReductions:
    @pytest.mark.parametrize("box", [Series, pd.Index, DatetimeArray])
    def test_dt64_mean(self, tz_naive_fixture, box):
        tz = tz_naive_fixture

        dti = pd.date_range("2001-01-01", periods=11, tz=tz)
        # shuffle so that we are not just working with monotone-increasing
        dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
        dtarr = dti._data

        obj = box(dtarr)
        assert obj.mean() == pd.Timestamp("2001-01-06", tz=tz)
        assert obj.mean(skipna=False) == pd.Timestamp("2001-01-06", tz=tz)

        # dtarr[-2] is the date 2001-01-03; mask it and check skipna handling
        dtarr[-2] = pd.NaT

        obj = box(dtarr)
        assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz)
        assert obj.mean(skipna=False) is pd.NaT
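
        # The expected values can be checked by hand: the 11 dates are offsets
        # 0..10 days from 2001-01-01, so the full mean offset is 55/11 = 5 days
        # -> 2001-01-06.  Masking 2001-01-03 (offset 2) leaves a sum of 53 days
        # over 10 values, i.e. 5.3 days = 5 days 07:12:00 -> 2001-01-06 07:12:00.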
- @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray])
- @pytest.mark.parametrize("freq", ["S", "H", "D", "W", "B"])
- def test_period_mean(self, box, freq):
- # GH#24757
- dti = pd.date_range("2001-01-01", periods=11)
- # shuffle so that we are not just working with monotone-increasing
- dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
- parr = dti._data.to_period(freq)
- obj = box(parr)
- with pytest.raises(TypeError, match="ambiguous"):
- obj.mean()
- with pytest.raises(TypeError, match="ambiguous"):
- obj.mean(skipna=True)
- # parr[-2] will be the first date 2001-01-1
- parr[-2] = pd.NaT
- with pytest.raises(TypeError, match="ambiguous"):
- obj.mean()
- with pytest.raises(TypeError, match="ambiguous"):
- obj.mean(skipna=True)
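
        # Rationale (as discussed in GH#24757): a Period denotes a span rather
        # than an instant, so a "mean Period" has no single well-defined value;
        # pandas therefore raises TypeError with an "ambiguous" message instead
        # of guessing, regardless of skipna or the presence of NaT.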
- @pytest.mark.parametrize("box", [Series, pd.Index, TimedeltaArray])
- def test_td64_mean(self, box):
- tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D")
- tdarr = tdi._data
- obj = box(tdarr, copy=False)
- result = obj.mean()
- expected = np.array(tdarr).mean()
- assert result == expected
- tdarr[0] = pd.NaT
- assert obj.mean(skipna=False) is pd.NaT
- result2 = obj.mean(skipna=True)
- assert result2 == tdi[1:].mean()
- # exact equality fails by 1 nanosecond
- assert result2.round("us") == (result * 11.0 / 10).round("us")
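
        # The masked element was 0 days, so the sum of the remaining values is
        # unchanged while the count drops from 11 to 10; hence the skipna mean
        # equals the original mean scaled by 11/10, up to nanosecond rounding.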


class TestSeriesStatReductions:
    # Note: the name TestSeriesStatReductions indicates these tests
    # were moved from a series-specific test file, _not_ that these tests are
    # intended long-term to be series-specific
    def _check_stat_op(
        self, name, alternate, string_series_, check_objects=False, check_allna=False
    ):
        # Shared checks: the Series reduction ``name`` should match the
        # ``alternate`` implementation on the raw values, and should handle
        # NaNs, unsupported dtypes and invalid axes consistently.
        with pd.option_context("use_bottleneck", False):
            f = getattr(Series, name)

            # add some NaNs
            string_series_[5:15] = np.nan

            # mean, idxmax, idxmin, min, and max are valid for dates
            if name not in ["max", "min", "mean", "median", "std"]:
                ds = Series(pd.date_range("1/1/2001", periods=10))
                msg = f"does not support reduction '{name}'"
                with pytest.raises(TypeError, match=msg):
                    f(ds)

            # skipna or no
            assert pd.notna(f(string_series_))
            assert pd.isna(f(string_series_, skipna=False))

            # check the result is correct
            nona = string_series_.dropna()
            tm.assert_almost_equal(f(nona), alternate(nona.values))
            tm.assert_almost_equal(f(string_series_), alternate(nona.values))

            allna = string_series_ * np.nan
            if check_allna:
                assert np.isnan(f(allna))

            # dtype=object with None, it works!
            s = Series([1, 2, 3, None, 5])
            f(s)

            # GH#2888
            items = [0]
            items.extend(range(2**40, 2**40 + 1000))
            s = Series(items, dtype="int64")
            tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))

            # check date range
            if check_objects:
                s = Series(pd.bdate_range("1/1/2000", periods=10))
                res = f(s)
                exp = alternate(s)
                assert res == exp

            # check on string data
            if name not in ["sum", "min", "max"]:
                with pytest.raises(TypeError, match=None):
                    f(Series(list("abc")))

            # Invalid axis.
            msg = "No axis named 1 for object type Series"
            with pytest.raises(ValueError, match=msg):
                f(string_series_, axis=1)

            if "numeric_only" in inspect.getfullargspec(f).args:
                # only the index is string; dtype is float
                f(string_series_, numeric_only=True)

    def test_sum(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("sum", np.sum, string_series, check_allna=False)

    def test_mean(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("mean", np.mean, string_series)

    def test_median(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("median", np.median, string_series)

        # test with integers, test failure
        int_ts = Series(np.ones(10, dtype=int), index=range(10))
        tm.assert_almost_equal(np.median(int_ts), int_ts.median())

    def test_prod(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("prod", np.prod, string_series)

    def test_min(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("min", np.min, string_series, check_objects=True)

    def test_max(self):
        string_series = tm.makeStringSeries().rename("series")
        self._check_stat_op("max", np.max, string_series, check_objects=True)

    def test_var_std(self):
        string_series = tm.makeStringSeries().rename("series")
        datetime_series = tm.makeTimeSeries().rename("ts")

        alt = lambda x: np.std(x, ddof=1)
        self._check_stat_op("std", alt, string_series)

        alt = lambda x: np.var(x, ddof=1)
        self._check_stat_op("var", alt, string_series)

        result = datetime_series.std(ddof=4)
        expected = np.std(datetime_series.values, ddof=4)
        tm.assert_almost_equal(result, expected)

        result = datetime_series.var(ddof=4)
        expected = np.var(datetime_series.values, ddof=4)
        tm.assert_almost_equal(result, expected)

        # 1-element series with ddof=1
        s = datetime_series.iloc[[0]]
        result = s.var(ddof=1)
        assert pd.isna(result)

        result = s.std(ddof=1)
        assert pd.isna(result)
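
        # With ddof=1 the divisor is n - 1 = 0 for a single observation, so the
        # sample variance / std is undefined and pandas returns NaN.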

    def test_sem(self):
        string_series = tm.makeStringSeries().rename("series")
        datetime_series = tm.makeTimeSeries().rename("ts")

        alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
        self._check_stat_op("sem", alt, string_series)

        result = datetime_series.sem(ddof=4)
        expected = np.std(datetime_series.values, ddof=4) / np.sqrt(
            len(datetime_series.values)
        )
        tm.assert_almost_equal(result, expected)

        # 1-element series with ddof=1
        s = datetime_series.iloc[[0]]
        result = s.sem(ddof=1)
        assert pd.isna(result)

    @td.skip_if_no_scipy
    def test_skew(self):
        from scipy.stats import skew

        string_series = tm.makeStringSeries().rename("series")

        alt = lambda x: skew(x, bias=False)
        self._check_stat_op("skew", alt, string_series)

        # test corner cases, skew() returns NaN unless there's at least 3
        # values
        min_N = 3
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                assert np.isnan(s.skew())
                assert np.isnan(df.skew()).all()
            else:
                assert 0 == s.skew()
                assert (df.skew() == 0).all()
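
        # The bias-corrected sample skewness divides by (n - 2), so it needs at
        # least 3 observations; with fewer pandas returns NaN, and a constant
        # series yields 0.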

    @td.skip_if_no_scipy
    def test_kurt(self):
        from scipy.stats import kurtosis

        string_series = tm.makeStringSeries().rename("series")

        alt = lambda x: kurtosis(x, bias=False)
        self._check_stat_op("kurt", alt, string_series)

    def test_kurt_corner(self):
        # test corner cases, kurt() returns NaN unless there's at least 4
        # values
        min_N = 4
        for i in range(1, min_N + 1):
            s = Series(np.ones(i))
            df = DataFrame(np.ones((i, i)))
            if i < min_N:
                assert np.isnan(s.kurt())
                assert np.isnan(df.kurt()).all()
            else:
                assert 0 == s.kurt()
                assert (df.kurt() == 0).all()
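
        # The bias-corrected (excess) kurtosis adjustment divides by
        # (n - 2)(n - 3), so at least 4 observations are required; with fewer
        # pandas returns NaN, and a constant series yields 0.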
|