# test_stat_reductions.py
  1. """
  2. Tests for statistical reductions of 2nd moment or higher: var, skew, kurt, ...
  3. """
  4. import inspect
  5. import numpy as np
  6. import pytest
  7. import pandas.util._test_decorators as td
  8. import pandas as pd
  9. from pandas import (
  10. DataFrame,
  11. Series,
  12. )
  13. import pandas._testing as tm
  14. from pandas.core.arrays import (
  15. DatetimeArray,
  16. PeriodArray,
  17. TimedeltaArray,
  18. )
  19. class TestDatetimeLikeStatReductions:
  20. @pytest.mark.parametrize("box", [Series, pd.Index, DatetimeArray])
  21. def test_dt64_mean(self, tz_naive_fixture, box):
  22. tz = tz_naive_fixture
  23. dti = pd.date_range("2001-01-01", periods=11, tz=tz)
  24. # shuffle so that we are not just working with monotone-increasing
  25. dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
  26. dtarr = dti._data
  27. obj = box(dtarr)
  28. assert obj.mean() == pd.Timestamp("2001-01-06", tz=tz)
  29. assert obj.mean(skipna=False) == pd.Timestamp("2001-01-06", tz=tz)
  30. # dtarr[-2] will be the first date 2001-01-1
  31. dtarr[-2] = pd.NaT
  32. obj = box(dtarr)
  33. assert obj.mean() == pd.Timestamp("2001-01-06 07:12:00", tz=tz)
  34. assert obj.mean(skipna=False) is pd.NaT
  35. @pytest.mark.parametrize("box", [Series, pd.Index, PeriodArray])
  36. @pytest.mark.parametrize("freq", ["S", "H", "D", "W", "B"])
  37. def test_period_mean(self, box, freq):
  38. # GH#24757
  39. dti = pd.date_range("2001-01-01", periods=11)
  40. # shuffle so that we are not just working with monotone-increasing
  41. dti = dti.take([4, 1, 3, 10, 9, 7, 8, 5, 0, 2, 6])
  42. parr = dti._data.to_period(freq)
  43. obj = box(parr)
  44. with pytest.raises(TypeError, match="ambiguous"):
  45. obj.mean()
  46. with pytest.raises(TypeError, match="ambiguous"):
  47. obj.mean(skipna=True)
  48. # parr[-2] will be the first date 2001-01-1
  49. parr[-2] = pd.NaT
  50. with pytest.raises(TypeError, match="ambiguous"):
  51. obj.mean()
  52. with pytest.raises(TypeError, match="ambiguous"):
  53. obj.mean(skipna=True)
  54. @pytest.mark.parametrize("box", [Series, pd.Index, TimedeltaArray])
  55. def test_td64_mean(self, box):
  56. tdi = pd.TimedeltaIndex([0, 3, -2, -7, 1, 2, -1, 3, 5, -2, 4], unit="D")
  57. tdarr = tdi._data
  58. obj = box(tdarr, copy=False)
  59. result = obj.mean()
  60. expected = np.array(tdarr).mean()
  61. assert result == expected
  62. tdarr[0] = pd.NaT
  63. assert obj.mean(skipna=False) is pd.NaT
  64. result2 = obj.mean(skipna=True)
  65. assert result2 == tdi[1:].mean()
  66. # exact equality fails by 1 nanosecond
  67. assert result2.round("us") == (result * 11.0 / 10).round("us")
  68. class TestSeriesStatReductions:
  69. # Note: the name TestSeriesStatReductions indicates these tests
  70. # were moved from a series-specific test file, _not_ that these tests are
  71. # intended long-term to be series-specific
  72. def _check_stat_op(
  73. self, name, alternate, string_series_, check_objects=False, check_allna=False
  74. ):
  75. with pd.option_context("use_bottleneck", False):
  76. f = getattr(Series, name)
  77. # add some NaNs
  78. string_series_[5:15] = np.NaN
  79. # mean, idxmax, idxmin, min, and max are valid for dates
  80. if name not in ["max", "min", "mean", "median", "std"]:
  81. ds = Series(pd.date_range("1/1/2001", periods=10))
  82. msg = f"does not support reduction '{name}'"
  83. with pytest.raises(TypeError, match=msg):
  84. f(ds)
  85. # skipna or no
  86. assert pd.notna(f(string_series_))
  87. assert pd.isna(f(string_series_, skipna=False))
  88. # check the result is correct
  89. nona = string_series_.dropna()
  90. tm.assert_almost_equal(f(nona), alternate(nona.values))
  91. tm.assert_almost_equal(f(string_series_), alternate(nona.values))
  92. allna = string_series_ * np.nan
  93. if check_allna:
  94. assert np.isnan(f(allna))
  95. # dtype=object with None, it works!
  96. s = Series([1, 2, 3, None, 5])
  97. f(s)
  98. # GH#2888
  99. items = [0]
  100. items.extend(range(2**40, 2**40 + 1000))
  101. s = Series(items, dtype="int64")
  102. tm.assert_almost_equal(float(f(s)), float(alternate(s.values)))
  103. # check date range
  104. if check_objects:
  105. s = Series(pd.bdate_range("1/1/2000", periods=10))
  106. res = f(s)
  107. exp = alternate(s)
  108. assert res == exp
  109. # check on string data
  110. if name not in ["sum", "min", "max"]:
  111. with pytest.raises(TypeError, match=None):
  112. f(Series(list("abc")))
  113. # Invalid axis.
  114. msg = "No axis named 1 for object type Series"
  115. with pytest.raises(ValueError, match=msg):
  116. f(string_series_, axis=1)
  117. if "numeric_only" in inspect.getfullargspec(f).args:
  118. # only the index is string; dtype is float
  119. f(string_series_, numeric_only=True)
  120. def test_sum(self):
  121. string_series = tm.makeStringSeries().rename("series")
  122. self._check_stat_op("sum", np.sum, string_series, check_allna=False)
  123. def test_mean(self):
  124. string_series = tm.makeStringSeries().rename("series")
  125. self._check_stat_op("mean", np.mean, string_series)
  126. def test_median(self):
  127. string_series = tm.makeStringSeries().rename("series")
  128. self._check_stat_op("median", np.median, string_series)
  129. # test with integers, test failure
  130. int_ts = Series(np.ones(10, dtype=int), index=range(10))
  131. tm.assert_almost_equal(np.median(int_ts), int_ts.median())
  132. def test_prod(self):
  133. string_series = tm.makeStringSeries().rename("series")
  134. self._check_stat_op("prod", np.prod, string_series)
  135. def test_min(self):
  136. string_series = tm.makeStringSeries().rename("series")
  137. self._check_stat_op("min", np.min, string_series, check_objects=True)
  138. def test_max(self):
  139. string_series = tm.makeStringSeries().rename("series")
  140. self._check_stat_op("max", np.max, string_series, check_objects=True)
  141. def test_var_std(self):
  142. string_series = tm.makeStringSeries().rename("series")
  143. datetime_series = tm.makeTimeSeries().rename("ts")
  144. alt = lambda x: np.std(x, ddof=1)
  145. self._check_stat_op("std", alt, string_series)
  146. alt = lambda x: np.var(x, ddof=1)
  147. self._check_stat_op("var", alt, string_series)
  148. result = datetime_series.std(ddof=4)
  149. expected = np.std(datetime_series.values, ddof=4)
  150. tm.assert_almost_equal(result, expected)
  151. result = datetime_series.var(ddof=4)
  152. expected = np.var(datetime_series.values, ddof=4)
  153. tm.assert_almost_equal(result, expected)
  154. # 1 - element series with ddof=1
  155. s = datetime_series.iloc[[0]]
  156. result = s.var(ddof=1)
  157. assert pd.isna(result)
  158. result = s.std(ddof=1)
  159. assert pd.isna(result)
  160. def test_sem(self):
  161. string_series = tm.makeStringSeries().rename("series")
  162. datetime_series = tm.makeTimeSeries().rename("ts")
  163. alt = lambda x: np.std(x, ddof=1) / np.sqrt(len(x))
  164. self._check_stat_op("sem", alt, string_series)
  165. result = datetime_series.sem(ddof=4)
  166. expected = np.std(datetime_series.values, ddof=4) / np.sqrt(
  167. len(datetime_series.values)
  168. )
  169. tm.assert_almost_equal(result, expected)
  170. # 1 - element series with ddof=1
  171. s = datetime_series.iloc[[0]]
  172. result = s.sem(ddof=1)
  173. assert pd.isna(result)
  174. @td.skip_if_no_scipy
  175. def test_skew(self):
  176. from scipy.stats import skew
  177. string_series = tm.makeStringSeries().rename("series")
  178. alt = lambda x: skew(x, bias=False)
  179. self._check_stat_op("skew", alt, string_series)
  180. # test corner cases, skew() returns NaN unless there's at least 3
  181. # values
  182. min_N = 3
  183. for i in range(1, min_N + 1):
  184. s = Series(np.ones(i))
  185. df = DataFrame(np.ones((i, i)))
  186. if i < min_N:
  187. assert np.isnan(s.skew())
  188. assert np.isnan(df.skew()).all()
  189. else:
  190. assert 0 == s.skew()
  191. assert (df.skew() == 0).all()
  192. @td.skip_if_no_scipy
  193. def test_kurt(self):
  194. from scipy.stats import kurtosis
  195. string_series = tm.makeStringSeries().rename("series")
  196. alt = lambda x: kurtosis(x, bias=False)
  197. self._check_stat_op("kurt", alt, string_series)
  198. def test_kurt_corner(self):
  199. # test corner cases, kurt() returns NaN unless there's at least 4
  200. # values
  201. min_N = 4
  202. for i in range(1, min_N + 1):
  203. s = Series(np.ones(i))
  204. df = DataFrame(np.ones((i, i)))
  205. if i < min_N:
  206. assert np.isnan(s.kurt())
  207. assert np.isnan(df.kurt()).all()
  208. else:
  209. assert 0 == s.kurt()
  210. assert (df.kurt() == 0).all()