# test_time_grouper.py — tests for TimeGrouper / Grouper(freq=...) groupby behavior.
  1. from datetime import datetime
  2. from operator import methodcaller
  3. import numpy as np
  4. import pytest
  5. import pandas as pd
  6. from pandas import (
  7. DataFrame,
  8. Series,
  9. Timestamp,
  10. )
  11. import pandas._testing as tm
  12. from pandas.core.groupby.grouper import Grouper
  13. from pandas.core.indexes.datetimes import date_range
  14. test_series = Series(np.random.randn(1000), index=date_range("1/1/2000", periods=1000))
  15. def test_apply():
  16. grouper = Grouper(freq="A", label="right", closed="right")
  17. grouped = test_series.groupby(grouper)
  18. def f(x):
  19. return x.sort_values()[-3:]
  20. applied = grouped.apply(f)
  21. expected = test_series.groupby(lambda x: x.year).apply(f)
  22. applied.index = applied.index.droplevel(0)
  23. expected.index = expected.index.droplevel(0)
  24. tm.assert_series_equal(applied, expected)
  25. def test_count():
  26. test_series[::3] = np.nan
  27. expected = test_series.groupby(lambda x: x.year).count()
  28. grouper = Grouper(freq="A", label="right", closed="right")
  29. result = test_series.groupby(grouper).count()
  30. expected.index = result.index
  31. tm.assert_series_equal(result, expected)
  32. result = test_series.resample("A").count()
  33. expected.index = result.index
  34. tm.assert_series_equal(result, expected)
  35. def test_numpy_reduction():
  36. result = test_series.resample("A", closed="right").prod()
  37. expected = test_series.groupby(lambda x: x.year).agg(np.prod)
  38. expected.index = result.index
  39. tm.assert_series_equal(result, expected)
  40. def test_apply_iteration():
  41. # #2300
  42. N = 1000
  43. ind = date_range(start="2000-01-01", freq="D", periods=N)
  44. df = DataFrame({"open": 1, "close": 2}, index=ind)
  45. tg = Grouper(freq="M")
  46. grouper, _ = tg._get_grouper(df)
  47. # Errors
  48. grouped = df.groupby(grouper, group_keys=False)
  49. def f(df):
  50. return df["close"] / df["open"]
  51. # it works!
  52. result = grouped.apply(f)
  53. tm.assert_index_equal(result.index, df.index)
  54. @pytest.mark.parametrize(
  55. "func",
  56. [
  57. tm.makeIntIndex,
  58. tm.makeStringIndex,
  59. tm.makeFloatIndex,
  60. (lambda m: tm.makeCustomIndex(m, 2)),
  61. ],
  62. )
  63. def test_fails_on_no_datetime_index(func):
  64. n = 2
  65. index = func(n)
  66. name = type(index).__name__
  67. df = DataFrame({"a": np.random.randn(n)}, index=index)
  68. msg = (
  69. "Only valid with DatetimeIndex, TimedeltaIndex "
  70. f"or PeriodIndex, but got an instance of '{name}'"
  71. )
  72. with pytest.raises(TypeError, match=msg):
  73. df.groupby(Grouper(freq="D"))
  74. def test_aaa_group_order():
  75. # GH 12840
  76. # check TimeGrouper perform stable sorts
  77. n = 20
  78. data = np.random.randn(n, 4)
  79. df = DataFrame(data, columns=["A", "B", "C", "D"])
  80. df["key"] = [
  81. datetime(2013, 1, 1),
  82. datetime(2013, 1, 2),
  83. datetime(2013, 1, 3),
  84. datetime(2013, 1, 4),
  85. datetime(2013, 1, 5),
  86. ] * 4
  87. grouped = df.groupby(Grouper(key="key", freq="D"))
  88. tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 1)), df[::5])
  89. tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 2)), df[1::5])
  90. tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 3)), df[2::5])
  91. tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 4)), df[3::5])
  92. tm.assert_frame_equal(grouped.get_group(datetime(2013, 1, 5)), df[4::5])
  93. def test_aggregate_normal(resample_method):
  94. """Check TimeGrouper's aggregation is identical as normal groupby."""
  95. data = np.random.randn(20, 4)
  96. normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
  97. normal_df["key"] = [1, 2, 3, 4, 5] * 4
  98. dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
  99. dt_df["key"] = [
  100. datetime(2013, 1, 1),
  101. datetime(2013, 1, 2),
  102. datetime(2013, 1, 3),
  103. datetime(2013, 1, 4),
  104. datetime(2013, 1, 5),
  105. ] * 4
  106. normal_grouped = normal_df.groupby("key")
  107. dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))
  108. expected = getattr(normal_grouped, resample_method)()
  109. dt_result = getattr(dt_grouped, resample_method)()
  110. expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key")
  111. tm.assert_equal(expected, dt_result)
  112. @pytest.mark.xfail(reason="if TimeGrouper is used included, 'nth' doesn't work yet")
  113. def test_aggregate_nth():
  114. """Check TimeGrouper's aggregation is identical as normal groupby."""
  115. data = np.random.randn(20, 4)
  116. normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
  117. normal_df["key"] = [1, 2, 3, 4, 5] * 4
  118. dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
  119. dt_df["key"] = [
  120. datetime(2013, 1, 1),
  121. datetime(2013, 1, 2),
  122. datetime(2013, 1, 3),
  123. datetime(2013, 1, 4),
  124. datetime(2013, 1, 5),
  125. ] * 4
  126. normal_grouped = normal_df.groupby("key")
  127. dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))
  128. expected = normal_grouped.nth(3)
  129. expected.index = date_range(start="2013-01-01", freq="D", periods=5, name="key")
  130. dt_result = dt_grouped.nth(3)
  131. tm.assert_frame_equal(expected, dt_result)
  132. @pytest.mark.parametrize(
  133. "method, method_args, unit",
  134. [
  135. ("sum", {}, 0),
  136. ("sum", {"min_count": 0}, 0),
  137. ("sum", {"min_count": 1}, np.nan),
  138. ("prod", {}, 1),
  139. ("prod", {"min_count": 0}, 1),
  140. ("prod", {"min_count": 1}, np.nan),
  141. ],
  142. )
  143. def test_resample_entirely_nat_window(method, method_args, unit):
  144. s = Series([0] * 2 + [np.nan] * 2, index=date_range("2017", periods=4))
  145. result = methodcaller(method, **method_args)(s.resample("2d"))
  146. expected = Series(
  147. [0.0, unit], index=pd.DatetimeIndex(["2017-01-01", "2017-01-03"], freq="2D")
  148. )
  149. tm.assert_series_equal(result, expected)
  150. @pytest.mark.parametrize(
  151. "func, fill_value",
  152. [("min", np.nan), ("max", np.nan), ("sum", 0), ("prod", 1), ("count", 0)],
  153. )
  154. def test_aggregate_with_nat(func, fill_value):
  155. # check TimeGrouper's aggregation is identical as normal groupby
  156. # if NaT is included, 'var', 'std', 'mean', 'first','last'
  157. # and 'nth' doesn't work yet
  158. n = 20
  159. data = np.random.randn(n, 4).astype("int64")
  160. normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
  161. normal_df["key"] = [1, 2, np.nan, 4, 5] * 4
  162. dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
  163. dt_df["key"] = [
  164. datetime(2013, 1, 1),
  165. datetime(2013, 1, 2),
  166. pd.NaT,
  167. datetime(2013, 1, 4),
  168. datetime(2013, 1, 5),
  169. ] * 4
  170. normal_grouped = normal_df.groupby("key")
  171. dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))
  172. normal_result = getattr(normal_grouped, func)()
  173. dt_result = getattr(dt_grouped, func)()
  174. pad = DataFrame([[fill_value] * 4], index=[3], columns=["A", "B", "C", "D"])
  175. expected = pd.concat([normal_result, pad])
  176. expected = expected.sort_index()
  177. dti = date_range(start="2013-01-01", freq="D", periods=5, name="key")
  178. expected.index = dti._with_freq(None) # TODO: is this desired?
  179. tm.assert_frame_equal(expected, dt_result)
  180. assert dt_result.index.name == "key"
  181. def test_aggregate_with_nat_size():
  182. # GH 9925
  183. n = 20
  184. data = np.random.randn(n, 4).astype("int64")
  185. normal_df = DataFrame(data, columns=["A", "B", "C", "D"])
  186. normal_df["key"] = [1, 2, np.nan, 4, 5] * 4
  187. dt_df = DataFrame(data, columns=["A", "B", "C", "D"])
  188. dt_df["key"] = [
  189. datetime(2013, 1, 1),
  190. datetime(2013, 1, 2),
  191. pd.NaT,
  192. datetime(2013, 1, 4),
  193. datetime(2013, 1, 5),
  194. ] * 4
  195. normal_grouped = normal_df.groupby("key")
  196. dt_grouped = dt_df.groupby(Grouper(key="key", freq="D"))
  197. normal_result = normal_grouped.size()
  198. dt_result = dt_grouped.size()
  199. pad = Series([0], index=[3])
  200. expected = pd.concat([normal_result, pad])
  201. expected = expected.sort_index()
  202. expected.index = date_range(
  203. start="2013-01-01", freq="D", periods=5, name="key"
  204. )._with_freq(None)
  205. tm.assert_series_equal(expected, dt_result)
  206. assert dt_result.index.name == "key"
  207. def test_repr():
  208. # GH18203
  209. result = repr(Grouper(key="A", freq="H"))
  210. expected = (
  211. "TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, dropna=True, "
  212. "closed='left', label='left', how='mean', "
  213. "convention='e', origin='start_day')"
  214. )
  215. assert result == expected
  216. result = repr(Grouper(key="A", freq="H", origin="2000-01-01"))
  217. expected = (
  218. "TimeGrouper(key='A', freq=<Hour>, axis=0, sort=True, dropna=True, "
  219. "closed='left', label='left', how='mean', "
  220. "convention='e', origin=Timestamp('2000-01-01 00:00:00'))"
  221. )
  222. assert result == expected
  223. @pytest.mark.parametrize(
  224. "method, method_args, expected_values",
  225. [
  226. ("sum", {}, [1, 0, 1]),
  227. ("sum", {"min_count": 0}, [1, 0, 1]),
  228. ("sum", {"min_count": 1}, [1, np.nan, 1]),
  229. ("sum", {"min_count": 2}, [np.nan, np.nan, np.nan]),
  230. ("prod", {}, [1, 1, 1]),
  231. ("prod", {"min_count": 0}, [1, 1, 1]),
  232. ("prod", {"min_count": 1}, [1, np.nan, 1]),
  233. ("prod", {"min_count": 2}, [np.nan, np.nan, np.nan]),
  234. ],
  235. )
  236. def test_upsample_sum(method, method_args, expected_values):
  237. s = Series(1, index=date_range("2017", periods=2, freq="H"))
  238. resampled = s.resample("30T")
  239. index = pd.DatetimeIndex(
  240. ["2017-01-01T00:00:00", "2017-01-01T00:30:00", "2017-01-01T01:00:00"],
  241. freq="30T",
  242. )
  243. result = methodcaller(method, **method_args)(resampled)
  244. expected = Series(expected_values, index=index)
  245. tm.assert_series_equal(result, expected)
def test_groupby_resample_interpolate():
    # GH 35325: interpolate() after a groupby -> resample chain should
    # upsample within each group and return a (volume, week_starting)
    # MultiIndex result.
    d = {"price": [10, 11, 9], "volume": [50, 60, 50]}
    df = DataFrame(d)
    df["week_starting"] = date_range("01/01/2018", periods=3, freq="W")
    result = (
        df.set_index("week_starting")
        .groupby("volume")
        .resample("1D")
        .interpolate(method="linear")
    )
    # Group 50 spans two weekly rows (2018-01-07 through 2018-01-21), so its
    # daily upsample yields 15 rows; group 60 has a single row (2018-01-14)
    # and contributes just one.
    expected_ind = pd.MultiIndex.from_tuples(
        [
            (50, Timestamp("2018-01-07")),
            (50, Timestamp("2018-01-08")),
            (50, Timestamp("2018-01-09")),
            (50, Timestamp("2018-01-10")),
            (50, Timestamp("2018-01-11")),
            (50, Timestamp("2018-01-12")),
            (50, Timestamp("2018-01-13")),
            (50, Timestamp("2018-01-14")),
            (50, Timestamp("2018-01-15")),
            (50, Timestamp("2018-01-16")),
            (50, Timestamp("2018-01-17")),
            (50, Timestamp("2018-01-18")),
            (50, Timestamp("2018-01-19")),
            (50, Timestamp("2018-01-20")),
            (50, Timestamp("2018-01-21")),
            (60, Timestamp("2018-01-14")),
        ],
        names=["volume", "week_starting"],
    )
    # Price for group 50 interpolates linearly from 10 down to 9 over the 14
    # daily steps between its two observations; the grouping column itself is
    # carried through as float.
    expected = DataFrame(
        data={
            "price": [
                10.0,
                9.928571428571429,
                9.857142857142858,
                9.785714285714286,
                9.714285714285714,
                9.642857142857142,
                9.571428571428571,
                9.5,
                9.428571428571429,
                9.357142857142858,
                9.285714285714286,
                9.214285714285714,
                9.142857142857142,
                9.071428571428571,
                9.0,
                11.0,
            ],
            "volume": [50.0] * 15 + [60],
        },
        index=expected_ind,
    )
    tm.assert_frame_equal(result, expected)