test_base_indexer.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506
  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. DataFrame,
  5. MultiIndex,
  6. Series,
  7. concat,
  8. date_range,
  9. )
  10. import pandas._testing as tm
  11. from pandas.api.indexers import (
  12. BaseIndexer,
  13. FixedForwardWindowIndexer,
  14. )
  15. from pandas.core.indexers.objects import (
  16. ExpandingIndexer,
  17. FixedWindowIndexer,
  18. VariableOffsetWindowIndexer,
  19. )
  20. from pandas.tseries.offsets import BusinessDay
  21. def test_bad_get_window_bounds_signature():
  22. class BadIndexer(BaseIndexer):
  23. def get_window_bounds(self):
  24. return None
  25. indexer = BadIndexer()
  26. with pytest.raises(ValueError, match="BadIndexer does not implement"):
  27. Series(range(5)).rolling(indexer)
  28. def test_expanding_indexer():
  29. s = Series(range(10))
  30. indexer = ExpandingIndexer()
  31. result = s.rolling(indexer).mean()
  32. expected = s.expanding().mean()
  33. tm.assert_series_equal(result, expected)
  34. def test_indexer_constructor_arg():
  35. # Example found in computation.rst
  36. use_expanding = [True, False, True, False, True]
  37. df = DataFrame({"values": range(5)})
  38. class CustomIndexer(BaseIndexer):
  39. def get_window_bounds(self, num_values, min_periods, center, closed, step):
  40. start = np.empty(num_values, dtype=np.int64)
  41. end = np.empty(num_values, dtype=np.int64)
  42. for i in range(num_values):
  43. if self.use_expanding[i]:
  44. start[i] = 0
  45. end[i] = i + 1
  46. else:
  47. start[i] = i
  48. end[i] = i + self.window_size
  49. return start, end
  50. indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
  51. result = df.rolling(indexer).sum()
  52. expected = DataFrame({"values": [0.0, 1.0, 3.0, 3.0, 10.0]})
  53. tm.assert_frame_equal(result, expected)
  54. def test_indexer_accepts_rolling_args():
  55. df = DataFrame({"values": range(5)})
  56. class CustomIndexer(BaseIndexer):
  57. def get_window_bounds(self, num_values, min_periods, center, closed, step):
  58. start = np.empty(num_values, dtype=np.int64)
  59. end = np.empty(num_values, dtype=np.int64)
  60. for i in range(num_values):
  61. if (
  62. center
  63. and min_periods == 1
  64. and closed == "both"
  65. and step == 1
  66. and i == 2
  67. ):
  68. start[i] = 0
  69. end[i] = num_values
  70. else:
  71. start[i] = i
  72. end[i] = i + self.window_size
  73. return start, end
  74. indexer = CustomIndexer(window_size=1)
  75. result = df.rolling(
  76. indexer, center=True, min_periods=1, closed="both", step=1
  77. ).sum()
  78. expected = DataFrame({"values": [0.0, 1.0, 10.0, 3.0, 4.0]})
  79. tm.assert_frame_equal(result, expected)
  80. @pytest.mark.parametrize(
  81. "func,np_func,expected,np_kwargs",
  82. [
  83. ("count", len, [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 2.0, np.nan], {}),
  84. ("min", np.min, [0.0, 1.0, 2.0, 3.0, 4.0, 6.0, 6.0, 7.0, 8.0, np.nan], {}),
  85. (
  86. "max",
  87. np.max,
  88. [2.0, 3.0, 4.0, 100.0, 100.0, 100.0, 8.0, 9.0, 9.0, np.nan],
  89. {},
  90. ),
  91. (
  92. "std",
  93. np.std,
  94. [
  95. 1.0,
  96. 1.0,
  97. 1.0,
  98. 55.71654452,
  99. 54.85739087,
  100. 53.9845657,
  101. 1.0,
  102. 1.0,
  103. 0.70710678,
  104. np.nan,
  105. ],
  106. {"ddof": 1},
  107. ),
  108. (
  109. "var",
  110. np.var,
  111. [
  112. 1.0,
  113. 1.0,
  114. 1.0,
  115. 3104.333333,
  116. 3009.333333,
  117. 2914.333333,
  118. 1.0,
  119. 1.0,
  120. 0.500000,
  121. np.nan,
  122. ],
  123. {"ddof": 1},
  124. ),
  125. (
  126. "median",
  127. np.median,
  128. [1.0, 2.0, 3.0, 4.0, 6.0, 7.0, 7.0, 8.0, 8.5, np.nan],
  129. {},
  130. ),
  131. ],
  132. )
  133. def test_rolling_forward_window(
  134. frame_or_series, func, np_func, expected, np_kwargs, step
  135. ):
  136. # GH 32865
  137. values = np.arange(10.0)
  138. values[5] = 100.0
  139. indexer = FixedForwardWindowIndexer(window_size=3)
  140. match = "Forward-looking windows can't have center=True"
  141. with pytest.raises(ValueError, match=match):
  142. rolling = frame_or_series(values).rolling(window=indexer, center=True)
  143. getattr(rolling, func)()
  144. match = "Forward-looking windows don't support setting the closed argument"
  145. with pytest.raises(ValueError, match=match):
  146. rolling = frame_or_series(values).rolling(window=indexer, closed="right")
  147. getattr(rolling, func)()
  148. rolling = frame_or_series(values).rolling(window=indexer, min_periods=2, step=step)
  149. result = getattr(rolling, func)()
  150. # Check that the function output matches the explicitly provided array
  151. expected = frame_or_series(expected)[::step]
  152. tm.assert_equal(result, expected)
  153. # Check that the rolling function output matches applying an alternative
  154. # function to the rolling window object
  155. expected2 = frame_or_series(rolling.apply(lambda x: np_func(x, **np_kwargs)))
  156. tm.assert_equal(result, expected2)
  157. # Check that the function output matches applying an alternative function
  158. # if min_periods isn't specified
  159. # GH 39604: After count-min_periods deprecation, apply(lambda x: len(x))
  160. # is equivalent to count after setting min_periods=0
  161. min_periods = 0 if func == "count" else None
  162. rolling3 = frame_or_series(values).rolling(window=indexer, min_periods=min_periods)
  163. result3 = getattr(rolling3, func)()
  164. expected3 = frame_or_series(rolling3.apply(lambda x: np_func(x, **np_kwargs)))
  165. tm.assert_equal(result3, expected3)
  166. def test_rolling_forward_skewness(frame_or_series, step):
  167. values = np.arange(10.0)
  168. values[5] = 100.0
  169. indexer = FixedForwardWindowIndexer(window_size=5)
  170. rolling = frame_or_series(values).rolling(window=indexer, min_periods=3, step=step)
  171. result = rolling.skew()
  172. expected = frame_or_series(
  173. [
  174. 0.0,
  175. 2.232396,
  176. 2.229508,
  177. 2.228340,
  178. 2.229091,
  179. 2.231989,
  180. 0.0,
  181. 0.0,
  182. np.nan,
  183. np.nan,
  184. ]
  185. )[::step]
  186. tm.assert_equal(result, expected)
  187. @pytest.mark.parametrize(
  188. "func,expected",
  189. [
  190. ("cov", [2.0, 2.0, 2.0, 97.0, 2.0, -93.0, 2.0, 2.0, np.nan, np.nan]),
  191. (
  192. "corr",
  193. [
  194. 1.0,
  195. 1.0,
  196. 1.0,
  197. 0.8704775290207161,
  198. 0.018229084250926637,
  199. -0.861357304646493,
  200. 1.0,
  201. 1.0,
  202. np.nan,
  203. np.nan,
  204. ],
  205. ),
  206. ],
  207. )
  208. def test_rolling_forward_cov_corr(func, expected):
  209. values1 = np.arange(10).reshape(-1, 1)
  210. values2 = values1 * 2
  211. values1[5, 0] = 100
  212. values = np.concatenate([values1, values2], axis=1)
  213. indexer = FixedForwardWindowIndexer(window_size=3)
  214. rolling = DataFrame(values).rolling(window=indexer, min_periods=3)
  215. # We are interested in checking only pairwise covariance / correlation
  216. result = getattr(rolling, func)().loc[(slice(None), 1), 0]
  217. result = result.reset_index(drop=True)
  218. expected = Series(expected).reset_index(drop=True)
  219. expected.name = result.name
  220. tm.assert_equal(result, expected)
  221. @pytest.mark.parametrize(
  222. "closed,expected_data",
  223. [
  224. ["right", [0.0, 1.0, 2.0, 3.0, 7.0, 12.0, 6.0, 7.0, 8.0, 9.0]],
  225. ["left", [0.0, 0.0, 1.0, 2.0, 5.0, 9.0, 5.0, 6.0, 7.0, 8.0]],
  226. ],
  227. )
  228. def test_non_fixed_variable_window_indexer(closed, expected_data):
  229. index = date_range("2020", periods=10)
  230. df = DataFrame(range(10), index=index)
  231. offset = BusinessDay(1)
  232. indexer = VariableOffsetWindowIndexer(index=index, offset=offset)
  233. result = df.rolling(indexer, closed=closed).sum()
  234. expected = DataFrame(expected_data, index=index)
  235. tm.assert_frame_equal(result, expected)
  236. def test_fixed_forward_indexer_count(step):
  237. # GH: 35579
  238. df = DataFrame({"b": [None, None, None, 7]})
  239. indexer = FixedForwardWindowIndexer(window_size=2)
  240. result = df.rolling(window=indexer, min_periods=0, step=step).count()
  241. expected = DataFrame({"b": [0.0, 0.0, 1.0, 1.0]})[::step]
  242. tm.assert_frame_equal(result, expected)
  243. @pytest.mark.parametrize(
  244. ("end_value", "values"), [(1, [0.0, 1, 1, 3, 2]), (-1, [0.0, 1, 0, 3, 1])]
  245. )
  246. @pytest.mark.parametrize(("func", "args"), [("median", []), ("quantile", [0.5])])
  247. def test_indexer_quantile_sum(end_value, values, func, args):
  248. # GH 37153
  249. class CustomIndexer(BaseIndexer):
  250. def get_window_bounds(self, num_values, min_periods, center, closed, step):
  251. start = np.empty(num_values, dtype=np.int64)
  252. end = np.empty(num_values, dtype=np.int64)
  253. for i in range(num_values):
  254. if self.use_expanding[i]:
  255. start[i] = 0
  256. end[i] = max(i + end_value, 1)
  257. else:
  258. start[i] = i
  259. end[i] = i + self.window_size
  260. return start, end
  261. use_expanding = [True, False, True, False, True]
  262. df = DataFrame({"values": range(5)})
  263. indexer = CustomIndexer(window_size=1, use_expanding=use_expanding)
  264. result = getattr(df.rolling(indexer), func)(*args)
  265. expected = DataFrame({"values": values})
  266. tm.assert_frame_equal(result, expected)
  267. @pytest.mark.parametrize(
  268. "indexer_class", [FixedWindowIndexer, FixedForwardWindowIndexer, ExpandingIndexer]
  269. )
  270. @pytest.mark.parametrize("window_size", [1, 2, 12])
  271. @pytest.mark.parametrize(
  272. "df_data",
  273. [
  274. {"a": [1, 1], "b": [0, 1]},
  275. {"a": [1, 2], "b": [0, 1]},
  276. {"a": [1] * 16, "b": [np.nan, 1, 2, np.nan] + list(range(4, 16))},
  277. ],
  278. )
  279. def test_indexers_are_reusable_after_groupby_rolling(
  280. indexer_class, window_size, df_data
  281. ):
  282. # GH 43267
  283. df = DataFrame(df_data)
  284. num_trials = 3
  285. indexer = indexer_class(window_size=window_size)
  286. original_window_size = indexer.window_size
  287. for i in range(num_trials):
  288. df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
  289. assert indexer.window_size == original_window_size
  290. @pytest.mark.parametrize(
  291. "window_size, num_values, expected_start, expected_end",
  292. [
  293. (1, 1, [0], [1]),
  294. (1, 2, [0, 1], [1, 2]),
  295. (2, 1, [0], [1]),
  296. (2, 2, [0, 1], [2, 2]),
  297. (5, 12, range(12), list(range(5, 12)) + [12] * 5),
  298. (12, 5, range(5), [5] * 5),
  299. (0, 0, np.array([]), np.array([])),
  300. (1, 0, np.array([]), np.array([])),
  301. (0, 1, [0], [0]),
  302. ],
  303. )
  304. def test_fixed_forward_indexer_bounds(
  305. window_size, num_values, expected_start, expected_end, step
  306. ):
  307. # GH 43267
  308. indexer = FixedForwardWindowIndexer(window_size=window_size)
  309. start, end = indexer.get_window_bounds(num_values=num_values, step=step)
  310. tm.assert_numpy_array_equal(
  311. start, np.array(expected_start[::step]), check_dtype=False
  312. )
  313. tm.assert_numpy_array_equal(end, np.array(expected_end[::step]), check_dtype=False)
  314. assert len(start) == len(end)
  315. @pytest.mark.parametrize(
  316. "df, window_size, expected",
  317. [
  318. (
  319. DataFrame({"b": [0, 1, 2], "a": [1, 2, 2]}),
  320. 2,
  321. Series(
  322. [0, 1.5, 2.0],
  323. index=MultiIndex.from_arrays([[1, 2, 2], range(3)], names=["a", None]),
  324. name="b",
  325. dtype=np.float64,
  326. ),
  327. ),
  328. (
  329. DataFrame(
  330. {
  331. "b": [np.nan, 1, 2, np.nan] + list(range(4, 18)),
  332. "a": [1] * 7 + [2] * 11,
  333. "c": range(18),
  334. }
  335. ),
  336. 12,
  337. Series(
  338. [
  339. 3.6,
  340. 3.6,
  341. 4.25,
  342. 5.0,
  343. 5.0,
  344. 5.5,
  345. 6.0,
  346. 12.0,
  347. 12.5,
  348. 13.0,
  349. 13.5,
  350. 14.0,
  351. 14.5,
  352. 15.0,
  353. 15.5,
  354. 16.0,
  355. 16.5,
  356. 17.0,
  357. ],
  358. index=MultiIndex.from_arrays(
  359. [[1] * 7 + [2] * 11, range(18)], names=["a", None]
  360. ),
  361. name="b",
  362. dtype=np.float64,
  363. ),
  364. ),
  365. ],
  366. )
  367. def test_rolling_groupby_with_fixed_forward_specific(df, window_size, expected):
  368. # GH 43267
  369. indexer = FixedForwardWindowIndexer(window_size=window_size)
  370. result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
  371. tm.assert_series_equal(result, expected)
  372. @pytest.mark.parametrize(
  373. "group_keys",
  374. [
  375. (1,),
  376. (1, 2),
  377. (2, 1),
  378. (1, 1, 2),
  379. (1, 2, 1),
  380. (1, 1, 2, 2),
  381. (1, 2, 3, 2, 3),
  382. (1, 1, 2) * 4,
  383. (1, 2, 3) * 5,
  384. ],
  385. )
  386. @pytest.mark.parametrize("window_size", [1, 2, 3, 4, 5, 8, 20])
  387. def test_rolling_groupby_with_fixed_forward_many(group_keys, window_size):
  388. # GH 43267
  389. df = DataFrame(
  390. {
  391. "a": np.array(list(group_keys)),
  392. "b": np.arange(len(group_keys), dtype=np.float64) + 17,
  393. "c": np.arange(len(group_keys), dtype=np.int64),
  394. }
  395. )
  396. indexer = FixedForwardWindowIndexer(window_size=window_size)
  397. result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).sum()
  398. result.index.names = ["a", "c"]
  399. groups = df.groupby("a")[["a", "b", "c"]]
  400. manual = concat(
  401. [
  402. g.assign(
  403. b=[
  404. g["b"].iloc[i : i + window_size].sum(min_count=1)
  405. for i in range(len(g))
  406. ]
  407. )
  408. for _, g in groups
  409. ]
  410. )
  411. manual = manual.set_index(["a", "c"])["b"]
  412. tm.assert_series_equal(result, manual)
  413. def test_unequal_start_end_bounds():
  414. class CustomIndexer(BaseIndexer):
  415. def get_window_bounds(self, num_values, min_periods, center, closed, step):
  416. return np.array([1]), np.array([1, 2])
  417. indexer = CustomIndexer()
  418. roll = Series(1).rolling(indexer)
  419. match = "start"
  420. with pytest.raises(ValueError, match=match):
  421. roll.mean()
  422. with pytest.raises(ValueError, match=match):
  423. next(iter(roll))
  424. with pytest.raises(ValueError, match=match):
  425. roll.corr(pairwise=True)
  426. with pytest.raises(ValueError, match=match):
  427. roll.cov(pairwise=True)
  428. def test_unequal_bounds_to_object():
  429. # GH 44470
  430. class CustomIndexer(BaseIndexer):
  431. def get_window_bounds(self, num_values, min_periods, center, closed, step):
  432. return np.array([1]), np.array([2])
  433. indexer = CustomIndexer()
  434. roll = Series([1, 1]).rolling(indexer)
  435. match = "start and end"
  436. with pytest.raises(ValueError, match=match):
  437. roll.mean()
  438. with pytest.raises(ValueError, match=match):
  439. next(iter(roll))
  440. with pytest.raises(ValueError, match=match):
  441. roll.corr(pairwise=True)
  442. with pytest.raises(ValueError, match=match):
  443. roll.cov(pairwise=True)