  1. """
  2. test with the TimeGrouper / grouping with datetimes
  3. """
  4. from datetime import (
  5. datetime,
  6. timedelta,
  7. )
  8. from io import StringIO
  9. import numpy as np
  10. import pytest
  11. import pytz
  12. import pandas.util._test_decorators as td
  13. import pandas as pd
  14. from pandas import (
  15. DataFrame,
  16. DatetimeIndex,
  17. Index,
  18. MultiIndex,
  19. Series,
  20. Timestamp,
  21. date_range,
  22. offsets,
  23. )
  24. import pandas._testing as tm
  25. from pandas.core.groupby.grouper import Grouper
  26. from pandas.core.groupby.ops import BinGrouper


@pytest.fixture
def frame_for_truncated_bingrouper():
    """
    DataFrame used by groupby_with_truncated_bingrouper, made into
    a separate fixture for easier re-use in
    test_groupby_apply_timegrouper_with_nat_apply_squeeze
    """
    df = DataFrame(
        {
            "Quantity": [18, 3, 5, 1, 9, 3],
            "Date": [
                Timestamp(2013, 9, 1, 13, 0),
                Timestamp(2013, 9, 1, 13, 5),
                Timestamp(2013, 10, 1, 20, 0),
                Timestamp(2013, 10, 3, 10, 0),
                pd.NaT,
                Timestamp(2013, 9, 2, 14, 0),
            ],
        }
    )
    return df


@pytest.fixture
def groupby_with_truncated_bingrouper(frame_for_truncated_bingrouper):
    """
    GroupBy object such that gb.grouper is a BinGrouper and
    len(gb.grouper.result_index) < len(gb.grouper.group_keys_seq)

    Aggregations on this groupby should have

        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")

    as either the index or an index level.
    """
    df = frame_for_truncated_bingrouper

    tdg = Grouper(key="Date", freq="5D")
    gb = df.groupby(tdg)

    # check we're testing the case we're interested in
    assert len(gb.grouper.result_index) != len(gb.grouper.group_keys_seq)

    return gb
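

# Note: the pd.NaT in frame_for_truncated_bingrouper is what "truncates" the
# BinGrouper: the NaT row falls into no 5D bin, so the grouper's result_index
# comes out shorter than its sequence of group keys (as the fixture asserts).
# This reading is inferred from the fixture docstrings, not from the BinGrouper
# internals themselves.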


class TestGroupBy:
    def test_groupby_with_timegrouper(self):
        # GH 4161
        # TimeGrouper requires a sorted index
        # also verifies that the resultant index has the correct name
        df_original = DataFrame(
            {
                "Buyer": "Carl Carl Carl Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )

        # GH 6908 change target column's order
        df_reordered = df_original.sort_values(by="Quantity")

        for df in [df_original, df_reordered]:
            df = df.set_index(["Date"])
            expected = DataFrame(
                {"Buyer": 0, "Quantity": 0},
                index=date_range(
                    "20130901", "20131205", freq="5D", name="Date", inclusive="left"
                ),
            )
            # Cast to object to avoid implicit cast when setting entry to "CarlCarlCarl"
            expected = expected.astype({"Buyer": object})
            expected.iloc[0, 0] = "CarlCarlCarl"
            expected.iloc[6, 0] = "CarlCarl"
            expected.iloc[18, 0] = "Joe"
            expected.iloc[[0, 6, 18], 1] = np.array([24, 6, 9], dtype="int64")
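
            # resample("5D") and groupby(Grouper(freq="5D")) over the same
            # datetime index should produce identical results, sorted or not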
            result1 = df.resample("5D").sum()
            tm.assert_frame_equal(result1, expected)

            df_sorted = df.sort_index()
            result2 = df_sorted.groupby(Grouper(freq="5D")).sum()
            tm.assert_frame_equal(result2, expected)

            result3 = df.groupby(Grouper(freq="5D")).sum()
            tm.assert_frame_equal(result3, expected)

    @pytest.mark.parametrize("should_sort", [True, False])
    def test_groupby_with_timegrouper_methods(self, should_sort):
        # GH 3881
        # make sure API of timegrouper conforms
        df = DataFrame(
            {
                "Branch": "A A A A A B".split(),
                "Buyer": "Carl Mark Carl Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 8, 9, 3],
                "Date": [
                    datetime(2013, 1, 1, 13, 0),
                    datetime(2013, 1, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 12, 2, 14, 0),
                ],
            }
        )

        if should_sort:
            df = df.sort_values(by="Quantity", ascending=False)

        df = df.set_index("Date", drop=False)
        g = df.groupby(Grouper(freq="6M"))
        assert g.group_keys

        assert isinstance(g.grouper, BinGrouper)
        groups = g.groups
        assert isinstance(groups, dict)
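        # the dates run from Jan 2013 to Dec 2013, so a "6M" (6-month-end)
        # grouper should yield exactly three bins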
        assert len(groups) == 3

    def test_timegrouper_with_reg_groups(self):
        # GH 3794
        # allow combination of timegrouper/reg groups
        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 1, 1, 13, 0),
                    datetime(2013, 1, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 12, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)

        for df in [df_original, df_sorted]:
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                        datetime(2013, 12, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            msg = "The default value of numeric_only"
            result = df.groupby([Grouper(freq="A"), "Buyer"]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            expected = DataFrame(
                {
                    "Buyer": "Carl Mark Carl Joe".split(),
                    "Quantity": [1, 3, 9, 18],
                    "Date": [
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 1, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                        datetime(2013, 7, 1, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            result = df.groupby([Grouper(freq="6MS"), "Buyer"]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

        df_original = DataFrame(
            {
                "Branch": "A A A A A A A B".split(),
                "Buyer": "Carl Mark Carl Carl Joe Joe Joe Carl".split(),
                "Quantity": [1, 3, 5, 1, 8, 1, 9, 3],
                "Date": [
                    datetime(2013, 10, 1, 13, 0),
                    datetime(2013, 10, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 2, 10, 0),
                    datetime(2013, 10, 2, 12, 0),
                    datetime(2013, 10, 2, 14, 0),
                ],
            }
        ).set_index("Date")

        df_sorted = df_original.sort_values(by="Quantity", ascending=False)
        for df in [df_original, df_sorted]:
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark Carl Joe".split(),
                    "Quantity": [6, 8, 3, 4, 10],
                    "Date": [
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 1, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                        datetime(2013, 10, 2, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])

            result = df.groupby([Grouper(freq="1D"), "Buyer"]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            result = df.groupby([Grouper(freq="1M"), "Buyer"]).sum(numeric_only=True)
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                        datetime(2013, 10, 31, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            tm.assert_frame_equal(result, expected)

            # passing the name
            df = df.reset_index()
            result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum(
                numeric_only=True
            )
            tm.assert_frame_equal(result, expected)

            with pytest.raises(KeyError, match="'The grouper name foo is not found'"):
                df.groupby([Grouper(freq="1M", key="foo"), "Buyer"]).sum()

            # passing the level
            df = df.set_index("Date")
            result = df.groupby([Grouper(freq="1M", level="Date"), "Buyer"]).sum(
                numeric_only=True
            )
            tm.assert_frame_equal(result, expected)
            result = df.groupby([Grouper(freq="1M", level=0), "Buyer"]).sum(
                numeric_only=True
            )
            tm.assert_frame_equal(result, expected)

            with pytest.raises(ValueError, match="The level foo is not valid"):
                df.groupby([Grouper(freq="1M", level="foo"), "Buyer"]).sum()

            # multi names
            df = df.copy()
            df["Date"] = df.index + offsets.MonthEnd(2)
            result = df.groupby([Grouper(freq="1M", key="Date"), "Buyer"]).sum(
                numeric_only=True
            )
            expected = DataFrame(
                {
                    "Buyer": "Carl Joe Mark".split(),
                    "Quantity": [10, 18, 3],
                    "Date": [
                        datetime(2013, 11, 30, 0, 0),
                        datetime(2013, 11, 30, 0, 0),
                        datetime(2013, 11, 30, 0, 0),
                    ],
                }
            ).set_index(["Date", "Buyer"])
            tm.assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            msg = "The Grouper cannot specify both a key and a level!"
            with pytest.raises(ValueError, match=msg):
                df.groupby(
                    [Grouper(freq="1M", key="Date", level="Date"), "Buyer"]
                ).sum()

            # single groupers
            expected = DataFrame(
                [[31]],
                columns=["Quantity"],
                index=DatetimeIndex(
                    [datetime(2013, 10, 31, 0, 0)], freq=offsets.MonthEnd(), name="Date"
                ),
            )
            result = df.groupby(Grouper(freq="1M")).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            result = df.groupby([Grouper(freq="1M")]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            expected.index = expected.index.shift(1)
            assert expected.index.freq == offsets.MonthEnd()
            result = df.groupby(Grouper(freq="1M", key="Date")).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

            result = df.groupby([Grouper(freq="1M", key="Date")]).sum(numeric_only=True)
            tm.assert_frame_equal(result, expected)

    @pytest.mark.parametrize("freq", ["D", "M", "A", "Q-APR"])
    def test_timegrouper_with_reg_groups_freq(self, freq):
        # GH 6764 multiple grouping with/without sort
        df = DataFrame(
            {
                "date": pd.to_datetime(
                    [
                        "20121002",
                        "20121007",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20121002",
                        "20121207",
                        "20130130",
                        "20130202",
                        "20130305",
                        "20130202",
                        "20130305",
                    ]
                ),
                "user_id": [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
                "whole_cost": [
                    1790,
                    364,
                    280,
                    259,
                    201,
                    623,
                    90,
                    312,
                    359,
                    301,
                    359,
                    801,
                ],
                "cost1": [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12],
            }
        ).set_index("date")

        expected = (
            df.groupby("user_id")["whole_cost"]
            .resample(freq)
            .sum(min_count=1)  # XXX
            .dropna()
            .reorder_levels(["date", "user_id"])
            .sort_index()
            .astype("int64")
        )
        expected.name = "whole_cost"
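
        # the Grouper-based groupby must match the resample-based expected
        # result whether or not the frame is sorted by date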
        result1 = (
            df.sort_index().groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
        )
        tm.assert_series_equal(result1, expected)

        result2 = df.groupby([Grouper(freq=freq), "user_id"])["whole_cost"].sum()
        tm.assert_series_equal(result2, expected)

    def test_timegrouper_get_group(self):
        # GH 6914
        df_original = DataFrame(
            {
                "Buyer": "Carl Joe Joe Carl Joe Carl".split(),
                "Quantity": [18, 3, 5, 1, 9, 3],
                "Date": [
                    datetime(2013, 9, 1, 13, 0),
                    datetime(2013, 9, 1, 13, 5),
                    datetime(2013, 10, 1, 20, 0),
                    datetime(2013, 10, 3, 10, 0),
                    datetime(2013, 12, 2, 12, 0),
                    datetime(2013, 9, 2, 14, 0),
                ],
            }
        )
        df_reordered = df_original.sort_values(by="Quantity")

        # single grouping
        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]
        dt_list = ["2013-09-30", "2013-10-31", "2013-12-31"]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(Grouper(freq="M", key="Date"))
            for t, expected in zip(dt_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group(dt)
                tm.assert_frame_equal(result, expected)

        # multiple grouping
        expected_list = [
            df_original.iloc[[1]],
            df_original.iloc[[3]],
            df_original.iloc[[4]],
        ]
        g_list = [("Joe", "2013-09-30"), ("Carl", "2013-10-31"), ("Joe", "2013-12-31")]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(["Buyer", Grouper(freq="M", key="Date")])
            for (b, t), expected in zip(g_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group((b, dt))
                tm.assert_frame_equal(result, expected)

        # with index
        df_original = df_original.set_index("Date")
        df_reordered = df_original.sort_values(by="Quantity")

        expected_list = [
            df_original.iloc[[0, 1, 5]],
            df_original.iloc[[2, 3]],
            df_original.iloc[[4]],
        ]

        for df in [df_original, df_reordered]:
            grouped = df.groupby(Grouper(freq="M"))
            for t, expected in zip(dt_list, expected_list):
                dt = Timestamp(t)
                result = grouped.get_group(dt)
                tm.assert_frame_equal(result, expected)

    def test_timegrouper_apply_return_type_series(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
        df_dt = df.copy()
        df_dt["date"] = pd.to_datetime(df_dt["date"])

        def sumfunc_series(x):
            return Series([x["value"].sum()], ("sum",))

        expected = df.groupby(Grouper(key="date")).apply(sumfunc_series)
        result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_series)
        tm.assert_frame_equal(
            result.reset_index(drop=True), expected.reset_index(drop=True)
        )

    def test_timegrouper_apply_return_type_value(self):
        # Using `apply` with the `TimeGrouper` should give the
        # same return type as an `apply` with a `Grouper`.
        # Issue #11742
        df = DataFrame({"date": ["10/10/2000", "11/10/2000"], "value": [10, 13]})
        df_dt = df.copy()
        df_dt["date"] = pd.to_datetime(df_dt["date"])

        def sumfunc_value(x):
            return x.value.sum()

        expected = df.groupby(Grouper(key="date")).apply(sumfunc_value)
        result = df_dt.groupby(Grouper(freq="M", key="date")).apply(sumfunc_value)
        tm.assert_series_equal(
            result.reset_index(drop=True), expected.reset_index(drop=True)
        )

    def test_groupby_groups_datetimeindex(self):
        # GH#1430
        periods = 1000
        ind = date_range(start="2012/1/1", freq="5min", periods=periods)
        df = DataFrame(
            {"high": np.arange(periods), "low": np.arange(periods)}, index=ind
        )
        grouped = df.groupby(lambda x: datetime(x.year, x.month, x.day))

        # it works!
        groups = grouped.groups
        assert isinstance(list(groups.keys())[0], datetime)

        # GH#11442
        index = date_range("2015/01/01", periods=5, name="date")
        df = DataFrame({"A": [5, 6, 7, 8, 9], "B": [1, 2, 3, 4, 5]}, index=index)
        result = df.groupby(level="date").groups
        dates = ["2015-01-05", "2015-01-04", "2015-01-03", "2015-01-02", "2015-01-01"]
        expected = {
            Timestamp(date): DatetimeIndex([date], name="date") for date in dates
        }
        tm.assert_dict_equal(result, expected)

        grouped = df.groupby(level="date")
        for date in dates:
            result = grouped.get_group(date)
            data = [[df.loc[date, "A"], df.loc[date, "B"]]]
            expected_index = DatetimeIndex([date], name="date", freq="D")
            expected = DataFrame(data, columns=list("AB"), index=expected_index)
            tm.assert_frame_equal(result, expected)

    def test_groupby_groups_datetimeindex_tz(self):
        # GH 3950
        dates = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "datetime": dates,
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )
        df["datetime"] = df["datetime"].apply(lambda d: Timestamp(d, tz="US/Pacific"))

        exp_idx1 = DatetimeIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 09:00:00",
            ],
            tz="US/Pacific",
            name="datetime",
        )
        exp_idx2 = Index(["a", "b"] * 3, name="label")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(["datetime", "label"]).sum()
        tm.assert_frame_equal(result, expected)

        # by level
        didx = DatetimeIndex(dates, tz="Asia/Tokyo")
        df = DataFrame(
            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
            index=didx,
        )
        exp_idx = DatetimeIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            tz="Asia/Tokyo",
        )
        expected = DataFrame(
            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(level=0).sum()
        tm.assert_frame_equal(result, expected)

    def test_frame_datetime64_handling_groupby(self):
        # it works!
        df = DataFrame(
            [(3, np.datetime64("2012-07-03")), (3, np.datetime64("2012-07-04"))],
            columns=["a", "date"],
        )
        result = df.groupby("a").first()
        assert result["date"][3] == Timestamp("2012-07-03")

    def test_groupby_multi_timezone(self):
        # combining multiple / different timezones yields UTC
        data = """0,2000-01-28 16:47:00,America/Chicago
1,2000-01-29 16:48:00,America/Chicago
2,2000-01-30 16:49:00,America/Los_Angeles
3,2000-01-31 16:50:00,America/Chicago
4,2000-01-01 16:50:00,America/New_York"""

        df = pd.read_csv(StringIO(data), header=None, names=["value", "date", "tz"])
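
        # within apply, x.name is the group key (the tz string), so each
        # group's dates get localized to that group's own timezone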
        result = df.groupby("tz", group_keys=False).date.apply(
            lambda x: pd.to_datetime(x).dt.tz_localize(x.name)
        )

        expected = Series(
            [
                Timestamp("2000-01-28 16:47:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-29 16:48:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-30 16:49:00-0800", tz="America/Los_Angeles"),
                Timestamp("2000-01-31 16:50:00-0600", tz="America/Chicago"),
                Timestamp("2000-01-01 16:50:00-0500", tz="America/New_York"),
            ],
            name="date",
            dtype=object,
        )
        tm.assert_series_equal(result, expected)

        tz = "America/Chicago"
        res_values = df.groupby("tz").date.get_group(tz)
        result = pd.to_datetime(res_values).dt.tz_localize(tz)
        exp_values = Series(
            ["2000-01-28 16:47:00", "2000-01-29 16:48:00", "2000-01-31 16:50:00"],
            index=[0, 1, 3],
            name="date",
        )
        expected = pd.to_datetime(exp_values).dt.tz_localize(tz)
        tm.assert_series_equal(result, expected)

    def test_groupby_groups_periods(self):
        dates = [
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
            "2011-07-19 07:00:00",
            "2011-07-19 08:00:00",
            "2011-07-19 09:00:00",
        ]
        df = DataFrame(
            {
                "label": ["a", "a", "a", "b", "b", "b"],
                "period": [pd.Period(d, freq="H") for d in dates],
                "value1": np.arange(6, dtype="int64"),
                "value2": [1, 2] * 3,
            }
        )

        exp_idx1 = pd.PeriodIndex(
            [
                "2011-07-19 07:00:00",
                "2011-07-19 07:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 08:00:00",
                "2011-07-19 09:00:00",
                "2011-07-19 09:00:00",
            ],
            freq="H",
            name="period",
        )
        exp_idx2 = Index(["a", "b"] * 3, name="label")
        exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
        expected = DataFrame(
            {"value1": [0, 3, 1, 4, 2, 5], "value2": [1, 2, 2, 1, 1, 2]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(["period", "label"]).sum()
        tm.assert_frame_equal(result, expected)

        # by level
        didx = pd.PeriodIndex(dates, freq="H")
        df = DataFrame(
            {"value1": np.arange(6, dtype="int64"), "value2": [1, 2, 3, 1, 2, 3]},
            index=didx,
        )
        exp_idx = pd.PeriodIndex(
            ["2011-07-19 07:00:00", "2011-07-19 08:00:00", "2011-07-19 09:00:00"],
            freq="H",
        )
        expected = DataFrame(
            {"value1": [3, 5, 7], "value2": [2, 4, 6]},
            index=exp_idx,
            columns=["value1", "value2"],
        )

        result = df.groupby(level=0).sum()
        tm.assert_frame_equal(result, expected)

    def test_groupby_first_datetime64(self):
        df = DataFrame([(1, 1351036800000000000), (2, 1351036800000000000)])
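        # the raw int64 values are nanoseconds since the epoch (2012-10-24)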
        df[1] = df[1].view("M8[ns]")

        assert issubclass(df[1].dtype.type, np.datetime64)

        result = df.groupby(level=0).first()
        got_dt = result[1].dtype
        assert issubclass(got_dt.type, np.datetime64)

        result = df[1].groupby(level=0).first()
        got_dt = result.dtype
        assert issubclass(got_dt.type, np.datetime64)

    def test_groupby_max_datetime64(self):
        # GH 5869
        # datetimelike dtype conversion from int
        df = DataFrame({"A": Timestamp("20130101"), "B": np.arange(5)})
        expected = df.groupby("A")["A"].apply(lambda x: x.max())
        result = df.groupby("A")["A"].max()
        tm.assert_series_equal(result, expected)

    def test_groupby_datetime64_32_bit(self):
        # GH 6410 / numpy 4328
        # 32-bit under 1.9-dev indexing issue
        df = DataFrame({"A": range(2), "B": [Timestamp("2000-01-1")] * 2})
        result = df.groupby("A")["B"].transform(min)
        expected = Series([Timestamp("2000-01-1")] * 2, name="B")
        tm.assert_series_equal(result, expected)

    def test_groupby_with_timezone_selection(self):
        # GH 11616
        # Test that column selection returns output in correct timezone.
        np.random.seed(42)
        df = DataFrame(
            {
                "factor": np.random.randint(0, 3, size=60),
                "time": date_range("01/01/2000 00:00", periods=60, freq="s", tz="UTC"),
            }
        )
        df1 = df.groupby("factor").max()["time"]
        df2 = df.groupby("factor")["time"].max()
        tm.assert_series_equal(df1, df2)

    def test_timezone_info(self):
        # see gh-11682: Timezone info lost when broadcasting
        # scalar datetime to DataFrame
        df = DataFrame({"a": [1], "b": [datetime.now(pytz.utc)]})
        assert df["b"][0].tzinfo == pytz.utc
        df = DataFrame({"a": [1, 2, 3]})
        df["b"] = datetime.now(pytz.utc)
        assert df["b"][0].tzinfo == pytz.utc

    def test_datetime_count(self):
        df = DataFrame(
            {"a": [1, 2, 3] * 2, "dates": date_range("now", periods=6, freq="T")}
        )
        result = df.groupby("a").dates.count()
        expected = Series([2, 2, 2], index=Index([1, 2, 3], name="a"), name="dates")
        tm.assert_series_equal(result, expected)

    def test_first_last_max_min_on_time_data(self):
        # GH 10295
        # Verify that NaT is not in the result of max, min, first and last on
        # DataFrame with datetime or timedelta values.
        df_test = DataFrame(
            {
                "dt": [
                    np.nan,
                    "2015-07-24 10:10",
                    "2015-07-25 11:11",
                    "2015-07-23 12:12",
                    np.nan,
                ],
                "td": [
                    np.nan,
                    timedelta(days=1),
                    timedelta(days=2),
                    timedelta(days=3),
                    np.nan,
                ],
            }
        )
        df_test.dt = pd.to_datetime(df_test.dt)
        df_test["group"] = "A"
        df_ref = df_test[df_test.dt.notna()]

        grouped_test = df_test.groupby("group")
        grouped_ref = df_ref.groupby("group")
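
        # df_ref has the NaN/NaT rows removed, so identical results show the
        # reductions ignore missing datetimelike values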
        tm.assert_frame_equal(grouped_ref.max(), grouped_test.max())
        tm.assert_frame_equal(grouped_ref.min(), grouped_test.min())
        tm.assert_frame_equal(grouped_ref.first(), grouped_test.first())
        tm.assert_frame_equal(grouped_ref.last(), grouped_test.last())

    def test_nunique_with_timegrouper_and_nat(self):
        # GH 17575
        test = DataFrame(
            {
                "time": [
                    Timestamp("2016-06-28 09:35:35"),
                    pd.NaT,
                    Timestamp("2016-06-28 16:46:28"),
                ],
                "data": ["1", "2", "3"],
            }
        )

        grouper = Grouper(key="time", freq="h")
        result = test.groupby(grouper)["data"].nunique()
        expected = test[test.time.notnull()].groupby(grouper)["data"].nunique()
        expected.index = expected.index._with_freq(None)
        tm.assert_series_equal(result, expected)

    def test_scalar_call_versus_list_call(self):
        # Issue: 17530
        data_frame = {
            "location": ["shanghai", "beijing", "shanghai"],
            "time": Series(
                ["2017-08-09 13:32:23", "2017-08-11 23:23:15", "2017-08-11 22:23:15"],
                dtype="datetime64[ns]",
            ),
            "value": [1, 2, 3],
        }
        data_frame = DataFrame(data_frame).set_index("time")
        grouper = Grouper(freq="D")

        grouped = data_frame.groupby(grouper)
        result = grouped.count()
        grouped = data_frame.groupby([grouper])
        expected = grouped.count()

        tm.assert_frame_equal(result, expected)

    def test_grouper_period_index(self):
        # GH 32108
        periods = 2
        index = pd.period_range(
            start="2018-01", periods=periods, freq="M", name="Month"
        )
        period_series = Series(range(periods), index=index)
        result = period_series.groupby(period_series.index.month).sum()

        expected = Series(
            range(0, periods), index=Index(range(1, periods + 1), name=index.name)
        )
        tm.assert_series_equal(result, expected)

    def test_groupby_apply_timegrouper_with_nat_dict_returns(
        self, groupby_with_truncated_bingrouper
    ):
        # GH#43500 case where gb.grouper.result_index and gb.grouper.group_keys_seq
        # have different lengths that goes through the `isinstance(values[0], dict)`
        # path
        gb = groupby_with_truncated_bingrouper

        res = gb["Quantity"].apply(lambda x: {"foo": len(x)})

        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
        mi = MultiIndex.from_arrays([dti, ["foo"] * len(dti)])
        expected = Series([3, 0, 0, 0, 0, 0, 2], index=mi, name="Quantity")
        tm.assert_series_equal(res, expected)

    def test_groupby_apply_timegrouper_with_nat_scalar_returns(
        self, groupby_with_truncated_bingrouper
    ):
        # GH#43500 Previously raised ValueError bc used index with incorrect
        # length in wrap_applied_result
        gb = groupby_with_truncated_bingrouper

        res = gb["Quantity"].apply(lambda x: x.iloc[0] if len(x) else np.nan)

        dti = date_range("2013-09-01", "2013-10-01", freq="5D", name="Date")
        expected = Series(
            [18, np.nan, np.nan, np.nan, np.nan, np.nan, 5],
            index=dti._with_freq(None),
            name="Quantity",
        )
        tm.assert_series_equal(res, expected)

    def test_groupby_apply_timegrouper_with_nat_apply_squeeze(
        self, frame_for_truncated_bingrouper
    ):
        df = frame_for_truncated_bingrouper

        # We need to create a GroupBy object with only one non-NaT group,
        # so use a huge freq so that all non-NaT dates will be grouped together
        tdg = Grouper(key="Date", freq="100Y")
        gb = df.groupby(tdg)

        # check that we will go through the singular_series path
        # in _wrap_applied_output_series
        assert gb.ngroups == 1
        assert gb._selected_obj._get_axis(gb.axis).nlevels == 1

        # function that returns a Series
        res = gb.apply(lambda x: x["Quantity"] * 2)
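
        # with a single group, the returned Series is "squeezed" into a one-row
        # frame: the NaT row (label 4) is excluded, and the surviving row
        # labels, ordered by Date, become the columns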
        expected = DataFrame(
            [[36, 6, 6, 10, 2]],
            index=Index([Timestamp("2013-12-31")], name="Date"),
            columns=Index([0, 1, 5, 2, 3], name="Quantity"),
        )
        tm.assert_frame_equal(res, expected)

    @td.skip_if_no("numba")
    def test_groupby_agg_numba_timegrouper_with_nat(
        self, groupby_with_truncated_bingrouper
    ):
        # See discussion in GH#43487
        gb = groupby_with_truncated_bingrouper
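
        # with engine="numba", the UDF is called as f(values, index); the
        # result should match the plain (non-numba) aggregation even with a
        # NaT bin present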
        result = gb["Quantity"].aggregate(
            lambda values, index: np.nanmean(values), engine="numba"
        )

        expected = gb["Quantity"].aggregate(np.nanmean)
        tm.assert_series_equal(result, expected)

        result_df = gb[["Quantity"]].aggregate(
            lambda values, index: np.nanmean(values), engine="numba"
        )
        expected_df = gb[["Quantity"]].aggregate(np.nanmean)
        tm.assert_frame_equal(result_df, expected_df)