test_apply.py 40 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341
  1. from datetime import (
  2. date,
  3. datetime,
  4. )
  5. from io import StringIO
  6. import numpy as np
  7. import pytest
  8. import pandas as pd
  9. from pandas import (
  10. DataFrame,
  11. Index,
  12. MultiIndex,
  13. Series,
  14. bdate_range,
  15. )
  16. import pandas._testing as tm
  17. from pandas.tests.groupby import get_groupby_method_args
  def test_apply_issues():
      """Compare ``groupby(...).apply`` with direct calls on date-like keys.

      The first half (GH 5788) groups a timestamp-indexed frame by calendar
      date and checks that ``apply(lambda x: x.idxmax())`` equals ``idxmax()``.
      The second half (GH 5789) groups by the raw date strings and checks that
      the group keys come back as an object-dtype index (no date coercion).
      """
      csv_text = "\n".join(
          [
              "2011.05.16,00:00,1.40893",
              "2011.05.16,01:00,1.40760",
              "2011.05.16,02:00,1.40750",
              "2011.05.16,03:00,1.40649",
              "2011.05.17,02:00,1.40893",
              "2011.05.17,03:00,1.40760",
              "2011.05.17,04:00,1.40750",
              "2011.05.17,05:00,1.40649",
              "2011.05.18,02:00,1.40893",
              "2011.05.18,03:00,1.40760",
              "2011.05.18,04:00,1.40750",
              "2011.05.18,05:00,1.40649",
          ]
      )
      column_names = ["date", "time", "value"]

      # GH 5788
      # Combine the date and time columns by hand instead of relying on the
      # nested ``parse_dates=[["date", "time"]]`` form of ``read_csv``, which
      # newer pandas releases deprecate.  The explicit format keeps parsing
      # deterministic for the dotted dates.
      df = pd.read_csv(StringIO(csv_text), header=None, names=column_names)
      df["date_time"] = pd.to_datetime(
          df.pop("date") + " " + df.pop("time"), format="%Y.%m.%d %H:%M"
      )
      df = df.set_index("date_time")

      expected = df.groupby(df.index.date).idxmax()
      result = df.groupby(df.index.date).apply(lambda x: x.idxmax())
      tm.assert_frame_equal(result, expected)

      # GH 5789
      # don't auto coerce dates
      df = pd.read_csv(StringIO(csv_text), header=None, names=column_names)
      exp_idx = Index(
          ["2011.05.16", "2011.05.17", "2011.05.18"], dtype=object, name="date"
      )
      expected = Series(["00:00", "02:00", "02:00"], index=exp_idx)
      result = df.groupby("date", group_keys=False).apply(
          lambda x: x["time"][x["value"].idxmax()]
      )
      tm.assert_series_equal(result, expected)
  def test_apply_trivial():
      """Column-wise apply that ignores its input and returns a constant frame.

      GH 20066: the columns are grouped by their dtype names and every group
      maps to ``df.iloc[1:]``; the result is compared with that slice
      concatenated once per dtype key.
      """
      frame = DataFrame(
          {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
          columns=["key", "data"],
      )
      dtype_labels = [str(dtype) for dtype in frame.dtypes]

      expected = pd.concat(
          [frame.iloc[1:], frame.iloc[1:]], axis=1, keys=["float64", "object"]
      )
      by_dtype = frame.groupby(dtype_labels, axis=1)
      result = by_dtype.apply(lambda _chunk: frame.iloc[1:])

      tm.assert_frame_equal(result, expected)
  def test_apply_trivial_fail():
      """Column-wise apply returning the whole original frame for every group.

      GH 20066: same setup as the trivial case, but the applied function hands
      back the full frame and ``group_keys=True`` is passed explicitly.
      """
      frame = DataFrame(
          {"key": ["a", "a", "b", "b", "a"], "data": [1.0, 2.0, 3.0, 4.0, 5.0]},
          columns=["key", "data"],
      )
      dtype_labels = [str(dtype) for dtype in frame.dtypes]

      expected = pd.concat([frame, frame], axis=1, keys=["float64", "object"])
      by_dtype = frame.groupby(dtype_labels, axis=1, group_keys=True)
      result = by_dtype.apply(lambda _chunk: frame)

      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "df, group_names",
      [
          (DataFrame({"a": [1, 1, 1, 2, 3], "b": ["a", "a", "a", "b", "c"]}), [1, 2, 3]),
          (DataFrame({"a": [0, 0, 1, 1], "b": [0, 1, 0, 1]}), [0, 1]),
          (DataFrame({"a": [1]}), [1]),
          (DataFrame({"a": [1, 1, 1, 2, 2, 1, 1, 2], "b": range(8)}), [1, 2]),
          (DataFrame({"a": [1, 2, 3, 1, 2, 3], "two": [4, 5, 6, 7, 8, 9]}), [1, 2, 3]),
          (
              DataFrame(
                  {
                      "a": list("aaabbbcccc"),
                      "B": [3, 4, 3, 6, 5, 2, 1, 9, 5, 4],
                      "C": [4, 0, 2, 2, 2, 7, 8, 6, 2, 8],
                  }
              ),
              ["a", "b", "c"],
          ),
          (DataFrame([[1, 2, 3], [2, 2, 3]], columns=["a", "b", "c"]), [1, 2]),
      ],
      ids=[
          "GH2936",
          "GH7739 & GH10519",
          "GH10519",
          "GH2656",
          "GH12155",
          "GH20084",
          "GH21417",
      ],
  )
  def test_group_apply_once_per_group(df, group_names):
      """The applied function must run exactly once for each group.

      GH2936, GH7739, GH10519, GH2656, GH12155, GH20084, GH21417.
      Previously the function was evaluated twice on the first group to check
      whether the Cython index slider was safe to use.  Every helper below
      appends the group name to ``names`` as a side effect, so a double
      evaluation shows up as a duplicated entry.
      """
      names = []

      # The helpers cannot be parametrized: they all need to close over the
      # same ``names`` list in order to expose their side effects.

      def f_copy(group):
          # takes the fast apply path
          names.append(group.name)
          return group.copy()

      def f_nocopy(group):
          # takes the slow apply path
          names.append(group.name)
          return group

      def f_scalar(group):
          # GH7739, GH2656
          names.append(group.name)
          return 0

      def f_none(group):
          # GH10519, GH12155, GH21417
          names.append(group.name)

      def f_constant_df(group):
          # GH2936, GH20084
          names.append(group.name)
          return DataFrame({"a": [1], "b": [1]})

      for applied in (f_copy, f_nocopy, f_scalar, f_none, f_constant_df):
          names.clear()
          df.groupby("a", group_keys=False).apply(applied)
          assert names == group_names
  def test_group_apply_once_per_group2(capsys):
      """Count calls of the applied function through captured stdout.

      GH 31111: groupby-apply needs to execute the function once per distinct
      value of the grouping column, which is two times for this frame.
      """
      # Number of times `apply` should call the function for this frame
      expected = 2

      labels = ["0", "2", "4", "6", "8", "10", "12", "14"]
      frame = DataFrame(
          {
              "group_by_column": [0, 0, 0, 0, 1, 1, 1, 1],
              "test_column": labels,
          },
          index=labels,
      )

      def announce(_group):
          return print("function_called")

      frame.groupby("group_by_column", group_keys=False).apply(announce)

      captured = capsys.readouterr()
      result = captured.out.count("function_called")
      # If `groupby` behaves unexpectedly, this test will break
      assert result == expected
  def test_apply_fast_slow_identical():
      """Returning the group itself or a copy of it must give equal results.

      GH 31613: for simple index structures fast/slow apply is chosen using an
      identity check on input and output, so both variants are exercised.
      """
      frame = DataFrame({"A": [0, 0, 1], "b": range(3)})

      def return_copy(group):
          return group.copy()

      def return_same(group):
          return group

      fast_df = frame.groupby("A", group_keys=False).apply(return_copy)
      slow_df = frame.groupby("A", group_keys=False).apply(return_same)

      tm.assert_frame_equal(fast_df, slow_df)
  @pytest.mark.parametrize(
      "func",
      [
          lambda x: x,
          lambda x: x[:],
          lambda x: x.copy(deep=False),
          lambda x: x.copy(deep=True),
      ],
  )
  def test_groupby_apply_identity_maybecopy_index_identical(func):
      """Identity-like functions must round-trip the frame unchanged.

      GH 14927: whether the function returns a copy of the input data or not
      should not have an impact on the index structure of the result, since
      this is not transparent to the user.
      """
      frame = DataFrame({"g": [1, 2, 2, 2], "a": [1, 2, 3, 4], "b": [5, 6, 7, 8]})
      grouped = frame.groupby("g", group_keys=False)

      tm.assert_frame_equal(grouped.apply(func), frame)
  def test_apply_with_mixed_dtype():
      """Dtypes survive a row-wise identity apply and an ``as_index=False`` mean.

      GH3480: apply with mixed dtype on axis=1 broke in 0.11.
      GH 3610: incorrect dtype conversion with ``as_index=False``.
      """
      # GH3480
      mixed = DataFrame(
          {
              "foo1": np.random.randn(6),
              "foo2": ["one", "two", "two", "three", "one", "two"],
          }
      )
      result = mixed.apply(lambda row: row, axis=1).dtypes
      tm.assert_series_equal(result, mixed.dtypes)

      # GH 3610
      halves = DataFrame({"c1": [1, 2, 6, 6, 8]})
      halves["c2"] = halves.c1 / 2.0
      via_reset_index = halves.groupby("c2").mean().reset_index().c2
      via_as_index_false = halves.groupby("c2", as_index=False).mean().c2
      tm.assert_series_equal(via_reset_index, via_as_index_false)
  def test_groupby_as_index_apply():
      """Index of ``head`` and ``apply`` results with and without ``as_index``.

      GH #4648 and #3417.  ``head(2)`` keeps the original row labels for both
      settings, while ``apply(lambda x: x.head(2))`` yields a MultiIndex whose
      first level depends on ``as_index``.
      """
      frame = DataFrame(
          {
              "item_id": ["b", "b", "a", "c", "a", "b"],
              "user_id": [1, 2, 1, 1, 3, 1],
              "time": range(6),
          }
      )
      g_as = frame.groupby("user_id", as_index=True)
      g_not_as = frame.groupby("user_id", as_index=False)

      res_as = g_as.head(2).index
      res_not_as = g_not_as.head(2).index
      head_index = Index([0, 1, 2, 4])
      tm.assert_index_equal(res_as, head_index)
      tm.assert_index_equal(res_not_as, head_index)

      def first_two(group):
          return group.head(2)

      res_as_apply = g_as.apply(first_two).index
      res_not_as_apply = g_not_as.apply(first_two).index

      # apply doesn't maintain the original ordering
      # changed in GH5610 as the as_index=False returns a MI here
      exp_not_as_apply = MultiIndex.from_tuples([(0, 0), (0, 2), (1, 1), (2, 4)])
      exp_as_apply = MultiIndex.from_tuples(
          [(1, 0), (1, 2), (2, 1), (3, 4)], names=["user_id", None]
      )
      tm.assert_index_equal(res_as_apply, exp_as_apply)
      tm.assert_index_equal(res_not_as_apply, exp_not_as_apply)

      # identity apply with group_keys=False keeps the original labels
      letters = Index(list("abcde"))
      lettered = DataFrame([[1, 2], [2, 3], [1, 4], [1, 5], [2, 6]], index=letters)
      grouped = lettered.groupby(0, as_index=False, group_keys=False)
      tm.assert_index_equal(grouped.apply(lambda x: x).index, letters)
  def test_apply_concat_preserve_names(three_group):
      """Inner index names are kept only when every group agrees on them.

      Each helper returns ``group.describe()`` with a name set on its index;
      the concatenated result keeps that name as the third level unless the
      names differ between groups, in which case the level name is ``None``.
      """
      grouped = three_group.groupby(["A", "B"])

      def described(group, index_name):
          stats = group.describe()
          stats.index.name = index_name
          return stats

      def desc(group):
          return described(group, "stat")

      def desc2(group):
          # weirdo: truncated to the group length
          return described(group, "stat")[: len(group)]

      def desc3(group):
          # names are different from group to group
          # weirdo: truncated to the group length
          return described(group, f"stat_{len(group):d}")[: len(group)]

      result = grouped.apply(desc)
      assert result.index.names == ("A", "B", "stat")

      result2 = grouped.apply(desc2)
      assert result2.index.names == ("A", "B", "stat")

      result3 = grouped.apply(desc3)
      assert result3.index.names == ("A", "B", None)
  def test_apply_series_to_frame():
      """A Series groupby whose function returns a frame produces a DataFrame.

      The result must keep the original index and must not expose a ``name``
      attribute (GH49907).
      """
      dates = bdate_range("1/1/2000", periods=100)
      ts = Series(np.random.randn(100), index=dates)

      def expand(piece):
          # log of negative values is expected here; silence the warning
          with np.errstate(invalid="ignore"):
              logged = np.log(piece)
          columns = {
              "value": piece,
              "demeaned": piece - piece.mean(),
              "logged": logged,
          }
          return DataFrame(columns)

      by_month = ts.groupby(lambda stamp: stamp.month, group_keys=False)
      result = by_month.apply(expand)

      assert isinstance(result, DataFrame)
      assert not hasattr(result, "name")  # GH49907
      tm.assert_index_equal(result.index, ts.index)
  def test_apply_series_yield_constant(df):
      """Applying ``len`` to a column grouped by two keys keeps both key names."""
      lengths = df.groupby(["A", "B"])["C"].apply(len)
      leading_names = lengths.index.names[:2]
      assert leading_names == ("A", "B")
  def test_apply_frame_yield_constant(df):
      """Applying ``len`` to a frame groupby gives an unnamed Series (GH13568).

      Checked both for the full groupby and for a two-column selection.
      """
      by_keys = df.groupby(["A", "B"])

      result = by_keys.apply(len)
      assert isinstance(result, Series)
      assert result.name is None

      result = by_keys[["C", "D"]].apply(len)
      assert isinstance(result, Series)
      assert result.name is None
  def test_apply_frame_to_series(df):
      """``apply(len)`` on a frame groupby matches the ``C`` column of ``count()``.

      Index and values are compared separately.
      """
      by_keys = df.groupby(["A", "B"])
      expected = by_keys.count()["C"]
      result = by_keys.apply(len)

      tm.assert_index_equal(result.index, expected.index)
      tm.assert_numpy_array_equal(result.values, expected.values)
  def test_apply_frame_not_as_index_column_name(df):
      """``apply(len)`` with ``as_index=False`` on a two-key groupby.

      GH 35964 - path within _wrap_applied_output not hit by a test.
      """
      by_keys = df.groupby(["A", "B"], as_index=False)
      result = by_keys.apply(len)

      counts = by_keys.count()
      expected = counts.rename(columns={"C": np.nan}).drop(columns="D")

      # TODO(GH#34306): Use assert_frame_equal when column name is not np.nan
      tm.assert_index_equal(result.index, expected.index)
      tm.assert_numpy_array_equal(result.values, expected.values)
  def test_apply_frame_concat_series():
      """Frame-level and column-level apply must concatenate to the same Series.

      Both helpers sum ``C`` per ``B`` inside each ``A`` group and keep the two
      smallest sums; one starts from the frame group, the other from the ``C``
      Series group.  The frame-level result must be named ``C``.
      """
      df = DataFrame(
          {
              "A": np.random.randint(0, 5, 1000),
              "B": np.random.randint(0, 5, 1000),
              "C": np.random.randn(1000),
          }
      )

      def trans(frame_group):
          sums = frame_group.groupby("B")["C"].sum()
          return sums.sort_values().iloc[:2]

      def trans2(series_group):
          # look the "B" labels up in the outer frame for this group's rows
          b_labels = df.reindex(series_group.index)["B"]
          sums = series_group.groupby(b_labels).sum()
          return sums.sort_values().iloc[:2]

      result = df.groupby("A").apply(trans)
      exp = df.groupby("A")["C"].apply(trans2)

      tm.assert_series_equal(result, exp, check_names=False)
      assert result.name == "C"
  def test_apply_transform(ts):
      """Doubling each month group via ``apply`` equals doing so via ``transform``."""

      def double(values):
          return values * 2

      by_month = ts.groupby(lambda stamp: stamp.month, group_keys=False)
      result = by_month.apply(double)
      expected = by_month.transform(double)
      tm.assert_series_equal(result, expected)
  def test_apply_multikey_corner(tsframe):
      """Apply over a (year, month) groupby can be looked up again per key.

      For each group, the slice of the combined result at that key must equal
      the function applied to the group directly.
      """
      by_year_month = tsframe.groupby(
          [lambda stamp: stamp.year, lambda stamp: stamp.month]
      )

      def last_five_by_a(group):
          ordered = group.sort_values("A")
          return ordered[-5:]

      result = by_year_month.apply(last_five_by_a)
      for key, group in by_year_month:
          tm.assert_frame_equal(result.loc[key], last_five_by_a(group))
  @pytest.mark.parametrize("group_keys", [True, False])
  def test_apply_chunk_view(group_keys):
      """Taking the first two rows of each group gives the expected rows.

      Low level tinkering could be unsafe, make sure not.  With
      ``group_keys=True`` the expected index gains a leading ``key`` level.
      """
      frame = DataFrame({"key": [1, 1, 1, 2, 2, 2, 3, 3, 3], "value": range(9)})

      grouped = frame.groupby("key", group_keys=group_keys)
      result = grouped.apply(lambda chunk: chunk.iloc[:2])

      expected = frame.take([0, 1, 3, 4, 6, 7])
      if group_keys:
          key_level = [1, 1, 2, 2, 3, 3]
          expected.index = MultiIndex.from_arrays(
              [key_level, expected.index], names=["key", None]
          )

      tm.assert_frame_equal(result, expected)
  def test_apply_no_name_column_conflict():
      """Smoke test: grouping by a column called ``name`` must not raise (#2605).

      The applied function sorts each group in place and returns ``None``;
      nothing is asserted beyond the call completing.
      """
      frame = DataFrame(
          {
              "name": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2],
              "name2": [0, 0, 0, 1, 1, 1, 0, 0, 1, 1],
              "value": range(9, -1, -1),
          }
      )

      def sort_in_place(group):
          return group.sort_values("value", inplace=True)

      # it works! #2605
      frame.groupby(["name", "name2"]).apply(sort_in_place)
  def test_apply_typecast_fail():
      """A function that adds a min-max scaled column to each group.

      The groups are returned after being mutated, and the combined result
      must equal the original frame plus the new ``v2`` column.
      """
      frame = DataFrame(
          {
              "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
              "c": np.tile(["a", "b", "c"], 2),
              "v": np.arange(1.0, 7.0),
          }
      )

      def add_scaled(group):
          values = group["v"]
          low, high = values.min(), values.max()
          group["v2"] = (values - low) / (high - low)
          return group

      result = frame.groupby("d", group_keys=False).apply(add_scaled)

      expected = frame.copy()
      expected["v2"] = np.tile([0.0, 0.5, 1], 2)
      tm.assert_frame_equal(result, expected)
  def test_apply_multiindex_fail():
      """Min-max scaling per group on a frame that carries a MultiIndex.

      Same check as the flat-index variant: the result must equal the original
      frame plus the new ``v2`` column, with the MultiIndex intact.
      """
      two_level = MultiIndex.from_arrays([[0, 0, 0, 1, 1, 1], [1, 2, 3, 1, 2, 3]])
      frame = DataFrame(
          {
              "d": [1.0, 1.0, 1.0, 2.0, 2.0, 2.0],
              "c": np.tile(["a", "b", "c"], 2),
              "v": np.arange(1.0, 7.0),
          },
          index=two_level,
      )

      def add_scaled(group):
          values = group["v"]
          low, high = values.min(), values.max()
          group["v2"] = (values - low) / (high - low)
          return group

      result = frame.groupby("d", group_keys=False).apply(add_scaled)

      expected = frame.copy()
      expected["v2"] = np.tile([0.0, 0.5, 1], 2)
      tm.assert_frame_equal(result, expected)
  def test_apply_corner(tsframe):
      """Doubling every year group via apply equals doubling the whole frame."""
      by_year = tsframe.groupby(lambda stamp: stamp.year, group_keys=False)
      result = by_year.apply(lambda chunk: chunk * 2)
      tm.assert_frame_equal(result, tsframe * 2)
  def test_apply_without_copy():
      """Returning the group itself instead of a copy must not change the result.

      GH 5545: returning a non-copy in an applied function fails.  Both filters
      keep only category ``"c"`` rows of multi-row groups; they differ only in
      whether a single-row group is returned as-is or copied.
      """
      data = DataFrame(
          {
              "id_field": [100, 100, 200, 300],
              "category": ["a", "b", "c", "c"],
              "value": [1, 2, 3, 4],
          }
      )

      def filt1(chunk):
          if chunk.shape[0] != 1:
              return chunk[chunk.category == "c"]
          return chunk.copy()

      def filt2(chunk):
          if chunk.shape[0] != 1:
              return chunk[chunk.category == "c"]
          return chunk

      expected = data.groupby("id_field").apply(filt1)
      result = data.groupby("id_field").apply(filt2)
      tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("test_series", [True, False])
  def test_apply_with_duplicated_non_sorted_axis(test_series):
      """Identity apply on data whose axis has duplicated, unsorted labels.

      GH 30667.  The order is not expected to remain the same for a duplicated
      axis, so both sides are sorted before comparison.
      """
      df = DataFrame(
          [["x", "p"], ["x", "p"], ["x", "o"]], columns=["X", "Y"], index=[1, 2, 2]
      )

      if not test_series:
          result = df.groupby("Y", group_keys=False).apply(lambda x: x)
          tm.assert_frame_equal(result.sort_values("Y"), df.sort_values("Y"))
          return

      ser = df.set_index("Y")["X"]
      result = ser.groupby(level=0, group_keys=False).apply(lambda x: x)
      tm.assert_series_equal(result.sort_index(), ser.sort_index())
  def test_apply_reindex_values():
      """Reindexing inside apply on a column with duplicated index labels.

      GH: 26209 - reindexing from a single column of a groupby object with
      duplicate indices caused a ValueError (cannot reindex from duplicate
      axis) in 0.24.2; the problem was solved in #30679.
      """
      values = [1, 2, 3, 4]
      indices = [1, 1, 2, 2]
      frame = DataFrame(
          {"group": ["Group1", "Group2"] * 2, "value": values}, index=indices
      )
      expected = Series(values, index=indices, name="value")

      def reindex_helper(column):
          first, last = column.index.min(), column.index.max()
          return column.reindex(np.arange(first, last + 1))

      # the following group by raised a ValueError
      value_groups = frame.groupby("group", group_keys=False).value
      result = value_groups.apply(reindex_helper)
      tm.assert_series_equal(expected, result)
  def test_apply_corner_cases():
      """A column added inside the applied function shows up in the result.

      #535, can't use sliding iterator.
      """
      n_rows = 1000
      labels = np.random.randint(0, 100, size=n_rows)
      frame = DataFrame(
          {
              "key": labels,
              "value1": np.random.randn(n_rows),
              "value2": ["foo", "bar", "baz", "qux"] * (n_rows // 4),
          }
      )

      def add_doubled(group):
          group["value3"] = group["value1"] * 2
          return group

      result = frame.groupby("key", group_keys=False).apply(add_doubled)
      assert "value3" in result
  def test_apply_numeric_coercion_when_datetime():
      """String columns must not be coerced to numbers next to datetime columns.

      In the past, group-by/apply operations have been over-eager in
      converting dtypes to numeric in the presence of datetime columns.
      Various GH issues were filed; their reproductions are collected here.
      """
      # GH 15670
      df = DataFrame(
          {"Number": [1, 2], "Date": ["2017-03-02"] * 2, "Str": ["foo", "inf"]}
      )
      # reference result while "Date" still holds plain strings
      expected = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
      df.Date = pd.to_datetime(df.Date)
      result = df.groupby(["Number"]).apply(lambda x: x.iloc[0])
      tm.assert_series_equal(result["Str"], expected["Str"])

      # GH 15421
      df = DataFrame(
          {"A": [10, 20, 30], "B": ["foo", "3", "4"], "T": [pd.Timestamp("12:31:22")] * 3}
      )

      def get_B(g):
          first_row = g.iloc[0]
          return first_row[["B"]]

      result = df.groupby("A").apply(get_B)["B"]
      expected = df.B
      expected.index = df.A
      tm.assert_series_equal(result, expected)

      # GH 14423
      def predictions(tool):
          out = Series(index=["p1", "p2", "useTime"], dtype=object)
          if "step1" in list(tool.State):
              step1_rows = tool[tool.State == "step1"]
              out["p1"] = str(step1_rows.Machine.values[0])
          if "step2" in list(tool.State):
              out["p2"] = str(tool[tool.State == "step2"].Machine.values[0])
              out["useTime"] = str(tool[tool.State == "step2"].oTime.values[0])
          return out

      df1 = DataFrame(
          {
              "Key": ["B", "B", "A", "A"],
              "State": ["step1", "step2", "step1", "step2"],
              "oTime": ["", "2016-09-19 05:24:33", "", "2016-09-19 23:59:04"],
              "Machine": ["23", "36L", "36R", "36R"],
          }
      )
      df2 = df1.copy()
      df2.oTime = pd.to_datetime(df2.oTime)

      expected = df1.groupby("Key").apply(predictions).p1
      result = df2.groupby("Key").apply(predictions).p1
      tm.assert_series_equal(expected, result)
  def test_apply_aggregating_timedelta_and_datetime():
      """Aggregating timedelta and datetime columns into one Series per group.

      Regression test for GH 15562: the groupby below caused ValueErrors and
      IndexErrors pre 0.20.0.
      """
      stamp = np.datetime64("2017-02-01 00:00:00")
      clients = ["A", "B", "C"]

      df = DataFrame({"clientid": clients, "datetime": [stamp] * 3})
      df["time_delta_zero"] = df.datetime - df.datetime

      def summarize(ddf):
          return Series(
              {"clientid_age": ddf.time_delta_zero.min(), "date": ddf.datetime.min()}
          )

      result = df.groupby("clientid").apply(summarize)

      expected = DataFrame(
          {
              "clientid": clients,
              "clientid_age": [np.timedelta64(0, "D")] * 3,
              "date": [stamp] * 3,
          }
      ).set_index("clientid")
      tm.assert_frame_equal(result, expected)
  def test_apply_groupby_datetimeindex():
      """Group a frame that carries a DatetimeIndex and sum per name.

      GH 26182: groupby apply failed on dataframe with DatetimeIndex.
      """
      rows = [["A", 10], ["B", 20], ["B", 30], ["C", 40], ["C", 50]]
      days = pd.date_range("2020-09-01", "2020-09-05")
      frame = DataFrame(rows, columns=["Name", "Value"], index=days)

      result = frame.groupby("Name").sum()

      expected = DataFrame({"Name": ["A", "B", "C"], "Value": [10, 50, 90]})
      expected = expected.set_index("Name")
      tm.assert_frame_equal(result, expected)
  def test_time_field_bug():
      """Apply on a frame with a time-based non-key column.

      Test a fix for an error related to GH issue 11324: when non-key fields
      in a group-by dataframe contained time-based fields that were not
      returned by the apply function, an exception would be raised.
      """
      frame = DataFrame({"a": 1, "b": [datetime.now() for _ in range(10)]})

      def func_with_no_date(batch):
          return Series({"c": 2})

      def func_with_date(batch):
          return Series({"b": datetime(2015, 1, 1), "c": 2})

      # result that does not carry the datetime column
      dfg_no_conversion = frame.groupby(by=["a"]).apply(func_with_no_date)
      dfg_no_conversion_expected = DataFrame({"c": 2}, index=[1])
      dfg_no_conversion_expected.index.name = "a"

      # result that does carry a datetime column
      dfg_conversion = frame.groupby(by=["a"]).apply(func_with_date)
      dfg_conversion_expected = DataFrame(
          {"b": datetime(2015, 1, 1), "c": 2}, index=[1]
      )
      dfg_conversion_expected.index.name = "a"

      tm.assert_frame_equal(dfg_no_conversion, dfg_no_conversion_expected)
      tm.assert_frame_equal(dfg_conversion, dfg_conversion_expected)
  def test_gb_apply_list_of_unequal_len_arrays():
      """Smoke test: per-group arrays of different lengths must not raise.

      GH1738.  Nothing is asserted beyond the apply call completing.
      """
      frame = DataFrame(
          {
              "group1": ["a", "a", "a", "b", "b", "b", "a", "a", "a", "b", "b", "b"],
              "group2": ["c", "c", "d", "d", "d", "e", "c", "c", "d", "d", "d", "e"],
              "weight": [1.1, 2, 3, 4, 5, 6, 2, 4, 6, 8, 1, 2],
              "value": [7.1, 8, 9, 10, 11, 12, 8, 7, 6, 5, 4, 3],
          }
      )
      levels = ["group1", "group2"]
      frame = frame.set_index(levels)
      df_grouped = frame.groupby(level=levels, sort=True)

      def noddy(value, weight):
          return np.array(value * weight).repeat(3)

      # the kernel function returns arrays of unequal length
      # pandas sniffs the first one, sees it's an array and not
      # a list, and assumed the rest are of equal length
      # and so tries a vstack
      # don't die
      df_grouped.apply(lambda chunk: noddy(chunk.value, chunk.weight))
  def test_groupby_apply_all_none():
      """An applied function that returns ``None`` for every group.

      Tests to make sure no errors if apply function returns all None values.
      Issue 9684.  The result is compared with an empty ``DataFrame``.
      """
      test_df = DataFrame({"groups": [0, 0, 1, 1], "random_vars": [8, 7, 4, 5]})

      def return_nothing(group):
          return None

      result = test_df.groupby("groups").apply(return_nothing)
      tm.assert_frame_equal(result, DataFrame())
  def test_groupby_apply_none_first():
      """``None`` for some groups, whether it is the first or the last return.

      GH 12824. Tests if apply returns None first.  Groups with fewer than two
      rows yield ``None``; the others yield their first and last row.
      """
      test_df1 = DataFrame({"groups": [1, 1, 1, 2], "vars": [0, 1, 2, 3]})
      test_df2 = DataFrame({"groups": [1, 2, 2, 2], "vars": [0, 1, 2, 3]})

      def first_and_last(chunk):
          if chunk.shape[0] >= 2:
              return chunk.iloc[[0, -1]]
          return None

      result1 = test_df1.groupby("groups").apply(first_and_last)
      result2 = test_df2.groupby("groups").apply(first_and_last)

      level_names = ["groups", None]
      index1 = MultiIndex.from_arrays([[1, 1], [0, 2]], names=level_names)
      index2 = MultiIndex.from_arrays([[2, 2], [1, 3]], names=level_names)
      expected1 = DataFrame({"groups": [1, 1], "vars": [0, 2]}, index=index1)
      expected2 = DataFrame({"groups": [2, 2], "vars": [1, 3]}, index=index2)

      tm.assert_frame_equal(result1, expected1)
      tm.assert_frame_equal(result2, expected2)
  def test_groupby_apply_return_empty_chunk():
      """A filter inside apply that leaves one of the groups empty.

      GH 22221: apply filter which returns some empty groups.
      """
      frame = DataFrame({"value": [0, 1], "group": ["filled", "empty"]})

      def drop_ones(group):
          kept = group[group.value != 1]
          return kept["value"]

      result = frame.groupby("group").apply(drop_ones)

      full_index = MultiIndex.from_product(
          [["empty", "filled"], [0]], names=["group", None]
      )
      expected = Series([0], name="value", index=full_index.drop("empty"))
      tm.assert_series_equal(result, expected)
  def test_apply_with_mixed_types():
      """Per-group normalisation gives the same frame via transform and apply.

      gh-20949
      """
      frame = DataFrame({"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]})
      grouped = frame.groupby("A", group_keys=False)
      expected = DataFrame({"B": [1 / 3.0, 2 / 3.0, 1], "C": [0.4, 0.6, 1.0]})

      def share_of_total(chunk):
          return chunk / chunk.sum()

      result = grouped.transform(share_of_total)
      tm.assert_frame_equal(result, expected)

      result = grouped.apply(share_of_total)
      tm.assert_frame_equal(result, expected)
  def test_func_returns_object():
      """An applied function returning an ``Index`` gives a Series of indexes.

      GH 28652
      """
      frame = DataFrame({"a": [1, 2]}, index=Index([1, 2]))
      result = frame.groupby("a").apply(lambda group: group.index)

      group_keys = Index([1, 2], name="a")
      expected = Series([Index([1]), Index([2])], index=group_keys)
      tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize(
      "group_column_dtlike",
      [datetime.today(), datetime.today().date(), datetime.today().time()],
  )
  def test_apply_datetime_issue(group_column_dtlike):
      """Apply on a frame holding a datetime-like object in a non-key column.

      GH-28247: groupby-apply throws an error if one of the columns in the
      DataFrame is a datetime object and the column labels are different from
      standard int values in range(len(num_columns)).
      """
      frame = DataFrame({"a": ["foo"], "b": [group_column_dtlike]})

      def spam_at_42(_group):
          return Series(["spam"], index=[42])

      result = frame.groupby("a").apply(spam_at_42)

      key_index = Index(["foo"], dtype="object", name="a")
      expected = DataFrame(["spam"], key_index, columns=[42])
      tm.assert_frame_equal(result, expected)
  def test_apply_series_return_dataframe_groups():
      """Most common value per column, computed per day group.

      GH 10078.  The applied function returns a Series keyed by column name;
      only the ``userId`` column of the combined result is checked.
      """
      user_agents = [
          "some UA string",
          "some UA string",
          "some UA string",
          "another UA string",
          "some UA string",
      ]
      tdf = DataFrame(
          {
              "day": dict.fromkeys(range(5), pd.Timestamp("2015-02-24 00:00:00")),
              "userAgent": dict(enumerate(user_agents)),
              "userId": dict.fromkeys(range(5), "17661101"),
          }
      )

      def most_common_values(group):
          top_values = {}
          for column, values in group.items():
              top_values[column] = values.value_counts().index[0]
          return Series(top_values)

      result = tdf.groupby("day").apply(most_common_values)["userId"]

      day_index = pd.DatetimeIndex(["2015-02-24"], name="day")
      expected = Series(["17661101"], index=day_index, name="userId")
      tm.assert_series_equal(result, expected)
  @pytest.mark.parametrize("category", [False, True])
  def test_apply_multi_level_name(category):
      """Group by the name of one level of a two-level index and sum.

      https://github.com/pandas-dev/pandas/issues/31068
      The index names of the input frame must be unchanged afterwards.
      """
      level_b = [1, 2] * 5
      if not category:
          expected_index = Index([1, 2], name="B")
          expected_values = [20, 25]
      else:
          level_b = pd.Categorical(level_b, categories=[1, 2, 3])
          expected_index = pd.CategoricalIndex(
              [1, 2, 3], categories=[1, 2, 3], name="B"
          )
          # GH#40669 - summing an empty frame gives float dtype
          expected_values = [20.0, 25.0, 0.0]

      expected = DataFrame(
          {"C": expected_values, "D": expected_values}, index=expected_index
      )

      flat = DataFrame(
          {"A": np.arange(10), "B": level_b, "C": list(range(10)), "D": list(range(10))}
      )
      df = flat.set_index(["A", "B"])

      result = df.groupby("B").apply(lambda chunk: chunk.sum())
      tm.assert_frame_equal(result, expected)
      assert df.index.names == ["A", "B"]
  698. def test_groupby_apply_datetime_result_dtypes():
  699. # GH 14849
  700. data = DataFrame.from_records(
  701. [
  702. (pd.Timestamp(2016, 1, 1), "red", "dark", 1, "8"),
  703. (pd.Timestamp(2015, 1, 1), "green", "stormy", 2, "9"),
  704. (pd.Timestamp(2014, 1, 1), "blue", "bright", 3, "10"),
  705. (pd.Timestamp(2013, 1, 1), "blue", "calm", 4, "potato"),
  706. ],
  707. columns=["observation", "color", "mood", "intensity", "score"],
  708. )
  709. result = data.groupby("color").apply(lambda g: g.iloc[0]).dtypes
  710. expected = Series(
  711. [np.dtype("datetime64[ns]"), object, object, np.int64, object],
  712. index=["observation", "color", "mood", "intensity", "score"],
  713. )
  714. tm.assert_series_equal(result, expected)
  715. @pytest.mark.parametrize(
  716. "index",
  717. [
  718. pd.CategoricalIndex(list("abc")),
  719. pd.interval_range(0, 3),
  720. pd.period_range("2020", periods=3, freq="D"),
  721. MultiIndex.from_tuples([("a", 0), ("a", 1), ("b", 0)]),
  722. ],
  723. )
  724. def test_apply_index_has_complex_internals(index):
  725. # GH 31248
  726. df = DataFrame({"group": [1, 1, 2], "value": [0, 1, 0]}, index=index)
  727. result = df.groupby("group", group_keys=False).apply(lambda x: x)
  728. tm.assert_frame_equal(result, df)
  729. @pytest.mark.parametrize(
  730. "function, expected_values",
  731. [
  732. (lambda x: x.index.to_list(), [[0, 1], [2, 3]]),
  733. (lambda x: set(x.index.to_list()), [{0, 1}, {2, 3}]),
  734. (lambda x: tuple(x.index.to_list()), [(0, 1), (2, 3)]),
  735. (
  736. lambda x: dict(enumerate(x.index.to_list())),
  737. [{0: 0, 1: 1}, {0: 2, 1: 3}],
  738. ),
  739. (
  740. lambda x: [{n: i} for (n, i) in enumerate(x.index.to_list())],
  741. [[{0: 0}, {1: 1}], [{0: 2}, {1: 3}]],
  742. ),
  743. ],
  744. )
  745. def test_apply_function_returns_non_pandas_non_scalar(function, expected_values):
  746. # GH 31441
  747. df = DataFrame(["A", "A", "B", "B"], columns=["groups"])
  748. result = df.groupby("groups").apply(function)
  749. expected = Series(expected_values, index=Index(["A", "B"], name="groups"))
  750. tm.assert_series_equal(result, expected)
  751. def test_apply_function_returns_numpy_array():
  752. # GH 31605
  753. def fct(group):
  754. return group["B"].values.flatten()
  755. df = DataFrame({"A": ["a", "a", "b", "none"], "B": [1, 2, 3, np.nan]})
  756. result = df.groupby("A").apply(fct)
  757. expected = Series(
  758. [[1.0, 2.0], [3.0], [np.nan]], index=Index(["a", "b", "none"], name="A")
  759. )
  760. tm.assert_series_equal(result, expected)
  761. @pytest.mark.parametrize("function", [lambda gr: gr.index, lambda gr: gr.index + 1 - 1])
  762. def test_apply_function_index_return(function):
  763. # GH: 22541
  764. df = DataFrame([1, 2, 2, 2, 1, 2, 3, 1, 3, 1], columns=["id"])
  765. result = df.groupby("id").apply(function)
  766. expected = Series(
  767. [Index([0, 4, 7, 9]), Index([1, 2, 3, 5]), Index([6, 8])],
  768. index=Index([1, 2, 3], name="id"),
  769. )
  770. tm.assert_series_equal(result, expected)
  771. def test_apply_function_with_indexing_return_column():
  772. # GH#7002, GH#41480, GH#49256
  773. df = DataFrame(
  774. {
  775. "foo1": ["one", "two", "two", "three", "one", "two"],
  776. "foo2": [1, 2, 4, 4, 5, 6],
  777. }
  778. )
  779. result = df.groupby("foo1", as_index=False).apply(lambda x: x.mean())
  780. expected = DataFrame(
  781. {
  782. "foo1": ["one", "three", "two"],
  783. "foo2": [3.0, 4.0, 4.0],
  784. }
  785. )
  786. tm.assert_frame_equal(result, expected)
  787. @pytest.mark.parametrize(
  788. "udf",
  789. [(lambda x: x.copy()), (lambda x: x.copy().rename(lambda y: y + 1))],
  790. )
  791. @pytest.mark.parametrize("group_keys", [True, False])
  792. def test_apply_result_type(group_keys, udf):
  793. # https://github.com/pandas-dev/pandas/issues/34809
  794. # We'd like to control whether the group keys end up in the index
  795. # regardless of whether the UDF happens to be a transform.
  796. df = DataFrame({"A": ["a", "b"], "B": [1, 2]})
  797. df_result = df.groupby("A", group_keys=group_keys).apply(udf)
  798. series_result = df.B.groupby(df.A, group_keys=group_keys).apply(udf)
  799. if group_keys:
  800. assert df_result.index.nlevels == 2
  801. assert series_result.index.nlevels == 2
  802. else:
  803. assert df_result.index.nlevels == 1
  804. assert series_result.index.nlevels == 1
  805. def test_result_order_group_keys_false():
  806. # GH 34998
  807. # apply result order should not depend on whether index is the same or just equal
  808. df = DataFrame({"A": [2, 1, 2], "B": [1, 2, 3]})
  809. result = df.groupby("A", group_keys=False).apply(lambda x: x)
  810. expected = df.groupby("A", group_keys=False).apply(lambda x: x.copy())
  811. tm.assert_frame_equal(result, expected)
  812. def test_apply_with_timezones_aware():
  813. # GH: 27212
  814. dates = ["2001-01-01"] * 2 + ["2001-01-02"] * 2 + ["2001-01-03"] * 2
  815. index_no_tz = pd.DatetimeIndex(dates)
  816. index_tz = pd.DatetimeIndex(dates, tz="UTC")
  817. df1 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_no_tz})
  818. df2 = DataFrame({"x": list(range(2)) * 3, "y": range(6), "t": index_tz})
  819. result1 = df1.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy())
  820. result2 = df2.groupby("x", group_keys=False).apply(lambda df: df[["x", "y"]].copy())
  821. tm.assert_frame_equal(result1, result2)
  822. def test_apply_is_unchanged_when_other_methods_are_called_first(reduction_func):
  823. # GH #34656
  824. # GH #34271
  825. df = DataFrame(
  826. {
  827. "a": [99, 99, 99, 88, 88, 88],
  828. "b": [1, 2, 3, 4, 5, 6],
  829. "c": [10, 20, 30, 40, 50, 60],
  830. }
  831. )
  832. expected = DataFrame(
  833. {"a": [264, 297], "b": [15, 6], "c": [150, 60]},
  834. index=Index([88, 99], name="a"),
  835. )
  836. # Check output when no other methods are called before .apply()
  837. grp = df.groupby(by="a")
  838. result = grp.apply(sum)
  839. tm.assert_frame_equal(result, expected)
  840. # Check output when another method is called before .apply()
  841. grp = df.groupby(by="a")
  842. args = get_groupby_method_args(reduction_func, df)
  843. _ = getattr(grp, reduction_func)(*args)
  844. result = grp.apply(sum)
  845. tm.assert_frame_equal(result, expected)
  846. def test_apply_with_date_in_multiindex_does_not_convert_to_timestamp():
  847. # GH 29617
  848. df = DataFrame(
  849. {
  850. "A": ["a", "a", "a", "b"],
  851. "B": [
  852. date(2020, 1, 10),
  853. date(2020, 1, 10),
  854. date(2020, 2, 10),
  855. date(2020, 2, 10),
  856. ],
  857. "C": [1, 2, 3, 4],
  858. },
  859. index=Index([100, 101, 102, 103], name="idx"),
  860. )
  861. grp = df.groupby(["A", "B"])
  862. result = grp.apply(lambda x: x.head(1))
  863. expected = df.iloc[[0, 2, 3]]
  864. expected = expected.reset_index()
  865. expected.index = MultiIndex.from_frame(expected[["A", "B", "idx"]])
  866. expected = expected.drop(columns="idx")
  867. tm.assert_frame_equal(result, expected)
  868. for val in result.index.levels[1]:
  869. assert type(val) is date
  870. def test_apply_by_cols_equals_apply_by_rows_transposed():
  871. # GH 16646
  872. # Operating on the columns, or transposing and operating on the rows
  873. # should give the same result. There was previously a bug where the
  874. # by_rows operation would work fine, but by_cols would throw a ValueError
  875. df = DataFrame(
  876. np.random.random([6, 4]),
  877. columns=MultiIndex.from_product([["A", "B"], [1, 2]]),
  878. )
  879. by_rows = df.T.groupby(axis=0, level=0).apply(
  880. lambda x: x.droplevel(axis=0, level=0)
  881. )
  882. by_cols = df.groupby(axis=1, level=0).apply(lambda x: x.droplevel(axis=1, level=0))
  883. tm.assert_frame_equal(by_cols, by_rows.T)
  884. tm.assert_frame_equal(by_cols, df)
  885. @pytest.mark.parametrize("dropna", [True, False])
  886. def test_apply_dropna_with_indexed_same(dropna):
  887. # GH 38227
  888. # GH#43205
  889. df = DataFrame(
  890. {
  891. "col": [1, 2, 3, 4, 5],
  892. "group": ["a", np.nan, np.nan, "b", "b"],
  893. },
  894. index=list("xxyxz"),
  895. )
  896. result = df.groupby("group", dropna=dropna, group_keys=False).apply(lambda x: x)
  897. expected = df.dropna() if dropna else df.iloc[[0, 3, 1, 2, 4]]
  898. tm.assert_frame_equal(result, expected)
  899. @pytest.mark.parametrize(
  900. "as_index, expected",
  901. [
  902. [
  903. False,
  904. DataFrame(
  905. [[1, 1, 1], [2, 2, 1]], columns=Index(["a", "b", None], dtype=object)
  906. ),
  907. ],
  908. [
  909. True,
  910. Series(
  911. [1, 1], index=MultiIndex.from_tuples([(1, 1), (2, 2)], names=["a", "b"])
  912. ),
  913. ],
  914. ],
  915. )
  916. def test_apply_as_index_constant_lambda(as_index, expected):
  917. # GH 13217
  918. df = DataFrame({"a": [1, 1, 2, 2], "b": [1, 1, 2, 2], "c": [1, 1, 1, 1]})
  919. result = df.groupby(["a", "b"], as_index=as_index).apply(lambda x: 1)
  920. tm.assert_equal(result, expected)
  921. def test_sort_index_groups():
  922. # GH 20420
  923. df = DataFrame(
  924. {"A": [1, 2, 3, 4, 5], "B": [6, 7, 8, 9, 0], "C": [1, 1, 1, 2, 2]},
  925. index=range(5),
  926. )
  927. result = df.groupby("C").apply(lambda x: x.A.sort_index())
  928. expected = Series(
  929. range(1, 6),
  930. index=MultiIndex.from_tuples(
  931. [(1, 0), (1, 1), (1, 2), (2, 3), (2, 4)], names=["C", None]
  932. ),
  933. name="A",
  934. )
  935. tm.assert_series_equal(result, expected)
  936. def test_positional_slice_groups_datetimelike():
  937. # GH 21651
  938. expected = DataFrame(
  939. {
  940. "date": pd.date_range("2010-01-01", freq="12H", periods=5),
  941. "vals": range(5),
  942. "let": list("abcde"),
  943. }
  944. )
  945. result = expected.groupby(
  946. [expected.let, expected.date.dt.date], group_keys=False
  947. ).apply(lambda x: x.iloc[0:])
  948. tm.assert_frame_equal(result, expected)
  949. def test_groupby_apply_shape_cache_safety():
  950. # GH#42702 this fails if we cache_readonly Block.shape
  951. df = DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3], "C": [4, 6, 5]})
  952. gb = df.groupby("A")
  953. result = gb[["B", "C"]].apply(lambda x: x.astype(float).max() - x.min())
  954. expected = DataFrame(
  955. {"B": [1.0, 0.0], "C": [2.0, 0.0]}, index=Index(["a", "b"], name="A")
  956. )
  957. tm.assert_frame_equal(result, expected)
  958. @pytest.mark.parametrize("dropna", [True, False])
  959. def test_apply_na(dropna):
  960. # GH#28984
  961. df = DataFrame(
  962. {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]}
  963. )
  964. dfgrp = df.groupby("grp", dropna=dropna)
  965. result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z"))
  966. expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1))
  967. tm.assert_frame_equal(result, expected)
  968. def test_apply_empty_string_nan_coerce_bug():
  969. # GH#24903
  970. result = (
  971. DataFrame(
  972. {
  973. "a": [1, 1, 2, 2],
  974. "b": ["", "", "", ""],
  975. "c": pd.to_datetime([1, 2, 3, 4], unit="s"),
  976. }
  977. )
  978. .groupby(["a", "b"])
  979. .apply(lambda df: df.iloc[-1])
  980. )
  981. expected = DataFrame(
  982. [[1, "", pd.to_datetime(2, unit="s")], [2, "", pd.to_datetime(4, unit="s")]],
  983. columns=["a", "b", "c"],
  984. index=MultiIndex.from_tuples([(1, ""), (2, "")], names=["a", "b"]),
  985. )
  986. tm.assert_frame_equal(result, expected)
  987. @pytest.mark.parametrize("index_values", [[1, 2, 3], [1.0, 2.0, 3.0]])
  988. def test_apply_index_key_error_bug(index_values):
  989. # GH 44310
  990. result = DataFrame(
  991. {
  992. "a": ["aa", "a2", "a3"],
  993. "b": [1, 2, 3],
  994. },
  995. index=Index(index_values),
  996. )
  997. expected = DataFrame(
  998. {
  999. "b_mean": [2.0, 3.0, 1.0],
  1000. },
  1001. index=Index(["a2", "a3", "aa"], name="a"),
  1002. )
  1003. result = result.groupby("a").apply(
  1004. lambda df: Series([df["b"].mean()], index=["b_mean"])
  1005. )
  1006. tm.assert_frame_equal(result, expected)
  1007. @pytest.mark.parametrize(
  1008. "arg,idx",
  1009. [
  1010. [
  1011. [
  1012. 1,
  1013. 2,
  1014. 3,
  1015. ],
  1016. [
  1017. 0.1,
  1018. 0.3,
  1019. 0.2,
  1020. ],
  1021. ],
  1022. [
  1023. [
  1024. 1,
  1025. 2,
  1026. 3,
  1027. ],
  1028. [
  1029. 0.1,
  1030. 0.2,
  1031. 0.3,
  1032. ],
  1033. ],
  1034. [
  1035. [
  1036. 1,
  1037. 4,
  1038. 3,
  1039. ],
  1040. [
  1041. 0.1,
  1042. 0.4,
  1043. 0.2,
  1044. ],
  1045. ],
  1046. ],
  1047. )
  1048. def test_apply_nonmonotonic_float_index(arg, idx):
  1049. # GH 34455
  1050. expected = DataFrame({"col": arg}, index=idx)
  1051. result = expected.groupby("col", group_keys=False).apply(lambda x: x)
  1052. tm.assert_frame_equal(result, expected)
  1053. @pytest.mark.parametrize("args, kwargs", [([True], {}), ([], {"numeric_only": True})])
  1054. def test_apply_str_with_args(df, args, kwargs):
  1055. # GH#46479
  1056. gb = df.groupby("A")
  1057. result = gb.apply("sum", *args, **kwargs)
  1058. expected = gb.sum(numeric_only=True)
  1059. tm.assert_frame_equal(result, expected)
  1060. @pytest.mark.parametrize("name", ["some_name", None])
  1061. def test_result_name_when_one_group(name):
  1062. # GH 46369
  1063. ser = Series([1, 2], name=name)
  1064. result = ser.groupby(["a", "a"], group_keys=False).apply(lambda x: x)
  1065. expected = Series([1, 2], name=name)
  1066. tm.assert_series_equal(result, expected)
  1067. @pytest.mark.parametrize(
  1068. "method, op",
  1069. [
  1070. ("apply", lambda gb: gb.values[-1]),
  1071. ("apply", lambda gb: gb["b"].iloc[0]),
  1072. ("agg", "skew"),
  1073. ("agg", "prod"),
  1074. ("agg", "sum"),
  1075. ],
  1076. )
  1077. def test_empty_df(method, op):
  1078. # GH 47985
  1079. empty_df = DataFrame({"a": [], "b": []})
  1080. gb = empty_df.groupby("a", group_keys=True)
  1081. group = getattr(gb, "b")
  1082. result = getattr(group, method)(op)
  1083. expected = Series(
  1084. [], name="b", dtype="float64", index=Index([], dtype="float64", name="a")
  1085. )
  1086. tm.assert_series_equal(result, expected)