test_function.py 53 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637
  1. import builtins
  2. from io import StringIO
  3. import numpy as np
  4. import pytest
  5. from pandas._libs import lib
  6. from pandas.errors import UnsupportedFunctionCall
  7. import pandas as pd
  8. from pandas import (
  9. DataFrame,
  10. Index,
  11. MultiIndex,
  12. Series,
  13. Timestamp,
  14. date_range,
  15. )
  16. import pandas._testing as tm
  17. from pandas.core import nanops
  18. from pandas.tests.groupby import get_groupby_method_args
  19. from pandas.util import _test_decorators as td
  20. @pytest.fixture(
  21. params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"],
  22. ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"],
  23. )
  24. def dtypes_for_minmax(request):
  25. """
  26. Fixture of dtypes with min and max values used for testing
  27. cummin and cummax
  28. """
  29. dtype = request.param
  30. np_type = dtype
  31. if dtype == "Int64":
  32. np_type = np.int64
  33. elif dtype == "Float64":
  34. np_type = np.float64
  35. min_val = (
  36. np.iinfo(np_type).min
  37. if np.dtype(np_type).kind == "i"
  38. else np.finfo(np_type).min
  39. )
  40. max_val = (
  41. np.iinfo(np_type).max
  42. if np.dtype(np_type).kind == "i"
  43. else np.finfo(np_type).max
  44. )
  45. return (dtype, min_val, max_val)
  46. def test_intercept_builtin_sum():
  47. s = Series([1.0, 2.0, np.nan, 3.0])
  48. grouped = s.groupby([0, 1, 2, 2])
  49. result = grouped.agg(builtins.sum)
  50. result2 = grouped.apply(builtins.sum)
  51. expected = grouped.sum()
  52. tm.assert_series_equal(result, expected)
  53. tm.assert_series_equal(result2, expected)
  54. @pytest.mark.parametrize("f", [max, min, sum])
  55. @pytest.mark.parametrize("keys", ["jim", ["jim", "joe"]]) # Single key # Multi-key
  56. def test_builtins_apply(keys, f):
  57. # see gh-8155
  58. df = DataFrame(np.random.randint(1, 50, (1000, 2)), columns=["jim", "joe"])
  59. df["jolie"] = np.random.randn(1000)
  60. gb = df.groupby(keys)
  61. fname = f.__name__
  62. result = gb.apply(f)
  63. ngroups = len(df.drop_duplicates(subset=keys))
  64. assert_msg = f"invalid frame shape: {result.shape} (expected ({ngroups}, 3))"
  65. assert result.shape == (ngroups, 3), assert_msg
  66. npfunc = lambda x: getattr(np, fname)(x, axis=0) # numpy's equivalent function
  67. expected = gb.apply(npfunc)
  68. tm.assert_frame_equal(result, expected)
  69. with tm.assert_produces_warning(None):
  70. expected2 = gb.apply(lambda x: npfunc(x))
  71. tm.assert_frame_equal(result, expected2)
  72. if f != sum:
  73. expected = gb.agg(fname).reset_index()
  74. expected.set_index(keys, inplace=True, drop=False)
  75. tm.assert_frame_equal(result, expected, check_dtype=False)
  76. tm.assert_series_equal(getattr(result, fname)(axis=0), getattr(df, fname)(axis=0))
  77. class TestNumericOnly:
  78. # make sure that we are passing thru kwargs to our agg functions
  79. @pytest.fixture
  80. def df(self):
  81. # GH3668
  82. # GH5724
  83. df = DataFrame(
  84. {
  85. "group": [1, 1, 2],
  86. "int": [1, 2, 3],
  87. "float": [4.0, 5.0, 6.0],
  88. "string": list("abc"),
  89. "category_string": Series(list("abc")).astype("category"),
  90. "category_int": [7, 8, 9],
  91. "datetime": date_range("20130101", periods=3),
  92. "datetimetz": date_range("20130101", periods=3, tz="US/Eastern"),
  93. "timedelta": pd.timedelta_range("1 s", periods=3, freq="s"),
  94. },
  95. columns=[
  96. "group",
  97. "int",
  98. "float",
  99. "string",
  100. "category_string",
  101. "category_int",
  102. "datetime",
  103. "datetimetz",
  104. "timedelta",
  105. ],
  106. )
  107. return df
  108. @pytest.mark.parametrize("method", ["mean", "median"])
  109. def test_averages(self, df, method):
  110. # mean / median
  111. expected_columns_numeric = Index(["int", "float", "category_int"])
  112. gb = df.groupby("group")
  113. expected = DataFrame(
  114. {
  115. "category_int": [7.5, 9],
  116. "float": [4.5, 6.0],
  117. "timedelta": [pd.Timedelta("1.5s"), pd.Timedelta("3s")],
  118. "int": [1.5, 3],
  119. "datetime": [
  120. Timestamp("2013-01-01 12:00:00"),
  121. Timestamp("2013-01-03 00:00:00"),
  122. ],
  123. "datetimetz": [
  124. Timestamp("2013-01-01 12:00:00", tz="US/Eastern"),
  125. Timestamp("2013-01-03 00:00:00", tz="US/Eastern"),
  126. ],
  127. },
  128. index=Index([1, 2], name="group"),
  129. columns=[
  130. "int",
  131. "float",
  132. "category_int",
  133. ],
  134. )
  135. result = getattr(gb, method)(numeric_only=True)
  136. tm.assert_frame_equal(result.reindex_like(expected), expected)
  137. expected_columns = expected.columns
  138. self._check(df, method, expected_columns, expected_columns_numeric)
  139. @pytest.mark.parametrize("method", ["min", "max"])
  140. def test_extrema(self, df, method):
  141. # TODO: min, max *should* handle
  142. # categorical (ordered) dtype
  143. expected_columns = Index(
  144. [
  145. "int",
  146. "float",
  147. "string",
  148. "category_int",
  149. "datetime",
  150. "datetimetz",
  151. "timedelta",
  152. ]
  153. )
  154. expected_columns_numeric = expected_columns
  155. self._check(df, method, expected_columns, expected_columns_numeric)
  156. @pytest.mark.parametrize("method", ["first", "last"])
  157. def test_first_last(self, df, method):
  158. expected_columns = Index(
  159. [
  160. "int",
  161. "float",
  162. "string",
  163. "category_string",
  164. "category_int",
  165. "datetime",
  166. "datetimetz",
  167. "timedelta",
  168. ]
  169. )
  170. expected_columns_numeric = expected_columns
  171. self._check(df, method, expected_columns, expected_columns_numeric)
  172. @pytest.mark.parametrize("method", ["sum", "cumsum"])
  173. def test_sum_cumsum(self, df, method):
  174. expected_columns_numeric = Index(["int", "float", "category_int"])
  175. expected_columns = Index(
  176. ["int", "float", "string", "category_int", "timedelta"]
  177. )
  178. if method == "cumsum":
  179. # cumsum loses string
  180. expected_columns = Index(["int", "float", "category_int", "timedelta"])
  181. self._check(df, method, expected_columns, expected_columns_numeric)
  182. @pytest.mark.parametrize("method", ["prod", "cumprod"])
  183. def test_prod_cumprod(self, df, method):
  184. expected_columns = Index(["int", "float", "category_int"])
  185. expected_columns_numeric = expected_columns
  186. self._check(df, method, expected_columns, expected_columns_numeric)
  187. @pytest.mark.parametrize("method", ["cummin", "cummax"])
  188. def test_cummin_cummax(self, df, method):
  189. # like min, max, but don't include strings
  190. expected_columns = Index(
  191. ["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
  192. )
  193. # GH#15561: numeric_only=False set by default like min/max
  194. expected_columns_numeric = expected_columns
  195. self._check(df, method, expected_columns, expected_columns_numeric)
  196. def _check(self, df, method, expected_columns, expected_columns_numeric):
  197. gb = df.groupby("group")
  198. # object dtypes for transformations are not implemented in Cython and
  199. # have no Python fallback
  200. exception = NotImplementedError if method.startswith("cum") else TypeError
  201. if method in ("min", "max", "cummin", "cummax", "cumsum", "cumprod"):
  202. # The methods default to numeric_only=False and raise TypeError
  203. msg = "|".join(
  204. [
  205. "Categorical is not ordered",
  206. "function is not implemented for this dtype",
  207. f"Cannot perform {method} with non-ordered Categorical",
  208. ]
  209. )
  210. with pytest.raises(exception, match=msg):
  211. getattr(gb, method)()
  212. elif method in ("sum", "mean", "median", "prod"):
  213. msg = "|".join(
  214. [
  215. "category type does not support sum operations",
  216. "[Cc]ould not convert",
  217. "can't multiply sequence by non-int of type 'str'",
  218. ]
  219. )
  220. with pytest.raises(exception, match=msg):
  221. getattr(gb, method)()
  222. else:
  223. result = getattr(gb, method)()
  224. tm.assert_index_equal(result.columns, expected_columns_numeric)
  225. if method not in ("first", "last"):
  226. msg = "|".join(
  227. [
  228. "[Cc]ould not convert",
  229. "Categorical is not ordered",
  230. "category type does not support",
  231. "can't multiply sequence",
  232. "function is not implemented for this dtype",
  233. f"Cannot perform {method} with non-ordered Categorical",
  234. ]
  235. )
  236. with pytest.raises(exception, match=msg):
  237. getattr(gb, method)(numeric_only=False)
  238. else:
  239. result = getattr(gb, method)(numeric_only=False)
  240. tm.assert_index_equal(result.columns, expected_columns)
  241. class TestGroupByNonCythonPaths:
  242. # GH#5610 non-cython calls should not include the grouper
  243. # Tests for code not expected to go through cython paths.
  244. @pytest.fixture
  245. def df(self):
  246. df = DataFrame(
  247. [[1, 2, "foo"], [1, np.nan, "bar"], [3, np.nan, "baz"]],
  248. columns=["A", "B", "C"],
  249. )
  250. return df
  251. @pytest.fixture
  252. def gb(self, df):
  253. gb = df.groupby("A")
  254. return gb
  255. @pytest.fixture
  256. def gni(self, df):
  257. gni = df.groupby("A", as_index=False)
  258. return gni
  259. def test_describe(self, df, gb, gni):
  260. # describe
  261. expected_index = Index([1, 3], name="A")
  262. expected_col = MultiIndex(
  263. levels=[["B"], ["count", "mean", "std", "min", "25%", "50%", "75%", "max"]],
  264. codes=[[0] * 8, list(range(8))],
  265. )
  266. expected = DataFrame(
  267. [
  268. [1.0, 2.0, np.nan, 2.0, 2.0, 2.0, 2.0, 2.0],
  269. [0.0, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
  270. ],
  271. index=expected_index,
  272. columns=expected_col,
  273. )
  274. result = gb.describe()
  275. tm.assert_frame_equal(result, expected)
  276. expected = expected.reset_index()
  277. result = gni.describe()
  278. tm.assert_frame_equal(result, expected)
  279. def test_cython_api2():
  280. # this takes the fast apply path
  281. # cumsum (GH5614)
  282. df = DataFrame([[1, 2, np.nan], [1, np.nan, 9], [3, 4, 9]], columns=["A", "B", "C"])
  283. expected = DataFrame([[2, np.nan], [np.nan, 9], [4, 9]], columns=["B", "C"])
  284. result = df.groupby("A").cumsum()
  285. tm.assert_frame_equal(result, expected)
  286. # GH 5755 - cumsum is a transformer and should ignore as_index
  287. result = df.groupby("A", as_index=False).cumsum()
  288. tm.assert_frame_equal(result, expected)
  289. # GH 13994
  290. result = df.groupby("A").cumsum(axis=1)
  291. expected = df.cumsum(axis=1)
  292. tm.assert_frame_equal(result, expected)
  293. result = df.groupby("A").cumprod(axis=1)
  294. expected = df.cumprod(axis=1)
  295. tm.assert_frame_equal(result, expected)
  296. def test_cython_median():
  297. arr = np.random.randn(1000)
  298. arr[::2] = np.nan
  299. df = DataFrame(arr)
  300. labels = np.random.randint(0, 50, size=1000).astype(float)
  301. labels[::17] = np.nan
  302. result = df.groupby(labels).median()
  303. exp = df.groupby(labels).agg(nanops.nanmedian)
  304. tm.assert_frame_equal(result, exp)
  305. df = DataFrame(np.random.randn(1000, 5))
  306. rs = df.groupby(labels).agg(np.median)
  307. xp = df.groupby(labels).median()
  308. tm.assert_frame_equal(rs, xp)
  309. def test_median_empty_bins(observed):
  310. df = DataFrame(np.random.randint(0, 44, 500))
  311. grps = range(0, 55, 5)
  312. bins = pd.cut(df[0], grps)
  313. result = df.groupby(bins, observed=observed).median()
  314. expected = df.groupby(bins, observed=observed).agg(lambda x: x.median())
  315. tm.assert_frame_equal(result, expected)
  316. @pytest.mark.parametrize(
  317. "dtype", ["int8", "int16", "int32", "int64", "float32", "float64", "uint64"]
  318. )
  319. @pytest.mark.parametrize(
  320. "method,data",
  321. [
  322. ("first", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
  323. ("last", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
  324. ("min", {"df": [{"a": 1, "b": 1}, {"a": 2, "b": 3}]}),
  325. ("max", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 4}]}),
  326. ("count", {"df": [{"a": 1, "b": 2}, {"a": 2, "b": 2}], "out_type": "int64"}),
  327. ],
  328. )
  329. def test_groupby_non_arithmetic_agg_types(dtype, method, data):
  330. # GH9311, GH6620
  331. df = DataFrame(
  332. [{"a": 1, "b": 1}, {"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 2, "b": 4}]
  333. )
  334. df["b"] = df.b.astype(dtype)
  335. if "args" not in data:
  336. data["args"] = []
  337. if "out_type" in data:
  338. out_type = data["out_type"]
  339. else:
  340. out_type = dtype
  341. exp = data["df"]
  342. df_out = DataFrame(exp)
  343. df_out["b"] = df_out.b.astype(out_type)
  344. df_out.set_index("a", inplace=True)
  345. grpd = df.groupby("a")
  346. t = getattr(grpd, method)(*data["args"])
  347. tm.assert_frame_equal(t, df_out)
  348. @pytest.mark.parametrize(
  349. "i",
  350. [
  351. (
  352. Timestamp("2011-01-15 12:50:28.502376"),
  353. Timestamp("2011-01-20 12:50:28.593448"),
  354. ),
  355. (24650000000000001, 24650000000000002),
  356. ],
  357. )
  358. def test_groupby_non_arithmetic_agg_int_like_precision(i):
  359. # see gh-6620, gh-9311
  360. df = DataFrame([{"a": 1, "b": i[0]}, {"a": 1, "b": i[1]}])
  361. grp_exp = {
  362. "first": {"expected": i[0]},
  363. "last": {"expected": i[1]},
  364. "min": {"expected": i[0]},
  365. "max": {"expected": i[1]},
  366. "nth": {"expected": i[1], "args": [1]},
  367. "count": {"expected": 2},
  368. }
  369. for method, data in grp_exp.items():
  370. if "args" not in data:
  371. data["args"] = []
  372. grouped = df.groupby("a")
  373. res = getattr(grouped, method)(*data["args"])
  374. assert res.iloc[0].b == data["expected"]
  375. @pytest.mark.parametrize(
  376. "func, values",
  377. [
  378. ("idxmin", {"c_int": [0, 2], "c_float": [1, 3], "c_date": [1, 2]}),
  379. ("idxmax", {"c_int": [1, 3], "c_float": [0, 2], "c_date": [0, 3]}),
  380. ],
  381. )
  382. @pytest.mark.parametrize("numeric_only", [True, False])
  383. def test_idxmin_idxmax_returns_int_types(func, values, numeric_only):
  384. # GH 25444
  385. df = DataFrame(
  386. {
  387. "name": ["A", "A", "B", "B"],
  388. "c_int": [1, 2, 3, 4],
  389. "c_float": [4.02, 3.03, 2.04, 1.05],
  390. "c_date": ["2019", "2018", "2016", "2017"],
  391. }
  392. )
  393. df["c_date"] = pd.to_datetime(df["c_date"])
  394. df["c_date_tz"] = df["c_date"].dt.tz_localize("US/Pacific")
  395. df["c_timedelta"] = df["c_date"] - df["c_date"].iloc[0]
  396. df["c_period"] = df["c_date"].dt.to_period("W")
  397. df["c_Integer"] = df["c_int"].astype("Int64")
  398. df["c_Floating"] = df["c_float"].astype("Float64")
  399. result = getattr(df.groupby("name"), func)(numeric_only=numeric_only)
  400. expected = DataFrame(values, index=Index(["A", "B"], name="name"))
  401. if numeric_only:
  402. expected = expected.drop(columns=["c_date"])
  403. else:
  404. expected["c_date_tz"] = expected["c_date"]
  405. expected["c_timedelta"] = expected["c_date"]
  406. expected["c_period"] = expected["c_date"]
  407. expected["c_Integer"] = expected["c_int"]
  408. expected["c_Floating"] = expected["c_float"]
  409. tm.assert_frame_equal(result, expected)
  410. def test_idxmin_idxmax_axis1():
  411. df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
  412. df["A"] = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]
  413. gb = df.groupby("A")
  414. res = gb.idxmax(axis=1)
  415. alt = df.iloc[:, 1:].idxmax(axis=1)
  416. indexer = res.index.get_level_values(1)
  417. tm.assert_series_equal(alt[indexer], res.droplevel("A"))
  418. df["E"] = date_range("2016-01-01", periods=10)
  419. gb2 = df.groupby("A")
  420. msg = "reduction operation 'argmax' not allowed for this dtype"
  421. with pytest.raises(TypeError, match=msg):
  422. gb2.idxmax(axis=1)
  423. @pytest.mark.parametrize("numeric_only", [True, False, None])
  424. def test_axis1_numeric_only(request, groupby_func, numeric_only):
  425. if groupby_func in ("idxmax", "idxmin"):
  426. pytest.skip("idxmax and idx_min tested in test_idxmin_idxmax_axis1")
  427. if groupby_func in ("corrwith", "skew"):
  428. msg = "GH#47723 groupby.corrwith and skew do not correctly implement axis=1"
  429. request.node.add_marker(pytest.mark.xfail(reason=msg))
  430. df = DataFrame(np.random.randn(10, 4), columns=["A", "B", "C", "D"])
  431. df["E"] = "x"
  432. groups = [1, 2, 3, 1, 2, 3, 1, 2, 3, 4]
  433. gb = df.groupby(groups)
  434. method = getattr(gb, groupby_func)
  435. args = get_groupby_method_args(groupby_func, df)
  436. kwargs = {"axis": 1}
  437. if numeric_only is not None:
  438. # when numeric_only is None we don't pass any argument
  439. kwargs["numeric_only"] = numeric_only
  440. # Functions without numeric_only and axis args
  441. no_args = ("cumprod", "cumsum", "diff", "fillna", "pct_change", "rank", "shift")
  442. # Functions with axis args
  443. has_axis = (
  444. "cumprod",
  445. "cumsum",
  446. "diff",
  447. "pct_change",
  448. "rank",
  449. "shift",
  450. "cummax",
  451. "cummin",
  452. "idxmin",
  453. "idxmax",
  454. "fillna",
  455. )
  456. if numeric_only is not None and groupby_func in no_args:
  457. msg = "got an unexpected keyword argument 'numeric_only'"
  458. with pytest.raises(TypeError, match=msg):
  459. method(*args, **kwargs)
  460. elif groupby_func not in has_axis:
  461. msg = "got an unexpected keyword argument 'axis'"
  462. with pytest.raises(TypeError, match=msg):
  463. method(*args, **kwargs)
  464. # fillna and shift are successful even on object dtypes
  465. elif (numeric_only is None or not numeric_only) and groupby_func not in (
  466. "fillna",
  467. "shift",
  468. ):
  469. msgs = (
  470. # cummax, cummin, rank
  471. "not supported between instances of",
  472. # cumprod
  473. "can't multiply sequence by non-int of type 'float'",
  474. # cumsum, diff, pct_change
  475. "unsupported operand type",
  476. )
  477. with pytest.raises(TypeError, match=f"({'|'.join(msgs)})"):
  478. method(*args, **kwargs)
  479. else:
  480. result = method(*args, **kwargs)
  481. df_expected = df.drop(columns="E").T if numeric_only else df.T
  482. expected = getattr(df_expected, groupby_func)(*args).T
  483. if groupby_func == "shift" and not numeric_only:
  484. # shift with axis=1 leaves the leftmost column as numeric
  485. # but transposing for expected gives us object dtype
  486. expected = expected.astype(float)
  487. tm.assert_equal(result, expected)
  488. def test_groupby_cumprod():
  489. # GH 4095
  490. df = DataFrame({"key": ["b"] * 10, "value": 2})
  491. actual = df.groupby("key")["value"].cumprod()
  492. expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
  493. expected.name = "value"
  494. tm.assert_series_equal(actual, expected)
  495. df = DataFrame({"key": ["b"] * 100, "value": 2})
  496. df["value"] = df["value"].astype(float)
  497. actual = df.groupby("key")["value"].cumprod()
  498. expected = df.groupby("key", group_keys=False)["value"].apply(lambda x: x.cumprod())
  499. expected.name = "value"
  500. tm.assert_series_equal(actual, expected)
  501. def test_groupby_cumprod_overflow():
  502. # GH#37493 if we overflow we return garbage consistent with numpy
  503. df = DataFrame({"key": ["b"] * 4, "value": 100_000})
  504. actual = df.groupby("key")["value"].cumprod()
  505. expected = Series(
  506. [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920],
  507. name="value",
  508. )
  509. tm.assert_series_equal(actual, expected)
  510. numpy_result = df.groupby("key", group_keys=False)["value"].apply(
  511. lambda x: x.cumprod()
  512. )
  513. numpy_result.name = "value"
  514. tm.assert_series_equal(actual, numpy_result)
  515. def test_groupby_cumprod_nan_influences_other_columns():
  516. # GH#48064
  517. df = DataFrame(
  518. {
  519. "a": 1,
  520. "b": [1, np.nan, 2],
  521. "c": [1, 2, 3.0],
  522. }
  523. )
  524. result = df.groupby("a").cumprod(numeric_only=True, skipna=False)
  525. expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]})
  526. tm.assert_frame_equal(result, expected)
  527. def scipy_sem(*args, **kwargs):
  528. from scipy.stats import sem
  529. return sem(*args, ddof=1, **kwargs)
  530. @pytest.mark.parametrize(
  531. "op,targop",
  532. [
  533. ("mean", np.mean),
  534. ("median", np.median),
  535. ("std", np.std),
  536. ("var", np.var),
  537. ("sum", np.sum),
  538. ("prod", np.prod),
  539. ("min", np.min),
  540. ("max", np.max),
  541. ("first", lambda x: x.iloc[0]),
  542. ("last", lambda x: x.iloc[-1]),
  543. ("count", np.size),
  544. pytest.param("sem", scipy_sem, marks=td.skip_if_no_scipy),
  545. ],
  546. )
  547. def test_ops_general(op, targop):
  548. df = DataFrame(np.random.randn(1000))
  549. labels = np.random.randint(0, 50, size=1000).astype(float)
  550. result = getattr(df.groupby(labels), op)()
  551. expected = df.groupby(labels).agg(targop)
  552. tm.assert_frame_equal(result, expected)
  553. def test_max_nan_bug():
  554. raw = """,Date,app,File
  555. -04-23,2013-04-23 00:00:00,,log080001.log
  556. -05-06,2013-05-06 00:00:00,,log.log
  557. -05-07,2013-05-07 00:00:00,OE,xlsx"""
  558. with tm.assert_produces_warning(UserWarning, match="Could not infer format"):
  559. df = pd.read_csv(StringIO(raw), parse_dates=[0])
  560. gb = df.groupby("Date")
  561. r = gb[["File"]].max()
  562. e = gb["File"].max().to_frame()
  563. tm.assert_frame_equal(r, e)
  564. assert not r["File"].isna().any()
  565. def test_nlargest():
  566. a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
  567. b = Series(list("a" * 5 + "b" * 5))
  568. gb = a.groupby(b)
  569. r = gb.nlargest(3)
  570. e = Series(
  571. [7, 5, 3, 10, 9, 6],
  572. index=MultiIndex.from_arrays([list("aaabbb"), [3, 2, 1, 9, 5, 8]]),
  573. )
  574. tm.assert_series_equal(r, e)
  575. a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
  576. gb = a.groupby(b)
  577. e = Series(
  578. [3, 2, 1, 3, 3, 2],
  579. index=MultiIndex.from_arrays([list("aaabbb"), [2, 3, 1, 6, 5, 7]]),
  580. )
  581. tm.assert_series_equal(gb.nlargest(3, keep="last"), e)
  582. def test_nlargest_mi_grouper():
  583. # see gh-21411
  584. npr = np.random.RandomState(123456789)
  585. dts = date_range("20180101", periods=10)
  586. iterables = [dts, ["one", "two"]]
  587. idx = MultiIndex.from_product(iterables, names=["first", "second"])
  588. s = Series(npr.randn(20), index=idx)
  589. result = s.groupby("first").nlargest(1)
  590. exp_idx = MultiIndex.from_tuples(
  591. [
  592. (dts[0], dts[0], "one"),
  593. (dts[1], dts[1], "one"),
  594. (dts[2], dts[2], "one"),
  595. (dts[3], dts[3], "two"),
  596. (dts[4], dts[4], "one"),
  597. (dts[5], dts[5], "one"),
  598. (dts[6], dts[6], "one"),
  599. (dts[7], dts[7], "one"),
  600. (dts[8], dts[8], "two"),
  601. (dts[9], dts[9], "one"),
  602. ],
  603. names=["first", "first", "second"],
  604. )
  605. exp_values = [
  606. 2.2129019979039612,
  607. 1.8417114045748335,
  608. 0.858963679564603,
  609. 1.3759151378258088,
  610. 0.9430284594687134,
  611. 0.5296914208183142,
  612. 0.8318045593815487,
  613. -0.8476703342910327,
  614. 0.3804446884133735,
  615. -0.8028845810770998,
  616. ]
  617. expected = Series(exp_values, index=exp_idx)
  618. tm.assert_series_equal(result, expected, check_exact=False, rtol=1e-3)
  619. def test_nsmallest():
  620. a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
  621. b = Series(list("a" * 5 + "b" * 5))
  622. gb = a.groupby(b)
  623. r = gb.nsmallest(3)
  624. e = Series(
  625. [1, 2, 3, 0, 4, 6],
  626. index=MultiIndex.from_arrays([list("aaabbb"), [0, 4, 1, 6, 7, 8]]),
  627. )
  628. tm.assert_series_equal(r, e)
  629. a = Series([1, 1, 3, 2, 0, 3, 3, 2, 1, 0])
  630. gb = a.groupby(b)
  631. e = Series(
  632. [0, 1, 1, 0, 1, 2],
  633. index=MultiIndex.from_arrays([list("aaabbb"), [4, 1, 0, 9, 8, 7]]),
  634. )
  635. tm.assert_series_equal(gb.nsmallest(3, keep="last"), e)
  636. @pytest.mark.parametrize(
  637. "data, groups",
  638. [([0, 1, 2, 3], [0, 0, 1, 1]), ([0], [0])],
  639. )
  640. @pytest.mark.parametrize("dtype", [None, *tm.ALL_INT_NUMPY_DTYPES])
  641. @pytest.mark.parametrize("method", ["nlargest", "nsmallest"])
  642. def test_nlargest_and_smallest_noop(data, groups, dtype, method):
  643. # GH 15272, GH 16345, GH 29129
  644. # Test nlargest/smallest when it results in a noop,
  645. # i.e. input is sorted and group size <= n
  646. if dtype is not None:
  647. data = np.array(data, dtype=dtype)
  648. if method == "nlargest":
  649. data = list(reversed(data))
  650. ser = Series(data, name="a")
  651. result = getattr(ser.groupby(groups), method)(n=2)
  652. expidx = np.array(groups, dtype=np.int_) if isinstance(groups, list) else groups
  653. expected = Series(data, index=MultiIndex.from_arrays([expidx, ser.index]), name="a")
  654. tm.assert_series_equal(result, expected)
  655. @pytest.mark.parametrize("func", ["cumprod", "cumsum"])
  656. def test_numpy_compat(func):
  657. # see gh-12811
  658. df = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]})
  659. g = df.groupby("A")
  660. msg = "numpy operations are not valid with groupby"
  661. with pytest.raises(UnsupportedFunctionCall, match=msg):
  662. getattr(g, func)(1, 2, 3)
  663. with pytest.raises(UnsupportedFunctionCall, match=msg):
  664. getattr(g, func)(foo=1)
def test_cummin(dtypes_for_minmax):
    """Check DataFrameGroupBy.cummin: per-dtype, at the dtype's minimum, and with NaNs."""
    dtype = dtypes_for_minmax[0]
    min_val = dtypes_for_minmax[1]  # smallest representable value for this dtype

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]  # running minimum of B within each A-group

    df = base_df.astype(dtype)

    expected = DataFrame({"B": expected_mins}).astype(dtype)
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    # the apply-based (per-group Series) path must agree with the groupby kernel
    result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test w/ min value for dtype
    df.loc[[2, 6], "B"] = min_val
    df.loc[[1, 5], "B"] = min_val + 1
    expected.loc[[2, 3, 6, 7], "B"] = min_val
    expected.loc[[1, 5], "B"] = min_val + 1  # should not be rounded to min_val
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected, check_exact=True)
    expected = (
        df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    )
    tm.assert_frame_equal(result, expected, check_exact=True)

    # Test nan in some values
    # Explicit cast to float to avoid implicit cast when setting nan
    base_df = base_df.astype({"B": "float"})
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
    result = base_df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    expected = (
        base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummin()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # GH 15561: cummin works on datetime64 values
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")
    result = df.groupby("a")["b"].cummin()
    tm.assert_series_equal(expected, result)

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
    result = df.groupby("a").b.cummin()
    expected = Series([1, 2, 1], name="b")
    tm.assert_series_equal(result, expected)
  709. @pytest.mark.parametrize("method", ["cummin", "cummax"])
  710. @pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"])
  711. def test_cummin_max_all_nan_column(method, dtype):
  712. base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})
  713. base_df["B"] = base_df["B"].astype(dtype)
  714. grouped = base_df.groupby("A")
  715. expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype)
  716. result = getattr(grouped, method)()
  717. tm.assert_frame_equal(expected, result)
  718. result = getattr(grouped["B"], method)().to_frame()
  719. tm.assert_frame_equal(expected, result)
def test_cummax(dtypes_for_minmax):
    """Check DataFrameGroupBy.cummax: per-dtype, at the dtype's maximum, and with NaNs."""
    dtype = dtypes_for_minmax[0]
    max_val = dtypes_for_minmax[2]  # largest representable value for this dtype

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]  # running maximum of B within each A-group

    df = base_df.astype(dtype)

    expected = DataFrame({"B": expected_maxs}).astype(dtype)
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    # the apply-based (per-group Series) path must agree with the groupby kernel
    result = df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    tm.assert_frame_equal(result, expected)

    # Test w/ max value for dtype
    df.loc[[2, 6], "B"] = max_val
    expected.loc[[2, 3, 6, 7], "B"] = max_val
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    expected = (
        df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # Test nan in some values
    # Explicit cast to float to avoid implicit cast when setting nan
    base_df = base_df.astype({"B": "float"})
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
    result = base_df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    expected = (
        base_df.groupby("A", group_keys=False).B.apply(lambda x: x.cummax()).to_frame()
    )
    tm.assert_frame_equal(result, expected)

    # GH 15561: cummax works on datetime64 values
    df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")
    result = df.groupby("a")["b"].cummax()
    tm.assert_series_equal(expected, result)

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
    result = df.groupby("a").b.cummax()
    expected = Series([2, 1, 2], name="b")
    tm.assert_series_equal(result, expected)
  762. def test_cummax_i8_at_implementation_bound():
  763. # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT
  764. # for int64 dtype GH#46382
  765. ser = Series([pd.NaT._value + n for n in range(5)])
  766. df = DataFrame({"A": 1, "B": ser, "C": ser.view("M8[ns]")})
  767. gb = df.groupby("A")
  768. res = gb.cummax()
  769. exp = df[["B", "C"]]
  770. tm.assert_frame_equal(res, exp)
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
    "groups,expected_data",
    [
        ([1, 1, 1], [1, None, None]),
        ([1, 2, 3], [1, None, 2]),
        ([1, 3, 3], [1, None, None]),
    ],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
    # GH-34047: with skipna=False, a missing value poisons all later
    # positions of its own group only
    df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
    orig = df.copy()  # snapshot to verify the op does not mutate its input
    gb = df.groupby(groups)["a"]

    result = getattr(gb, method)(skipna=False)
    expected = Series(expected_data, dtype=dtype, name="a")

    # check we didn't accidentally alter df
    tm.assert_frame_equal(df, orig)

    tm.assert_series_equal(result, expected)
  791. @pytest.mark.parametrize("method", ["cummin", "cummax"])
  792. def test_cummin_max_skipna_multiple_cols(method):
  793. # Ensure missing value in "a" doesn't cause "b" to be nan-filled
  794. df = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
  795. gb = df.groupby([1, 1, 1])[["a", "b"]]
  796. result = getattr(gb, method)(skipna=False)
  797. expected = DataFrame({"a": [np.nan, np.nan, np.nan], "b": [2.0, 2.0, 2.0]})
  798. tm.assert_frame_equal(result, expected)
  799. @td.skip_if_32bit
  800. @pytest.mark.parametrize("method", ["cummin", "cummax"])
  801. @pytest.mark.parametrize(
  802. "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)]
  803. )
  804. def test_nullable_int_not_cast_as_float(method, dtype, val):
  805. data = [val, pd.NA]
  806. df = DataFrame({"grp": [1, 1], "b": data}, dtype=dtype)
  807. grouped = df.groupby("grp")
  808. result = grouped.transform(method)
  809. expected = DataFrame({"b": data}, dtype=dtype)
  810. tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "in_vals, out_vals",
    [
        # Basics: strictly increasing (T), strictly decreasing (F),
        # abs val increasing (F), non-strictly increasing (T)
        ([1, 2, 5, 3, 2, 0, 4, 5, -6, 1, 1], [True, False, False, True]),
        # Test with inf vals
        (
            [1, 2.1, np.inf, 3, 2, np.inf, -np.inf, 5, 11, 1, -np.inf],
            [True, False, True, False],
        ),
        # Test with nan vals; should always be False
        (
            [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
            [False, False, False, False],
        ),
    ],
)
def test_is_monotonic_increasing(in_vals, out_vals):
    # GH 17015: SeriesGroupBy.is_monotonic_increasing returns one flag per group
    source_dict = {
        "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
        "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
        "C": in_vals,
    }
    df = DataFrame(source_dict)
    result = df.groupby("B").C.is_monotonic_increasing
    index = Index(list("abcd"), name="B")
    expected = Series(index=index, data=out_vals, name="C")
    tm.assert_series_equal(result, expected)

    # Also check result equal to manually taking x.is_monotonic_increasing.
    expected = df.groupby(["B"]).C.apply(lambda x: x.is_monotonic_increasing)
    tm.assert_series_equal(result, expected)
  844. @pytest.mark.parametrize(
  845. "in_vals, out_vals",
  846. [
  847. # Basics: strictly decreasing (T), strictly increasing (F),
  848. # abs val decreasing (F), non-strictly increasing (T)
  849. ([10, 9, 7, 3, 4, 5, -3, 2, 0, 1, 1], [True, False, False, True]),
  850. # Test with inf vals
  851. (
  852. [np.inf, 1, -np.inf, np.inf, 2, -3, -np.inf, 5, -3, -np.inf, -np.inf],
  853. [True, True, False, True],
  854. ),
  855. # Test with nan vals; should always be False
  856. (
  857. [1, 2, np.nan, 3, 2, np.nan, np.nan, 5, -np.inf, 1, np.nan],
  858. [False, False, False, False],
  859. ),
  860. ],
  861. )
  862. def test_is_monotonic_decreasing(in_vals, out_vals):
  863. # GH 17015
  864. source_dict = {
  865. "A": ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11"],
  866. "B": ["a", "a", "a", "b", "b", "b", "c", "c", "c", "d", "d"],
  867. "C": in_vals,
  868. }
  869. df = DataFrame(source_dict)
  870. result = df.groupby("B").C.is_monotonic_decreasing
  871. index = Index(list("abcd"), name="B")
  872. expected = Series(index=index, data=out_vals, name="C")
  873. tm.assert_series_equal(result, expected)
  874. # describe
  875. # --------------------------------
  876. def test_apply_describe_bug(mframe):
  877. grouped = mframe.groupby(level="first")
  878. grouped.describe() # it works!
  879. def test_series_describe_multikey():
  880. ts = tm.makeTimeSeries()
  881. grouped = ts.groupby([lambda x: x.year, lambda x: x.month])
  882. result = grouped.describe()
  883. tm.assert_series_equal(result["mean"], grouped.mean(), check_names=False)
  884. tm.assert_series_equal(result["std"], grouped.std(), check_names=False)
  885. tm.assert_series_equal(result["min"], grouped.min(), check_names=False)
  886. def test_series_describe_single():
  887. ts = tm.makeTimeSeries()
  888. grouped = ts.groupby(lambda x: x.month)
  889. result = grouped.apply(lambda x: x.describe())
  890. expected = grouped.describe().stack()
  891. tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("keys", ["key1", ["key1", "key2"]])
def test_series_describe_as_index(as_index, keys):
    # GH#49256: SeriesGroupBy.describe must respect as_index=False
    df = DataFrame(
        {
            "key1": ["one", "two", "two", "three", "two"],
            "key2": ["one", "two", "two", "three", "two"],
            "foo2": [1, 2, 4, 4, 6],
        }
    )
    gb = df.groupby(keys, as_index=as_index)["foo2"]
    result = gb.describe()
    expected = DataFrame(
        {
            "key1": ["one", "three", "two"],
            "count": [1.0, 1.0, 3.0],
            "mean": [1.0, 4.0, 4.0],
            "std": [np.nan, np.nan, 2.0],
            "min": [1.0, 4.0, 2.0],
            "25%": [1.0, 4.0, 3.0],
            "50%": [1.0, 4.0, 4.0],
            "75%": [1.0, 4.0, 5.0],
            "max": [1.0, 4.0, 6.0],
        }
    )
    if len(keys) == 2:
        # key1 and key2 hold identical data, so key2 mirrors key1
        expected.insert(1, "key2", expected["key1"])
    if as_index:
        expected = expected.set_index(keys)
    tm.assert_frame_equal(result, expected)
  922. def test_series_index_name(df):
  923. grouped = df.loc[:, ["C"]].groupby(df["A"])
  924. result = grouped.agg(lambda x: x.mean())
  925. assert result.index.name == "A"
def test_frame_describe_multikey(tsframe):
    """describe() on a frame groupby matches per-column describes concatenated."""
    grouped = tsframe.groupby([lambda x: x.year, lambda x: x.month])
    result = grouped.describe()
    desc_groups = []
    for col in tsframe:
        group = grouped[col].describe()
        # GH 17464 - Remove duplicate MultiIndex levels
        group_col = MultiIndex(
            levels=[[col], group.columns],
            codes=[[0] * len(group.columns), range(len(group.columns))],
        )
        group = DataFrame(group.values, columns=group_col, index=group.index)
        desc_groups.append(group)
    expected = pd.concat(desc_groups, axis=1)
    tm.assert_frame_equal(result, expected)

    # describe over an axis=1 grouping (columns bucketed into 0 and 1)
    groupedT = tsframe.groupby({"A": 0, "B": 0, "C": 1, "D": 1}, axis=1)
    result = groupedT.describe()
    expected = tsframe.describe().T
    # reverting the change from https://github.com/pandas-dev/pandas/pull/35441/
    expected.index = MultiIndex(
        levels=[[0, 1], expected.index],
        codes=[[0, 0, 1, 1], range(len(expected.index))],
    )
    tm.assert_frame_equal(result, expected)
  950. def test_frame_describe_tupleindex():
  951. # GH 14848 - regression from 0.19.0 to 0.19.1
  952. df1 = DataFrame(
  953. {
  954. "x": [1, 2, 3, 4, 5] * 3,
  955. "y": [10, 20, 30, 40, 50] * 3,
  956. "z": [100, 200, 300, 400, 500] * 3,
  957. }
  958. )
  959. df1["k"] = [(0, 0, 1), (0, 1, 0), (1, 0, 0)] * 5
  960. df2 = df1.rename(columns={"k": "key"})
  961. msg = "Names should be list-like for a MultiIndex"
  962. with pytest.raises(ValueError, match=msg):
  963. df1.groupby("k").describe()
  964. with pytest.raises(ValueError, match=msg):
  965. df2.groupby("key").describe()
  966. def test_frame_describe_unstacked_format():
  967. # GH 4792
  968. prices = {
  969. Timestamp("2011-01-06 10:59:05", tz=None): 24990,
  970. Timestamp("2011-01-06 12:43:33", tz=None): 25499,
  971. Timestamp("2011-01-06 12:54:09", tz=None): 25499,
  972. }
  973. volumes = {
  974. Timestamp("2011-01-06 10:59:05", tz=None): 1500000000,
  975. Timestamp("2011-01-06 12:43:33", tz=None): 5000000000,
  976. Timestamp("2011-01-06 12:54:09", tz=None): 100000000,
  977. }
  978. df = DataFrame({"PRICE": prices, "VOLUME": volumes})
  979. result = df.groupby("PRICE").VOLUME.describe()
  980. data = [
  981. df[df.PRICE == 24990].VOLUME.describe().values.tolist(),
  982. df[df.PRICE == 25499].VOLUME.describe().values.tolist(),
  983. ]
  984. expected = DataFrame(
  985. data,
  986. index=Index([24990, 25499], name="PRICE"),
  987. columns=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
  988. )
  989. tm.assert_frame_equal(result, expected)
@pytest.mark.filterwarnings(
    "ignore:"
    "indexing past lexsort depth may impact performance:"
    "pandas.errors.PerformanceWarning"
)
@pytest.mark.parametrize("as_index", [True, False])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_describe_with_duplicate_output_column_names(as_index, keys):
    # GH 35314: describe must work when the output has duplicate column names
    df = DataFrame(
        {
            "a1": [99, 99, 99, 88, 88, 88],
            "a2": [99, 99, 99, 88, 88, 88],
            "b": [1, 2, 3, 4, 5, 6],
            "c": [10, 20, 30, 40, 50, 60],
        },
        columns=["a1", "a2", "b", "b"],  # duplicate "b" labels in the frame
        copy=False,
    )
    if keys == ["a1"]:
        df = df.drop(columns="a2")

    # records are (column label, statistic, value for group 88, value for group 99);
    # the two 8-row runs cover the two duplicate "b" columns
    expected = (
        DataFrame.from_records(
            [
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
                ("b", "count", 3.0, 3.0),
                ("b", "mean", 5.0, 2.0),
                ("b", "std", 1.0, 1.0),
                ("b", "min", 4.0, 1.0),
                ("b", "25%", 4.5, 1.5),
                ("b", "50%", 5.0, 2.0),
                ("b", "75%", 5.5, 2.5),
                ("b", "max", 6.0, 3.0),
            ],
        )
        .set_index([0, 1])
        .T
    )
    expected.columns.names = [None, None]
    if len(keys) == 2:
        expected.index = MultiIndex(
            levels=[[88, 99], [88, 99]], codes=[[0, 1], [0, 1]], names=["a1", "a2"]
        )
    else:
        expected.index = Index([88, 99], name="a1")
    if not as_index:
        expected = expected.reset_index()
    result = df.groupby(keys, as_index=as_index).describe()
    tm.assert_frame_equal(result, expected)
  1046. def test_describe_duplicate_columns():
  1047. # GH#50806
  1048. df = DataFrame([[0, 1, 2, 3]])
  1049. df.columns = [0, 1, 2, 0]
  1050. gb = df.groupby(df[1])
  1051. result = gb.describe(percentiles=[])
  1052. columns = ["count", "mean", "std", "min", "50%", "max"]
  1053. frames = [
  1054. DataFrame([[1.0, val, np.nan, val, val, val]], index=[1], columns=columns)
  1055. for val in (0.0, 2.0, 3.0)
  1056. ]
  1057. expected = pd.concat(frames, axis=1)
  1058. expected.columns = MultiIndex(
  1059. levels=[[0, 2], columns],
  1060. codes=[6 * [0] + 6 * [1] + 6 * [0], 3 * list(range(6))],
  1061. )
  1062. expected.index.names = [1]
  1063. tm.assert_frame_equal(result, expected)
  1064. def test_groupby_mean_no_overflow():
  1065. # Regression test for (#22487)
  1066. df = DataFrame(
  1067. {
  1068. "user": ["A", "A", "A", "A", "A"],
  1069. "connections": [4970, 4749, 4719, 4704, 18446744073699999744],
  1070. }
  1071. )
  1072. assert df.groupby("user")["connections"].mean()["A"] == 3689348814740003840
@pytest.mark.parametrize(
    "values",
    [
        {
            "a": [1, 1, 1, 2, 2, 2, 3, 3, 3],
            "b": [1, pd.NA, 2, 1, pd.NA, 2, 1, pd.NA, 2],
        },
        {"a": [1, 1, 2, 2, 3, 3], "b": [1, 2, 1, 2, 1, 2]},
    ],
)
@pytest.mark.parametrize("function", ["mean", "median", "var"])
def test_apply_to_nullable_integer_returns_float(values, function):
    # https://github.com/pandas-dev/pandas/issues/32219
    # every group's b-values are {1, 2} (NA skipped): mean/median 1.5, var 0.5
    output = 0.5 if function == "var" else 1.5
    arr = np.array([output] * 3, dtype=float)
    idx = Index([1, 2, 3], name="a", dtype="Int64")
    expected = DataFrame({"b": arr}, index=idx).astype("Float64")

    groups = DataFrame(values, dtype="Int64").groupby("a")

    # the named method, .agg(name), and .agg([name]) must all agree
    result = getattr(groups, function)()
    tm.assert_frame_equal(result, expected)

    result = groups.agg(function)
    tm.assert_frame_equal(result, expected)

    result = groups.agg([function])
    expected.columns = MultiIndex.from_tuples([("b", function)])
    tm.assert_frame_equal(result, expected)
  1098. def test_groupby_sum_below_mincount_nullable_integer():
  1099. # https://github.com/pandas-dev/pandas/issues/32861
  1100. df = DataFrame({"a": [0, 1, 2], "b": [0, 1, 2], "c": [0, 1, 2]}, dtype="Int64")
  1101. grouped = df.groupby("a")
  1102. idx = Index([0, 1, 2], name="a", dtype="Int64")
  1103. result = grouped["b"].sum(min_count=2)
  1104. expected = Series([pd.NA] * 3, dtype="Int64", index=idx, name="b")
  1105. tm.assert_series_equal(result, expected)
  1106. result = grouped.sum(min_count=2)
  1107. expected = DataFrame({"b": [pd.NA] * 3, "c": [pd.NA] * 3}, dtype="Int64", index=idx)
  1108. tm.assert_frame_equal(result, expected)
  1109. def test_mean_on_timedelta():
  1110. # GH 17382
  1111. df = DataFrame({"time": pd.to_timedelta(range(10)), "cat": ["A", "B"] * 5})
  1112. result = df.groupby("cat")["time"].mean()
  1113. expected = Series(
  1114. pd.to_timedelta([4, 5]), name="time", index=Index(["A", "B"], name="cat")
  1115. )
  1116. tm.assert_series_equal(result, expected)
  1117. def test_groupby_sum_timedelta_with_nat():
  1118. # GH#42659
  1119. df = DataFrame(
  1120. {
  1121. "a": [1, 1, 2, 2],
  1122. "b": [pd.Timedelta("1d"), pd.Timedelta("2d"), pd.Timedelta("3d"), pd.NaT],
  1123. }
  1124. )
  1125. td3 = pd.Timedelta(days=3)
  1126. gb = df.groupby("a")
  1127. res = gb.sum()
  1128. expected = DataFrame({"b": [td3, td3]}, index=Index([1, 2], name="a"))
  1129. tm.assert_frame_equal(res, expected)
  1130. res = gb["b"].sum()
  1131. tm.assert_series_equal(res, expected["b"])
  1132. res = gb["b"].sum(min_count=2)
  1133. expected = Series([td3, pd.NaT], dtype="m8[ns]", name="b", index=expected.index)
  1134. tm.assert_series_equal(res, expected)
@pytest.mark.parametrize(
    "kernel, has_arg",
    [
        ("all", False),
        ("any", False),
        ("bfill", False),
        ("corr", True),
        ("corrwith", True),
        ("cov", True),
        ("cummax", True),
        ("cummin", True),
        ("cumprod", True),
        ("cumsum", True),
        ("diff", False),
        ("ffill", False),
        ("fillna", False),
        ("first", True),
        ("idxmax", True),
        ("idxmin", True),
        ("last", True),
        ("max", True),
        ("mean", True),
        ("median", True),
        ("min", True),
        ("nth", False),
        ("nunique", False),
        ("pct_change", False),
        ("prod", True),
        ("quantile", True),
        ("sem", True),
        ("skew", True),
        ("std", True),
        ("sum", True),
        ("var", True),
    ],
)
@pytest.mark.parametrize("numeric_only", [True, False, lib.no_default])
@pytest.mark.parametrize("keys", [["a1"], ["a1", "a2"]])
def test_numeric_only(kernel, has_arg, numeric_only, keys):
    # GH#46072
    # drops_nuisance: Whether the op drops nuisance columns even when numeric_only=False
    # has_arg: Whether the op has a numeric_only arg
    # column "b" is the object-dtype nuisance column used to probe each kernel
    df = DataFrame({"a1": [1, 1], "a2": [2, 2], "a3": [5, 6], "b": 2 * [object]})
    args = get_groupby_method_args(kernel, df)
    kwargs = {} if numeric_only is lib.no_default else {"numeric_only": numeric_only}

    gb = df.groupby(keys)
    method = getattr(gb, kernel)
    if has_arg and numeric_only is True:
        # Cases where b does not appear in the result
        result = method(*args, **kwargs)
        assert "b" not in result.columns
    elif (
        # kernels that work on any dtype and have numeric_only arg
        kernel in ("first", "last")
        or (
            # kernels that work on any dtype and don't have numeric_only arg
            kernel in ("any", "all", "bfill", "ffill", "fillna", "nth", "nunique")
            and numeric_only is lib.no_default
        )
    ):
        result = method(*args, **kwargs)
        assert "b" in result.columns
    elif has_arg or kernel in ("idxmax", "idxmin"):
        assert numeric_only is not True
        # kernels that are successful on any dtype were above; this will fail
        # object dtypes for transformations are not implemented in Cython and
        # have no Python fallback
        exception = NotImplementedError if kernel.startswith("cum") else TypeError

        # accept any of the error messages the various kernels emit
        msg = "|".join(
            [
                "not allowed for this dtype",
                "must be a string or a number",
                "cannot be performed against 'object' dtypes",
                "must be a string or a real number",
                "unsupported operand type",
                "not supported between instances of",
                "function is not implemented for this dtype",
            ]
        )
        with pytest.raises(exception, match=msg):
            method(*args, **kwargs)
    elif not has_arg and numeric_only is not lib.no_default:
        # passing numeric_only to a kernel that lacks the argument must raise
        with pytest.raises(
            TypeError, match="got an unexpected keyword argument 'numeric_only'"
        ):
            method(*args, **kwargs)
    else:
        assert kernel in ("diff", "pct_change")
        assert numeric_only is lib.no_default
        # Doesn't have numeric_only argument and fails on nuisance columns
        with pytest.raises(TypeError, match=r"unsupported operand type"):
            method(*args, **kwargs)
@pytest.mark.parametrize("dtype", [bool, int, float, object])
def test_deprecate_numeric_only_series(dtype, groupby_func, request):
    # GH#46560: exercise every SeriesGroupBy kernel with and without
    # numeric_only across dtypes
    if groupby_func == "corrwith":
        msg = "corrwith is not implemented on SeriesGroupBy"
        request.node.add_marker(pytest.mark.xfail(reason=msg))

    grouper = [0, 0, 1]

    ser = Series([1, 0, 0], dtype=dtype)
    gb = ser.groupby(grouper)
    method = getattr(gb, groupby_func)

    # reference groupby over the same data with the default (int) dtype
    expected_ser = Series([1, 0, 0])
    expected_gb = expected_ser.groupby(grouper)
    expected_method = getattr(expected_gb, groupby_func)

    args = get_groupby_method_args(groupby_func, ser)

    # kernels that raise on object-dtype input
    fails_on_numeric_object = (
        "corr",
        "cov",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
        "idxmax",
        "idxmin",
        "quantile",
    )
    # ops that give an object result on object input
    obj_result = (
        "first",
        "last",
        "nth",
        "bfill",
        "ffill",
        "shift",
        "sum",
        "diff",
        "pct_change",
        "var",
        "mean",
        "median",
        "min",
        "max",
        "prod",
    )

    # Test default behavior; kernels that fail may be enabled in the future but kernels
    # that succeed should not be allowed to fail (without deprecation, at least)
    if groupby_func in fails_on_numeric_object and dtype is object:
        if groupby_func in ("idxmax", "idxmin"):
            msg = "not allowed for this dtype"
        elif groupby_func == "quantile":
            msg = "cannot be performed against 'object' dtypes"
        else:
            msg = "is not supported for object dtype"
        with pytest.raises(TypeError, match=msg):
            method(*args)
    elif dtype is object:
        result = method(*args)
        expected = expected_method(*args)
        if groupby_func in obj_result:
            expected = expected.astype(object)
        tm.assert_series_equal(result, expected)

    # kernels that accept the numeric_only keyword
    has_numeric_only = (
        "first",
        "last",
        "max",
        "mean",
        "median",
        "min",
        "prod",
        "quantile",
        "sem",
        "skew",
        "std",
        "sum",
        "var",
        "cummax",
        "cummin",
        "cumprod",
        "cumsum",
    )
    if groupby_func not in has_numeric_only:
        msg = "got an unexpected keyword argument 'numeric_only'"
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    elif dtype is object:
        # numeric_only=True on non-numeric data raises, with per-kernel messages
        msg = "|".join(
            [
                "SeriesGroupBy.sem called with numeric_only=True and dtype object",
                "Series.skew does not allow numeric_only=True with non-numeric",
                "cum(sum|prod|min|max) is not supported for object dtype",
                r"Cannot use numeric_only=True with SeriesGroupBy\..* and non-numeric",
            ]
        )
        with pytest.raises(TypeError, match=msg):
            method(*args, numeric_only=True)
    else:
        # on numeric data, numeric_only=True/False must agree
        result = method(*args, numeric_only=True)
        expected = method(*args, numeric_only=False)
        tm.assert_series_equal(result, expected)
@pytest.mark.parametrize("dtype", [int, float, object])
@pytest.mark.parametrize(
    "kwargs",
    [
        {"percentiles": [0.10, 0.20, 0.30], "include": "all", "exclude": None},
        {"percentiles": [0.10, 0.20, 0.30], "include": None, "exclude": ["int"]},
        {"percentiles": [0.10, 0.20, 0.30], "include": ["int"], "exclude": None},
    ],
)
def test_groupby_empty_dataset(dtype, kwargs):
    # GH#41575: describe on an empty groupby keeps the non-empty schema
    df = DataFrame([[1, 2, 3]], columns=["A", "B", "C"], dtype=dtype)
    df["B"] = df["B"].astype(int)
    df["C"] = df["C"].astype(float)

    result = df.iloc[:0].groupby("A").describe(**kwargs)
    # expected: same columns as the non-empty describe, but zero rows
    expected = df.groupby("A").describe(**kwargs).reset_index(drop=True).iloc[:0]
    tm.assert_frame_equal(result, expected)

    result = df.iloc[:0].groupby("A").B.describe(**kwargs)
    expected = df.groupby("A").B.describe(**kwargs).reset_index(drop=True).iloc[:0]
    expected.index = Index([])
    tm.assert_frame_equal(result, expected)
  1346. def test_corrwith_with_1_axis():
  1347. # GH 47723
  1348. df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})
  1349. result = df.groupby("a").corrwith(df, axis=1)
  1350. index = Index(
  1351. data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)],
  1352. name=("a", None),
  1353. )
  1354. expected = Series([np.nan] * 6, index=index)
  1355. tm.assert_series_equal(result, expected)
  1356. def test_multiindex_group_all_columns_when_empty(groupby_func):
  1357. # GH 32464
  1358. df = DataFrame({"a": [], "b": [], "c": []}).set_index(["a", "b", "c"])
  1359. gb = df.groupby(["a", "b", "c"], group_keys=False)
  1360. method = getattr(gb, groupby_func)
  1361. args = get_groupby_method_args(groupby_func, df)
  1362. result = method(*args).index
  1363. expected = df.index
  1364. tm.assert_index_equal(result, expected)
def test_duplicate_columns(request, groupby_func, as_index):
    # GH#50806: every groupby kernel must handle duplicate column labels;
    # compare against the same frame with unique labels, renamed back after.
    if groupby_func == "corrwith":
        msg = "GH#50845 - corrwith fails when there are duplicate columns"
        request.node.add_marker(pytest.mark.xfail(reason=msg))
    df = DataFrame([[1, 3, 6], [1, 4, 7], [2, 5, 8]], columns=list("abb"))
    args = get_groupby_method_args(groupby_func, df)
    gb = df.groupby("a", as_index=as_index)
    result = getattr(gb, groupby_func)(*args)

    # same data under unique column labels ("c" stands in for the second "b")
    expected_df = df.set_axis(["a", "b", "c"], axis=1)
    expected_args = get_groupby_method_args(groupby_func, expected_df)
    expected_gb = expected_df.groupby("a", as_index=as_index)
    expected = getattr(expected_gb, groupby_func)(*expected_args)
    if groupby_func not in ("size", "ngroup", "cumcount"):
        # these three kernels return a Series, so there is no column to rename
        expected = expected.rename(columns={"c": "b"})
    tm.assert_equal(result, expected)