  1. """
  2. Define the SeriesGroupBy and DataFrameGroupBy
  3. classes that hold the groupby interfaces (and some implementations).
  4. These are user facing as the result of the ``df.groupby(...)`` operations,
  5. which here returns a DataFrameGroupBy object.
  6. """
  7. from __future__ import annotations
  8. from collections import abc
  9. from functools import partial
  10. from textwrap import dedent
  11. from typing import (
  12. TYPE_CHECKING,
  13. Any,
  14. Callable,
  15. Hashable,
  16. Iterable,
  17. Literal,
  18. Mapping,
  19. NamedTuple,
  20. Sequence,
  21. TypeVar,
  22. Union,
  23. cast,
  24. )
  25. import numpy as np
  26. from pandas._libs import (
  27. Interval,
  28. lib,
  29. reduction as libreduction,
  30. )
  31. from pandas._typing import (
  32. ArrayLike,
  33. Axis,
  34. AxisInt,
  35. CorrelationMethod,
  36. FillnaOptions,
  37. IndexLabel,
  38. Manager,
  39. Manager2D,
  40. SingleManager,
  41. TakeIndexer,
  42. )
  43. from pandas.errors import SpecificationError
  44. from pandas.util._decorators import (
  45. Appender,
  46. Substitution,
  47. doc,
  48. )
  49. from pandas.core.dtypes.common import (
  50. ensure_int64,
  51. is_bool,
  52. is_categorical_dtype,
  53. is_dict_like,
  54. is_integer_dtype,
  55. is_interval_dtype,
  56. is_numeric_dtype,
  57. is_scalar,
  58. )
  59. from pandas.core.dtypes.missing import (
  60. isna,
  61. notna,
  62. )
  63. from pandas.core import algorithms
  64. from pandas.core.apply import (
  65. GroupByApply,
  66. maybe_mangle_lambdas,
  67. reconstruct_func,
  68. validate_func_kwargs,
  69. )
  70. import pandas.core.common as com
  71. from pandas.core.frame import DataFrame
  72. from pandas.core.groupby import base
  73. from pandas.core.groupby.groupby import (
  74. GroupBy,
  75. GroupByPlot,
  76. _agg_template,
  77. _apply_docs,
  78. _transform_template,
  79. )
  80. from pandas.core.indexes.api import (
  81. Index,
  82. MultiIndex,
  83. all_indexes_same,
  84. default_index,
  85. )
  86. from pandas.core.series import Series
  87. from pandas.core.util.numba_ import maybe_use_numba
  88. from pandas.plotting import boxplot_frame_groupby
  89. if TYPE_CHECKING:
  90. from pandas import Categorical
  91. from pandas.core.generic import NDFrame
  92. # TODO(typing) the return value on this callable should be any *scalar*.
  93. AggScalar = Union[str, Callable[..., Any]]
  94. # TODO: validate types on ScalarResult and move to _typing
  95. # Blocked from using by https://github.com/python/mypy/issues/1484
  96. # See note at _mangle_lambda_list
  97. ScalarResult = TypeVar("ScalarResult")
  98. class NamedAgg(NamedTuple):
  99. """
  100. Helper for column specific aggregation with control over output column names.
  101. Subclass of typing.NamedTuple.
  102. Parameters
  103. ----------
  104. column : Hashable
  105. Column label in the DataFrame to apply aggfunc.
  106. aggfunc : function or str
  107. Function to apply to the provided column. If string, the name of a built-in
  108. pandas function.
  109. Examples
  110. --------
  111. >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]})
  112. >>> agg_a = pd.NamedAgg(column="a", aggfunc="min")
  113. >>> agg_1 = pd.NamedAgg(column=1, aggfunc=np.mean)
  114. >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1)
  115. result_a result_1
  116. key
  117. 1 -1 10.5
  118. 2 1 12.0
  119. """
  120. column: Hashable
  121. aggfunc: AggScalar
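

# Note: in named aggregation a plain 2-tuple ``(column, aggfunc)`` works the
# same way as a NamedAgg; the NamedTuple exists to make call sites
# self-documenting (see the named-aggregation examples in _agg_examples_doc
# below).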


class SeriesGroupBy(GroupBy[Series]):
    def _wrap_agged_manager(self, mgr: Manager) -> Series:
        return self.obj._constructor(mgr, name=self.obj.name)

    def _get_data_to_aggregate(
        self, *, numeric_only: bool = False, name: str | None = None
    ) -> SingleManager:
        ser = self._selected_obj
        single = ser._mgr
        if numeric_only and not is_numeric_dtype(ser.dtype):
            # GH#41291 match Series behavior
            kwd_name = "numeric_only"
            raise TypeError(
                f"Cannot use {kwd_name}=True with "
                f"{type(self).__name__}.{name} and non-numeric dtypes."
            )
        return single

    def _iterate_slices(self) -> Iterable[Series]:
        yield self._selected_obj

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4])

    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).min()
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg('min')
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
       min  max
    1    1    2
    2    3    4

    The output column names can be controlled by passing
    the desired column names and aggregations as keyword arguments.

    >>> s.groupby([1, 1, 2, 2]).agg(
    ...     minimum='min',
    ...     maximum='max',
    ... )
       minimum  maximum
    1        1        2
    2        3        4

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating
        function.

    >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
    1    1.0
    2    3.0
    dtype: float64
    """
    )

    @Appender(
        _apply_docs["template"].format(
            input="series", examples=_apply_docs["series_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> Series:
        return super().apply(func, *args, **kwargs)

    @doc(_agg_template, examples=_agg_examples_doc, klass="Series")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._aggregate_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        relabeling = func is None
        columns = None
        if relabeling:
            columns, func = validate_func_kwargs(kwargs)
            kwargs = {}
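
        # Dispatch: a string alias resolves to the groupby method of the same
        # name; an iterable of funcs goes through _aggregate_multiple_funcs;
        # a plain callable falls through to the cython/python agg paths below.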
        if isinstance(func, str):
            return getattr(self, func)(*args, **kwargs)

        elif isinstance(func, abc.Iterable):
            # Catch instances of lists / tuples
            # but not the class list / tuple itself.
            func = maybe_mangle_lambdas(func)
            ret = self._aggregate_multiple_funcs(func, *args, **kwargs)
            if relabeling:
                # columns is not narrowed by mypy from relabeling flag
                assert columns is not None  # for mypy
                ret.columns = columns
            if not self.as_index:
                ret = ret.reset_index()
            return ret

        else:
            cyfunc = com.get_cython_func(func)
            if cyfunc and not args and not kwargs:
                return getattr(self, cyfunc)()

            if self.ngroups == 0:
                # e.g. test_evaluate_with_empty_groups without any groups to
                #  iterate over, we have no output on which to do dtype
                #  inference. We default to using the existing dtype.
                #  xref GH#51445
                obj = self._obj_with_exclusions
                return self.obj._constructor(
                    [],
                    name=self.obj.name,
                    index=self.grouper.result_index,
                    dtype=obj.dtype,
                )

            if self.grouper.nkeys > 1:
                return self._python_agg_general(func, *args, **kwargs)

            try:
                return self._python_agg_general(func, *args, **kwargs)
            except KeyError:
                # KeyError raised in test_groupby.test_basic is bc the func does
                #  a dictionary lookup on group.name, but group name is not
                #  pinned in _python_agg_general, only in _aggregate_named
                result = self._aggregate_named(func, *args, **kwargs)

                # result is a dict whose keys are the elements of result_index
                result = Series(result, index=self.grouper.result_index)
                result = self._wrap_aggregated_output(result)
                return result

    agg = aggregate

    def _python_agg_general(self, func, *args, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        obj = self._obj_with_exclusions
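        # grouper.agg_series applies ``f`` to each group's values and returns
        # an array-like aligned with ``grouper.result_index``.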
        result = self.grouper.agg_series(obj, f)
        res = obj._constructor(result, name=obj.name)
        return self._wrap_aggregated_output(res)

    def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
        if isinstance(arg, dict):
            if self.as_index:
                # GH 15931
                raise SpecificationError("nested renamer is not supported")
            else:
                # GH#50684 - This accidentally worked in 1.x
                arg = list(arg.items())
        elif any(isinstance(x, (tuple, list)) for x in arg):
            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
        else:
            # list of functions / function names
            columns = []
            for f in arg:
                columns.append(com.get_callable_name(f) or f)

            arg = zip(columns, arg)

        results: dict[base.OutputKey, DataFrame | Series] = {}
        with com.temp_setattr(self, "as_index", True):
            # Combine results using the index, need to adjust index after
            # if as_index=False (GH#50724)
            for idx, (name, func) in enumerate(arg):
                key = base.OutputKey(label=name, position=idx)
                results[key] = self.aggregate(func, *args, **kwargs)

        if any(isinstance(x, DataFrame) for x in results.values()):
            from pandas import concat

            res_df = concat(
                results.values(), axis=1, keys=[key.label for key in results]
            )
            return res_df

        indexed_output = {key.position: val for key, val in results.items()}
        output = self.obj._constructor_expanddim(indexed_output, index=None)
        output.columns = Index(key.label for key in results)

        return output

    def _wrap_applied_output(
        self,
        data: Series,
        values: list[Any],
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ) -> DataFrame | Series:
        """
        Wrap the output of SeriesGroupBy.apply into the expected result.

        Parameters
        ----------
        data : Series
            Input data for groupby operation.
        values : List[Any]
            Applied output for each group.
        not_indexed_same : bool, default False
            Whether the applied outputs are not indexed the same as the group axes.

        Returns
        -------
        DataFrame or Series
        """
        if len(values) == 0:
            # GH #6265
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self.grouper.result_index

            return self.obj._constructor(
                [],
                name=self.obj.name,
                index=res_index,
                dtype=data.dtype,
            )
        assert values is not None

        if isinstance(values[0], dict):
            # GH #823 #24880
            index = self.grouper.result_index
            res_df = self.obj._constructor_expanddim(values, index=index)
            res_df = self._reindex_output(res_df)
            # if self.observed is False,
            # keep all-NaN rows created while re-indexing
            res_ser = res_df.stack(dropna=self.observed)
            res_ser.name = self.obj.name
            return res_ser
        elif isinstance(values[0], (Series, DataFrame)):
            result = self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )
            if isinstance(result, Series):
                result.name = self.obj.name
            if not self.as_index and not_indexed_same:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return result
        else:
            # GH #6265 #24880
            result = self.obj._constructor(
                data=values, index=self.grouper.result_index, name=self.obj.name
            )
            if not self.as_index:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return self._reindex_output(result)

    def _aggregate_named(self, func, *args, **kwargs):
        # Note: this is very similar to _aggregate_series_pure_python,
        #  but that does not pin group.name
        result = {}
        initialized = False

        for name, group in self:
            object.__setattr__(group, "name", name)

            output = func(group, *args, **kwargs)
            output = libreduction.extract_result(output)
            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(output, group.dtype)
                initialized = True
            result[name] = output

        return result

    __examples_series_doc = dedent(
        """
    >>> ser = pd.Series(
    ...     [390.0, 350.0, 30.0, 20.0],
    ...     index=["Falcon", "Falcon", "Parrot", "Parrot"],
    ...     name="Max Speed")
    >>> grouped = ser.groupby([1, 1, 2, 2])
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
    Falcon    0.707107
    Falcon   -0.707107
    Parrot    0.707107
    Parrot   -0.707107
    Name: Max Speed, dtype: float64

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
    Falcon    40.0
    Falcon    40.0
    Parrot    10.0
    Parrot    10.0
    Name: Max Speed, dtype: float64

    >>> grouped.transform("mean")
    Falcon    370.0
    Falcon    370.0
    Parrot     25.0
    Parrot     25.0
    Name: Max Speed, dtype: float64

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed
        ``func``, for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
    Falcon    390
    Falcon    390
    Parrot     30
    Parrot     30
    Name: Max Speed, dtype: int64
    """
    )

    @Substitution(klass="Series", example=__examples_series_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        assert axis == 0  # handled by caller

        obj = self._selected_obj

        try:
            result = self.grouper._cython_operation(
                "transform", obj._values, how, axis, **kwargs
            )
        except NotImplementedError as err:
            # e.g. test_groupby_raises_string
            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

        return obj._constructor(result, index=self.obj.index, name=obj.name)

    def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
        """
        Transform with a callable ``func``.
        """
        assert callable(func)
        klass = type(self.obj)

        results = []
        for name, group in self.grouper.get_iterator(
            self._selected_obj, axis=self.axis
        ):
            # this setattr is needed for test_transform_lambda_with_datetimetz
            object.__setattr__(group, "name", name)
            res = func(group, *args, **kwargs)

            results.append(klass(res, index=group.index))

        # check for empty "results" to avoid concat ValueError
        if results:
            from pandas.core.reshape.concat import concat

            concatenated = concat(results)
            result = self._set_result_index_ordered(concatenated)
        else:
            result = self.obj._constructor(dtype=np.float64)

        result.name = self.obj.name
        return result

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        Series

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64
        """
        if isinstance(func, str):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x) -> bool:
            b = wrapper(x)
            return notna(b) and b

        try:
            indices = [
                self._get_index(name) for name, group in self if true_and_notna(group)
            ]
        except (ValueError, TypeError) as err:
            raise TypeError("the filter must return a boolean result") from err

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna: bool = True) -> Series | DataFrame:
        """
        Return number of unique elements in the group.

        Returns
        -------
        Series
            Number of unique values within each group.
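
        Examples
        --------
        A minimal sketch with made-up data, grouping by the index:

        >>> ser = pd.Series([1, 2, 3, 3], index=["a", "a", "b", "b"])
        >>> ser.groupby(level=0).nunique()
        a    2
        b    1
        dtype: int64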
  498. """
  499. ids, _, _ = self.grouper.group_info
  500. val = self.obj._values
  501. codes, _ = algorithms.factorize(val, sort=False)
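        # Sort by (group id, value code) so that, within each group, equal
        # values end up adjacent; uniques are then counted by scanning for
        # positions where the code changes.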
        sorter = np.lexsort((codes, ids))
        codes = codes[sorter]
        ids = ids[sorter]

        # group boundaries are where group ids change
        # unique observations are where sorted values change
        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
        inc = np.r_[1, codes[1:] != codes[:-1]]

        # 1st item of each group is a new unique observation
        mask = codes == -1
        if dropna:
            inc[idx] = 1
            inc[mask] = 0
        else:
            inc[mask & np.r_[False, mask[:-1]]] = 0
            inc[idx] = 1

        out = np.add.reduceat(inc, idx).astype("int64", copy=False)
        if len(ids):
            # NaN/NaT group exists if the head of ids is -1,
            # so remove it from res and exclude its index from idx
            if ids[0] == -1:
                res = out[1:]
                idx = idx[np.flatnonzero(idx)]
            else:
                res = out
        else:
            res = out[1:]

        ri = self.grouper.result_index

        # we might have duplications among the bins
        if len(res) != len(ri):
            res, out = np.zeros(len(ri), dtype=out.dtype), res
            if len(ids) > 0:
                # GH#21334
                res[ids[idx]] = out

        result: Series | DataFrame = self.obj._constructor(
            res, index=ri, name=self.obj.name
        )
        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))
        return self._reindex_output(result, fill_value=0)

    @doc(Series.describe)
    def describe(self, **kwargs):
        return super().describe(**kwargs)

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series | DataFrame:
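        # Illustrative behavior with made-up data: counts (or, with
        # normalize=True, proportions) of each value within each group,
        # indexed by (group key, value), roughly:
        #   >>> pd.Series([1, 1, 2], index=["a", "a", "b"]).groupby(level=0).value_counts()
        #   a  1    2
        #   b  2    1
        #   Name: count, dtype: int64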
  553. name = "proportion" if normalize else "count"
  554. if bins is None:
  555. result = self._value_counts(
  556. normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
  557. )
  558. result.name = name
  559. return result
  560. from pandas.core.reshape.merge import get_join_indexers
  561. from pandas.core.reshape.tile import cut
  562. ids, _, _ = self.grouper.group_info
  563. val = self.obj._values
  564. index_names = self.grouper.names + [self.obj.name]
  565. if is_categorical_dtype(val.dtype) or (
  566. bins is not None and not np.iterable(bins)
  567. ):
  568. # scalar bins cannot be done at top level
  569. # in a backward compatible way
  570. # GH38672 relates to categorical dtype
  571. ser = self.apply(
  572. Series.value_counts,
  573. normalize=normalize,
  574. sort=sort,
  575. ascending=ascending,
  576. bins=bins,
  577. )
  578. ser.name = name
  579. ser.index.names = index_names
  580. return ser
  581. # groupby removes null keys from groupings
  582. mask = ids != -1
  583. ids, val = ids[mask], val[mask]
  584. if bins is None:
  585. lab, lev = algorithms.factorize(val, sort=True)
  586. llab = lambda lab, inc: lab[inc]
  587. else:
  588. # lab is a Categorical with categories an IntervalIndex
  589. cat_ser = cut(Series(val, copy=False), bins, include_lowest=True)
  590. cat_obj = cast("Categorical", cat_ser._values)
  591. lev = cat_obj.categories
  592. lab = lev.take(
  593. cat_obj.codes,
  594. allow_fill=True,
  595. fill_value=lev._na_value,
  596. )
  597. llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]
  598. if is_interval_dtype(lab.dtype):
  599. # TODO: should we do this inside II?
  600. lab_interval = cast(Interval, lab)
  601. sorter = np.lexsort((lab_interval.left, lab_interval.right, ids))
  602. else:
  603. sorter = np.lexsort((lab, ids))
  604. ids, lab = ids[sorter], lab[sorter]
  605. # group boundaries are where group ids change
  606. idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
  607. idx = np.r_[0, idchanges]
  608. if not len(ids):
  609. idx = idchanges
  610. # new values are where sorted labels change
  611. lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
  612. inc = np.r_[True, lchanges]
  613. if not len(val):
  614. inc = lchanges
  615. inc[idx] = True # group boundaries are also new values
  616. out = np.diff(np.nonzero(np.r_[inc, True])[0]) # value counts
  617. # num. of times each group should be repeated
  618. rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))
  619. # multi-index components
  620. codes = self.grouper.reconstructed_codes
  621. codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
  622. levels = [ping.group_index for ping in self.grouper.groupings] + [lev]
  623. if dropna:
  624. mask = codes[-1] != -1
  625. if mask.all():
  626. dropna = False
  627. else:
  628. out, codes = out[mask], [level_codes[mask] for level_codes in codes]
  629. if normalize:
  630. out = out.astype("float")
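            # Per-group totals: the widths between group boundaries, less any
            # dropped NA labels, repeated so they align with ``out``.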
            d = np.diff(np.r_[idx, len(ids)])
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, codes[-1] = out[sorter], codes[-1][sorter]

        if bins is not None:
            # for compat. with libgroupby.value_counts need to ensure every
            # bin is present at every index level, null filled with zeros
            diff = np.zeros(len(out), dtype="bool")
            for level_codes in codes[:-1]:
                diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

            ncat, nbin = diff.sum(), len(levels[-1])

            left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

            right = [diff.cumsum() - 1, codes[-1]]

            _, idx = get_join_indexers(left, right, sort=False, how="left")
            out = np.where(idx != -1, out[idx], 0)

            if sort:
                sorter = np.lexsort((out if ascending else -out, left[0]))
                out, left[-1] = out[sorter], left[-1][sorter]

            # build the multi-index w/ full levels
            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
                return np.repeat(lev_codes[diff], nbin)

            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
            codes.append(left[-1])

        mi = MultiIndex(
            levels=levels, codes=codes, names=index_names, verify_integrity=False
        )

        if is_integer_dtype(out.dtype):
            out = ensure_int64(out)
        result = self.obj._constructor(out, index=mi, name=name)
        if not self.as_index:
            result = result.reset_index()
        return result

    def fillna(
        self,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit: int | None = None,
        downcast: dict | None = None,
    ) -> Series | None:
        """
        Fill NA/NaN values using the specified method within groups.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not
            ``method`` should prefer :meth:`.Series.fillna` as this
            will produce the same result and be more performant.
        method : {'bfill', 'ffill', None}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this
            is the maximum number of entries along the entire axis where NaNs
            will be filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        Series
            Object with missing values filled within groups.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> ser = pd.Series([np.nan, np.nan, 2, 3, np.nan, np.nan])
        >>> ser
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Propagate non-null values forward or backward within each group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill")
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    3.0
        dtype: float64

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="bfill")
        0    2.0
        1    2.0
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Only replace the first NaN element within a group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill", limit=1)
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    NaN
        dtype: float64
        """
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis = 0,
        **kwargs,
    ) -> Series:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.SeriesGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take in each group.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
            For `SeriesGroupBy` this parameter is unused and defaults to 0.
        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        Series
            A Series containing the elements taken from each group.

        See Also
        --------
        Series.take : Take elements from a Series along an axis.
        Series.loc : Select a subset of a DataFrame by labels.
        Series.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.
        SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df["name"].groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 in each group (default).

        >>> gb.take([0, 1])
        1  4    falcon
           3    parrot
        2  2      lion
           1    monkey
        Name: name, dtype: object

        We may take elements using negative integers for positive indices,
        starting from the end of the object, just like with Python lists.

        >>> gb.take([-1, -2])
        1  3    parrot
           4    falcon
        2  0    rabbit
           1    monkey
        Name: name, dtype: object
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> Series:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.
            This parameter is only for compatibility with DataFrame and is unused.
        skipna : bool, default True
            Exclude NA/null values when computing the result.
        numeric_only : bool, default False
            Include only float, int, boolean columns. Not implemented for Series.
        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        Series

        See Also
        --------
        Series.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.],
        ...                 index=['Falcon', 'Falcon', 'Falcon', 'Falcon',
        ...                        'Parrot', 'Parrot', 'Parrot'],
        ...                 name="Max Speed")
        >>> ser
        Falcon    390.0
        Falcon    350.0
        Falcon    357.0
        Falcon      NaN
        Parrot     22.0
        Parrot     20.0
        Parrot     30.0
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew()
        Falcon    1.525174
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew(skipna=False)
        Falcon         NaN
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        """
        result = self._op_via_apply(
            "skew",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
            **kwargs,
        )
        return result

    @property
    @doc(Series.plot.__doc__)
    def plot(self):
        result = GroupByPlot(self)
        return result

    @doc(Series.nlargest.__doc__)
    def nlargest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nlargest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.nsmallest.__doc__)
    def nsmallest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nsmallest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        # already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.idxmin.__doc__)
    def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
        return result

    @doc(Series.idxmax.__doc__)
    def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
        return result

    @doc(Series.corr.__doc__)
    def corr(
        self,
        other: Series,
        method: CorrelationMethod = "pearson",
        min_periods: int | None = None,
    ) -> Series:
        result = self._op_via_apply(
            "corr", other=other, method=method, min_periods=min_periods
        )
        return result

    @doc(Series.cov.__doc__)
    def cov(
        self, other: Series, min_periods: int | None = None, ddof: int | None = 1
    ) -> Series:
        result = self._op_via_apply(
            "cov", other=other, min_periods=min_periods, ddof=ddof
        )
        return result

    @property
    @doc(Series.is_monotonic_increasing.__doc__)
    def is_monotonic_increasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_increasing)

    @property
    @doc(Series.is_monotonic_decreasing.__doc__)
    def is_monotonic_decreasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_decreasing)

    @doc(Series.hist.__doc__)
    def hist(
        self,
        by=None,
        ax=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        figsize: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            by=by,
            ax=ax,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            figsize=figsize,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(Series.dtype.__doc__)
    def dtype(self) -> Series:
        return self.apply(lambda ser: ser.dtype)

    @doc(Series.unique.__doc__)
    def unique(self) -> Series:
        result = self._op_via_apply("unique")
        return result


class DataFrameGroupBy(GroupBy[DataFrame]):
    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {
    ...         "A": [1, 1, 2, 2],
    ...         "B": [1, 2, 3, 4],
    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860],
    ...     }
    ... )

    >>> df
       A  B         C
    0  1  1  0.362838
    1  1  2  0.227877
    2  2  3  1.267767
    3  2  4 -0.562860

    The aggregation is for each column.

    >>> df.groupby('A').agg('min')
       B         C
    A
    1  1  0.227877
    2  3 -0.562860

    Multiple aggregations

    >>> df.groupby('A').agg(['min', 'max'])
        B             C
      min max       min       max
    A
    1   1   2  0.227877  0.362838
    2   3   4 -0.562860  1.267767

    Select a column for aggregation

    >>> df.groupby('A').B.agg(['min', 'max'])
       min  max
    A
    1    1    2
    2    3    4

    User-defined function for aggregation

    >>> df.groupby('A').agg(lambda x: sum(x) + 2)
       B         C
    A
    1  5  2.590715
    2  9  2.704907

    Different aggregations per column

    >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
        B             C
      min max       sum
    A
    1   1   2  0.590715
    2   3   4  0.704907

    To control the output names with different aggregations per column,
    pandas supports "named aggregation"

    >>> df.groupby("A").agg(
    ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
       b_min     c_sum
    A
    1      1  0.590715
    2      3  0.704907

    - The keywords are the *output* column names
    - The values are tuples whose first element is the column to select
      and the second element is the aggregation to apply to that column.
      Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
      ``['column', 'aggfunc']`` to make it clearer what the arguments are.
      As usual, the aggregation can be a callable or a string alias.

    See :ref:`groupby.aggregate.named` for more.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the aggregating
        function.

    >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
         B
    A
    1  1.0
    2  3.0
    """
    )

    @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._aggregate_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        relabeling, func, columns, order = reconstruct_func(func, **kwargs)
        func = maybe_mangle_lambdas(func)

        op = GroupByApply(self, func, args, kwargs)
        result = op.agg()
        if not is_dict_like(func) and result is not None:
            return result
        elif relabeling:
            # this should be the only (non-raising) case with relabeling
            # used reordered index of columns
            result = cast(DataFrame, result)
            result = result.iloc[:, order]
            result = cast(DataFrame, result)
            # error: Incompatible types in assignment (expression has type
            # "Optional[List[str]]", variable has type
            # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
            # Index, Series], Sequence[Any]]")
            result.columns = columns  # type: ignore[assignment]

        if result is None:
            # grouper specific aggregations
            if self.grouper.nkeys > 1:
                # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
                return self._python_agg_general(func, *args, **kwargs)
            elif args or kwargs:
                # test_pass_args_kwargs gets here (with and without as_index)
                # can't return early
                result = self._aggregate_frame(func, *args, **kwargs)
            elif self.axis == 1:
                # _aggregate_multiple_funcs does not allow self.axis == 1
                # Note: axis == 1 precludes 'not self.as_index', see __init__
                result = self._aggregate_frame(func)
                return result
            else:
                # try to treat as if we are passing a list
                gba = GroupByApply(self, [func], args=(), kwargs={})
                try:
                    result = gba.agg()
                except ValueError as err:
                    if "No objects to concatenate" not in str(err):
                        raise
                    # _aggregate_frame can fail with e.g. func=Series.mode,
                    # where it expects 1D values but would be getting 2D values
                    # In other tests, using aggregate_frame instead of GroupByApply
                    # would give correct values but incorrect dtypes
                    #  object vs float64 in test_cython_agg_empty_buckets
                    #  float64 vs int64 in test_category_order_apply
                    result = self._aggregate_frame(func)
                else:
                    # GH#32040, GH#35246
                    # e.g. test_groupby_as_index_select_column_sum_empty_df
                    result = cast(DataFrame, result)
                    result.columns = self._obj_with_exclusions.columns.copy()

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))

        return result

    agg = aggregate

    def _python_agg_general(self, func, *args, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" (excluding exclusions) to populate output dict
        output: dict[base.OutputKey, ArrayLike] = {}

        if self.ngroups == 0:
            # e.g. test_evaluate_with_empty_groups different path gets different
            #  result dtype in empty case.
            return self._python_apply_general(f, self._selected_obj, is_agg=True)

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            result = self.grouper.agg_series(obj, f)
            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        if not output:
            # e.g. test_margins_no_values_no_cols
            return self._python_apply_general(f, self._selected_obj)

        res = self._indexed_output_to_ndframe(output)
        return self._wrap_aggregated_output(res)

    def _iterate_slices(self) -> Iterable[Series]:
        obj = self._selected_obj
        if self.axis == 1:
            obj = obj.T

        if isinstance(obj, Series) and obj.name not in self.exclusions:
            # Occurs when doing DataFrameGroupBy(...)["X"]
            yield obj
        else:
            for label, values in obj.items():
                if label in self.exclusions:
                    # Note: if we tried to just iterate over _obj_with_exclusions,
                    #  we would break test_wrap_agg_out by yielding a column
                    #  that is skipped here but not dropped from obj_with_exclusions
                    continue

                yield values

    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
        if self.grouper.nkeys != 1:
            raise AssertionError("Number of keys must be 1")

        obj = self._obj_with_exclusions

        result: dict[Hashable, NDFrame | np.ndarray] = {}
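        # Build {group name -> func(group frame)} column-by-column, then
        # transpose below so that rows correspond to groups when axis == 0.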
  1173. for name, grp_df in self.grouper.get_iterator(obj, self.axis):
  1174. fres = func(grp_df, *args, **kwargs)
  1175. result[name] = fres
  1176. result_index = self.grouper.result_index
  1177. other_ax = obj.axes[1 - self.axis]
  1178. out = self.obj._constructor(result, index=other_ax, columns=result_index)
  1179. if self.axis == 0:
  1180. out = out.T
  1181. return out
  1182. def _wrap_applied_output(
  1183. self,
  1184. data: DataFrame,
  1185. values: list,
  1186. not_indexed_same: bool = False,
  1187. is_transform: bool = False,
  1188. ):
  1189. if len(values) == 0:
  1190. if is_transform:
  1191. # GH#47787 see test_group_on_empty_multiindex
  1192. res_index = data.index
  1193. else:
  1194. res_index = self.grouper.result_index
  1195. result = self.obj._constructor(index=res_index, columns=data.columns)
  1196. result = result.astype(data.dtypes, copy=False)
  1197. return result
  1198. # GH12824
  1199. # using values[0] here breaks test_groupby_apply_none_first
  1200. first_not_none = next(com.not_none(*values), None)
  1201. if first_not_none is None:
  1202. # GH9684 - All values are None, return an empty frame.
  1203. return self.obj._constructor()
  1204. elif isinstance(first_not_none, DataFrame):
  1205. return self._concat_objects(
  1206. values,
  1207. not_indexed_same=not_indexed_same,
  1208. is_transform=is_transform,
  1209. )
  1210. key_index = self.grouper.result_index if self.as_index else None
  1211. if isinstance(first_not_none, (np.ndarray, Index)):
  1212. # GH#1738: values is list of arrays of unequal lengths
  1213. # fall through to the outer else clause
  1214. # TODO: sure this is right? we used to do this
  1215. # after raising AttributeError above
  1216. return self.obj._constructor_sliced(
  1217. values, index=key_index, name=self._selection
  1218. )
  1219. elif not isinstance(first_not_none, Series):
  1220. # values are not series or array-like but scalars
  1221. # self._selection not passed through to Series as the
  1222. # result should not take the name of original selection
  1223. # of columns
  1224. if self.as_index:
  1225. return self.obj._constructor_sliced(values, index=key_index)
  1226. else:
  1227. result = self.obj._constructor(values, columns=[self._selection])
  1228. result = self._insert_inaxis_grouper(result)
  1229. return result
  1230. else:
  1231. # values are Series
  1232. return self._wrap_applied_output_series(
  1233. values,
  1234. not_indexed_same,
  1235. first_not_none,
  1236. key_index,
  1237. is_transform,
  1238. )
  1239. def _wrap_applied_output_series(
  1240. self,
  1241. values: list[Series],
  1242. not_indexed_same: bool,
  1243. first_not_none,
  1244. key_index: Index | None,
  1245. is_transform: bool,
  1246. ) -> DataFrame | Series:
  1247. kwargs = first_not_none._construct_axes_dict()
  1248. backup = Series(**kwargs)
  1249. values = [x if (x is not None) else backup for x in values]
  1250. all_indexed_same = all_indexes_same(x.index for x in values)
  1251. if not all_indexed_same:
  1252. # GH 8467
  1253. return self._concat_objects(
  1254. values,
  1255. not_indexed_same=True,
  1256. is_transform=is_transform,
  1257. )
  1258. # Combine values
  1259. # vstack+constructor is faster than concat and handles MI-columns
  1260. stacked_values = np.vstack([np.asarray(v) for v in values])
  1261. if self.axis == 0:
  1262. index = key_index
  1263. columns = first_not_none.index.copy()
  1264. if columns.name is None:
  1265. # GH6124 - propagate name of Series when it's consistent
  1266. names = {v.name for v in values}
  1267. if len(names) == 1:
  1268. columns.name = list(names)[0]
  1269. else:
  1270. index = first_not_none.index
  1271. columns = key_index
  1272. stacked_values = stacked_values.T
  1273. if stacked_values.dtype == object:
  1274. # We'll have the DataFrame constructor do inference
  1275. stacked_values = stacked_values.tolist()
  1276. result = self.obj._constructor(stacked_values, index=index, columns=columns)
  1277. if not self.as_index:
  1278. result = self._insert_inaxis_grouper(result)
  1279. return self._reindex_output(result)

    def _cython_transform(
        self,
        how: str,
        numeric_only: bool = False,
        axis: AxisInt = 0,
        **kwargs,
    ) -> DataFrame:
        assert axis == 0  # handled by caller

        # With self.axis == 0, we have multi-block tests
        #  e.g. test_rank_min_int, test_cython_transform_frame
        #  test_transform_numeric_ret
        # With self.axis == 1, _get_data_to_aggregate does a transpose
        #  so we always have a single block.
        mgr: Manager2D = self._get_data_to_aggregate(
            numeric_only=numeric_only, name=how
        )

        def arr_func(bvalues: ArrayLike) -> ArrayLike:
            return self.grouper._cython_operation(
                "transform", bvalues, how, 1, **kwargs
            )

        # We could use `mgr.apply` here and not have to set_axis, but
        #  we would have to do shape gymnastics for ArrayManager compat
        res_mgr = mgr.grouped_reduce(arr_func)
        res_mgr.set_axis(1, mgr.axes[1])

        res_df = self.obj._constructor(res_mgr)
        res_df = self._maybe_transpose_result(res_df)
        return res_df
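
    # Rough illustration (hypothetical data) of a call expected to reach the
    # Cython transform path above, via a string alias with a cython kernel:
    #
    #   >>> df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1, 2, 3]})
    #   >>> df.groupby("key").transform("cumsum")
    #      x
    #   0  1
    #   1  3
    #   2  3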

    def _transform_general(self, func, *args, **kwargs):
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self.grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        # Determine whether to use slow or fast path by evaluating on the first
        # group. Need to handle the case of an empty generator and process the
        # result so that it does not need to be computed again.
        try:
            name, group = next(gen)
        except StopIteration:
            pass
        else:
            object.__setattr__(group, "name", name)
            try:
                path, res = self._choose_path(fast_path, slow_path, group)
            except ValueError as err:
                # e.g. test_transform_with_non_scalar_group
                msg = "transform must return a scalar value for each group"
                raise ValueError(msg) from err
            if group.size > 0:
                res = _wrap_transform_general_frame(self.obj, group, res)
                applied.append(res)

        # Compute and process with the remaining groups
        for name, group in gen:
            if group.size == 0:
                continue
            object.__setattr__(group, "name", name)
            res = path(group)
            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

        concat_index = obj.columns if self.axis == 0 else obj.index
        other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
        concatenated = concat(applied, axis=self.axis, verify_integrity=False)
        concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
        return self._set_result_index_ordered(concatenated)
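
    # The general path above is what user-defined functions exercise; a
    # minimal sketch (hypothetical frame) demeaning each group:
    #
    #   >>> df = pd.DataFrame({"k": ["a", "a", "b", "b"],
    #   ...                    "x": [1.0, 2.0, 3.0, 4.0]})
    #   >>> df.groupby("k").transform(lambda g: g - g.mean())
    #        x
    #   0 -0.5
    #   1  0.5
    #   2 -0.5
    #   3  0.5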

    __examples_dataframe_doc = dedent(
        """
    >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
    ...                           'foo', 'bar'],
    ...                    'B' : ['one', 'one', 'two', 'three',
    ...                           'two', 'two'],
    ...                    'C' : [1, 5, 5, 2, 5, 5],
    ...                    'D' : [2.0, 5., 8., 1., 2., 9.]})

    >>> grouped = df.groupby('A')[['C', 'D']]
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
              C         D
    0 -1.154701 -0.577350
    1  0.577350  0.000000
    2  0.577350  1.154701
    3 -1.154701 -1.000000
    4  0.577350 -0.577350
    5  0.577350  1.000000

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
         C    D
    0  4.0  6.0
    1  3.0  8.0
    2  4.0  6.0
    3  3.0  8.0
    4  4.0  6.0
    5  3.0  8.0

    >>> grouped.transform("mean")
              C    D
    0  3.666667  4.0
    1  4.000000  5.0
    2  3.666667  4.0
    3  4.000000  5.0
    4  3.666667  4.0
    5  4.000000  5.0

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
       C  D
    0  5  8
    1  5  9
    2  5  8
    3  5  9
    4  5  8
    5  5  9
    """
    )

    @Substitution(klass="DataFrame", example=__examples_dataframe_doc)
    @Appender(_transform_template)
    def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        return self._transform(
            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
        )

    def _define_paths(self, func, *args, **kwargs):
        if isinstance(func, str):
            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
            )
        else:
            fast_path = lambda group: func(group, *args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: func(x, *args, **kwargs), axis=self.axis
            )
        return fast_path, slow_path
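
    # For ``func="mean"`` with no extra arguments, the pair built above is
    # effectively (illustrative only, not executed here):
    #
    #   fast_path = lambda group: group.mean()
    #   slow_path = lambda group: group.apply(lambda x: x.mean(), axis=self.axis)
    #
    # i.e. the fast path calls the method on the whole group at once, while the
    # slow path applies it row- or column-wise.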

    def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
        path = slow_path
        res = slow_path(group)

        if self.ngroups == 1:
            # no need to evaluate multiple paths when only
            # a single group exists
            return path, res

        # if we make it here, test if we can use the fast path
        try:
            res_fast = fast_path(group)
        except AssertionError:
            raise  # pragma: no cover
        except Exception:
            # GH#29631 For user-defined functions we can't predict what may be
            #  raised; see test_transform.test_transform_fastpath_raises
            return path, res

        # verify fast path returns either:
        # a DataFrame with columns equal to group.columns
        # OR a Series with index equal to group.columns
        if isinstance(res_fast, DataFrame):
            if not res_fast.columns.equals(group.columns):
                return path, res
        elif isinstance(res_fast, Series):
            if not res_fast.index.equals(group.columns):
                return path, res
        else:
            return path, res

        if res_fast.equals(res):
            path = fast_path

        return path, res
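
    # Sketch of the equality check above on a hypothetical group: for
    # ``func="mean"`` the fast result ``group.mean()`` is a Series indexed by
    # ``group.columns``, so it is directly comparable to the column-wise
    # slow result:
    #
    #   >>> group = pd.DataFrame({"x": [1.0, 3.0], "y": [2.0, 4.0]})
    #   >>> group.mean().equals(group.apply(lambda c: c.mean()))
    #   True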

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        DataFrame

        Notes
        -----
        Each subframe is endowed with the attribute 'name' in case you need
        to know which group you are working on.

        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> grouped.filter(lambda x: x['B'].mean() > 3.)
             A  B    C
        1  bar  2  5.0
        3  bar  4  1.0
        5  bar  6  9.0
        """
        indices = []

        obj = self._selected_obj
        gen = self.grouper.get_iterator(obj, axis=self.axis)

        for name, group in gen:
            object.__setattr__(group, "name", name)

            res = func(group, *args, **kwargs)

            try:
                res = res.squeeze()
            except AttributeError:  # allow e.g., scalars and frames to pass
                pass

            # interpret the result of the filter
            if is_bool(res) or (is_scalar(res) and isna(res)):
                if notna(res) and res:
                    indices.append(self._get_index(name))
            else:
                # non scalars aren't allowed
                raise TypeError(
                    f"filter function returned a {type(res).__name__}, "
                    "but expected a scalar bool"
                )

        return self._apply_filter(indices, dropna)
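
    # The scalar-bool check above rejects elementwise results; e.g. with a
    # hypothetical frame whose groups have more than one row:
    #
    #   >>> df = pd.DataFrame({"A": ["a", "a", "b", "b"], "B": [1, 2, 3, 4]})
    #   >>> df.groupby("A").filter(lambda g: g["B"] > 2)  # doctest: +SKIP
    #   TypeError: filter function returned a Series, but expected a scalar bool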

    def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
        if self.axis == 1:
            # GH 37725
            raise ValueError("Cannot subset columns when using axis=1")
        # per GH 23566
        if isinstance(key, tuple) and len(key) > 1:
            # if len == 1, then it becomes a SeriesGroupBy and this is actually
            # valid syntax, so don't raise
            raise ValueError(
                "Cannot subset columns with a tuple with more than one element. "
                "Use a list instead."
            )
        return super().__getitem__(key)
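
    # Consequence of the tuple guard above (hypothetical frame ``df`` with a
    # grouping column 'A' and value columns 'C' and 'D'):
    #
    #   >>> df.groupby('A')[('C', 'D')]   # doctest: +SKIP
    #   ValueError: Cannot subset columns with a tuple with more than one
    #   element. Use a list instead.
    #   >>> df.groupby('A')[['C', 'D']]   # doctest: +SKIP
    #   <pandas.core.groupby.generic.DataFrameGroupBy object ...>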

    def _gotitem(self, key, ndim: int, subset=None):
        """
        sub-classes to define
        return a sliced object

        Parameters
        ----------
        key : string / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        if ndim == 2:
            if subset is None:
                subset = self.obj
            return DataFrameGroupBy(
                subset,
                self.grouper,
                axis=self.axis,
                level=self.level,
                grouper=self.grouper,
                exclusions=self.exclusions,
                selection=key,
                as_index=self.as_index,
                sort=self.sort,
                group_keys=self.group_keys,
                observed=self.observed,
                dropna=self.dropna,
            )
        elif ndim == 1:
            if subset is None:
                subset = self.obj[key]
            return SeriesGroupBy(
                subset,
                level=self.level,
                grouper=self.grouper,
                exclusions=self.exclusions,
                selection=key,
                as_index=self.as_index,
                sort=self.sort,
                group_keys=self.group_keys,
                observed=self.observed,
                dropna=self.dropna,
            )

        raise AssertionError("invalid ndim for _gotitem")

    def _get_data_to_aggregate(
        self, *, numeric_only: bool = False, name: str | None = None
    ) -> Manager2D:
        obj = self._obj_with_exclusions
        if self.axis == 1:
            mgr = obj.T._mgr
        else:
            mgr = obj._mgr

        if numeric_only:
            mgr = mgr.get_numeric_data(copy=False)
        return mgr

    def _indexed_output_to_ndframe(
        self, output: Mapping[base.OutputKey, ArrayLike]
    ) -> DataFrame:
        """
        Wrap the dict result of a GroupBy aggregation into a DataFrame.
        """
        indexed_output = {key.position: val for key, val in output.items()}
        columns = Index([key.label for key in output])
        columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)

        result = self.obj._constructor(indexed_output)
        result.columns = columns
        return result

    def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
        return self.obj._constructor(mgr)

    def _iterate_column_groupbys(self, obj: DataFrame):
        for i, colname in enumerate(obj.columns):
            yield colname, SeriesGroupBy(
                obj.iloc[:, i],
                selection=colname,
                grouper=self.grouper,
                exclusions=self.exclusions,
                observed=self.observed,
            )

    def _apply_to_column_groupbys(self, func, obj: DataFrame) -> DataFrame:
        from pandas.core.reshape.concat import concat

        columns = obj.columns
        results = [
            func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj)
        ]

        if not len(results):
            # concat would raise
            return DataFrame([], columns=columns, index=self.grouper.result_index)
        else:
            return concat(results, keys=columns, axis=1)
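
    # A rough public-API equivalent of the column-wise application above
    # (hypothetical data): each column becomes its own SeriesGroupBy and the
    # per-column results are concatenated back along axis=1:
    #
    #   >>> df = pd.DataFrame({"k": [1, 1, 2], "x": [1, 1, 3], "y": [4, 5, 6]})
    #   >>> pd.concat(
    #   ...     {c: df.groupby("k")[c].nunique() for c in ["x", "y"]}, axis=1
    #   ... )
    #      x  y
    #   k
    #   1  1  2
    #   2  1  1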

    def nunique(self, dropna: bool = True) -> DataFrame:
        """
        Return DataFrame with counts of unique elements in each position.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        nunique: DataFrame

        Examples
        --------
        >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
        ...                           'ham', 'ham'],
        ...                    'value1': [1, 5, 5, 2, 5, 5],
        ...                    'value2': list('abbaxy')})
        >>> df
             id  value1 value2
        0  spam       1      a
        1   egg       5      b
        2   egg       5      b
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y

        >>> df.groupby('id').nunique()
              value1  value2
        id
        egg        1       1
        ham        1       2
        spam       2       1

        Check for rows with the same id but conflicting values:

        >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
             id  value1 value2
        0  spam       1      a
        3  spam       2      a
        4   ham       5      x
        5   ham       5      y
        """

        if self.axis != 0:
            # see test_groupby_crash_on_nunique
            return self._python_apply_general(
                lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True
            )

        obj = self._obj_with_exclusions
        results = self._apply_to_column_groupbys(
            lambda sgb: sgb.nunique(dropna), obj=obj
        )

        if not self.as_index:
            results.index = default_index(len(results))
            results = self._insert_inaxis_grouper(results)
        return results

    def idxmax(
        self,
        axis: Axis | None = None,
        skipna: bool = True,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Return index of first occurrence of maximum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {{0 or 'index', 1 or 'columns'}}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
            column-wise. If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        Series
            Indexes of maxima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmax : Return index of the maximum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmax``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51          37.20
        Wheat Products       103.11          19.66
        Beef                  55.48        1712.00

        By default, it returns the index for the maximum value in each column.

        >>> df.idxmax()
        consumption     Wheat Products
        co2_emissions             Beef
        dtype: object

        To return the index for the maximum value in each row, use
        ``axis="columns"``.

        >>> df.idxmax(axis="columns")
        Pork              co2_emissions
        Wheat Products      consumption
        Beef              co2_emissions
        dtype: object
        """
        if axis is None:
            axis = self.axis

        def func(df):
            return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)

        func.__name__ = "idxmax"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        return result

    def idxmin(
        self,
        axis: Axis | None = None,
        skipna: bool = True,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Return index of first occurrence of minimum over requested axis.

        NA/null values are excluded.

        Parameters
        ----------
        axis : {{0 or 'index', 1 or 'columns'}}, default None
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
            column-wise. If axis is not provided, grouper's axis is used.

            .. versionchanged:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values. If an entire row/column is NA, the result
            will be NA.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

        Returns
        -------
        Series
            Indexes of minima along the specified axis.

        Raises
        ------
        ValueError
            * If the row/column is empty

        See Also
        --------
        Series.idxmin : Return index of the minimum element.

        Notes
        -----
        This method is the DataFrame version of ``ndarray.argmin``.

        Examples
        --------
        Consider a dataset containing food consumption in Argentina.

        >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
        ...                    'co2_emissions': [37.2, 19.66, 1712]},
        ...                   index=['Pork', 'Wheat Products', 'Beef'])

        >>> df
                        consumption  co2_emissions
        Pork                  10.51          37.20
        Wheat Products       103.11          19.66
        Beef                  55.48        1712.00

        By default, it returns the index for the minimum value in each column.

        >>> df.idxmin()
        consumption                Pork
        co2_emissions    Wheat Products
        dtype: object

        To return the index for the minimum value in each row, use
        ``axis="columns"``.

        >>> df.idxmin(axis="columns")
        Pork                consumption
        Wheat Products    co2_emissions
        Beef                consumption
        dtype: object
        """
        if axis is None:
            axis = self.axis

        def func(df):
            return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)

        func.__name__ = "idxmin"
        result = self._python_apply_general(
            func, self._obj_with_exclusions, not_indexed_same=True
        )
        return result

    boxplot = boxplot_frame_groupby

    def value_counts(
        self,
        subset: Sequence[Hashable] | None = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        dropna: bool = True,
    ) -> DataFrame | Series:
        """
        Return a Series or DataFrame containing counts of unique rows.

        .. versionadded:: 1.4.0

        Parameters
        ----------
        subset : list-like, optional
            Columns to use when counting unique combinations.
        normalize : bool, default False
            Return proportions rather than frequencies.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        dropna : bool, default True
            Don't include counts of rows that contain NA values.

        Returns
        -------
        Series or DataFrame
            Series if the groupby as_index is True, otherwise DataFrame.

        See Also
        --------
        Series.value_counts: Equivalent method on Series.
        DataFrame.value_counts: Equivalent method on DataFrame.
        SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.

        Notes
        -----
        - If the groupby as_index is True then the returned Series will have a
          MultiIndex with one level per input column.
        - If the groupby as_index is False then the returned DataFrame will
          have an additional column with the value_counts. The column is
          labelled 'count' or 'proportion', depending on the ``normalize``
          parameter.

        By default, rows that contain any NA values are omitted from
        the result.

        By default, the result will be in descending order so that the
        first element of each group is the most frequently-occurring row.

        Examples
        --------
        >>> df = pd.DataFrame({
        ...     'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
        ...     'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
        ...     'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
        ... })

        >>> df
           gender  education country
        0    male        low      US
        1    male     medium      FR
        2  female       high      US
        3    male        low      FR
        4  female       high      FR
        5    male        low      FR

        >>> df.groupby('gender').value_counts()
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        FR         2
                           US         1
                medium     FR         1
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(ascending=True)
        gender  education  country
        female  high       FR         1
                           US         1
        male    low        US         1
                medium     FR         1
                low        FR         2
        Name: count, dtype: int64

        >>> df.groupby('gender').value_counts(normalize=True)
        gender  education  country
        female  high       FR         0.50
                           US         0.50
        male    low        FR         0.50
                           US         0.25
                medium     FR         0.25
        Name: proportion, dtype: float64

        >>> df.groupby('gender', as_index=False).value_counts()
           gender education country  count
        0  female      high      FR      1
        1  female      high      US      1
        2    male       low      FR      2
        3    male       low      US      1
        4    male    medium      FR      1

        >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
           gender education country  proportion
        0  female      high      FR        0.50
        1  female      high      US        0.50
        2    male       low      FR        0.50
        3    male       low      US        0.25
        4    male    medium      FR        0.25
        """
        return self._value_counts(subset, normalize, sort, ascending, dropna)

    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit=None,
        downcast=None,
    ) -> DataFrame | None:
        """
        Fill NA/NaN values using the specified method within groups.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not
            ``method`` should prefer :meth:`.DataFrame.fillna` as this
            will produce the same result and be more performant.
        method : {{'bfill', 'ffill', None}}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Axis along which to fill missing values. When the
            :class:`DataFrameGroupBy` ``axis`` argument is ``0``, using
            ``axis=1`` here will produce the same results as
            :meth:`.DataFrame.fillna`. When the :class:`DataFrameGroupBy`
            ``axis`` argument is ``1``, using ``axis=0`` or ``axis=1`` here
            will produce the same results.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this
            is the maximum number of entries along the entire axis where NaNs
            will be filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        DataFrame
            Object with missing values filled.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {
        ...         "key": [0, 0, 1, 1, 1],
        ...         "A": [np.nan, 2, np.nan, 3, np.nan],
        ...         "B": [2, 3, np.nan, np.nan, np.nan],
        ...         "C": [np.nan, np.nan, 2, np.nan, np.nan],
        ...     }
        ... )
        >>> df
           key    A    B    C
        0    0  NaN  2.0  NaN
        1    0  2.0  3.0  NaN
        2    1  NaN  NaN  2.0
        3    1  3.0  NaN  NaN
        4    1  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group along
        columns.

        >>> df.groupby("key").fillna(method="ffill")
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  2.0

        >>> df.groupby("key").fillna(method="bfill")
             A    B    C
        0  2.0  2.0  NaN
        1  2.0  3.0  NaN
        2  3.0  NaN  2.0
        3  3.0  NaN  NaN
        4  NaN  NaN  NaN

        Propagate non-null values forward or backward within each group along
        rows.

        >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="ffill")
           key    A    B    C
        0  0.0  0.0  2.0  2.0
        1  0.0  2.0  3.0  3.0
        2  1.0  1.0  NaN  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  1.0  NaN  NaN

        >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="bfill")
           key    A    B    C
        0  0.0  NaN  2.0  NaN
        1  0.0  2.0  3.0  NaN
        2  1.0  NaN  2.0  2.0
        3  1.0  3.0  NaN  NaN
        4  1.0  NaN  NaN  NaN

        Only replace the first NaN element within a group along rows.

        >>> df.groupby("key").fillna(method="ffill", limit=1)
             A    B    C
        0  NaN  2.0  NaN
        1  2.0  3.0  NaN
        2  NaN  NaN  2.0
        3  3.0  NaN  2.0
        4  3.0  NaN  NaN
        """
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis | None = 0,
        **kwargs,
    ) -> DataFrame:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will
        raise. To get similar behavior that ignores indices that don't exist,
        see :meth:`.DataFrameGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        DataFrame
            A DataFrame containing the elements taken from each group.

        See Also
        --------
        DataFrame.take : Take elements from a Series along an axis.
        DataFrame.loc : Select a subset of a DataFrame by labels.
        DataFrame.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df.groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 (default).

        Note how the indices selected in the result do not correspond to
        our input indices 0 and 1. That's because we are selecting the 0th
        and 1st rows, not rows whose indices equal 0 and 1.

        >>> gb.take([0, 1])
               name   class  max_speed
        1 4  falcon    bird      389.0
          3  parrot    bird       24.0
        2 2    lion  mammal       80.5
          1  monkey  mammal        NaN

        The order of the specified indices influences the order in the result.
        Here, the order is swapped from the previous example.

        >>> gb.take([1, 0])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 1  monkey  mammal        NaN
          2    lion  mammal       80.5

        We may take elements using negative integers for positive indices,
        starting from the end of the object, just like with Python lists.

        >>> gb.take([-1, -2])
               name   class  max_speed
        1 3  parrot    bird       24.0
          4  falcon    bird      389.0
        2 0  rabbit  mammal       15.0
          1  monkey  mammal        NaN
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | None | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> DataFrame:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.
            Specifying ``axis=None`` will apply the aggregation across both
            axes.

            .. versionadded:: 2.0.0

        skipna : bool, default True
            Exclude NA/null values when computing the result.
        numeric_only : bool, default False
            Include only float, int, boolean columns.
        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi',
        ...            'lion', 'monkey', 'rabbit'],
        ...           ['bird', 'bird', 'bird', 'bird',
        ...            'mammal', 'mammal', 'mammal']]
        >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class'))
        >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan,
        ...                                  80.5, 21.5, 15.0]},
        ...                   index=index)
        >>> df
                         max_speed
        name     class
        falcon   bird        389.0
        parrot   bird         24.0
        cockatoo bird         70.0
        kiwi     bird          NaN
        lion     mammal        80.5
        monkey   mammal        21.5
        rabbit   mammal        15.0
        >>> gb = df.groupby(["class"])
        >>> gb.skew()
                max_speed
        class
        bird     1.628296
        mammal   1.669046
        >>> gb.skew(skipna=False)
                max_speed
        class
        bird          NaN
        mammal   1.669046
        """
        result = self._op_via_apply(
            "skew",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
            **kwargs,
        )
        return result

    @property
    @doc(DataFrame.plot.__doc__)
    def plot(self) -> GroupByPlot:
        result = GroupByPlot(self)
        return result

    @doc(DataFrame.corr.__doc__)
    def corr(
        self,
        method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
        min_periods: int = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corr", method=method, min_periods=min_periods, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.cov.__doc__)
    def cov(
        self,
        min_periods: int | None = None,
        ddof: int | None = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only
        )
        return result

    @doc(DataFrame.hist.__doc__)
    def hist(
        self,
        column: IndexLabel = None,
        by=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        ax=None,
        sharex: bool = False,
        sharey: bool = False,
        figsize: tuple[int, int] | None = None,
        layout: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            column=column,
            by=by,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            ax=ax,
            sharex=sharex,
            sharey=sharey,
            figsize=figsize,
            layout=layout,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(DataFrame.dtypes.__doc__)
    def dtypes(self) -> Series:
        # error: Incompatible return value type (got "DataFrame", expected "Series")
        return self.apply(lambda df: df.dtypes)  # type: ignore[return-value]

    @doc(DataFrame.corrwith.__doc__)
    def corrwith(
        self,
        other: DataFrame | Series,
        axis: Axis = 0,
        drop: bool = False,
        method: CorrelationMethod = "pearson",
        numeric_only: bool = False,
    ) -> DataFrame:
        result = self._op_via_apply(
            "corrwith",
            other=other,
            axis=axis,
            drop=drop,
            method=method,
            numeric_only=numeric_only,
        )
        return result


def _wrap_transform_general_frame(
    obj: DataFrame, group: DataFrame, res: DataFrame | Series
) -> DataFrame:
    from pandas import concat

    if isinstance(res, Series):
        # we need to broadcast across the
        # other dimension; this will preserve dtypes
        # GH14457
        if res.index.is_(obj.index):
            res_frame = concat([res] * len(group.columns), axis=1)
            res_frame.columns = group.columns
            res_frame.index = group.index
        else:
            res_frame = obj._constructor(
                np.tile(res.values, (len(group.index), 1)),
                columns=group.columns,
                index=group.index,
            )
        assert isinstance(res_frame, DataFrame)
        return res_frame
    elif isinstance(res, DataFrame) and not res.index.is_(group.index):
        return res._align_frame(group)[0]
    else:
        return res
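

# A minimal sketch (hypothetical frame) of the Series branch above: a UDF
# reducing each group to a per-column Series, which is broadcast back across
# the group's rows while preserving dtypes:
#
#   >>> df = pd.DataFrame({"k": ["a", "a", "b"], "x": [1, 2, 3], "y": [4, 5, 6]})
#   >>> df.groupby("k")[["x", "y"]].transform(lambda g: g.max())
#      x  y
#   0  2  5
#   1  2  5
#   2  3  6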