  1. """
  2. Provide the groupby split-apply-combine paradigm. Define the GroupBy
  3. class providing the base-class of operations.
  4. The SeriesGroupBy and DataFrameGroupBy sub-class
  5. (defined in pandas.core.groupby.generic)
  6. expose these user-facing objects to provide specific functionality.
  7. """
  8. from __future__ import annotations
  9. import datetime
  10. from functools import (
  11. partial,
  12. wraps,
  13. )
  14. import inspect
  15. from textwrap import dedent
  16. from typing import (
  17. TYPE_CHECKING,
  18. Callable,
  19. Hashable,
  20. Iterable,
  21. Iterator,
  22. List,
  23. Literal,
  24. Mapping,
  25. Sequence,
  26. TypeVar,
  27. Union,
  28. cast,
  29. final,
  30. )
  31. import warnings
  32. import numpy as np
  33. from pandas._config.config import option_context
  34. from pandas._libs import (
  35. Timestamp,
  36. lib,
  37. )
  38. from pandas._libs.algos import rank_1d
  39. import pandas._libs.groupby as libgroupby
  40. from pandas._libs.missing import NA
  41. from pandas._typing import (
  42. AnyArrayLike,
  43. ArrayLike,
  44. Axis,
  45. AxisInt,
  46. DtypeObj,
  47. FillnaOptions,
  48. IndexLabel,
  49. NDFrameT,
  50. PositionalIndexer,
  51. RandomState,
  52. Scalar,
  53. T,
  54. npt,
  55. )
  56. from pandas.compat.numpy import function as nv
  57. from pandas.errors import (
  58. AbstractMethodError,
  59. DataError,
  60. )
  61. from pandas.util._decorators import (
  62. Appender,
  63. Substitution,
  64. cache_readonly,
  65. doc,
  66. )
  67. from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
  68. from pandas.core.dtypes.common import (
  69. is_bool_dtype,
  70. is_float_dtype,
  71. is_hashable,
  72. is_integer,
  73. is_integer_dtype,
  74. is_numeric_dtype,
  75. is_object_dtype,
  76. is_scalar,
  77. needs_i8_conversion,
  78. )
  79. from pandas.core.dtypes.missing import (
  80. isna,
  81. notna,
  82. )
  83. from pandas.core import (
  84. algorithms,
  85. sample,
  86. )
  87. from pandas.core._numba import executor
  88. from pandas.core.arrays import (
  89. BaseMaskedArray,
  90. BooleanArray,
  91. Categorical,
  92. DatetimeArray,
  93. ExtensionArray,
  94. FloatingArray,
  95. TimedeltaArray,
  96. )
  97. from pandas.core.base import (
  98. PandasObject,
  99. SelectionMixin,
  100. )
  101. import pandas.core.common as com
  102. from pandas.core.frame import DataFrame
  103. from pandas.core.generic import NDFrame
  104. from pandas.core.groupby import (
  105. base,
  106. numba_,
  107. ops,
  108. )
  109. from pandas.core.groupby.grouper import get_grouper
  110. from pandas.core.groupby.indexing import (
  111. GroupByIndexingMixin,
  112. GroupByNthSelector,
  113. )
  114. from pandas.core.indexes.api import (
  115. CategoricalIndex,
  116. Index,
  117. MultiIndex,
  118. RangeIndex,
  119. default_index,
  120. )
  121. from pandas.core.internals.blocks import ensure_block_shape
  122. from pandas.core.series import Series
  123. from pandas.core.sorting import get_group_index_sorter
  124. from pandas.core.util.numba_ import (
  125. get_jit_arguments,
  126. maybe_use_numba,
  127. )
  128. if TYPE_CHECKING:
  129. from pandas.core.window import (
  130. ExpandingGroupby,
  131. ExponentialMovingWindowGroupby,
  132. RollingGroupby,
  133. )

_common_see_also = """
See Also
--------
Series.%(name)s : Apply a function %(name)s to a Series.
DataFrame.%(name)s : Apply a function %(name)s
    to each row or column of a DataFrame.
"""

_apply_docs = {
    "template": """
    Apply function ``func`` group-wise and combine the results together.

    The function passed to ``apply`` must take a {input} as its first
    argument and return a DataFrame, Series or scalar. ``apply`` will
    then take care of combining the results back together into a single
    dataframe or series. ``apply`` is therefore a highly flexible
    grouping method.

    While ``apply`` is a very flexible method, its downside is that
    using it can be quite a bit slower than using more specific methods
    like ``agg`` or ``transform``. Pandas offers a wide range of methods that
    will be much faster than using ``apply`` for their specific purposes, so try
    to use them before reaching for ``apply``.

    Parameters
    ----------
    func : callable
        A callable that takes a {input} as its first argument, and
        returns a dataframe, a series or a scalar. In addition the
        callable may take positional and keyword arguments.
    args, kwargs : tuple and dict
        Optional positional and keyword arguments to pass to ``func``.

    Returns
    -------
    Series or DataFrame

    See Also
    --------
    pipe : Apply function to the full GroupBy object instead of to each
        group.
    aggregate : Apply aggregate function to the GroupBy object.
    transform : Apply function column-by-column to the GroupBy object.
    Series.apply : Apply a function to a Series.
    DataFrame.apply : Apply a function to each row or column of a DataFrame.

    Notes
    -----
    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        see the examples below.

    Functions that mutate the passed object can produce unexpected
    behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
    for more details.

    Examples
    --------
    {examples}
    """,
  185. "dataframe_examples": """
  186. >>> df = pd.DataFrame({'A': 'a a b'.split(),
  187. ... 'B': [1,2,3],
  188. ... 'C': [4,6,5]})
  189. >>> g1 = df.groupby('A', group_keys=False)
  190. >>> g2 = df.groupby('A', group_keys=True)
  191. Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
  192. differ in their ``group_keys`` argument. Calling `apply` in various ways,
  193. we can get different grouping results:
  194. Example 1: below the function passed to `apply` takes a DataFrame as
  195. its argument and returns a DataFrame. `apply` combines the result for
  196. each group together into a new DataFrame:
  197. >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
  198. B C
  199. 0 0.333333 0.4
  200. 1 0.666667 0.6
  201. 2 1.000000 1.0
  202. In the above, the groups are not part of the index. We can have them included
  203. by using ``g2`` where ``group_keys=True``:
  204. >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
  205. B C
  206. A
  207. a 0 0.333333 0.4
  208. 1 0.666667 0.6
  209. b 2 1.000000 1.0
  210. Example 2: The function passed to `apply` takes a DataFrame as
  211. its argument and returns a Series. `apply` combines the result for
  212. each group together into a new DataFrame.
  213. .. versionchanged:: 1.3.0
  214. The resulting dtype will reflect the return value of the passed ``func``.
  215. >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
  216. B C
  217. A
  218. a 1.0 2.0
  219. b 0.0 0.0
  220. >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
  221. B C
  222. A
  223. a 1.0 2.0
  224. b 0.0 0.0
  225. The ``group_keys`` argument has no effect here because the result is not
  226. like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
  227. to the input.
  228. Example 3: The function passed to `apply` takes a DataFrame as
  229. its argument and returns a scalar. `apply` combines the result for
  230. each group together into a Series, including setting the index as
  231. appropriate:
  232. >>> g1.apply(lambda x: x.C.max() - x.B.min())
  233. A
  234. a 5
  235. b 2
  236. dtype: int64""",
  237. "series_examples": """
  238. >>> s = pd.Series([0, 1, 2], index='a a b'.split())
  239. >>> g1 = s.groupby(s.index, group_keys=False)
  240. >>> g2 = s.groupby(s.index, group_keys=True)
  241. From ``s`` above we can see that ``g`` has two groups, ``a`` and ``b``.
  242. Notice that ``g1`` have ``g2`` have two groups, ``a`` and ``b``, and only
  243. differ in their ``group_keys`` argument. Calling `apply` in various ways,
  244. we can get different grouping results:
  245. Example 1: The function passed to `apply` takes a Series as
  246. its argument and returns a Series. `apply` combines the result for
  247. each group together into a new Series.
  248. .. versionchanged:: 1.3.0
  249. The resulting dtype will reflect the return value of the passed ``func``.
  250. >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2)
  251. a 0.0
  252. a 2.0
  253. b 1.0
  254. dtype: float64
  255. In the above, the groups are not part of the index. We can have them included
  256. by using ``g2`` where ``group_keys=True``:
  257. >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2)
  258. a a 0.0
  259. a 2.0
  260. b b 1.0
  261. dtype: float64
  262. Example 2: The function passed to `apply` takes a Series as
  263. its argument and returns a scalar. `apply` combines the result for
  264. each group together into a Series, including setting the index as
  265. appropriate:
  266. >>> g1.apply(lambda x: x.max() - x.min())
  267. a 1
  268. b 0
  269. dtype: int64
  270. The ``group_keys`` argument has no effect here because the result is not
  271. like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
  272. to the input.
  273. >>> g2.apply(lambda x: x.max() - x.min())
  274. a 1
  275. b 0
  276. dtype: int64""",
  277. }

_groupby_agg_method_template = """
Compute {fname} of group values.

Parameters
----------
numeric_only : bool, default {no}
    Include only float, int, boolean columns.

    .. versionchanged:: 2.0.0

        numeric_only no longer accepts ``None``.

min_count : int, default {mc}
    The required number of valid values to perform the operation. If fewer
    than ``min_count`` non-NA values are present the result will be NA.

Returns
-------
Series or DataFrame
    Computed {fname} of values within each group.
"""

_pipe_template = """
Apply a ``func`` with arguments to this %(klass)s object and return its result.

Use `.pipe` when you want to improve readability by chaining together
functions that expect Series, DataFrames, GroupBy or Resampler objects.
Instead of writing

>>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c)  # doctest: +SKIP

You can write

>>> (df.groupby('group')
...    .pipe(f)
...    .pipe(g, arg1=a)
...    .pipe(h, arg2=b, arg3=c))  # doctest: +SKIP

which is much more readable.

Parameters
----------
func : callable or tuple of (callable, str)
    Function to apply to this %(klass)s object or, alternatively,
    a `(callable, data_keyword)` tuple where `data_keyword` is a
    string indicating the keyword of `callable` that expects the
    %(klass)s object.
args : iterable, optional
    Positional arguments passed into `func`.
kwargs : dict, optional
    A dictionary of keyword arguments passed into `func`.

Returns
-------
the return type of `func`.

See Also
--------
Series.pipe : Apply a function with arguments to a series.
DataFrame.pipe : Apply a function with arguments to a dataframe.
apply : Apply function to each group instead of to the
    full %(klass)s object.

Notes
-----
See more `here
<https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_

Examples
--------
%(examples)s
"""

_transform_template = """
Call function producing a same-indexed %(klass)s on each group.

Returns a %(klass)s having the same indexes as the original object
filled with the transformed values.

Parameters
----------
f : function, str
    Function to apply to each group. See the Notes section below for requirements.

    Accepted inputs are:

    - String
    - Python function
    - Numba JIT function with ``engine='numba'`` specified.

    Only passing a single function is supported with this engine.
    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    If a string is chosen, then it needs to be the name
    of the groupby method you want to use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    Keyword arguments to be passed into func.

Returns
-------
%(klass)s

See Also
--------
%(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
    the results together.
%(klass)s.groupby.aggregate : Aggregate using one or more
    operations over the specified axis.
%(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
    same axis shape as self.

Notes
-----
Each group is endowed with the attribute 'name' in case you need to know
which group you are working on.

The current implementation imposes three requirements on f:

* f must return a value that either has the same shape as the input
  subframe or can be broadcast to the shape of the input subframe.
  For example, if `f` returns a scalar it will be broadcast to have the
  same shape as the input subframe.
* if this is a DataFrame, f must support application column-by-column
  in the subframe. If f also supports application to the entire subframe,
  then a fast path is used starting from the second chunk.
* f must not mutate groups. Mutation is not supported and may
  produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.

When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.

.. versionchanged:: 2.0.0

    When using ``.transform`` on a grouped DataFrame and the transformation function
    returns a DataFrame, pandas now aligns the result's index
    with the input's index. You can call ``.to_numpy()`` on the
    result of the transformation function to avoid alignment.

Examples
--------
%(example)s"""

_agg_template = """
Aggregate using one or more operations over the specified axis.

Parameters
----------
func : function, str, list, dict or None
    Function to use for aggregating the data. If a function, must either
    work when passed a {klass} or when passed to {klass}.apply.

    Accepted combinations are:

    - function
    - string function name
    - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
    - dict of axis labels -> functions, function names or list of such.
    - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
      output has one column for each element in ``**kwargs``. The name of the
      column is keyword, whereas the value determines the aggregation used to
      compute the values in the column.

    Can also accept a Numba JIT function with
    ``engine='numba'`` specified. Only passing a single function is supported
    with this engine.

    If the ``'numba'`` engine is chosen, the function must be
    a user defined function with ``values`` and ``index`` as the
    first and second arguments respectively in the function signature.
    Each group's index will be passed to the user defined function
    and optionally available for use.

    .. versionchanged:: 1.1.0
*args
    Positional arguments to pass to func.
engine : str, default None
    * ``'cython'`` : Runs the function through C-extensions from cython.
    * ``'numba'`` : Runs the function through JIT compiled code from numba.
    * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``

    .. versionadded:: 1.1.0
engine_kwargs : dict, default None
    * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
    * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
      and ``parallel`` dictionary keys. The values must either be ``True`` or
      ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
      ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
      applied to the function

    .. versionadded:: 1.1.0
**kwargs
    * If ``func`` is None, ``**kwargs`` are used to define the output names and
      aggregations via Named Aggregation. See ``func`` entry.
    * Otherwise, keyword arguments to be passed into func.

Returns
-------
{klass}

See Also
--------
{klass}.groupby.apply : Apply function func group-wise
    and combine the results together.
{klass}.groupby.transform : Transforms the Series on each group
    based on the given function.
{klass}.aggregate : Aggregate using one or more
    operations over the specified axis.

Notes
-----
When using ``engine='numba'``, there will be no "fall back" behavior internally.
The group data and group index will be passed as numpy arrays to the JITed
user defined function, and no alternative execution attempts will be tried.

Functions that mutate the passed object can produce unexpected
behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
for more details.

.. versionchanged:: 1.3.0

    The resulting dtype will reflect the return value of the passed ``func``,
    see the examples below.
{examples}"""


@final
class GroupByPlot(PandasObject):
    """
    Class implementing the .plot attribute for groupby objects.
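
    An illustrative (hypothetical) usage sketch: accessing ``.plot`` on a
    groupby object draws one plot per group, e.g.
    ``df.groupby("key").plot(kind="line")  # doctest: +SKIP``.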
  482. """
  483. def __init__(self, groupby: GroupBy) -> None:
  484. self._groupby = groupby
  485. def __call__(self, *args, **kwargs):
  486. def f(self):
  487. return self.plot(*args, **kwargs)
  488. f.__name__ = "plot"
  489. return self._groupby.apply(f)
  490. def __getattr__(self, name: str):
  491. def attr(*args, **kwargs):
  492. def f(self):
  493. return getattr(self.plot, name)(*args, **kwargs)
  494. return self._groupby.apply(f)
  495. return attr
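

# Key specifications accepted by ``groupby``: a single label, a list of
# labels, a callable applied to each index label (or a list of such
# callables), or a mapping of index labels to group labels.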
_KeysArgType = Union[
    Hashable,
    List[Hashable],
    Callable[[Hashable], Hashable],
    List[Callable[[Hashable], Hashable]],
    Mapping[Hashable, Hashable],
]


class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
    _hidden_attrs = PandasObject._hidden_attrs | {
        "as_index",
        "axis",
        "dropna",
        "exclusions",
        "grouper",
        "group_keys",
        "keys",
        "level",
        "obj",
        "observed",
        "sort",
    }

    axis: AxisInt
    grouper: ops.BaseGrouper
    keys: _KeysArgType | None = None
    level: IndexLabel | None = None
    group_keys: bool

    @final
    def __len__(self) -> int:
        return len(self.groups)

    @final
    def __repr__(self) -> str:
        # TODO: Better repr for GroupBy object
        return object.__repr__(self)

    @final
    @property
    def groups(self) -> dict[Hashable, np.ndarray]:
        """
        Dict {group name -> group labels}.
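
        Examples
        --------
        A small illustrative doctest (the frame here is hypothetical):

        >>> df = pd.DataFrame({"a": [1, 2, 3], "b": [1, 1, 2]})
        >>> df.groupby("b").groups  # doctest: +SKIP
        {1: [0, 1], 2: [2]}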
  534. """
  535. return self.grouper.groups
  536. @final
  537. @property
  538. def ngroups(self) -> int:
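        """
        Number of groups formed by the grouper.
        """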
        return self.grouper.ngroups

    @final
    @property
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        """
        Dict {group name -> group indices}.
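
        Examples
        --------
        A small illustrative doctest (the frame here is hypothetical); the
        values are integer positions rather than index labels:

        >>> df = pd.DataFrame({"a": [1, 2, 3], "b": [1, 1, 2]})
        >>> df.groupby("b").indices  # doctest: +SKIP
        {1: array([0, 1]), 2: array([2])}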
  545. """
  546. return self.grouper.indices
  547. @final
  548. def _get_indices(self, names):
  549. """
  550. Safe get multiple indices, translate keys for
  551. datelike to underlying repr.
  552. """
  553. def get_converter(s):
  554. # possibly convert to the actual key types
  555. # in the indices, could be a Timestamp or a np.datetime64
  556. if isinstance(s, datetime.datetime):
  557. return lambda key: Timestamp(key)
  558. elif isinstance(s, np.datetime64):
  559. return lambda key: Timestamp(key).asm8
  560. else:
  561. return lambda key: key
  562. if len(names) == 0:
  563. return []
  564. if len(self.indices) > 0:
  565. index_sample = next(iter(self.indices))
  566. else:
  567. index_sample = None # Dummy sample
  568. name_sample = names[0]
  569. if isinstance(index_sample, tuple):
  570. if not isinstance(name_sample, tuple):
  571. msg = "must supply a tuple to get_group with multiple grouping keys"
  572. raise ValueError(msg)
  573. if not len(name_sample) == len(index_sample):
  574. try:
  575. # If the original grouper was a tuple
  576. return [self.indices[name] for name in names]
  577. except KeyError as err:
  578. # turns out it wasn't a tuple
  579. msg = (
  580. "must supply a same-length tuple to get_group "
  581. "with multiple grouping keys"
  582. )
  583. raise ValueError(msg) from err
  584. converters = [get_converter(s) for s in index_sample]
  585. names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)
  586. else:
  587. converter = get_converter(index_sample)
  588. names = (converter(name) for name in names)
  589. return [self.indices.get(name, []) for name in names]
  590. @final
  591. def _get_index(self, name):
  592. """
  593. Safe get index, translate keys for datelike to underlying repr.
  594. """
  595. return self._get_indices([name])[0]
  596. @final
  597. @cache_readonly
  598. def _selected_obj(self):
  599. # Note: _selected_obj is always just `self.obj` for SeriesGroupBy
  600. if isinstance(self.obj, Series):
  601. return self.obj
  602. if self._selection is not None:
  603. if is_hashable(self._selection):
  604. # i.e. a single key, so selecting it will return a Series.
  605. # In this case, _obj_with_exclusions would wrap the key
  606. # in a list and return a single-column DataFrame.
  607. return self.obj[self._selection]
  608. # Otherwise _selection is equivalent to _selection_list, so
  609. # _selected_obj matches _obj_with_exclusions, so we can re-use
  610. # that and avoid making a copy.
  611. return self._obj_with_exclusions
  612. return self.obj
  613. @final
  614. def _dir_additions(self) -> set[str]:
  615. return self.obj._dir_additions()
  616. @Substitution(
  617. klass="GroupBy",
  618. examples=dedent(
  619. """\
  620. >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
  621. >>> df
  622. A B
  623. 0 a 1
  624. 1 b 2
  625. 2 a 3
  626. 3 b 4
  627. To get the difference between each groups maximum and minimum value in one
  628. pass, you can do
  629. >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
  630. B
  631. A
  632. a 2
  633. b 2"""
  634. ),
  635. )
  636. @Appender(_pipe_template)
  637. def pipe(
  638. self,
  639. func: Callable[..., T] | tuple[Callable[..., T], str],
  640. *args,
  641. **kwargs,
  642. ) -> T:
  643. return com.pipe(self, func, *args, **kwargs)
  644. @final
  645. def get_group(self, name, obj=None) -> DataFrame | Series:
  646. """
  647. Construct DataFrame from group with provided name.
  648. Parameters
  649. ----------
  650. name : object
  651. The name of the group to get as a DataFrame.
  652. obj : DataFrame, default None
  653. The DataFrame to take the DataFrame out of. If
  654. it is None, the object groupby was called on will
  655. be used.
  656. Returns
  657. -------
  658. same type as obj
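
        Examples
        --------
        An illustrative doctest (the frame here is hypothetical):

        >>> df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
        >>> df.groupby("a").get_group(1)  # doctest: +SKIP
           a  b
        0  1  3
        1  1  4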
  659. """
  660. if obj is None:
  661. obj = self._selected_obj
  662. inds = self._get_index(name)
  663. if not len(inds):
  664. raise KeyError(name)
  665. return obj._take_with_is_copy(inds, axis=self.axis)
  666. @final
  667. def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
  668. """
  669. Groupby iterator.
  670. Returns
  671. -------
  672. Generator yielding sequence of (name, subsetted object)
  673. for each group
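
        Examples
        --------
        An illustrative doctest (the frame here is hypothetical):

        >>> df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
        >>> for name, group in df.groupby("a"):  # doctest: +SKIP
        ...     print(name, len(group))
        1 2
        2 1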
  674. """
  675. keys = self.keys
  676. result = self.grouper.get_iterator(self._selected_obj, axis=self.axis)
  677. if isinstance(keys, list) and len(keys) == 1:
  678. # GH#42795 - when keys is a list, return tuples even when length is 1
  679. result = (((key,), group) for key, group in result)
  680. return result
  681. # To track operations that expand dimensions, like ohlc
  682. OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)
  683. class GroupBy(BaseGroupBy[NDFrameT]):
  684. """
  685. Class for grouping and aggregating relational data.
  686. See aggregate, transform, and apply functions on this object.
  687. It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:
  688. ::
  689. grouped = groupby(obj, ...)
  690. Parameters
  691. ----------
  692. obj : pandas object
  693. axis : int, default 0
  694. level : int, default None
  695. Level of MultiIndex
  696. groupings : list of Grouping objects
  697. Most users should ignore this
  698. exclusions : array-like, optional
  699. List of columns to exclude
  700. name : str
  701. Most users should ignore this
  702. Returns
  703. -------
  704. **Attributes**
  705. groups : dict
  706. {group name -> group labels}
  707. len(grouped) : int
  708. Number of groups
  709. Notes
  710. -----
  711. After grouping, see aggregate, apply, and transform functions. Here are
  712. some other brief notes about usage. When grouping by multiple groups, the
  713. result index will be a MultiIndex (hierarchical) by default.
  714. Iteration produces (key, group) tuples, i.e. chunking the data by group. So
  715. you can write code like:
  716. ::
  717. grouped = obj.groupby(keys, axis=axis)
  718. for key, group in grouped:
  719. # do something with the data
  720. Function calls on GroupBy, if not specially implemented, "dispatch" to the
  721. grouped data. So if you group a DataFrame and wish to invoke the std()
  722. method on each group, you can simply do:
  723. ::
  724. df.groupby(mapper).std()
  725. rather than
  726. ::
  727. df.groupby(mapper).aggregate(np.std)
  728. You can pass arguments to these "wrapped" functions, too.
  729. See the online documentation for full exposition on these topics and much
  730. more
  731. """

    grouper: ops.BaseGrouper
    as_index: bool

    @final
    def __init__(
        self,
        obj: NDFrameT,
        keys: _KeysArgType | None = None,
        axis: Axis = 0,
        level: IndexLabel | None = None,
        grouper: ops.BaseGrouper | None = None,
        exclusions: frozenset[Hashable] | None = None,
        selection: IndexLabel | None = None,
        as_index: bool = True,
        sort: bool = True,
        group_keys: bool = True,
        observed: bool = False,
        dropna: bool = True,
    ) -> None:
        self._selection = selection

        assert isinstance(obj, NDFrame), type(obj)

        self.level = level

        if not as_index:
            if axis != 0:
                raise ValueError("as_index=False only valid for axis=0")

        self.as_index = as_index
        self.keys = keys
        self.sort = sort
        self.group_keys = group_keys
        self.observed = observed
        self.dropna = dropna

        if grouper is None:
            grouper, exclusions, obj = get_grouper(
                obj,
                keys,
                axis=axis,
                level=level,
                sort=sort,
                observed=observed,
                dropna=self.dropna,
            )

        self.obj = obj
        self.axis = obj._get_axis_number(axis)
        self.grouper = grouper
        self.exclusions = frozenset(exclusions) if exclusions else frozenset()

    def __getattr__(self, attr: str):
        if attr in self._internal_names_set:
            return object.__getattribute__(self, attr)
        if attr in self.obj:
            return self[attr]

        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{attr}'"
        )

    @final
    def _op_via_apply(self, name: str, *args, **kwargs):
        """Compute the result of an operation by using GroupBy's apply."""
        f = getattr(type(self._obj_with_exclusions), name)
        sig = inspect.signature(f)

        # a little trickery for aggregation functions that need an axis
        # argument
        if "axis" in sig.parameters:
            if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default:
                kwargs["axis"] = self.axis

        def curried(x):
            return f(x, *args, **kwargs)

        # preserve the name so we can detect it when calling plot methods,
        # to avoid duplicates
        curried.__name__ = name

        # special case otherwise extra plots are created when catching the
        # exception below
        if name in base.plotting_methods:
            return self.apply(curried)

        is_transform = name in base.transformation_kernels
        result = self._python_apply_general(
            curried,
            self._obj_with_exclusions,
            is_transform=is_transform,
            not_indexed_same=not is_transform,
        )

        if self.grouper.has_dropped_na and is_transform:
            # result will have dropped rows due to nans, fill with null
            # and ensure index is ordered same as the input
            result = self._set_result_index_ordered(result)

        return result

    # -----------------------------------------------------------------
    # Selection

    def _iterate_slices(self) -> Iterable[Series]:
        raise AbstractMethodError(self)

    # -----------------------------------------------------------------
    # Dispatch/Wrapping

    @final
    def _concat_objects(
        self,
        values,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
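        # Stitch the per-group results in ``values`` back into a single
        # Series/DataFrame, prepending group keys as index levels when
        # ``group_keys`` is set and the operation is not a transform.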
        from pandas.core.reshape.concat import concat

        if self.group_keys and not is_transform:
            if self.as_index:
                # possible MI return case
                group_keys = self.grouper.result_index
                group_levels = self.grouper.levels
                group_names = self.grouper.names

                result = concat(
                    values,
                    axis=self.axis,
                    keys=group_keys,
                    levels=group_levels,
                    names=group_names,
                    sort=False,
                )
            else:
                # GH5610, returns a MI, with the first level being a
                # range index
                keys = list(range(len(values)))
                result = concat(values, axis=self.axis, keys=keys)

        elif not not_indexed_same:
            result = concat(values, axis=self.axis)

            ax = self._selected_obj._get_axis(self.axis)
            if self.dropna:
                labels = self.grouper.group_info[0]
                mask = labels != -1
                ax = ax[mask]

            # this is a very unfortunate situation
            # we can't use reindex to restore the original order
            # when the ax has duplicates
            # so we resort to this
            # GH 14776, 30667
            # TODO: can we re-use e.g. _reindex_non_unique?
            if ax.has_duplicates and not result.axes[self.axis].equals(ax):
                # e.g. test_category_order_transformer
                target = algorithms.unique1d(ax._values)
                indexer, _ = result.index.get_indexer_non_unique(target)
                result = result.take(indexer, axis=self.axis)
            else:
                result = result.reindex(ax, axis=self.axis, copy=False)

        else:
            result = concat(values, axis=self.axis)

        name = self.obj.name if self.obj.ndim == 1 else self._selection
        if isinstance(result, Series) and name is not None:
            result.name = name

        return result

    @final
    def _set_result_index_ordered(
        self, result: OutputFrameOrSeries
    ) -> OutputFrameOrSeries:
        # set the result index on the passed values object and
        # return the new object, xref 8046

        obj_axis = self.obj._get_axis(self.axis)

        if self.grouper.is_monotonic and not self.grouper.has_dropped_na:
            # shortcut if we have an already ordered grouper
            result = result.set_axis(obj_axis, axis=self.axis, copy=False)
            return result

        # row order is scrambled => sort the rows by position in original index
        original_positions = Index(self.grouper.result_ilocs())
        result = result.set_axis(original_positions, axis=self.axis, copy=False)
        result = result.sort_index(axis=self.axis)
        if self.grouper.has_dropped_na:
            # Add back in any missing rows due to dropna - index here is integral
            # with values referring to the row of the input so can use RangeIndex
            result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
        result = result.set_axis(obj_axis, axis=self.axis, copy=False)

        return result

    @final
    def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
        if isinstance(result, Series):
            result = result.to_frame()

        # zip in reverse so we can always insert at loc 0
        columns = result.columns
        for name, lev, in_axis in zip(
            reversed(self.grouper.names),
            reversed(self.grouper.get_group_levels()),
            reversed([grp.in_axis for grp in self.grouper.groupings]),
        ):
            # GH #28549
            # When using .apply(-), name will be in columns already
            if in_axis and name not in columns:
                result.insert(0, name, lev)

        return result

    def _indexed_output_to_ndframe(
        self, result: Mapping[base.OutputKey, ArrayLike]
    ) -> Series | DataFrame:
        raise AbstractMethodError(self)

    @final
    def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT:
        if self.axis == 1:
            # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
            result = result.T
            if result.index.equals(self.obj.index):
                # Retain e.g. DatetimeIndex/TimedeltaIndex freq
                # e.g. test_groupby_crash_on_nunique
                result.index = self.obj.index.copy()
        return result

    @final
    def _wrap_aggregated_output(
        self,
        result: Series | DataFrame,
        qs: npt.NDArray[np.float64] | None = None,
    ):
        """
        Wraps the output of GroupBy aggregations into the expected result.

        Parameters
        ----------
        result : Series, DataFrame

        Returns
        -------
        Series or DataFrame
        """
        # ATM we do not get here for SeriesGroupBy; when we do, we will
        # need to require that result.name already match self.obj.name

        if not self.as_index:
            # `not self.as_index` is only relevant for DataFrameGroupBy,
            # enforced in __init__
            result = self._insert_inaxis_grouper(result)
            result = result._consolidate()
            index = Index(range(self.grouper.ngroups))

        else:
            index = self.grouper.result_index

        if qs is not None:
            # We get here with len(qs) != 1 and not self.as_index
            # in test_pass_args_kwargs
            index = _insert_quantile_level(index, qs)

        result.index = index

        # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has
        # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT"
        res = self._maybe_transpose_result(result)  # type: ignore[arg-type]
        return self._reindex_output(res, qs=qs)

    def _wrap_applied_output(
        self,
        data,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        raise AbstractMethodError(self)

    # -----------------------------------------------------------------
    # numba

    @final
    def _numba_prep(self, data: DataFrame):
        ids, _, ngroups = self.grouper.group_info
        sorted_index = get_group_index_sorter(ids, ngroups)
        sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False)

        sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
        if len(self.grouper.groupings) > 1:
            raise NotImplementedError(
                "More than 1 grouping labels are not supported with engine='numba'"
            )
        # GH 46867
        index_data = data.index
        if isinstance(index_data, MultiIndex):
            group_key = self.grouper.groupings[0].name
            index_data = index_data.get_level_values(group_key)
        sorted_index_data = index_data.take(sorted_index).to_numpy()

        starts, ends = lib.generate_slices(sorted_ids, ngroups)
        return (
            starts,
            ends,
            sorted_index_data,
            sorted_data,
        )

    def _numba_agg_general(
        self,
        func: Callable,
        engine_kwargs: dict[str, bool] | None,
        *aggregator_args,
    ):
        """
        Perform groupby with a standard numerical aggregation function (e.g. mean)
        with Numba.
        """
        if not self.as_index:
            raise NotImplementedError(
                "as_index=False is not supported. Use .reset_index() instead."
            )
        if self.axis == 1:
            raise NotImplementedError("axis=1 is not supported.")

        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()
        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        aggregator = executor.generate_shared_aggregator(
            func, **get_jit_arguments(engine_kwargs)
        )
        result = aggregator(sorted_data, starts, ends, 0, *aggregator_args)

        index = self.grouper.result_index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        return data._constructor(result, index=index, **result_kwargs)

    @final
    def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby transform routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
        data = self._obj_with_exclusions
        df = data if data.ndim == 2 else data.to_frame()
        starts, ends, sorted_index, sorted_data = self._numba_prep(df)
        numba_.validate_udf(func)
        numba_transform_func = numba_.generate_numba_transform_func(
            func, **get_jit_arguments(engine_kwargs, kwargs)
        )
        result = numba_transform_func(
            sorted_data,
            sorted_index,
            starts,
            ends,
            len(df.columns),
            *args,
        )
        # result values need to be re-sorted to their original positions since we
        # evaluated the data sorted by group
        result = result.take(np.argsort(sorted_index), axis=0)
        index = data.index
        if data.ndim == 1:
            result_kwargs = {"name": data.name}
            result = result.ravel()
        else:
            result_kwargs = {"columns": data.columns}
        return data._constructor(result, index=index, **result_kwargs)

    @final
    def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
        """
        Perform groupby aggregation routine with the numba engine.

        This routine mimics the data splitting routine of the DataSplitter class
        to generate the indices of each group in the sorted data and then passes the
        data and indices into a Numba jitted function.
        """
  1063. data = self._obj_with_exclusions
  1064. df = data if data.ndim == 2 else data.to_frame()
  1065. starts, ends, sorted_index, sorted_data = self._numba_prep(df)
  1066. numba_.validate_udf(func)
  1067. numba_agg_func = numba_.generate_numba_agg_func(
  1068. func, **get_jit_arguments(engine_kwargs, kwargs)
  1069. )
  1070. result = numba_agg_func(
  1071. sorted_data,
  1072. sorted_index,
  1073. starts,
  1074. ends,
  1075. len(df.columns),
  1076. *args,
  1077. )
  1078. index = self.grouper.result_index
  1079. if data.ndim == 1:
  1080. result_kwargs = {"name": data.name}
  1081. result = result.ravel()
  1082. else:
  1083. result_kwargs = {"columns": data.columns}
  1084. res = data._constructor(result, index=index, **result_kwargs)
  1085. if not self.as_index:
  1086. res = self._insert_inaxis_grouper(res)
  1087. res.index = default_index(len(res))
  1088. return res
    # -----------------------------------------------------------------
    # apply/agg/transform

    @Appender(
        _apply_docs["template"].format(
            input="dataframe", examples=_apply_docs["dataframe_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> NDFrameT:
        func = com.is_builtin_func(func)

        if isinstance(func, str):
            if hasattr(self, func):
                res = getattr(self, func)
                if callable(res):
                    return res(*args, **kwargs)
                elif args or kwargs:
                    raise ValueError(f"Cannot pass arguments to property {func}")
                return res
            else:
                raise TypeError(f"apply func should be callable, not '{func}'")

        elif args or kwargs:
            if callable(func):

                @wraps(func)
                def f(g):
                    with np.errstate(all="ignore"):
                        return func(g, *args, **kwargs)

            else:
                raise ValueError(
                    "func must be a callable if args or kwargs are supplied"
                )
        else:
            f = func

        # ignore SettingWithCopy here in case the user mutates
        with option_context("mode.chained_assignment", None):
            try:
                result = self._python_apply_general(f, self._selected_obj)
            except TypeError:
                # gh-20949
                # try again, with .apply acting as a filtering
                # operation, by excluding the grouping column
                # This would normally not be triggered
                # except if the udf is trying an operation that
                # fails on *some* columns, e.g. a numeric operation
                # on a string grouper column
                return self._python_apply_general(f, self._obj_with_exclusions)

        return result
    @final
    def _python_apply_general(
        self,
        f: Callable,
        data: DataFrame | Series,
        not_indexed_same: bool | None = None,
        is_transform: bool = False,
        is_agg: bool = False,
    ) -> NDFrameT:
        """
        Apply function f in python space.

        Parameters
        ----------
        f : callable
            Function to apply
        data : Series or DataFrame
            Data to apply f to
        not_indexed_same : bool, optional
            When specified, overrides the value of not_indexed_same. Apply behaves
            differently when the result index is equal to the input index, but
            this can be coincidental, leading to value-dependent behavior.
        is_transform : bool, default False
            Indicator for whether the function is actually a transform
            and should not have group keys prepended.
        is_agg : bool, default False
            Indicator for whether the function is an aggregation. When the
            result is empty, we don't want to warn for this case.
            See _GroupBy._python_agg_general.

        Returns
        -------
        Series or DataFrame
            data after applying f
        """
        values, mutated = self.grouper.apply(f, data, self.axis)
        if not_indexed_same is None:
            not_indexed_same = mutated

        return self._wrap_applied_output(
            data,
            values,
            not_indexed_same,
            is_transform,
        )
    @final
    def _agg_general(
        self,
        numeric_only: bool = False,
        min_count: int = -1,
        *,
        alias: str,
        npfunc: Callable,
    ):
        result = self._cython_agg_general(
            how=alias,
            alt=npfunc,
            numeric_only=numeric_only,
            min_count=min_count,
        )
        return result.__finalize__(self.obj, method="groupby")
    def _agg_py_fallback(
        self, values: ArrayLike, ndim: int, alt: Callable
    ) -> ArrayLike:
        """
        Fallback to pure-python aggregation if _cython_operation raises
        NotImplementedError.
        """
        # We get here with a) EA dtypes and b) object dtype
        assert alt is not None

        if values.ndim == 1:
            # For DataFrameGroupBy we only get here with ExtensionArray
            ser = Series(values, copy=False)
        else:
            # We only get here with values.dtype == object
            # TODO: special case not needed with ArrayManager
            df = DataFrame(values.T)
            # because we split object blocks in grouped_reduce, we have only 1 col;
            # otherwise we'd have to worry about block-splitting GH#39329
            assert df.shape[1] == 1
            # Avoid call to self.values that can occur in DataFrame
            # reductions; see GH#28949
            ser = df.iloc[:, 0]

        # We do not get here with UDFs, so we know that our dtype
        # should always be preserved by the implemented aggregations
        # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
        res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)

        if isinstance(values, Categorical):
            # Because we only get here with known dtype-preserving
            # reductions, we cast back to Categorical.
            # TODO: if we ever get "rank" working, exclude it here.
            res_values = type(values)._from_sequence(res_values, dtype=values.dtype)
        elif ser.dtype == object:
            res_values = res_values.astype(object, copy=False)

        # If we are DataFrameGroupBy and went through a SeriesGroupByPath
        # then we need to reshape
        # GH#32223 includes case with IntegerArray values, ndarray res_values
        # test_groupby_duplicate_columns with object dtype values
        return ensure_block_shape(res_values, ndim=ndim)
    @final
    def _cython_agg_general(
        self,
        how: str,
        alt: Callable,
        numeric_only: bool = False,
        min_count: int = -1,
        **kwargs,
    ):
        # Note: we never get here with how="ohlc" for DataFrameGroupBy;
        # that goes through SeriesGroupBy
        data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)

        def array_func(values: ArrayLike) -> ArrayLike:
            try:
                result = self.grouper._cython_operation(
                    "aggregate",
                    values,
                    how,
                    axis=data.ndim - 1,
                    min_count=min_count,
                    **kwargs,
                )
            except NotImplementedError:
                # generally if we have numeric_only=False
                # and non-applicable functions,
                # try to python agg
                # TODO: shouldn't min_count matter?
                result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)

            return result

        new_mgr = data.grouped_reduce(array_func)
        res = self._wrap_agged_manager(new_mgr)
        out = self._wrap_aggregated_output(res)
        if self.axis == 1:
            out = out.infer_objects(copy=False)
        return out
    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        raise AbstractMethodError(self)
    @final
    def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._transform_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        # optimized transforms
        func = com.get_cython_func(func) or func

        if not isinstance(func, str):
            return self._transform_general(func, *args, **kwargs)

        elif func not in base.transform_kernel_allowlist:
            msg = f"'{func}' is not a valid function name for transform(name)"
            raise ValueError(msg)
        elif func in base.cythonized_kernels or func in base.transformation_kernels:
            # cythonized transform or canned "agg+broadcast"
            return getattr(self, func)(*args, **kwargs)

        else:
            # i.e. func in base.reduction_kernels

            # GH#30918 Use _transform_fast only when we know func is an aggregation
            # If func is a reduction, we need to broadcast the
            # result to the whole group. Compute func result
            # and deal with possible broadcasting below.
            # Temporarily set observed for dealing with categoricals.
            with com.temp_setattr(self, "observed", True):
                with com.temp_setattr(self, "as_index", True):
                    # GH#49834 - result needs groups in the index for
                    # _wrap_transform_fast_result
                    result = getattr(self, func)(*args, **kwargs)

            return self._wrap_transform_fast_result(result)
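    # Illustrative usage sketch (assumes pandas imported as pd): for reduction
    # kernels, the fast path above aggregates once per group and broadcasts
    # the result back to the original shape, e.g.
    #
    #   >>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 3, 5]})
    #   >>> df.groupby("key")["val"].transform("mean")
    #   0    2.0
    #   1    2.0
    #   2    5.0
    #   Name: val, dtype: float64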
    @final
    def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
        """
        Fast transform path for aggregations.
        """
        obj = self._obj_with_exclusions

        # for each col, reshape to size of original frame by take operation
        ids, _, _ = self.grouper.group_info
        result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False)

        if self.obj.ndim == 1:
            # i.e. SeriesGroupBy
            out = algorithms.take_nd(result._values, ids)
            output = obj._constructor(out, index=obj.index, name=obj.name)
        else:
            # `.size()` gives Series output on DataFrame input, need axis 0
            axis = 0 if result.ndim == 1 else self.axis
            # GH#46209
            # Don't convert indices: negative indices need to give rise
            # to null values in the result
            output = result._take(ids, axis=axis, convert_indices=False)
            output = output.set_axis(obj._get_axis(self.axis), axis=axis)
        return output
    # -----------------------------------------------------------------
    # Utilities

    @final
    def _apply_filter(self, indices, dropna):
        if len(indices) == 0:
            indices = np.array([], dtype="int64")
        else:
            indices = np.sort(np.concatenate(indices))
        if dropna:
            filtered = self._selected_obj.take(indices, axis=self.axis)
        else:
            mask = np.empty(len(self._selected_obj.index), dtype=bool)
            mask.fill(False)
            mask[indices.astype(int)] = True
            # mask fails to broadcast when passed to where; broadcast manually.
            mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
            filtered = self._selected_obj.where(mask)  # Fill with NaNs.
        return filtered
    @final
    def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
        """
        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Notes
        -----
        This is currently implementing sort=False
        (though the default is sort=True) for groupby in general.
        """
        ids, _, ngroups = self.grouper.group_info
        sorter = get_group_index_sorter(ids, ngroups)
        ids, count = ids[sorter], len(ids)

        if count == 0:
            return np.empty(0, dtype=np.int64)

        run = np.r_[True, ids[:-1] != ids[1:]]
        rep = np.diff(np.r_[np.nonzero(run)[0], count])
        out = (~run).cumsum()

        if ascending:
            out -= np.repeat(out[run], rep)
        else:
            out = np.repeat(out[np.r_[run[1:], True]], rep) - out

        if self.grouper.has_dropped_na:
            out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
        else:
            out = out.astype(np.int64, copy=False)

        rev = np.empty(count, dtype=np.intp)
        rev[sorter] = np.arange(count, dtype=np.intp)
        return out[rev]
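    # Illustrative usage sketch (assumes pandas imported as pd): this array
    # backs cumcount, which numbers rows within each group in original row
    # order, e.g.
    #
    #   >>> df = pd.DataFrame({"key": ["a", "b", "a", "a"]})
    #   >>> df.groupby("key").cumcount()
    #   0    0
    #   1    0
    #   2    1
    #   3    2
    #   dtype: int64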
    # -----------------------------------------------------------------

    @final
    @property
    def _obj_1d_constructor(self) -> Callable:
        # GH28330 preserve subclassed Series/DataFrames
        if isinstance(self.obj, DataFrame):
            return self.obj._constructor_sliced
        assert isinstance(self.obj, Series)
        return self.obj._constructor
    @final
    def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool):
        """
        Shared func to call any / all Cython GroupBy implementations.
        """

        def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
            if is_object_dtype(vals.dtype) and skipna:
                # GH#37501: don't raise on pd.NA when skipna=True
                mask = isna(vals)
                if mask.any():
                    # mask on original values computed separately
                    vals = vals.copy()
                    vals[mask] = True
            elif isinstance(vals, BaseMaskedArray):
                vals = vals._data
            vals = vals.astype(bool, copy=False)
            return vals.view(np.int8), bool

        def result_to_bool(
            result: np.ndarray,
            inference: type,
            nullable: bool = False,
        ) -> ArrayLike:
            if nullable:
                return BooleanArray(result.astype(bool, copy=False), result == -1)
            else:
                return result.astype(inference, copy=False)

        return self._get_cythonized_result(
            libgroupby.group_any_all,
            numeric_only=False,
            cython_dtype=np.dtype(np.int8),
            pre_processing=objs_to_bool,
            post_processing=result_to_bool,
            val_test=val_test,
            skipna=skipna,
        )
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def any(self, skipna: bool = True):
        """
        Return True if any value in the group is truthy, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing.

        Returns
        -------
        Series or DataFrame
            DataFrame or Series of boolean values, where a value is True if any
            element is True within its respective group, False otherwise.
        """
        return self._bool_agg("any", skipna)

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def all(self, skipna: bool = True):
        """
        Return True if all values in the group are truthy, else False.

        Parameters
        ----------
        skipna : bool, default True
            Flag to ignore nan values during truth testing.

        Returns
        -------
        Series or DataFrame
            DataFrame or Series of boolean values, where a value is True if all
            elements are True within its respective group, False otherwise.
        """
        return self._bool_agg("all", skipna)
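    # Illustrative usage sketch (assumes pandas imported as pd): any/all
    # reduce each column to one boolean per group, e.g.
    #
    #   >>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [0, 1, 0]})
    #   >>> df.groupby("key")["val"].any()
    #   key
    #   a     True
    #   b    False
    #   Name: val, dtype: bool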
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def count(self) -> NDFrameT:
        """
        Compute count of group, excluding missing values.

        Returns
        -------
        Series or DataFrame
            Count of values within each group.
        """
        data = self._get_data_to_aggregate()
        ids, _, ngroups = self.grouper.group_info
        mask = ids != -1

        is_series = data.ndim == 1

        def hfunc(bvalues: ArrayLike) -> ArrayLike:
            # TODO(EA2D): reshape would not be necessary with 2D EAs
            if bvalues.ndim == 1:
                # EA
                masked = mask & ~isna(bvalues).reshape(1, -1)
            else:
                masked = mask & ~isna(bvalues)

            counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups)
            if is_series:
                assert counted.ndim == 2
                assert counted.shape[0] == 1
                return counted[0]
            return counted

        new_mgr = data.grouped_reduce(hfunc)
        new_obj = self._wrap_agged_manager(new_mgr)

        # If we are grouping on categoricals we want unobserved categories to
        # return zero, rather than the default of NaN which the reindexing in
        # _wrap_aggregated_output() returns. GH 35028
        # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false
        with com.temp_setattr(self, "observed", True):
            result = self._wrap_aggregated_output(new_obj)

        return self._reindex_output(result, fill_value=0)
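    # Illustrative usage sketch (assumes pandas/numpy imported as pd/np):
    # count tallies only non-missing values per group, e.g.
    #
    #   >>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})
    #   >>> df.groupby("key")["val"].count()
    #   key
    #   a    1
    #   b    1
    #   Name: val, dtype: int64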
    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def mean(
        self,
        numeric_only: bool = False,
        engine: str = "cython",
        engine_kwargs: dict[str, bool] | None = None,
    ):
        """
        Compute mean of groups, excluding missing values.

        Parameters
        ----------
        numeric_only : bool, default False
            Include only float, int, boolean columns.

            .. versionchanged:: 2.0.0

                numeric_only no longer accepts ``None`` and defaults to ``False``.

        engine : str, default 'cython'
            * ``'cython'`` : Runs the operation through C-extensions from cython.
            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``

            .. versionadded:: 1.4.0

        engine_kwargs : dict, default None
            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
              and ``parallel`` dictionary keys. The values must either be ``True`` or
              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{{'nopython': True, 'nogil': False, 'parallel': False}}``

            .. versionadded:: 1.4.0

        Returns
        -------
        pandas.Series or pandas.DataFrame
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5],
        ...                    'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])

        Groupby one column and return the mean of the remaining columns in
        each group.

        >>> df.groupby('A').mean()
             B         C
        A
        1  3.0  1.333333
        2  4.0  1.500000

        Groupby two columns and return the mean of the remaining column.

        >>> df.groupby(['A', 'B']).mean()
                 C
        A B
        1 2.0  2.0
          4.0  1.0
        2 3.0  1.0
          5.0  2.0

        Groupby one column and return the mean of only particular column in
        the group.

        >>> df.groupby('A')['B'].mean()
        A
        1    3.0
        2    4.0
        Name: B, dtype: float64
        """
        if maybe_use_numba(engine):
            from pandas.core._numba.kernels import sliding_mean

            return self._numba_agg_general(sliding_mean, engine_kwargs)
        else:
            result = self._cython_agg_general(
                "mean",
                alt=lambda x: Series(x).mean(numeric_only=numeric_only),
                numeric_only=numeric_only,
            )
            return result.__finalize__(self.obj, method="groupby")
    @final
    def median(self, numeric_only: bool = False):
        """
        Compute median of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        numeric_only : bool, default False
            Include only float, int, boolean columns.

            .. versionchanged:: 2.0.0

                numeric_only no longer accepts ``None`` and defaults to False.

        Returns
        -------
        Series or DataFrame
            Median of values within each group.
        """
        result = self._cython_agg_general(
            "median",
            alt=lambda x: Series(x).median(numeric_only=numeric_only),
            numeric_only=numeric_only,
        )
        return result.__finalize__(self.obj, method="groupby")
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def std(
        self,
        ddof: int = 1,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        numeric_only: bool = False,
    ):
        """
        Compute standard deviation of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        engine : str, default None
            * ``'cython'`` : Runs the operation through C-extensions from cython.
            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``

            .. versionadded:: 1.4.0

        engine_kwargs : dict, default None
            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
              and ``parallel`` dictionary keys. The values must either be ``True`` or
              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{{'nopython': True, 'nogil': False, 'parallel': False}}``

            .. versionadded:: 1.4.0

        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

            .. versionchanged:: 2.0.0

                numeric_only now defaults to ``False``.

        Returns
        -------
        Series or DataFrame
            Standard deviation of values within each group.
        """
        if maybe_use_numba(engine):
            from pandas.core._numba.kernels import sliding_var

            return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof))
        else:

            def _preprocessing(values):
                if isinstance(values, BaseMaskedArray):
                    return values._data, None
                return values, None

            def _postprocessing(
                vals, inference, nullable: bool = False, result_mask=None
            ) -> ArrayLike:
                if nullable:
                    if result_mask.ndim == 2:
                        result_mask = result_mask[:, 0]
                    return FloatingArray(np.sqrt(vals), result_mask.view(np.bool_))
                return np.sqrt(vals)

            result = self._get_cythonized_result(
                libgroupby.group_var,
                cython_dtype=np.dtype(np.float64),
                numeric_only=numeric_only,
                needs_counts=True,
                pre_processing=_preprocessing,
                post_processing=_postprocessing,
                ddof=ddof,
                how="std",
            )
            return result
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def var(
        self,
        ddof: int = 1,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        numeric_only: bool = False,
    ):
        """
        Compute variance of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        engine : str, default None
            * ``'cython'`` : Runs the operation through C-extensions from cython.
            * ``'numba'`` : Runs the operation through JIT compiled code from numba.
            * ``None`` : Defaults to ``'cython'`` or the global setting
              ``compute.use_numba``

            .. versionadded:: 1.4.0

        engine_kwargs : dict, default None
            * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
            * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
              and ``parallel`` dictionary keys. The values must either be ``True`` or
              ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
              ``{{'nopython': True, 'nogil': False, 'parallel': False}}``

            .. versionadded:: 1.4.0

        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

            .. versionchanged:: 2.0.0

                numeric_only now defaults to ``False``.

        Returns
        -------
        Series or DataFrame
            Variance of values within each group.
        """
        if maybe_use_numba(engine):
            from pandas.core._numba.kernels import sliding_var

            return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
        else:
            return self._cython_agg_general(
                "var",
                alt=lambda x: Series(x).var(ddof=ddof),
                numeric_only=numeric_only,
                ddof=ddof,
            )
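    # Illustrative usage sketch (assumes pandas imported as pd): ddof controls
    # the divisor (n - ddof); for one group of [1.0, 2.0, 3.0]:
    #
    #   >>> df = pd.DataFrame({"key": ["a", "a", "a"], "val": [1.0, 2.0, 3.0]})
    #   >>> df.groupby("key")["val"].var()        # ddof=1: sample variance
    #   key
    #   a    1.0
    #   Name: val, dtype: float64
    #   >>> df.groupby("key")["val"].var(ddof=0)  # population variance = 2/3
    #   key
    #   a    0.666667
    #   Name: val, dtype: float64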
    @final
    def _value_counts(
        self,
        subset: Sequence[Hashable] | None = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        dropna: bool = True,
    ) -> DataFrame | Series:
        """
        Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy.

        SeriesGroupBy additionally supports a bins argument. See the docstring of
        DataFrameGroupBy.value_counts for a description of arguments.
        """
        if self.axis == 1:
            raise NotImplementedError(
                "DataFrameGroupBy.value_counts only handles axis=0"
            )
        name = "proportion" if normalize else "count"

        df = self.obj
        obj = self._obj_with_exclusions

        in_axis_names = {
            grouping.name for grouping in self.grouper.groupings if grouping.in_axis
        }
        if isinstance(obj, Series):
            _name = obj.name
            keys = [] if _name in in_axis_names else [obj]
        else:
            unique_cols = set(obj.columns)
            if subset is not None:
                subsetted = set(subset)
                clashing = subsetted & set(in_axis_names)
                if clashing:
                    raise ValueError(
                        f"Keys {clashing} in subset cannot be in "
                        "the groupby column keys."
                    )
                doesnt_exist = subsetted - unique_cols
                if doesnt_exist:
                    raise ValueError(
                        f"Keys {doesnt_exist} in subset do not "
                        f"exist in the DataFrame."
                    )
            else:
                subsetted = unique_cols

            keys = [
                # Can't use .values because the column label needs to be preserved
                obj.iloc[:, idx]
                for idx, _name in enumerate(obj.columns)
                if _name not in in_axis_names and _name in subsetted
            ]

        groupings = list(self.grouper.groupings)
        for key in keys:
            grouper, _, _ = get_grouper(
                df,
                key=key,
                axis=self.axis,
                sort=self.sort,
                observed=False,
                dropna=dropna,
            )
            groupings += list(grouper.groupings)

        # Take the size of the overall columns
        gb = df.groupby(
            groupings,
            sort=self.sort,
            observed=self.observed,
            dropna=self.dropna,
        )
        result_series = cast(Series, gb.size())
        result_series.name = name

        # GH-46357 Include non-observed categories
        # of non-grouping columns regardless of `observed`
        if any(
            isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
            and not grouping._observed
            for grouping in groupings
        ):
            levels_list = [ping.result_index for ping in groupings]
            multi_index, _ = MultiIndex.from_product(
                levels_list, names=[ping.name for ping in groupings]
            ).sortlevel()
            result_series = result_series.reindex(multi_index, fill_value=0)

        if normalize:
            # Normalize the results by dividing by the original group sizes.
            # We are guaranteed to have the first N levels be the
            # user-requested grouping.
            levels = list(
                range(len(self.grouper.groupings), result_series.index.nlevels)
            )
            indexed_group_size = result_series.groupby(
                result_series.index.droplevel(levels),
                sort=self.sort,
                dropna=self.dropna,
            ).transform("sum")
            result_series /= indexed_group_size

            # Handle groups of non-observed categories
            result_series = result_series.fillna(0.0)

        if sort:
            # Sort the values and then resort by the main grouping
            index_level = range(len(self.grouper.groupings))
            result_series = result_series.sort_values(ascending=ascending).sort_index(
                level=index_level, sort_remaining=False
            )

        result: Series | DataFrame
        if self.as_index:
            result = result_series
        else:
            # Convert to frame
            index = result_series.index
            columns = com.fill_missing_names(index.names)
            if name in columns:
                raise ValueError(f"Column label '{name}' is duplicate of result column")
            result_series.name = name
            result_series.index = index.set_names(range(len(columns)))
            result_frame = result_series.reset_index()
            result_frame.columns = columns + [name]
            result = result_frame
        return result.__finalize__(self.obj, method="value_counts")
    @final
    def sem(self, ddof: int = 1, numeric_only: bool = False):
        """
        Compute standard error of the mean of groups, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Parameters
        ----------
        ddof : int, default 1
            Degrees of freedom.

        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

            .. versionchanged:: 2.0.0

                numeric_only now defaults to ``False``.

        Returns
        -------
        Series or DataFrame
            Standard error of the mean of values within each group.
        """
        if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
            raise TypeError(
                f"{type(self).__name__}.sem called with "
                f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
            )
        result = self.std(ddof=ddof, numeric_only=numeric_only)

        if result.ndim == 1:
            result /= np.sqrt(self.count())
        else:
            cols = result.columns.difference(self.exclusions).unique()
            counts = self.count()
            result_ilocs = result.columns.get_indexer_for(cols)
            count_ilocs = counts.columns.get_indexer_for(cols)

            result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs])
        return result
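    # Illustrative usage sketch (assumes pandas/numpy imported as pd/np): sem
    # is the group standard deviation scaled by the square root of the group
    # count, e.g.
    #
    #   >>> df = pd.DataFrame({"key": ["a"] * 4, "val": [1.0, 2.0, 3.0, 4.0]})
    #   >>> g = df.groupby("key")["val"]
    #   >>> bool(np.isclose(g.sem()["a"], g.std()["a"] / np.sqrt(g.count()["a"])))
    #   True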
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def size(self) -> DataFrame | Series:
        """
        Compute group sizes.

        Returns
        -------
        DataFrame or Series
            Number of rows in each group as a Series if as_index is True
            or a DataFrame if as_index is False.
        """
        result = self.grouper.size()

        # GH28330 preserve subclassed Series/DataFrames through calls
        if isinstance(self.obj, Series):
            result = self._obj_1d_constructor(result, name=self.obj.name)
        else:
            result = self._obj_1d_constructor(result)

        with com.temp_setattr(self, "as_index", True):
            # size already has the desired behavior in GH#49519, but this makes the
            # as_index=False path of _reindex_output fail on categorical groupers.
            result = self._reindex_output(result, fill_value=0)

        if not self.as_index:
            # error: Incompatible types in assignment (expression has
            # type "DataFrame", variable has type "Series")
            result = result.rename("size").reset_index()  # type: ignore[assignment]
        return result
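    # Illustrative usage sketch (assumes pandas/numpy imported as pd/np):
    # unlike count, size includes rows whose values are NaN, e.g.
    #
    #   >>> df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1.0, np.nan, 3.0]})
    #   >>> df.groupby("key").size()
    #   key
    #   a    2
    #   b    1
    #   dtype: int64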
    @final
    @doc(_groupby_agg_method_template, fname="sum", no=False, mc=0)
    def sum(
        self,
        numeric_only: bool = False,
        min_count: int = 0,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        if maybe_use_numba(engine):
            from pandas.core._numba.kernels import sliding_sum

            return self._numba_agg_general(
                sliding_sum,
                engine_kwargs,
            )
        else:
            # If we are grouping on categoricals we want unobserved categories to
            # return zero, rather than the default of NaN which the reindexing in
            # _agg_general() returns. GH #31422
            with com.temp_setattr(self, "observed", True):
                result = self._agg_general(
                    numeric_only=numeric_only,
                    min_count=min_count,
                    alias="sum",
                    npfunc=np.sum,
                )

            return self._reindex_output(result, fill_value=0)
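    # Illustrative usage sketch (assumes pandas/numpy imported as pd/np):
    # min_count turns an all-NA group's sum from 0.0 into NaN, e.g.
    #
    #   >>> df = pd.DataFrame({"key": ["a", "b"], "val": [np.nan, 1.0]})
    #   >>> df.groupby("key")["val"].sum()
    #   key
    #   a    0.0
    #   b    1.0
    #   Name: val, dtype: float64
    #   >>> df.groupby("key")["val"].sum(min_count=1)
    #   key
    #   a    NaN
    #   b    1.0
    #   Name: val, dtype: float64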
    @final
    @doc(_groupby_agg_method_template, fname="prod", no=False, mc=0)
    def prod(self, numeric_only: bool = False, min_count: int = 0):
        return self._agg_general(
            numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
        )
    @final
    @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1)
    def min(
        self,
        numeric_only: bool = False,
        min_count: int = -1,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        if maybe_use_numba(engine):
            from pandas.core._numba.kernels import sliding_min_max

            return self._numba_agg_general(sliding_min_max, engine_kwargs, False)
        else:
            return self._agg_general(
                numeric_only=numeric_only,
                min_count=min_count,
                alias="min",
                npfunc=np.min,
            )

    @final
    @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1)
    def max(
        self,
        numeric_only: bool = False,
        min_count: int = -1,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        if maybe_use_numba(engine):
            from pandas.core._numba.kernels import sliding_min_max

            return self._numba_agg_general(sliding_min_max, engine_kwargs, True)
        else:
            return self._agg_general(
                numeric_only=numeric_only,
                min_count=min_count,
                alias="max",
                npfunc=np.max,
            )
    @final
    def first(self, numeric_only: bool = False, min_count: int = -1):
        """
        Compute the first non-null entry of each column.

        Parameters
        ----------
        numeric_only : bool, default False
            Include only float, int, boolean columns.
        min_count : int, default -1
            The required number of valid values to perform the operation. If fewer
            than ``min_count`` non-NA values are present the result will be NA.

        Returns
        -------
        Series or DataFrame
            First non-null of values within each group.

        See Also
        --------
        DataFrame.groupby : Apply a function groupby to each row or column of a
            DataFrame.
        pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry
            of each column.
        pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.

        Examples
        --------
        >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3],
        ...                        D=['3/11/2000', '3/12/2000', '3/13/2000']))
        >>> df['D'] = pd.to_datetime(df['D'])
        >>> df.groupby("A").first()
             B  C          D
        A
        1  5.0  1 2000-03-11
        3  6.0  3 2000-03-13
        >>> df.groupby("A").first(min_count=2)
            B    C          D
        A
        1 NaN  1.0 2000-03-11
        3 NaN  NaN        NaT
        >>> df.groupby("A").first(numeric_only=True)
             B  C
        A
        1  5.0  1
        3  6.0  3
        """

        def first_compat(obj: NDFrameT, axis: AxisInt = 0):
            def first(x: Series):
                """Helper function for first item that isn't NA."""
                arr = x.array[notna(x.array)]
                if not len(arr):
                    return np.nan
                return arr[0]

            if isinstance(obj, DataFrame):
                return obj.apply(first, axis=axis)
            elif isinstance(obj, Series):
                return first(obj)
            else:  # pragma: no cover
                raise TypeError(type(obj))

        return self._agg_general(
            numeric_only=numeric_only,
            min_count=min_count,
            alias="first",
            npfunc=first_compat,
        )
    @final
    def last(self, numeric_only: bool = False, min_count: int = -1):
        """
        Compute the last non-null entry of each column.

        Parameters
        ----------
        numeric_only : bool, default False
            Include only float, int, boolean columns.
        min_count : int, default -1
            The required number of valid values to perform the operation. If fewer
            than ``min_count`` non-NA values are present the result will be NA.

        Returns
        -------
        Series or DataFrame
            Last non-null of values within each group.

        See Also
        --------
        DataFrame.groupby : Apply a function groupby to each row or column of a
            DataFrame.
        pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry
            of each column.
        pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.

        Examples
        --------
        >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3]))
        >>> df.groupby("A").last()
             B  C
        A
        1  5.0  2
        3  6.0  3
        """

        def last_compat(obj: NDFrameT, axis: AxisInt = 0):
            def last(x: Series):
                """Helper function for last item that isn't NA."""
                arr = x.array[notna(x.array)]
                if not len(arr):
                    return np.nan
                return arr[-1]

            if isinstance(obj, DataFrame):
                return obj.apply(last, axis=axis)
            elif isinstance(obj, Series):
                return last(obj)
            else:  # pragma: no cover
                raise TypeError(type(obj))

        return self._agg_general(
            numeric_only=numeric_only,
            min_count=min_count,
            alias="last",
            npfunc=last_compat,
        )
    @final
    def ohlc(self) -> DataFrame:
        """
        Compute open, high, low and close values of a group, excluding missing values.

        For multiple groupings, the result index will be a MultiIndex.

        Returns
        -------
        DataFrame
            Open, high, low and close values within each group.
        """
        if self.obj.ndim == 1:
            # self._iterate_slices() yields only self._selected_obj
            obj = self._selected_obj

            is_numeric = is_numeric_dtype(obj.dtype)
            if not is_numeric:
                raise DataError("No numeric types to aggregate")

            res_values = self.grouper._cython_operation(
                "aggregate", obj._values, "ohlc", axis=0, min_count=-1
            )

            agg_names = ["open", "high", "low", "close"]
            result = self.obj._constructor_expanddim(
                res_values, index=self.grouper.result_index, columns=agg_names
            )
            return self._reindex_output(result)

        result = self._apply_to_column_groupbys(
            lambda x: x.ohlc(), self._obj_with_exclusions
        )
        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))
        return result
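    # Illustrative usage sketch (assumes pandas imported as pd): each group
    # yields its first, max, min, and last value, e.g.
    #
    #   >>> ser = pd.Series([2.0, 5.0, 1.0, 4.0], index=["a", "a", "a", "b"])
    #   >>> ser.groupby(level=0).ohlc()
    #      open  high  low  close
    #   a   2.0   5.0  1.0    1.0
    #   b   4.0   4.0  4.0    4.0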
    @doc(DataFrame.describe)
    def describe(
        self,
        percentiles=None,
        include=None,
        exclude=None,
    ) -> NDFrameT:
        obj = self._obj_with_exclusions

        if len(obj) == 0:
            described = obj.describe(
                percentiles=percentiles, include=include, exclude=exclude
            )
            if obj.ndim == 1:
                result = described
            else:
                result = described.unstack()
            return result.to_frame().T.iloc[:0]

        with com.temp_setattr(self, "as_index", True):
            result = self._python_apply_general(
                lambda x: x.describe(
                    percentiles=percentiles, include=include, exclude=exclude
                ),
                obj,
                not_indexed_same=True,
            )
        if self.axis == 1:
            return result.T

        # GH#49256 - properly handle the grouping column(s)
        result = result.unstack()
        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))

        return result
    @final
    def resample(self, rule, *args, **kwargs):
        """
        Provide resampling when using a TimeGrouper.

        Given a grouper, the function resamples it according to a frequency
        string, e.g. ``'3T'`` for 3 minutes.

        See the :ref:`frequency aliases <timeseries.offset_aliases>`
        documentation for more details.

        Parameters
        ----------
        rule : str or DateOffset
            The offset string or object representing target grouper conversion.
        *args, **kwargs
            Possible arguments are `how`, `fill_method`, `limit`, `kind` and
            `on`, and other arguments of `TimeGrouper`.

        Returns
        -------
        Grouper
            Return a new grouper with our resampler appended.

        See Also
        --------
        Grouper : Specify a frequency to resample with when
            grouping by a key.
        DatetimeIndex.resample : Frequency conversion and resampling of
            time series.

        Examples
        --------
        >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
        >>> df = pd.DataFrame(data=4 * [range(2)],
        ...                   index=idx,
        ...                   columns=['a', 'b'])
        >>> df.iloc[2, 0] = 5
        >>> df
                             a  b
        2000-01-01 00:00:00  0  1
        2000-01-01 00:01:00  0  1
        2000-01-01 00:02:00  5  1
        2000-01-01 00:03:00  0  1

        Downsample the DataFrame into 3 minute bins and sum the values of
        the timestamps falling into a bin.

        >>> df.groupby('a').resample('3T').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  2
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:00:00  5  1

        Upsample the series into 30 second bins.

        >>> df.groupby('a').resample('30S').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:00:30  0  0
            2000-01-01 00:01:00  0  1
            2000-01-01 00:01:30  0  0
            2000-01-01 00:02:00  0  0
            2000-01-01 00:02:30  0  0
            2000-01-01 00:03:00  0  1
        5   2000-01-01 00:02:00  5  1

        Resample by month. Values are assigned to the month of the period.

        >>> df.groupby('a').resample('M').sum()
                       a  b
        a
        0   2000-01-31  0  3
        5   2000-01-31  5  1

        Downsample the series into 3 minute bins as above, but close the right
        side of the bin interval.

        >>> df.groupby('a').resample('3T', closed='right').sum()
                                 a  b
        a
        0   1999-12-31 23:57:00  0  1
            2000-01-01 00:00:00  0  2
        5   2000-01-01 00:00:00  5  1

        Downsample the series into 3 minute bins and close the right side of
        the bin interval, but label each bin using the right edge instead of
        the left.

        >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
                                 a  b
        a
        0   2000-01-01 00:00:00  0  1
            2000-01-01 00:03:00  0  2
        5   2000-01-01 00:03:00  5  1
        """
        from pandas.core.resample import get_resampler_for_grouping

        return get_resampler_for_grouping(self, rule, *args, **kwargs)
    @final
    def rolling(self, *args, **kwargs) -> RollingGroupby:
        """
        Return a rolling grouper, providing rolling functionality per group.

        Parameters
        ----------
        window : int, timedelta, str, offset, or BaseIndexer subclass
            Size of the moving window.

            If an integer, the fixed number of observations used for
            each window.

            If a timedelta, str, or offset, the time period of each window. Each
            window will be of variable size, based on the observations included
            in the time-period. This is only valid for datetimelike indexes.
            To learn more about the offsets & frequency strings, please see `this link
            <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.

            If a BaseIndexer subclass, the window boundaries are computed by the
            defined ``get_window_bounds`` method. Additional rolling
            keyword arguments, namely ``min_periods``, ``center``, ``closed`` and
            ``step`` will be passed to ``get_window_bounds``.

        min_periods : int, default None
            Minimum number of observations in window required to have a value;
            otherwise, result is ``np.nan``.

            For a window that is specified by an offset,
            ``min_periods`` will default to 1.

            For a window that is specified by an integer, ``min_periods`` will default
            to the size of the window.

        center : bool, default False
            If False, set the window labels as the right edge of the window index.

            If True, set the window labels as the center of the window index.

        win_type : str, default None
            If ``None``, all points are evenly weighted.

            If a string, it must be a valid `scipy.signal window function
            <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__.

            Certain Scipy window types require additional parameters to be passed
            in the aggregation function. The additional parameters must match
            the keywords specified in the Scipy window type method signature.

        on : str, optional
            For a DataFrame, a column label or Index level on which
            to calculate the rolling window, rather than the DataFrame's index.

            A provided integer column is ignored and excluded from the result since
            an integer index is not used to calculate the rolling window.

        axis : int or str, default 0
            If ``0`` or ``'index'``, roll across the rows.

            If ``1`` or ``'columns'``, roll across the columns.

            For `Series` this parameter is unused and defaults to 0.

        closed : str, default None
            If ``'right'``, the first point in the window is excluded from calculations.

            If ``'left'``, the last point in the window is excluded from calculations.

            If ``'both'``, no points in the window are excluded from calculations.

            If ``'neither'``, the first and last points in the window are excluded
            from calculations.

            Default ``None`` (``'right'``).

        method : str {'single', 'table'}, default 'single'
            Execute the rolling operation per single column or row (``'single'``)
            or over the entire object (``'table'``).

            This argument is only implemented when specifying ``engine='numba'``
            in the method call.

        Returns
        -------
        RollingGroupby
            Return a new grouper with our rolling appended.

        See Also
        --------
        Series.rolling : Calling object with Series data.
        DataFrame.rolling : Calling object with DataFrames.
        Series.groupby : Apply a function groupby to a Series.
        DataFrame.groupby : Apply a function groupby.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
        ...                    'B': [1, 2, 3, 4],
        ...                    'C': [0.362, 0.227, 1.267, -0.562]})
        >>> df
           A  B      C
        0  1  1  0.362
        1  1  2  0.227
        2  2  3  1.267
        3  2  4 -0.562

        >>> df.groupby('A').rolling(2).sum()
               B      C
        A
        1 0  NaN    NaN
          1  3.0  0.589
        2 2  NaN    NaN
          3  7.0  0.705

        >>> df.groupby('A').rolling(2, min_periods=1).sum()
               B      C
        A
        1 0  1.0  0.362
          1  3.0  0.589
        2 2  3.0  1.267
          3  7.0  0.705

        >>> df.groupby('A').rolling(2, on='B').sum()
             B      C
        A
        1 0  1    NaN
          1  2  0.589
        2 2  3    NaN
          3  4  0.705
        """
        from pandas.core.window import RollingGroupby

        return RollingGroupby(
            self._selected_obj,
            *args,
            _grouper=self.grouper,
            _as_index=self.as_index,
            **kwargs,
        )
    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def expanding(self, *args, **kwargs) -> ExpandingGroupby:
        """
        Return an expanding grouper, providing expanding
        functionality per group.
        """
        from pandas.core.window import ExpandingGroupby

        return ExpandingGroupby(
            self._selected_obj,
            *args,
            _grouper=self.grouper,
            **kwargs,
        )

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
        """
        Return an ewm grouper, providing ewm functionality per group.
        """
        from pandas.core.window import ExponentialMovingWindowGroupby

        return ExponentialMovingWindowGroupby(
            self._selected_obj,
            *args,
            _grouper=self.grouper,
            **kwargs,
        )
    @final
    def _fill(self, direction: Literal["ffill", "bfill"], limit=None):
        """
        Shared function for `pad` and `backfill` to call Cython method.

        Parameters
        ----------
        direction : {'ffill', 'bfill'}
            Direction passed to underlying Cython function. `bfill` will cause
            values to be filled backwards. `ffill` and any other values will
            default to a forward fill.
        limit : int, default None
            Maximum number of consecutive values to fill. If `None`, this
            method will convert to -1 prior to passing to Cython.

        Returns
        -------
        `Series` or `DataFrame` with filled values

        See Also
        --------
        pad : Forward fill the missing values in the dataset.
        backfill : Backward fill the missing values in the dataset.
        """
        # Need int value for Cython
        if limit is None:
            limit = -1

        ids, _, _ = self.grouper.group_info
        sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False)
        if direction == "bfill":
            sorted_labels = sorted_labels[::-1]

        col_func = partial(
            libgroupby.group_fillna_indexer,
            labels=ids,
            sorted_labels=sorted_labels,
            direction=direction,
            limit=limit,
            dropna=self.dropna,
        )

        def blk_func(values: ArrayLike) -> ArrayLike:
            mask = isna(values)
            if values.ndim == 1:
                indexer = np.empty(values.shape, dtype=np.intp)
                col_func(out=indexer, mask=mask)
                return algorithms.take_nd(values, indexer)

            else:
                # We broadcast algorithms.take_nd analogous to
                # np.take_along_axis

                # Note: we only get here with backfill/pad,
                # so if we have a dtype that cannot hold NAs,
                # then there will be no -1s in indexer, so we can use
                # the original dtype (no need to ensure_dtype_can_hold_na)
                if isinstance(values, np.ndarray):
                    dtype = values.dtype
                    if self.grouper.has_dropped_na:
                        # dropped null groups give rise to nan in the result
                        dtype = ensure_dtype_can_hold_na(values.dtype)
                    out = np.empty(values.shape, dtype=dtype)
                else:
                    out = type(values)._empty(values.shape, dtype=values.dtype)

                for i, value_element in enumerate(values):
                    # call group_fillna_indexer column-wise
                    indexer = np.empty(values.shape[1], dtype=np.intp)
                    col_func(out=indexer, mask=mask[i])
                    out[i, :] = algorithms.take_nd(value_element, indexer)
                return out

        mgr = self._get_data_to_aggregate()
        res_mgr = mgr.apply(blk_func)

        new_obj = self._wrap_agged_manager(res_mgr)

        if self.axis == 1:
            # Only relevant for DataFrameGroupBy
            new_obj = new_obj.T
            new_obj.columns = self.obj.columns

        new_obj.index = self.obj.index
        return new_obj
    @final
    @Substitution(name="groupby")
    def ffill(self, limit=None):
        """
        Forward fill the values.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

        Returns
        -------
        Series or DataFrame
            Object with missing values filled.

        See Also
        --------
        Series.ffill : Forward fill the missing values in the dataset.
        DataFrame.ffill : Object with missing values filled or None if inplace=True.
        Series.fillna : Fill NaN values of a Series.
        DataFrame.fillna : Fill NaN values of a DataFrame.
        """
        return self._fill("ffill", limit=limit)

    @final
    @Substitution(name="groupby")
    def bfill(self, limit=None):
        """
        Backward fill the values.

        Parameters
        ----------
        limit : int, optional
            Limit of how many values to fill.

        Returns
        -------
        Series or DataFrame
            Object with missing values filled.

        See Also
        --------
        Series.bfill : Backward fill the missing values in the dataset.
        DataFrame.bfill : Backward fill the missing values in the dataset.
        Series.fillna : Fill NaN values of a Series.
        DataFrame.fillna : Fill NaN values of a DataFrame.
        """
        return self._fill("bfill", limit=limit)
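    # Illustrative usage sketch (assumes pandas/numpy imported as pd/np):
    # filling is confined to each group, so a leading NaN in a group stays
    # NaN, e.g.
    #
    #   >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"],
    #   ...                    "val": [1.0, np.nan, np.nan, 4.0]})
    #   >>> df.groupby("key")["val"].ffill()
    #   0    1.0
    #   1    1.0
    #   2    NaN
    #   3    4.0
    #   Name: val, dtype: float64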
    @final
    @property
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def nth(self) -> GroupByNthSelector:
        """
        Take the nth row from each group if n is an int, otherwise a subset of rows.

        Can be either a call or an index. dropna is not available with index notation.
        Index notation accepts a comma separated list of integers and slices.

        If dropna, will take the nth non-null row; dropna is either
        'all' or 'any', and is equivalent to calling dropna(how=dropna)
        before the groupby.

        Parameters
        ----------
        n : int, slice or list of ints and slices
            A single nth value for the row or a list of nth values or slices.

            .. versionchanged:: 1.4.0
                Added slice and lists containing slices.
                Added index notation.

        dropna : {'any', 'all', None}, default None
            Apply the specified dropna operation before counting which row is
            the nth row. Only supported if n is an int.

        Returns
        -------
        Series or DataFrame
            N-th value within each group.
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
        ...                    'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
        >>> g = df.groupby('A')
        >>> g.nth(0)
           A   B
        0  1 NaN
        2  2 3.0
        >>> g.nth(1)
           A   B
        1  1 2.0
        4  2 5.0
        >>> g.nth(-1)
           A   B
        3  1 4.0
        4  2 5.0
        >>> g.nth([0, 1])
           A   B
        0  1 NaN
        1  1 2.0
        2  2 3.0
        4  2 5.0
        >>> g.nth(slice(None, -1))
           A   B
        0  1 NaN
        1  1 2.0
        2  2 3.0

        Index notation may also be used

        >>> g.nth[0, 1]
           A   B
        0  1 NaN
        1  1 2.0
        2  2 3.0
        4  2 5.0
        >>> g.nth[:-1]
           A   B
        0  1 NaN
        1  1 2.0
        2  2 3.0

        Specifying `dropna` allows ignoring ``NaN`` values

        >>> g.nth(0, dropna='any')
           A   B
        1  1 2.0
        2  2 3.0

        When the specified ``n`` is larger than any of the groups, an
        empty DataFrame is returned

        >>> g.nth(3, dropna='any')
        Empty DataFrame
        Columns: [A, B]
        Index: []
        """
        return GroupByNthSelector(self)
    def _nth(
        self,
        n: PositionalIndexer | tuple,
        dropna: Literal["any", "all", None] = None,
    ) -> NDFrameT:
        if not dropna:
            mask = self._make_mask_from_positional_indexer(n)

            ids, _, _ = self.grouper.group_info

            # Drop NA values in grouping
            mask = mask & (ids != -1)

            out = self._mask_selected_obj(mask)
            return out

        # dropna is truthy
        if not is_integer(n):
            raise ValueError("dropna option only supported for an integer argument")

        if dropna not in ["any", "all"]:
            # Note: when agg-ing picker doesn't raise this, just returns NaN
            raise ValueError(
                "For a DataFrame or Series groupby.nth, dropna must be "
                "either None, 'any' or 'all', "
                f"(was passed {dropna})."
            )

        # old behaviour, but with all and any support for DataFrames.
        # modified in GH 7559 to have better perf
        n = cast(int, n)
        dropped = self.obj.dropna(how=dropna, axis=self.axis)

        # get a new grouper for our dropped obj
        if self.keys is None and self.level is None:
            # we don't have the grouper info available
            # (e.g. we have selected out
            # a column that is not in the current object)
            axis = self.grouper.axis
            grouper = self.grouper.codes_info[axis.isin(dropped.index)]
            if self.grouper.has_dropped_na:
                # Null groups need to still be encoded as -1 when passed to groupby
                nulls = grouper == -1
                # error: No overload variant of "where" matches argument types
                # "Any", "NAType", "Any"
                values = np.where(nulls, NA, grouper)  # type: ignore[call-overload]
                grouper = Index(values, dtype="Int64")  # type: ignore[assignment]

        else:
            # create a grouper with the original parameters, but on dropped
            # object
            grouper, _, _ = get_grouper(  # type: ignore[assignment]
                dropped,
                key=self.keys,
                axis=self.axis,
                level=self.level,
                sort=self.sort,
            )

        grb = dropped.groupby(
            grouper, as_index=self.as_index, sort=self.sort, axis=self.axis
        )
        return grb.nth(n)

    @final
    def quantile(
        self,
        q: float | AnyArrayLike = 0.5,
        interpolation: str = "linear",
        numeric_only: bool = False,
    ):
        """
        Return group values at the given quantile, a la numpy.percentile.

        Parameters
        ----------
        q : float or array-like, default 0.5 (50% quantile)
            Value(s) between 0 and 1 providing the quantile(s) to compute.
        interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
            Method to use when the desired quantile falls between two points.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

            .. versionchanged:: 2.0.0

                numeric_only now defaults to ``False``.

        Returns
        -------
        Series or DataFrame
            Return type determined by caller of GroupBy object.

        See Also
        --------
        Series.quantile : Similar method for Series.
        DataFrame.quantile : Similar method for DataFrame.
        numpy.percentile : NumPy method to compute qth percentile.

        Examples
        --------
        >>> df = pd.DataFrame([
        ...     ['a', 1], ['a', 2], ['a', 3],
        ...     ['b', 1], ['b', 3], ['b', 5]
        ... ], columns=['key', 'val'])
        >>> df.groupby('key').quantile()
             val
        key
        a    2.0
        b    3.0
        """
        def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
            if is_object_dtype(vals):
                raise TypeError(
                    "'quantile' cannot be performed against 'object' dtypes!"
                )

            inference: DtypeObj | None = None
            if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype):
                out = vals.to_numpy(dtype=float, na_value=np.nan)
                inference = vals.dtype
            elif is_integer_dtype(vals.dtype):
                if isinstance(vals, ExtensionArray):
                    out = vals.to_numpy(dtype=float, na_value=np.nan)
                else:
                    out = vals
                inference = np.dtype(np.int64)
            elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
                out = vals.to_numpy(dtype=float, na_value=np.nan)
            elif needs_i8_conversion(vals.dtype):
                inference = vals.dtype
                # In this case we need to delay the casting until after the
                # np.lexsort below.
                # error: Incompatible return value type (got
                # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any,
                # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any],
                # Optional[Union[dtype[Any], ExtensionDtype]]]")
                return vals, inference  # type: ignore[return-value]
            elif isinstance(vals, ExtensionArray) and is_float_dtype(vals):
                inference = np.dtype(np.float64)
                out = vals.to_numpy(dtype=float, na_value=np.nan)
            else:
                out = np.asarray(vals)

            return out, inference

        def post_processor(
            vals: np.ndarray,
            inference: DtypeObj | None,
            result_mask: np.ndarray | None,
            orig_vals: ArrayLike,
        ) -> ArrayLike:
            if inference:
                # Check for edge case
                if isinstance(orig_vals, BaseMaskedArray):
                    assert result_mask is not None  # for mypy

                    if interpolation in {"linear", "midpoint"} and not is_float_dtype(
                        orig_vals
                    ):
                        return FloatingArray(vals, result_mask)
                    else:
                        # Item "ExtensionDtype" of "Union[ExtensionDtype, str,
                        # dtype[Any], Type[object]]" has no attribute "numpy_dtype"
                        # [union-attr]
                        return type(orig_vals)(
                            vals.astype(
                                inference.numpy_dtype  # type: ignore[union-attr]
                            ),
                            result_mask,
                        )

                elif not (
                    is_integer_dtype(inference)
                    and interpolation in {"linear", "midpoint"}
                ):
                    if needs_i8_conversion(inference):
                        # error: Item "ExtensionArray" of "Union[ExtensionArray,
                        # ndarray[Any, Any]]" has no attribute "_ndarray"
                        vals = vals.astype("i8").view(
                            orig_vals._ndarray.dtype  # type: ignore[union-attr]
                        )
                        # error: Item "ExtensionArray" of "Union[ExtensionArray,
                        # ndarray[Any, Any]]" has no attribute "_from_backing_data"
                        return orig_vals._from_backing_data(  # type: ignore[union-attr]
                            vals
                        )

                    assert isinstance(inference, np.dtype)  # for mypy
                    return vals.astype(inference)

            return vals
        orig_scalar = is_scalar(q)
        if orig_scalar:
            # error: Incompatible types in assignment (expression has type "List[
            # Union[float, ExtensionArray, ndarray[Any, Any], Index, Series]]",
            # variable has type "Union[float, Union[Union[ExtensionArray, ndarray[
            # Any, Any]], Index, Series]]")
            q = [q]  # type: ignore[assignment]

        qs = np.array(q, dtype=np.float64)
        ids, _, ngroups = self.grouper.group_info
        nqs = len(qs)

        func = partial(
            libgroupby.group_quantile, labels=ids, qs=qs, interpolation=interpolation
        )

        # Put '-1' (NaN) labels as the last group so it does not interfere
        # with the calculations. Note: length check avoids failure on empty
        # labels. In that case, the value doesn't matter
        na_label_for_sorting = ids.max() + 1 if len(ids) > 0 else 0
        labels_for_lexsort = np.where(ids == -1, na_label_for_sorting, ids)

        def blk_func(values: ArrayLike) -> ArrayLike:
            orig_vals = values
            if isinstance(values, BaseMaskedArray):
                mask = values._mask
                result_mask = np.zeros((ngroups, nqs), dtype=np.bool_)
            else:
                mask = isna(values)
                result_mask = None

            is_datetimelike = needs_i8_conversion(values.dtype)

            vals, inference = pre_processor(values)

            ncols = 1
            if vals.ndim == 2:
                ncols = vals.shape[0]
                shaped_labels = np.broadcast_to(
                    labels_for_lexsort, (ncols, len(labels_for_lexsort))
                )
            else:
                shaped_labels = labels_for_lexsort

            out = np.empty((ncols, ngroups, nqs), dtype=np.float64)

            # Get an index of values sorted by values and then labels
            order = (vals, shaped_labels)
            sort_arr = np.lexsort(order).astype(np.intp, copy=False)

            if is_datetimelike:
                # This casting needs to happen after the lexsort in order
                # to ensure that NaTs are placed at the end and not the front
                vals = vals.view("i8").astype(np.float64)

            if vals.ndim == 1:
                # EA is always 1d
                func(
                    out[0],
                    values=vals,
                    mask=mask,
                    sort_indexer=sort_arr,
                    result_mask=result_mask,
                )
            else:
                for i in range(ncols):
                    func(out[i], values=vals[i], mask=mask[i], sort_indexer=sort_arr[i])

            if vals.ndim == 1:
                out = out.ravel("K")
                if result_mask is not None:
                    result_mask = result_mask.ravel("K")
            else:
                out = out.reshape(ncols, ngroups * nqs)

            return post_processor(out, inference, result_mask, orig_vals)

        data = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile")
        res_mgr = data.grouped_reduce(blk_func)

        res = self._wrap_agged_manager(res_mgr)

        if orig_scalar:
            # Avoid expensive MultiIndex construction
            return self._wrap_aggregated_output(res)
        return self._wrap_aggregated_output(res, qs=qs)

    @final
    @Substitution(name="groupby")
    def ngroup(self, ascending: bool = True):
        """
        Number each group from 0 to the number of groups - 1.

        This is the enumerative complement of cumcount. Note that the
        numbers given to the groups match the order in which the groups
        would be seen when iterating over the groupby object, not the
        order they are first observed.

        Groups with missing keys (where `pd.isna()` is True) will be labeled with `NaN`
        and will be skipped from the count.

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from the number of groups - 1 down to 0.

        Returns
        -------
        Series
            Unique numbers for each group.

        See Also
        --------
        .cumcount : Number the rows in each group.

        Examples
        --------
        >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]})
        >>> df
           color
        0    red
        1   None
        2    red
        3   blue
        4   blue
        5    red
        >>> df.groupby("color").ngroup()
        0    1.0
        1    NaN
        2    1.0
        3    0.0
        4    0.0
        5    1.0
        dtype: float64
        >>> df.groupby("color", dropna=False).ngroup()
        0    1
        1    2
        2    1
        3    0
        4    0
        5    1
        dtype: int64
        >>> df.groupby("color", dropna=False).ngroup(ascending=False)
        0    1
        1    0
        2    1
        3    2
        4    2
        5    1
        dtype: int64
        """
        obj = self._obj_with_exclusions
        index = obj._get_axis(self.axis)
        comp_ids = self.grouper.group_info[0]

        dtype: type
        if self.grouper.has_dropped_na:
            comp_ids = np.where(comp_ids == -1, np.nan, comp_ids)
            dtype = np.float64
        else:
            dtype = np.int64

        if any(ping._passed_categorical for ping in self.grouper.groupings):
            # comp_ids reflect non-observed groups, we need only observed
            comp_ids = rank_1d(comp_ids, ties_method="dense") - 1

        result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
        if not ascending:
            result = self.ngroups - 1 - result
        return result

    @final
    @Substitution(name="groupby")
    def cumcount(self, ascending: bool = True):
        """
        Number each item in each group from 0 to the length of that group - 1.

        Essentially this is equivalent to

        .. code-block:: python

            self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))

        Parameters
        ----------
        ascending : bool, default True
            If False, number in reverse, from length of group - 1 to 0.

        Returns
        -------
        Series
            Sequence number of each element within each group.

        See Also
        --------
        .ngroup : Number the groups themselves.

        Examples
        --------
        >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
        ...                   columns=['A'])
        >>> df
           A
        0  a
        1  a
        2  a
        3  b
        4  b
        5  a
        >>> df.groupby('A').cumcount()
        0    0
        1    1
        2    2
        3    0
        4    1
        5    3
        dtype: int64
        >>> df.groupby('A').cumcount(ascending=False)
        0    3
        1    2
        2    1
        3    1
        4    0
        5    0
        dtype: int64
        """
        index = self._obj_with_exclusions._get_axis(self.axis)
        cumcounts = self._cumcount_array(ascending=ascending)
        return self._obj_1d_constructor(cumcounts, index)

    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def rank(
        self,
        method: str = "average",
        ascending: bool = True,
        na_option: str = "keep",
        pct: bool = False,
        axis: AxisInt = 0,
    ) -> NDFrameT:
        """
        Provide the rank of values within each group.

        Parameters
        ----------
        method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
            * average: average rank of group.
            * min: lowest rank in group.
            * max: highest rank in group.
            * first: ranks assigned in order they appear in the array.
            * dense: like 'min', but rank always increases by 1 between groups.
        ascending : bool, default True
            False for ranks by high (1) to low (N).
        na_option : {'keep', 'top', 'bottom'}, default 'keep'
            * keep: leave NA values where they are.
            * top: smallest rank if ascending.
            * bottom: smallest rank if descending.
        pct : bool, default False
            Compute percentage rank of data within each group.
        axis : int, default 0
            The axis of the object over which to compute the rank.

        Returns
        -------
        DataFrame with ranking of values within each group
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {
        ...         "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"],
        ...         "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5],
        ...     }
        ... )
        >>> df
          group  value
        0     a      2
        1     a      4
        2     a      2
        3     a      3
        4     a      5
        5     b      1
        6     b      2
        7     b      4
        8     b      1
        9     b      5
        >>> for method in ['average', 'min', 'max', 'dense', 'first']:
        ...     df[f'{method}_rank'] = df.groupby('group')['value'].rank(method)
        >>> df
          group  value  average_rank  min_rank  max_rank  dense_rank  first_rank
        0     a      2           1.5       1.0       2.0         1.0         1.0
        1     a      4           4.0       4.0       4.0         3.0         4.0
        2     a      2           1.5       1.0       2.0         1.0         2.0
        3     a      3           3.0       3.0       3.0         2.0         3.0
        4     a      5           5.0       5.0       5.0         4.0         5.0
        5     b      1           1.5       1.0       2.0         1.0         1.0
        6     b      2           3.0       3.0       3.0         2.0         3.0
        7     b      4           4.0       4.0       4.0         3.0         4.0
        8     b      1           1.5       1.0       2.0         1.0         2.0
        9     b      5           5.0       5.0       5.0         4.0         5.0
        """
        if na_option not in {"keep", "top", "bottom"}:
            msg = "na_option must be one of 'keep', 'top', or 'bottom'"
            raise ValueError(msg)

        kwargs = {
            "ties_method": method,
            "ascending": ascending,
            "na_option": na_option,
            "pct": pct,
        }
        if axis != 0:
            # DataFrame uses different keyword name
            kwargs["method"] = kwargs.pop("ties_method")
            f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs)
            result = self._python_apply_general(
                f, self._selected_obj, is_transform=True
            )
            return result

        return self._cython_transform(
            "rank",
            numeric_only=False,
            axis=axis,
            **kwargs,
        )

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def cumprod(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT:
        """
        Cumulative product for each group.

        Returns
        -------
        Series or DataFrame
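
        Examples
        --------
        A minimal illustration of the within-group accumulation (values are
        arbitrary):

        >>> ser = pd.Series([6, 2, 0], index=['a', 'a', 'b'])
        >>> ser.groupby(level=0).cumprod()
        a     6
        a    12
        b     0
        dtype: int64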
  3004. """
  3005. nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
  3006. if axis != 0:
  3007. f = lambda x: x.cumprod(axis=axis, **kwargs)
  3008. return self._python_apply_general(f, self._selected_obj, is_transform=True)
  3009. return self._cython_transform("cumprod", **kwargs)

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def cumsum(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT:
        """
        Cumulative sum for each group.

        Returns
        -------
        Series or DataFrame
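
        Examples
        --------
        A minimal illustration of the within-group accumulation (values are
        arbitrary):

        >>> ser = pd.Series([6, 2, 0], index=['a', 'a', 'b'])
        >>> ser.groupby(level=0).cumsum()
        a    6
        a    8
        b    0
        dtype: int64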
  3019. """
  3020. nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
  3021. if axis != 0:
  3022. f = lambda x: x.cumsum(axis=axis, **kwargs)
  3023. return self._python_apply_general(f, self._selected_obj, is_transform=True)
  3024. return self._cython_transform("cumsum", **kwargs)

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def cummin(
        self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs
    ) -> NDFrameT:
        """
        Cumulative min for each group.

        Returns
        -------
        Series or DataFrame
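
        Examples
        --------
        A minimal illustration; the running minimum restarts in each group:

        >>> ser = pd.Series([1, 6, 2, 3, 0, 4], index=['a', 'a', 'a', 'b', 'b', 'b'])
        >>> ser.groupby(level=0).cummin()
        a    1
        a    1
        a    1
        b    3
        b    0
        b    0
        dtype: int64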
  3036. """
  3037. skipna = kwargs.get("skipna", True)
  3038. if axis != 0:
  3039. f = lambda x: np.minimum.accumulate(x, axis)
  3040. obj = self._selected_obj
  3041. if numeric_only:
  3042. obj = obj._get_numeric_data()
  3043. return self._python_apply_general(f, obj, is_transform=True)
  3044. return self._cython_transform(
  3045. "cummin", numeric_only=numeric_only, skipna=skipna
  3046. )

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def cummax(
        self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs
    ) -> NDFrameT:
        """
        Cumulative max for each group.

        Returns
        -------
        Series or DataFrame
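
        Examples
        --------
        A minimal illustration; the running maximum restarts in each group:

        >>> ser = pd.Series([1, 6, 2, 3, 1, 4], index=['a', 'a', 'a', 'b', 'b', 'b'])
        >>> ser.groupby(level=0).cummax()
        a    1
        a    6
        a    6
        b    3
        b    3
        b    4
        dtype: int64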
  3058. """
  3059. skipna = kwargs.get("skipna", True)
  3060. if axis != 0:
  3061. f = lambda x: np.maximum.accumulate(x, axis)
  3062. obj = self._selected_obj
  3063. if numeric_only:
  3064. obj = obj._get_numeric_data()
  3065. return self._python_apply_general(f, obj, is_transform=True)
  3066. return self._cython_transform(
  3067. "cummax", numeric_only=numeric_only, skipna=skipna
  3068. )

    @final
    def _get_cythonized_result(
        self,
        base_func: Callable,
        cython_dtype: np.dtype,
        numeric_only: bool = False,
        needs_counts: bool = False,
        pre_processing=None,
        post_processing=None,
        how: str = "any_all",
        **kwargs,
    ):
        """
        Get result for Cythonized functions.

        Parameters
        ----------
        base_func : callable, Cythonized function to be called
        cython_dtype : np.dtype
            Type of the array that will be modified by the Cython call.
        numeric_only : bool, default False
            Whether only numeric datatypes should be computed
        needs_counts : bool, default False
            Whether the counts should be a part of the Cython call
        pre_processing : function, default None
            Function to be applied to `values` prior to passing to Cython.
            Function should return a tuple where the first element is the
            values to be passed to Cython and the second element is an optional
            type which the values should be converted to after being returned
            by the Cython operation. This function is also responsible for
            raising a TypeError if the values have an invalid type. Raises
            if `needs_values` is False.
        post_processing : function, default None
            Function to be applied to result of Cython function. Should accept
            an array of values as the first argument and type inferences as its
            second argument, i.e. the signature should be
            (ndarray, Type). If `needs_nullable=True`, a third argument should be
            `nullable`, to allow for processing specific to nullable values.
        how : str, default any_all
            Determines if any/all cython interface or std interface is used.
        **kwargs : dict
            Extra arguments to be passed back to Cython funcs

        Returns
        -------
        `Series` or `DataFrame` with filled values
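
        Notes
        -----
        A minimal sketch of the hook signatures this method expects
        (illustrative only; the real hooks are defined at the call sites)::

            def pre_processing(values):
                # hand Cython a float ndarray and remember the dtype
                # to restore afterwards
                return np.asarray(values, dtype=np.float64), values.dtype

            def post_processing(result, inference, nullable=False):
                # cast back to the remembered dtype once Cython is done
                return result.astype(inference) if inference else result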
  3113. """
  3114. if post_processing and not callable(post_processing):
  3115. raise ValueError("'post_processing' must be a callable!")
  3116. if pre_processing and not callable(pre_processing):
  3117. raise ValueError("'pre_processing' must be a callable!")
  3118. grouper = self.grouper
  3119. ids, _, ngroups = grouper.group_info
  3120. base_func = partial(base_func, labels=ids)
  3121. def blk_func(values: ArrayLike) -> ArrayLike:
  3122. values = values.T
  3123. ncols = 1 if values.ndim == 1 else values.shape[1]
  3124. result: ArrayLike
  3125. result = np.zeros(ngroups * ncols, dtype=cython_dtype)
  3126. result = result.reshape((ngroups, ncols))
  3127. func = partial(base_func, out=result)
  3128. inferences = None
  3129. if needs_counts:
  3130. counts = np.zeros(ngroups, dtype=np.int64)
  3131. func = partial(func, counts=counts)
  3132. is_datetimelike = values.dtype.kind in ["m", "M"]
  3133. vals = values
  3134. if is_datetimelike and how == "std":
  3135. vals = vals.view("i8")
  3136. if pre_processing:
  3137. vals, inferences = pre_processing(vals)
  3138. vals = vals.astype(cython_dtype, copy=False)
  3139. if vals.ndim == 1:
  3140. vals = vals.reshape((-1, 1))
  3141. func = partial(func, values=vals)
  3142. if how != "std" or isinstance(values, BaseMaskedArray):
  3143. mask = isna(values).view(np.uint8)
  3144. if mask.ndim == 1:
  3145. mask = mask.reshape(-1, 1)
  3146. func = partial(func, mask=mask)
  3147. if how != "std":
  3148. is_nullable = isinstance(values, BaseMaskedArray)
  3149. func = partial(func, nullable=is_nullable)
  3150. elif isinstance(values, BaseMaskedArray):
  3151. result_mask = np.zeros(result.shape, dtype=np.bool_)
  3152. func = partial(func, result_mask=result_mask)
  3153. # Call func to modify result in place
  3154. if how == "std":
  3155. func(**kwargs, is_datetimelike=is_datetimelike)
  3156. else:
  3157. func(**kwargs)
  3158. if values.ndim == 1:
  3159. assert result.shape[1] == 1, result.shape
  3160. result = result[:, 0]
  3161. if post_processing:
  3162. pp_kwargs: dict[str, bool | np.ndarray] = {}
  3163. pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray)
  3164. if how == "std" and pp_kwargs["nullable"]:
  3165. pp_kwargs["result_mask"] = result_mask
  3166. result = post_processing(result, inferences, **pp_kwargs)
  3167. if how == "std" and is_datetimelike:
  3168. values = cast("DatetimeArray | TimedeltaArray", values)
  3169. unit = values.unit
  3170. with warnings.catch_warnings():
  3171. # suppress "RuntimeWarning: invalid value encountered in cast"
  3172. warnings.filterwarnings("ignore")
  3173. result = result.astype(np.int64, copy=False)
  3174. result = result.view(f"m8[{unit}]")
  3175. return result.T
  3176. # Operate block-wise instead of column-by-column
  3177. mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
  3178. res_mgr = mgr.grouped_reduce(blk_func)
  3179. out = self._wrap_agged_manager(res_mgr)
  3180. return self._wrap_aggregated_output(out)

    @final
    @Substitution(name="groupby")
    def shift(self, periods: int = 1, freq=None, axis: Axis = 0, fill_value=None):
        """
        Shift each group by periods observations.

        If freq is passed, the index will be increased using the periods and the freq.

        Parameters
        ----------
        periods : int, default 1
            Number of periods to shift.
        freq : str, optional
            Frequency string.
        axis : axis to shift, default 0
            Shift direction.
        fill_value : optional
            The scalar value to use for newly introduced missing values.

        Returns
        -------
        Series or DataFrame
            Object shifted within each group.

        See Also
        --------
        Index.shift : Shift values of Index.
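
        Examples
        --------
        A minimal illustration of the default ``axis=0`` path; note that the
        grouping column is excluded from the result:

        >>> df = pd.DataFrame({'A': ['a', 'a', 'b', 'b'], 'B': [1, 2, 3, 4]})
        >>> df.groupby('A').shift(1)
             B
        0  NaN
        1  1.0
        2  NaN
        3  3.0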
  3204. """
  3205. if freq is not None or axis != 0:
  3206. f = lambda x: x.shift(periods, freq, axis, fill_value)
  3207. return self._python_apply_general(f, self._selected_obj, is_transform=True)
  3208. ids, _, ngroups = self.grouper.group_info
  3209. res_indexer = np.zeros(len(ids), dtype=np.int64)
  3210. libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods)
  3211. obj = self._obj_with_exclusions
  3212. res = obj._reindex_with_indexers(
  3213. {self.axis: (obj.axes[self.axis], res_indexer)},
  3214. fill_value=fill_value,
  3215. allow_dups=True,
  3216. )
  3217. return res

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def diff(self, periods: int = 1, axis: AxisInt = 0) -> NDFrameT:
        """
        First discrete difference of element.

        Calculates the difference of each element compared with another
        element in the group (default is element in previous row).

        Parameters
        ----------
        periods : int, default 1
            Periods to shift for calculating difference, accepts negative values.
        axis : axis to shift, default 0
            Take difference over rows (0) or columns (1).

        Returns
        -------
        Series or DataFrame
            First differences.
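
        Examples
        --------
        A minimal illustration; the first row of each group has no previous
        row to difference against:

        >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=['a', 'a', 'a', 'b', 'b', 'b'])
        >>> ser.groupby(level=0).diff()
        a    NaN
        a   -5.0
        a    6.0
        b    NaN
        b   -1.0
        b    0.0
        dtype: float64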
  3236. """
  3237. if axis != 0:
  3238. return self.apply(lambda x: x.diff(periods=periods, axis=axis))
  3239. obj = self._obj_with_exclusions
  3240. shifted = self.shift(periods=periods, axis=axis)
  3241. # GH45562 - to retain existing behavior and match behavior of Series.diff(),
  3242. # int8 and int16 are coerced to float32 rather than float64.
  3243. dtypes_to_f32 = ["int8", "int16"]
  3244. if obj.ndim == 1:
  3245. if obj.dtype in dtypes_to_f32:
  3246. shifted = shifted.astype("float32")
  3247. else:
  3248. to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32]
  3249. if len(to_coerce):
  3250. shifted = shifted.astype({c: "float32" for c in to_coerce})
  3251. return obj - shifted

    @final
    @Substitution(name="groupby")
    @Appender(_common_see_also)
    def pct_change(
        self,
        periods: int = 1,
        fill_method: FillnaOptions = "ffill",
        limit=None,
        freq=None,
        axis: Axis = 0,
    ):
        """
        Calculate pct_change of each value to previous entry in group.

        Returns
        -------
        Series or DataFrame
            Percentage changes within each group.
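
        Examples
        --------
        A minimal illustration; each value is compared to the previous entry
        within its own group:

        >>> ser = pd.Series([1, 2, 4, 8], index=['a', 'a', 'b', 'b'])
        >>> ser.groupby(level=0).pct_change()
        a    NaN
        a    1.0
        b    NaN
        b    1.0
        dtype: float64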
  3269. """
  3270. # TODO(GH#23918): Remove this conditional for SeriesGroupBy when
  3271. # GH#23918 is fixed
  3272. if freq is not None or axis != 0:
  3273. f = lambda x: x.pct_change(
  3274. periods=periods,
  3275. fill_method=fill_method,
  3276. limit=limit,
  3277. freq=freq,
  3278. axis=axis,
  3279. )
  3280. return self._python_apply_general(f, self._selected_obj, is_transform=True)
  3281. if fill_method is None: # GH30463
  3282. fill_method = "ffill"
  3283. limit = 0
  3284. filled = getattr(self, fill_method)(limit=limit)
  3285. fill_grp = filled.groupby(
  3286. self.grouper.codes, axis=self.axis, group_keys=self.group_keys
  3287. )
  3288. shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis)
  3289. return (filled / shifted) - 1

    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def head(self, n: int = 5) -> NDFrameT:
        """
        Return first n rows of each group.

        Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows
        from the original DataFrame with original index and order preserved
        (``as_index`` flag is ignored).

        Parameters
        ----------
        n : int
            If positive: number of entries to include from start of each group.
            If negative: number of entries to exclude from end of each group.

        Returns
        -------
        Series or DataFrame
            Subset of original Series or DataFrame as determined by n.
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
        ...                   columns=['A', 'B'])
        >>> df.groupby('A').head(1)
           A  B
        0  1  2
        2  5  6
        >>> df.groupby('A').head(-1)
           A  B
        0  1  2
        """
        mask = self._make_mask_from_positional_indexer(slice(None, n))
        return self._mask_selected_obj(mask)

    @final
    @Substitution(name="groupby")
    @Substitution(see_also=_common_see_also)
    def tail(self, n: int = 5) -> NDFrameT:
        """
        Return last n rows of each group.

        Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows
        from the original DataFrame with original index and order preserved
        (``as_index`` flag is ignored).

        Parameters
        ----------
        n : int
            If positive: number of entries to include from end of each group.
            If negative: number of entries to exclude from start of each group.

        Returns
        -------
        Series or DataFrame
            Subset of original Series or DataFrame as determined by n.
        %(see_also)s
        Examples
        --------
        >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
        ...                   columns=['A', 'B'])
        >>> df.groupby('A').tail(1)
           A  B
        1  a  2
        3  b  2
        >>> df.groupby('A').tail(-1)
           A  B
        1  a  2
        3  b  2
        """
        if n:
            mask = self._make_mask_from_positional_indexer(slice(-n, None))
        else:
            mask = self._make_mask_from_positional_indexer([])

        return self._mask_selected_obj(mask)

    @final
    def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT:
        """
        Return _selected_obj with mask applied to the correct axis.

        Parameters
        ----------
        mask : np.ndarray[bool]
            Boolean mask to apply.

        Returns
        -------
        Series or DataFrame
            Filtered _selected_obj.
        """
        ids = self.grouper.group_info[0]
        mask = mask & (ids != -1)

        if self.axis == 0:
            return self._selected_obj[mask]
        else:
            return self._selected_obj.iloc[:, mask]

    @final
    def _reindex_output(
        self,
        output: OutputFrameOrSeries,
        fill_value: Scalar = np.NaN,
        qs: npt.NDArray[np.float64] | None = None,
    ) -> OutputFrameOrSeries:
        """
        If we have categorical groupers, then we might want to make sure that
        we have a fully re-indexed output to the levels. This means expanding
        the output space to accommodate all values in the cartesian product of
        our groups, regardless of whether they were observed in the data or
        not. This will expand the output space if there are missing groups.

        The method returns early without modifying the input if the number of
        groupings is less than 2, self.observed == True or none of the groupers
        are categorical.

        Parameters
        ----------
        output : Series or DataFrame
            Object resulting from grouping and applying an operation.
        fill_value : scalar, default np.NaN
            Value to use for unobserved categories if self.observed is False.
        qs : np.ndarray[float64] or None, default None
            quantile values, only relevant for quantile.

        Returns
        -------
        Series or DataFrame
            Object (potentially) re-indexed to include all possible groups.
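
        Notes
        -----
        The effect is observable through the public API when grouping by
        multiple keys with a categorical grouper and ``observed=False``;
        a minimal sketch (illustrative only)::

            df = pd.DataFrame(
                {
                    "cat": pd.Categorical(["a", "a"], categories=["a", "b"]),
                    "key": [1, 2],
                }
            )
            # the unobserved combinations ("b", 1) and ("b", 2) appear in
            # the result with size 0
            df.groupby(["cat", "key"], observed=False).size()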
  3407. """
  3408. groupings = self.grouper.groupings
  3409. if len(groupings) == 1:
  3410. return output
  3411. # if we only care about the observed values
  3412. # we are done
  3413. elif self.observed:
  3414. return output
  3415. # reindexing only applies to a Categorical grouper
  3416. elif not any(
  3417. isinstance(ping.grouping_vector, (Categorical, CategoricalIndex))
  3418. for ping in groupings
  3419. ):
  3420. return output
  3421. levels_list = [ping.group_index for ping in groupings]
  3422. names = self.grouper.names
  3423. if qs is not None:
  3424. # error: Argument 1 to "append" of "list" has incompatible type
  3425. # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index"
  3426. levels_list.append(qs) # type: ignore[arg-type]
  3427. names = names + [None]
  3428. index = MultiIndex.from_product(levels_list, names=names)
  3429. if self.sort:
  3430. index = index.sort_values()
  3431. if self.as_index:
  3432. # Always holds for SeriesGroupBy unless GH#36507 is implemented
  3433. d = {
  3434. self.obj._get_axis_name(self.axis): index,
  3435. "copy": False,
  3436. "fill_value": fill_value,
  3437. }
  3438. return output.reindex(**d) # type: ignore[arg-type]
  3439. # GH 13204
  3440. # Here, the categorical in-axis groupers, which need to be fully
  3441. # expanded, are columns in `output`. An idea is to do:
  3442. # output = output.set_index(self.grouper.names)
  3443. # .reindex(index).reset_index()
  3444. # but special care has to be taken because of possible not-in-axis
  3445. # groupers.
  3446. # So, we manually select and drop the in-axis grouper columns,
  3447. # reindex `output`, and then reset the in-axis grouper columns.
  3448. # Select in-axis groupers
  3449. in_axis_grps = list(
  3450. (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
  3451. )
  3452. if len(in_axis_grps) > 0:
  3453. g_nums, g_names = zip(*in_axis_grps)
  3454. output = output.drop(labels=list(g_names), axis=1)
  3455. # Set a temp index and reindex (possibly expanding)
  3456. output = output.set_index(self.grouper.result_index).reindex(
  3457. index, copy=False, fill_value=fill_value
  3458. )
  3459. # Reset in-axis grouper columns
  3460. # (using level numbers `g_nums` because level names may not be unique)
  3461. if len(in_axis_grps) > 0:
  3462. output = output.reset_index(level=g_nums)
  3463. return output.reset_index(drop=True)

    @final
    def sample(
        self,
        n: int | None = None,
        frac: float | None = None,
        replace: bool = False,
        weights: Sequence | Series | None = None,
        random_state: RandomState | None = None,
    ):
        """
        Return a random sample of items from each group.

        You can use `random_state` for reproducibility.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        n : int, optional
            Number of items to return for each group. Cannot be used with
            `frac` and must be no larger than the smallest group unless
            `replace` is True. Default is one if `frac` is None.
        frac : float, optional
            Fraction of items to return. Cannot be used with `n`.
        replace : bool, default False
            Allow or disallow sampling of the same row more than once.
        weights : list-like, optional
            Default None results in equal probability weighting.
            If passed a list-like then values must have the same length as
            the underlying DataFrame or Series object and will be used as
            sampling probabilities after normalization within each group.
            Values must be non-negative with at least one positive element
            within each group.
        random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
            If int, array-like, or BitGenerator, seed for random number generator.
            If np.random.RandomState or np.random.Generator, use as given.

            .. versionchanged:: 1.4.0

                np.random.Generator objects now accepted

        Returns
        -------
        Series or DataFrame
            A new object of same type as caller containing items randomly
            sampled within each group from the caller object.

        See Also
        --------
        DataFrame.sample: Generate random samples from a DataFrame object.
        numpy.random.choice: Generate a random sample from a given 1-D numpy
            array.

        Examples
        --------
        >>> df = pd.DataFrame(
        ...     {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)}
        ... )
        >>> df
               a  b
        0    red  0
        1    red  1
        2   blue  2
        3   blue  3
        4  black  4
        5  black  5

        Select one row at random for each distinct value in column a. The
        `random_state` argument can be used to guarantee reproducibility:

        >>> df.groupby("a").sample(n=1, random_state=1)
               a  b
        4  black  4
        2   blue  2
        1    red  1

        Set `frac` to sample fixed proportions rather than counts:

        >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2)
        5    5
        2    2
        0    0
        Name: b, dtype: int64

        Control sample probabilities within groups by setting weights:

        >>> df.groupby("a").sample(
        ...     n=1,
        ...     weights=[1, 1, 1, 0, 0, 1],
        ...     random_state=1,
        ... )
               a  b
        5  black  5
        2   blue  2
        0    red  0
        """  # noqa:E501
        if self._selected_obj.empty:
            # GH48459 prevent ValueError when object is empty
            return self._selected_obj
        size = sample.process_sampling_size(n, frac, replace)
        if weights is not None:
            weights_arr = sample.preprocess_weights(
                self._selected_obj, weights, axis=self.axis
            )

        random_state = com.random_state(random_state)

        group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)

        sampled_indices = []
        for labels, obj in group_iterator:
            grp_indices = self.indices[labels]
            group_size = len(grp_indices)
            if size is not None:
                sample_size = size
            else:
                assert frac is not None
                sample_size = round(frac * group_size)

            grp_sample = sample.sample(
                group_size,
                size=sample_size,
                replace=replace,
                weights=None if weights is None else weights_arr[grp_indices],
                random_state=random_state,
            )
            sampled_indices.append(grp_indices[grp_sample])

        sampled_indices = np.concatenate(sampled_indices)
        return self._selected_obj.take(sampled_indices, axis=self.axis)


@doc(GroupBy)
def get_groupby(
    obj: NDFrame,
    by: _KeysArgType | None = None,
    axis: AxisInt = 0,
    grouper: ops.BaseGrouper | None = None,
    group_keys: bool = True,
) -> GroupBy:
    klass: type[GroupBy]
    if isinstance(obj, Series):
        from pandas.core.groupby.generic import SeriesGroupBy

        klass = SeriesGroupBy
    elif isinstance(obj, DataFrame):
        from pandas.core.groupby.generic import DataFrameGroupBy

        klass = DataFrameGroupBy
    else:  # pragma: no cover
        raise TypeError(f"invalid type: {obj}")

    return klass(
        obj=obj,
        keys=by,
        axis=axis,
        grouper=grouper,
        group_keys=group_keys,
    )


def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex:
    """
    Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex.

    The quantile level in the MultiIndex is a repeated copy of 'qs'.

    Parameters
    ----------
    idx : Index
    qs : np.ndarray[float64]

    Returns
    -------
    MultiIndex
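
    Notes
    -----
    A sketch of the shape of the result (illustrative only)::

        _insert_quantile_level(Index(["a", "b"]), np.array([0.25, 0.75]))
        # -> MultiIndex [("a", 0.25), ("a", 0.75), ("b", 0.25), ("b", 0.75)]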
  3610. """
  3611. nqs = len(qs)
  3612. if idx._is_multi:
  3613. idx = cast(MultiIndex, idx)
  3614. lev_codes, lev = Index(qs).factorize()
  3615. levels = list(idx.levels) + [lev]
  3616. codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))]
  3617. mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None])
  3618. else:
  3619. mi = MultiIndex.from_product([idx, qs])
  3620. return mi