- """
- Provide the groupby split-apply-combine paradigm. Define the GroupBy
- class providing the base-class of operations.
- The SeriesGroupBy and DataFrameGroupBy subclasses
- (defined in pandas.core.groupby.generic)
- expose these user-facing objects to provide specific functionality.
- """
- from __future__ import annotations
- import datetime
- from functools import (
- partial,
- wraps,
- )
- import inspect
- from textwrap import dedent
- from typing import (
- TYPE_CHECKING,
- Callable,
- Hashable,
- Iterable,
- Iterator,
- List,
- Literal,
- Mapping,
- Sequence,
- TypeVar,
- Union,
- cast,
- final,
- )
- import warnings
- import numpy as np
- from pandas._config.config import option_context
- from pandas._libs import (
- Timestamp,
- lib,
- )
- from pandas._libs.algos import rank_1d
- import pandas._libs.groupby as libgroupby
- from pandas._libs.missing import NA
- from pandas._typing import (
- AnyArrayLike,
- ArrayLike,
- Axis,
- AxisInt,
- DtypeObj,
- FillnaOptions,
- IndexLabel,
- NDFrameT,
- PositionalIndexer,
- RandomState,
- Scalar,
- T,
- npt,
- )
- from pandas.compat.numpy import function as nv
- from pandas.errors import (
- AbstractMethodError,
- DataError,
- )
- from pandas.util._decorators import (
- Appender,
- Substitution,
- cache_readonly,
- doc,
- )
- from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
- from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_float_dtype,
- is_hashable,
- is_integer,
- is_integer_dtype,
- is_numeric_dtype,
- is_object_dtype,
- is_scalar,
- needs_i8_conversion,
- )
- from pandas.core.dtypes.missing import (
- isna,
- notna,
- )
- from pandas.core import (
- algorithms,
- sample,
- )
- from pandas.core._numba import executor
- from pandas.core.arrays import (
- BaseMaskedArray,
- BooleanArray,
- Categorical,
- DatetimeArray,
- ExtensionArray,
- FloatingArray,
- TimedeltaArray,
- )
- from pandas.core.base import (
- PandasObject,
- SelectionMixin,
- )
- import pandas.core.common as com
- from pandas.core.frame import DataFrame
- from pandas.core.generic import NDFrame
- from pandas.core.groupby import (
- base,
- numba_,
- ops,
- )
- from pandas.core.groupby.grouper import get_grouper
- from pandas.core.groupby.indexing import (
- GroupByIndexingMixin,
- GroupByNthSelector,
- )
- from pandas.core.indexes.api import (
- CategoricalIndex,
- Index,
- MultiIndex,
- RangeIndex,
- default_index,
- )
- from pandas.core.internals.blocks import ensure_block_shape
- from pandas.core.series import Series
- from pandas.core.sorting import get_group_index_sorter
- from pandas.core.util.numba_ import (
- get_jit_arguments,
- maybe_use_numba,
- )
- if TYPE_CHECKING:
- from pandas.core.window import (
- ExpandingGroupby,
- ExponentialMovingWindowGroupby,
- RollingGroupby,
- )
- _common_see_also = """
- See Also
- --------
- Series.%(name)s : Apply a function %(name)s to a Series.
- DataFrame.%(name)s : Apply a function %(name)s
- to each row or column of a DataFrame.
- """
- _apply_docs = {
- "template": """
- Apply function ``func`` group-wise and combine the results together.
- The function passed to ``apply`` must take a {input} as its first
- argument and return a DataFrame, Series or scalar. ``apply`` will
- then take care of combining the results back together into a single
- dataframe or series. ``apply`` is therefore a highly flexible
- grouping method.
- While ``apply`` is a very flexible method, its downside is that
- using it can be quite a bit slower than using more specific methods
- like ``agg`` or ``transform``. pandas offers a wide range of methods that will
- be much faster than using ``apply`` for their specific purposes, so try to
- use them before reaching for ``apply``.
- Parameters
- ----------
- func : callable
- A callable that takes a {input} as its first argument, and
- returns a dataframe, a series or a scalar. In addition the
- callable may take positional and keyword arguments.
- args, kwargs : tuple and dict
- Optional positional and keyword arguments to pass to ``func``.
- Returns
- -------
- Series or DataFrame
- See Also
- --------
- pipe : Apply function to the full GroupBy object instead of to each
- group.
- aggregate : Apply aggregate function to the GroupBy object.
- transform : Apply function column-by-column to the GroupBy object.
- Series.apply : Apply a function to a Series.
- DataFrame.apply : Apply a function to each row or column of a DataFrame.
- Notes
- -----
- .. versionchanged:: 1.3.0
- The resulting dtype will reflect the return value of the passed ``func``,
- see the examples below.
- Functions that mutate the passed object can produce unexpected
- behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
- for more details.
- Examples
- --------
- {examples}
- """,
- "dataframe_examples": """
- >>> df = pd.DataFrame({'A': 'a a b'.split(),
- ... 'B': [1,2,3],
- ... 'C': [4,6,5]})
- >>> g1 = df.groupby('A', group_keys=False)
- >>> g2 = df.groupby('A', group_keys=True)
- Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
- differ in their ``group_keys`` argument. Calling `apply` in various ways,
- we can get different grouping results:
- Example 1: The function passed to `apply` takes a DataFrame as
- its argument and returns a DataFrame. `apply` combines the result for
- each group together into a new DataFrame:
- >>> g1[['B', 'C']].apply(lambda x: x / x.sum())
- B C
- 0 0.333333 0.4
- 1 0.666667 0.6
- 2 1.000000 1.0
- In the above, the groups are not part of the index. We can have them included
- by using ``g2`` where ``group_keys=True``:
- >>> g2[['B', 'C']].apply(lambda x: x / x.sum())
- B C
- A
- a 0 0.333333 0.4
- 1 0.666667 0.6
- b 2 1.000000 1.0
- Example 2: The function passed to `apply` takes a DataFrame as
- its argument and returns a Series. `apply` combines the result for
- each group together into a new DataFrame.
- .. versionchanged:: 1.3.0
- The resulting dtype will reflect the return value of the passed ``func``.
- >>> g1[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
- B C
- A
- a 1.0 2.0
- b 0.0 0.0
- >>> g2[['B', 'C']].apply(lambda x: x.astype(float).max() - x.min())
- B C
- A
- a 1.0 2.0
- b 0.0 0.0
- The ``group_keys`` argument has no effect here because the result is not
- like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
- to the input.
- Example 3: The function passed to `apply` takes a DataFrame as
- its argument and returns a scalar. `apply` combines the result for
- each group together into a Series, including setting the index as
- appropriate:
- >>> g1.apply(lambda x: x.C.max() - x.B.min())
- A
- a 5
- b 2
- dtype: int64""",
- "series_examples": """
- >>> s = pd.Series([0, 1, 2], index='a a b'.split())
- >>> g1 = s.groupby(s.index, group_keys=False)
- >>> g2 = s.groupby(s.index, group_keys=True)
- Notice that ``g1`` and ``g2`` have two groups, ``a`` and ``b``, and only
- differ in their ``group_keys`` argument. Calling `apply` in various ways,
- we can get different grouping results:
- Example 1: The function passed to `apply` takes a Series as
- its argument and returns a Series. `apply` combines the result for
- each group together into a new Series.
- .. versionchanged:: 1.3.0
- The resulting dtype will reflect the return value of the passed ``func``.
- >>> g1.apply(lambda x: x*2 if x.name == 'a' else x/2)
- a 0.0
- a 2.0
- b 1.0
- dtype: float64
- In the above, the groups are not part of the index. We can have them included
- by using ``g2`` where ``group_keys=True``:
- >>> g2.apply(lambda x: x*2 if x.name == 'a' else x/2)
- a a 0.0
- a 2.0
- b b 1.0
- dtype: float64
- Example 2: The function passed to `apply` takes a Series as
- its argument and returns a scalar. `apply` combines the result for
- each group together into a Series, including setting the index as
- appropriate:
- >>> g1.apply(lambda x: x.max() - x.min())
- a 1
- b 0
- dtype: int64
- The ``group_keys`` argument has no effect here because the result is not
- like-indexed (i.e. :ref:`a transform <groupby.transform>`) when compared
- to the input.
- >>> g2.apply(lambda x: x.max() - x.min())
- a 1
- b 0
- dtype: int64""",
- }
- _groupby_agg_method_template = """
- Compute {fname} of group values.
- Parameters
- ----------
- numeric_only : bool, default {no}
- Include only float, int, boolean columns.
- .. versionchanged:: 2.0.0
- numeric_only no longer accepts ``None``.
- min_count : int, default {mc}
- The required number of valid values to perform the operation. If fewer
- than ``min_count`` non-NA values are present the result will be NA.
- Returns
- -------
- Series or DataFrame
- Computed {fname} of values within each group.
- """
- _pipe_template = """
- Apply a ``func`` with arguments to this %(klass)s object and return its result.
- Use `.pipe` when you want to improve readability by chaining together
- functions that expect Series, DataFrames, GroupBy or Resampler objects.
- Instead of writing
- >>> h(g(f(df.groupby('group')), arg1=a), arg2=b, arg3=c) # doctest: +SKIP
- You can write
- >>> (df.groupby('group')
- ... .pipe(f)
- ... .pipe(g, arg1=a)
- ... .pipe(h, arg2=b, arg3=c)) # doctest: +SKIP
- which is much more readable.
- Parameters
- ----------
- func : callable or tuple of (callable, str)
- Function to apply to this %(klass)s object or, alternatively,
- a `(callable, data_keyword)` tuple where `data_keyword` is a
- string indicating the keyword of `callable` that expects the
- %(klass)s object.
- args : iterable, optional
- Positional arguments passed into `func`.
- kwargs : dict, optional
- A dictionary of keyword arguments passed into `func`.
- Returns
- -------
- The return type of ``func``.
- See Also
- --------
- Series.pipe : Apply a function with arguments to a Series.
- DataFrame.pipe : Apply a function with arguments to a DataFrame.
- apply : Apply function to each group instead of to the
- full %(klass)s object.
- Notes
- -----
- See more `here
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#piping-function-calls>`_
- Examples
- --------
- %(examples)s
- """
- _transform_template = """
- Call function producing a same-indexed %(klass)s on each group.
- Returns a %(klass)s having the same indexes as the original object
- filled with the transformed values.
- Parameters
- ----------
- f : function, str
- Function to apply to each group. See the Notes section below for requirements.
- Accepted inputs are:
- - String
- - Python function
- - Numba JIT function with ``engine='numba'`` specified.
- Only passing a single function is supported with this engine.
- If the ``'numba'`` engine is chosen, the function must be
- a user defined function with ``values`` and ``index`` as the
- first and second arguments respectively in the function signature.
- Each group's index will be passed to the user defined function
- and optionally available for use.
- If a string is chosen, then it needs to be the name
- of the groupby method you want to use.
- .. versionchanged:: 1.1.0
- *args
- Positional arguments to pass to func.
- engine : str, default None
- * ``'cython'`` : Runs the function through C-extensions from cython.
- * ``'numba'`` : Runs the function through JIT compiled code from numba.
- * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``
- .. versionadded:: 1.1.0
- engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{'nopython': True, 'nogil': False, 'parallel': False}`` and will be
- applied to the function
- .. versionadded:: 1.1.0
- **kwargs
- Keyword arguments to be passed into func.
- Returns
- -------
- %(klass)s
- See Also
- --------
- %(klass)s.groupby.apply : Apply function ``func`` group-wise and combine
- the results together.
- %(klass)s.groupby.aggregate : Aggregate using one or more
- operations over the specified axis.
- %(klass)s.transform : Call ``func`` on self producing a %(klass)s with the
- same axis shape as self.
- Notes
- -----
- Each group is endowed with the attribute 'name' in case you need to know
- which group you are working on.
- The current implementation imposes three requirements on f:
- * f must return a value that either has the same shape as the input
- subframe or can be broadcast to the shape of the input subframe.
- For example, if `f` returns a scalar it will be broadcast to have the
- same shape as the input subframe.
- * if this is a DataFrame, f must support application column-by-column
- in the subframe. If f also supports application to the entire subframe,
- then a fast path is used starting from the second chunk.
- * f must not mutate groups. Mutation is not supported and may
- produce unexpected results. See :ref:`gotchas.udf-mutation` for more details.
- When using ``engine='numba'``, there will be no "fall back" behavior internally.
- The group data and group index will be passed as numpy arrays to the JITed
- user defined function, and no alternative execution attempts will be tried.
- .. versionchanged:: 1.3.0
- The resulting dtype will reflect the return value of the passed ``func``,
- see the examples below.
- .. versionchanged:: 2.0.0
- When using ``.transform`` on a grouped DataFrame and the transformation function
- returns a DataFrame, pandas now aligns the result's index
- with the input's index. You can call ``.to_numpy()`` on the
- result of the transformation function to avoid alignment.
- Examples
- --------
- %(example)s"""
- _agg_template = """
- Aggregate using one or more operations over the specified axis.
- Parameters
- ----------
- func : function, str, list, dict or None
- Function to use for aggregating the data. If a function, must either
- work when passed a {klass} or when passed to {klass}.apply.
- Accepted combinations are:
- - function
- - string function name
- - list of functions and/or function names, e.g. ``[np.sum, 'mean']``
- - dict of axis labels -> functions, function names or list of such.
- - None, in which case ``**kwargs`` are used with Named Aggregation. Here the
- output has one column for each element in ``**kwargs``. The name of the
- column is keyword, whereas the value determines the aggregation used to compute
- the values in the column.
- Can also accept a Numba JIT function with
- ``engine='numba'`` specified. Only passing a single function is supported
- with this engine.
- If the ``'numba'`` engine is chosen, the function must be
- a user defined function with ``values`` and ``index`` as the
- first and second arguments respectively in the function signature.
- Each group's index will be passed to the user defined function
- and optionally available for use.
- .. versionchanged:: 1.1.0
- *args
- Positional arguments to pass to func.
- engine : str, default None
- * ``'cython'`` : Runs the function through C-extensions from cython.
- * ``'numba'`` : Runs the function through JIT compiled code from numba.
- * ``None`` : Defaults to ``'cython'`` or the global setting ``compute.use_numba``
- .. versionadded:: 1.1.0
- engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{{'nopython': True, 'nogil': False, 'parallel': False}}`` and will be
- applied to the function
- .. versionadded:: 1.1.0
- **kwargs
- * If ``func`` is None, ``**kwargs`` are used to define the output names and
- aggregations via Named Aggregation. See ``func`` entry.
- * Otherwise, keyword arguments to be passed into func.
- Returns
- -------
- {klass}
- See Also
- --------
- {klass}.groupby.apply : Apply function func group-wise
- and combine the results together.
- {klass}.groupby.transform : Transforms the Series on each group
- based on the given function.
- {klass}.aggregate : Aggregate using one or more
- operations over the specified axis.
- Notes
- -----
- When using ``engine='numba'``, there will be no "fall back" behavior internally.
- The group data and group index will be passed as numpy arrays to the JITed
- user defined function, and no alternative execution attempts will be tried.
- Functions that mutate the passed object can produce unexpected
- behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
- for more details.
- .. versionchanged:: 1.3.0
- The resulting dtype will reflect the return value of the passed ``func``,
- see the examples below.
- {examples}"""
- @final
- class GroupByPlot(PandasObject):
- """
- Class implementing the .plot attribute for groupby objects.
- """
- def __init__(self, groupby: GroupBy) -> None:
- self._groupby = groupby
- def __call__(self, *args, **kwargs):
- def f(self):
- return self.plot(*args, **kwargs)
- f.__name__ = "plot"
- return self._groupby.apply(f)
- def __getattr__(self, name: str):
- def attr(*args, **kwargs):
- def f(self):
- return getattr(self.plot, name)(*args, **kwargs)
- return self._groupby.apply(f)
- return attr
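- # Minimal usage sketch for GroupByPlot (illustrative; assumes matplotlib is
- # installed). Attribute access such as ``.line`` is wrapped in a function by
- # ``__getattr__`` and dispatched through ``apply``, so each group is plotted
- # separately:
- #
- #     >>> df = pd.DataFrame({"A": ["a", "a", "b"], "B": [1, 2, 3]})
- #     >>> df.groupby("A").plot.line()  # doctest: +SKIP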
- _KeysArgType = Union[
- Hashable,
- List[Hashable],
- Callable[[Hashable], Hashable],
- List[Callable[[Hashable], Hashable]],
- Mapping[Hashable, Hashable],
- ]
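- # Each member of _KeysArgType corresponds to one supported form of ``keys``
- # (illustrative sketch only, ``df`` hypothetical):
- #
- #     df.groupby("A")                       # Hashable
- #     df.groupby(["A", "B"])                # list of Hashables
- #     df.groupby(len)                       # callable applied to index labels
- #     df.groupby({"x": "g1", "y": "g2"})    # mapping of index labels to groups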
- class BaseGroupBy(PandasObject, SelectionMixin[NDFrameT], GroupByIndexingMixin):
- _hidden_attrs = PandasObject._hidden_attrs | {
- "as_index",
- "axis",
- "dropna",
- "exclusions",
- "grouper",
- "group_keys",
- "keys",
- "level",
- "obj",
- "observed",
- "sort",
- }
- axis: AxisInt
- grouper: ops.BaseGrouper
- keys: _KeysArgType | None = None
- level: IndexLabel | None = None
- group_keys: bool
- @final
- def __len__(self) -> int:
- return len(self.groups)
- @final
- def __repr__(self) -> str:
- # TODO: Better repr for GroupBy object
- return object.__repr__(self)
- @final
- @property
- def groups(self) -> dict[Hashable, np.ndarray]:
- """
- Dict {group name -> group labels}.
- """
- return self.grouper.groups
- @final
- @property
- def ngroups(self) -> int:
- return self.grouper.ngroups
- @final
- @property
- def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
- """
- Dict {group name -> group indices}.
- """
- return self.grouper.indices
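- # Sketch contrasting ``groups`` and ``indices`` (toy data, illustrative):
- #
- #     >>> df = pd.DataFrame({"A": ["a", "b", "a"]}, index=["x", "y", "z"])
- #     >>> df.groupby("A").groups   # doctest: +SKIP
- #     {'a': ['x', 'z'], 'b': ['y']}           # index *labels* per group
- #     >>> df.groupby("A").indices  # doctest: +SKIP
- #     {'a': array([0, 2]), 'b': array([1])}   # integer *positions* per group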
- @final
- def _get_indices(self, names):
- """
- Safely get multiple indices, translating datelike keys to their
- underlying representation.
- """
- def get_converter(s):
- # possibly convert to the actual key types
- # in the indices, could be a Timestamp or a np.datetime64
- if isinstance(s, datetime.datetime):
- return lambda key: Timestamp(key)
- elif isinstance(s, np.datetime64):
- return lambda key: Timestamp(key).asm8
- else:
- return lambda key: key
- if len(names) == 0:
- return []
- if len(self.indices) > 0:
- index_sample = next(iter(self.indices))
- else:
- index_sample = None # Dummy sample
- name_sample = names[0]
- if isinstance(index_sample, tuple):
- if not isinstance(name_sample, tuple):
- msg = "must supply a tuple to get_group with multiple grouping keys"
- raise ValueError(msg)
- if len(name_sample) != len(index_sample):
- try:
- # If the original grouper was a tuple
- return [self.indices[name] for name in names]
- except KeyError as err:
- # turns out it wasn't a tuple
- msg = (
- "must supply a same-length tuple to get_group "
- "with multiple grouping keys"
- )
- raise ValueError(msg) from err
- converters = [get_converter(s) for s in index_sample]
- names = (tuple(f(n) for f, n in zip(converters, name)) for name in names)
- else:
- converter = get_converter(index_sample)
- names = (converter(name) for name in names)
- return [self.indices.get(name, []) for name in names]
- @final
- def _get_index(self, name):
- """
- Safely get a single index, translating datelike keys to their underlying repr.
- """
- return self._get_indices([name])[0]
- @final
- @cache_readonly
- def _selected_obj(self):
- # Note: _selected_obj is always just `self.obj` for SeriesGroupBy
- if isinstance(self.obj, Series):
- return self.obj
- if self._selection is not None:
- if is_hashable(self._selection):
- # i.e. a single key, so selecting it will return a Series.
- # In this case, _obj_with_exclusions would wrap the key
- # in a list and return a single-column DataFrame.
- return self.obj[self._selection]
- # Otherwise _selection is equivalent to _selection_list, so
- # _selected_obj matches _obj_with_exclusions, so we can re-use
- # that and avoid making a copy.
- return self._obj_with_exclusions
- return self.obj
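- # Selection sketch (illustrative): for ``df.groupby("A")["B"]`` the selection
- # is the single hashable "B", so ``_selected_obj`` is the Series ``df["B"]``;
- # for ``df.groupby("A")[["B"]]`` the selection is list-like, so it falls
- # through to ``_obj_with_exclusions`` (a one-column DataFrame).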
- @final
- def _dir_additions(self) -> set[str]:
- return self.obj._dir_additions()
- @Substitution(
- klass="GroupBy",
- examples=dedent(
- """\
- >>> df = pd.DataFrame({'A': 'a b a b'.split(), 'B': [1, 2, 3, 4]})
- >>> df
- A B
- 0 a 1
- 1 b 2
- 2 a 3
- 3 b 4
- To get the difference between each group's maximum and minimum value in one
- pass, you can do
- >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
- B
- A
- a 2
- b 2"""
- ),
- )
- @Appender(_pipe_template)
- def pipe(
- self,
- func: Callable[..., T] | tuple[Callable[..., T], str],
- *args,
- **kwargs,
- ) -> T:
- return com.pipe(self, func, *args, **kwargs)
- @final
- def get_group(self, name, obj=None) -> DataFrame | Series:
- """
- Construct DataFrame from group with provided name.
- Parameters
- ----------
- name : object
- The name of the group to get as a DataFrame.
- obj : DataFrame, default None
- The DataFrame to take the group from. If it is None, the object
- on which groupby was called will be used.
- Returns
- -------
- same type as obj
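- Examples
- --------
- A minimal sketch (hypothetical frame ``df`` grouped by column ``'A'``):
- >>> df = pd.DataFrame({'A': ['a', 'b'], 'B': [1, 2]})
- >>> df.groupby('A').get_group('a')  # doctest: +SKIP
- A  B
- 0  a  1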
- """
- if obj is None:
- obj = self._selected_obj
- inds = self._get_index(name)
- if not len(inds):
- raise KeyError(name)
- return obj._take_with_is_copy(inds, axis=self.axis)
- @final
- def __iter__(self) -> Iterator[tuple[Hashable, NDFrameT]]:
- """
- Groupby iterator.
- Returns
- -------
- Generator yielding sequence of (name, subsetted object)
- for each group
- """
- keys = self.keys
- result = self.grouper.get_iterator(self._selected_obj, axis=self.axis)
- if isinstance(keys, list) and len(keys) == 1:
- # GH#42795 - when keys is a list, return tuples even when length is 1
- result = (((key,), group) for key, group in result)
- return result
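- # Iteration sketch (illustrative): with a length-1 list of keys the GH#42795
- # branch above wraps each key in a 1-tuple, so
- # ``for key, grp in df.groupby(["A"]): ...`` yields keys like ``('a',)``,
- # whereas ``df.groupby("A")`` yields the bare scalar ``'a'``.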
- # To track operations that expand dimensions, like ohlc
- OutputFrameOrSeries = TypeVar("OutputFrameOrSeries", bound=NDFrame)
- class GroupBy(BaseGroupBy[NDFrameT]):
- """
- Class for grouping and aggregating relational data.
- See aggregate, transform, and apply functions on this object.
- It's easiest to use obj.groupby(...) to use GroupBy, but you can also do:
- ::
- grouped = groupby(obj, ...)
- Parameters
- ----------
- obj : pandas object
- axis : int, default 0
- level : int, default None
- Level of MultiIndex
- groupings : list of Grouping objects
- Most users should ignore this
- exclusions : array-like, optional
- List of columns to exclude
- name : str
- Most users should ignore this
- Attributes
- ----------
- groups : dict
- {group name -> group labels}
- len(grouped) : int
- Number of groups
- Notes
- -----
- After grouping, see aggregate, apply, and transform functions. Here are
- some other brief notes about usage. When grouping by multiple groups, the
- result index will be a MultiIndex (hierarchical) by default.
- Iteration produces (key, group) tuples, i.e. chunking the data by group. So
- you can write code like:
- ::
- grouped = obj.groupby(keys, axis=axis)
- for key, group in grouped:
- # do something with the data
- Function calls on GroupBy, if not specially implemented, "dispatch" to the
- grouped data. So if you group a DataFrame and wish to invoke the std()
- method on each group, you can simply do:
- ::
- df.groupby(mapper).std()
- rather than
- ::
- df.groupby(mapper).aggregate(np.std)
- You can pass arguments to these "wrapped" functions, too.
- See the online documentation for full exposition on these topics and much
- more.
- """
- grouper: ops.BaseGrouper
- as_index: bool
- @final
- def __init__(
- self,
- obj: NDFrameT,
- keys: _KeysArgType | None = None,
- axis: Axis = 0,
- level: IndexLabel | None = None,
- grouper: ops.BaseGrouper | None = None,
- exclusions: frozenset[Hashable] | None = None,
- selection: IndexLabel | None = None,
- as_index: bool = True,
- sort: bool = True,
- group_keys: bool = True,
- observed: bool = False,
- dropna: bool = True,
- ) -> None:
- self._selection = selection
- assert isinstance(obj, NDFrame), type(obj)
- self.level = level
- if not as_index:
- if axis != 0:
- raise ValueError("as_index=False only valid for axis=0")
- self.as_index = as_index
- self.keys = keys
- self.sort = sort
- self.group_keys = group_keys
- self.observed = observed
- self.dropna = dropna
- if grouper is None:
- grouper, exclusions, obj = get_grouper(
- obj,
- keys,
- axis=axis,
- level=level,
- sort=sort,
- observed=observed,
- dropna=self.dropna,
- )
- self.obj = obj
- self.axis = obj._get_axis_number(axis)
- self.grouper = grouper
- self.exclusions = frozenset(exclusions) if exclusions else frozenset()
- def __getattr__(self, attr: str):
- if attr in self._internal_names_set:
- return object.__getattribute__(self, attr)
- if attr in self.obj:
- return self[attr]
- raise AttributeError(
- f"'{type(self).__name__}' object has no attribute '{attr}'"
- )
- @final
- def _op_via_apply(self, name: str, *args, **kwargs):
- """Compute the result of an operation by using GroupBy's apply."""
- f = getattr(type(self._obj_with_exclusions), name)
- sig = inspect.signature(f)
- # a little trickery for aggregation functions that need an axis
- # argument
- if "axis" in sig.parameters:
- if kwargs.get("axis", None) is None or kwargs.get("axis") is lib.no_default:
- kwargs["axis"] = self.axis
- def curried(x):
- return f(x, *args, **kwargs)
- # preserve the name so we can detect it when calling plot methods,
- # to avoid duplicates
- curried.__name__ = name
- # special case otherwise extra plots are created when catching the
- # exception below
- if name in base.plotting_methods:
- return self.apply(curried)
- is_transform = name in base.transformation_kernels
- result = self._python_apply_general(
- curried,
- self._obj_with_exclusions,
- is_transform=is_transform,
- not_indexed_same=not is_transform,
- )
- if self.grouper.has_dropped_na and is_transform:
- # result will have dropped rows due to nans, fill with null
- # and ensure index is ordered same as the input
- result = self._set_result_index_ordered(result)
- return result
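- # Dispatch sketch (illustrative): a call like ``df.groupby("A").fillna(0)``
- # arrives here with ``name="fillna"``; "fillna" is listed among the
- # transformation kernels, so the curried function is applied group-wise and
- # rows dropped because of NA group keys are restored in their original order.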
- # -----------------------------------------------------------------
- # Selection
- def _iterate_slices(self) -> Iterable[Series]:
- raise AbstractMethodError(self)
- # -----------------------------------------------------------------
- # Dispatch/Wrapping
- @final
- def _concat_objects(
- self,
- values,
- not_indexed_same: bool = False,
- is_transform: bool = False,
- ):
- from pandas.core.reshape.concat import concat
- if self.group_keys and not is_transform:
- if self.as_index:
- # possible MI return case
- group_keys = self.grouper.result_index
- group_levels = self.grouper.levels
- group_names = self.grouper.names
- result = concat(
- values,
- axis=self.axis,
- keys=group_keys,
- levels=group_levels,
- names=group_names,
- sort=False,
- )
- else:
- # GH5610, returns a MI, with the first level being a
- # range index
- keys = list(range(len(values)))
- result = concat(values, axis=self.axis, keys=keys)
- elif not not_indexed_same:
- result = concat(values, axis=self.axis)
- ax = self._selected_obj._get_axis(self.axis)
- if self.dropna:
- labels = self.grouper.group_info[0]
- mask = labels != -1
- ax = ax[mask]
- # this is a very unfortunate situation
- # we can't use reindex to restore the original order
- # when the ax has duplicates
- # so we resort to this
- # GH 14776, 30667
- # TODO: can we re-use e.g. _reindex_non_unique?
- if ax.has_duplicates and not result.axes[self.axis].equals(ax):
- # e.g. test_category_order_transformer
- target = algorithms.unique1d(ax._values)
- indexer, _ = result.index.get_indexer_non_unique(target)
- result = result.take(indexer, axis=self.axis)
- else:
- result = result.reindex(ax, axis=self.axis, copy=False)
- else:
- result = concat(values, axis=self.axis)
- name = self.obj.name if self.obj.ndim == 1 else self._selection
- if isinstance(result, Series) and name is not None:
- result.name = name
- return result
- @final
- def _set_result_index_ordered(
- self, result: OutputFrameOrSeries
- ) -> OutputFrameOrSeries:
- # set the result index on the passed values object and
- # return the new object, xref 8046
- obj_axis = self.obj._get_axis(self.axis)
- if self.grouper.is_monotonic and not self.grouper.has_dropped_na:
- # shortcut if we have an already ordered grouper
- result = result.set_axis(obj_axis, axis=self.axis, copy=False)
- return result
- # row order is scrambled => sort the rows by position in original index
- original_positions = Index(self.grouper.result_ilocs())
- result = result.set_axis(original_positions, axis=self.axis, copy=False)
- result = result.sort_index(axis=self.axis)
- if self.grouper.has_dropped_na:
- # Add back in any missing rows due to dropna - index here is integral
- # with values referring to the row of the input so can use RangeIndex
- result = result.reindex(RangeIndex(len(obj_axis)), axis=self.axis)
- result = result.set_axis(obj_axis, axis=self.axis, copy=False)
- return result
- @final
- def _insert_inaxis_grouper(self, result: Series | DataFrame) -> DataFrame:
- if isinstance(result, Series):
- result = result.to_frame()
- # zip in reverse so we can always insert at loc 0
- columns = result.columns
- for name, lev, in_axis in zip(
- reversed(self.grouper.names),
- reversed(self.grouper.get_group_levels()),
- reversed([grp.in_axis for grp in self.grouper.groupings]),
- ):
- # GH #28549
- # When using .apply(-), name will be in columns already
- if in_axis and name not in columns:
- result.insert(0, name, lev)
- return result
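- # Effect sketch (illustrative): with ``as_index=False``, an aggregation such
- # as ``df.groupby("A", as_index=False).sum()`` gets the in-axis grouping
- # column "A" re-inserted at position 0 of the result instead of serving as
- # the result's index.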
- def _indexed_output_to_ndframe(
- self, result: Mapping[base.OutputKey, ArrayLike]
- ) -> Series | DataFrame:
- raise AbstractMethodError(self)
- @final
- def _maybe_transpose_result(self, result: NDFrameT) -> NDFrameT:
- if self.axis == 1:
- # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy
- result = result.T
- if result.index.equals(self.obj.index):
- # Retain e.g. DatetimeIndex/TimedeltaIndex freq
- # e.g. test_groupby_crash_on_nunique
- result.index = self.obj.index.copy()
- return result
- @final
- def _wrap_aggregated_output(
- self,
- result: Series | DataFrame,
- qs: npt.NDArray[np.float64] | None = None,
- ):
- """
- Wraps the output of GroupBy aggregations into the expected result.
- Parameters
- ----------
- result : Series, DataFrame
- Returns
- -------
- Series or DataFrame
- """
- # ATM we do not get here for SeriesGroupBy; when we do, we will
- # need to require that result.name already match self.obj.name
- if not self.as_index:
- # `not self.as_index` is only relevant for DataFrameGroupBy,
- # enforced in __init__
- result = self._insert_inaxis_grouper(result)
- result = result._consolidate()
- index = Index(range(self.grouper.ngroups))
- else:
- index = self.grouper.result_index
- if qs is not None:
- # We get here with len(qs) != 1 and not self.as_index
- # in test_pass_args_kwargs
- index = _insert_quantile_level(index, qs)
- result.index = index
- # error: Argument 1 to "_maybe_transpose_result" of "GroupBy" has
- # incompatible type "Union[Series, DataFrame]"; expected "NDFrameT"
- res = self._maybe_transpose_result(result) # type: ignore[arg-type]
- return self._reindex_output(res, qs=qs)
- def _wrap_applied_output(
- self,
- data,
- values: list,
- not_indexed_same: bool = False,
- is_transform: bool = False,
- ):
- raise AbstractMethodError(self)
- # -----------------------------------------------------------------
- # numba
- @final
- def _numba_prep(self, data: DataFrame):
- ids, _, ngroups = self.grouper.group_info
- sorted_index = get_group_index_sorter(ids, ngroups)
- sorted_ids = algorithms.take_nd(ids, sorted_index, allow_fill=False)
- sorted_data = data.take(sorted_index, axis=self.axis).to_numpy()
- if len(self.grouper.groupings) > 1:
- raise NotImplementedError(
- "More than 1 grouping labels are not supported with engine='numba'"
- )
- # GH 46867
- index_data = data.index
- if isinstance(index_data, MultiIndex):
- group_key = self.grouper.groupings[0].name
- index_data = index_data.get_level_values(group_key)
- sorted_index_data = index_data.take(sorted_index).to_numpy()
- starts, ends = lib.generate_slices(sorted_ids, ngroups)
- return (
- starts,
- ends,
- sorted_index_data,
- sorted_data,
- )
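- # Worked sketch of the slices produced above (assumed toy labels): for group
- # ids [1, 0, 0, 1, 0] the sorter orders rows to ids [0, 0, 0, 1, 1], and
- # ``lib.generate_slices`` then yields ``starts = [0, 3]``, ``ends = [3, 5]``:
- # group 0 occupies sorted rows 0:3 and group 1 occupies rows 3:5.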
- def _numba_agg_general(
- self,
- func: Callable,
- engine_kwargs: dict[str, bool] | None,
- *aggregator_args,
- ):
- """
- Perform groupby with a standard numerical aggregation function (e.g. mean)
- with Numba.
- """
- if not self.as_index:
- raise NotImplementedError(
- "as_index=False is not supported. Use .reset_index() instead."
- )
- if self.axis == 1:
- raise NotImplementedError("axis=1 is not supported.")
- data = self._obj_with_exclusions
- df = data if data.ndim == 2 else data.to_frame()
- starts, ends, sorted_index, sorted_data = self._numba_prep(df)
- aggregator = executor.generate_shared_aggregator(
- func, **get_jit_arguments(engine_kwargs)
- )
- result = aggregator(sorted_data, starts, ends, 0, *aggregator_args)
- index = self.grouper.result_index
- if data.ndim == 1:
- result_kwargs = {"name": data.name}
- result = result.ravel()
- else:
- result_kwargs = {"columns": data.columns}
- return data._constructor(result, index=index, **result_kwargs)
- @final
- def _transform_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
- """
- Perform groupby transform routine with the numba engine.
- This routine mimics the data splitting routine of the DataSplitter class
- to generate the indices of each group in the sorted data and then passes the
- data and indices into a Numba jitted function.
- """
- data = self._obj_with_exclusions
- df = data if data.ndim == 2 else data.to_frame()
- starts, ends, sorted_index, sorted_data = self._numba_prep(df)
- numba_.validate_udf(func)
- numba_transform_func = numba_.generate_numba_transform_func(
- func, **get_jit_arguments(engine_kwargs, kwargs)
- )
- result = numba_transform_func(
- sorted_data,
- sorted_index,
- starts,
- ends,
- len(df.columns),
- *args,
- )
- # result values need to be re-sorted to their original positions since we
- # evaluated the data sorted by group
- result = result.take(np.argsort(sorted_index), axis=0)
- index = data.index
- if data.ndim == 1:
- result_kwargs = {"name": data.name}
- result = result.ravel()
- else:
- result_kwargs = {"columns": data.columns}
- return data._constructor(result, index=index, **result_kwargs)
- @final
- def _aggregate_with_numba(self, func, *args, engine_kwargs=None, **kwargs):
- """
- Perform groupby aggregation routine with the numba engine.
- This routine mimics the data splitting routine of the DataSplitter class
- to generate the indices of each group in the sorted data and then passes the
- data and indices into a Numba jitted function.
- """
- data = self._obj_with_exclusions
- df = data if data.ndim == 2 else data.to_frame()
- starts, ends, sorted_index, sorted_data = self._numba_prep(df)
- numba_.validate_udf(func)
- numba_agg_func = numba_.generate_numba_agg_func(
- func, **get_jit_arguments(engine_kwargs, kwargs)
- )
- result = numba_agg_func(
- sorted_data,
- sorted_index,
- starts,
- ends,
- len(df.columns),
- *args,
- )
- index = self.grouper.result_index
- if data.ndim == 1:
- result_kwargs = {"name": data.name}
- result = result.ravel()
- else:
- result_kwargs = {"columns": data.columns}
- res = data._constructor(result, index=index, **result_kwargs)
- if not self.as_index:
- res = self._insert_inaxis_grouper(res)
- res.index = default_index(len(res))
- return res
- # -----------------------------------------------------------------
- # apply/agg/transform
- @Appender(
- _apply_docs["template"].format(
- input="dataframe", examples=_apply_docs["dataframe_examples"]
- )
- )
- def apply(self, func, *args, **kwargs) -> NDFrameT:
- func = com.is_builtin_func(func)
- if isinstance(func, str):
- if hasattr(self, func):
- res = getattr(self, func)
- if callable(res):
- return res(*args, **kwargs)
- elif args or kwargs:
- raise ValueError(f"Cannot pass arguments to property {func}")
- return res
- else:
- raise TypeError(f"apply func should be callable, not '{func}'")
- elif args or kwargs:
- if callable(func):
- @wraps(func)
- def f(g):
- with np.errstate(all="ignore"):
- return func(g, *args, **kwargs)
- else:
- raise ValueError(
- "func must be a callable if args or kwargs are supplied"
- )
- else:
- f = func
- # ignore SettingWithCopy here in case the user mutates
- with option_context("mode.chained_assignment", None):
- try:
- result = self._python_apply_general(f, self._selected_obj)
- except TypeError:
- # gh-20949
- # try again, with .apply acting as a filtering
- # operation, by excluding the grouping column
- # This would normally not be triggered
- # except if the udf is trying an operation that
- # fails on *some* columns, e.g. a numeric operation
- # on a string grouper column
- return self._python_apply_general(f, self._obj_with_exclusions)
- return result
- @final
- def _python_apply_general(
- self,
- f: Callable,
- data: DataFrame | Series,
- not_indexed_same: bool | None = None,
- is_transform: bool = False,
- is_agg: bool = False,
- ) -> NDFrameT:
- """
- Apply function f in python space
- Parameters
- ----------
- f : callable
- Function to apply
- data : Series or DataFrame
- Data to apply f to
- not_indexed_same : bool, optional
- When specified, overrides the value of not_indexed_same. Apply behaves
- differently when the result index is equal to the input index, but
- this can be coincidental leading to value-dependent behavior.
- is_transform : bool, default False
- Indicator for whether the function is actually a transform
- and should not have group keys prepended.
- is_agg : bool, default False
- Indicator for whether the function is an aggregation. When the
- result is empty, we don't want to warn for this case.
- See _GroupBy._python_agg_general.
- Returns
- -------
- Series or DataFrame
- data after applying f
- """
- values, mutated = self.grouper.apply(f, data, self.axis)
- if not_indexed_same is None:
- not_indexed_same = mutated
- return self._wrap_applied_output(
- data,
- values,
- not_indexed_same,
- is_transform,
- )
- @final
- def _agg_general(
- self,
- numeric_only: bool = False,
- min_count: int = -1,
- *,
- alias: str,
- npfunc: Callable,
- ):
- result = self._cython_agg_general(
- how=alias,
- alt=npfunc,
- numeric_only=numeric_only,
- min_count=min_count,
- )
- return result.__finalize__(self.obj, method="groupby")
- def _agg_py_fallback(
- self, values: ArrayLike, ndim: int, alt: Callable
- ) -> ArrayLike:
- """
- Fallback to pure-python aggregation if _cython_operation raises
- NotImplementedError.
- """
- # We get here with a) EADtypes and b) object dtype
- assert alt is not None
- if values.ndim == 1:
- # For DataFrameGroupBy we only get here with ExtensionArray
- ser = Series(values, copy=False)
- else:
- # We only get here with values.dtype == object
- # TODO: special case not needed with ArrayManager
- df = DataFrame(values.T)
- # bc we split object blocks in grouped_reduce, we have only 1 col
- # otherwise we'd have to worry about block-splitting GH#39329
- assert df.shape[1] == 1
- # Avoid call to self.values that can occur in DataFrame
- # reductions; see GH#28949
- ser = df.iloc[:, 0]
- # We do not get here with UDFs, so we know that our dtype
- # should always be preserved by the implemented aggregations
- # TODO: Is this exactly right; see WrappedCythonOp get_result_dtype?
- res_values = self.grouper.agg_series(ser, alt, preserve_dtype=True)
- if isinstance(values, Categorical):
- # Because we only get here with known dtype-preserving
- # reductions, we cast back to Categorical.
- # TODO: if we ever get "rank" working, exclude it here.
- res_values = type(values)._from_sequence(res_values, dtype=values.dtype)
- elif ser.dtype == object:
- res_values = res_values.astype(object, copy=False)
- # If we are DataFrameGroupBy and went through a SeriesGroupByPath
- # then we need to reshape
- # GH#32223 includes case with IntegerArray values, ndarray res_values
- # test_groupby_duplicate_columns with object dtype values
- return ensure_block_shape(res_values, ndim=ndim)
- @final
- def _cython_agg_general(
- self,
- how: str,
- alt: Callable,
- numeric_only: bool = False,
- min_count: int = -1,
- **kwargs,
- ):
- # Note: we never get here with how="ohlc" for DataFrameGroupBy;
- # that goes through SeriesGroupBy
- data = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
- def array_func(values: ArrayLike) -> ArrayLike:
- try:
- result = self.grouper._cython_operation(
- "aggregate",
- values,
- how,
- axis=data.ndim - 1,
- min_count=min_count,
- **kwargs,
- )
- except NotImplementedError:
- # generally if we have numeric_only=False
- # and non-applicable functions
- # try to python agg
- # TODO: shouldn't min_count matter?
- result = self._agg_py_fallback(values, ndim=data.ndim, alt=alt)
- return result
- new_mgr = data.grouped_reduce(array_func)
- res = self._wrap_agged_manager(new_mgr)
- out = self._wrap_aggregated_output(res)
- if self.axis == 1:
- out = out.infer_objects(copy=False)
- return out
- def _cython_transform(
- self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
- ):
- raise AbstractMethodError(self)
- @final
- def _transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
- if maybe_use_numba(engine):
- return self._transform_with_numba(
- func, *args, engine_kwargs=engine_kwargs, **kwargs
- )
- # optimized transforms
- func = com.get_cython_func(func) or func
- if not isinstance(func, str):
- return self._transform_general(func, *args, **kwargs)
- elif func not in base.transform_kernel_allowlist:
- msg = f"'{func}' is not a valid function name for transform(name)"
- raise ValueError(msg)
- elif func in base.cythonized_kernels or func in base.transformation_kernels:
- # cythonized transform or canned "agg+broadcast"
- return getattr(self, func)(*args, **kwargs)
- else:
- # i.e. func in base.reduction_kernels
- # GH#30918 Use _transform_fast only when we know func is an aggregation
- # If func is a reduction, we need to broadcast the
- # result to the whole group. Compute func result
- # and deal with possible broadcasting below.
- # Temporarily set observed for dealing with categoricals.
- with com.temp_setattr(self, "observed", True):
- with com.temp_setattr(self, "as_index", True):
- # GH#49834 - result needs groups in the index for
- # _wrap_transform_fast_result
- result = getattr(self, func)(*args, **kwargs)
- return self._wrap_transform_fast_result(result)
- @final
- def _wrap_transform_fast_result(self, result: NDFrameT) -> NDFrameT:
- """
- Fast transform path for aggregations.
- """
- obj = self._obj_with_exclusions
- # for each col, reshape to size of original frame by take operation
- ids, _, _ = self.grouper.group_info
- result = result.reindex(self.grouper.result_index, axis=self.axis, copy=False)
- if self.obj.ndim == 1:
- # i.e. SeriesGroupBy
- out = algorithms.take_nd(result._values, ids)
- output = obj._constructor(out, index=obj.index, name=obj.name)
- else:
- # `.size()` gives Series output on DataFrame input, need axis 0
- axis = 0 if result.ndim == 1 else self.axis
- # GH#46209
- # Don't convert indices: negative indices need to give rise
- # to null values in the result
- output = result._take(ids, axis=axis, convert_indices=False)
- output = output.set_axis(obj._get_axis(self.axis), axis=axis)
- return output
- # -----------------------------------------------------------------
- # Utilities
- @final
- def _apply_filter(self, indices, dropna):
- if len(indices) == 0:
- indices = np.array([], dtype="int64")
- else:
- indices = np.sort(np.concatenate(indices))
- if dropna:
- filtered = self._selected_obj.take(indices, axis=self.axis)
- else:
- mask = np.empty(len(self._selected_obj.index), dtype=bool)
- mask.fill(False)
- mask[indices.astype(int)] = True
- # mask fails to broadcast when passed to where; broadcast manually.
- mask = np.tile(mask, list(self._selected_obj.shape[1:]) + [1]).T
- filtered = self._selected_obj.where(mask) # Fill with NaNs.
- return filtered
- @final
- def _cumcount_array(self, ascending: bool = True) -> np.ndarray:
- """
- Parameters
- ----------
- ascending : bool, default True
- If False, number in reverse, from length of group - 1 to 0.
- Notes
- -----
- This is currently implementing sort=False
- (though the default is sort=True) for groupby in general
- """
- ids, _, ngroups = self.grouper.group_info
- sorter = get_group_index_sorter(ids, ngroups)
- ids, count = ids[sorter], len(ids)
- if count == 0:
- return np.empty(0, dtype=np.int64)
- run = np.r_[True, ids[:-1] != ids[1:]]
- rep = np.diff(np.r_[np.nonzero(run)[0], count])
- out = (~run).cumsum()
- if ascending:
- out -= np.repeat(out[run], rep)
- else:
- out = np.repeat(out[np.r_[run[1:], True]], rep) - out
- if self.grouper.has_dropped_na:
- out = np.where(ids == -1, np.nan, out.astype(np.float64, copy=False))
- else:
- out = out.astype(np.int64, copy=False)
- rev = np.empty(count, dtype=np.intp)
- rev[sorter] = np.arange(count, dtype=np.intp)
- return out[rev]
- # -----------------------------------------------------------------
- @final
- @property
- def _obj_1d_constructor(self) -> Callable:
- # GH28330 preserve subclassed Series/DataFrames
- if isinstance(self.obj, DataFrame):
- return self.obj._constructor_sliced
- assert isinstance(self.obj, Series)
- return self.obj._constructor
- @final
- def _bool_agg(self, val_test: Literal["any", "all"], skipna: bool):
- """
- Shared func to call any / all Cython GroupBy implementations.
- """
- def objs_to_bool(vals: ArrayLike) -> tuple[np.ndarray, type]:
- if is_object_dtype(vals.dtype) and skipna:
- # GH#37501: don't raise on pd.NA when skipna=True
- mask = isna(vals)
- if mask.any():
- # mask on original values computed separately
- vals = vals.copy()
- vals[mask] = True
- elif isinstance(vals, BaseMaskedArray):
- vals = vals._data
- vals = vals.astype(bool, copy=False)
- return vals.view(np.int8), bool
- def result_to_bool(
- result: np.ndarray,
- inference: type,
- nullable: bool = False,
- ) -> ArrayLike:
- if nullable:
- return BooleanArray(result.astype(bool, copy=False), result == -1)
- else:
- return result.astype(inference, copy=False)
- return self._get_cythonized_result(
- libgroupby.group_any_all,
- numeric_only=False,
- cython_dtype=np.dtype(np.int8),
- pre_processing=objs_to_bool,
- post_processing=result_to_bool,
- val_test=val_test,
- skipna=skipna,
- )
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def any(self, skipna: bool = True):
- """
- Return True if any value in the group is truthy, else False.
- Parameters
- ----------
- skipna : bool, default True
- Flag to ignore nan values during truth testing.
- Returns
- -------
- Series or DataFrame
- DataFrame or Series of boolean values, where a value is True if any element
- is True within its respective group, False otherwise.
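- Examples
- --------
- A minimal illustrative example; group 1 contains a True value while
- group 2 does not:
- >>> df = pd.DataFrame({'A': [1, 1, 2], 'B': [True, False, False]})
- >>> df.groupby('A')['B'].any()
- A
- 1 True
- 2 False
- Name: B, dtype: bool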
- """
- return self._bool_agg("any", skipna)
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def all(self, skipna: bool = True):
- """
- Return True if all values in the group are truthy, else False.
- Parameters
- ----------
- skipna : bool, default True
- Flag to ignore nan values during truth testing.
- Returns
- -------
- Series or DataFrame
- DataFrame or Series of boolean values, where a value is True if all elements
- are True within its respective group, False otherwise.
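- Examples
- --------
- A minimal illustrative example; only group 2 is all-True:
- >>> df = pd.DataFrame({'A': [1, 1, 2], 'B': [True, False, True]})
- >>> df.groupby('A')['B'].all()
- A
- 1 False
- 2 True
- Name: B, dtype: bool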
- """
- return self._bool_agg("all", skipna)
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def count(self) -> NDFrameT:
- """
- Compute count of group, excluding missing values.
- Returns
- -------
- Series or DataFrame
- Count of values within each group.
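- Examples
- --------
- A minimal illustrative example; NaN values are excluded from the count:
- >>> df = pd.DataFrame({'A': [1, 1, 2], 'B': [np.nan, 2, 3]})
- >>> df.groupby('A').count()
- B
- A
- 1 1
- 2 1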
- """
- data = self._get_data_to_aggregate()
- ids, _, ngroups = self.grouper.group_info
- mask = ids != -1
- is_series = data.ndim == 1
- def hfunc(bvalues: ArrayLike) -> ArrayLike:
- # TODO(EA2D): reshape would not be necessary with 2D EAs
- if bvalues.ndim == 1:
- # EA
- masked = mask & ~isna(bvalues).reshape(1, -1)
- else:
- masked = mask & ~isna(bvalues)
- counted = lib.count_level_2d(masked, labels=ids, max_bin=ngroups)
- if is_series:
- assert counted.ndim == 2
- assert counted.shape[0] == 1
- return counted[0]
- return counted
- new_mgr = data.grouped_reduce(hfunc)
- new_obj = self._wrap_agged_manager(new_mgr)
- # If we are grouping on categoricals we want unobserved categories to
- # return zero, rather than the default of NaN which the reindexing in
- # _wrap_aggregated_output() returns. GH 35028
- # e.g. test_dataframe_groupby_on_2_categoricals_when_observed_is_false
- with com.temp_setattr(self, "observed", True):
- result = self._wrap_aggregated_output(new_obj)
- return self._reindex_output(result, fill_value=0)
- @final
- @Substitution(name="groupby")
- @Substitution(see_also=_common_see_also)
- def mean(
- self,
- numeric_only: bool = False,
- engine: str = "cython",
- engine_kwargs: dict[str, bool] | None = None,
- ):
- """
- Compute mean of groups, excluding missing values.
- Parameters
- ----------
- numeric_only : bool, default False
- Include only float, int, boolean columns.
- .. versionchanged:: 2.0.0
- numeric_only no longer accepts ``None`` and defaults to ``False``.
- engine : str, default ``'cython'``
- * ``'cython'`` : Runs the operation through C-extensions from cython.
- * ``'numba'`` : Runs the operation through JIT compiled code from numba.
- * ``None`` : Defaults to ``'cython'`` or globally setting
- ``compute.use_numba``
- .. versionadded:: 1.4.0
- engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
- .. versionadded:: 1.4.0
- Returns
- -------
- pandas.Series or pandas.DataFrame
- %(see_also)s
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
- ... 'B': [np.nan, 2, 3, 4, 5],
- ... 'C': [1, 2, 1, 1, 2]}, columns=['A', 'B', 'C'])
- Groupby one column and return the mean of the remaining columns in
- each group.
- >>> df.groupby('A').mean()
- B C
- A
- 1 3.0 1.333333
- 2 4.0 1.500000
- Groupby two columns and return the mean of the remaining column.
- >>> df.groupby(['A', 'B']).mean()
- C
- A B
- 1 2.0 2.0
- 4.0 1.0
- 2 3.0 1.0
- 5.0 2.0
- Groupby one column and return the mean of only particular column in
- the group.
- >>> df.groupby('A')['B'].mean()
- A
- 1 3.0
- 2 4.0
- Name: B, dtype: float64
- """
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_mean
- return self._numba_agg_general(sliding_mean, engine_kwargs)
- else:
- result = self._cython_agg_general(
- "mean",
- alt=lambda x: Series(x).mean(numeric_only=numeric_only),
- numeric_only=numeric_only,
- )
- return result.__finalize__(self.obj, method="groupby")
- @final
- def median(self, numeric_only: bool = False):
- """
- Compute median of groups, excluding missing values.
- For multiple groupings, the result index will be a MultiIndex
- Parameters
- ----------
- numeric_only : bool, default False
- Include only float, int, boolean columns.
- .. versionchanged:: 2.0.0
- numeric_only no longer accepts ``None`` and defaults to ``False``.
- Returns
- -------
- Series or DataFrame
- Median of values within each group.
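- Examples
- --------
- A minimal illustrative example on a Series grouped by its index:
- >>> ser = pd.Series([7, 2, 8, 4, 3, 3], index=['a', 'a', 'a', 'b', 'b', 'b'])
- >>> ser.groupby(level=0).median()
- a 7.0
- b 3.0
- dtype: float64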
- """
- result = self._cython_agg_general(
- "median",
- alt=lambda x: Series(x).median(numeric_only=numeric_only),
- numeric_only=numeric_only,
- )
- return result.__finalize__(self.obj, method="groupby")
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def std(
- self,
- ddof: int = 1,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- numeric_only: bool = False,
- ):
- """
- Compute standard deviation of groups, excluding missing values.
- For multiple groupings, the result index will be a MultiIndex.
- Parameters
- ----------
- ddof : int, default 1
- Degrees of freedom.
- engine : str, default None
- * ``'cython'`` : Runs the operation through C-extensions from cython.
- * ``'numba'`` : Runs the operation through JIT compiled code from numba.
- * ``None`` : Defaults to ``'cython'`` or globally setting
- ``compute.use_numba``
- .. versionadded:: 1.4.0
- engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
- .. versionadded:: 1.4.0
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionadded:: 1.5.0
- .. versionchanged:: 2.0.0
- numeric_only now defaults to ``False``.
- Returns
- -------
- Series or DataFrame
- Standard deviation of values within each group.
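- Examples
- --------
- A minimal illustrative example on a Series grouped by its index:
- >>> ser = pd.Series([1, 3, 5, 7, 9], index=['a', 'a', 'a', 'b', 'b'])
- >>> ser.groupby(level=0).std()
- a 2.000000
- b 1.414214
- dtype: float64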
- """
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_var
- return np.sqrt(self._numba_agg_general(sliding_var, engine_kwargs, ddof))
- else:
- def _preprocessing(values):
- if isinstance(values, BaseMaskedArray):
- return values._data, None
- return values, None
- def _postprocessing(
- vals, inference, nullable: bool = False, result_mask=None
- ) -> ArrayLike:
- if nullable:
- if result_mask.ndim == 2:
- result_mask = result_mask[:, 0]
- return FloatingArray(np.sqrt(vals), result_mask.view(np.bool_))
- return np.sqrt(vals)
- result = self._get_cythonized_result(
- libgroupby.group_var,
- cython_dtype=np.dtype(np.float64),
- numeric_only=numeric_only,
- needs_counts=True,
- pre_processing=_preprocessing,
- post_processing=_postprocessing,
- ddof=ddof,
- how="std",
- )
- return result
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def var(
- self,
- ddof: int = 1,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- numeric_only: bool = False,
- ):
- """
- Compute variance of groups, excluding missing values.
- For multiple groupings, the result index will be a MultiIndex.
- Parameters
- ----------
- ddof : int, default 1
- Degrees of freedom.
- engine : str, default None
- * ``'cython'`` : Runs the operation through C-extensions from cython.
- * ``'numba'`` : Runs the operation through JIT compiled code from numba.
- * ``None`` : Defaults to ``'cython'`` or globally setting
- ``compute.use_numba``
- .. versionadded:: 1.4.0
- engine_kwargs : dict, default None
- * For ``'cython'`` engine, there are no accepted ``engine_kwargs``
- * For ``'numba'`` engine, the engine can accept ``nopython``, ``nogil``
- and ``parallel`` dictionary keys. The values must either be ``True`` or
- ``False``. The default ``engine_kwargs`` for the ``'numba'`` engine is
- ``{{'nopython': True, 'nogil': False, 'parallel': False}}``
- .. versionadded:: 1.4.0
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionadded:: 1.5.0
- .. versionchanged:: 2.0.0
- numeric_only now defaults to ``False``.
- Returns
- -------
- Series or DataFrame
- Variance of values within each group.
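- Examples
- --------
- A minimal illustrative example on a Series grouped by its index:
- >>> ser = pd.Series([1, 3, 5, 7, 9], index=['a', 'a', 'a', 'b', 'b'])
- >>> ser.groupby(level=0).var()
- a 4.0
- b 2.0
- dtype: float64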
- """
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_var
- return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
- else:
- return self._cython_agg_general(
- "var",
- alt=lambda x: Series(x).var(ddof=ddof),
- numeric_only=numeric_only,
- ddof=ddof,
- )
- @final
- def _value_counts(
- self,
- subset: Sequence[Hashable] | None = None,
- normalize: bool = False,
- sort: bool = True,
- ascending: bool = False,
- dropna: bool = True,
- ) -> DataFrame | Series:
- """
- Shared implementation of value_counts for SeriesGroupBy and DataFrameGroupBy.
- SeriesGroupBy additionally supports a bins argument. See the docstring of
- DataFrameGroupBy.value_counts for a description of arguments.
- """
- if self.axis == 1:
- raise NotImplementedError(
- "DataFrameGroupBy.value_counts only handles axis=0"
- )
- name = "proportion" if normalize else "count"
- df = self.obj
- obj = self._obj_with_exclusions
- in_axis_names = {
- grouping.name for grouping in self.grouper.groupings if grouping.in_axis
- }
- if isinstance(obj, Series):
- _name = obj.name
- keys = [] if _name in in_axis_names else [obj]
- else:
- unique_cols = set(obj.columns)
- if subset is not None:
- subsetted = set(subset)
- clashing = subsetted & set(in_axis_names)
- if clashing:
- raise ValueError(
- f"Keys {clashing} in subset cannot be in "
- "the groupby column keys."
- )
- doesnt_exist = subsetted - unique_cols
- if doesnt_exist:
- raise ValueError(
- f"Keys {doesnt_exist} in subset do not "
- f"exist in the DataFrame."
- )
- else:
- subsetted = unique_cols
- keys = [
- # Can't use .values because the column label needs to be preserved
- obj.iloc[:, idx]
- for idx, _name in enumerate(obj.columns)
- if _name not in in_axis_names and _name in subsetted
- ]
- groupings = list(self.grouper.groupings)
- for key in keys:
- grouper, _, _ = get_grouper(
- df,
- key=key,
- axis=self.axis,
- sort=self.sort,
- observed=False,
- dropna=dropna,
- )
- groupings += list(grouper.groupings)
- # Take the size of the overall columns
- gb = df.groupby(
- groupings,
- sort=self.sort,
- observed=self.observed,
- dropna=self.dropna,
- )
- result_series = cast(Series, gb.size())
- result_series.name = name
- # GH-46357 Include non-observed categories
- # of non-grouping columns regardless of `observed`
- if any(
- isinstance(grouping.grouping_vector, (Categorical, CategoricalIndex))
- and not grouping._observed
- for grouping in groupings
- ):
- levels_list = [ping.result_index for ping in groupings]
- multi_index, _ = MultiIndex.from_product(
- levels_list, names=[ping.name for ping in groupings]
- ).sortlevel()
- result_series = result_series.reindex(multi_index, fill_value=0)
- if normalize:
- # Normalize the results by dividing by the original group sizes.
- # We are guaranteed to have the first N levels be the
- # user-requested grouping.
- levels = list(
- range(len(self.grouper.groupings), result_series.index.nlevels)
- )
- indexed_group_size = result_series.groupby(
- result_series.index.droplevel(levels),
- sort=self.sort,
- dropna=self.dropna,
- ).transform("sum")
- result_series /= indexed_group_size
- # Handle groups of non-observed categories
- result_series = result_series.fillna(0.0)
- if sort:
- # Sort the values and then resort by the main grouping
- index_level = range(len(self.grouper.groupings))
- result_series = result_series.sort_values(ascending=ascending).sort_index(
- level=index_level, sort_remaining=False
- )
- result: Series | DataFrame
- if self.as_index:
- result = result_series
- else:
- # Convert to frame
- index = result_series.index
- columns = com.fill_missing_names(index.names)
- if name in columns:
- raise ValueError(f"Column label '{name}' is duplicate of result column")
- result_series.name = name
- result_series.index = index.set_names(range(len(columns)))
- result_frame = result_series.reset_index()
- result_frame.columns = columns + [name]
- result = result_frame
- return result.__finalize__(self.obj, method="value_counts")
- @final
- def sem(self, ddof: int = 1, numeric_only: bool = False):
- """
- Compute standard error of the mean of groups, excluding missing values.
- For multiple groupings, the result index will be a MultiIndex.
- Parameters
- ----------
- ddof : int, default 1
- Degrees of freedom.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionadded:: 1.5.0
- .. versionchanged:: 2.0.0
- numeric_only now defaults to ``False``.
- Returns
- -------
- Series or DataFrame
- Standard error of the mean of values within each group.
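- Examples
- --------
- A minimal illustrative example; each group's standard deviation is
- divided by the square root of its count:
- >>> ser = pd.Series([5, 10, 8, 14], index=['a', 'a', 'b', 'b'])
- >>> ser.groupby(level=0).sem()
- a 2.5
- b 3.0
- dtype: float64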
- """
- if numeric_only and self.obj.ndim == 1 and not is_numeric_dtype(self.obj.dtype):
- raise TypeError(
- f"{type(self).__name__}.sem called with "
- f"numeric_only={numeric_only} and dtype {self.obj.dtype}"
- )
- result = self.std(ddof=ddof, numeric_only=numeric_only)
- if result.ndim == 1:
- result /= np.sqrt(self.count())
- else:
- cols = result.columns.difference(self.exclusions).unique()
- counts = self.count()
- result_ilocs = result.columns.get_indexer_for(cols)
- count_ilocs = counts.columns.get_indexer_for(cols)
- result.iloc[:, result_ilocs] /= np.sqrt(counts.iloc[:, count_ilocs])
- return result
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def size(self) -> DataFrame | Series:
- """
- Compute group sizes.
- Returns
- -------
- DataFrame or Series
- Number of rows in each group as a Series if as_index is True
- or a DataFrame if as_index is False.
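- Examples
- --------
- A minimal illustrative example:
- >>> df = pd.DataFrame({'A': ['a', 'a', 'b'], 'B': [1, 2, 3]})
- >>> df.groupby('A').size()
- A
- a 2
- b 1
- dtype: int64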
- """
- result = self.grouper.size()
- # GH28330 preserve subclassed Series/DataFrames through calls
- if isinstance(self.obj, Series):
- result = self._obj_1d_constructor(result, name=self.obj.name)
- else:
- result = self._obj_1d_constructor(result)
- with com.temp_setattr(self, "as_index", True):
- # size already has the desired behavior in GH#49519, but this makes the
- # as_index=False path of _reindex_output fail on categorical groupers.
- result = self._reindex_output(result, fill_value=0)
- if not self.as_index:
- # error: Incompatible types in assignment (expression has
- # type "DataFrame", variable has type "Series")
- result = result.rename("size").reset_index() # type: ignore[assignment]
- return result
- @final
- @doc(_groupby_agg_method_template, fname="sum", no=False, mc=0)
- def sum(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_sum
- return self._numba_agg_general(
- sliding_sum,
- engine_kwargs,
- )
- else:
- # If we are grouping on categoricals we want unobserved categories to
- # return zero, rather than the default of NaN which the reindexing in
- # _agg_general() returns. GH #31422
- with com.temp_setattr(self, "observed", True):
- result = self._agg_general(
- numeric_only=numeric_only,
- min_count=min_count,
- alias="sum",
- npfunc=np.sum,
- )
- return self._reindex_output(result, fill_value=0)
- @final
- @doc(_groupby_agg_method_template, fname="prod", no=False, mc=0)
- def prod(self, numeric_only: bool = False, min_count: int = 0):
- return self._agg_general(
- numeric_only=numeric_only, min_count=min_count, alias="prod", npfunc=np.prod
- )
- @final
- @doc(_groupby_agg_method_template, fname="min", no=False, mc=-1)
- def min(
- self,
- numeric_only: bool = False,
- min_count: int = -1,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_min_max
- return self._numba_agg_general(sliding_min_max, engine_kwargs, False)
- else:
- return self._agg_general(
- numeric_only=numeric_only,
- min_count=min_count,
- alias="min",
- npfunc=np.min,
- )
- @final
- @doc(_groupby_agg_method_template, fname="max", no=False, mc=-1)
- def max(
- self,
- numeric_only: bool = False,
- min_count: int = -1,
- engine: str | None = None,
- engine_kwargs: dict[str, bool] | None = None,
- ):
- if maybe_use_numba(engine):
- from pandas.core._numba.kernels import sliding_min_max
- return self._numba_agg_general(sliding_min_max, engine_kwargs, True)
- else:
- return self._agg_general(
- numeric_only=numeric_only,
- min_count=min_count,
- alias="max",
- npfunc=np.max,
- )
- @final
- def first(self, numeric_only: bool = False, min_count: int = -1):
- """
- Compute the first non-null entry of each column.
- Parameters
- ----------
- numeric_only : bool, default False
- Include only float, int, boolean columns.
- min_count : int, default -1
- The required number of valid values to perform the operation. If fewer
- than ``min_count`` non-NA values are present the result will be NA.
- Returns
- -------
- Series or DataFrame
- First non-null of values within each group.
- See Also
- --------
- DataFrame.groupby : Apply a function groupby to each row or column of a
- DataFrame.
- pandas.core.groupby.DataFrameGroupBy.last : Compute the last non-null entry
- of each column.
- pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
- Examples
- --------
- >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[None, 5, 6], C=[1, 2, 3],
- ... D=['3/11/2000', '3/12/2000', '3/13/2000']))
- >>> df['D'] = pd.to_datetime(df['D'])
- >>> df.groupby("A").first()
- B C D
- A
- 1 5.0 1 2000-03-11
- 3 6.0 3 2000-03-13
- >>> df.groupby("A").first(min_count=2)
- B C D
- A
- 1 NaN 1.0 2000-03-11
- 3 NaN NaN NaT
- >>> df.groupby("A").first(numeric_only=True)
- B C
- A
- 1 5.0 1
- 3 6.0 3
- """
- def first_compat(obj: NDFrameT, axis: AxisInt = 0):
- def first(x: Series):
- """Helper function for first item that isn't NA."""
- arr = x.array[notna(x.array)]
- if not len(arr):
- return np.nan
- return arr[0]
- if isinstance(obj, DataFrame):
- return obj.apply(first, axis=axis)
- elif isinstance(obj, Series):
- return first(obj)
- else: # pragma: no cover
- raise TypeError(type(obj))
- return self._agg_general(
- numeric_only=numeric_only,
- min_count=min_count,
- alias="first",
- npfunc=first_compat,
- )
- @final
- def last(self, numeric_only: bool = False, min_count: int = -1):
- """
- Compute the last non-null entry of each column.
- Parameters
- ----------
- numeric_only : bool, default False
- Include only float, int, boolean columns.
- min_count : int, default -1
- The required number of valid values to perform the operation. If fewer
- than ``min_count`` non-NA values are present the result will be NA.
- Returns
- -------
- Series or DataFrame
- Last non-null of values within each group.
- See Also
- --------
- DataFrame.groupby : Apply a function groupby to each row or column of a
- DataFrame.
- pandas.core.groupby.DataFrameGroupBy.first : Compute the first non-null entry
- of each column.
- pandas.core.groupby.DataFrameGroupBy.nth : Take the nth row from each group.
- Examples
- --------
- >>> df = pd.DataFrame(dict(A=[1, 1, 3], B=[5, None, 6], C=[1, 2, 3]))
- >>> df.groupby("A").last()
- B C
- A
- 1 5.0 2
- 3 6.0 3
- """
- def last_compat(obj: NDFrameT, axis: AxisInt = 0):
- def last(x: Series):
- """Helper function for last item that isn't NA."""
- arr = x.array[notna(x.array)]
- if not len(arr):
- return np.nan
- return arr[-1]
- if isinstance(obj, DataFrame):
- return obj.apply(last, axis=axis)
- elif isinstance(obj, Series):
- return last(obj)
- else: # pragma: no cover
- raise TypeError(type(obj))
- return self._agg_general(
- numeric_only=numeric_only,
- min_count=min_count,
- alias="last",
- npfunc=last_compat,
- )
- @final
- def ohlc(self) -> DataFrame:
- """
- Compute open, high, low and close values of a group, excluding missing values.
- For multiple groupings, the result index will be a MultiIndex
- Returns
- -------
- DataFrame
- Open, high, low and close values within each group.
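- Examples
- --------
- A minimal illustrative example on a numeric Series grouped by its index:
- >>> ser = pd.Series([1, 3, 2, 4, 3, 5], index=['a', 'a', 'a', 'b', 'b', 'b'])
- >>> ser.groupby(level=0).ohlc()
- open high low close
- a 1 3 1 2
- b 4 5 3 5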
- """
- if self.obj.ndim == 1:
- # self._iterate_slices() yields only self._selected_obj
- obj = self._selected_obj
- is_numeric = is_numeric_dtype(obj.dtype)
- if not is_numeric:
- raise DataError("No numeric types to aggregate")
- res_values = self.grouper._cython_operation(
- "aggregate", obj._values, "ohlc", axis=0, min_count=-1
- )
- agg_names = ["open", "high", "low", "close"]
- result = self.obj._constructor_expanddim(
- res_values, index=self.grouper.result_index, columns=agg_names
- )
- return self._reindex_output(result)
- result = self._apply_to_column_groupbys(
- lambda x: x.ohlc(), self._obj_with_exclusions
- )
- if not self.as_index:
- result = self._insert_inaxis_grouper(result)
- result.index = default_index(len(result))
- return result
- @doc(DataFrame.describe)
- def describe(
- self,
- percentiles=None,
- include=None,
- exclude=None,
- ) -> NDFrameT:
- obj = self._obj_with_exclusions
- if len(obj) == 0:
- described = obj.describe(
- percentiles=percentiles, include=include, exclude=exclude
- )
- if obj.ndim == 1:
- result = described
- else:
- result = described.unstack()
- return result.to_frame().T.iloc[:0]
- with com.temp_setattr(self, "as_index", True):
- result = self._python_apply_general(
- lambda x: x.describe(
- percentiles=percentiles, include=include, exclude=exclude
- ),
- obj,
- not_indexed_same=True,
- )
- if self.axis == 1:
- return result.T
- # GH#49256 - properly handle the grouping column(s)
- result = result.unstack()
- if not self.as_index:
- result = self._insert_inaxis_grouper(result)
- result.index = default_index(len(result))
- return result
- @final
- def resample(self, rule, *args, **kwargs):
- """
- Provide resampling when using a TimeGrouper.
- Given a grouper, the function resamples it according to a frequency string.
- See the :ref:`frequency aliases <timeseries.offset_aliases>`
- documentation for more details.
- Parameters
- ----------
- rule : str or DateOffset
- The offset string or object representing target grouper conversion.
- *args, **kwargs
- Possible arguments are `how`, `fill_method`, `limit`, `kind` and
- `on`, and other arguments of `TimeGrouper`.
- Returns
- -------
- Grouper
- Return a new grouper with our resampler appended.
- See Also
- --------
- Grouper : Specify a frequency to resample with when
- grouping by a key.
- DatetimeIndex.resample : Frequency conversion and resampling of
- time series.
- Examples
- --------
- >>> idx = pd.date_range('1/1/2000', periods=4, freq='T')
- >>> df = pd.DataFrame(data=4 * [range(2)],
- ... index=idx,
- ... columns=['a', 'b'])
- >>> df.iloc[2, 0] = 5
- >>> df
- a b
- 2000-01-01 00:00:00 0 1
- 2000-01-01 00:01:00 0 1
- 2000-01-01 00:02:00 5 1
- 2000-01-01 00:03:00 0 1
- Downsample the DataFrame into 3 minute bins and sum the values of
- the timestamps falling into a bin.
- >>> df.groupby('a').resample('3T').sum()
- a b
- a
- 0 2000-01-01 00:00:00 0 2
- 2000-01-01 00:03:00 0 1
- 5 2000-01-01 00:00:00 5 1
- Upsample the series into 30 second bins.
- >>> df.groupby('a').resample('30S').sum()
- a b
- a
- 0 2000-01-01 00:00:00 0 1
- 2000-01-01 00:00:30 0 0
- 2000-01-01 00:01:00 0 1
- 2000-01-01 00:01:30 0 0
- 2000-01-01 00:02:00 0 0
- 2000-01-01 00:02:30 0 0
- 2000-01-01 00:03:00 0 1
- 5 2000-01-01 00:02:00 5 1
- Resample by month. Values are assigned to the month of the period.
- >>> df.groupby('a').resample('M').sum()
- a b
- a
- 0 2000-01-31 0 3
- 5 2000-01-31 5 1
- Downsample the series into 3 minute bins as above, but close the right
- side of the bin interval.
- >>> df.groupby('a').resample('3T', closed='right').sum()
- a b
- a
- 0 1999-12-31 23:57:00 0 1
- 2000-01-01 00:00:00 0 2
- 5 2000-01-01 00:00:00 5 1
- Downsample the series into 3 minute bins and close the right side of
- the bin interval, but label each bin using the right edge instead of
- the left.
- >>> df.groupby('a').resample('3T', closed='right', label='right').sum()
- a b
- a
- 0 2000-01-01 00:00:00 0 1
- 2000-01-01 00:03:00 0 2
- 5 2000-01-01 00:03:00 5 1
- """
- from pandas.core.resample import get_resampler_for_grouping
- return get_resampler_for_grouping(self, rule, *args, **kwargs)
- @final
- def rolling(self, *args, **kwargs) -> RollingGroupby:
- """
- Return a rolling grouper, providing rolling functionality per group.
- Parameters
- ----------
- window : int, timedelta, str, offset, or BaseIndexer subclass
- Size of the moving window.
- If an integer, the fixed number of observations used for
- each window.
- If a timedelta, str, or offset, the time period of each window. Each
- window will be variable sized, based on the observations included in
- the time period. This is only valid for datetimelike indexes.
- To learn more about the offsets & frequency strings, please see `this link
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.
- If a BaseIndexer subclass, the window boundaries are computed
- based on the defined ``get_window_bounds`` method. Additional rolling
- keyword arguments, namely ``min_periods``, ``center``, ``closed`` and
- ``step`` will be passed to ``get_window_bounds``.
- min_periods : int, default None
- Minimum number of observations in window required to have a value;
- otherwise, result is ``np.nan``.
- For a window that is specified by an offset,
- ``min_periods`` will default to 1.
- For a window that is specified by an integer, ``min_periods`` will default
- to the size of the window.
- center : bool, default False
- If False, set the window labels as the right edge of the window index.
- If True, set the window labels as the center of the window index.
- win_type : str, default None
- If ``None``, all points are evenly weighted.
- If a string, it must be a valid `scipy.signal window function
- <https://docs.scipy.org/doc/scipy/reference/signal.windows.html#module-scipy.signal.windows>`__.
- Certain Scipy window types require additional parameters to be passed
- in the aggregation function. The additional parameters must match
- the keywords specified in the Scipy window type method signature.
- on : str, optional
- For a DataFrame, a column label or Index level on which
- to calculate the rolling window, rather than the DataFrame's index.
- Provided integer column is ignored and excluded from result since
- an integer index is not used to calculate the rolling window.
- axis : int or str, default 0
- If ``0`` or ``'index'``, roll across the rows.
- If ``1`` or ``'columns'``, roll across the columns.
- For `Series` this parameter is unused and defaults to 0.
- closed : str, default None
- If ``'right'``, the first point in the window is excluded from calculations.
- If ``'left'``, the last point in the window is excluded from calculations.
- If ``'both'``, no points in the window are excluded from calculations.
- If ``'neither'``, the first and last points in the window are excluded
- from calculations.
- Default ``None`` (``'right'``).
- method : str {'single', 'table'}, default 'single'
- Execute the rolling operation per single column or row (``'single'``)
- or over the entire object (``'table'``).
- This argument is only implemented when specifying ``engine='numba'``
- in the method call.
- Returns
- -------
- RollingGroupby
- Return a new grouper with our rolling appended.
- See Also
- --------
- Series.rolling : Calling object with Series data.
- DataFrame.rolling : Calling object with DataFrames.
- Series.groupby : Apply a function groupby to a Series.
- DataFrame.groupby : Apply a function groupby.
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 1, 2, 2],
- ... 'B': [1, 2, 3, 4],
- ... 'C': [0.362, 0.227, 1.267, -0.562]})
- >>> df
- A B C
- 0 1 1 0.362
- 1 1 2 0.227
- 2 2 3 1.267
- 3 2 4 -0.562
- >>> df.groupby('A').rolling(2).sum()
- B C
- A
- 1 0 NaN NaN
- 1 3.0 0.589
- 2 2 NaN NaN
- 3 7.0 0.705
- >>> df.groupby('A').rolling(2, min_periods=1).sum()
- B C
- A
- 1 0 1.0 0.362
- 1 3.0 0.589
- 2 2 3.0 1.267
- 3 7.0 0.705
- >>> df.groupby('A').rolling(2, on='B').sum()
- B C
- A
- 1 0 1 NaN
- 1 2 0.589
- 2 2 3 NaN
- 3 4 0.705
- """
- from pandas.core.window import RollingGroupby
- return RollingGroupby(
- self._selected_obj,
- *args,
- _grouper=self.grouper,
- _as_index=self.as_index,
- **kwargs,
- )
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def expanding(self, *args, **kwargs) -> ExpandingGroupby:
- """
- Return an expanding grouper, providing expanding
- functionality per group.
- """
- from pandas.core.window import ExpandingGroupby
- return ExpandingGroupby(
- self._selected_obj,
- *args,
- _grouper=self.grouper,
- **kwargs,
- )
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def ewm(self, *args, **kwargs) -> ExponentialMovingWindowGroupby:
- """
- Return an ewm grouper, providing ewm functionality per group.
- """
- from pandas.core.window import ExponentialMovingWindowGroupby
- return ExponentialMovingWindowGroupby(
- self._selected_obj,
- *args,
- _grouper=self.grouper,
- **kwargs,
- )
- @final
- def _fill(self, direction: Literal["ffill", "bfill"], limit=None):
- """
- Shared function for `pad` and `backfill` to call Cython method.
- Parameters
- ----------
- direction : {'ffill', 'bfill'}
- Direction passed to underlying Cython function. `bfill` will cause
- values to be filled backwards. `ffill` and any other values will
- default to a forward fill
- limit : int, default None
- Maximum number of consecutive values to fill. If `None`, this
- method will convert to -1 prior to passing to Cython
- Returns
- -------
- `Series` or `DataFrame` with filled values
- See Also
- --------
- pad : Forward fill the missing values in the dataset.
- backfill : Backward fill the missing values in the dataset.
- """
- # Need int value for Cython
- if limit is None:
- limit = -1
- ids, _, _ = self.grouper.group_info
- sorted_labels = np.argsort(ids, kind="mergesort").astype(np.intp, copy=False)
- if direction == "bfill":
- sorted_labels = sorted_labels[::-1]
- col_func = partial(
- libgroupby.group_fillna_indexer,
- labels=ids,
- sorted_labels=sorted_labels,
- direction=direction,
- limit=limit,
- dropna=self.dropna,
- )
- def blk_func(values: ArrayLike) -> ArrayLike:
- mask = isna(values)
- if values.ndim == 1:
- indexer = np.empty(values.shape, dtype=np.intp)
- col_func(out=indexer, mask=mask)
- return algorithms.take_nd(values, indexer)
- else:
- # We broadcast algorithms.take_nd analogous to
- # np.take_along_axis
- # Note: we only get here with backfill/pad,
- # so if we have a dtype that cannot hold NAs,
- # then there will be no -1s in indexer, so we can use
- # the original dtype (no need to ensure_dtype_can_hold_na)
- if isinstance(values, np.ndarray):
- dtype = values.dtype
- if self.grouper.has_dropped_na:
- # dropped null groups give rise to nan in the result
- dtype = ensure_dtype_can_hold_na(values.dtype)
- out = np.empty(values.shape, dtype=dtype)
- else:
- out = type(values)._empty(values.shape, dtype=values.dtype)
- for i, value_element in enumerate(values):
- # call group_fillna_indexer column-wise
- indexer = np.empty(values.shape[1], dtype=np.intp)
- col_func(out=indexer, mask=mask[i])
- out[i, :] = algorithms.take_nd(value_element, indexer)
- return out
- mgr = self._get_data_to_aggregate()
- res_mgr = mgr.apply(blk_func)
- new_obj = self._wrap_agged_manager(res_mgr)
- if self.axis == 1:
- # Only relevant for DataFrameGroupBy
- new_obj = new_obj.T
- new_obj.columns = self.obj.columns
- new_obj.index = self.obj.index
- return new_obj
- @final
- @Substitution(name="groupby")
- def ffill(self, limit=None):
- """
- Forward fill the values.
- Parameters
- ----------
- limit : int, optional
- Limit of how many values to fill.
- Returns
- -------
- Series or DataFrame
- Object with missing values filled.
- See Also
- --------
- Series.ffill: Forward fill the missing values in the dataset.
- DataFrame.ffill: Object with missing values filled or None if inplace=True.
- Series.fillna: Fill NaN values of a Series.
- DataFrame.fillna: Fill NaN values of a DataFrame.
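- Examples
- --------
- A minimal illustrative example; fills do not cross group boundaries,
- and the grouping column is excluded from the result:
- >>> df = pd.DataFrame({'key': [0, 0, 1, 1], 'A': [np.nan, 2.0, 3.0, np.nan]})
- >>> df.groupby('key').ffill()
- A
- 0 NaN
- 1 2.0
- 2 3.0
- 3 3.0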
- """
- return self._fill("ffill", limit=limit)
- @final
- @Substitution(name="groupby")
- def bfill(self, limit=None):
- """
- Backward fill the values.
- Parameters
- ----------
- limit : int, optional
- Limit of how many values to fill.
- Returns
- -------
- Series or DataFrame
- Object with missing values filled.
- See Also
- --------
- Series.bfill : Backward fill the missing values in the dataset.
- DataFrame.bfill: Backward fill the missing values in the dataset.
- Series.fillna: Fill NaN values of a Series.
- DataFrame.fillna: Fill NaN values of a DataFrame.
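- Examples
- --------
- A minimal illustrative example; each missing value is filled from the
- next valid value within its own group:
- >>> ser = pd.Series([None, 1.0, None, 3.0], index=['a', 'a', 'b', 'b'])
- >>> ser.groupby(level=0).bfill()
- a 1.0
- a 1.0
- b 3.0
- b 3.0
- dtype: float64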
- """
- return self._fill("bfill", limit=limit)
- @final
- @property
- @Substitution(name="groupby")
- @Substitution(see_also=_common_see_also)
- def nth(self) -> GroupByNthSelector:
- """
- Take the nth row from each group if n is an int, otherwise a subset of rows.
- Can be either a call or an index. dropna is not available with index notation.
- Index notation accepts a comma-separated list of integers and slices.
- If dropna, will take the nth non-null row; dropna is either
- 'all' or 'any', and is equivalent to calling dropna(how=dropna)
- before the groupby.
- Parameters
- ----------
- n : int, slice or list of ints and slices
- A single nth value for the row or a list of nth values or slices.
- .. versionchanged:: 1.4.0
- Added slice and lists containing slices.
- Added index notation.
- dropna : {'any', 'all', None}, default None
- Apply the specified dropna operation before counting which row is
- the nth row. Only supported if n is an int.
- Returns
- -------
- Series or DataFrame
- N-th value within each group.
- %(see_also)s
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 1, 2, 1, 2],
- ... 'B': [np.nan, 2, 3, 4, 5]}, columns=['A', 'B'])
- >>> g = df.groupby('A')
- >>> g.nth(0)
- A B
- 0 1 NaN
- 2 2 3.0
- >>> g.nth(1)
- A B
- 1 1 2.0
- 4 2 5.0
- >>> g.nth(-1)
- A B
- 3 1 4.0
- 4 2 5.0
- >>> g.nth([0, 1])
- A B
- 0 1 NaN
- 1 1 2.0
- 2 2 3.0
- 4 2 5.0
- >>> g.nth(slice(None, -1))
- A B
- 0 1 NaN
- 1 1 2.0
- 2 2 3.0
- Index notation may also be used
- >>> g.nth[0, 1]
- A B
- 0 1 NaN
- 1 1 2.0
- 2 2 3.0
- 4 2 5.0
- >>> g.nth[:-1]
- A B
- 0 1 NaN
- 1 1 2.0
- 2 2 3.0
- Specifying `dropna` allows ignoring ``NaN`` values
- >>> g.nth(0, dropna='any')
- A B
- 1 1 2.0
- 2 2 3.0
- When the specified ``n`` is larger than any of the groups, an
- empty DataFrame is returned
- >>> g.nth(3, dropna='any')
- Empty DataFrame
- Columns: [A, B]
- Index: []
- """
- return GroupByNthSelector(self)
- def _nth(
- self,
- n: PositionalIndexer | tuple,
- dropna: Literal["any", "all", None] = None,
- ) -> NDFrameT:
- if not dropna:
- mask = self._make_mask_from_positional_indexer(n)
- ids, _, _ = self.grouper.group_info
- # Drop NA values in grouping
- mask = mask & (ids != -1)
- out = self._mask_selected_obj(mask)
- return out
- # dropna is truthy
- if not is_integer(n):
- raise ValueError("dropna option only supported for an integer argument")
- if dropna not in ["any", "all"]:
- # Note: when aggregating, the picker doesn't raise this, it just returns NaN
- raise ValueError(
- "For a DataFrame or Series groupby.nth, dropna must be "
- "either None, 'any' or 'all', "
- f"(was passed {dropna})."
- )
- # old behaviour, but with all and any support for DataFrames.
- # modified in GH 7559 to have better perf
- n = cast(int, n)
- dropped = self.obj.dropna(how=dropna, axis=self.axis)
- # get a new grouper for our dropped obj
- if self.keys is None and self.level is None:
- # we don't have the grouper info available
- # (e.g. we have selected out
- # a column that is not in the current object)
- axis = self.grouper.axis
- grouper = self.grouper.codes_info[axis.isin(dropped.index)]
- if self.grouper.has_dropped_na:
- # Null groups need to still be encoded as -1 when passed to groupby
- nulls = grouper == -1
- # error: No overload variant of "where" matches argument types
- # "Any", "NAType", "Any"
- values = np.where(nulls, NA, grouper) # type: ignore[call-overload]
- grouper = Index(values, dtype="Int64") # type: ignore[assignment]
- else:
- # create a grouper with the original parameters, but on dropped
- # object
- grouper, _, _ = get_grouper( # type: ignore[assignment]
- dropped,
- key=self.keys,
- axis=self.axis,
- level=self.level,
- sort=self.sort,
- )
- grb = dropped.groupby(
- grouper, as_index=self.as_index, sort=self.sort, axis=self.axis
- )
- return grb.nth(n)
- @final
- def quantile(
- self,
- q: float | AnyArrayLike = 0.5,
- interpolation: str = "linear",
- numeric_only: bool = False,
- ):
- """
- Return group values at the given quantile, a la numpy.percentile.
- Parameters
- ----------
- q : float or array-like, default 0.5 (50% quantile)
- Value(s) between 0 and 1 providing the quantile(s) to compute.
- interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
- Method to use when the desired quantile falls between two points.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionadded:: 1.5.0
- .. versionchanged:: 2.0.0
- numeric_only now defaults to ``False``.
- Returns
- -------
- Series or DataFrame
- Return type determined by caller of GroupBy object.
- See Also
- --------
- Series.quantile : Similar method for Series.
- DataFrame.quantile : Similar method for DataFrame.
- numpy.percentile : NumPy method to compute qth percentile.
- Examples
- --------
- >>> df = pd.DataFrame([
- ... ['a', 1], ['a', 2], ['a', 3],
- ... ['b', 1], ['b', 3], ['b', 5]
- ... ], columns=['key', 'val'])
- >>> df.groupby('key').quantile()
- val
- key
- a 2.0
- b 3.0
- """
- def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
- if is_object_dtype(vals):
- raise TypeError(
- "'quantile' cannot be performed against 'object' dtypes!"
- )
- inference: DtypeObj | None = None
- if isinstance(vals, BaseMaskedArray) and is_numeric_dtype(vals.dtype):
- out = vals.to_numpy(dtype=float, na_value=np.nan)
- inference = vals.dtype
- elif is_integer_dtype(vals.dtype):
- if isinstance(vals, ExtensionArray):
- out = vals.to_numpy(dtype=float, na_value=np.nan)
- else:
- out = vals
- inference = np.dtype(np.int64)
- elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
- out = vals.to_numpy(dtype=float, na_value=np.nan)
- elif needs_i8_conversion(vals.dtype):
- inference = vals.dtype
- # In this case we need to delay the casting until after the
- # np.lexsort below.
- # error: Incompatible return value type (got
- # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any,
- # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any],
- # Optional[Union[dtype[Any], ExtensionDtype]]]")
- return vals, inference # type: ignore[return-value]
- elif isinstance(vals, ExtensionArray) and is_float_dtype(vals):
- inference = np.dtype(np.float64)
- out = vals.to_numpy(dtype=float, na_value=np.nan)
- else:
- out = np.asarray(vals)
- return out, inference
- def post_processor(
- vals: np.ndarray,
- inference: DtypeObj | None,
- result_mask: np.ndarray | None,
- orig_vals: ArrayLike,
- ) -> ArrayLike:
- if inference:
- # Check for edge case
- if isinstance(orig_vals, BaseMaskedArray):
- assert result_mask is not None # for mypy
- if interpolation in {"linear", "midpoint"} and not is_float_dtype(
- orig_vals
- ):
- return FloatingArray(vals, result_mask)
- else:
- # Item "ExtensionDtype" of "Union[ExtensionDtype, str,
- # dtype[Any], Type[object]]" has no attribute "numpy_dtype"
- # [union-attr]
- return type(orig_vals)(
- vals.astype(
- inference.numpy_dtype # type: ignore[union-attr]
- ),
- result_mask,
- )
- elif not (
- is_integer_dtype(inference)
- and interpolation in {"linear", "midpoint"}
- ):
- if needs_i8_conversion(inference):
- # error: Item "ExtensionArray" of "Union[ExtensionArray,
- # ndarray[Any, Any]]" has no attribute "_ndarray"
- vals = vals.astype("i8").view(
- orig_vals._ndarray.dtype # type: ignore[union-attr]
- )
- # error: Item "ExtensionArray" of "Union[ExtensionArray,
- # ndarray[Any, Any]]" has no attribute "_from_backing_data"
- return orig_vals._from_backing_data( # type: ignore[union-attr]
- vals
- )
- assert isinstance(inference, np.dtype) # for mypy
- return vals.astype(inference)
- return vals
- orig_scalar = is_scalar(q)
- if orig_scalar:
- # error: Incompatible types in assignment (expression has type "List[
- # Union[float, ExtensionArray, ndarray[Any, Any], Index, Series]]",
- # variable has type "Union[float, Union[Union[ExtensionArray, ndarray[
- # Any, Any]], Index, Series]]")
- q = [q] # type: ignore[assignment]
- qs = np.array(q, dtype=np.float64)
- ids, _, ngroups = self.grouper.group_info
- nqs = len(qs)
- func = partial(
- libgroupby.group_quantile, labels=ids, qs=qs, interpolation=interpolation
- )
- # Put '-1' (NaN) labels as the last group so it does not interfere
- # with the calculations. Note: length check avoids failure on empty
- # labels. In that case, the value doesn't matter
- na_label_for_sorting = ids.max() + 1 if len(ids) > 0 else 0
- labels_for_lexsort = np.where(ids == -1, na_label_for_sorting, ids)
- def blk_func(values: ArrayLike) -> ArrayLike:
- orig_vals = values
- if isinstance(values, BaseMaskedArray):
- mask = values._mask
- result_mask = np.zeros((ngroups, nqs), dtype=np.bool_)
- else:
- mask = isna(values)
- result_mask = None
- is_datetimelike = needs_i8_conversion(values.dtype)
- vals, inference = pre_processor(values)
- ncols = 1
- if vals.ndim == 2:
- ncols = vals.shape[0]
- shaped_labels = np.broadcast_to(
- labels_for_lexsort, (ncols, len(labels_for_lexsort))
- )
- else:
- shaped_labels = labels_for_lexsort
- out = np.empty((ncols, ngroups, nqs), dtype=np.float64)
- # Get an index of values sorted by values and then labels
- order = (vals, shaped_labels)
- sort_arr = np.lexsort(order).astype(np.intp, copy=False)
- if is_datetimelike:
- # This casting needs to happen after the lexsort in order
- # to ensure that NaTs are placed at the end and not the front
- vals = vals.view("i8").astype(np.float64)
- if vals.ndim == 1:
- # EA is always 1d
- func(
- out[0],
- values=vals,
- mask=mask,
- sort_indexer=sort_arr,
- result_mask=result_mask,
- )
- else:
- for i in range(ncols):
- func(out[i], values=vals[i], mask=mask[i], sort_indexer=sort_arr[i])
- if vals.ndim == 1:
- out = out.ravel("K")
- if result_mask is not None:
- result_mask = result_mask.ravel("K")
- else:
- out = out.reshape(ncols, ngroups * nqs)
- return post_processor(out, inference, result_mask, orig_vals)
- data = self._get_data_to_aggregate(numeric_only=numeric_only, name="quantile")
- res_mgr = data.grouped_reduce(blk_func)
- res = self._wrap_agged_manager(res_mgr)
- if orig_scalar:
- # Avoid expensive MultiIndex construction
- return self._wrap_aggregated_output(res)
- return self._wrap_aggregated_output(res, qs=qs)
- @final
- @Substitution(name="groupby")
- def ngroup(self, ascending: bool = True):
- """
- Number each group from 0 to the number of groups - 1.
- This is the enumerative complement of cumcount. Note that the
- numbers given to the groups match the order in which the groups
- would be seen when iterating over the groupby object, not the
- order they are first observed.
- Groups with missing keys (where `pd.isna()` is True) will be labeled with `NaN`
- and will be skipped from the count.
- Parameters
- ----------
- ascending : bool, default True
- If False, number in reverse, from number of groups - 1 to 0.
- Returns
- -------
- Series
- Unique numbers for each group.
- See Also
- --------
- .cumcount : Number the rows in each group.
- Examples
- --------
- >>> df = pd.DataFrame({"color": ["red", None, "red", "blue", "blue", "red"]})
- >>> df
- color
- 0 red
- 1 None
- 2 red
- 3 blue
- 4 blue
- 5 red
- >>> df.groupby("color").ngroup()
- 0 1.0
- 1 NaN
- 2 1.0
- 3 0.0
- 4 0.0
- 5 1.0
- dtype: float64
- >>> df.groupby("color", dropna=False).ngroup()
- 0 1
- 1 2
- 2 1
- 3 0
- 4 0
- 5 1
- dtype: int64
- >>> df.groupby("color", dropna=False).ngroup(ascending=False)
- 0 1
- 1 0
- 2 1
- 3 2
- 4 2
- 5 1
- dtype: int64
- """
- obj = self._obj_with_exclusions
- index = obj._get_axis(self.axis)
- comp_ids = self.grouper.group_info[0]
- dtype: type
- if self.grouper.has_dropped_na:
- comp_ids = np.where(comp_ids == -1, np.nan, comp_ids)
- dtype = np.float64
- else:
- dtype = np.int64
- if any(ping._passed_categorical for ping in self.grouper.groupings):
- # comp_ids reflect non-observed groups, we need only observed
- comp_ids = rank_1d(comp_ids, ties_method="dense") - 1
- result = self._obj_1d_constructor(comp_ids, index, dtype=dtype)
- if not ascending:
- result = self.ngroups - 1 - result
- return result
- @final
- @Substitution(name="groupby")
- def cumcount(self, ascending: bool = True):
- """
- Number each item in each group from 0 to the length of that group - 1.
- Essentially this is equivalent to
- .. code-block:: python
- self.apply(lambda x: pd.Series(np.arange(len(x)), x.index))
- Parameters
- ----------
- ascending : bool, default True
- If False, number in reverse, from length of group - 1 to 0.
- Returns
- -------
- Series
- Sequence number of each element within each group.
- See Also
- --------
- .ngroup : Number the groups themselves.
- Examples
- --------
- >>> df = pd.DataFrame([['a'], ['a'], ['a'], ['b'], ['b'], ['a']],
- ... columns=['A'])
- >>> df
- A
- 0 a
- 1 a
- 2 a
- 3 b
- 4 b
- 5 a
- >>> df.groupby('A').cumcount()
- 0 0
- 1 1
- 2 2
- 3 0
- 4 1
- 5 3
- dtype: int64
- >>> df.groupby('A').cumcount(ascending=False)
- 0 3
- 1 2
- 2 1
- 3 1
- 4 0
- 5 0
- dtype: int64
- """
- index = self._obj_with_exclusions._get_axis(self.axis)
- cumcounts = self._cumcount_array(ascending=ascending)
- return self._obj_1d_constructor(cumcounts, index)
- @final
- @Substitution(name="groupby")
- @Substitution(see_also=_common_see_also)
- def rank(
- self,
- method: str = "average",
- ascending: bool = True,
- na_option: str = "keep",
- pct: bool = False,
- axis: AxisInt = 0,
- ) -> NDFrameT:
- """
- Provide the rank of values within each group.
- Parameters
- ----------
- method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
- * average: average rank of group.
- * min: lowest rank in group.
- * max: highest rank in group.
- * first: ranks assigned in order they appear in the array.
- * dense: like 'min', but rank always increases by 1 between groups.
- ascending : bool, default True
- False for ranks by high (1) to low (N).
- na_option : {'keep', 'top', 'bottom'}, default 'keep'
- * keep: leave NA values where they are.
- * top: assign smallest rank to NA values.
- * bottom: assign largest rank to NA values.
- pct : bool, default False
- Compute percentage rank of data within each group.
- axis : int, default 0
- The axis of the object over which to compute the rank.
- Returns
- -------
- DataFrame with ranking of values within each group.
- %(see_also)s
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {
- ... "group": ["a", "a", "a", "a", "a", "b", "b", "b", "b", "b"],
- ... "value": [2, 4, 2, 3, 5, 1, 2, 4, 1, 5],
- ... }
- ... )
- >>> df
- group value
- 0 a 2
- 1 a 4
- 2 a 2
- 3 a 3
- 4 a 5
- 5 b 1
- 6 b 2
- 7 b 4
- 8 b 1
- 9 b 5
- >>> for method in ['average', 'min', 'max', 'dense', 'first']:
- ... df[f'{method}_rank'] = df.groupby('group')['value'].rank(method)
- >>> df
- group value average_rank min_rank max_rank dense_rank first_rank
- 0 a 2 1.5 1.0 2.0 1.0 1.0
- 1 a 4 4.0 4.0 4.0 3.0 4.0
- 2 a 2 1.5 1.0 2.0 1.0 2.0
- 3 a 3 3.0 3.0 3.0 2.0 3.0
- 4 a 5 5.0 5.0 5.0 4.0 5.0
- 5 b 1 1.5 1.0 2.0 1.0 1.0
- 6 b 2 3.0 3.0 3.0 2.0 3.0
- 7 b 4 4.0 4.0 4.0 3.0 4.0
- 8 b 1 1.5 1.0 2.0 1.0 2.0
- 9 b 5 5.0 5.0 5.0 4.0 5.0
- """
- if na_option not in {"keep", "top", "bottom"}:
- msg = "na_option must be one of 'keep', 'top', or 'bottom'"
- raise ValueError(msg)
- kwargs = {
- "ties_method": method,
- "ascending": ascending,
- "na_option": na_option,
- "pct": pct,
- }
- if axis != 0:
- # DataFrame uses different keyword name
- kwargs["method"] = kwargs.pop("ties_method")
- f = lambda x: x.rank(axis=axis, numeric_only=False, **kwargs)
- result = self._python_apply_general(
- f, self._selected_obj, is_transform=True
- )
- return result
- return self._cython_transform(
- "rank",
- numeric_only=False,
- axis=axis,
- **kwargs,
- )
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def cumprod(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT:
- """
- Cumulative product for each group.
- Returns
- -------
- Series or DataFrame
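- Examples
- --------
- A minimal illustrative example; the running product restarts at each
- group boundary:
- >>> ser = pd.Series([6, 2, 0], index=['a', 'a', 'b'])
- >>> ser.groupby(level=0).cumprod()
- a 6
- a 12
- b 0
- dtype: int64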
- """
- nv.validate_groupby_func("cumprod", args, kwargs, ["numeric_only", "skipna"])
- if axis != 0:
- f = lambda x: x.cumprod(axis=axis, **kwargs)
- return self._python_apply_general(f, self._selected_obj, is_transform=True)
- return self._cython_transform("cumprod", **kwargs)
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def cumsum(self, axis: Axis = 0, *args, **kwargs) -> NDFrameT:
- """
- Cumulative sum for each group.
- Returns
- -------
- Series or DataFrame
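- Examples
- --------
- A minimal illustrative example; the running sum restarts at each
- group boundary:
- >>> ser = pd.Series([6, 2, 0], index=['a', 'a', 'b'])
- >>> ser.groupby(level=0).cumsum()
- a 6
- a 8
- b 0
- dtype: int64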
- """
- nv.validate_groupby_func("cumsum", args, kwargs, ["numeric_only", "skipna"])
- if axis != 0:
- f = lambda x: x.cumsum(axis=axis, **kwargs)
- return self._python_apply_general(f, self._selected_obj, is_transform=True)
- return self._cython_transform("cumsum", **kwargs)
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def cummin(
- self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs
- ) -> NDFrameT:
- """
- Cumulative min for each group.
- Returns
- -------
- Series or DataFrame
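- Examples
- --------
- A minimal illustrative example; the running minimum restarts at each
- group boundary:
- >>> ser = pd.Series([1, 6, 2, 3, 0, 4], index=['a', 'a', 'a', 'b', 'b', 'b'])
- >>> ser.groupby(level=0).cummin()
- a 1
- a 1
- a 1
- b 3
- b 0
- b 0
- dtype: int64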
- """
- skipna = kwargs.get("skipna", True)
- if axis != 0:
- f = lambda x: np.minimum.accumulate(x, axis)
- obj = self._selected_obj
- if numeric_only:
- obj = obj._get_numeric_data()
- return self._python_apply_general(f, obj, is_transform=True)
- return self._cython_transform(
- "cummin", numeric_only=numeric_only, skipna=skipna
- )
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def cummax(
- self, axis: AxisInt = 0, numeric_only: bool = False, **kwargs
- ) -> NDFrameT:
- """
- Cumulative max for each group.
- Returns
- -------
- Series or DataFrame
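- Examples
- --------
- A minimal illustrative example; the running maximum restarts at each
- group boundary:
- >>> ser = pd.Series([1, 6, 2, 3, 1, 4], index=['a', 'a', 'a', 'b', 'b', 'b'])
- >>> ser.groupby(level=0).cummax()
- a 1
- a 6
- a 6
- b 3
- b 3
- b 4
- dtype: int64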
- """
- skipna = kwargs.get("skipna", True)
- if axis != 0:
- f = lambda x: np.maximum.accumulate(x, axis)
- obj = self._selected_obj
- if numeric_only:
- obj = obj._get_numeric_data()
- return self._python_apply_general(f, obj, is_transform=True)
- return self._cython_transform(
- "cummax", numeric_only=numeric_only, skipna=skipna
- )
- @final
- def _get_cythonized_result(
- self,
- base_func: Callable,
- cython_dtype: np.dtype,
- numeric_only: bool = False,
- needs_counts: bool = False,
- pre_processing=None,
- post_processing=None,
- how: str = "any_all",
- **kwargs,
- ):
- """
- Get result for Cythonized functions.
- Parameters
- ----------
- base_func : callable, Cythonized function to be called
- cython_dtype : np.dtype
- Type of the array that will be modified by the Cython call.
- numeric_only : bool, default False
- Whether only numeric datatypes should be computed
- needs_counts : bool, default False
- Whether the counts should be a part of the Cython call
- pre_processing : function, default None
- Function to be applied to `values` prior to passing to Cython.
- Function should return a tuple where the first element is the
- values to be passed to Cython and the second element is an optional
- type which the values should be converted to after being returned
- by the Cython operation. This function is also responsible for
- raising a TypeError if the values have an invalid type.
- post_processing : function, default None
- Function to be applied to the result of the Cython function. Should
- accept an array of values as the first argument and type inferences as
- its second argument, i.e. the signature should be (ndarray, Type). It
- is also passed a ``nullable`` keyword indicating whether the values are
- backed by a nullable (masked) array and, when ``how="std"`` and the
- values are masked, a ``result_mask`` keyword.
- how : str, default "any_all"
- Determines whether the any/all Cython interface or the std interface
- is used.
- **kwargs : dict
- Extra arguments to be passed back to Cython funcs
- Returns
- -------
- `Series` or `DataFrame` with the result of the Cython operation.
- """
- if post_processing and not callable(post_processing):
- raise ValueError("'post_processing' must be a callable!")
- if pre_processing and not callable(pre_processing):
- raise ValueError("'pre_processing' must be a callable!")
- grouper = self.grouper
- ids, _, ngroups = grouper.group_info
- base_func = partial(base_func, labels=ids)
- def blk_func(values: ArrayLike) -> ArrayLike:
- values = values.T
- ncols = 1 if values.ndim == 1 else values.shape[1]
- result: ArrayLike
- result = np.zeros(ngroups * ncols, dtype=cython_dtype)
- result = result.reshape((ngroups, ncols))
- func = partial(base_func, out=result)
- inferences = None
- if needs_counts:
- counts = np.zeros(ngroups, dtype=np.int64)
- func = partial(func, counts=counts)
- is_datetimelike = values.dtype.kind in ["m", "M"]
- vals = values
- if is_datetimelike and how == "std":
- vals = vals.view("i8")
- if pre_processing:
- vals, inferences = pre_processing(vals)
- vals = vals.astype(cython_dtype, copy=False)
- if vals.ndim == 1:
- vals = vals.reshape((-1, 1))
- func = partial(func, values=vals)
- if how != "std" or isinstance(values, BaseMaskedArray):
- mask = isna(values).view(np.uint8)
- if mask.ndim == 1:
- mask = mask.reshape(-1, 1)
- func = partial(func, mask=mask)
- if how != "std":
- is_nullable = isinstance(values, BaseMaskedArray)
- func = partial(func, nullable=is_nullable)
- elif isinstance(values, BaseMaskedArray):
- result_mask = np.zeros(result.shape, dtype=np.bool_)
- func = partial(func, result_mask=result_mask)
- # Call func to modify result in place
- if how == "std":
- func(**kwargs, is_datetimelike=is_datetimelike)
- else:
- func(**kwargs)
- if values.ndim == 1:
- assert result.shape[1] == 1, result.shape
- result = result[:, 0]
- if post_processing:
- pp_kwargs: dict[str, bool | np.ndarray] = {}
- pp_kwargs["nullable"] = isinstance(values, BaseMaskedArray)
- if how == "std" and pp_kwargs["nullable"]:
- pp_kwargs["result_mask"] = result_mask
- result = post_processing(result, inferences, **pp_kwargs)
- if how == "std" and is_datetimelike:
- values = cast("DatetimeArray | TimedeltaArray", values)
- unit = values.unit
- with warnings.catch_warnings():
- # suppress "RuntimeWarning: invalid value encountered in cast"
- warnings.filterwarnings("ignore")
- result = result.astype(np.int64, copy=False)
- result = result.view(f"m8[{unit}]")
- return result.T
- # Operate block-wise instead of column-by-column
- mgr = self._get_data_to_aggregate(numeric_only=numeric_only, name=how)
- res_mgr = mgr.grouped_reduce(blk_func)
- out = self._wrap_agged_manager(res_mgr)
- return self._wrap_aggregated_output(out)
- @final
- @Substitution(name="groupby")
- def shift(self, periods: int = 1, freq=None, axis: Axis = 0, fill_value=None):
- """
- Shift each group by ``periods`` observations.
- If ``freq`` is passed, the index will be increased using the periods and the freq.
- Parameters
- ----------
- periods : int, default 1
- Number of periods to shift.
- freq : str, optional
- Frequency string.
- axis : int, default 0
- Axis along which to shift.
- fill_value : optional
- The scalar value to use for newly introduced missing values.
- Returns
- -------
- Series or DataFrame
- Object shifted within each group.
- See Also
- --------
- Index.shift : Shift values of Index.
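- Examples
- --------
- A minimal illustration; the leading position in each group is filled
- with ``fill_value`` (NaN by default):
- >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"], "val": [1, 2, 3, 4]})
- >>> df.groupby("key")["val"].shift(1)
- 0 NaN
- 1 1.0
- 2 NaN
- 3 3.0
- Name: val, dtype: float64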
- """
- if freq is not None or axis != 0:
- f = lambda x: x.shift(periods, freq, axis, fill_value)
- return self._python_apply_general(f, self._selected_obj, is_transform=True)
- ids, _, ngroups = self.grouper.group_info
- res_indexer = np.zeros(len(ids), dtype=np.int64)
- libgroupby.group_shift_indexer(res_indexer, ids, ngroups, periods)
- obj = self._obj_with_exclusions
- res = obj._reindex_with_indexers(
- {self.axis: (obj.axes[self.axis], res_indexer)},
- fill_value=fill_value,
- allow_dups=True,
- )
- return res
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def diff(self, periods: int = 1, axis: AxisInt = 0) -> NDFrameT:
- """
- First discrete difference of element.
- Calculates the difference of each element compared with another
- element in the group (default is element in previous row).
- Parameters
- ----------
- periods : int, default 1
- Periods to shift for calculating difference, accepts negative values.
- axis : int, default 0
- Take difference over rows (0) or columns (1).
- Returns
- -------
- Series or DataFrame
- First differences.
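- Examples
- --------
- A minimal illustration; the first element of each group has no prior
- element to difference against, so it is NaN:
- >>> ser = pd.Series([1, 3, 6, 10], index=["a", "a", "b", "b"])
- >>> ser.groupby(level=0).diff()
- a NaN
- a 2.0
- b NaN
- b 4.0
- dtype: float64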
- """
- if axis != 0:
- return self.apply(lambda x: x.diff(periods=periods, axis=axis))
- obj = self._obj_with_exclusions
- shifted = self.shift(periods=periods, axis=axis)
- # GH45562 - to retain existing behavior and match behavior of Series.diff(),
- # int8 and int16 are coerced to float32 rather than float64.
- dtypes_to_f32 = ["int8", "int16"]
- if obj.ndim == 1:
- if obj.dtype in dtypes_to_f32:
- shifted = shifted.astype("float32")
- else:
- to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32]
- if len(to_coerce):
- shifted = shifted.astype({c: "float32" for c in to_coerce})
- return obj - shifted
- @final
- @Substitution(name="groupby")
- @Appender(_common_see_also)
- def pct_change(
- self,
- periods: int = 1,
- fill_method: FillnaOptions = "ffill",
- limit=None,
- freq=None,
- axis: Axis = 0,
- ):
- """
- Calculate pct_change of each value to previous entry in group.
- Returns
- -------
- Series or DataFrame
- Percentage changes within each group.
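- Examples
- --------
- A minimal illustration; each value is compared with the previous entry
- in its own group, so the first entry per group is NaN:
- >>> ser = pd.Series([1.0, 2.0, 4.0, 8.0], index=["a", "a", "b", "b"])
- >>> ser.groupby(level=0).pct_change()
- a NaN
- a 1.0
- b NaN
- b 1.0
- dtype: float64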
- """
- # TODO(GH#23918): Remove this conditional for SeriesGroupBy when
- # GH#23918 is fixed
- if freq is not None or axis != 0:
- f = lambda x: x.pct_change(
- periods=periods,
- fill_method=fill_method,
- limit=limit,
- freq=freq,
- axis=axis,
- )
- return self._python_apply_general(f, self._selected_obj, is_transform=True)
- if fill_method is None: # GH30463
- fill_method = "ffill"
- limit = 0
- filled = getattr(self, fill_method)(limit=limit)
- fill_grp = filled.groupby(
- self.grouper.codes, axis=self.axis, group_keys=self.group_keys
- )
- shifted = fill_grp.shift(periods=periods, freq=freq, axis=self.axis)
- return (filled / shifted) - 1
- @final
- @Substitution(name="groupby")
- @Substitution(see_also=_common_see_also)
- def head(self, n: int = 5) -> NDFrameT:
- """
- Return first n rows of each group.
- Similar to ``.apply(lambda x: x.head(n))``, but it returns a subset of rows
- from the original DataFrame with original index and order preserved
- (``as_index`` flag is ignored).
- Parameters
- ----------
- n : int
- If positive: number of entries to include from start of each group.
- If negative: number of entries to exclude from end of each group.
- Returns
- -------
- Series or DataFrame
- Subset of original Series or DataFrame as determined by n.
- %(see_also)s
- Examples
- --------
- >>> df = pd.DataFrame([[1, 2], [1, 4], [5, 6]],
- ... columns=['A', 'B'])
- >>> df.groupby('A').head(1)
- A B
- 0 1 2
- 2 5 6
- >>> df.groupby('A').head(-1)
- A B
- 0 1 2
- """
- mask = self._make_mask_from_positional_indexer(slice(None, n))
- return self._mask_selected_obj(mask)
- @final
- @Substitution(name="groupby")
- @Substitution(see_also=_common_see_also)
- def tail(self, n: int = 5) -> NDFrameT:
- """
- Return last n rows of each group.
- Similar to ``.apply(lambda x: x.tail(n))``, but it returns a subset of rows
- from the original DataFrame with original index and order preserved
- (``as_index`` flag is ignored).
- Parameters
- ----------
- n : int
- If positive: number of entries to include from end of each group.
- If negative: number of entries to exclude from start of each group.
- Returns
- -------
- Series or DataFrame
- Subset of original Series or DataFrame as determined by n.
- %(see_also)s
- Examples
- --------
- >>> df = pd.DataFrame([['a', 1], ['a', 2], ['b', 1], ['b', 2]],
- ... columns=['A', 'B'])
- >>> df.groupby('A').tail(1)
- A B
- 1 a 2
- 3 b 2
- >>> df.groupby('A').tail(-1)
- A B
- 1 a 2
- 3 b 2
- """
- if n:
- mask = self._make_mask_from_positional_indexer(slice(-n, None))
- else:
- mask = self._make_mask_from_positional_indexer([])
- return self._mask_selected_obj(mask)
- @final
- def _mask_selected_obj(self, mask: npt.NDArray[np.bool_]) -> NDFrameT:
- """
- Return _selected_obj with mask applied to the correct axis.
- Parameters
- ----------
- mask : np.ndarray[bool]
- Boolean mask to apply.
- Returns
- -------
- Series or DataFrame
- Filtered _selected_obj.
- """
- ids = self.grouper.group_info[0]
- mask = mask & (ids != -1)
- if self.axis == 0:
- return self._selected_obj[mask]
- else:
- return self._selected_obj.iloc[:, mask]
- @final
- def _reindex_output(
- self,
- output: OutputFrameOrSeries,
- fill_value: Scalar = np.NaN,
- qs: npt.NDArray[np.float64] | None = None,
- ) -> OutputFrameOrSeries:
- """
- If we have categorical groupers, we may want a fully re-indexed output:
- one whose index contains the cartesian product of all group levels,
- regardless of whether each combination was observed in the data. This
- expands the output space when there are unobserved groups.
- The method returns early, without modifying the input, if the number of
- groupings is less than 2, if ``self.observed`` is True, or if none of
- the groupers is categorical.
- Parameters
- ----------
- output : Series or DataFrame
- Object resulting from grouping and applying an operation.
- fill_value : scalar, default np.NaN
- Value to use for unobserved categories if self.observed is False.
- qs : np.ndarray[float64] or None, default None
- quantile values, only relevant for quantile.
- Returns
- -------
- Series or DataFrame
- Object (potentially) re-indexed to include all possible groups.
- """
- groupings = self.grouper.groupings
- if len(groupings) == 1:
- return output
- # if we only care about the observed values
- # we are done
- elif self.observed:
- return output
- # reindexing only applies to a Categorical grouper
- elif not any(
- isinstance(ping.grouping_vector, (Categorical, CategoricalIndex))
- for ping in groupings
- ):
- return output
- levels_list = [ping.group_index for ping in groupings]
- names = self.grouper.names
- if qs is not None:
- # error: Argument 1 to "append" of "list" has incompatible type
- # "ndarray[Any, dtype[floating[_64Bit]]]"; expected "Index"
- levels_list.append(qs) # type: ignore[arg-type]
- names = names + [None]
- index = MultiIndex.from_product(levels_list, names=names)
- if self.sort:
- index = index.sort_values()
- if self.as_index:
- # Always holds for SeriesGroupBy unless GH#36507 is implemented
- d = {
- self.obj._get_axis_name(self.axis): index,
- "copy": False,
- "fill_value": fill_value,
- }
- return output.reindex(**d) # type: ignore[arg-type]
- # GH 13204
- # Here, the categorical in-axis groupers, which need to be fully
- # expanded, are columns in `output`. An idea is to do:
- # output = output.set_index(self.grouper.names)
- # .reindex(index).reset_index()
- # but special care has to be taken because of possible not-in-axis
- # groupers.
- # So, we manually select and drop the in-axis grouper columns,
- # reindex `output`, and then reset the in-axis grouper columns.
- # Select in-axis groupers
- in_axis_grps = list(
- (i, ping.name) for (i, ping) in enumerate(groupings) if ping.in_axis
- )
- if len(in_axis_grps) > 0:
- g_nums, g_names = zip(*in_axis_grps)
- output = output.drop(labels=list(g_names), axis=1)
- # Set a temp index and reindex (possibly expanding)
- output = output.set_index(self.grouper.result_index).reindex(
- index, copy=False, fill_value=fill_value
- )
- # Reset in-axis grouper columns
- # (using level numbers `g_nums` because level names may not be unique)
- if len(in_axis_grps) > 0:
- output = output.reset_index(level=g_nums)
- return output.reset_index(drop=True)
- @final
- def sample(
- self,
- n: int | None = None,
- frac: float | None = None,
- replace: bool = False,
- weights: Sequence | Series | None = None,
- random_state: RandomState | None = None,
- ):
- """
- Return a random sample of items from each group.
- You can use `random_state` for reproducibility.
- .. versionadded:: 1.1.0
- Parameters
- ----------
- n : int, optional
- Number of items to return for each group. Cannot be used with
- `frac` and must be no larger than the smallest group unless
- `replace` is True. Default is one if `frac` is None.
- frac : float, optional
- Fraction of items to return. Cannot be used with `n`.
- replace : bool, default False
- Allow or disallow sampling of the same row more than once.
- weights : list-like, optional
- Default None results in equal probability weighting.
- If passed a list-like then values must have the same length as
- the underlying DataFrame or Series object and will be used as
- sampling probabilities after normalization within each group.
- Values must be non-negative with at least one positive element
- within each group.
- random_state : int, array-like, BitGenerator, np.random.RandomState, np.random.Generator, optional
- If int, array-like, or BitGenerator, seed for random number generator.
- If np.random.RandomState or np.random.Generator, use as given.
- .. versionchanged:: 1.4.0
- np.random.Generator objects now accepted
- Returns
- -------
- Series or DataFrame
- A new object of same type as caller containing items randomly
- sampled within each group from the caller object.
- See Also
- --------
- DataFrame.sample: Generate random samples from a DataFrame object.
- numpy.random.choice: Generate a random sample from a given 1-D numpy
- array.
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {"a": ["red"] * 2 + ["blue"] * 2 + ["black"] * 2, "b": range(6)}
- ... )
- >>> df
- a b
- 0 red 0
- 1 red 1
- 2 blue 2
- 3 blue 3
- 4 black 4
- 5 black 5
- Select one row at random for each distinct value in column a. The
- `random_state` argument can be used to guarantee reproducibility:
- >>> df.groupby("a").sample(n=1, random_state=1)
- a b
- 4 black 4
- 2 blue 2
- 1 red 1
- Set `frac` to sample fixed proportions rather than counts:
- >>> df.groupby("a")["b"].sample(frac=0.5, random_state=2)
- 5 5
- 2 2
- 0 0
- Name: b, dtype: int64
- Control sample probabilities within groups by setting weights:
- >>> df.groupby("a").sample(
- ... n=1,
- ... weights=[1, 1, 1, 0, 0, 1],
- ... random_state=1,
- ... )
- a b
- 5 black 5
- 2 blue 2
- 0 red 0
- """ # noqa:E501
- if self._selected_obj.empty:
- # GH48459 prevent ValueError when object is empty
- return self._selected_obj
- size = sample.process_sampling_size(n, frac, replace)
- if weights is not None:
- weights_arr = sample.preprocess_weights(
- self._selected_obj, weights, axis=self.axis
- )
- random_state = com.random_state(random_state)
- group_iterator = self.grouper.get_iterator(self._selected_obj, self.axis)
- sampled_indices = []
- for labels, obj in group_iterator:
- grp_indices = self.indices[labels]
- group_size = len(grp_indices)
- if size is not None:
- sample_size = size
- else:
- assert frac is not None
- sample_size = round(frac * group_size)
- grp_sample = sample.sample(
- group_size,
- size=sample_size,
- replace=replace,
- weights=None if weights is None else weights_arr[grp_indices],
- random_state=random_state,
- )
- sampled_indices.append(grp_indices[grp_sample])
- sampled_indices = np.concatenate(sampled_indices)
- return self._selected_obj.take(sampled_indices, axis=self.axis)
- @doc(GroupBy)
- def get_groupby(
- obj: NDFrame,
- by: _KeysArgType | None = None,
- axis: AxisInt = 0,
- grouper: ops.BaseGrouper | None = None,
- group_keys: bool = True,
- ) -> GroupBy:
- klass: type[GroupBy]
- if isinstance(obj, Series):
- from pandas.core.groupby.generic import SeriesGroupBy
- klass = SeriesGroupBy
- elif isinstance(obj, DataFrame):
- from pandas.core.groupby.generic import DataFrameGroupBy
- klass = DataFrameGroupBy
- else: # pragma: no cover
- raise TypeError(f"invalid type: {obj}")
- return klass(
- obj=obj,
- keys=by,
- axis=axis,
- grouper=grouper,
- group_keys=group_keys,
- )
- def _insert_quantile_level(idx: Index, qs: npt.NDArray[np.float64]) -> MultiIndex:
- """
- Insert the sequence 'qs' of quantiles as the inner-most level of a MultiIndex.
- The quantile level in the MultiIndex is a repeated copy of 'qs'.
- Parameters
- ----------
- idx : Index
- qs : np.ndarray[float64]
- Returns
- -------
- MultiIndex
- """
- nqs = len(qs)
- if idx._is_multi:
- idx = cast(MultiIndex, idx)
- lev_codes, lev = Index(qs).factorize()
- levels = list(idx.levels) + [lev]
- codes = [np.repeat(x, nqs) for x in idx.codes] + [np.tile(lev_codes, len(idx))]
- mi = MultiIndex(levels=levels, codes=codes, names=idx.names + [None])
- else:
- mi = MultiIndex.from_product([idx, qs])
- return mi