- """
- Define the SeriesGroupBy and DataFrameGroupBy
- classes that hold the groupby interfaces (and some implementations).
- These are user facing as the result of the ``df.groupby(...)`` operations,
- which here returns a DataFrameGroupBy object.
- """
from __future__ import annotations

from collections import abc
from functools import partial
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Iterable,
    Literal,
    Mapping,
    NamedTuple,
    Sequence,
    TypeVar,
    Union,
    cast,
)

import numpy as np

from pandas._libs import (
    Interval,
    lib,
    reduction as libreduction,
)
from pandas._typing import (
    ArrayLike,
    Axis,
    AxisInt,
    CorrelationMethod,
    FillnaOptions,
    IndexLabel,
    Manager,
    Manager2D,
    SingleManager,
    TakeIndexer,
)
from pandas.errors import SpecificationError
from pandas.util._decorators import (
    Appender,
    Substitution,
    doc,
)
from pandas.core.dtypes.common import (
    ensure_int64,
    is_bool,
    is_categorical_dtype,
    is_dict_like,
    is_integer_dtype,
    is_interval_dtype,
    is_numeric_dtype,
    is_scalar,
)
from pandas.core.dtypes.missing import (
    isna,
    notna,
)
from pandas.core import algorithms
from pandas.core.apply import (
    GroupByApply,
    maybe_mangle_lambdas,
    reconstruct_func,
    validate_func_kwargs,
)
import pandas.core.common as com
from pandas.core.frame import DataFrame
from pandas.core.groupby import base
from pandas.core.groupby.groupby import (
    GroupBy,
    GroupByPlot,
    _agg_template,
    _apply_docs,
    _transform_template,
)
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    all_indexes_same,
    default_index,
)
from pandas.core.series import Series
from pandas.core.util.numba_ import maybe_use_numba

from pandas.plotting import boxplot_frame_groupby
if TYPE_CHECKING:
    from pandas import Categorical
    from pandas.core.generic import NDFrame

# TODO(typing) the return value on this callable should be any *scalar*.
AggScalar = Union[str, Callable[..., Any]]
# TODO: validate types on ScalarResult and move to _typing
# Blocked from using by https://github.com/python/mypy/issues/1484
# See note at _mangle_lambda_list
ScalarResult = TypeVar("ScalarResult")


class NamedAgg(NamedTuple):
    """
    Helper for column-specific aggregation with control over output column names.

    Subclass of typing.NamedTuple.

    Parameters
    ----------
    column : Hashable
        Column label in the DataFrame to apply aggfunc.
    aggfunc : function or str
        Function to apply to the provided column. If string, the name of a built-in
        pandas function.

    Examples
    --------
    >>> df = pd.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]})
    >>> agg_a = pd.NamedAgg(column="a", aggfunc="min")
    >>> agg_1 = pd.NamedAgg(column=1, aggfunc=np.mean)
    >>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1)
         result_a  result_1
    key
    1          -1      10.5
    2           1      12.0
    """

    column: Hashable
    aggfunc: AggScalar


class SeriesGroupBy(GroupBy[Series]):
    def _wrap_agged_manager(self, mgr: Manager) -> Series:
        return self.obj._constructor(mgr, name=self.obj.name)

    def _get_data_to_aggregate(
        self, *, numeric_only: bool = False, name: str | None = None
    ) -> SingleManager:
        ser = self._selected_obj
        single = ser._mgr
        if numeric_only and not is_numeric_dtype(ser.dtype):
            # GH#41291 match Series behavior
            kwd_name = "numeric_only"
            raise TypeError(
                f"Cannot use {kwd_name}=True with "
                f"{type(self).__name__}.{name} and non-numeric dtypes."
            )
        return single

    def _iterate_slices(self) -> Iterable[Series]:
        yield self._selected_obj

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> s = pd.Series([1, 2, 3, 4])

    >>> s
    0    1
    1    2
    2    3
    3    4
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).min()
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg('min')
    1    1
    2    3
    dtype: int64

    >>> s.groupby([1, 1, 2, 2]).agg(['min', 'max'])
       min  max
    1    1    2
    2    3    4

    The output column names can be controlled by passing
    the desired column names and aggregations as keyword arguments.

    >>> s.groupby([1, 1, 2, 2]).agg(
    ...     minimum='min',
    ...     maximum='max',
    ... )
       minimum  maximum
    1        1        2
    2        3        4

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the
        aggregating function.

    >>> s.groupby([1, 1, 2, 2]).agg(lambda x: x.astype(float).min())
    1    1.0
    2    3.0
    dtype: float64
    """
    )

    @Appender(
        _apply_docs["template"].format(
            input="series", examples=_apply_docs["series_examples"]
        )
    )
    def apply(self, func, *args, **kwargs) -> Series:
        return super().apply(func, *args, **kwargs)

    @doc(_agg_template, examples=_agg_examples_doc, klass="Series")
    def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
        if maybe_use_numba(engine):
            return self._aggregate_with_numba(
                func, *args, engine_kwargs=engine_kwargs, **kwargs
            )

        relabeling = func is None
        columns = None
        if relabeling:
            columns, func = validate_func_kwargs(kwargs)
            kwargs = {}

        if isinstance(func, str):
            return getattr(self, func)(*args, **kwargs)

        elif isinstance(func, abc.Iterable):
            # Catch instances of lists / tuples
            # but not the class list / tuple itself.
            func = maybe_mangle_lambdas(func)
            ret = self._aggregate_multiple_funcs(func, *args, **kwargs)
            if relabeling:
                # columns is not narrowed by mypy from relabeling flag
                assert columns is not None  # for mypy
                ret.columns = columns
            if not self.as_index:
                ret = ret.reset_index()
            return ret

        else:
            cyfunc = com.get_cython_func(func)
            if cyfunc and not args and not kwargs:
                return getattr(self, cyfunc)()

            if self.ngroups == 0:
                # e.g. test_evaluate_with_empty_groups without any groups to
                #  iterate over, we have no output on which to do dtype
                #  inference. We default to using the existing dtype.
                #  xref GH#51445
                obj = self._obj_with_exclusions
                return self.obj._constructor(
                    [],
                    name=self.obj.name,
                    index=self.grouper.result_index,
                    dtype=obj.dtype,
                )

            if self.grouper.nkeys > 1:
                return self._python_agg_general(func, *args, **kwargs)

            try:
                return self._python_agg_general(func, *args, **kwargs)
            except KeyError:
                # KeyError raised in test_groupby.test_basic is because the func
                #  does a dictionary lookup on group.name, but group name is not
                #  pinned in _python_agg_general, only in _aggregate_named
                result = self._aggregate_named(func, *args, **kwargs)

                # result is a dict whose keys are the elements of result_index
                result = Series(result, index=self.grouper.result_index)
                result = self._wrap_aggregated_output(result)
                return result

    agg = aggregate

    def _python_agg_general(self, func, *args, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        obj = self._obj_with_exclusions
        result = self.grouper.agg_series(obj, f)
        res = obj._constructor(result, name=obj.name)
        return self._wrap_aggregated_output(res)

    def _aggregate_multiple_funcs(self, arg, *args, **kwargs) -> DataFrame:
        if isinstance(arg, dict):
            if self.as_index:
                # GH 15931
                raise SpecificationError("nested renamer is not supported")
            else:
                # GH#50684 - This accidentally worked in 1.x
                arg = list(arg.items())
        elif any(isinstance(x, (tuple, list)) for x in arg):
            arg = [(x, x) if not isinstance(x, (tuple, list)) else x for x in arg]
        else:
            # list of functions / function names
            columns = []
            for f in arg:
                columns.append(com.get_callable_name(f) or f)

            arg = zip(columns, arg)
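            # e.g. arg = ["min", "max"] is normalized by the zip above into
            #  (output label, func) pairs: [("min", "min"), ("max", "max")].
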
        results: dict[base.OutputKey, DataFrame | Series] = {}
        with com.temp_setattr(self, "as_index", True):
            # Combine results using the index, need to adjust index after
            #  if as_index=False (GH#50724)
            for idx, (name, func) in enumerate(arg):
                key = base.OutputKey(label=name, position=idx)
                results[key] = self.aggregate(func, *args, **kwargs)

        if any(isinstance(x, DataFrame) for x in results.values()):
            from pandas import concat

            res_df = concat(
                results.values(), axis=1, keys=[key.label for key in results]
            )
            return res_df

        indexed_output = {key.position: val for key, val in results.items()}
        output = self.obj._constructor_expanddim(indexed_output, index=None)
        output.columns = Index(key.label for key in results)

        return output

    def _wrap_applied_output(
        self,
        data: Series,
        values: list[Any],
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ) -> DataFrame | Series:
        """
        Wrap the output of SeriesGroupBy.apply into the expected result.

        Parameters
        ----------
        data : Series
            Input data for groupby operation.
        values : List[Any]
            Applied output for each group.
        not_indexed_same : bool, default False
            Whether the applied outputs are not indexed the same as the group axes.

        Returns
        -------
        DataFrame or Series
        """
        if len(values) == 0:
            # GH #6265
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self.grouper.result_index

            return self.obj._constructor(
                [],
                name=self.obj.name,
                index=res_index,
                dtype=data.dtype,
            )
        assert values is not None

        if isinstance(values[0], dict):
            # GH #823 #24880
            index = self.grouper.result_index
            res_df = self.obj._constructor_expanddim(values, index=index)
            res_df = self._reindex_output(res_df)
            # if self.observed is False,
            #  keep all-NaN rows created while re-indexing
            res_ser = res_df.stack(dropna=self.observed)
            res_ser.name = self.obj.name
            return res_ser
        elif isinstance(values[0], (Series, DataFrame)):
            result = self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )
            if isinstance(result, Series):
                result.name = self.obj.name
            if not self.as_index and not_indexed_same:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return result
        else:
            # GH #6265 #24880
            result = self.obj._constructor(
                data=values, index=self.grouper.result_index, name=self.obj.name
            )
            if not self.as_index:
                result = self._insert_inaxis_grouper(result)
                result.index = default_index(len(result))
            return self._reindex_output(result)

    def _aggregate_named(self, func, *args, **kwargs):
        # Note: this is very similar to _aggregate_series_pure_python,
        #  but that does not pin group.name
        result = {}
        initialized = False

        for name, group in self:
            object.__setattr__(group, "name", name)

            output = func(group, *args, **kwargs)
            output = libreduction.extract_result(output)
            if not initialized:
                # We only do this validation on the first iteration
                libreduction.check_result_array(output, group.dtype)
                initialized = True
            result[name] = output

        return result

    __examples_series_doc = dedent(
        """
    >>> ser = pd.Series(
    ...     [390.0, 350.0, 30.0, 20.0],
    ...     index=["Falcon", "Falcon", "Parrot", "Parrot"],
    ...     name="Max Speed")
    >>> grouped = ser.groupby([1, 1, 2, 2])
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
    Falcon    0.707107
    Falcon   -0.707107
    Parrot    0.707107
    Parrot   -0.707107
    Name: Max Speed, dtype: float64

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
    Falcon    40.0
    Falcon    40.0
    Parrot    10.0
    Parrot    10.0
    Name: Max Speed, dtype: float64

    >>> grouped.transform("mean")
    Falcon    370.0
    Falcon    370.0
    Parrot     25.0
    Parrot     25.0
    Name: Max Speed, dtype: float64

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
    Falcon    390
    Falcon    390
    Parrot     30
    Parrot     30
    Name: Max Speed, dtype: int64
    """
    )

- @Substitution(klass="Series", example=__examples_series_doc)
- @Appender(_transform_template)
- def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
- return self._transform(
- func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
- )
    def _cython_transform(
        self, how: str, numeric_only: bool = False, axis: AxisInt = 0, **kwargs
    ):
        assert axis == 0  # handled by caller

        obj = self._selected_obj

        try:
            result = self.grouper._cython_operation(
                "transform", obj._values, how, axis, **kwargs
            )
        except NotImplementedError as err:
            # e.g. test_groupby_raises_string
            raise TypeError(f"{how} is not supported for {obj.dtype} dtype") from err

        return obj._constructor(result, index=self.obj.index, name=obj.name)

    def _transform_general(self, func: Callable, *args, **kwargs) -> Series:
        """
        Transform with a callable ``func``.
        """
        assert callable(func)
        klass = type(self.obj)
        results = []
        for name, group in self.grouper.get_iterator(
            self._selected_obj, axis=self.axis
        ):
            # this setattr is needed for test_transform_lambda_with_datetimetz
            object.__setattr__(group, "name", name)
            res = func(group, *args, **kwargs)

            results.append(klass(res, index=group.index))

        # check for empty "results" to avoid concat ValueError
        if results:
            from pandas.core.reshape.concat import concat

            concatenated = concat(results)
            result = self._set_result_index_ordered(concatenated)
        else:
            result = self.obj._constructor(dtype=np.float64)

        result.name = self.obj.name
        return result

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        Series

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> df.groupby('A').B.filter(lambda x: x.mean() > 3.)
        1    2
        3    4
        5    6
        Name: B, dtype: int64
        """
        if isinstance(func, str):
            wrapper = lambda x: getattr(x, func)(*args, **kwargs)
        else:
            wrapper = lambda x: func(x, *args, **kwargs)

        # Interpret np.nan as False.
        def true_and_notna(x) -> bool:
            b = wrapper(x)
            return notna(b) and b
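
        # e.g. a criterion whose mean() is NaN for an all-NaN group yields np.nan,
        #  which true_and_notna treats as False, so that group is filtered out.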
        try:
            indices = [
                self._get_index(name) for name, group in self if true_and_notna(group)
            ]
        except (ValueError, TypeError) as err:
            raise TypeError("the filter must return a boolean result") from err

        filtered = self._apply_filter(indices, dropna)
        return filtered

    def nunique(self, dropna: bool = True) -> Series | DataFrame:
        """
        Return number of unique elements in the group.

        Returns
        -------
        Series
            Number of unique values within each group.
        """
        ids, _, _ = self.grouper.group_info

        val = self.obj._values

        codes, _ = algorithms.factorize(val, sort=False)
        sorter = np.lexsort((codes, ids))
        codes = codes[sorter]
        ids = ids[sorter]

        # group boundaries are where group ids change
        # unique observations are where sorted values change
        idx = np.r_[0, 1 + np.nonzero(ids[1:] != ids[:-1])[0]]
        inc = np.r_[1, codes[1:] != codes[:-1]]

        # 1st item of each group is a new unique observation
        mask = codes == -1
        if dropna:
            inc[idx] = 1
            inc[mask] = 0
        else:
            inc[mask & np.r_[False, mask[:-1]]] = 0
            inc[idx] = 1

        out = np.add.reduceat(inc, idx).astype("int64", copy=False)
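        # e.g. after sorting, ids = [0, 0, 0, 1, 1] with codes = [2, 2, 3, 0, 0]
        #  gives idx = [0, 3] (group starts) and inc = [1, 0, 1, 1, 0], so
        #  np.add.reduceat(inc, idx) counts [2, 1] unique values per group.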
        if len(ids):
            # NaN/NaT group exists if the head of ids is -1,
            #  so remove it from res and exclude its index from idx
            if ids[0] == -1:
                res = out[1:]
                idx = idx[np.flatnonzero(idx)]
            else:
                res = out
        else:
            res = out[1:]

        ri = self.grouper.result_index

        # we might have duplications among the bins
        if len(res) != len(ri):
            res, out = np.zeros(len(ri), dtype=out.dtype), res
            if len(ids) > 0:
                # GH#21334
                res[ids[idx]] = out

        result: Series | DataFrame = self.obj._constructor(
            res, index=ri, name=self.obj.name
        )
        if not self.as_index:
            result = self._insert_inaxis_grouper(result)
            result.index = default_index(len(result))
        return self._reindex_output(result, fill_value=0)

    @doc(Series.describe)
    def describe(self, **kwargs):
        return super().describe(**kwargs)

    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series | DataFrame:
        name = "proportion" if normalize else "count"

        if bins is None:
            result = self._value_counts(
                normalize=normalize, sort=sort, ascending=ascending, dropna=dropna
            )
            result.name = name
            return result

        from pandas.core.reshape.merge import get_join_indexers
        from pandas.core.reshape.tile import cut

        ids, _, _ = self.grouper.group_info
        val = self.obj._values

        index_names = self.grouper.names + [self.obj.name]

        if is_categorical_dtype(val.dtype) or (
            bins is not None and not np.iterable(bins)
        ):
            # scalar bins cannot be done at top level
            #  in a backward compatible way
            # GH38672 relates to categorical dtype
            ser = self.apply(
                Series.value_counts,
                normalize=normalize,
                sort=sort,
                ascending=ascending,
                bins=bins,
            )
            ser.name = name
            ser.index.names = index_names
            return ser

        # groupby removes null keys from groupings
        mask = ids != -1
        ids, val = ids[mask], val[mask]

        if bins is None:
            lab, lev = algorithms.factorize(val, sort=True)
            llab = lambda lab, inc: lab[inc]
        else:
            # lab is a Categorical with categories an IntervalIndex
            cat_ser = cut(Series(val, copy=False), bins, include_lowest=True)
            cat_obj = cast("Categorical", cat_ser._values)
            lev = cat_obj.categories
            lab = lev.take(
                cat_obj.codes,
                allow_fill=True,
                fill_value=lev._na_value,
            )
            llab = lambda lab, inc: lab[inc]._multiindex.codes[-1]

        if is_interval_dtype(lab.dtype):
            # TODO: should we do this inside II?
            lab_interval = cast(Interval, lab)

            sorter = np.lexsort((lab_interval.left, lab_interval.right, ids))
        else:
            sorter = np.lexsort((lab, ids))

        ids, lab = ids[sorter], lab[sorter]

        # group boundaries are where group ids change
        idchanges = 1 + np.nonzero(ids[1:] != ids[:-1])[0]
        idx = np.r_[0, idchanges]
        if not len(ids):
            idx = idchanges

        # new values are where sorted labels change
        lchanges = llab(lab, slice(1, None)) != llab(lab, slice(None, -1))
        inc = np.r_[True, lchanges]
        if not len(val):
            inc = lchanges
        inc[idx] = True  # group boundaries are also new values
        out = np.diff(np.nonzero(np.r_[inc, True])[0])  # value counts
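        # e.g. inc = [True, False, True, True] marks where a new (group, value)
        #  run starts; np.diff over the positions of True (plus a sentinel) then
        #  yields the run lengths [2, 1, 1], i.e. the per-value counts.
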
        # num. of times each group should be repeated
        rep = partial(np.repeat, repeats=np.add.reduceat(inc, idx))

        # multi-index components
        codes = self.grouper.reconstructed_codes
        codes = [rep(level_codes) for level_codes in codes] + [llab(lab, inc)]
        levels = [ping.group_index for ping in self.grouper.groupings] + [lev]

        if dropna:
            mask = codes[-1] != -1
            if mask.all():
                dropna = False
            else:
                out, codes = out[mask], [level_codes[mask] for level_codes in codes]

        if normalize:
            out = out.astype("float")
            d = np.diff(np.r_[idx, len(ids)])
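            # d is each group's total size: the distance between consecutive
            #  group-start positions, with len(ids) appended as a sentinel.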
            if dropna:
                m = ids[lab == -1]
                np.add.at(d, m, -1)
                acc = rep(d)[mask]
            else:
                acc = rep(d)
            out /= acc

        if sort and bins is None:
            cat = ids[inc][mask] if dropna else ids[inc]
            sorter = np.lexsort((out if ascending else -out, cat))
            out, codes[-1] = out[sorter], codes[-1][sorter]

        if bins is not None:
            # for compat. with libgroupby.value_counts need to ensure every
            #  bin is present at every index level, null filled with zeros
            diff = np.zeros(len(out), dtype="bool")
            for level_codes in codes[:-1]:
                diff |= np.r_[True, level_codes[1:] != level_codes[:-1]]

            ncat, nbin = diff.sum(), len(levels[-1])

            left = [np.repeat(np.arange(ncat), nbin), np.tile(np.arange(nbin), ncat)]

            right = [diff.cumsum() - 1, codes[-1]]
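            # left enumerates the full (group, bin) grid, right holds only the
            #  observed pairs; the left join below fills the missing bins with 0.
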
            _, idx = get_join_indexers(left, right, sort=False, how="left")
            out = np.where(idx != -1, out[idx], 0)

            if sort:
                sorter = np.lexsort((out if ascending else -out, left[0]))
                out, left[-1] = out[sorter], left[-1][sorter]

            # build the multi-index w/ full levels
            def build_codes(lev_codes: np.ndarray) -> np.ndarray:
                return np.repeat(lev_codes[diff], nbin)

            codes = [build_codes(lev_codes) for lev_codes in codes[:-1]]
            codes.append(left[-1])

        mi = MultiIndex(
            levels=levels, codes=codes, names=index_names, verify_integrity=False
        )

        if is_integer_dtype(out.dtype):
            out = ensure_int64(out)
        result = self.obj._constructor(out, index=mi, name=name)
        if not self.as_index:
            result = result.reset_index()
        return result

    def fillna(
        self,
        value: object | ArrayLike | None = None,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit: int | None = None,
        downcast: dict | None = None,
    ) -> Series | None:
        """
        Fill NA/NaN values using the specified method within groups.

        Parameters
        ----------
        value : scalar, dict, Series, or DataFrame
            Value to use to fill holes (e.g. 0), alternately a
            dict/Series/DataFrame of values specifying which value to use for
            each index (for a Series) or column (for a DataFrame). Values not
            in the dict/Series/DataFrame will not be filled. This value cannot
            be a list. Users wanting to use the ``value`` argument and not
            ``method`` should prefer :meth:`.Series.fillna` as this
            will produce the same result and be more performant.
        method : {'bfill', 'ffill', None}, default None
            Method to use for filling holes. ``'ffill'`` will propagate
            the last valid observation forward within a group.
            ``'bfill'`` will use next valid observation to fill the gap.
        axis : {0 or 'index', 1 or 'columns'}
            Unused, only for compatibility with :meth:`DataFrameGroupBy.fillna`.
        inplace : bool, default False
            Broken. Do not set to True.
        limit : int, default None
            If method is specified, this is the maximum number of consecutive
            NaN values to forward/backward fill within a group. In other words,
            if there is a gap with more than this number of consecutive NaNs,
            it will only be partially filled. If method is not specified, this is the
            maximum number of entries along the entire axis where NaNs will be
            filled. Must be greater than 0 if not None.
        downcast : dict, default is None
            A dict of item->dtype of what to downcast if possible,
            or the string 'infer' which will try to downcast to an appropriate
            equal type (e.g. float64 to int64 if possible).

        Returns
        -------
        Series
            Object with missing values filled within groups.

        See Also
        --------
        ffill : Forward fill values within a group.
        bfill : Backward fill values within a group.

        Examples
        --------
        >>> ser = pd.Series([np.nan, np.nan, 2, 3, np.nan, np.nan])
        >>> ser
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Propagate non-null values forward or backward within each group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill")
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    3.0
        dtype: float64

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="bfill")
        0    2.0
        1    2.0
        2    2.0
        3    3.0
        4    NaN
        5    NaN
        dtype: float64

        Only replace the first NaN element within a group.

        >>> ser.groupby([0, 0, 0, 1, 1, 1]).fillna(method="ffill", limit=1)
        0    NaN
        1    NaN
        2    2.0
        3    3.0
        4    3.0
        5    NaN
        dtype: float64
        """
        result = self._op_via_apply(
            "fillna",
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
        return result

    def take(
        self,
        indices: TakeIndexer,
        axis: Axis = 0,
        **kwargs,
    ) -> Series:
        """
        Return the elements in the given *positional* indices in each group.

        This means that we are not indexing according to actual values in
        the index attribute of the object. We are indexing according to the
        actual position of the element in the object.

        If a requested index does not exist for some group, this method will raise.
        To get similar behavior that ignores indices that don't exist, see
        :meth:`.SeriesGroupBy.nth`.

        Parameters
        ----------
        indices : array-like
            An array of ints indicating which positions to take in each group.
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            The axis on which to select elements. ``0`` means that we are
            selecting rows, ``1`` means that we are selecting columns.
            For `SeriesGroupBy` this parameter is unused and defaults to 0.
        **kwargs
            For compatibility with :meth:`numpy.take`. Has no effect on the
            output.

        Returns
        -------
        Series
            A Series containing the elements taken from each group.

        See Also
        --------
        Series.take : Take elements from a Series along an axis.
        Series.loc : Select a subset of a DataFrame by labels.
        Series.iloc : Select a subset of a DataFrame by positions.
        numpy.take : Take elements from an array along an axis.
        SeriesGroupBy.nth : Similar to take, won't raise if indices don't exist.

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan),
        ...                    ('rabbit', 'mammal', 15.0)],
        ...                   columns=['name', 'class', 'max_speed'],
        ...                   index=[4, 3, 2, 1, 0])
        >>> df
             name   class  max_speed
        4  falcon    bird      389.0
        3  parrot    bird       24.0
        2    lion  mammal       80.5
        1  monkey  mammal        NaN
        0  rabbit  mammal       15.0
        >>> gb = df["name"].groupby([1, 1, 2, 2, 2])

        Take elements at positions 0 and 1 along the axis 0 in each group (default).

        >>> gb.take([0, 1])
        1  4    falcon
           3    parrot
        2  2      lion
           1    monkey
        Name: name, dtype: object

        We may take elements using negative integers for positive indices,
        starting from the end of the object, just like with Python lists.

        >>> gb.take([-1, -2])
        1  3    parrot
           4    falcon
        2  0    rabbit
           1    monkey
        Name: name, dtype: object
        """
        result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
        return result

    def skew(
        self,
        axis: Axis | lib.NoDefault = lib.no_default,
        skipna: bool = True,
        numeric_only: bool = False,
        **kwargs,
    ) -> Series:
        """
        Return unbiased skew within groups.

        Normalized by N-1.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns', None}, default 0
            Axis for the function to be applied on.
            This parameter is only for compatibility with DataFrame and is unused.
        skipna : bool, default True
            Exclude NA/null values when computing the result.
        numeric_only : bool, default False
            Include only float, int, boolean columns. Not implemented for Series.
        **kwargs
            Additional keyword arguments to be passed to the function.

        Returns
        -------
        Series

        See Also
        --------
        Series.skew : Return unbiased skew over requested axis.

        Examples
        --------
        >>> ser = pd.Series([390., 350., 357., np.nan, 22., 20., 30.],
        ...                 index=['Falcon', 'Falcon', 'Falcon', 'Falcon',
        ...                        'Parrot', 'Parrot', 'Parrot'],
        ...                 name="Max Speed")
        >>> ser
        Falcon    390.0
        Falcon    350.0
        Falcon    357.0
        Falcon      NaN
        Parrot     22.0
        Parrot     20.0
        Parrot     30.0
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew()
        Falcon    1.525174
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        >>> ser.groupby(level=0).skew(skipna=False)
        Falcon         NaN
        Parrot    1.457863
        Name: Max Speed, dtype: float64
        """
        result = self._op_via_apply(
            "skew",
            axis=axis,
            skipna=skipna,
            numeric_only=numeric_only,
            **kwargs,
        )
        return result

    @property
    @doc(Series.plot.__doc__)
    def plot(self):
        result = GroupByPlot(self)
        return result

    @doc(Series.nlargest.__doc__)
    def nlargest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nlargest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        #  already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.nsmallest.__doc__)
    def nsmallest(
        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
    ) -> Series:
        f = partial(Series.nsmallest, n=n, keep=keep)
        data = self._selected_obj
        # Don't change behavior if result index happens to be the same, i.e.
        #  already ordered and n >= all group sizes.
        result = self._python_apply_general(f, data, not_indexed_same=True)
        return result

    @doc(Series.idxmin.__doc__)
    def idxmin(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmin", axis=axis, skipna=skipna)
        return result

    @doc(Series.idxmax.__doc__)
    def idxmax(self, axis: Axis = 0, skipna: bool = True) -> Series:
        result = self._op_via_apply("idxmax", axis=axis, skipna=skipna)
        return result

    @doc(Series.corr.__doc__)
    def corr(
        self,
        other: Series,
        method: CorrelationMethod = "pearson",
        min_periods: int | None = None,
    ) -> Series:
        result = self._op_via_apply(
            "corr", other=other, method=method, min_periods=min_periods
        )
        return result

    @doc(Series.cov.__doc__)
    def cov(
        self, other: Series, min_periods: int | None = None, ddof: int | None = 1
    ) -> Series:
        result = self._op_via_apply(
            "cov", other=other, min_periods=min_periods, ddof=ddof
        )
        return result

    @property
    @doc(Series.is_monotonic_increasing.__doc__)
    def is_monotonic_increasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_increasing)

    @property
    @doc(Series.is_monotonic_decreasing.__doc__)
    def is_monotonic_decreasing(self) -> Series:
        return self.apply(lambda ser: ser.is_monotonic_decreasing)

    @doc(Series.hist.__doc__)
    def hist(
        self,
        by=None,
        ax=None,
        grid: bool = True,
        xlabelsize: int | None = None,
        xrot: float | None = None,
        ylabelsize: int | None = None,
        yrot: float | None = None,
        figsize: tuple[int, int] | None = None,
        bins: int | Sequence[int] = 10,
        backend: str | None = None,
        legend: bool = False,
        **kwargs,
    ):
        result = self._op_via_apply(
            "hist",
            by=by,
            ax=ax,
            grid=grid,
            xlabelsize=xlabelsize,
            xrot=xrot,
            ylabelsize=ylabelsize,
            yrot=yrot,
            figsize=figsize,
            bins=bins,
            backend=backend,
            legend=legend,
            **kwargs,
        )
        return result

    @property
    @doc(Series.dtype.__doc__)
    def dtype(self) -> Series:
        return self.apply(lambda ser: ser.dtype)

    @doc(Series.unique.__doc__)
    def unique(self) -> Series:
        result = self._op_via_apply("unique")
        return result


class DataFrameGroupBy(GroupBy[DataFrame]):
    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> df = pd.DataFrame(
    ...     {
    ...         "A": [1, 1, 2, 2],
    ...         "B": [1, 2, 3, 4],
    ...         "C": [0.362838, 0.227877, 1.267767, -0.562860],
    ...     }
    ... )

    >>> df
       A  B         C
    0  1  1  0.362838
    1  1  2  0.227877
    2  2  3  1.267767
    3  2  4 -0.562860

    The aggregation is for each column.

    >>> df.groupby('A').agg('min')
       B         C
    A
    1  1  0.227877
    2  3 -0.562860

    Multiple aggregations

    >>> df.groupby('A').agg(['min', 'max'])
        B             C
      min max       min       max
    A
    1   1   2  0.227877  0.362838
    2   3   4 -0.562860  1.267767

    Select a column for aggregation

    >>> df.groupby('A').B.agg(['min', 'max'])
       min  max
    A
    1    1    2
    2    3    4

    User-defined function for aggregation

    >>> df.groupby('A').agg(lambda x: sum(x) + 2)
       B          C
    A
    1  5   2.590715
    2  9   2.704907

    Different aggregations per column

    >>> df.groupby('A').agg({'B': ['min', 'max'], 'C': 'sum'})
        B             C
      min max       sum
    A
    1   1   2  0.590715
    2   3   4  0.704907

    To control the output names with different aggregations per column,
    pandas supports "named aggregation"

    >>> df.groupby("A").agg(
    ...     b_min=pd.NamedAgg(column="B", aggfunc="min"),
    ...     c_sum=pd.NamedAgg(column="C", aggfunc="sum"))
       b_min     c_sum
    A
    1      1  0.590715
    2      3  0.704907

    - The keywords are the *output* column names
    - The values are tuples whose first element is the column to select
      and the second element is the aggregation to apply to that column.
      Pandas provides the ``pandas.NamedAgg`` namedtuple with the fields
      ``['column', 'aggfunc']`` to make it clearer what the arguments are.
      As usual, the aggregation can be a callable or a string alias.

    See :ref:`groupby.aggregate.named` for more.

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the
        aggregating function.

    >>> df.groupby("A")[["B"]].agg(lambda x: x.astype(float).min())
          B
    A
    1   1.0
    2   3.0
    """
    )

- @doc(_agg_template, examples=_agg_examples_doc, klass="DataFrame")
- def aggregate(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
- if maybe_use_numba(engine):
- return self._aggregate_with_numba(
- func, *args, engine_kwargs=engine_kwargs, **kwargs
- )
- relabeling, func, columns, order = reconstruct_func(func, **kwargs)
- func = maybe_mangle_lambdas(func)
- op = GroupByApply(self, func, args, kwargs)
- result = op.agg()
- if not is_dict_like(func) and result is not None:
- return result
- elif relabeling:
- # this should be the only (non-raising) case with relabeling
- # used reordered index of columns
- result = cast(DataFrame, result)
- result = result.iloc[:, order]
- result = cast(DataFrame, result)
- # error: Incompatible types in assignment (expression has type
- # "Optional[List[str]]", variable has type
- # "Union[Union[Union[ExtensionArray, ndarray[Any, Any]],
- # Index, Series], Sequence[Any]]")
- result.columns = columns # type: ignore[assignment]
- if result is None:
- # grouper specific aggregations
- if self.grouper.nkeys > 1:
- # test_groupby_as_index_series_scalar gets here with 'not self.as_index'
- return self._python_agg_general(func, *args, **kwargs)
- elif args or kwargs:
- # test_pass_args_kwargs gets here (with and without as_index)
- # can't return early
- result = self._aggregate_frame(func, *args, **kwargs)
- elif self.axis == 1:
- # _aggregate_multiple_funcs does not allow self.axis == 1
- # Note: axis == 1 precludes 'not self.as_index', see __init__
- result = self._aggregate_frame(func)
- return result
- else:
- # try to treat as if we are passing a list
- gba = GroupByApply(self, [func], args=(), kwargs={})
- try:
- result = gba.agg()
- except ValueError as err:
- if "No objects to concatenate" not in str(err):
- raise
- # _aggregate_frame can fail with e.g. func=Series.mode,
- # where it expects 1D values but would be getting 2D values
- # In other tests, using aggregate_frame instead of GroupByApply
- # would give correct values but incorrect dtypes
- # object vs float64 in test_cython_agg_empty_buckets
- # float64 vs int64 in test_category_order_apply
- result = self._aggregate_frame(func)
- else:
- # GH#32040, GH#35246
- # e.g. test_groupby_as_index_select_column_sum_empty_df
- result = cast(DataFrame, result)
- result.columns = self._obj_with_exclusions.columns.copy()
- if not self.as_index:
- result = self._insert_inaxis_grouper(result)
- result.index = default_index(len(result))
- return result
- agg = aggregate
    def _python_agg_general(self, func, *args, **kwargs):
        func = com.is_builtin_func(func)
        f = lambda x: func(x, *args, **kwargs)

        # iterate through "columns" ex exclusions to populate output dict
        output: dict[base.OutputKey, ArrayLike] = {}

        if self.ngroups == 0:
            # e.g. test_evaluate_with_empty_groups different path gets different
            #  result dtype in empty case.
            return self._python_apply_general(f, self._selected_obj, is_agg=True)

        for idx, obj in enumerate(self._iterate_slices()):
            name = obj.name
            result = self.grouper.agg_series(obj, f)
            key = base.OutputKey(label=name, position=idx)
            output[key] = result

        if not output:
            # e.g. test_margins_no_values_no_cols
            return self._python_apply_general(f, self._selected_obj)

        res = self._indexed_output_to_ndframe(output)
        return self._wrap_aggregated_output(res)

    def _iterate_slices(self) -> Iterable[Series]:
        obj = self._selected_obj
        if self.axis == 1:
            obj = obj.T
        if isinstance(obj, Series) and obj.name not in self.exclusions:
            # Occurs when doing DataFrameGroupBy(...)["X"]
            yield obj
        else:
            for label, values in obj.items():
                if label in self.exclusions:
                    # Note: if we tried to just iterate over _obj_with_exclusions,
                    #  we would break test_wrap_agg_out by yielding a column
                    #  that is skipped here but not dropped from obj_with_exclusions
                    continue

                yield values

    def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame:
        if self.grouper.nkeys != 1:
            raise AssertionError("Number of keys must be 1")

        obj = self._obj_with_exclusions

        result: dict[Hashable, NDFrame | np.ndarray] = {}
        for name, grp_df in self.grouper.get_iterator(obj, self.axis):
            fres = func(grp_df, *args, **kwargs)
            result[name] = fres

        result_index = self.grouper.result_index
        other_ax = obj.axes[1 - self.axis]
        out = self.obj._constructor(result, index=other_ax, columns=result_index)
        if self.axis == 0:
            out = out.T

        return out

    def _wrap_applied_output(
        self,
        data: DataFrame,
        values: list,
        not_indexed_same: bool = False,
        is_transform: bool = False,
    ):
        if len(values) == 0:
            if is_transform:
                # GH#47787 see test_group_on_empty_multiindex
                res_index = data.index
            else:
                res_index = self.grouper.result_index

            result = self.obj._constructor(index=res_index, columns=data.columns)
            result = result.astype(data.dtypes, copy=False)
            return result

        # GH12824
        # using values[0] here breaks test_groupby_apply_none_first
        first_not_none = next(com.not_none(*values), None)

        if first_not_none is None:
            # GH9684 - All values are None, return an empty frame.
            return self.obj._constructor()
        elif isinstance(first_not_none, DataFrame):
            return self._concat_objects(
                values,
                not_indexed_same=not_indexed_same,
                is_transform=is_transform,
            )

        key_index = self.grouper.result_index if self.as_index else None

        if isinstance(first_not_none, (np.ndarray, Index)):
            # GH#1738: values is list of arrays of unequal lengths
            #  fall through to the outer else clause
            # TODO: sure this is right? we used to do this
            #  after raising AttributeError above
            return self.obj._constructor_sliced(
                values, index=key_index, name=self._selection
            )
        elif not isinstance(first_not_none, Series):
            # values are not series or array-like but scalars
            # self._selection not passed through to Series as the
            #  result should not take the name of original selection
            #  of columns
            if self.as_index:
                return self.obj._constructor_sliced(values, index=key_index)
            else:
                result = self.obj._constructor(values, columns=[self._selection])
                result = self._insert_inaxis_grouper(result)
                return result
        else:
            # values are Series
            return self._wrap_applied_output_series(
                values,
                not_indexed_same,
                first_not_none,
                key_index,
                is_transform,
            )

    def _wrap_applied_output_series(
        self,
        values: list[Series],
        not_indexed_same: bool,
        first_not_none,
        key_index: Index | None,
        is_transform: bool,
    ) -> DataFrame | Series:
        kwargs = first_not_none._construct_axes_dict()
        backup = Series(**kwargs)
        values = [x if (x is not None) else backup for x in values]

        all_indexed_same = all_indexes_same(x.index for x in values)

        if not all_indexed_same:
            # GH 8467
            return self._concat_objects(
                values,
                not_indexed_same=True,
                is_transform=is_transform,
            )

        # Combine values
        # vstack+constructor is faster than concat and handles MI-columns
        stacked_values = np.vstack([np.asarray(v) for v in values])

        if self.axis == 0:
            index = key_index
            columns = first_not_none.index.copy()
            if columns.name is None:
                # GH6124 - propagate name of Series when it's consistent
                names = {v.name for v in values}
                if len(names) == 1:
                    columns.name = list(names)[0]
        else:
            index = first_not_none.index
            columns = key_index
            stacked_values = stacked_values.T

        if stacked_values.dtype == object:
            # We'll have the DataFrame constructor do inference
            stacked_values = stacked_values.tolist()
        result = self.obj._constructor(stacked_values, index=index, columns=columns)

        if not self.as_index:
            result = self._insert_inaxis_grouper(result)

        return self._reindex_output(result)

    def _cython_transform(
        self,
        how: str,
        numeric_only: bool = False,
        axis: AxisInt = 0,
        **kwargs,
    ) -> DataFrame:
        assert axis == 0  # handled by caller

        # With self.axis == 0, we have multi-block tests
        #  e.g. test_rank_min_int, test_cython_transform_frame
        #  test_transform_numeric_ret
        # With self.axis == 1, _get_data_to_aggregate does a transpose
        #  so we always have a single block.
        mgr: Manager2D = self._get_data_to_aggregate(
            numeric_only=numeric_only, name=how
        )

        def arr_func(bvalues: ArrayLike) -> ArrayLike:
            return self.grouper._cython_operation(
                "transform", bvalues, how, 1, **kwargs
            )

        # We could use `mgr.apply` here and not have to set_axis, but
        #  we would have to do shape gymnastics for ArrayManager compat
        res_mgr = mgr.grouped_reduce(arr_func)
        res_mgr.set_axis(1, mgr.axes[1])

        res_df = self.obj._constructor(res_mgr)
        res_df = self._maybe_transpose_result(res_df)
        return res_df

    def _transform_general(self, func, *args, **kwargs):
        from pandas.core.reshape.concat import concat

        applied = []
        obj = self._obj_with_exclusions
        gen = self.grouper.get_iterator(obj, axis=self.axis)
        fast_path, slow_path = self._define_paths(func, *args, **kwargs)

        # Determine whether to use slow or fast path by evaluating on the first
        #  group. Need to handle the case of an empty generator and process the
        #  result so that it does not need to be computed again.
        try:
            name, group = next(gen)
        except StopIteration:
            pass
        else:
            object.__setattr__(group, "name", name)
            try:
                path, res = self._choose_path(fast_path, slow_path, group)
            except ValueError as err:
                # e.g. test_transform_with_non_scalar_group
                msg = "transform must return a scalar value for each group"
                raise ValueError(msg) from err
            if group.size > 0:
                res = _wrap_transform_general_frame(self.obj, group, res)
                applied.append(res)

        # Compute and process with the remaining groups
        for name, group in gen:
            if group.size == 0:
                continue
            object.__setattr__(group, "name", name)
            res = path(group)

            res = _wrap_transform_general_frame(self.obj, group, res)
            applied.append(res)

        concat_index = obj.columns if self.axis == 0 else obj.index
        other_axis = 1 if self.axis == 0 else 0  # switches between 0 & 1
        concatenated = concat(applied, axis=self.axis, verify_integrity=False)
        concatenated = concatenated.reindex(concat_index, axis=other_axis, copy=False)
        return self._set_result_index_ordered(concatenated)

    __examples_dataframe_doc = dedent(
        """
    >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
    ...                           'foo', 'bar'],
    ...                    'B' : ['one', 'one', 'two', 'three',
    ...                           'two', 'two'],
    ...                    'C' : [1, 5, 5, 2, 5, 5],
    ...                    'D' : [2.0, 5., 8., 1., 2., 9.]})
    >>> grouped = df.groupby('A')[['C', 'D']]
    >>> grouped.transform(lambda x: (x - x.mean()) / x.std())
              C         D
    0 -1.154701 -0.577350
    1  0.577350  0.000000
    2  0.577350  1.154701
    3 -1.154701 -1.000000
    4  0.577350 -0.577350
    5  0.577350  1.000000

    Broadcast result of the transformation

    >>> grouped.transform(lambda x: x.max() - x.min())
         C    D
    0  4.0  6.0
    1  3.0  8.0
    2  4.0  6.0
    3  3.0  8.0
    4  4.0  6.0
    5  3.0  8.0

    >>> grouped.transform("mean")
              C    D
    0  3.666667  4.0
    1  4.000000  5.0
    2  3.666667  4.0
    3  4.000000  5.0
    4  3.666667  4.0
    5  4.000000  5.0

    .. versionchanged:: 1.3.0

        The resulting dtype will reflect the return value of the passed ``func``,
        for example:

    >>> grouped.transform(lambda x: x.astype(int).max())
       C  D
    0  5  8
    1  5  9
    2  5  8
    3  5  9
    4  5  8
    5  5  9
    """
    )

- @Substitution(klass="DataFrame", example=__examples_dataframe_doc)
- @Appender(_transform_template)
- def transform(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
- return self._transform(
- func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
- )
    def _define_paths(self, func, *args, **kwargs):
        if isinstance(func, str):
            fast_path = lambda group: getattr(group, func)(*args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: getattr(x, func)(*args, **kwargs), axis=self.axis
            )
        else:
            fast_path = lambda group: func(group, *args, **kwargs)
            slow_path = lambda group: group.apply(
                lambda x: func(x, *args, **kwargs), axis=self.axis
            )
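        # e.g. func="mean": fast_path calls group.mean() on the whole frame,
        #  while slow_path applies x.mean() slice-by-slice via DataFrame.apply
        #  along self.axis.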
        return fast_path, slow_path

    def _choose_path(self, fast_path: Callable, slow_path: Callable, group: DataFrame):
        path = slow_path
        res = slow_path(group)

        if self.ngroups == 1:
            # no need to evaluate multiple paths when only
            #  a single group exists
            return path, res

        # if we make it here, test if we can use the fast path
        try:
            res_fast = fast_path(group)
        except AssertionError:
            raise  # pragma: no cover
        except Exception:
            # GH#29631 For user-defined function, we can't predict what may be
            #  raised; see test_transform.test_transform_fastpath_raises
            return path, res

        # verify fast path returns either:
        # a DataFrame with columns equal to group.columns
        # OR a Series with index equal to group.columns
        if isinstance(res_fast, DataFrame):
            if not res_fast.columns.equals(group.columns):
                return path, res
        elif isinstance(res_fast, Series):
            if not res_fast.index.equals(group.columns):
                return path, res
        else:
            return path, res

        if res_fast.equals(res):
            path = fast_path

        return path, res

    def filter(self, func, dropna: bool = True, *args, **kwargs):
        """
        Filter elements from groups that don't satisfy a criterion.

        Elements from groups are filtered if they do not satisfy the
        boolean criterion specified by func.

        Parameters
        ----------
        func : function
            Criterion to apply to each group. Should return True or False.
        dropna : bool
            Drop groups that do not pass the filter. True by default; if False,
            groups that evaluate False are filled with NaNs.

        Returns
        -------
        DataFrame

        Notes
        -----
        Each subframe is endowed with the attribute 'name' in case you need to
        know which group you are working on.

        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
        ...                           'foo', 'bar'],
        ...                    'B' : [1, 2, 3, 4, 5, 6],
        ...                    'C' : [2.0, 5., 8., 1., 2., 9.]})
        >>> grouped = df.groupby('A')
        >>> grouped.filter(lambda x: x['B'].mean() > 3.)
             A  B    C
        1  bar  2  5.0
        3  bar  4  1.0
        5  bar  6  9.0
        """
        indices = []

        obj = self._selected_obj
        gen = self.grouper.get_iterator(obj, axis=self.axis)

        for name, group in gen:
            object.__setattr__(group, "name", name)

            res = func(group, *args, **kwargs)

            try:
                res = res.squeeze()
            except AttributeError:  # allow e.g., scalars and frames to pass
                pass

            # interpret the result of the filter
            if is_bool(res) or (is_scalar(res) and isna(res)):
                if notna(res) and res:
                    indices.append(self._get_index(name))
            else:
                # non scalars aren't allowed
                raise TypeError(
                    f"filter function returned a {type(res).__name__}, "
                    "but expected a scalar bool"
                )

        return self._apply_filter(indices, dropna)

    def __getitem__(self, key) -> DataFrameGroupBy | SeriesGroupBy:
        if self.axis == 1:
            # GH 37725
            raise ValueError("Cannot subset columns when using axis=1")
        # per GH 23566
        if isinstance(key, tuple) and len(key) > 1:
            # if len == 1, then it becomes a SeriesGroupBy and this is actually
            # valid syntax, so don't raise
            raise ValueError(
                "Cannot subset columns with a tuple with more than one element. "
                "Use a list instead."
            )
- return super().__getitem__(key)
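- # Subsetting sketch, assuming a hypothetical grouped frame ``gb``: a list of
- # labels keeps a DataFrameGroupBy, a single label yields a SeriesGroupBy, and
- # a multi-element tuple raises per GH 23566:
- #
- # >>> gb = pd.DataFrame({"A": [1, 1], "B": [2, 3], "C": [4, 5]}).groupby("A")
- # >>> type(gb[["B", "C"]]).__name__
- # 'DataFrameGroupBy'
- # >>> type(gb["B"]).__name__
- # 'SeriesGroupBy'
- # >>> gb["B", "C"]          # ValueError: ... Use a list instead.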
- def _gotitem(self, key, ndim: int, subset=None):
- """
- sub-classes to define
- return a sliced object
- Parameters
- ----------
- key : string / list of selections
- ndim : {1, 2}
- requested ndim of result
- subset : object, default None
- subset to act on
- """
- if ndim == 2:
- if subset is None:
- subset = self.obj
- return DataFrameGroupBy(
- subset,
- self.grouper,
- axis=self.axis,
- level=self.level,
- grouper=self.grouper,
- exclusions=self.exclusions,
- selection=key,
- as_index=self.as_index,
- sort=self.sort,
- group_keys=self.group_keys,
- observed=self.observed,
- dropna=self.dropna,
- )
- elif ndim == 1:
- if subset is None:
- subset = self.obj[key]
- return SeriesGroupBy(
- subset,
- level=self.level,
- grouper=self.grouper,
- exclusions=self.exclusions,
- selection=key,
- as_index=self.as_index,
- sort=self.sort,
- group_keys=self.group_keys,
- observed=self.observed,
- dropna=self.dropna,
- )
- raise AssertionError("invalid ndim for _gotitem")
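- # Because ``_gotitem`` forwards ``sort``/``observed``/``dropna`` and the other
- # groupby settings, a sliced groupby keeps the parent's configuration; an
- # illustrative sketch with hypothetical names:
- #
- # >>> gb = df.groupby("A", sort=False, dropna=False)
- # >>> gb["B"].sum()   # the SeriesGroupBy still has sort=False and dropna=False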
- def _get_data_to_aggregate(
- self, *, numeric_only: bool = False, name: str | None = None
- ) -> Manager2D:
- obj = self._obj_with_exclusions
- if self.axis == 1:
- mgr = obj.T._mgr
- else:
- mgr = obj._mgr
- if numeric_only:
- mgr = mgr.get_numeric_data(copy=False)
- return mgr
- def _indexed_output_to_ndframe(
- self, output: Mapping[base.OutputKey, ArrayLike]
- ) -> DataFrame:
- """
- Wrap the dict result of a GroupBy aggregation into a DataFrame.
- """
- indexed_output = {key.position: val for key, val in output.items()}
- columns = Index([key.label for key in output])
- columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names)
- result = self.obj._constructor(indexed_output)
- result.columns = columns
- return result
- def _wrap_agged_manager(self, mgr: Manager2D) -> DataFrame:
- return self.obj._constructor(mgr)
- def _iterate_column_groupbys(self, obj: DataFrame):
- for i, colname in enumerate(obj.columns):
- yield colname, SeriesGroupBy(
- obj.iloc[:, i],
- selection=colname,
- grouper=self.grouper,
- exclusions=self.exclusions,
- observed=self.observed,
- )
- def _apply_to_column_groupbys(self, func, obj: DataFrame) -> DataFrame:
- from pandas.core.reshape.concat import concat
- columns = obj.columns
- results = [
- func(col_groupby) for _, col_groupby in self._iterate_column_groupbys(obj)
- ]
- if not len(results):
- # concat would raise
- return DataFrame([], columns=columns, index=self.grouper.result_index)
- else:
- return concat(results, keys=columns, axis=1)
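- # A rough public-API equivalent of the column-wise decomposition above: apply
- # ``func`` to each column's groupby and glue the pieces back together
- # (``func``, ``df`` and ``key`` here are hypothetical names):
- #
- # >>> parts = {col: func(df.groupby("key")[col]) for col in df.columns.drop("key")}
- # >>> pd.concat(parts, axis=1)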
- def nunique(self, dropna: bool = True) -> DataFrame:
- """
- Return DataFrame with counts of unique elements in each position.
- Parameters
- ----------
- dropna : bool, default True
- Don't include NaN in the counts.
- Returns
- -------
- nunique: DataFrame
- Examples
- --------
- >>> df = pd.DataFrame({'id': ['spam', 'egg', 'egg', 'spam',
- ... 'ham', 'ham'],
- ... 'value1': [1, 5, 5, 2, 5, 5],
- ... 'value2': list('abbaxy')})
- >>> df
- id value1 value2
- 0 spam 1 a
- 1 egg 5 b
- 2 egg 5 b
- 3 spam 2 a
- 4 ham 5 x
- 5 ham 5 y
- >>> df.groupby('id').nunique()
- value1 value2
- id
- egg 1 1
- ham 1 2
- spam 2 1
- Check for rows with the same id but conflicting values:
- >>> df.groupby('id').filter(lambda g: (g.nunique() > 1).any())
- id value1 value2
- 0 spam 1 a
- 3 spam 2 a
- 4 ham 5 x
- 5 ham 5 y
- """
- if self.axis != 0:
- # see test_groupby_crash_on_nunique
- return self._python_apply_general(
- lambda sgb: sgb.nunique(dropna), self._obj_with_exclusions, is_agg=True
- )
- obj = self._obj_with_exclusions
- results = self._apply_to_column_groupbys(
- lambda sgb: sgb.nunique(dropna), obj=obj
- )
- if not self.as_index:
- results.index = default_index(len(results))
- results = self._insert_inaxis_grouper(results)
- return results
- def idxmax(
- self,
- axis: Axis | None = None,
- skipna: bool = True,
- numeric_only: bool = False,
- ) -> DataFrame:
- """
- Return index of first occurrence of maximum over requested axis.
- NA/null values are excluded.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default None
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- If axis is not provided, grouper's axis is used.
- .. versionchanged:: 2.0.0
- skipna : bool, default True
- Exclude NA/null values. If an entire row/column is NA, the result
- will be NA.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionadded:: 1.5.0
- Returns
- -------
- Series
- Indexes of maxima along the specified axis.
- Raises
- ------
- ValueError
- * If the row/column is empty
- See Also
- --------
- Series.idxmax : Return index of the maximum element.
- Notes
- -----
- This method is the DataFrame version of ``ndarray.argmax``.
- Examples
- --------
- Consider a dataset containing food consumption in Argentina.
- >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
- ... 'co2_emissions': [37.2, 19.66, 1712]},
- ... index=['Pork', 'Wheat Products', 'Beef'])
- >>> df
- consumption co2_emissions
- Pork 10.51 37.20
- Wheat Products 103.11 19.66
- Beef 55.48 1712.00
- By default, it returns the index for the maximum value in each column.
- >>> df.idxmax()
- consumption Wheat Products
- co2_emissions Beef
- dtype: object
- To return the index for the maximum value in each row, use ``axis="columns"``.
- >>> df.idxmax(axis="columns")
- Pork co2_emissions
- Wheat Products consumption
- Beef co2_emissions
- dtype: object
- """
- if axis is None:
- axis = self.axis
- def func(df):
- return df.idxmax(axis=axis, skipna=skipna, numeric_only=numeric_only)
- func.__name__ = "idxmax"
- result = self._python_apply_general(
- func, self._obj_with_exclusions, not_indexed_same=True
- )
- return result
- def idxmin(
- self,
- axis: Axis | None = None,
- skipna: bool = True,
- numeric_only: bool = False,
- ) -> DataFrame:
- """
- Return index of first occurrence of minimum over requested axis.
- NA/null values are excluded.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default None
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- If axis is not provided, grouper's axis is used.
- .. versionchanged:: 2.0.0
- skipna : bool, default True
- Exclude NA/null values. If an entire row/column is NA, the result
- will be NA.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionadded:: 1.5.0
- Returns
- -------
- Series
- Indexes of minima along the specified axis.
- Raises
- ------
- ValueError
- * If the row/column is empty
- See Also
- --------
- Series.idxmin : Return index of the minimum element.
- Notes
- -----
- This method is the DataFrame version of ``ndarray.argmin``.
- Examples
- --------
- Consider a dataset containing food consumption in Argentina.
- >>> df = pd.DataFrame({'consumption': [10.51, 103.11, 55.48],
- ... 'co2_emissions': [37.2, 19.66, 1712]},
- ... index=['Pork', 'Wheat Products', 'Beef'])
- >>> df
- consumption co2_emissions
- Pork 10.51 37.20
- Wheat Products 103.11 19.66
- Beef 55.48 1712.00
- By default, it returns the index for the minimum value in each column.
- >>> df.idxmin()
- consumption Pork
- co2_emissions Wheat Products
- dtype: object
- To return the index for the minimum value in each row, use ``axis="columns"``.
- >>> df.idxmin(axis="columns")
- Pork consumption
- Wheat Products co2_emissions
- Beef consumption
- dtype: object
- """
- if axis is None:
- axis = self.axis
- def func(df):
- return df.idxmin(axis=axis, skipna=skipna, numeric_only=numeric_only)
- func.__name__ = "idxmin"
- result = self._python_apply_general(
- func, self._obj_with_exclusions, not_indexed_same=True
- )
- return result
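- # The examples above exercise ``DataFrame.idxmax``/``idxmin`` directly; grouped,
- # each method returns one index label per group and column. Sketch with a
- # hypothetical frame:
- #
- # >>> df = pd.DataFrame({"key": ["a", "a", "b", "b"],
- # ...                    "val": [1.0, 3.0, 2.0, 0.5]},
- # ...                   index=["w", "x", "y", "z"])
- # >>> df.groupby("key").idxmax()
- #     val
- # key
- # a     x
- # b     y
- # >>> df.groupby("key").idxmin()
- #     val
- # key
- # a     w
- # b     z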
- boxplot = boxplot_frame_groupby
- def value_counts(
- self,
- subset: Sequence[Hashable] | None = None,
- normalize: bool = False,
- sort: bool = True,
- ascending: bool = False,
- dropna: bool = True,
- ) -> DataFrame | Series:
- """
- Return a Series or DataFrame containing counts of unique rows.
- .. versionadded:: 1.4.0
- Parameters
- ----------
- subset : list-like, optional
- Columns to use when counting unique combinations.
- normalize : bool, default False
- Return proportions rather than frequencies.
- sort : bool, default True
- Sort by frequencies.
- ascending : bool, default False
- Sort in ascending order.
- dropna : bool, default True
- Don't include counts of rows that contain NA values.
- Returns
- -------
- Series or DataFrame
- Series if the groupby as_index is True, otherwise DataFrame.
- See Also
- --------
- Series.value_counts: Equivalent method on Series.
- DataFrame.value_counts: Equivalent method on DataFrame.
- SeriesGroupBy.value_counts: Equivalent method on SeriesGroupBy.
- Notes
- -----
- - If the groupby as_index is True then the returned Series will have a
- MultiIndex with one level per input column.
- - If the groupby as_index is False then the returned DataFrame will have an
- additional column with the value_counts. The column is labelled 'count' or
- 'proportion', depending on the ``normalize`` parameter.
- By default, rows that contain any NA values are omitted from
- the result.
- By default, the result will be in descending order so that the
- first element of each group is the most frequently-occurring row.
- Examples
- --------
- >>> df = pd.DataFrame({
- ... 'gender': ['male', 'male', 'female', 'male', 'female', 'male'],
- ... 'education': ['low', 'medium', 'high', 'low', 'high', 'low'],
- ... 'country': ['US', 'FR', 'US', 'FR', 'FR', 'FR']
- ... })
- >>> df
- gender education country
- 0 male low US
- 1 male medium FR
- 2 female high US
- 3 male low FR
- 4 female high FR
- 5 male low FR
- >>> df.groupby('gender').value_counts()
- gender education country
- female high FR 1
- US 1
- male low FR 2
- US 1
- medium FR 1
- Name: count, dtype: int64
- >>> df.groupby('gender').value_counts(ascending=True)
- gender education country
- female high FR 1
- US 1
- male low US 1
- medium FR 1
- low FR 2
- Name: count, dtype: int64
- >>> df.groupby('gender').value_counts(normalize=True)
- gender education country
- female high FR 0.50
- US 0.50
- male low FR 0.50
- US 0.25
- medium FR 0.25
- Name: proportion, dtype: float64
- >>> df.groupby('gender', as_index=False).value_counts()
- gender education country count
- 0 female high FR 1
- 1 female high US 1
- 2 male low FR 2
- 3 male low US 1
- 4 male medium FR 1
- >>> df.groupby('gender', as_index=False).value_counts(normalize=True)
- gender education country proportion
- 0 female high FR 0.50
- 1 female high US 0.50
- 2 male low FR 0.50
- 3 male low US 0.25
- 4 male medium FR 0.25
- """
- return self._value_counts(subset, normalize, sort, ascending, dropna)
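- # ``subset`` limits which columns form the counted combinations, while the
- # grouping key stays in the result index; with the docstring's ``df``:
- #
- # >>> df.groupby('gender').value_counts(subset=['education'])
- # gender  education
- # female  high         2
- # male    low          3
- #         medium       1
- # Name: count, dtype: int64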
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = None,
- method: FillnaOptions | None = None,
- axis: Axis | None = None,
- inplace: bool = False,
- limit=None,
- downcast=None,
- ) -> DataFrame | None:
- """
- Fill NA/NaN values using the specified method within groups.
- Parameters
- ----------
- value : scalar, dict, Series, or DataFrame
- Value to use to fill holes (e.g. 0), alternatively a
- dict/Series/DataFrame of values specifying which value to use for
- each index (for a Series) or column (for a DataFrame). Values not
- in the dict/Series/DataFrame will not be filled. This value cannot
- be a list. Users wanting to use the ``value`` argument and not ``method``
- should prefer :meth:`.DataFrame.fillna` as this
- will produce the same result and be more performant.
- method : {'bfill', 'ffill', None}, default None
- Method to use for filling holes. ``'ffill'`` will propagate
- the last valid observation forward within a group.
- ``'bfill'`` will use next valid observation to fill the gap.
- axis : {0 or 'index', 1 or 'columns'}
- Axis along which to fill missing values. When the :class:`DataFrameGroupBy`
- ``axis`` argument is ``0``, using ``axis=1`` here will produce
- the same results as :meth:`.DataFrame.fillna`. When the
- :class:`DataFrameGroupBy` ``axis`` argument is ``1``, using ``axis=0``
- or ``axis=1`` here will produce the same results.
- inplace : bool, default False
- Broken. Do not set to True.
- limit : int, default None
- If method is specified, this is the maximum number of consecutive
- NaN values to forward/backward fill within a group. In other words,
- if there is a gap with more than this number of consecutive NaNs,
- it will only be partially filled. If method is not specified, this is the
- maximum number of entries along the entire axis where NaNs will be
- filled. Must be greater than 0 if not None.
- downcast : dict, default is None
- A dict of item->dtype of what to downcast if possible,
- or the string 'infer' which will try to downcast to an appropriate
- equal type (e.g. float64 to int64 if possible).
- Returns
- -------
- DataFrame
- Object with missing values filled.
- See Also
- --------
- ffill : Forward fill values within a group.
- bfill : Backward fill values within a group.
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {
- ... "key": [0, 0, 1, 1, 1],
- ... "A": [np.nan, 2, np.nan, 3, np.nan],
- ... "B": [2, 3, np.nan, np.nan, np.nan],
- ... "C": [np.nan, np.nan, 2, np.nan, np.nan],
- ... }
- ... )
- >>> df
- key A B C
- 0 0 NaN 2.0 NaN
- 1 0 2.0 3.0 NaN
- 2 1 NaN NaN 2.0
- 3 1 3.0 NaN NaN
- 4 1 NaN NaN NaN
- Propagate non-null values forward or backward within each group along columns.
- >>> df.groupby("key").fillna(method="ffill")
- A B C
- 0 NaN 2.0 NaN
- 1 2.0 3.0 NaN
- 2 NaN NaN 2.0
- 3 3.0 NaN 2.0
- 4 3.0 NaN 2.0
- >>> df.groupby("key").fillna(method="bfill")
- A B C
- 0 2.0 2.0 NaN
- 1 2.0 3.0 NaN
- 2 3.0 NaN 2.0
- 3 3.0 NaN NaN
- 4 NaN NaN NaN
- Propagate non-null values forward or backward within each group along rows.
- >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="ffill")
- key A B C
- 0 0.0 0.0 2.0 2.0
- 1 0.0 2.0 3.0 3.0
- 2 1.0 1.0 NaN 2.0
- 3 1.0 3.0 NaN NaN
- 4 1.0 1.0 NaN NaN
- >>> df.groupby([0, 0, 1, 1], axis=1).fillna(method="bfill")
- key A B C
- 0 0.0 NaN 2.0 NaN
- 1 0.0 2.0 3.0 NaN
- 2 1.0 NaN 2.0 2.0
- 3 1.0 3.0 NaN NaN
- 4 1.0 NaN NaN NaN
- Only replace the first NaN element within a group along rows.
- >>> df.groupby("key").fillna(method="ffill", limit=1)
- A B C
- 0 NaN 2.0 NaN
- 1 2.0 3.0 NaN
- 2 NaN NaN 2.0
- 3 3.0 NaN 2.0
- 4 3.0 NaN NaN
- """
- result = self._op_via_apply(
- "fillna",
- value=value,
- method=method,
- axis=axis,
- inplace=inplace,
- limit=limit,
- downcast=downcast,
- )
- return result
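- # For the method-based fills shown above, ``DataFrameGroupBy.ffill`` and
- # ``DataFrameGroupBy.bfill`` are the more direct spellings:
- #
- # >>> df.groupby("key").ffill()         # same result as fillna(method="ffill")
- # >>> df.groupby("key").bfill(limit=1)  # same result as fillna(method="bfill", limit=1)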
- def take(
- self,
- indices: TakeIndexer,
- axis: Axis | None = 0,
- **kwargs,
- ) -> DataFrame:
- """
- Return the elements in the given *positional* indices in each group.
- This means that we are not indexing according to actual values in
- the index attribute of the object. We are indexing according to the
- actual position of the element in the object.
- If a requested index does not exist for some group, this method will raise.
- To get similar behavior that ignores indices that don't exist, see
- :meth:`.DataFrameGroupBy.nth`.
- Parameters
- ----------
- indices : array-like
- An array of ints indicating which positions to take.
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- The axis on which to select elements. ``0`` means that we are
- selecting rows, ``1`` means that we are selecting columns.
- **kwargs
- For compatibility with :meth:`numpy.take`. Has no effect on the
- output.
- Returns
- -------
- DataFrame
- A DataFrame containing the elements taken from each group.
- See Also
- --------
- DataFrame.take : Take elements from a DataFrame along an axis.
- DataFrame.loc : Select a subset of a DataFrame by labels.
- DataFrame.iloc : Select a subset of a DataFrame by positions.
- numpy.take : Take elements from an array along an axis.
- Examples
- --------
- >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
- ... ('parrot', 'bird', 24.0),
- ... ('lion', 'mammal', 80.5),
- ... ('monkey', 'mammal', np.nan),
- ... ('rabbit', 'mammal', 15.0)],
- ... columns=['name', 'class', 'max_speed'],
- ... index=[4, 3, 2, 1, 0])
- >>> df
- name class max_speed
- 4 falcon bird 389.0
- 3 parrot bird 24.0
- 2 lion mammal 80.5
- 1 monkey mammal NaN
- 0 rabbit mammal 15.0
- >>> gb = df.groupby([1, 1, 2, 2, 2])
- Take elements at positions 0 and 1 along the axis 0 (default).
- Note how the indices selected in the result do not correspond to
- our input indices 0 and 1. That's because we are selecting the 0th
- and 1st rows, not rows whose indices equal 0 and 1.
- >>> gb.take([0, 1])
- name class max_speed
- 1 4 falcon bird 389.0
- 3 parrot bird 24.0
- 2 2 lion mammal 80.5
- 1 monkey mammal NaN
- The order of the specified indices influences the order in the result.
- Here, the order is swapped from the previous example.
- >>> gb.take([1, 0])
- name class max_speed
- 1 3 parrot bird 24.0
- 4 falcon bird 389.0
- 2 1 monkey mammal NaN
- 2 lion mammal 80.5
- We may also take elements using negative integers, counting positions
- from the end of the object, just like with Python lists.
- >>> gb.take([-1, -2])
- name class max_speed
- 1 3 parrot bird 24.0
- 4 falcon bird 389.0
- 2 0 rabbit mammal 15.0
- 1 monkey mammal NaN
- """
- result = self._op_via_apply("take", indices=indices, axis=axis, **kwargs)
- return result
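- # A common use of ``take`` is a per-group positional head; with the
- # docstring's ``gb``:
- #
- # >>> gb.take([0])     # first row of each group, by position
- #        name   class  max_speed
- # 1 4  falcon    bird      389.0
- # 2 2    lion  mammal       80.5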
- def skew(
- self,
- axis: Axis | None | lib.NoDefault = lib.no_default,
- skipna: bool = True,
- numeric_only: bool = False,
- **kwargs,
- ) -> DataFrame:
- """
- Return unbiased skew within groups.
- Normalized by N-1.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns', None}, default 0
- Axis for the function to be applied on.
- Specifying ``axis=None`` will apply the aggregation across both axes.
- .. versionadded:: 2.0.0
- skipna : bool, default True
- Exclude NA/null values when computing the result.
- numeric_only : bool, default False
- Include only float, int, boolean columns.
- **kwargs
- Additional keyword arguments to be passed to the function.
- Returns
- -------
- DataFrame
- See Also
- --------
- DataFrame.skew : Return unbiased skew over requested axis.
- Examples
- --------
- >>> arrays = [['falcon', 'parrot', 'cockatoo', 'kiwi',
- ... 'lion', 'monkey', 'rabbit'],
- ... ['bird', 'bird', 'bird', 'bird',
- ... 'mammal', 'mammal', 'mammal']]
- >>> index = pd.MultiIndex.from_arrays(arrays, names=('name', 'class'))
- >>> df = pd.DataFrame({'max_speed': [389.0, 24.0, 70.0, np.nan,
- ... 80.5, 21.5, 15.0]},
- ... index=index)
- >>> df
- max_speed
- name class
- falcon bird 389.0
- parrot bird 24.0
- cockatoo bird 70.0
- kiwi bird NaN
- lion mammal 80.5
- monkey mammal 21.5
- rabbit mammal 15.0
- >>> gb = df.groupby(["class"])
- >>> gb.skew()
- max_speed
- class
- bird 1.628296
- mammal 1.669046
- >>> gb.skew(skipna=False)
- max_speed
- class
- bird NaN
- mammal 1.669046
- """
- result = self._op_via_apply(
- "skew",
- axis=axis,
- skipna=skipna,
- numeric_only=numeric_only,
- **kwargs,
- )
- return result
- @property
- @doc(DataFrame.plot.__doc__)
- def plot(self) -> GroupByPlot:
- result = GroupByPlot(self)
- return result
- @doc(DataFrame.corr.__doc__)
- def corr(
- self,
- method: str | Callable[[np.ndarray, np.ndarray], float] = "pearson",
- min_periods: int = 1,
- numeric_only: bool = False,
- ) -> DataFrame:
- result = self._op_via_apply(
- "corr", method=method, min_periods=min_periods, numeric_only=numeric_only
- )
- return result
- @doc(DataFrame.cov.__doc__)
- def cov(
- self,
- min_periods: int | None = None,
- ddof: int | None = 1,
- numeric_only: bool = False,
- ) -> DataFrame:
- result = self._op_via_apply(
- "cov", min_periods=min_periods, ddof=ddof, numeric_only=numeric_only
- )
- return result
- @doc(DataFrame.hist.__doc__)
- def hist(
- self,
- column: IndexLabel = None,
- by=None,
- grid: bool = True,
- xlabelsize: int | None = None,
- xrot: float | None = None,
- ylabelsize: int | None = None,
- yrot: float | None = None,
- ax=None,
- sharex: bool = False,
- sharey: bool = False,
- figsize: tuple[int, int] | None = None,
- layout: tuple[int, int] | None = None,
- bins: int | Sequence[int] = 10,
- backend: str | None = None,
- legend: bool = False,
- **kwargs,
- ):
- result = self._op_via_apply(
- "hist",
- column=column,
- by=by,
- grid=grid,
- xlabelsize=xlabelsize,
- xrot=xrot,
- ylabelsize=ylabelsize,
- yrot=yrot,
- ax=ax,
- sharex=sharex,
- sharey=sharey,
- figsize=figsize,
- layout=layout,
- bins=bins,
- backend=backend,
- legend=legend,
- **kwargs,
- )
- return result
- @property
- @doc(DataFrame.dtypes.__doc__)
- def dtypes(self) -> Series:
- # error: Incompatible return value type (got "DataFrame", expected "Series")
- return self.apply(lambda df: df.dtypes) # type: ignore[return-value]
- @doc(DataFrame.corrwith.__doc__)
- def corrwith(
- self,
- other: DataFrame | Series,
- axis: Axis = 0,
- drop: bool = False,
- method: CorrelationMethod = "pearson",
- numeric_only: bool = False,
- ) -> DataFrame:
- result = self._op_via_apply(
- "corrwith",
- other=other,
- axis=axis,
- drop=drop,
- method=method,
- numeric_only=numeric_only,
- )
- return result
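- # ``corrwith`` pairs each group with ``other``; a typical call correlates every
- # column of each group against one Series (the names below are hypothetical):
- #
- # >>> df.groupby("key").corrwith(df["target"])   # one row of correlations per group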
- def _wrap_transform_general_frame(
- obj: DataFrame, group: DataFrame, res: DataFrame | Series
- ) -> DataFrame:
- from pandas import concat
- if isinstance(res, Series):
- # we need to broadcast across the
- # other dimension; this will preserve dtypes
- # GH14457
- if res.index.is_(obj.index):
- res_frame = concat([res] * len(group.columns), axis=1)
- res_frame.columns = group.columns
- res_frame.index = group.index
- else:
- res_frame = obj._constructor(
- np.tile(res.values, (len(group.index), 1)),
- columns=group.columns,
- index=group.index,
- )
- assert isinstance(res_frame, DataFrame)
- return res_frame
- elif isinstance(res, DataFrame) and not res.index.is_(group.index):
- return res._align_frame(group)[0]
- else:
- return res
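- # The Series branch above is what lets a reducing UDF work under ``transform``:
- # a per-group Series indexed like the group's columns is tiled back to the
- # group's shape so the result stays aligned with the input. Sketch with a
- # hypothetical frame:
- #
- # >>> df = pd.DataFrame({"key": ["a", "a", "b"], "x": [1.0, 3.0, 5.0]})
- # >>> df.groupby("key").transform(lambda g: g.mean())
- #      x
- # 0  2.0
- # 1  2.0
- # 2  5.0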