- """
- Provide user-facing operators for the split step of the
- split-apply-combine paradigm.
- """
- from __future__ import annotations
- from typing import (
- TYPE_CHECKING,
- Hashable,
- Iterator,
- final,
- )
- import warnings
- import numpy as np
- from pandas._config import using_copy_on_write
- from pandas._typing import (
- ArrayLike,
- Axis,
- NDFrameT,
- npt,
- )
- from pandas.errors import InvalidIndexError
- from pandas.util._decorators import cache_readonly
- from pandas.util._exceptions import find_stack_level
- from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_list_like,
- is_scalar,
- )
- from pandas.core import algorithms
- from pandas.core.arrays import (
- Categorical,
- ExtensionArray,
- )
- import pandas.core.common as com
- from pandas.core.frame import DataFrame
- from pandas.core.groupby import ops
- from pandas.core.groupby.categorical import recode_for_groupby
- from pandas.core.indexes.api import (
- CategoricalIndex,
- Index,
- MultiIndex,
- )
- from pandas.core.series import Series
- from pandas.io.formats.printing import pprint_thing
- if TYPE_CHECKING:
- from pandas.core.generic import NDFrame
- class Grouper:
- """
- A Grouper allows the user to specify a groupby instruction for an object.
- This specification will select a column via the key parameter, or if the
- level and/or axis parameters are given, a level of the index of the target
- object.
- If `axis` and/or `level` are passed as keywords to both `Grouper` and
- `groupby`, the values passed to `Grouper` take precedence.
- Parameters
- ----------
- key : str, default None
- Groupby key, which selects the grouping column of the target.
- level : name/number, default None
- The level for the target index.
- freq : str / frequency object, default None
- This will group by the specified frequency if the target selection
- (via key or level) is a datetime-like object. For full specification
- of available frequencies, please see `here
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
- axis : str, int, default 0
- Number/name of the axis.
- sort : bool, default False
- Whether to sort the resulting labels.
- closed : {'left', 'right'}
- Closed end of the interval. Only applies when the `freq` parameter is passed.
- label : {'left', 'right'}
- Interval boundary to use for labeling.
- Only applies when the `freq` parameter is passed.
- convention : {'start', 'end', 'e', 's'}
- Only applies if the grouper is a PeriodIndex and the `freq` parameter is passed.
- origin : Timestamp or str, default 'start_day'
- The timestamp on which to adjust the grouping. The timezone of origin must
- match the timezone of the index.
- If string, must be one of the following:
- - 'epoch': `origin` is 1970-01-01
- - 'start': `origin` is the first value of the timeseries
- - 'start_day': `origin` is the first day at midnight of the timeseries
- .. versionadded:: 1.1.0
- - 'end': `origin` is the last value of the timeseries
- - 'end_day': `origin` is the ceiling midnight of the last day
- .. versionadded:: 1.3.0
- offset : Timedelta or str, default None
- An offset timedelta added to the origin.
- .. versionadded:: 1.1.0
- dropna : bool, default True
- If True, and if group keys contain NA values, the NA values together with
- the corresponding row/column will be dropped. If False, NA values will also
- be treated as keys in groups.
- .. versionadded:: 1.2.0
- Returns
- -------
- A specification for a groupby instruction
- Examples
- --------
- Syntactic sugar for ``df.groupby('A')``
- >>> df = pd.DataFrame(
- ... {
- ... "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"],
- ... "Speed": [100, 5, 200, 300, 15],
- ... }
- ... )
- >>> df
- Animal Speed
- 0 Falcon 100
- 1 Parrot 5
- 2 Falcon 200
- 3 Falcon 300
- 4 Parrot 15
- >>> df.groupby(pd.Grouper(key="Animal")).mean()
- Speed
- Animal
- Falcon 200.0
- Parrot 10.0
- Specify a resample operation on the column 'Publish date'
- >>> df = pd.DataFrame(
- ... {
- ... "Publish date": [
- ... pd.Timestamp("2000-01-02"),
- ... pd.Timestamp("2000-01-02"),
- ... pd.Timestamp("2000-01-09"),
- ... pd.Timestamp("2000-01-16")
- ... ],
- ... "ID": [0, 1, 2, 3],
- ... "Price": [10, 20, 30, 40]
- ... }
- ... )
- >>> df
- Publish date ID Price
- 0 2000-01-02 0 10
- 1 2000-01-02 1 20
- 2 2000-01-09 2 30
- 3 2000-01-16 3 40
- >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean()
- ID Price
- Publish date
- 2000-01-02 0.5 15.0
- 2000-01-09 2.0 30.0
- 2000-01-16 3.0 40.0
- If you want to adjust the start of the bins based on a fixed timestamp:
- >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
- >>> rng = pd.date_range(start, end, freq='7min')
- >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
- >>> ts
- 2000-10-01 23:30:00 0
- 2000-10-01 23:37:00 3
- 2000-10-01 23:44:00 6
- 2000-10-01 23:51:00 9
- 2000-10-01 23:58:00 12
- 2000-10-02 00:05:00 15
- 2000-10-02 00:12:00 18
- 2000-10-02 00:19:00 21
- 2000-10-02 00:26:00 24
- Freq: 7T, dtype: int64
- >>> ts.groupby(pd.Grouper(freq='17min')).sum()
- 2000-10-01 23:14:00 0
- 2000-10-01 23:31:00 9
- 2000-10-01 23:48:00 21
- 2000-10-02 00:05:00 54
- 2000-10-02 00:22:00 24
- Freq: 17T, dtype: int64
- >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum()
- 2000-10-01 23:18:00 0
- 2000-10-01 23:35:00 18
- 2000-10-01 23:52:00 27
- 2000-10-02 00:09:00 39
- 2000-10-02 00:26:00 24
- Freq: 17T, dtype: int64
- >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
- 2000-10-01 23:24:00 3
- 2000-10-01 23:41:00 15
- 2000-10-01 23:58:00 45
- 2000-10-02 00:15:00 45
- Freq: 17T, dtype: int64
- If you want to adjust the start of the bins with an `offset` Timedelta, the
- following two lines are equivalent:
- >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum()
- 2000-10-01 23:30:00 9
- 2000-10-01 23:47:00 21
- 2000-10-02 00:04:00 54
- 2000-10-02 00:21:00 24
- Freq: 17T, dtype: int64
- >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum()
- 2000-10-01 23:30:00 9
- 2000-10-01 23:47:00 21
- 2000-10-02 00:04:00 54
- 2000-10-02 00:21:00 24
- Freq: 17T, dtype: int64
- To replace the use of the deprecated `base` argument, you can now use `offset`;
- in this example it is equivalent to having `base=2`:
- >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum()
- 2000-10-01 23:16:00 0
- 2000-10-01 23:33:00 9
- 2000-10-01 23:50:00 36
- 2000-10-02 00:07:00 39
- 2000-10-02 00:24:00 24
- Freq: 17T, dtype: int64
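- If you want to group by a level of the index instead of a column, pass
- `level`; a small sketch (the frame below is constructed with a named
- index purely for illustration):
- >>> df = pd.DataFrame(
- ... {"Speed": [100, 5, 200]},
- ... index=pd.Index(["Falcon", "Parrot", "Falcon"], name="Animal"),
- ... )
- >>> df.groupby(pd.Grouper(level="Animal")).mean()
- Speed
- Animal
- Falcon 150.0
- Parrot 5.0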
- """
- sort: bool
- dropna: bool
- _gpr_index: Index | None
- _grouper: Index | None
- _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna")
- def __new__(cls, *args, **kwargs):
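- # When `freq` is passed, construct a resampling TimeGrouper (a Grouper
- # subclass) instead of a plain Grouper.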
- if kwargs.get("freq") is not None:
- from pandas.core.resample import TimeGrouper
- cls = TimeGrouper
- return super().__new__(cls)
- def __init__(
- self,
- key=None,
- level=None,
- freq=None,
- axis: Axis = 0,
- sort: bool = False,
- dropna: bool = True,
- ) -> None:
- self.key = key
- self.level = level
- self.freq = freq
- self.axis = axis
- self.sort = sort
- self.dropna = dropna
- self._grouper_deprecated = None
- self._indexer_deprecated = None
- self._obj_deprecated = None
- self._gpr_index = None
- self.binner = None
- self._grouper = None
- self._indexer = None
- def _get_grouper(
- self, obj: NDFrameT, validate: bool = True
- ) -> tuple[ops.BaseGrouper, NDFrameT]:
- """
- Parameters
- ----------
- obj : Series or DataFrame
- validate : bool, default True
- if True, validate the grouper
- Returns
- -------
- A tuple of (grouper, obj), where obj may have been sorted.
- """
- obj, _, _ = self._set_grouper(obj)
- grouper, _, obj = get_grouper(
- obj,
- [self.key],
- axis=self.axis,
- level=self.level,
- sort=self.sort,
- validate=validate,
- dropna=self.dropna,
- )
- # Without setting this, subsequent lookups to .groups raise
- # error: Incompatible types in assignment (expression has type "BaseGrouper",
- # variable has type "None")
- self._grouper_deprecated = grouper # type: ignore[assignment]
- return grouper, obj
- @final
- def _set_grouper(
- self, obj: NDFrame, sort: bool = False, *, gpr_index: Index | None = None
- ):
- """
- Given an object and the specifications, set up the internal grouper
- for this particular specification.
- Parameters
- ----------
- obj : Series or DataFrame
- sort : bool, default False
- whether the resulting grouper should be sorted
- gpr_index : Index or None, default None
- Returns
- -------
- obj : NDFrame
- The object, possibly sorted along the grouping axis.
- ax : Index
- The index or column being grouped over.
- indexer : np.ndarray[np.intp] | None
- The sort indexer that was applied, if any.
- """
- assert obj is not None
- indexer = None
- if self.key is not None and self.level is not None:
- raise ValueError("The Grouper cannot specify both a key and a level!")
- # Keep self._grouper value before overriding
- if self._grouper is None:
- # TODO: What are we assuming about subsequent calls?
- self._grouper = gpr_index
- self._indexer = self._indexer_deprecated
- # the key must be a valid info item
- if self.key is not None:
- key = self.key
- # The 'on' is already defined
- if getattr(gpr_index, "name", None) == key and isinstance(obj, Series):
- # Sometimes self._grouper will have been resorted while
- # obj has not. In this case there is a mismatch when we
- # call self._grouper.take(obj.index) so we need to undo the sorting
- # before we call _grouper.take.
- assert self._grouper is not None
- if self._indexer is not None:
- reverse_indexer = self._indexer.argsort()
- unsorted_ax = self._grouper.take(reverse_indexer)
- ax = unsorted_ax.take(obj.index)
- else:
- ax = self._grouper.take(obj.index)
- else:
- if key not in obj._info_axis:
- raise KeyError(f"The grouper name {key} is not found")
- ax = Index(obj[key], name=key)
- else:
- ax = obj._get_axis(self.axis)
- if self.level is not None:
- level = self.level
- # if a level is given it must be a mi level or
- # equivalent to the axis name
- if isinstance(ax, MultiIndex):
- level = ax._get_level_number(level)
- ax = Index(ax._get_level_values(level), name=ax.names[level])
- else:
- if level not in (0, ax.name):
- raise ValueError(f"The level {level} is not valid")
- # possibly sort
- if (self.sort or sort) and not ax.is_monotonic_increasing:
- # use stable sort to support first, last, nth
- # TODO: why does putting na_position="first" fix datetimelike cases?
- indexer = self._indexer_deprecated = ax.array.argsort(
- kind="mergesort", na_position="first"
- )
- ax = ax.take(indexer)
- obj = obj.take(indexer, axis=self.axis)
- # error: Incompatible types in assignment (expression has type
- # "NDFrameT", variable has type "None")
- self._obj_deprecated = obj # type: ignore[assignment]
- self._gpr_index = ax
- return obj, ax, indexer
- @final
- @property
- def ax(self) -> Index:
- warnings.warn(
- f"{type(self).__name__}.ax is deprecated and will be removed in a "
- "future version. Use Resampler.ax instead",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- index = self._gpr_index
- if index is None:
- raise ValueError("_set_grouper must be called before ax is accessed")
- return index
- @final
- @property
- def indexer(self):
- warnings.warn(
- f"{type(self).__name__}.indexer is deprecated and will be removed "
- "in a future version. Use Resampler.indexer instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self._indexer_deprecated
- @final
- @property
- def obj(self):
- warnings.warn(
- f"{type(self).__name__}.obj is deprecated and will be removed "
- "in a future version. Use GroupBy.indexer instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self._obj_deprecated
- @final
- @property
- def grouper(self):
- warnings.warn(
- f"{type(self).__name__}.grouper is deprecated and will be removed "
- "in a future version. Use GroupBy.grouper instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- return self._grouper_deprecated
- @final
- @property
- def groups(self):
- warnings.warn(
- f"{type(self).__name__}.groups is deprecated and will be removed "
- "in a future version. Use GroupBy.groups instead.",
- FutureWarning,
- stacklevel=find_stack_level(),
- )
- # error: "None" has no attribute "groups"
- return self._grouper_deprecated.groups # type: ignore[attr-defined]
- @final
- def __repr__(self) -> str:
- attrs_list = (
- f"{attr_name}={repr(getattr(self, attr_name))}"
- for attr_name in self._attributes
- if getattr(self, attr_name) is not None
- )
- attrs = ", ".join(attrs_list)
- cls_name = type(self).__name__
- return f"{cls_name}({attrs})"
- @final
- class Grouping:
- """
- Holds the grouping information for a single key
- Parameters
- ----------
- index : Index
- grouper : column label, array-like, mapping, function, or Grouper, optional
- The grouping specification.
- obj : DataFrame or Series, optional
- level : index level name or number, optional
- sort : bool, default True
- Whether to sort the resulting labels.
- observed : bool, default False
- If we are a Categorical, only use the observed values.
- in_axis : bool, default False
- Whether the Grouping is a column in self.obj and hence among
- GroupBy.exclusions.
- dropna : bool, default True
- Whether to drop NA groups.
- uniques : Array-like, optional
- When specified, will be used for unique values. Enables including empty groups
- in the result for a BinGrouper. Must not contain duplicates.
- Attributes
- ----------
- name : Hashable
- Name of this grouping, derived from the grouper or the index level.
- indices : dict
- Mapping of {group -> index_list}
- codes : ndarray
- Group codes
- group_index : Index or None
- unique groups
- groups : dict
- Mapping of {group -> label_list}
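- Examples
- --------
- An illustrative sketch only; `Grouping` is internal and its API may change:
- >>> idx = pd.Index(["a", "b", "a"], name="key")
- >>> ping = Grouping(idx, grouper=idx)
- >>> ping.name
- 'key'
- >>> ping.ngroups
- 2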
- """
- _codes: npt.NDArray[np.signedinteger] | None = None
- _group_index: Index | None = None
- _all_grouper: Categorical | None
- _orig_cats: Index | None
- _index: Index
- def __init__(
- self,
- index: Index,
- grouper=None,
- obj: NDFrame | None = None,
- level=None,
- sort: bool = True,
- observed: bool = False,
- in_axis: bool = False,
- dropna: bool = True,
- uniques: ArrayLike | None = None,
- ) -> None:
- self.level = level
- self._orig_grouper = grouper
- grouping_vector = _convert_grouper(index, grouper)
- self._all_grouper = None
- self._orig_cats = None
- self._index = index
- self._sort = sort
- self.obj = obj
- self._observed = observed
- self.in_axis = in_axis
- self._dropna = dropna
- self._uniques = uniques
- # we have a single grouper which may be any of several things,
- # some of which depend on the passed-in level
- ilevel = self._ilevel
- if ilevel is not None:
- # In extant tests, the new self.grouping_vector matches
- # `index.get_level_values(ilevel)` whenever
- # mapper is None and isinstance(index, MultiIndex)
- if isinstance(index, MultiIndex):
- index_level = index.get_level_values(ilevel)
- else:
- index_level = index
- if grouping_vector is None:
- grouping_vector = index_level
- else:
- mapper = grouping_vector
- grouping_vector = index_level.map(mapper)
- # a passed Grouper-like: directly get the grouper in the same way
- # as a single-grouper groupby; use the group_info to get codes
- elif isinstance(grouping_vector, Grouper):
- # get the new grouper; we already have disambiguated
- # what key/level refer to exactly, don't need to
- # check again as we have by this point converted these
- # to an actual value (rather than a pd.Grouper)
- assert self.obj is not None # for mypy
- newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False)
- self.obj = newobj
- if isinstance(newgrouper, ops.BinGrouper):
- # TODO: can we unwrap this and get a tighter typing
- # for self.grouping_vector?
- grouping_vector = newgrouper
- else:
- # ops.BaseGrouper
- # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1.
- # If that were to occur, would we be throwing out information?
- # error: Cannot determine type of "grouping_vector" [has-type]
- ng = newgrouper.groupings[0].grouping_vector # type: ignore[has-type]
- # use Index instead of ndarray so we can recover the name
- grouping_vector = Index(ng, name=newgrouper.result_index.name)
- elif not isinstance(
- grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
- ):
- # no level passed
- if getattr(grouping_vector, "ndim", 1) != 1:
- t = str(type(grouping_vector))
- raise ValueError(f"Grouper for '{t}' not 1-dimensional")
- grouping_vector = index.map(grouping_vector)
- if not (
- hasattr(grouping_vector, "__len__")
- and len(grouping_vector) == len(index)
- ):
- grper = pprint_thing(grouping_vector)
- errmsg = (
- "Grouper result violates len(labels) == "
- f"len(data)\nresult: {grper}"
- )
- raise AssertionError(errmsg)
- if isinstance(grouping_vector, np.ndarray):
- if grouping_vector.dtype.kind in ["m", "M"]:
- # if we have a date/time-like grouper, make sure that the values
- # are Timestamp-like (i.e. nanosecond resolution)
- # TODO 2022-10-08 we only have one test that gets here and
- # values are already in nanoseconds in that case.
- grouping_vector = Series(grouping_vector).to_numpy()
- elif is_categorical_dtype(grouping_vector):
- # a passed Categorical
- self._orig_cats = grouping_vector.categories
- grouping_vector, self._all_grouper = recode_for_groupby(
- grouping_vector, sort, observed
- )
- self.grouping_vector = grouping_vector
- def __repr__(self) -> str:
- return f"Grouping({self.name})"
- def __iter__(self) -> Iterator:
- return iter(self.indices)
- @cache_readonly
- def _passed_categorical(self) -> bool:
- return is_categorical_dtype(self.grouping_vector)
- @cache_readonly
- def name(self) -> Hashable:
- ilevel = self._ilevel
- if ilevel is not None:
- return self._index.names[ilevel]
- if isinstance(self._orig_grouper, (Index, Series)):
- return self._orig_grouper.name
- elif isinstance(self.grouping_vector, ops.BaseGrouper):
- return self.grouping_vector.result_index.name
- elif isinstance(self.grouping_vector, Index):
- return self.grouping_vector.name
- # otherwise we have ndarray or ExtensionArray -> no name
- return None
- @cache_readonly
- def _ilevel(self) -> int | None:
- """
- If necessary, convert the index level name to the index level position.
- """
- level = self.level
- if level is None:
- return None
- if not isinstance(level, int):
- index = self._index
- if level not in index.names:
- raise AssertionError(f"Level {level} not in index")
- return index.names.index(level)
- return level
- @property
- def ngroups(self) -> int:
- return len(self.group_index)
- @cache_readonly
- def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
- # we have a list of groupers
- if isinstance(self.grouping_vector, ops.BaseGrouper):
- return self.grouping_vector.indices
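- # otherwise map each group value to the integer positions at which it
- # occurs, via a Categorical reverse indexer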
- values = Categorical(self.grouping_vector)
- return values._reverse_indexer()
- @property
- def codes(self) -> npt.NDArray[np.signedinteger]:
- return self._codes_and_uniques[0]
- @cache_readonly
- def group_arraylike(self) -> ArrayLike:
- """
- Analogous to result_index, but holding an ArrayLike to ensure
- we can retain ExtensionDtypes.
- """
- if self._all_grouper is not None:
- # retain dtype for categories, including unobserved ones
- return self.result_index._values
- elif self._passed_categorical:
- return self.group_index._values
- return self._codes_and_uniques[1]
- @cache_readonly
- def result_index(self) -> Index:
- # result_index retains dtype for categories, including unobserved ones,
- # which group_index does not
- if self._all_grouper is not None:
- group_idx = self.group_index
- assert isinstance(group_idx, CategoricalIndex)
- cats = self._orig_cats
- # set_categories is dynamically added
- return group_idx.set_categories(cats) # type: ignore[attr-defined]
- return self.group_index
- @cache_readonly
- def group_index(self) -> Index:
- codes, uniques = self._codes_and_uniques
- if not self._dropna and self._passed_categorical:
- assert isinstance(uniques, Categorical)
- if self._sort and (codes == len(uniques)).any():
- # Add NA value on the end when sorting
- uniques = Categorical.from_codes(
- np.append(uniques.codes, [-1]), uniques.categories
- )
- elif len(codes) > 0:
- # Need to determine proper placement of NA value when not sorting
- cat = self.grouping_vector
- na_idx = (cat.codes < 0).argmax()
- if cat.codes[na_idx] < 0:
- # count the number of unique codes that come before the NA value
- na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
- uniques = Categorical.from_codes(
- np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
- )
- return Index._with_infer(uniques, name=self.name)
- @cache_readonly
- def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
- uniques: ArrayLike
- if self._passed_categorical:
- # we make a CategoricalIndex out of the cat grouper
- # preserving the categories / ordered attributes;
- # doesn't (yet - GH#46909) handle dropna=False
- cat = self.grouping_vector
- categories = cat.categories
- if self._observed:
- ucodes = algorithms.unique1d(cat.codes)
- ucodes = ucodes[ucodes != -1]
- if self._sort:
- ucodes = np.sort(ucodes)
- else:
- ucodes = np.arange(len(categories))
- uniques = Categorical.from_codes(
- codes=ucodes, categories=categories, ordered=cat.ordered
- )
- codes = cat.codes
- if not self._dropna:
- na_mask = codes < 0
- if np.any(na_mask):
- if self._sort:
- # Replace NA codes with `largest code + 1`
- na_code = len(categories)
- codes = np.where(na_mask, na_code, codes)
- else:
- # Insert NA code into the codes based on first appearance
- # A negative code must exist, no need to check codes[na_idx] < 0
- na_idx = na_mask.argmax()
- # count the number of unique codes that come before the NA value
- na_code = algorithms.nunique_ints(codes[:na_idx])
- codes = np.where(codes >= na_code, codes + 1, codes)
- codes = np.where(na_mask, na_code, codes)
- if not self._observed:
- uniques = uniques.reorder_categories(self._orig_cats)
- return codes, uniques
- elif isinstance(self.grouping_vector, ops.BaseGrouper):
- # we have a list of groupers
- codes = self.grouping_vector.codes_info
- uniques = self.grouping_vector.result_index._values
- elif self._uniques is not None:
- # GH#50486 Code grouping_vector using _uniques; allows
- # including uniques that are not present in grouping_vector.
- cat = Categorical(self.grouping_vector, categories=self._uniques)
- codes = cat.codes
- uniques = self._uniques
- else:
- # GH35667, replace dropna=False with use_na_sentinel=False
- # error: Incompatible types in assignment (expression has type "Union[
- # ndarray[Any, Any], Index]", variable has type "Categorical")
- codes, uniques = algorithms.factorize( # type: ignore[assignment]
- self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
- )
- return codes, uniques
- @cache_readonly
- def groups(self) -> dict[Hashable, np.ndarray]:
- return self._index.groupby(Categorical.from_codes(self.codes, self.group_index))
- def get_grouper(
- obj: NDFrameT,
- key=None,
- axis: Axis = 0,
- level=None,
- sort: bool = True,
- observed: bool = False,
- validate: bool = True,
- dropna: bool = True,
- ) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]:
- """
- Create and return a BaseGrouper, which is an internal
- mapping of how to create the grouper indexers.
- This may be composed of multiple Grouping objects, indicating
- multiple groupers.
- Groupers are ultimately index mappings. They can originate as
- index mappings, keys to columns, functions, or Groupers.
- Groupers enable local references to axis, level, and sort, while
- the passed-in axis, level, and sort are 'global'.
- This routine tries to figure out what the passed-in references
- are and then creates a Grouping for each one, combined into
- a BaseGrouper.
- If observed, and we have a categorical grouper, only show the observed
- values.
- If validate, then check for key/level overlaps.
- """
- group_axis = obj._get_axis(axis)
- # validate that the passed single level is compatible with the passed
- # axis of the object
- if level is not None:
- # TODO: This if-block and the else-block are almost the same.
- # The MultiIndex instance check is removable, but it seems that there are
- # some steps that apply only to the non-MultiIndex case in the else-block,
- # e.g. `obj.index.name != level`. We have to consider carefully whether
- # these are applicable to MultiIndex. Even if they are applicable,
- # we need to make sure they have no side effects on subsequent
- # processing outside of this condition.
- # (GH 17621)
- if isinstance(group_axis, MultiIndex):
- if is_list_like(level) and len(level) == 1:
- level = level[0]
- if key is None and is_scalar(level):
- # Get the level values from group_axis
- key = group_axis.get_level_values(level)
- level = None
- else:
- # allow level to be a length-one list-like object
- # (e.g., level=[0])
- # GH 13901
- if is_list_like(level):
- nlevels = len(level)
- if nlevels == 1:
- level = level[0]
- elif nlevels == 0:
- raise ValueError("No group keys passed!")
- else:
- raise ValueError("multiple levels only valid with MultiIndex")
- if isinstance(level, str):
- if obj._get_axis(axis).name != level:
- raise ValueError(
- f"level name {level} is not the name "
- f"of the {obj._get_axis_name(axis)}"
- )
- elif level > 0 or level < -1:
- raise ValueError("level > 0 or level < -1 only valid with MultiIndex")
- # NOTE: `group_axis` and `group_axis.get_level_values(level)`
- # are the same in this section.
- level = None
- key = group_axis
- # a passed-in Grouper, directly convert
- if isinstance(key, Grouper):
- grouper, obj = key._get_grouper(obj, validate=False)
- if key.key is None:
- return grouper, frozenset(), obj
- else:
- return grouper, frozenset({key.key}), obj
- # already have a BaseGrouper, just return it
- elif isinstance(key, ops.BaseGrouper):
- return key, frozenset(), obj
- if not isinstance(key, list):
- keys = [key]
- match_axis_length = False
- else:
- keys = key
- match_axis_length = len(keys) == len(group_axis)
- # what are we after, exactly?
- any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
- any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
- any_arraylike = any(
- isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
- )
- # is this an index replacement?
- if (
- not any_callable
- and not any_arraylike
- and not any_groupers
- and match_axis_length
- and level is None
- ):
- if isinstance(obj, DataFrame):
- all_in_columns_index = all(
- g in obj.columns or g in obj.index.names for g in keys
- )
- else:
- assert isinstance(obj, Series)
- all_in_columns_index = all(g in obj.index.names for g in keys)
- if not all_in_columns_index:
- keys = [com.asarray_tuplesafe(keys)]
- if isinstance(level, (tuple, list)):
- if key is None:
- keys = [None] * len(level)
- levels = level
- else:
- levels = [level] * len(keys)
- groupings: list[Grouping] = []
- exclusions: set[Hashable] = set()
- # if the actual grouper should be obj[key]
- def is_in_axis(key) -> bool:
- if not _is_label_like(key):
- if obj.ndim == 1:
- return False
- # items -> .columns for DataFrame, .index for Series
- items = obj.axes[-1]
- try:
- items.get_loc(key)
- except (KeyError, TypeError, InvalidIndexError):
- # TypeError shows up here if we pass e.g. an Index
- return False
- return True
- # if the grouper is obj[name]
- def is_in_obj(gpr) -> bool:
- if not hasattr(gpr, "name"):
- return False
- if using_copy_on_write():
- # For the CoW case, we check the references to determine if the
- # series is part of the object
- try:
- obj_gpr_column = obj[gpr.name]
- except (KeyError, IndexError, InvalidIndexError):
- return False
- if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series):
- return gpr._mgr.references_same_values( # type: ignore[union-attr]
- obj_gpr_column._mgr, 0 # type: ignore[arg-type]
- )
- return False
- try:
- return gpr is obj[gpr.name]
- except (KeyError, IndexError, InvalidIndexError):
- # IndexError reached in e.g. test_skip_group_keys when we pass
- # lambda here
- # InvalidIndexError raised on key-types inappropriate for index,
- # e.g. DatetimeIndex.get_loc(tuple())
- return False
- for gpr, level in zip(keys, levels):
- if is_in_obj(gpr): # df.groupby(df['name'])
- in_axis = True
- exclusions.add(gpr.name)
- elif is_in_axis(gpr): # df.groupby('name')
- if obj.ndim != 1 and gpr in obj:
- if validate:
- obj._check_label_or_level_ambiguity(gpr, axis=axis)
- in_axis, name, gpr = True, gpr, obj[gpr]
- if gpr.ndim != 1:
- # non-unique columns; raise here to get the name in the
- # exception message
- raise ValueError(f"Grouper for '{name}' not 1-dimensional")
- exclusions.add(name)
- elif obj._is_level_reference(gpr, axis=axis):
- in_axis, level, gpr = False, gpr, None
- else:
- raise KeyError(gpr)
- elif isinstance(gpr, Grouper) and gpr.key is not None:
- # Add key to exclusions
- exclusions.add(gpr.key)
- in_axis = True
- else:
- in_axis = False
- # create the Grouping
- # allow passing the actual Grouping as the gpr
- ping = (
- Grouping(
- group_axis,
- gpr,
- obj=obj,
- level=level,
- sort=sort,
- observed=observed,
- in_axis=in_axis,
- dropna=dropna,
- )
- if not isinstance(gpr, Grouping)
- else gpr
- )
- groupings.append(ping)
- if len(groupings) == 0 and len(obj):
- raise ValueError("No group keys passed!")
- if len(groupings) == 0:
- groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))
- # create the internals grouper
- grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna)
- return grouper, frozenset(exclusions), obj
- def _is_label_like(val) -> bool:
- return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
- def _convert_grouper(axis: Index, grouper):
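- """
- Resolve the passed grouper to something `Grouping` can consume:
- a callable (for a dict, its `.get` method), an array of values aligned
- with `axis` (for Series, MultiIndex, and array-likes, with a length
- check), or the grouper unchanged (e.g. a function).
- """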
- if isinstance(grouper, dict):
- return grouper.get
- elif isinstance(grouper, Series):
- if grouper.index.equals(axis):
- return grouper._values
- else:
- return grouper.reindex(axis)._values
- elif isinstance(grouper, MultiIndex):
- return grouper._values
- elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)):
- if len(grouper) != len(axis):
- raise ValueError("Grouper and axis must be same length")
- if isinstance(grouper, (list, tuple)):
- grouper = com.asarray_tuplesafe(grouper)
- return grouper
- else:
- return grouper
|