  1. """
  2. Provide user facing operators for doing the split part of the
  3. split-apply-combine paradigm.
  4. """
  5. from __future__ import annotations
  6. from typing import (
  7. TYPE_CHECKING,
  8. Hashable,
  9. Iterator,
  10. final,
  11. )
  12. import warnings
  13. import numpy as np
  14. from pandas._config import using_copy_on_write
  15. from pandas._typing import (
  16. ArrayLike,
  17. Axis,
  18. NDFrameT,
  19. npt,
  20. )
  21. from pandas.errors import InvalidIndexError
  22. from pandas.util._decorators import cache_readonly
  23. from pandas.util._exceptions import find_stack_level
  24. from pandas.core.dtypes.common import (
  25. is_categorical_dtype,
  26. is_list_like,
  27. is_scalar,
  28. )
  29. from pandas.core import algorithms
  30. from pandas.core.arrays import (
  31. Categorical,
  32. ExtensionArray,
  33. )
  34. import pandas.core.common as com
  35. from pandas.core.frame import DataFrame
  36. from pandas.core.groupby import ops
  37. from pandas.core.groupby.categorical import recode_for_groupby
  38. from pandas.core.indexes.api import (
  39. CategoricalIndex,
  40. Index,
  41. MultiIndex,
  42. )
  43. from pandas.core.series import Series
  44. from pandas.io.formats.printing import pprint_thing
  45. if TYPE_CHECKING:
  46. from pandas.core.generic import NDFrame


class Grouper:
    """
    A Grouper allows the user to specify a groupby instruction for an object.

    This specification will select a column via the key parameter, or if the
    level and/or axis parameters are given, a level of the index of the target
    object.

    If `axis` and/or `level` are passed as keywords to both `Grouper` and
    `groupby`, the values passed to `Grouper` take precedence.

    Parameters
    ----------
    key : str, default None
        Groupby key, which selects the grouping column of the target.
    level : name/number, default None
        The level for the target index.
    freq : str / frequency object, default None
        This will groupby the specified frequency if the target selection
        (via key or level) is a datetime-like object. For full specification
        of available frequencies, please see `here
        <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`_.
    axis : str, int, default 0
        Number/name of the axis.
    sort : bool, default False
        Whether to sort the resulting labels.
    closed : {'left' or 'right'}
        Closed end of interval. Only when `freq` parameter is passed.
    label : {'left' or 'right'}
        Interval boundary to use for labeling.
        Only when `freq` parameter is passed.
    convention : {'start', 'end', 'e', 's'}
        If grouper is PeriodIndex and `freq` parameter is passed.
    origin : Timestamp or str, default 'start_day'
        The timestamp on which to adjust the grouping. The timezone of origin must
        match the timezone of the index.
        If string, must be one of the following:

        - 'epoch': `origin` is 1970-01-01
        - 'start': `origin` is the first value of the timeseries
        - 'start_day': `origin` is the first day at midnight of the timeseries

        .. versionadded:: 1.1.0

        - 'end': `origin` is the last value of the timeseries
        - 'end_day': `origin` is the ceiling midnight of the last day

        .. versionadded:: 1.3.0

    offset : Timedelta or str, default None
        An offset timedelta added to the origin.

        .. versionadded:: 1.1.0

    dropna : bool, default True
        If True, and if group keys contain NA values, NA values together with
        row/column will be dropped. If False, NA values will also be treated as
        the key in groups.

        .. versionadded:: 1.2.0

    Returns
    -------
    A specification for a groupby instruction

    Examples
    --------
    Syntactic sugar for ``df.groupby('A')``

    >>> df = pd.DataFrame(
    ...     {
    ...         "Animal": ["Falcon", "Parrot", "Falcon", "Falcon", "Parrot"],
    ...         "Speed": [100, 5, 200, 300, 15],
    ...     }
    ... )
    >>> df
       Animal  Speed
    0  Falcon    100
    1  Parrot      5
    2  Falcon    200
    3  Falcon    300
    4  Parrot     15
    >>> df.groupby(pd.Grouper(key="Animal")).mean()
            Speed
    Animal
    Falcon  200.0
    Parrot   10.0

    Specify a resample operation on the column 'Publish date'

    >>> df = pd.DataFrame(
    ...     {
    ...         "Publish date": [
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-02"),
    ...             pd.Timestamp("2000-01-09"),
    ...             pd.Timestamp("2000-01-16")
    ...         ],
    ...         "ID": [0, 1, 2, 3],
    ...         "Price": [10, 20, 30, 40]
    ...     }
    ... )
    >>> df
      Publish date  ID  Price
    0   2000-01-02   0     10
    1   2000-01-02   1     20
    2   2000-01-09   2     30
    3   2000-01-16   3     40
    >>> df.groupby(pd.Grouper(key="Publish date", freq="1W")).mean()
                   ID  Price
    Publish date
    2000-01-02    0.5   15.0
    2000-01-09    2.0   30.0
    2000-01-16    3.0   40.0

    If you want to adjust the start of the bins based on a fixed timestamp:

    >>> start, end = '2000-10-01 23:30:00', '2000-10-02 00:30:00'
    >>> rng = pd.date_range(start, end, freq='7min')
    >>> ts = pd.Series(np.arange(len(rng)) * 3, index=rng)
    >>> ts
    2000-10-01 23:30:00     0
    2000-10-01 23:37:00     3
    2000-10-01 23:44:00     6
    2000-10-01 23:51:00     9
    2000-10-01 23:58:00    12
    2000-10-02 00:05:00    15
    2000-10-02 00:12:00    18
    2000-10-02 00:19:00    21
    2000-10-02 00:26:00    24
    Freq: 7T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min')).sum()
    2000-10-01 23:14:00     0
    2000-10-01 23:31:00     9
    2000-10-01 23:48:00    21
    2000-10-02 00:05:00    54
    2000-10-02 00:22:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='epoch')).sum()
    2000-10-01 23:18:00     0
    2000-10-01 23:35:00    18
    2000-10-01 23:52:00    27
    2000-10-02 00:09:00    39
    2000-10-02 00:26:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', origin='2000-01-01')).sum()
    2000-10-01 23:24:00     3
    2000-10-01 23:41:00    15
    2000-10-01 23:58:00    45
    2000-10-02 00:15:00    45
    Freq: 17T, dtype: int64

    If you want to adjust the start of the bins with an `offset` Timedelta, the two
    following lines are equivalent:

    >>> ts.groupby(pd.Grouper(freq='17min', origin='start')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17T, dtype: int64

    >>> ts.groupby(pd.Grouper(freq='17min', offset='23h30min')).sum()
    2000-10-01 23:30:00     9
    2000-10-01 23:47:00    21
    2000-10-02 00:04:00    54
    2000-10-02 00:21:00    24
    Freq: 17T, dtype: int64

    To replace the use of the deprecated `base` argument, you can now use `offset`,
    in this example it is equivalent to having `base=2`:

    >>> ts.groupby(pd.Grouper(freq='17min', offset='2min')).sum()
    2000-10-01 23:16:00     0
    2000-10-01 23:33:00     9
    2000-10-01 23:50:00    36
    2000-10-02 00:07:00    39
    2000-10-02 00:24:00    24
    Freq: 17T, dtype: int64
    """

    sort: bool
    dropna: bool
    _gpr_index: Index | None
    _grouper: Index | None

    _attributes: tuple[str, ...] = ("key", "level", "freq", "axis", "sort", "dropna")

    def __new__(cls, *args, **kwargs):
        if kwargs.get("freq") is not None:
            from pandas.core.resample import TimeGrouper

            cls = TimeGrouper
        return super().__new__(cls)
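
    # Note: because of the ``freq`` dispatch in __new__ above,
    # ``pd.Grouper(freq=...)`` does not construct a plain Grouper. A minimal
    # illustration (hypothetical column names; not executed in this module):
    #
    #   >>> type(pd.Grouper(key="price")).__name__             # doctest: +SKIP
    #   'Grouper'
    #   >>> type(pd.Grouper(key="date", freq="1W")).__name__   # doctest: +SKIP
    #   'TimeGrouper'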

    def __init__(
        self,
        key=None,
        level=None,
        freq=None,
        axis: Axis = 0,
        sort: bool = False,
        dropna: bool = True,
    ) -> None:
        self.key = key
        self.level = level
        self.freq = freq
        self.axis = axis
        self.sort = sort
        self.dropna = dropna

        self._grouper_deprecated = None
        self._indexer_deprecated = None
        self._obj_deprecated = None
        self._gpr_index = None
        self.binner = None
        self._grouper = None
        self._indexer = None

    def _get_grouper(
        self, obj: NDFrameT, validate: bool = True
    ) -> tuple[ops.BaseGrouper, NDFrameT]:
        """
        Parameters
        ----------
        obj : Series or DataFrame
        validate : bool, default True
            if True, validate the grouper

        Returns
        -------
        a tuple of grouper, obj (possibly sorted)
        """
        obj, _, _ = self._set_grouper(obj)
        grouper, _, obj = get_grouper(
            obj,
            [self.key],
            axis=self.axis,
            level=self.level,
            sort=self.sort,
            validate=validate,
            dropna=self.dropna,
        )
        # Without setting this, subsequent lookups to .groups raise
        # error: Incompatible types in assignment (expression has type "BaseGrouper",
        # variable has type "None")
        self._grouper_deprecated = grouper  # type: ignore[assignment]
        return grouper, obj

    @final
    def _set_grouper(
        self, obj: NDFrame, sort: bool = False, *, gpr_index: Index | None = None
    ):
        """
        Given an object and the specification, set up the internal grouper
        for this particular specification.

        Parameters
        ----------
        obj : Series or DataFrame
        sort : bool, default False
            whether the resulting grouper should be sorted
        gpr_index : Index or None, default None

        Returns
        -------
        NDFrame
        Index
        np.ndarray[np.intp] | None
        """
        assert obj is not None

        indexer = None

        if self.key is not None and self.level is not None:
            raise ValueError("The Grouper cannot specify both a key and a level!")

        # Keep self._grouper value before overriding
        if self._grouper is None:
            # TODO: What are we assuming about subsequent calls?
            self._grouper = gpr_index
            self._indexer = self._indexer_deprecated

        # the key must be a valid info item
        if self.key is not None:
            key = self.key
            # The 'on' is already defined
            if getattr(gpr_index, "name", None) == key and isinstance(obj, Series):
                # Sometimes self._grouper will have been resorted while
                # obj has not. In this case there is a mismatch when we
                # call self._grouper.take(obj.index) so we need to undo the sorting
                # before we call _grouper.take.
                assert self._grouper is not None
                if self._indexer is not None:
                    reverse_indexer = self._indexer.argsort()
                    unsorted_ax = self._grouper.take(reverse_indexer)
                    ax = unsorted_ax.take(obj.index)
                else:
                    ax = self._grouper.take(obj.index)
            else:
                if key not in obj._info_axis:
                    raise KeyError(f"The grouper name {key} is not found")
                ax = Index(obj[key], name=key)
        else:
            ax = obj._get_axis(self.axis)
            if self.level is not None:
                level = self.level

                # if a level is given it must be a mi level or
                # equivalent to the axis name
                if isinstance(ax, MultiIndex):
                    level = ax._get_level_number(level)
                    ax = Index(ax._get_level_values(level), name=ax.names[level])
                else:
                    if level not in (0, ax.name):
                        raise ValueError(f"The level {level} is not valid")

        # possibly sort
        if (self.sort or sort) and not ax.is_monotonic_increasing:
            # use stable sort to support first, last, nth
            # TODO: why does putting na_position="first" fix datetimelike cases?
            indexer = self._indexer_deprecated = ax.array.argsort(
                kind="mergesort", na_position="first"
            )
            ax = ax.take(indexer)
            obj = obj.take(indexer, axis=self.axis)

        # error: Incompatible types in assignment (expression has type
        # "NDFrameT", variable has type "None")
        self._obj_deprecated = obj  # type: ignore[assignment]
        self._gpr_index = ax
        return obj, ax, indexer
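
    # Note: an illustrative sketch of the sort path above (hypothetical
    # values; not executed in this module). With sort=True and a
    # non-monotonic axis, both the axis and the object are reordered with a
    # stable argsort, and the indexer used for the reordering is returned:
    #
    #   >>> s = pd.Series([1, 2, 3], index=["b", "c", "a"])           # doctest: +SKIP
    #   >>> _, ax, idx = pd.Grouper(level=0, sort=True)._set_grouper(s)  # doctest: +SKIP
    #   >>> list(ax), list(idx)                                       # doctest: +SKIP
    #   (['a', 'b', 'c'], [2, 0, 1])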

    @final
    @property
    def ax(self) -> Index:
        warnings.warn(
            f"{type(self).__name__}.ax is deprecated and will be removed in a "
            "future version. Use Resampler.ax instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        index = self._gpr_index
        if index is None:
            raise ValueError("_set_grouper must be called before ax is accessed")
        return index

    @final
    @property
    def indexer(self):
        warnings.warn(
            f"{type(self).__name__}.indexer is deprecated and will be removed "
            "in a future version. Use Resampler.indexer instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._indexer_deprecated

    @final
    @property
    def obj(self):
        warnings.warn(
            f"{type(self).__name__}.obj is deprecated and will be removed "
            "in a future version. Use GroupBy.obj instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._obj_deprecated

    @final
    @property
    def grouper(self):
        warnings.warn(
            f"{type(self).__name__}.grouper is deprecated and will be removed "
            "in a future version. Use GroupBy.grouper instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        return self._grouper_deprecated

    @final
    @property
    def groups(self):
        warnings.warn(
            f"{type(self).__name__}.groups is deprecated and will be removed "
            "in a future version. Use GroupBy.groups instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
        # error: "None" has no attribute "groups"
        return self._grouper_deprecated.groups  # type: ignore[attr-defined]

    @final
    def __repr__(self) -> str:
        attrs_list = (
            f"{attr_name}={repr(getattr(self, attr_name))}"
            for attr_name in self._attributes
            if getattr(self, attr_name) is not None
        )
        attrs = ", ".join(attrs_list)
        cls_name = type(self).__name__
        return f"{cls_name}({attrs})"
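
    # Note: a short illustration of the repr built above, assuming defaults
    # for the unset parameters (not executed in this module):
    #
    #   >>> pd.Grouper(key="A", sort=True)  # doctest: +SKIP
    #   Grouper(key='A', axis=0, sort=True, dropna=True)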


@final
class Grouping:
    """
    Holds the grouping information for a single key.

    Parameters
    ----------
    index : Index
    grouper :
    obj : DataFrame or Series
    name : Label
    level :
    observed : bool, default False
        If we are a Categorical, use the observed values
    in_axis : if the Grouping is a column in self.obj and hence among
        GroupBy.exclusions list
    dropna : bool, default True
        Whether to drop NA groups.
    uniques : Array-like, optional
        When specified, will be used for unique values. Enables including empty groups
        in the result for a BinGrouper. Must not contain duplicates.

    Attributes
    ----------
    indices : dict
        Mapping of {group -> index_list}
    codes : ndarray
        Group codes
    group_index : Index or None
        unique groups
    groups : dict
        Mapping of {group -> label_list}
    """

    _codes: npt.NDArray[np.signedinteger] | None = None
    _group_index: Index | None = None
    _all_grouper: Categorical | None
    _orig_cats: Index | None
    _index: Index

    def __init__(
        self,
        index: Index,
        grouper=None,
        obj: NDFrame | None = None,
        level=None,
        sort: bool = True,
        observed: bool = False,
        in_axis: bool = False,
        dropna: bool = True,
        uniques: ArrayLike | None = None,
    ) -> None:
        self.level = level
        self._orig_grouper = grouper
        grouping_vector = _convert_grouper(index, grouper)
        self._all_grouper = None
        self._orig_cats = None
        self._index = index
        self._sort = sort
        self.obj = obj
        self._observed = observed
        self.in_axis = in_axis
        self._dropna = dropna
        self._uniques = uniques

        # we have a single grouper which may be a myriad of things,
        # some of which depend on the passed-in level
        ilevel = self._ilevel
        if ilevel is not None:
            # In extant tests, the new self.grouping_vector matches
            #  `index.get_level_values(ilevel)` whenever
            #  mapper is None and isinstance(index, MultiIndex)
            if isinstance(index, MultiIndex):
                index_level = index.get_level_values(ilevel)
            else:
                index_level = index

            if grouping_vector is None:
                grouping_vector = index_level
            else:
                mapper = grouping_vector
                grouping_vector = index_level.map(mapper)

        # a passed Grouper-like: directly get the grouper in the same way
        # as a single-grouper groupby, use the group_info to get codes
        elif isinstance(grouping_vector, Grouper):
            # get the new grouper; we already have disambiguated
            # what key/level refer to exactly, don't need to
            # check again as we have by this point converted these
            # to an actual value (rather than a pd.Grouper)
            assert self.obj is not None  # for mypy
            newgrouper, newobj = grouping_vector._get_grouper(self.obj, validate=False)
            self.obj = newobj

            if isinstance(newgrouper, ops.BinGrouper):
                # TODO: can we unwrap this and get a tighter typing
                #  for self.grouping_vector?
                grouping_vector = newgrouper
            else:
                # ops.BaseGrouper
                # TODO: 2023-02-03 no test cases with len(newgrouper.groupings) > 1.
                #  If that were to occur, would we be throwing out information?
                # error: Cannot determine type of "grouping_vector"  [has-type]
                ng = newgrouper.groupings[0].grouping_vector  # type: ignore[has-type]
                # use Index instead of ndarray so we can recover the name
                grouping_vector = Index(ng, name=newgrouper.result_index.name)

        elif not isinstance(
            grouping_vector, (Series, Index, ExtensionArray, np.ndarray)
        ):
            # no level passed
            if getattr(grouping_vector, "ndim", 1) != 1:
                t = str(type(grouping_vector))
                raise ValueError(f"Grouper for '{t}' not 1-dimensional")

            grouping_vector = index.map(grouping_vector)

            if not (
                hasattr(grouping_vector, "__len__")
                and len(grouping_vector) == len(index)
            ):
                grper = pprint_thing(grouping_vector)
                errmsg = (
                    "Grouper result violates len(labels) == "
                    f"len(data)\nresult: {grper}"
                )
                raise AssertionError(errmsg)

        if isinstance(grouping_vector, np.ndarray):
            if grouping_vector.dtype.kind in ["m", "M"]:
                # if we have a date/time-like grouper, make sure that we have
                # Timestamps like
                # TODO 2022-10-08 we only have one test that gets here and
                #  values are already in nanoseconds in that case.
                grouping_vector = Series(grouping_vector).to_numpy()
        elif is_categorical_dtype(grouping_vector):
            # a passed Categorical
            self._orig_cats = grouping_vector.categories
            grouping_vector, self._all_grouper = recode_for_groupby(
                grouping_vector, sort, observed
            )

        self.grouping_vector = grouping_vector
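
    # Note: an illustrative summary of the normalization above (hypothetical
    # values; not executed in this module). Whatever was passed in (a level,
    # a Grouper, a dict, a callable, a Categorical, ...) ends up as an
    # array-like grouping_vector; e.g. for a dict grouper:
    #
    #   >>> idx = pd.Index(["a", "b", "c"])                 # doctest: +SKIP
    #   >>> ping = Grouping(idx, {"a": 1, "b": 2, "c": 1})  # doctest: +SKIP
    #   >>> list(ping.grouping_vector)                      # doctest: +SKIP
    #   [1, 2, 1]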

    def __repr__(self) -> str:
        return f"Grouping({self.name})"

    def __iter__(self) -> Iterator:
        return iter(self.indices)

    @cache_readonly
    def _passed_categorical(self) -> bool:
        return is_categorical_dtype(self.grouping_vector)

    @cache_readonly
    def name(self) -> Hashable:
        ilevel = self._ilevel
        if ilevel is not None:
            return self._index.names[ilevel]

        if isinstance(self._orig_grouper, (Index, Series)):
            return self._orig_grouper.name

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.result_index.name

        elif isinstance(self.grouping_vector, Index):
            return self.grouping_vector.name

        # otherwise we have ndarray or ExtensionArray -> no name
        return None

    @cache_readonly
    def _ilevel(self) -> int | None:
        """
        If necessary, convert index level name to index level position.
        """
        level = self.level
        if level is None:
            return None
        if not isinstance(level, int):
            index = self._index
            if level not in index.names:
                raise AssertionError(f"Level {level} not in index")
            return index.names.index(level)
        return level
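
    # Note: an illustrative sketch of the level lookup above (hypothetical
    # names; not executed in this module):
    #
    #   >>> mi = pd.MultiIndex.from_arrays([[1], [2]], names=["x", "y"])  # doctest: +SKIP
    #   >>> Grouping(mi, level="y")._ilevel                               # doctest: +SKIP
    #   1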

    @property
    def ngroups(self) -> int:
        return len(self.group_index)

    @cache_readonly
    def indices(self) -> dict[Hashable, npt.NDArray[np.intp]]:
        # we have a list of groupers
        if isinstance(self.grouping_vector, ops.BaseGrouper):
            return self.grouping_vector.indices

        values = Categorical(self.grouping_vector)
        return values._reverse_indexer()
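
    # Note: an illustrative sketch of the mapping built above (hypothetical
    # values; not executed in this module):
    #
    #   >>> ping = Grouping(pd.Index(range(3)), ["a", "b", "a"])  # doctest: +SKIP
    #   >>> ping.indices                                          # doctest: +SKIP
    #   {'a': array([0, 2]), 'b': array([1])}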

    @property
    def codes(self) -> npt.NDArray[np.signedinteger]:
        return self._codes_and_uniques[0]

    @cache_readonly
    def group_arraylike(self) -> ArrayLike:
        """
        Analogous to result_index, but holding an ArrayLike to ensure
        we can retain ExtensionDtypes.
        """
        if self._all_grouper is not None:
            # retain dtype for categories, including unobserved ones
            return self.result_index._values

        elif self._passed_categorical:
            return self.group_index._values

        return self._codes_and_uniques[1]

    @cache_readonly
    def result_index(self) -> Index:
        # result_index retains dtype for categories, including unobserved ones,
        # which group_index does not
        if self._all_grouper is not None:
            group_idx = self.group_index
            assert isinstance(group_idx, CategoricalIndex)
            cats = self._orig_cats
            # set_categories is dynamically added
            return group_idx.set_categories(cats)  # type: ignore[attr-defined]
        return self.group_index

    @cache_readonly
    def group_index(self) -> Index:
        codes, uniques = self._codes_and_uniques
        if not self._dropna and self._passed_categorical:
            assert isinstance(uniques, Categorical)
            if self._sort and (codes == len(uniques)).any():
                # Add NA value on the end when sorting
                uniques = Categorical.from_codes(
                    np.append(uniques.codes, [-1]), uniques.categories
                )
            elif len(codes) > 0:
                # Need to determine proper placement of NA value when not sorting
                cat = self.grouping_vector
                na_idx = (cat.codes < 0).argmax()
                if cat.codes[na_idx] < 0:
                    # count the number of unique codes that come before the NA value
                    na_unique_idx = algorithms.nunique_ints(cat.codes[:na_idx])
                    uniques = Categorical.from_codes(
                        np.insert(uniques.codes, na_unique_idx, -1), uniques.categories
                    )
        return Index._with_infer(uniques, name=self.name)

    @cache_readonly
    def _codes_and_uniques(self) -> tuple[npt.NDArray[np.signedinteger], ArrayLike]:
        uniques: ArrayLike
        if self._passed_categorical:
            # we make a CategoricalIndex out of the cat grouper
            # preserving the categories / ordered attributes;
            # doesn't (yet - GH#46909) handle dropna=False
            cat = self.grouping_vector
            categories = cat.categories

            if self._observed:
                ucodes = algorithms.unique1d(cat.codes)
                ucodes = ucodes[ucodes != -1]
                if self._sort:
                    ucodes = np.sort(ucodes)
            else:
                ucodes = np.arange(len(categories))

            uniques = Categorical.from_codes(
                codes=ucodes, categories=categories, ordered=cat.ordered
            )

            codes = cat.codes
            if not self._dropna:
                na_mask = codes < 0
                if np.any(na_mask):
                    if self._sort:
                        # Replace NA codes with `largest code + 1`
                        na_code = len(categories)
                        codes = np.where(na_mask, na_code, codes)
                    else:
                        # Insert NA code into the codes based on first appearance
                        # A negative code must exist, no need to check codes[na_idx] < 0
                        na_idx = na_mask.argmax()
                        # count the number of unique codes that come before the NA value
                        na_code = algorithms.nunique_ints(codes[:na_idx])
                        codes = np.where(codes >= na_code, codes + 1, codes)
                        codes = np.where(na_mask, na_code, codes)

            if not self._observed:
                uniques = uniques.reorder_categories(self._orig_cats)

            return codes, uniques

        elif isinstance(self.grouping_vector, ops.BaseGrouper):
            # we have a list of groupers
            codes = self.grouping_vector.codes_info
            uniques = self.grouping_vector.result_index._values
        elif self._uniques is not None:
            # GH#50486 Code grouping_vector using _uniques; allows
            # including uniques that are not present in grouping_vector.
            cat = Categorical(self.grouping_vector, categories=self._uniques)
            codes = cat.codes
            uniques = self._uniques
        else:
            # GH35667, replace dropna=False with use_na_sentinel=False
            # error: Incompatible types in assignment (expression has type "Union[
            # ndarray[Any, Any], Index]", variable has type "Categorical")
            codes, uniques = algorithms.factorize(  # type: ignore[assignment]
                self.grouping_vector, sort=self._sort, use_na_sentinel=self._dropna
            )
        return codes, uniques
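
    # Note: the codes/uniques contract above mirrors the public pd.factorize;
    # an illustrative sketch (hypothetical values, not executed in this
    # module):
    #
    #   >>> codes, uniques = pd.factorize(["b", "a", "b"], sort=True)  # doctest: +SKIP
    #   >>> codes, uniques                                             # doctest: +SKIP
    #   (array([1, 0, 1]), array(['a', 'b'], dtype=object))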

    @cache_readonly
    def groups(self) -> dict[Hashable, np.ndarray]:
        return self._index.groupby(
            Categorical.from_codes(self.codes, self.group_index)
        )


def get_grouper(
    obj: NDFrameT,
    key=None,
    axis: Axis = 0,
    level=None,
    sort: bool = True,
    observed: bool = False,
    validate: bool = True,
    dropna: bool = True,
) -> tuple[ops.BaseGrouper, frozenset[Hashable], NDFrameT]:
    """
    Create and return a BaseGrouper, which is an internal
    mapping of how to create the grouper indexers.
    This may be composed of multiple Grouping objects, indicating
    multiple groupers.

    Groupers are ultimately index mappings. They can originate as:
    index mappings, keys to columns, functions, or Groupers.

    Groupers enable local references to axis, level, and sort, while
    the passed-in axis, level, and sort are 'global'.

    This routine tries to figure out what the passed-in references
    are and then creates a Grouping for each one, combined into
    a BaseGrouper.

    If observed & we have a categorical grouper, only show the observed
    values.

    If validate, then check for key/level overlaps.
    """

    group_axis = obj._get_axis(axis)

    # validate that the passed single level is compatible with the passed
    # axis of the object
    if level is not None:
        # TODO: This if-block and else-block are almost the same.
        # MultiIndex instance check is removable, but it seems that there are
        # some processes only for non-MultiIndex in else-block,
        # eg. `obj.index.name != level`. We have to consider carefully whether
        # these are applicable for MultiIndex. Even if these are applicable,
        # we need to check if it makes no side effect to subsequent processes
        # on the outside of this condition.
        # (GH 17621)
        if isinstance(group_axis, MultiIndex):
            if is_list_like(level) and len(level) == 1:
                level = level[0]

            if key is None and is_scalar(level):
                # Get the level values from group_axis
                key = group_axis.get_level_values(level)
                level = None

        else:
            # allow level to be a length-one list-like object
            # (e.g., level=[0])
            # GH 13901
            if is_list_like(level):
                nlevels = len(level)
                if nlevels == 1:
                    level = level[0]
                elif nlevels == 0:
                    raise ValueError("No group keys passed!")
                else:
                    raise ValueError("multiple levels only valid with MultiIndex")

            if isinstance(level, str):
                if obj._get_axis(axis).name != level:
                    raise ValueError(
                        f"level name {level} is not the name "
                        f"of the {obj._get_axis_name(axis)}"
                    )
            elif level > 0 or level < -1:
                raise ValueError("level > 0 or level < -1 only valid with MultiIndex")

            # NOTE: `group_axis` and `group_axis.get_level_values(level)`
            # are the same in this section.
            level = None
            key = group_axis

    # a passed-in Grouper, directly convert
    if isinstance(key, Grouper):
        grouper, obj = key._get_grouper(obj, validate=False)
        if key.key is None:
            return grouper, frozenset(), obj
        else:
            return grouper, frozenset({key.key}), obj

    # already have a BaseGrouper, just return it
    elif isinstance(key, ops.BaseGrouper):
        return key, frozenset(), obj

    if not isinstance(key, list):
        keys = [key]
        match_axis_length = False
    else:
        keys = key
        match_axis_length = len(keys) == len(group_axis)

    # what are we after, exactly?
    any_callable = any(callable(g) or isinstance(g, dict) for g in keys)
    any_groupers = any(isinstance(g, (Grouper, Grouping)) for g in keys)
    any_arraylike = any(
        isinstance(g, (list, tuple, Series, Index, np.ndarray)) for g in keys
    )

    # is this an index replacement?
    if (
        not any_callable
        and not any_arraylike
        and not any_groupers
        and match_axis_length
        and level is None
    ):
        if isinstance(obj, DataFrame):
            all_in_columns_index = all(
                g in obj.columns or g in obj.index.names for g in keys
            )
        else:
            assert isinstance(obj, Series)
            all_in_columns_index = all(g in obj.index.names for g in keys)

        if not all_in_columns_index:
            keys = [com.asarray_tuplesafe(keys)]

    if isinstance(level, (tuple, list)):
        if key is None:
            keys = [None] * len(level)
        levels = level
    else:
        levels = [level] * len(keys)

    groupings: list[Grouping] = []
    exclusions: set[Hashable] = set()

    # if the actual grouper should be obj[key]
    def is_in_axis(key) -> bool:
        if not _is_label_like(key):
            if obj.ndim == 1:
                return False

            # items -> .columns for DataFrame, .index for Series
            items = obj.axes[-1]
            try:
                items.get_loc(key)
            except (KeyError, TypeError, InvalidIndexError):
                # TypeError shows up here if we pass e.g. an Index
                return False

        return True

    # if the grouper is obj[name]
    def is_in_obj(gpr) -> bool:
        if not hasattr(gpr, "name"):
            return False
        if using_copy_on_write():
            # For the CoW case, we check the references to determine if the
            # series is part of the object
            try:
                obj_gpr_column = obj[gpr.name]
            except (KeyError, IndexError, InvalidIndexError):
                return False
            if isinstance(gpr, Series) and isinstance(obj_gpr_column, Series):
                return gpr._mgr.references_same_values(  # type: ignore[union-attr]
                    obj_gpr_column._mgr, 0  # type: ignore[arg-type]
                )
            return False
        try:
            return gpr is obj[gpr.name]
        except (KeyError, IndexError, InvalidIndexError):
            # IndexError reached in e.g. test_skip_group_keys when we pass
            # lambda here
            # InvalidIndexError raised on key-types inappropriate for index,
            # e.g. DatetimeIndex.get_loc(tuple())
            return False

    for gpr, level in zip(keys, levels):
        if is_in_obj(gpr):  # df.groupby(df['name'])
            in_axis = True
            exclusions.add(gpr.name)

        elif is_in_axis(gpr):  # df.groupby('name')
            if obj.ndim != 1 and gpr in obj:
                if validate:
                    obj._check_label_or_level_ambiguity(gpr, axis=axis)
                in_axis, name, gpr = True, gpr, obj[gpr]
                if gpr.ndim != 1:
                    # non-unique columns; raise here to get the name in the
                    # exception message
                    raise ValueError(f"Grouper for '{name}' not 1-dimensional")
                exclusions.add(name)
            elif obj._is_level_reference(gpr, axis=axis):
                in_axis, level, gpr = False, gpr, None
            else:
                raise KeyError(gpr)

        elif isinstance(gpr, Grouper) and gpr.key is not None:
            # Add key to exclusions
            exclusions.add(gpr.key)
            in_axis = True

        else:
            in_axis = False

        # create the Grouping
        # allow passing the actual Grouping as the gpr
        ping = (
            Grouping(
                group_axis,
                gpr,
                obj=obj,
                level=level,
                sort=sort,
                observed=observed,
                in_axis=in_axis,
                dropna=dropna,
            )
            if not isinstance(gpr, Grouping)
            else gpr
        )

        groupings.append(ping)

    if len(groupings) == 0 and len(obj):
        raise ValueError("No group keys passed!")
    if len(groupings) == 0:
        groupings.append(Grouping(Index([], dtype="int"), np.array([], dtype=np.intp)))

    # create the internals grouper
    grouper = ops.BaseGrouper(group_axis, groupings, sort=sort, dropna=dropna)
    return grouper, frozenset(exclusions), obj


def _is_label_like(val) -> bool:
    return isinstance(val, (str, tuple)) or (val is not None and is_scalar(val))
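
# Note: an illustrative contract for _is_label_like (hypothetical values;
# not executed in this module):
#
#   >>> _is_label_like("A"), _is_label_like(("A", "B"))   # doctest: +SKIP
#   (True, True)
#   >>> _is_label_like(["A"]), _is_label_like(None)       # doctest: +SKIP
#   (False, False)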


def _convert_grouper(axis: Index, grouper):
    if isinstance(grouper, dict):
        return grouper.get
    elif isinstance(grouper, Series):
        if grouper.index.equals(axis):
            return grouper._values
        else:
            return grouper.reindex(axis)._values
    elif isinstance(grouper, MultiIndex):
        return grouper._values
    elif isinstance(grouper, (list, tuple, Index, Categorical, np.ndarray)):
        if len(grouper) != len(axis):
            raise ValueError("Grouper and axis must be same length")

        if isinstance(grouper, (list, tuple)):
            grouper = com.asarray_tuplesafe(grouper)
        return grouper
    else:
        return grouper
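
# Note: an illustrative summary of _convert_grouper (hypothetical values;
# not executed in this module). A dict becomes its ``.get`` method, an
# aligned Series is unwrapped to its values, and a list of the right length
# is converted to an array:
#
#   >>> ax = pd.Index(["x", "y"])                  # doctest: +SKIP
#   >>> _convert_grouper(ax, {"x": 1, "y": 2})     # doctest: +SKIP
#   <built-in method get of dict object at 0x...>
#   >>> _convert_grouper(ax, ["a", "b"])           # doctest: +SKIP
#   array(['a', 'b'], dtype=object)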