indexing.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. from __future__ import annotations
  2. from typing import (
  3. TYPE_CHECKING,
  4. Iterable,
  5. Literal,
  6. cast,
  7. )
  8. import numpy as np
  9. from pandas._typing import PositionalIndexer
  10. from pandas.util._decorators import (
  11. cache_readonly,
  12. doc,
  13. )
  14. from pandas.core.dtypes.common import (
  15. is_integer,
  16. is_list_like,
  17. )
  18. if TYPE_CHECKING:
  19. from pandas import (
  20. DataFrame,
  21. Series,
  22. )
  23. from pandas.core.groupby import groupby
  24. class GroupByIndexingMixin:
  25. """
  26. Mixin for adding ._positional_selector to GroupBy.
  27. """
  28. @cache_readonly
  29. def _positional_selector(self) -> GroupByPositionalSelector:
  30. """
  31. Return positional selection for each group.
  32. ``groupby._positional_selector[i:j]`` is similar to
  33. ``groupby.apply(lambda x: x.iloc[i:j])``
  34. but much faster and preserves the original index and order.
  35. ``_positional_selector[]`` is compatible with and extends :meth:`~GroupBy.head`
  36. and :meth:`~GroupBy.tail`. For example:
  37. - ``head(5)``
  38. - ``_positional_selector[5:-5]``
  39. - ``tail(5)``
  40. together return all the rows.
  41. Allowed inputs for the index are:
  42. - An integer valued iterable, e.g. ``range(2, 4)``.
  43. - A comma separated list of integers and slices, e.g. ``5``, ``2, 4``, ``2:4``.
  44. The output format is the same as :meth:`~GroupBy.head` and
  45. :meth:`~GroupBy.tail`, namely
  46. a subset of the ``DataFrame`` or ``Series`` with the index and order preserved.
  47. Returns
  48. -------
  49. Series
  50. The filtered subset of the original Series.
  51. DataFrame
  52. The filtered subset of the original DataFrame.
  53. See Also
  54. --------
  55. DataFrame.iloc : Purely integer-location based indexing for selection by
  56. position.
  57. GroupBy.head : Return first n rows of each group.
  58. GroupBy.tail : Return last n rows of each group.
  59. GroupBy.nth : Take the nth row from each group if n is an int, or a
  60. subset of rows, if n is a list of ints.
  61. Notes
  62. -----
  63. - The slice step cannot be negative.
  64. - If the index specification results in overlaps, the item is not duplicated.
  65. - If the index specification changes the order of items, then
  66. they are returned in their original order.
  67. By contrast, ``DataFrame.iloc`` can change the row order.
  68. - ``groupby()`` parameters such as as_index and dropna are ignored.
  69. The differences between ``_positional_selector[]`` and :meth:`~GroupBy.nth`
  70. with ``as_index=False`` are:
  71. - Input to ``_positional_selector`` can include
  72. one or more slices whereas ``nth``
  73. just handles an integer or a list of integers.
  74. - ``_positional_selector`` can accept a slice relative to the
  75. last row of each group.
  76. - ``_positional_selector`` does not have an equivalent to the
  77. ``nth()`` ``dropna`` parameter.
  78. Examples
  79. --------
  80. >>> df = pd.DataFrame([["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]],
  81. ... columns=["A", "B"])
  82. >>> df.groupby("A")._positional_selector[1:2]
  83. A B
  84. 1 a 2
  85. 4 b 5
  86. >>> df.groupby("A")._positional_selector[1, -1]
  87. A B
  88. 1 a 2
  89. 2 a 3
  90. 4 b 5
  91. """
  92. if TYPE_CHECKING:
  93. # pylint: disable-next=used-before-assignment
  94. groupby_self = cast(groupby.GroupBy, self)
  95. else:
  96. groupby_self = self
  97. return GroupByPositionalSelector(groupby_self)
  98. def _make_mask_from_positional_indexer(
  99. self,
  100. arg: PositionalIndexer | tuple,
  101. ) -> np.ndarray:
  102. if is_list_like(arg):
  103. if all(is_integer(i) for i in cast(Iterable, arg)):
  104. mask = self._make_mask_from_list(cast(Iterable[int], arg))
  105. else:
  106. mask = self._make_mask_from_tuple(cast(tuple, arg))
  107. elif isinstance(arg, slice):
  108. mask = self._make_mask_from_slice(arg)
  109. elif is_integer(arg):
  110. mask = self._make_mask_from_int(cast(int, arg))
  111. else:
  112. raise TypeError(
  113. f"Invalid index {type(arg)}. "
  114. "Must be integer, list-like, slice or a tuple of "
  115. "integers and slices"
  116. )
  117. if isinstance(mask, bool):
  118. if mask:
  119. mask = self._ascending_count >= 0
  120. else:
  121. mask = self._ascending_count < 0
  122. return cast(np.ndarray, mask)
  123. def _make_mask_from_int(self, arg: int) -> np.ndarray:
  124. if arg >= 0:
  125. return self._ascending_count == arg
  126. else:
  127. return self._descending_count == (-arg - 1)
  128. def _make_mask_from_list(self, args: Iterable[int]) -> bool | np.ndarray:
  129. positive = [arg for arg in args if arg >= 0]
  130. negative = [-arg - 1 for arg in args if arg < 0]
  131. mask: bool | np.ndarray = False
  132. if positive:
  133. mask |= np.isin(self._ascending_count, positive)
  134. if negative:
  135. mask |= np.isin(self._descending_count, negative)
  136. return mask
  137. def _make_mask_from_tuple(self, args: tuple) -> bool | np.ndarray:
  138. mask: bool | np.ndarray = False
  139. for arg in args:
  140. if is_integer(arg):
  141. mask |= self._make_mask_from_int(cast(int, arg))
  142. elif isinstance(arg, slice):
  143. mask |= self._make_mask_from_slice(arg)
  144. else:
  145. raise ValueError(
  146. f"Invalid argument {type(arg)}. Should be int or slice."
  147. )
  148. return mask
  149. def _make_mask_from_slice(self, arg: slice) -> bool | np.ndarray:
  150. start = arg.start
  151. stop = arg.stop
  152. step = arg.step
  153. if step is not None and step < 0:
  154. raise ValueError(f"Invalid step {step}. Must be non-negative")
  155. mask: bool | np.ndarray = True
  156. if step is None:
  157. step = 1
  158. if start is None:
  159. if step > 1:
  160. mask &= self._ascending_count % step == 0
  161. elif start >= 0:
  162. mask &= self._ascending_count >= start
  163. if step > 1:
  164. mask &= (self._ascending_count - start) % step == 0
  165. else:
  166. mask &= self._descending_count < -start
  167. offset_array = self._descending_count + start + 1
  168. limit_array = (
  169. self._ascending_count + self._descending_count + (start + 1)
  170. ) < 0
  171. offset_array = np.where(limit_array, self._ascending_count, offset_array)
  172. mask &= offset_array % step == 0
  173. if stop is not None:
  174. if stop >= 0:
  175. mask &= self._ascending_count < stop
  176. else:
  177. mask &= self._descending_count >= -stop
  178. return mask
  179. @cache_readonly
  180. def _ascending_count(self) -> np.ndarray:
  181. if TYPE_CHECKING:
  182. groupby_self = cast(groupby.GroupBy, self)
  183. else:
  184. groupby_self = self
  185. return groupby_self._cumcount_array()
  186. @cache_readonly
  187. def _descending_count(self) -> np.ndarray:
  188. if TYPE_CHECKING:
  189. groupby_self = cast(groupby.GroupBy, self)
  190. else:
  191. groupby_self = self
  192. return groupby_self._cumcount_array(ascending=False)
  193. @doc(GroupByIndexingMixin._positional_selector)
  194. class GroupByPositionalSelector:
  195. def __init__(self, groupby_object: groupby.GroupBy) -> None:
  196. self.groupby_object = groupby_object
  197. def __getitem__(self, arg: PositionalIndexer | tuple) -> DataFrame | Series:
  198. """
  199. Select by positional index per group.
  200. Implements GroupBy._positional_selector
  201. Parameters
  202. ----------
  203. arg : PositionalIndexer | tuple
  204. Allowed values are:
  205. - int
  206. - int valued iterable such as list or range
  207. - slice with step either None or positive
  208. - tuple of integers and slices
  209. Returns
  210. -------
  211. Series
  212. The filtered subset of the original groupby Series.
  213. DataFrame
  214. The filtered subset of the original groupby DataFrame.
  215. See Also
  216. --------
  217. DataFrame.iloc : Integer-location based indexing for selection by position.
  218. GroupBy.head : Return first n rows of each group.
  219. GroupBy.tail : Return last n rows of each group.
  220. GroupBy._positional_selector : Return positional selection for each group.
  221. GroupBy.nth : Take the nth row from each group if n is an int, or a
  222. subset of rows, if n is a list of ints.
  223. """
  224. mask = self.groupby_object._make_mask_from_positional_indexer(arg)
  225. return self.groupby_object._mask_selected_obj(mask)
  226. class GroupByNthSelector:
  227. """
  228. Dynamically substituted for GroupBy.nth to enable both call and index
  229. """
  230. def __init__(self, groupby_object: groupby.GroupBy) -> None:
  231. self.groupby_object = groupby_object
  232. def __call__(
  233. self,
  234. n: PositionalIndexer | tuple,
  235. dropna: Literal["any", "all", None] = None,
  236. ) -> DataFrame | Series:
  237. return self.groupby_object._nth(n, dropna)
  238. def __getitem__(self, n: PositionalIndexer | tuple) -> DataFrame | Series:
  239. return self.groupby_object._nth(n)