categorical.py 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. from __future__ import annotations
  2. import numpy as np
  3. from pandas.core.algorithms import unique1d
  4. from pandas.core.arrays.categorical import (
  5. Categorical,
  6. CategoricalDtype,
  7. recode_for_categories,
  8. )
  9. def recode_for_groupby(
  10. c: Categorical, sort: bool, observed: bool
  11. ) -> tuple[Categorical, Categorical | None]:
  12. """
  13. Code the categories to ensure we can groupby for categoricals.
  14. If observed=True, we return a new Categorical with the observed
  15. categories only.
  16. If sort=False, return a copy of self, coded with categories as
  17. returned by .unique(), followed by any categories not appearing in
  18. the data. If sort=True, return self.
  19. This method is needed solely to ensure the categorical index of the
  20. GroupBy result has categories in the order of appearance in the data
  21. (GH-8868).
  22. Parameters
  23. ----------
  24. c : Categorical
  25. sort : bool
  26. The value of the sort parameter groupby was called with.
  27. observed : bool
  28. Account only for the observed values
  29. Returns
  30. -------
  31. Categorical
  32. If sort=False, the new categories are set to the order of
  33. appearance in codes (unless ordered=True, in which case the
  34. original order is preserved), followed by any unrepresented
  35. categories in the original order.
  36. Categorical or None
  37. If we are observed, return the original categorical, otherwise None
  38. """
  39. # we only care about observed values
  40. if observed:
  41. # In cases with c.ordered, this is equivalent to
  42. # return c.remove_unused_categories(), c
  43. unique_codes = unique1d(c.codes)
  44. take_codes = unique_codes[unique_codes != -1]
  45. if sort:
  46. take_codes = np.sort(take_codes)
  47. # we recode according to the uniques
  48. categories = c.categories.take(take_codes)
  49. codes = recode_for_categories(c.codes, c.categories, categories)
  50. # return a new categorical that maps our new codes
  51. # and categories
  52. dtype = CategoricalDtype(categories, ordered=c.ordered)
  53. return Categorical(codes, dtype=dtype, fastpath=True), c
  54. # Already sorted according to c.categories; all is fine
  55. if sort:
  56. return c, None
  57. # sort=False should order groups in as-encountered order (GH-8868)
  58. # xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
  59. all_codes = np.arange(c.categories.nunique())
  60. # GH 38140: exclude nan from indexer for categories
  61. unique_notnan_codes = unique1d(c.codes[c.codes != -1])
  62. if sort:
  63. unique_notnan_codes = np.sort(unique_notnan_codes)
  64. if len(all_codes) > len(unique_notnan_codes):
  65. # GH 13179: All categories need to be present, even if missing from the data
  66. missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True)
  67. take_codes = np.concatenate((unique_notnan_codes, missing_codes))
  68. else:
  69. take_codes = unique_notnan_codes
  70. return Categorical(c, c.unique().categories.take(take_codes)), None