categorical.py 82 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
7227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604
  1. from __future__ import annotations
  2. from csv import QUOTE_NONNUMERIC
  3. from functools import partial
  4. import operator
  5. from shutil import get_terminal_size
  6. from typing import (
  7. TYPE_CHECKING,
  8. Hashable,
  9. Iterator,
  10. Literal,
  11. Sequence,
  12. TypeVar,
  13. cast,
  14. overload,
  15. )
  16. import numpy as np
  17. from pandas._config import get_option
  18. from pandas._libs import (
  19. NaT,
  20. algos as libalgos,
  21. lib,
  22. )
  23. from pandas._libs.arrays import NDArrayBacked
  24. from pandas._typing import (
  25. ArrayLike,
  26. AstypeArg,
  27. AxisInt,
  28. Dtype,
  29. NpDtype,
  30. Ordered,
  31. Shape,
  32. SortKind,
  33. npt,
  34. type_t,
  35. )
  36. from pandas.compat.numpy import function as nv
  37. from pandas.util._validators import validate_bool_kwarg
  38. from pandas.core.dtypes.cast import (
  39. coerce_indexer_dtype,
  40. find_common_type,
  41. )
  42. from pandas.core.dtypes.common import (
  43. ensure_int64,
  44. ensure_platform_int,
  45. is_any_real_numeric_dtype,
  46. is_bool_dtype,
  47. is_categorical_dtype,
  48. is_datetime64_dtype,
  49. is_dict_like,
  50. is_dtype_equal,
  51. is_extension_array_dtype,
  52. is_hashable,
  53. is_integer_dtype,
  54. is_list_like,
  55. is_scalar,
  56. is_timedelta64_dtype,
  57. needs_i8_conversion,
  58. pandas_dtype,
  59. )
  60. from pandas.core.dtypes.dtypes import (
  61. CategoricalDtype,
  62. ExtensionDtype,
  63. )
  64. from pandas.core.dtypes.generic import (
  65. ABCIndex,
  66. ABCSeries,
  67. )
  68. from pandas.core.dtypes.missing import (
  69. is_valid_na_for_dtype,
  70. isna,
  71. )
  72. from pandas.core import (
  73. algorithms,
  74. arraylike,
  75. ops,
  76. )
  77. from pandas.core.accessor import (
  78. PandasDelegate,
  79. delegate_names,
  80. )
  81. from pandas.core.algorithms import (
  82. factorize,
  83. take_nd,
  84. )
  85. from pandas.core.arrays._mixins import (
  86. NDArrayBackedExtensionArray,
  87. ravel_compat,
  88. )
  89. from pandas.core.base import (
  90. ExtensionArray,
  91. NoNewAttributesMixin,
  92. PandasObject,
  93. )
  94. import pandas.core.common as com
  95. from pandas.core.construction import (
  96. extract_array,
  97. sanitize_array,
  98. )
  99. from pandas.core.ops.common import unpack_zerodim_and_defer
  100. from pandas.core.sorting import nargsort
  101. from pandas.core.strings.object_array import ObjectStringArrayMixin
  102. from pandas.io.formats import console
  103. if TYPE_CHECKING:
  104. from pandas import (
  105. DataFrame,
  106. Index,
  107. Series,
  108. )
  109. CategoricalT = TypeVar("CategoricalT", bound="Categorical")
  110. def _cat_compare_op(op):
  111. opname = f"__{op.__name__}__"
  112. fill_value = op is operator.ne
  113. @unpack_zerodim_and_defer(opname)
  114. def func(self, other):
  115. hashable = is_hashable(other)
  116. if is_list_like(other) and len(other) != len(self) and not hashable:
  117. # in hashable case we may have a tuple that is itself a category
  118. raise ValueError("Lengths must match.")
  119. if not self.ordered:
  120. if opname in ["__lt__", "__gt__", "__le__", "__ge__"]:
  121. raise TypeError(
  122. "Unordered Categoricals can only compare equality or not"
  123. )
  124. if isinstance(other, Categorical):
  125. # Two Categoricals can only be compared if the categories are
  126. # the same (maybe up to ordering, depending on ordered)
  127. msg = "Categoricals can only be compared if 'categories' are the same."
  128. if not self._categories_match_up_to_permutation(other):
  129. raise TypeError(msg)
  130. if not self.ordered and not self.categories.equals(other.categories):
  131. # both unordered and different order
  132. other_codes = recode_for_categories(
  133. other.codes, other.categories, self.categories, copy=False
  134. )
  135. else:
  136. other_codes = other._codes
  137. ret = op(self._codes, other_codes)
  138. mask = (self._codes == -1) | (other_codes == -1)
  139. if mask.any():
  140. ret[mask] = fill_value
  141. return ret
  142. if hashable:
  143. if other in self.categories:
  144. i = self._unbox_scalar(other)
  145. ret = op(self._codes, i)
  146. if opname not in {"__eq__", "__ge__", "__gt__"}:
  147. # GH#29820 performance trick; get_loc will always give i>=0,
  148. # so in the cases (__ne__, __le__, __lt__) the setting
  149. # here is a no-op, so can be skipped.
  150. mask = self._codes == -1
  151. ret[mask] = fill_value
  152. return ret
  153. else:
  154. return ops.invalid_comparison(self, other, op)
  155. else:
  156. # allow categorical vs object dtype array comparisons for equality
  157. # these are only positional comparisons
  158. if opname not in ["__eq__", "__ne__"]:
  159. raise TypeError(
  160. f"Cannot compare a Categorical for op {opname} with "
  161. f"type {type(other)}.\nIf you want to compare values, "
  162. "use 'np.asarray(cat) <op> other'."
  163. )
  164. if isinstance(other, ExtensionArray) and needs_i8_conversion(other.dtype):
  165. # We would return NotImplemented here, but that messes up
  166. # ExtensionIndex's wrapped methods
  167. return op(other, self)
  168. return getattr(np.array(self), opname)(np.array(other))
  169. func.__name__ = opname
  170. return func
  def contains(cat, key, container) -> bool:
      """
      Check whether ``key`` is a category of ``cat`` that occurs in ``container``.

      Shared helper for the ``__contains__`` implementations of
      :class:`Categorical` and :class:`CategoricalIndex`.

      Parameters
      ----------
      cat : :class:`Categorical` or :class:`CategoricalIndex`
          Object whose ``categories`` are searched for ``key``.
      key : a hashable object
          The key to check membership for.
      container : Container (e.g. list-like or mapping)
          The container in which the location of ``key`` is looked up.

      Returns
      -------
      is_in : bool
          True if ``key`` is found in ``cat.categories`` and its location
          there is in ``container``, else False.

      Raises
      ------
      TypeError
          If ``key`` is not hashable.

      Notes
      -----
      This method does not check for NaN values. Do that separately
      before calling this method.
      """
      # Fail early on unhashable keys.
      hash(key)

      try:
          position = cat.categories.get_loc(key)
      except (KeyError, TypeError):
          # The lookup failed, so the key cannot be represented in the
          # container either.
          return False

      # ``position`` is where the key sits in the categories, which is also
      # the *value* that stands for it in ``container``. A key can therefore
      # be a category without being present, e.g.
      #   'b' in Categorical(['a'], categories=['a', 'b'])  # False
      if not is_scalar(position):
          # e.g. IntervalIndex categories, where the lookup yields an array
          return any(item in container for item in position)
      return position in container
  213. class Categorical(NDArrayBackedExtensionArray, PandasObject, ObjectStringArrayMixin):
  214. """
  215. Represent a categorical variable in classic R / S-plus fashion.
  216. `Categoricals` can only take on a limited, and usually fixed, number
  217. of possible values (`categories`). In contrast to statistical categorical
  218. variables, a `Categorical` might have an order, but numerical operations
  219. (additions, divisions, ...) are not possible.
  220. All values of the `Categorical` are either in `categories` or `np.nan`.
  221. Assigning values outside of `categories` will raise a `ValueError`. Order
  222. is defined by the order of the `categories`, not lexical order of the
  223. values.
  224. Parameters
  225. ----------
  226. values : list-like
  227. The values of the categorical. If categories are given, values not in
  228. categories will be replaced with NaN.
  229. categories : Index-like (unique), optional
  230. The unique categories for this categorical. If not given, the
  231. categories are assumed to be the unique values of `values` (sorted, if
  232. possible, otherwise in the order in which they appear).
  233. ordered : bool, default False
  234. Whether or not this categorical is treated as an ordered categorical.
  235. If True, the resulting categorical will be ordered.
  236. An ordered categorical respects, when sorted, the order of its
  237. `categories` attribute (which in turn is the `categories` argument, if
  238. provided).
  239. dtype : CategoricalDtype
  240. An instance of ``CategoricalDtype`` to use for this categorical.
  241. Attributes
  242. ----------
  243. categories : Index
  244. The categories of this categorical
  245. codes : ndarray
  246. The codes (integer positions, which point to the categories) of this
  247. categorical, read only.
  248. ordered : bool
  249. Whether or not this Categorical is ordered.
  250. dtype : CategoricalDtype
  251. The instance of ``CategoricalDtype`` storing the ``categories``
  252. and ``ordered``.
  253. Methods
  254. -------
  255. from_codes
  256. __array__
  257. Raises
  258. ------
  259. ValueError
  260. If the categories do not validate.
  261. TypeError
  262. If an explicit ``ordered=True`` is given but no `categories` and the
  263. `values` are not sortable.
  264. See Also
  265. --------
  266. CategoricalDtype : Type for categorical data.
  267. CategoricalIndex : An Index with an underlying ``Categorical``.
  268. Notes
  269. -----
  270. See the `user guide
  271. <https://pandas.pydata.org/pandas-docs/stable/user_guide/categorical.html>`__
  272. for more.
  273. Examples
  274. --------
  275. >>> pd.Categorical([1, 2, 3, 1, 2, 3])
  276. [1, 2, 3, 1, 2, 3]
  277. Categories (3, int64): [1, 2, 3]
  278. >>> pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'])
  279. ['a', 'b', 'c', 'a', 'b', 'c']
  280. Categories (3, object): ['a', 'b', 'c']
  281. Missing values are not included as a category.
  282. >>> c = pd.Categorical([1, 2, 3, 1, 2, 3, np.nan])
  283. >>> c
  284. [1, 2, 3, 1, 2, 3, NaN]
  285. Categories (3, int64): [1, 2, 3]
  286. However, their presence is indicated in the `codes` attribute
  287. by code `-1`.
  288. >>> c.codes
  289. array([ 0, 1, 2, 0, 1, 2, -1], dtype=int8)
  290. Ordered `Categoricals` can be sorted according to the custom order
  291. of the categories and can have a min and max value.
  292. >>> c = pd.Categorical(['a', 'b', 'c', 'a', 'b', 'c'], ordered=True,
  293. ... categories=['c', 'b', 'a'])
  294. >>> c
  295. ['a', 'b', 'c', 'a', 'b', 'c']
  296. Categories (3, object): ['c' < 'b' < 'a']
  297. >>> c.min()
  298. 'c'
  299. """
  300. # For comparisons, so that numpy uses our implementation of the compare
  301. # ops, which raise
  302. __array_priority__ = 1000
  303. # tolist is not actually deprecated, just suppressed in the __dir__
  304. _hidden_attrs = PandasObject._hidden_attrs | frozenset(["tolist"])
  305. _typ = "categorical"
  306. _dtype: CategoricalDtype
  def __init__(
      self,
      values,
      categories=None,
      ordered=None,
      dtype: Dtype | None = None,
      fastpath: bool = False,
      copy: bool = True,
  ) -> None:
      """
      Build the integer codes and the final ``CategoricalDtype`` from ``values``.

      Parameters
      ----------
      values : list-like
          Data to encode. With ``fastpath=True`` these are taken as the
          codes and only coerced to an indexer dtype.
      categories : Index-like, optional
      ordered : bool, optional
      dtype : CategoricalDtype or "category", optional
          ``categories``, ``ordered`` and ``dtype`` are resolved into one
          dtype by ``CategoricalDtype._from_values_or_dtype``.
      fastpath : bool, default False
          Return right after coercing the codes, skipping the list-like
          check, sanitizing and factorization.
      copy : bool, default True
          Forwarded to ``recode_for_categories`` when ``values`` is already
          categorical and explicit categories are given.

      Raises
      ------
      TypeError
          If ``values`` is not list-like, or if the categories have to be
          inferred, ``ordered`` is set and ``values`` cannot be sorted.
      NotImplementedError
          If ``values`` converts to an ndarray with more than one dimension.
      """
      dtype = CategoricalDtype._from_values_or_dtype(
          values, categories, ordered, dtype
      )
      # ``dtype`` is a CategoricalDtype from here on, but its categories may
      # still be None; they are then inferred by factorizing further down.

      if fastpath:
          fast_codes = coerce_indexer_dtype(values, dtype.categories)
          dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
          super().__init__(fast_codes, dtype)
          return

      if not is_list_like(values):
          # GH#38433
          raise TypeError("Categorical input must be list-like")

      # Flags missing entries that are stripped before inference and put
      # back as -1 afterwards; only reassigned in the branch below that
      # handles list-likes which are not arrays/ndframes.
      null_mask = np.array(False)

      # sanitize input
      if is_categorical_dtype(values):
          if dtype.categories is None:
              dtype = CategoricalDtype(values.categories, dtype.ordered)
      elif not isinstance(values, (ABCIndex, ABCSeries, ExtensionArray)):
          values = com.convert_to_list_like(values)
          if isinstance(values, list) and not values:
              # By convention, empty lists result in object dtype:
              values = np.array([], dtype=object)
          elif isinstance(values, np.ndarray):
              if values.ndim > 1:
                  # preempt sanitize_array from raising ValueError
                  raise NotImplementedError(
                      "> 1 ndim Categorical are not supported at this time"
                  )
              values = sanitize_array(values, None)
          else:
              # i.e. must be a list
              sanitized = sanitize_array(values, None)
              null_mask = isna(sanitized)
              if null_mask.any():
                  # Drop the nulls for now; they are re-inserted below,
                  # grep "full_codes"
                  present = [values[idx] for idx in np.where(~null_mask)[0]]

                  # GH#44900 Do not cast to float if we have only missing values
                  sanitize_dtype = (
                      None
                      if (present or sanitized.dtype == "object")
                      else sanitized.dtype
                  )
                  sanitized = sanitize_array(present, None, dtype=sanitize_dtype)
              values = sanitized

      if dtype.categories is None:
          try:
              codes, categories = factorize(values, sort=True)
          except TypeError as err:
              codes, categories = factorize(values, sort=False)
              if dtype.ordered:
                  # The data cannot be sorted, so an ordered result needs the
                  # caller to spell out the category order.
                  raise TypeError(
                      "'values' is not ordered, please "
                      "explicitly specify the categories order "
                      "by passing in a categories argument."
                  ) from err

          # we're inferring from values
          dtype = CategoricalDtype(categories, dtype.ordered)
      elif is_categorical_dtype(values.dtype):
          previous_codes = extract_array(values)._codes
          codes = recode_for_categories(
              previous_codes, values.dtype.categories, dtype.categories, copy=copy
          )
      else:
          codes = _get_codes_for_values(values, dtype.categories)

      if null_mask.any():
          # Reinsert -1 placeholders for previously removed missing values
          full_codes = -np.ones(null_mask.shape, dtype=codes.dtype)
          full_codes[~null_mask] = codes
          codes = full_codes

      dtype = CategoricalDtype(ordered=False).update_dtype(dtype)
      final_codes = coerce_indexer_dtype(codes, dtype.categories)
      super().__init__(final_codes, dtype)
  @property
  def dtype(self) -> CategoricalDtype:
      """
      Return the :class:`~pandas.api.types.CategoricalDtype` of this instance.

      This is the object stored in ``self._dtype``.
      """
      return self._dtype
  400. @property
  401. def _internal_fill_value(self) -> int:
  402. # using the specific numpy integer instead of python int to get
  403. # the correct dtype back from _quantile in the all-NA case
  404. dtype = self._ndarray.dtype
  405. return dtype.type(-1)
  406. @classmethod
  407. def _from_sequence(
  408. cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
  409. ) -> Categorical:
  410. return Categorical(scalars, dtype=dtype, copy=copy)
  @overload
  def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
      ...

  @overload
  def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
      ...

  @overload
  def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
      ...

  def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
      """
      Coerce this type to another dtype.

      Parameters
      ----------
      dtype : numpy dtype or pandas type
      copy : bool, default True
          By default, astype always returns a newly allocated object.
          If copy is set to False and dtype is categorical, the original
          object is returned.

      Returns
      -------
      np.ndarray or ExtensionArray

      Raises
      ------
      ValueError
          If an integer dtype is requested while missing values are present,
          or if the categories cannot be cast to ``dtype``. In the latter
          case the underlying TypeError/ValueError is attached as
          ``__cause__``.
      """
      dtype = pandas_dtype(dtype)
      if self.dtype is dtype:
          result = self.copy() if copy else self

      elif is_categorical_dtype(dtype):
          dtype = cast(CategoricalDtype, dtype)

          # GH 10696/18593/18630
          dtype = self.dtype.update_dtype(dtype)
          self = self.copy() if copy else self
          result = self._set_dtype(dtype)

      elif isinstance(dtype, ExtensionDtype):
          return super().astype(dtype, copy=copy)

      elif is_integer_dtype(dtype) and self.isna().any():
          raise ValueError("Cannot convert float NaN to integer")

      elif len(self.codes) == 0 or len(self.categories) == 0:
          result = np.array(
              self,
              dtype=dtype,
              copy=copy,
          )

      else:
          # GH8628 (PERF): astype category codes instead of astyping array
          new_cats = self.categories._values

          try:
              new_cats = new_cats.astype(dtype=dtype, copy=copy)
              fill_value = self.categories._na_value
              if not is_valid_na_for_dtype(fill_value, dtype):
                  fill_value = lib.item_from_zerodim(
                      np.array(self.categories._na_value).astype(dtype)
                  )
          except (
              TypeError,  # downstream error msg for CategoricalIndex is misleading
              ValueError,
          ) as err:
              msg = f"Cannot cast {self.categories.dtype} dtype to {dtype}"
              # Chain explicitly so the original casting failure is reported
              # as the direct cause rather than as an incidental context.
              raise ValueError(msg) from err

          result = take_nd(
              new_cats, ensure_platform_int(self._codes), fill_value=fill_value
          )

      return result
  def to_list(self):
      """
      Return the result of :meth:`tolist`; provided as an alias.
      """
      as_list = self.tolist()
      return as_list
  @classmethod
  def _from_inferred_categories(
      cls, inferred_categories, inferred_codes, dtype, true_values=None
  ):
      """
      Construct a Categorical from inferred values.

      When ``dtype`` carries explicit categories, the inferred categories
      are converted to match them and the codes are recoded into the order
      of ``dtype.categories``. Otherwise the inferred categories are sorted
      (if not already increasing) and an unordered dtype is built from them.

      Parameters
      ----------
      inferred_categories : Index
      inferred_codes : Index
      dtype : CategoricalDtype or 'category'
      true_values : list, optional
          Labels mapped to True when ``dtype.categories`` is boolean.
          If none are provided, the default ones are
          "True", "TRUE", and "true."

      Returns
      -------
      Categorical
      """
      from pandas import (
          Index,
          to_datetime,
          to_numeric,
          to_timedelta,
      )

      cats = Index(inferred_categories)

      if isinstance(dtype, CategoricalDtype) and dtype.categories is not None:
          # Convert to a specialized type matching the requested categories.
          if is_any_real_numeric_dtype(dtype.categories):
              cats = to_numeric(inferred_categories, errors="coerce")
          elif is_datetime64_dtype(dtype.categories):
              cats = to_datetime(inferred_categories, errors="coerce")
          elif is_timedelta64_dtype(dtype.categories):
              cats = to_timedelta(inferred_categories, errors="coerce")
          elif is_bool_dtype(dtype.categories):
              if true_values is None:
                  true_values = ["True", "TRUE", "true"]

              # error: Incompatible types in assignment (expression has type
              # "ndarray", variable has type "Index")
              cats = cats.isin(true_values)  # type: ignore[assignment]

          # Recode from observation order to dtype.categories order.
          target_categories = dtype.categories
          codes = recode_for_categories(inferred_codes, cats, target_categories)
      elif cats.is_monotonic_increasing:
          # Already sorted: the inferred codes can be used as they are.
          dtype = CategoricalDtype(cats, ordered=False)
          codes = inferred_codes
      else:
          # Sort categories and recode for unknown categories.
          observed = cats.copy()
          sorted_cats = cats.sort_values()
          codes = recode_for_categories(inferred_codes, observed, sorted_cats)
          dtype = CategoricalDtype(sorted_cats, ordered=False)

      return cls(codes, dtype=dtype, fastpath=True)
  @classmethod
  def from_codes(
      cls, codes, categories=None, ordered=None, dtype: Dtype | None = None
  ) -> Categorical:
      """
      Make a Categorical type from codes and categories or dtype.

      Use this constructor when codes and categories/dtype are already at
      hand, so the (computation intensive) factorization step normally done
      by the constructor is not needed.

      If your data does not follow this convention, please use the normal
      constructor.

      Parameters
      ----------
      codes : array-like of int
          An integer array, where each integer points to a category in
          categories or dtype.categories, or else is -1 for NaN.
      categories : index-like, optional
          The categories for the categorical. Items need to be unique.
          If the categories are not given here, then they must be provided
          in `dtype`.
      ordered : bool, optional
          Whether or not this categorical is treated as an ordered
          categorical. If not given here or in `dtype`, the resulting
          categorical will be unordered.
      dtype : CategoricalDtype or "category", optional
          If :class:`CategoricalDtype`, cannot be used together with
          `categories` or `ordered`.

      Returns
      -------
      Categorical

      Raises
      ------
      ValueError
          If no categories are available, if ``codes`` contains NA values or
          is not integer-typed, or if any code lies outside
          ``[-1, len(categories) - 1]``.

      Examples
      --------
      >>> dtype = pd.CategoricalDtype(['a', 'b'], ordered=True)
      >>> pd.Categorical.from_codes(codes=[0, 1, 0, 1], dtype=dtype)
      ['a', 'b', 'a', 'b']
      Categories (2, object): ['a' < 'b']
      """
      dtype = CategoricalDtype._from_values_or_dtype(
          categories=categories, ordered=ordered, dtype=dtype
      )
      if dtype.categories is None:
          raise ValueError(
              "The categories must be provided in 'categories' or "
              "'dtype'. Both were None."
          )

      if is_extension_array_dtype(codes) and is_integer_dtype(codes):
          # Avoid the implicit conversion of Int to object
          if isna(codes).any():
              raise ValueError("codes cannot contain NA values")
          codes = codes.to_numpy(dtype=np.int64)
      else:
          codes = np.asarray(codes)

      # Empty input skips both the dtype check and the range check.
      if len(codes):
          if not is_integer_dtype(codes):
              raise ValueError("codes need to be array-like integers")

          n_categories = len(dtype.categories)
          if codes.max() >= n_categories or codes.min() < -1:
              raise ValueError("codes need to be between -1 and len(categories)-1")

      return cls(codes, dtype=dtype, fastpath=True)
  592. # ------------------------------------------------------------------
  593. # Categories/Codes/Ordered
  @property
  def categories(self) -> Index:
      """
      The categories of this categorical.

      The value is read from ``self.dtype.categories``.

      See Also
      --------
      rename_categories : Rename categories.
      reorder_categories : Reorder categories.
      add_categories : Add new categories.
      remove_categories : Remove the specified categories.
      remove_unused_categories : Remove categories which are not used.
      set_categories : Set the categories to the specified ones.
      """
      current_dtype = self.dtype
      return current_dtype.categories
  @property
  def ordered(self) -> Ordered:
      """
      Whether the categories have an ordered relationship.

      Returns
      -------
      bool or None
          The ``ordered`` flag stored on ``self.dtype``.
      """
      cat_dtype = self.dtype
      return cat_dtype.ordered
  @property
  def codes(self) -> np.ndarray:
      """
      The category codes of this categorical.

      Each code is the integer position of the corresponding value in the
      categories array.

      There is no setter: use the other categorical methods or normal item
      assignment to change the values of the categorical.

      Returns
      -------
      ndarray[int]
          A non-writable view of the `codes` array.
      """
      readonly = self._codes.view()
      # Lock only the view; the backing array itself stays writable.
      readonly.setflags(write=False)
      return readonly
  def _set_categories(self, categories, fastpath: bool = False) -> None:
      """
      Replace the categories of this object in place.

      The codes are left untouched, so this amounts to relabelling the
      existing categories.

      Parameters
      ----------
      categories : Index-like
          The new categories.
      fastpath : bool, default False
          Don't perform validation of the categories for uniqueness or nulls.
          The length check against the old categories is skipped as well.

      Raises
      ------
      ValueError
          If ``fastpath`` is False and the number of new categories differs
          from the number of old categories.

      Examples
      --------
      >>> c = pd.Categorical(['a', 'b'])
      >>> c
      ['a', 'b']
      Categories (2, object): ['a', 'b']

      >>> c._set_categories(pd.Index(['a', 'c']))
      >>> c
      ['a', 'c']
      Categories (2, object): ['a', 'c']
      """
      if fastpath:
          new_dtype = CategoricalDtype._from_fastpath(categories, self.ordered)
      else:
          new_dtype = CategoricalDtype(categories, ordered=self.ordered)
          old_categories = self.dtype.categories
          if old_categories is not None and len(new_dtype.categories) != len(
              old_categories
          ):
              raise ValueError(
                  "new categories need to have the same number of "
                  "items as the old categories!"
              )
      # Re-initialise on the same backing codes with the new dtype.
      super().__init__(self._ndarray, new_dtype)
  def _set_dtype(self, dtype: CategoricalDtype) -> Categorical:
      """
      Internal method for directly updating the CategoricalDtype.

      The current codes are recoded from the existing categories to
      ``dtype.categories`` and a new object of the same type is built from
      them.

      Parameters
      ----------
      dtype : CategoricalDtype

      Returns
      -------
      Categorical
          New object carrying ``dtype``.

      Notes
      -----
      We don't do any validation here. It's assumed that the dtype is
      a (valid) instance of `CategoricalDtype`.
      """
      constructor = type(self)
      recoded = recode_for_categories(self.codes, self.categories, dtype.categories)
      return constructor(recoded, dtype=dtype, fastpath=True)
  def set_ordered(self, value: bool) -> Categorical:
      """
      Set the ordered attribute to the boolean value.

      Parameters
      ----------
      value : bool
          Set whether this categorical is ordered (True) or not (False).

      Returns
      -------
      Categorical
          A copy of this object whose dtype has ``ordered=value``; the
          original is not modified.
      """
      target_dtype = CategoricalDtype(self.categories, ordered=value)
      result = self.copy()
      # Keep the copied codes and swap in the new dtype.
      NDArrayBacked.__init__(result, result._ndarray, target_dtype)
      return result
  def as_ordered(self) -> Categorical:
      """
      Set the Categorical to be ordered.

      Delegates to ``set_ordered`` with ``True``.

      Returns
      -------
      Categorical
          Ordered Categorical.
      """
      make_ordered = True
      return self.set_ordered(make_ordered)
  def as_unordered(self) -> Categorical:
      """
      Set the Categorical to be unordered.

      Delegates to ``set_ordered`` with ``False``.

      Returns
      -------
      Categorical
          Unordered Categorical.
      """
      make_ordered = False
      return self.set_ordered(make_ordered)
  def set_categories(self, new_categories, ordered=None, rename: bool = False):
      """
      Set the categories to the specified new_categories.

      `new_categories` can include new categories (which will result in
      unused categories) or remove old categories (which results in values
      set to NaN). If `rename==True`, the categories will simply be renamed
      (fewer or more items than in the old categories will result in values
      set to NaN or in unused categories respectively).

      This method can be used to perform more than one action of adding,
      removing, and reordering simultaneously and is therefore faster than
      performing the individual steps via the more specialised methods.

      On the other hand this method does not do checks (e.g., whether the
      old categories are included in the new categories on a reorder), which
      can result in surprising changes, for example when using special string
      dtypes, which do not consider an S1 string equal to a single char
      python string.

      Parameters
      ----------
      new_categories : Index-like
          The categories in new order.
      ordered : bool, optional
          Whether or not the categorical is treated as an ordered categorical.
          If not given (None), do not change the ordered information.
      rename : bool, default False
          Whether or not the new_categories should be considered as a rename
          of the old categories or as reordered categories.

      Returns
      -------
      Categorical
          A new Categorical with the requested categories; the original is
          not modified.

      Raises
      ------
      ValueError
          If new_categories does not validate as categories

      See Also
      --------
      rename_categories : Rename categories.
      reorder_categories : Reorder categories.
      add_categories : Add new categories.
      remove_categories : Remove the specified categories.
      remove_unused_categories : Remove categories which are not used.
      """
      target_ordered = self.dtype.ordered if ordered is None else ordered
      new_dtype = CategoricalDtype(new_categories, ordered=target_ordered)

      result = self.copy()
      if not rename:
          # Values keep their meaning: translate old positions to new ones.
          new_codes = recode_for_categories(
              result.codes, result.categories, new_dtype.categories
          )
      else:
          # Positions keep their meaning: only the labels change.
          n_new = len(new_dtype.categories)
          old_categories = result.dtype.categories
          if old_categories is not None and n_new < len(old_categories):
              # remove all _codes which are larger and set to -1/NaN
              result._codes[result._codes >= n_new] = -1
          new_codes = result._codes

      NDArrayBacked.__init__(result, new_codes, new_dtype)
      return result
  def rename_categories(self, new_categories) -> Categorical:
      """
      Rename categories.

      Parameters
      ----------
      new_categories : list-like, dict-like or callable
          New categories which will replace old categories.

          * list-like: all items must be unique and the number of items in
            the new categories must match the existing number of categories.

          * dict-like: specifies a mapping from
            old categories to new. Categories not contained in the mapping
            are passed through and extra categories in the mapping are
            ignored.

          * callable : a callable that is called on all items in the old
            categories and whose return values comprise the new categories.

      Returns
      -------
      Categorical
          Categorical with renamed categories.

      Raises
      ------
      ValueError
          If new categories are list-like and do not have the same number of
          items as the current categories or do not validate as categories

      See Also
      --------
      reorder_categories : Reorder categories.
      add_categories : Add new categories.
      remove_categories : Remove the specified categories.
      remove_unused_categories : Remove categories which are not used.
      set_categories : Set the categories to the specified ones.

      Examples
      --------
      >>> c = pd.Categorical(['a', 'a', 'b'])
      >>> c.rename_categories([0, 1])
      [0, 0, 1]
      Categories (2, int64): [0, 1]

      For dict-like ``new_categories``, extra keys are ignored and
      categories not in the dictionary are passed through

      >>> c.rename_categories({'a': 'A', 'c': 'C'})
      ['A', 'A', 'b']
      Categories (2, object): ['A', 'b']

      You may also provide a callable to create the new categories

      >>> c.rename_categories(lambda x: x.upper())
      ['A', 'A', 'B']
      Categories (2, object): ['A', 'B']
      """
      if is_dict_like(new_categories):
          # Unmapped categories fall back to their current label.
          lookup = new_categories.get
          renamed = [lookup(old, old) for old in self.categories]
      elif callable(new_categories):
          renamed = [new_categories(old) for old in self.categories]
      else:
          renamed = new_categories

      result = self.copy()
      result._set_categories(renamed)
      return result
  def reorder_categories(self, new_categories, ordered=None):
      """
      Reorder categories as specified in new_categories.

      `new_categories` need to include all old categories and no new category
      items.

      Parameters
      ----------
      new_categories : Index-like
          The categories in new order.
      ordered : bool, optional
          Whether or not the categorical is treated as an ordered categorical.
          If not given, do not change the ordered information.

      Returns
      -------
      Categorical
          Categorical with reordered categories.

      Raises
      ------
      ValueError
          If the new categories do not contain all old category items or any
          new ones

      See Also
      --------
      rename_categories : Rename categories.
      add_categories : Add new categories.
      remove_categories : Remove the specified categories.
      remove_unused_categories : Remove categories which are not used.
      set_categories : Set the categories to the specified ones.
      """
      current = self.categories
      same_size = len(current) == len(new_categories)
      # The set difference is only computed when the lengths already agree.
      if not (same_size and current.difference(new_categories).empty):
          raise ValueError(
              "items in new_categories are not the same as in old categories"
          )
      return self.set_categories(new_categories, ordered=ordered)
  def add_categories(self, new_categories) -> Categorical:
      """
      Add new categories.

      `new_categories` will be included at the last/highest place in the
      categories and will be unused directly after this call.

      Parameters
      ----------
      new_categories : category or list-like of category
          The new categories to be included.

      Returns
      -------
      Categorical
          Categorical with new categories added.

      Raises
      ------
      ValueError
          If the new categories include old categories or do not validate as
          categories

      See Also
      --------
      rename_categories : Rename categories.
      reorder_categories : Reorder categories.
      remove_categories : Remove the specified categories.
      remove_unused_categories : Remove categories which are not used.
      set_categories : Set the categories to the specified ones.

      Examples
      --------
      >>> c = pd.Categorical(['c', 'b', 'c'])
      >>> c
      ['c', 'b', 'c']
      Categories (2, object): ['b', 'c']

      >>> c.add_categories(['d', 'a'])
      ['c', 'b', 'c']
      Categories (4, object): ['b', 'c', 'd', 'a']
      """
      if not is_list_like(new_categories):
          new_categories = [new_categories]

      existing = self.dtype.categories
      already_included = set(new_categories).intersection(existing)
      if already_included:
          raise ValueError(
              f"new categories must not include old categories: {already_included}"
          )

      if hasattr(new_categories, "dtype"):
          from pandas import Series

          # Typed input: build the combined categories with the common dtype
          # of the existing and the added categories.
          common = find_common_type([existing.dtype, new_categories.dtype])
          combined = Series(list(existing) + list(new_categories), dtype=common)
      else:
          combined = list(existing) + list(new_categories)

      new_dtype = CategoricalDtype(combined, self.ordered)
      result = self.copy()
      # Old codes stay valid because the new categories are appended; only
      # the integer width may need to change.
      widened = coerce_indexer_dtype(result._ndarray, new_dtype.categories)
      NDArrayBacked.__init__(result, widened, new_dtype)
      return result
  def remove_categories(self, removals):
      """
      Remove the specified categories.

      `removals` must be included in the old categories. Values which were in
      the removed categories will be set to NaN

      Parameters
      ----------
      removals : category or list of categories
         The categories which should be removed.

      Returns
      -------
      Categorical
          Categorical with removed categories.

      Raises
      ------
      ValueError
          If the removals are not contained in the categories

      See Also
      --------
      rename_categories : Rename categories.
      reorder_categories : Reorder categories.
      add_categories : Add new categories.
      remove_unused_categories : Remove categories which are not used.
      set_categories : Set the categories to the specified ones.

      Examples
      --------
      >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
      >>> c
      ['a', 'c', 'b', 'c', 'd']
      Categories (4, object): ['a', 'b', 'c', 'd']

      >>> c.remove_categories(['d', 'a'])
      [NaN, 'c', 'b', 'c', NaN]
      Categories (2, object): ['b', 'c']
      """
      from pandas import Index

      requested = removals if is_list_like(removals) else [removals]
      # Duplicates and missing values in the request are ignored.
      removals = Index(requested).unique().dropna()

      current = self.dtype.categories
      new_categories = current.difference(removals)
      not_included = removals.difference(current)

      if len(not_included):
          not_included = set(not_included)
          raise ValueError(f"removals must all be in old categories: {not_included}")

      return self.set_categories(new_categories, ordered=self.ordered, rename=False)
  def remove_unused_categories(self) -> Categorical:
      """
      Remove categories which are not used.

      Returns
      -------
      Categorical
          Categorical with unused categories dropped.

      See Also
      --------
      rename_categories : Rename categories.
      reorder_categories : Reorder categories.
      add_categories : Add new categories.
      remove_categories : Remove the specified categories.
      set_categories : Set the categories to the specified ones.

      Examples
      --------
      >>> c = pd.Categorical(['a', 'c', 'b', 'c', 'd'])
      >>> c
      ['a', 'c', 'b', 'c', 'd']
      Categories (4, object): ['a', 'b', 'c', 'd']

      >>> c[2] = 'a'
      >>> c[4] = 'c'
      >>> c
      ['a', 'c', 'a', 'c', 'c']
      Categories (4, object): ['a', 'b', 'c', 'd']

      >>> c.remove_unused_categories()
      ['a', 'c', 'a', 'c', 'c']
      Categories (2, object): ['a', 'c']
      """
      used_codes, positions = np.unique(self._codes, return_inverse=True)

      has_na_sentinel = used_codes.size != 0 and used_codes[0] == -1
      if has_na_sentinel:
          # np.unique sorts, so the -1 sentinel comes first. Drop it and
          # shift the inverse so missing entries map back to -1.
          used_codes = used_codes[1:]
          positions = positions - 1

      kept_categories = self.dtype.categories.take(used_codes)
      new_dtype = CategoricalDtype._from_fastpath(
          kept_categories, ordered=self.ordered
      )
      new_codes = coerce_indexer_dtype(positions, new_dtype.categories)

      result = self.copy()
      NDArrayBacked.__init__(result, new_codes, new_dtype)
      return result
  1007. # ------------------------------------------------------------------
  def map(self, mapper):
      """
      Map categories using an input mapping or function.

      Maps the categories to new categories. If the mapping correspondence is
      one-to-one the result is a :class:`~pandas.Categorical` which has the
      same order property as the original, otherwise a :class:`~pandas.Index`
      is returned. NaN values are unaffected.

      If a `dict` or :class:`~pandas.Series` is used any unmapped category is
      mapped to `NaN`. Note that if this happens an :class:`~pandas.Index`
      will be returned.

      Parameters
      ----------
      mapper : function, dict, or Series
          Mapping correspondence.

      Returns
      -------
      pandas.Categorical or pandas.Index
          Mapped categorical.

      See Also
      --------
      CategoricalIndex.map : Apply a mapping correspondence on a
          :class:`~pandas.CategoricalIndex`.
      Index.map : Apply a mapping correspondence on an
          :class:`~pandas.Index`.
      Series.map : Apply a mapping correspondence on a
          :class:`~pandas.Series`.
      Series.apply : Apply more complex functions on a
          :class:`~pandas.Series`.

      Examples
      --------
      >>> cat = pd.Categorical(['a', 'b', 'c'])
      >>> cat
      ['a', 'b', 'c']
      Categories (3, object): ['a', 'b', 'c']
      >>> cat.map(lambda x: x.upper())
      ['A', 'B', 'C']
      Categories (3, object): ['A', 'B', 'C']
      >>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
      ['first', 'second', 'third']
      Categories (3, object): ['first', 'second', 'third']

      If the mapping is one-to-one the ordering of the categories is
      preserved:

      >>> cat = pd.Categorical(['a', 'b', 'c'], ordered=True)
      >>> cat
      ['a', 'b', 'c']
      Categories (3, object): ['a' < 'b' < 'c']
      >>> cat.map({'a': 3, 'b': 2, 'c': 1})
      [3, 2, 1]
      Categories (3, int64): [3 < 2 < 1]

      If the mapping is not one-to-one an :class:`~pandas.Index` is returned:

      >>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
      Index(['first', 'second', 'first'], dtype='object')

      If a `dict` is used, all unmapped categories are mapped to `NaN` and
      the result is an :class:`~pandas.Index`:

      >>> cat.map({'a': 'first', 'b': 'second'})
      Index(['first', 'second', nan], dtype='object')
      """
      mapped = self.categories.map(mapper)
      try:
          result = self.from_codes(
              self._codes.copy(), categories=mapped, ordered=self.ordered
          )
      except ValueError:
          # The mapped values were rejected as categories, so materialise
          # the values instead.
          # NA values are represented in self._codes with -1
          # np.take causes NA values to take final element in new_categories
          if (self._codes == -1).any():
              mapped = mapped.insert(len(mapped), np.nan)
          result = np.take(mapped, self._codes)
      return result
  # Rich comparison dunders: each one is bound to whatever `_cat_compare_op`
  # returns for the matching `operator` function (`_cat_compare_op` is not
  # defined in this part of the file).
  __eq__ = _cat_compare_op(operator.eq)
  __ne__ = _cat_compare_op(operator.ne)
  __lt__ = _cat_compare_op(operator.lt)
  __gt__ = _cat_compare_op(operator.gt)
  __le__ = _cat_compare_op(operator.le)
  __ge__ = _cat_compare_op(operator.ge)
  1082. # -------------------------------------------------------------
  1083. # Validators; ideally these can be de-duplicated
  1084. def _validate_setitem_value(self, value):
  1085. if not is_hashable(value):
  1086. # wrap scalars and hashable-listlikes in list
  1087. return self._validate_listlike(value)
  1088. else:
  1089. return self._validate_scalar(value)
  1090. def _validate_scalar(self, fill_value):
  1091. """
  1092. Convert a user-facing fill_value to a representation to use with our
  1093. underlying ndarray, raising TypeError if this is not possible.
  1094. Parameters
  1095. ----------
  1096. fill_value : object
  1097. Returns
  1098. -------
  1099. fill_value : int
  1100. Raises
  1101. ------
  1102. TypeError
  1103. """
  1104. if is_valid_na_for_dtype(fill_value, self.categories.dtype):
  1105. fill_value = -1
  1106. elif fill_value in self.categories:
  1107. fill_value = self._unbox_scalar(fill_value)
  1108. else:
  1109. raise TypeError(
  1110. "Cannot setitem on a Categorical with a new "
  1111. f"category ({fill_value}), set the categories first"
  1112. ) from None
  1113. return fill_value
  1114. # -------------------------------------------------------------
  1115. @ravel_compat
  1116. def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
  1117. """
  1118. The numpy array interface.
  1119. Returns
  1120. -------
  1121. numpy.array
  1122. A numpy array of either the specified dtype or,
  1123. if dtype==None (default), the same dtype as
  1124. categorical.categories.dtype.
  1125. """
  1126. ret = take_nd(self.categories._values, self._codes)
  1127. if dtype and not is_dtype_equal(dtype, self.categories.dtype):
  1128. return np.asarray(ret, dtype)
  1129. # When we're a Categorical[ExtensionArray], like Interval,
  1130. # we need to ensure __array__ gets all the way to an
  1131. # ndarray.
  1132. return np.asarray(ret)
  1133. def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
  1134. # for binary ops, use our custom dunder methods
  1135. result = ops.maybe_dispatch_ufunc_to_dunder_op(
  1136. self, ufunc, method, *inputs, **kwargs
  1137. )
  1138. if result is not NotImplemented:
  1139. return result
  1140. if "out" in kwargs:
  1141. # e.g. test_numpy_ufuncs_out
  1142. return arraylike.dispatch_ufunc_with_out(
  1143. self, ufunc, method, *inputs, **kwargs
  1144. )
  1145. if method == "reduce":
  1146. # e.g. TestCategoricalAnalytics::test_min_max_ordered
  1147. result = arraylike.dispatch_reduction_ufunc(
  1148. self, ufunc, method, *inputs, **kwargs
  1149. )
  1150. if result is not NotImplemented:
  1151. return result
  1152. # for all other cases, raise for now (similarly as what happens in
  1153. # Series.__array_prepare__)
  1154. raise TypeError(
  1155. f"Object with dtype {self.dtype} cannot perform "
  1156. f"the numpy op {ufunc.__name__}"
  1157. )
  def __setstate__(self, state) -> None:
      """
      Restore this object from pickled state.

      Necessary for making this object picklable. Dict states are patched
      in place so that states missing ``_dtype`` or ``_ndarray`` can still
      be loaded; any other state is passed to the parent class unchanged.
      """
      if isinstance(state, dict):
          if "_dtype" not in state:
              state["_dtype"] = CategoricalDtype(
                  state["_categories"], state["_ordered"]
              )

          legacy_codes_only = "_codes" in state and "_ndarray" not in state
          if legacy_codes_only:
              # backward compat, changed what is property vs attribute
              state["_ndarray"] = state.pop("_codes")

          super().__setstate__(state)
          return None

      return super().__setstate__(state)
  @property
  def nbytes(self) -> int:
      """
      Number of bytes used by the codes plus the categories' values.
      """
      codes_bytes = self._codes.nbytes
      categories_bytes = self.dtype.categories.values.nbytes
      return codes_bytes + categories_bytes
  def memory_usage(self, deep: bool = False) -> int:
      """
      Memory usage of my values

      Parameters
      ----------
      deep : bool
          Introspect the data deeply, interrogate
          `object` dtypes for system-level memory consumption

      Returns
      -------
      bytes used
          Size of the codes array plus the memory usage reported by the
          categories index.

      Notes
      -----
      Memory usage does not include memory consumed by elements that
      are not components of the array if deep=False

      See Also
      --------
      numpy.ndarray.nbytes
      """
      codes_bytes = self._codes.nbytes
      categories_bytes = self.dtype.categories.memory_usage(deep=deep)
      return codes_bytes + categories_bytes
  def isna(self) -> np.ndarray:
      """
      Detect missing values

      Missing values (-1 in .codes) are detected.

      Returns
      -------
      np.ndarray[bool] of whether my values are null

      See Also
      --------
      isna : Top-level isna.
      isnull : Alias of isna.
      Categorical.notna : Boolean inverse of Categorical.isna.
      """
      na_sentinel = -1
      return np.equal(self._codes, na_sentinel)

  isnull = isna
  def notna(self) -> np.ndarray:
      """
      Inverse of isna

      An entry is reported as not null exactly when ``isna`` reports it as
      not missing.

      Returns
      -------
      np.ndarray[bool] of whether my values are not null

      See Also
      --------
      notna : Top-level notna.
      notnull : Alias of notna.
      Categorical.isna : Boolean inverse of Categorical.notna.
      """
      missing = self.isna()
      return np.logical_not(missing)

  notnull = notna
  def value_counts(self, dropna: bool = True) -> Series:
      """
      Return a Series containing counts of each category.

      Every category will have an entry, even those with a count of 0.

      Parameters
      ----------
      dropna : bool, default True
          Don't include counts of NaN.

      Returns
      -------
      counts : Series
          Integer counts named ``"count"``, indexed by a CategoricalIndex.

      See Also
      --------
      Series.value_counts
      """
      from pandas import (
          CategoricalIndex,
          Series,
      )

      codes = self._codes
      n_categories = len(self.categories)
      is_valid = codes >= 0
      no_missing = is_valid.all()
      positions = np.arange(n_categories)

      if dropna or no_missing:
          observed = codes if no_missing else codes[is_valid]
          counts = np.bincount(observed, minlength=n_categories)
      else:
          # Count missing entries in one extra bin after the real categories
          # and label that bin with the -1 sentinel.
          counts = np.bincount(np.where(is_valid, codes, n_categories))
          positions = np.append(positions, -1)

      index_codes = coerce_indexer_dtype(positions, self.dtype.categories)
      index_values = self._from_backing_data(index_codes)

      return Series(
          counts,
          index=CategoricalIndex(index_values),
          dtype="int64",
          name="count",
          copy=False,
      )
  1255. # error: Argument 2 of "_empty" is incompatible with supertype
  1256. # "NDArrayBackedExtensionArray"; supertype defines the argument type as
  1257. # "ExtensionDtype"
  1258. @classmethod
  1259. def _empty( # type: ignore[override]
  1260. cls: type_t[Categorical], shape: Shape, dtype: CategoricalDtype
  1261. ) -> Categorical:
  1262. """
  1263. Analogous to np.empty(shape, dtype=dtype)
  1264. Parameters
  1265. ----------
  1266. shape : tuple[int]
  1267. dtype : CategoricalDtype
  1268. """
  1269. arr = cls._from_sequence([], dtype=dtype)
  1270. # We have to use np.zeros instead of np.empty otherwise the resulting
  1271. # ndarray may contain codes not supported by this dtype, in which
  1272. # case repr(result) could segfault.
  1273. backing = np.zeros(shape, dtype=arr._ndarray.dtype)
  1274. return arr._from_backing_data(backing)
  1275. def _internal_get_values(self):
  1276. """
  1277. Return the values.
  1278. For internal compatibility with pandas formatting.
  1279. Returns
  1280. -------
  1281. np.ndarray or Index
  1282. A numpy array of the same dtype as categorical.categories.dtype or
  1283. Index if datetime / periods.
  1284. """
  1285. # if we are a datetime and period index, return Index to keep metadata
  1286. if needs_i8_conversion(self.categories.dtype):
  1287. return self.categories.take(self._codes, fill_value=NaT)
  1288. elif is_integer_dtype(self.categories) and -1 in self._codes:
  1289. return self.categories.astype("object").take(self._codes, fill_value=np.nan)
  1290. return np.array(self)
  def check_for_ordered(self, op) -> None:
      """
      Assert that we are ordered.

      Parameters
      ----------
      op : object
          Name of the operation, used only in the error message.

      Raises
      ------
      TypeError
          If this categorical is not ordered.
      """
      if self.ordered:
          return
      raise TypeError(
          f"Categorical is not ordered for operation {op}\n"
          "you can use .as_ordered() to change the "
          "Categorical to an ordered one\n"
      )
  def argsort(
      self, *, ascending: bool = True, kind: SortKind = "quicksort", **kwargs
  ):
      """
      Return the indices that would sort the Categorical.

      Missing values are sorted at the end.

      Parameters
      ----------
      ascending : bool, default True
          Whether the indices should result in an ascending
          or descending sort.
      kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, optional
          Sorting algorithm.
      **kwargs:
          passed through to :func:`numpy.argsort`.

      Returns
      -------
      np.ndarray[np.intp]

      See Also
      --------
      numpy.ndarray.argsort

      Notes
      -----
      While an ordering is applied to the category values, arg-sorting
      in this context refers more to organizing and grouping together
      based on matching category values. Thus, this function can be
      called on an unordered Categorical instance unlike the functions
      'Categorical.min' and 'Categorical.max'.

      Examples
      --------
      >>> pd.Categorical(['b', 'b', 'a', 'c']).argsort()
      array([2, 0, 1, 3])

      >>> cat = pd.Categorical(['b', 'b', 'a', 'c'],
      ...                      categories=['c', 'b', 'a'],
      ...                      ordered=True)
      >>> cat.argsort()
      array([3, 0, 1, 2])

      Missing values are placed at the end

      >>> cat = pd.Categorical([2, None, 1])
      >>> cat.argsort()
      array([2, 0, 1])
      """
      # The sorting itself is done by the parent class.
      indexer = super().argsort(ascending=ascending, kind=kind, **kwargs)
      return indexer
  @overload
  def sort_values(
      self,
      *,
      inplace: Literal[False] = ...,
      ascending: bool = ...,
      na_position: str = ...,
  ) -> Categorical:
      ...

  @overload
  def sort_values(
      self, *, inplace: Literal[True], ascending: bool = ..., na_position: str = ...
  ) -> None:
      ...

  def sort_values(
      self,
      *,
      inplace: bool = False,
      ascending: bool = True,
      na_position: str = "last",
  ) -> Categorical | None:
      """
      Sort the Categorical by category value returning a new
      Categorical by default.

      While an ordering is applied to the category values, sorting in this
      context refers more to organizing and grouping together based on
      matching category values. Thus, this function can be called on an
      unordered Categorical instance unlike the functions 'Categorical.min'
      and 'Categorical.max'.

      Parameters
      ----------
      inplace : bool, default False
          Do operation in place.
      ascending : bool, default True
          Order ascending. Passing False orders descending. The
          ordering parameter provides the method by which the
          category values are organized.
      na_position : {'first', 'last'} (optional, default='last')
          'first' puts NaNs at the beginning
          'last' puts NaNs at the end

      Returns
      -------
      Categorical or None
          The sorted Categorical, or None when ``inplace=True``.

      Raises
      ------
      ValueError
          If ``na_position`` is neither 'first' nor 'last'.

      See Also
      --------
      Series.sort_values

      Examples
      --------
      >>> c = pd.Categorical([1, 2, 2, 1, 5])
      >>> c
      [1, 2, 2, 1, 5]
      Categories (3, int64): [1, 2, 5]
      >>> c.sort_values()
      [1, 1, 2, 2, 5]
      Categories (3, int64): [1, 2, 5]
      >>> c.sort_values(ascending=False)
      [5, 2, 2, 1, 1]
      Categories (3, int64): [1, 2, 5]

      'sort_values' behaviour with NaNs. Note that 'na_position'
      is independent of the 'ascending' parameter:

      >>> c = pd.Categorical([np.nan, 2, 2, np.nan, 5])
      >>> c
      [NaN, 2, 2, NaN, 5]
      Categories (2, int64): [2, 5]
      >>> c.sort_values()
      [2, 2, 5, NaN, NaN]
      Categories (2, int64): [2, 5]
      >>> c.sort_values(ascending=False)
      [5, 2, 2, NaN, NaN]
      Categories (2, int64): [2, 5]
      >>> c.sort_values(na_position='first')
      [NaN, NaN, 2, 2, 5]
      Categories (2, int64): [2, 5]
      >>> c.sort_values(ascending=False, na_position='first')
      [NaN, NaN, 5, 2, 2]
      Categories (2, int64): [2, 5]
      """
      inplace = validate_bool_kwarg(inplace, "inplace")
      if na_position not in ("last", "first"):
          raise ValueError(f"invalid na_position: {repr(na_position)}")

      order = nargsort(self, ascending=ascending, na_position=na_position)
      reordered_codes = self._codes[order]

      if inplace:
          # Overwrite the existing backing array rather than rebinding it.
          self._codes[:] = reordered_codes
          return None
      return self._from_backing_data(reordered_codes)
  1430. def _rank(
  1431. self,
  1432. *,
  1433. axis: AxisInt = 0,
  1434. method: str = "average",
  1435. na_option: str = "keep",
  1436. ascending: bool = True,
  1437. pct: bool = False,
  1438. ):
  1439. """
  1440. See Series.rank.__doc__.
  1441. """
  1442. if axis != 0:
  1443. raise NotImplementedError
  1444. vff = self._values_for_rank()
  1445. return algorithms.rank(
  1446. vff,
  1447. axis=axis,
  1448. method=method,
  1449. na_option=na_option,
  1450. ascending=ascending,
  1451. pct=pct,
  1452. )
  1453. def _values_for_rank(self):
  1454. """
  1455. For correctly ranking ordered categorical data. See GH#15420
  1456. Ordered categorical data should be ranked on the basis of
  1457. codes with -1 translated to NaN.
  1458. Returns
  1459. -------
  1460. numpy.array
  1461. """
  1462. from pandas import Series
  1463. if self.ordered:
  1464. values = self.codes
  1465. mask = values == -1
  1466. if mask.any():
  1467. values = values.astype("float64")
  1468. values[mask] = np.nan
  1469. elif is_any_real_numeric_dtype(self.categories):
  1470. values = np.array(self)
  1471. else:
  1472. # reorder the categories (so rank can use the float codes)
  1473. # instead of passing an object array to rank
  1474. values = np.array(
  1475. self.rename_categories(
  1476. Series(self.categories, copy=False).rank().values
  1477. )
  1478. )
  1479. return values
  1480. # ------------------------------------------------------------------
  1481. # NDArrayBackedExtensionArray compat
  1482. @property
  1483. def _codes(self) -> np.ndarray:
  1484. return self._ndarray
  1485. def _box_func(self, i: int):
  1486. if i == -1:
  1487. return np.NaN
  1488. return self.categories[i]
  1489. def _unbox_scalar(self, key) -> int:
  1490. # searchsorted is very performance sensitive. By converting codes
  1491. # to same dtype as self.codes, we get much faster performance.
  1492. code = self.categories.get_loc(key)
  1493. code = self._ndarray.dtype.type(code)
  1494. return code
  1495. # ------------------------------------------------------------------
  1496. def __iter__(self) -> Iterator:
  1497. """
  1498. Returns an Iterator over the values of this Categorical.
  1499. """
  1500. if self.ndim == 1:
  1501. return iter(self._internal_get_values().tolist())
  1502. else:
  1503. return (self[n] for n in range(len(self)))
  def __contains__(self, key) -> bool:
      """
      Returns True if `key` is in this Categorical.

      A key that is a valid NA for the categories' dtype is considered
      present when any element of this Categorical is NA. Any other key is
      delegated to ``contains`` with the codes as the container.
      """
      key_is_na = is_valid_na_for_dtype(key, self.categories.dtype)
      if not key_is_na:
          return contains(self, key, container=self._codes)
      # if key is a NaN, check if any NaN is in self.
      return bool(self.isna().any())
  1512. # ------------------------------------------------------------------
  1513. # Rendering Methods
  1514. def _formatter(self, boxed: bool = False):
  1515. # Defer to CategoricalFormatter's formatter.
  1516. return None
  1517. def _tidy_repr(self, max_vals: int = 10, footer: bool = True) -> str:
  1518. """
  1519. a short repr displaying only max_vals and an optional (but default
  1520. footer)
  1521. """
  1522. num = max_vals // 2
  1523. head = self[:num]._get_repr(length=False, footer=False)
  1524. tail = self[-(max_vals - num) :]._get_repr(length=False, footer=False)
  1525. result = f"{head[:-1]}, ..., {tail[1:]}"
  1526. if footer:
  1527. result = f"{result}\n{self._repr_footer()}"
  1528. return str(result)
  1529. def _repr_categories(self) -> list[str]:
  1530. """
  1531. return the base repr for the categories
  1532. """
  1533. max_categories = (
  1534. 10
  1535. if get_option("display.max_categories") == 0
  1536. else get_option("display.max_categories")
  1537. )
  1538. from pandas.io.formats import format as fmt
  1539. format_array = partial(
  1540. fmt.format_array, formatter=None, quoting=QUOTE_NONNUMERIC
  1541. )
  1542. if len(self.categories) > max_categories:
  1543. num = max_categories // 2
  1544. head = format_array(self.categories[:num])
  1545. tail = format_array(self.categories[-num:])
  1546. category_strs = head + ["..."] + tail
  1547. else:
  1548. category_strs = format_array(self.categories)
  1549. # Strip all leading spaces, which format_array adds for columns...
  1550. category_strs = [x.strip() for x in category_strs]
  1551. return category_strs
  def _repr_categories_info(self) -> str:
      """
      Build the "Categories (...)" part of the repr footer.

      The strings from ``_repr_categories`` are joined with ``" < "`` when
      the Categorical is ordered and ``", "`` otherwise. When a maximum
      width applies, a category that would not fit on the current line is
      moved to a continuation line indented past the header.

      Returns
      -------
      str
          ``"Categories (<n>, <dtype>): [<joined categories>]"``.
      """
      category_strs = self._repr_categories()
      dtype = str(self.categories.dtype)
      levheader = f"Categories ({len(self.categories)}, {dtype}): "
      # Only the width is used below; height is unpacked but never read.
      width, height = get_terminal_size()
      # Fall back to the terminal width when "display.width" is falsy.
      max_width = get_option("display.width") or width
      if console.in_ipython_frontend():
          # 0 = no breaks
          max_width = 0
      levstring = ""
      start = True
      cur_col_len = len(levheader)  # header
      sep_len, sep = (3, " < ") if self.ordered else (2, ", ")
      linesep = f"{sep.rstrip()}\n"  # remove whitespace
      for val in category_strs:
          if max_width != 0 and cur_col_len + sep_len + len(val) > max_width:
              # Line is full: end it with the trimmed separator and start a
              # continuation line aligned one column past the header.
              levstring += linesep + (" " * (len(levheader) + 1))
              cur_col_len = len(levheader) + 1  # header + a whitespace
          elif not start:
              levstring += sep
              # NOTE(review): nesting reconstructed from a de-indented copy;
              # confirm this width update belongs to the ``elif`` branch.
              cur_col_len += len(val)
          levstring += val
          start = False
      # Collapse " < ... < " around the ellipsis to " ... " to save space.
      return f"{levheader}[{levstring.replace(' < ... < ', ' ... ')}]"
  1580. def _repr_footer(self) -> str:
  1581. info = self._repr_categories_info()
  1582. return f"Length: {len(self)}\n{info}"
  1583. def _get_repr(
  1584. self, length: bool = True, na_rep: str = "NaN", footer: bool = True
  1585. ) -> str:
  1586. from pandas.io.formats import format as fmt
  1587. formatter = fmt.CategoricalFormatter(
  1588. self, length=length, na_rep=na_rep, footer=footer
  1589. )
  1590. result = formatter.to_string()
  1591. return str(result)
  1592. def __repr__(self) -> str:
  1593. """
  1594. String representation.
  1595. """
  1596. _maxlen = 10
  1597. if len(self._codes) > _maxlen:
  1598. result = self._tidy_repr(_maxlen)
  1599. elif len(self._codes) > 0:
  1600. result = self._get_repr(length=len(self) > _maxlen)
  1601. else:
  1602. msg = self._get_repr(length=False, footer=True).replace("\n", ", ")
  1603. result = f"[], {msg}"
  1604. return result
  1605. # ------------------------------------------------------------------
  1606. def _validate_listlike(self, value):
  1607. # NB: here we assume scalar-like tuples have already been excluded
  1608. value = extract_array(value, extract_numpy=True)
  1609. # require identical categories set
  1610. if isinstance(value, Categorical):
  1611. if not is_dtype_equal(self.dtype, value.dtype):
  1612. raise TypeError(
  1613. "Cannot set a Categorical with another, "
  1614. "without identical categories"
  1615. )
  1616. # is_dtype_equal implies categories_match_up_to_permutation
  1617. value = self._encode_with_my_categories(value)
  1618. return value._codes
  1619. from pandas import Index
  1620. # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
  1621. to_add = Index._with_infer(value, tupleize_cols=False).difference(
  1622. self.categories
  1623. )
  1624. # no assignments of values not in categories, but it's always ok to set
  1625. # something to np.nan
  1626. if len(to_add) and not isna(to_add).all():
  1627. raise TypeError(
  1628. "Cannot setitem on a Categorical with a new "
  1629. "category, set the categories first"
  1630. )
  1631. codes = self.categories.get_indexer(value)
  1632. return codes.astype(self._ndarray.dtype, copy=False)
  1633. def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
  1634. """
  1635. Compute the inverse of a categorical, returning
  1636. a dict of categories -> indexers.
  1637. *This is an internal function*
  1638. Returns
  1639. -------
  1640. Dict[Hashable, np.ndarray[np.intp]]
  1641. dict of categories -> indexers
  1642. Examples
  1643. --------
  1644. >>> c = pd.Categorical(list('aabca'))
  1645. >>> c
  1646. ['a', 'a', 'b', 'c', 'a']
  1647. Categories (3, object): ['a', 'b', 'c']
  1648. >>> c.categories
  1649. Index(['a', 'b', 'c'], dtype='object')
  1650. >>> c.codes
  1651. array([0, 0, 1, 2, 0], dtype=int8)
  1652. >>> c._reverse_indexer()
  1653. {'a': array([0, 1, 4]), 'b': array([2]), 'c': array([3])}
  1654. """
  1655. categories = self.categories
  1656. r, counts = libalgos.groupsort_indexer(
  1657. ensure_platform_int(self.codes), categories.size
  1658. )
  1659. counts = ensure_int64(counts).cumsum()
  1660. _result = (r[start:end] for start, end in zip(counts, counts[1:]))
  1661. return dict(zip(categories, _result))
  1662. # ------------------------------------------------------------------
  1663. # Reductions
  def min(self, *, skipna: bool = True, **kwargs):
      """
      The minimum value of the object.

      Only ordered `Categoricals` have a minimum!

      Raises
      ------
      TypeError
          If the `Categorical` is not `ordered`.

      Returns
      -------
      min : the minimum of this `Categorical`, NA value if empty
      """
      nv.validate_minmax_axis(kwargs.get("axis", 0))
      nv.validate_min((), kwargs)
      self.check_for_ordered("min")

      codes = self._codes
      if not len(codes):
          return self.dtype.na_value

      # Code -1 marks a missing entry.
      valid = codes != -1
      if valid.all():
          pointer = codes.min()
      elif skipna and valid.any():
          pointer = codes[valid].min()
      else:
          # Missing values present and either not skipped or nothing else left.
          return np.nan
      return self._wrap_reduction_result(None, pointer)
  def max(self, *, skipna: bool = True, **kwargs):
      """
      The maximum value of the object.

      Only ordered `Categoricals` have a maximum!

      Raises
      ------
      TypeError
          If the `Categorical` is not `ordered`.

      Returns
      -------
      max : the maximum of this `Categorical`, NA if array is empty
      """
      nv.validate_minmax_axis(kwargs.get("axis", 0))
      nv.validate_max((), kwargs)
      self.check_for_ordered("max")

      codes = self._codes
      if not len(codes):
          return self.dtype.na_value

      # Code -1 marks a missing entry.
      valid = codes != -1
      if valid.all():
          pointer = codes.max()
      elif skipna and valid.any():
          pointer = codes[valid].max()
      else:
          # Missing values present and either not skipped or nothing else left.
          return np.nan
      return self._wrap_reduction_result(None, pointer)
  1716. def _mode(self, dropna: bool = True) -> Categorical:
  1717. codes = self._codes
  1718. mask = None
  1719. if dropna:
  1720. mask = self.isna()
  1721. res_codes = algorithms.mode(codes, mask=mask)
  1722. res_codes = cast(np.ndarray, res_codes)
  1723. assert res_codes.dtype == codes.dtype
  1724. res = self._from_backing_data(res_codes)
  1725. return res
  1726. # ------------------------------------------------------------------
  1727. # ExtensionArray Interface
  def unique(self):
      """
      Return the ``Categorical`` whose ``categories`` and ``codes`` are
      unique.

      The work is done entirely by the parent class implementation.

      .. versionchanged:: 1.3.0

          Previously, unused categories were dropped from the new categories.

      Returns
      -------
      Categorical

      See Also
      --------
      pandas.unique
      CategoricalIndex.unique
      Series.unique : Return unique values of Series object.

      Examples
      --------
      >>> pd.Categorical(list("baabc")).unique()
      ['b', 'a', 'c']
      Categories (3, object): ['a', 'b', 'c']
      >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique()
      ['b', 'a']
      Categories (3, object): ['a' < 'b' < 'c']
      """
      # pylint: disable=useless-parent-delegation
      return super().unique()
  1753. def _cast_quantile_result(self, res_values: np.ndarray) -> np.ndarray:
  1754. # make sure we have correct itemsize for resulting codes
  1755. assert res_values.dtype == self._ndarray.dtype
  1756. return res_values
  def equals(self, other: object) -> bool:
      """
      Returns True if categorical arrays are equal.

      ``other`` must be a ``Categorical`` whose categories match this one
      up to permutation. It is then re-encoded with this Categorical's
      categories and the two code arrays are compared.

      Parameters
      ----------
      other : `Categorical`

      Returns
      -------
      bool
      """
      if not isinstance(other, Categorical):
          return False
      if not self._categories_match_up_to_permutation(other):
          return False

      recoded = self._encode_with_my_categories(other)
      return np.array_equal(self._codes, recoded._codes)
  1773. @classmethod
  1774. def _concat_same_type(
  1775. cls: type[CategoricalT], to_concat: Sequence[CategoricalT], axis: AxisInt = 0
  1776. ) -> CategoricalT:
  1777. from pandas.core.dtypes.concat import union_categoricals
  1778. first = to_concat[0]
  1779. if axis >= first.ndim:
  1780. raise ValueError(
  1781. f"axis {axis} is out of bounds for array of dimension {first.ndim}"
  1782. )
  1783. if axis == 1:
  1784. # Flatten, concatenate then reshape
  1785. if not all(x.ndim == 2 for x in to_concat):
  1786. raise ValueError
  1787. # pass correctly-shaped to union_categoricals
  1788. tc_flat = []
  1789. for obj in to_concat:
  1790. tc_flat.extend([obj[:, i] for i in range(obj.shape[1])])
  1791. res_flat = cls._concat_same_type(tc_flat, axis=0)
  1792. result = res_flat.reshape(len(first), -1, order="F")
  1793. return result
  1794. result = union_categoricals(to_concat)
  1795. return result
  1796. # ------------------------------------------------------------------
  def _encode_with_my_categories(self, other: Categorical) -> Categorical:
      """
      Re-encode another categorical using this Categorical's categories.

      The codes of ``other`` are recoded from ``other.categories`` to
      ``self.categories`` and wrapped with ``_from_backing_data``.

      Notes
      -----
      This assumes we have already checked
      self._categories_match_up_to_permutation(other).
      """
      # Indexing on codes is more efficient if categories are the same,
      # so we can apply some optimizations based on the degree of
      # dtype-matching.
      recoded = recode_for_categories(
          codes=other.codes,
          old_categories=other.categories,
          new_categories=self.categories,
          copy=False,
      )
      return self._from_backing_data(recoded)
  1812. def _categories_match_up_to_permutation(self, other: Categorical) -> bool:
  1813. """
  1814. Returns True if categoricals are the same dtype
  1815. same categories, and same ordered
  1816. Parameters
  1817. ----------
  1818. other : Categorical
  1819. Returns
  1820. -------
  1821. bool
  1822. """
  1823. return hash(self.dtype) == hash(other.dtype)
  def describe(self) -> DataFrame:
      """
      Describes this Categorical

      Value counts (missing values included) and each count's share of the
      total are placed side by side.

      Returns
      -------
      description: `DataFrame`
          A dataframe with frequency and counts by category.
      """
      from pandas import Index
      from pandas.core.reshape.concat import concat

      tallies = self.value_counts(dropna=False)
      shares = tallies / tallies.sum()

      summary = concat([tallies, shares], axis=1)
      summary.columns = Index(["counts", "freqs"])
      summary.index.name = "categories"
      return summary
  def isin(self, values) -> npt.NDArray[np.bool_]:
      """
      Check whether `values` are contained in Categorical.

      Return a boolean NumPy Array showing whether each element in
      the Categorical matches an element in the passed sequence of
      `values` exactly.

      Parameters
      ----------
      values : set or list-like
          The sequence of values to test. Passing in a single string will
          raise a ``TypeError``. Instead, turn a single string into a
          list of one element.

      Returns
      -------
      np.ndarray[bool]

      Raises
      ------
      TypeError
        * If `values` is not a set or list-like

      See Also
      --------
      pandas.Series.isin : Equivalent method on Series.

      Examples
      --------
      >>> s = pd.Categorical(['lama', 'cow', 'lama', 'beetle', 'lama',
      ...                'hippo'])
      >>> s.isin(['cow', 'lama'])
      array([ True,  True,  True, False,  True, False])

      Passing a single string as ``s.isin('lama')`` will raise an error. Use
      a list of one element instead:

      >>> s.isin(['lama'])
      array([ True, False,  True, False,  True, False])
      """
      if not is_list_like(values):
          raise TypeError(
              "only list-like objects are allowed to be passed "
              f"to isin(), you passed a [{type(values).__name__}]"
          )

      candidates = sanitize_array(values, None, None)
      is_null = np.asarray(isna(candidates))
      candidate_codes = self.categories.get_indexer(candidates)
      # Keep the codes of values that are categories, plus the -1 produced
      # by null values; -1 from non-null unknown values is discarded.
      keep = is_null | (candidate_codes >= 0)
      return algorithms.isin(self.codes, candidate_codes[keep])
  def _replace(self, *, to_replace, value, inplace: bool = False):
      """
      Replace category values, rebuilding the categories and the codes.

      Parameters
      ----------
      to_replace, value
          Passed to ``Series.replace`` on the categories.
          NOTE(review): ``np.asarray(to_replace)[mask]`` presumes that
          ``to_replace`` and ``value`` line up element-wise; confirm
          against callers.
      inplace : bool, default False
          Operate on ``self`` when true, otherwise on a copy.

      Returns
      -------
      Categorical or None
          The modified copy when ``inplace`` is false; nothing is returned
          explicitly when ``inplace`` is true.
      """
      from pandas import Index

      inplace = validate_bool_kwarg(inplace, "inplace")
      cat = self if inplace else self.copy()

      # Replacement values that are NA: the categories they would replace
      # are removed from ``cat`` rather than renamed.
      mask = isna(np.asarray(value))
      if mask.any():
          removals = np.asarray(to_replace)[mask]
          # Restrict the removals to values that really are categories.
          removals = cat.categories[cat.categories.isin(removals)]
          new_cat = cat.remove_categories(removals)
          # Re-initialise ``cat`` in place with the reduced codes and dtype.
          NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype)

      # Apply the replacement to the (remaining) categories themselves.
      ser = cat.categories.to_series()
      ser = ser.replace(to_replace=to_replace, value=value)

      all_values = Index(ser)

      # GH51016: maintain order of existing categories
      # ``idxr`` holds, per replaced value, its position among the existing
      # categories (-1 if it is not one). Values that are not existing
      # categories keep their own position; the argsort of these locations
      # gives the order in which the new categories are taken.
      idxr = cat.categories.get_indexer_for(all_values)
      locs = np.arange(len(ser))
      locs = np.where(idxr == -1, locs, idxr)
      locs = locs.argsort()

      new_categories = ser.take(locs)
      # Several old categories may map to the same value; keep the first.
      new_categories = new_categories.drop_duplicates(keep="first")
      new_categories = Index(new_categories)
      new_codes = recode_for_categories(
          cat._codes, all_values, new_categories, copy=False
      )
      new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)
      NDArrayBacked.__init__(cat, new_codes, new_dtype)

      if not inplace:
          return cat
  1912. # ------------------------------------------------------------------------
  1913. # String methods interface
  1914. def _str_map(
  1915. self, f, na_value=np.nan, dtype=np.dtype("object"), convert: bool = True
  1916. ):
  1917. # Optimization to apply the callable `f` to the categories once
  1918. # and rebuild the result by `take`ing from the result with the codes.
  1919. # Returns the same type as the object-dtype implementation though.
  1920. from pandas.core.arrays import PandasArray
  1921. categories = self.categories
  1922. codes = self.codes
  1923. result = PandasArray(categories.to_numpy())._str_map(f, na_value, dtype)
  1924. return take_nd(result, codes, fill_value=na_value)
  1925. def _str_get_dummies(self, sep: str = "|"):
  1926. # sep may not be in categories. Just bail on this.
  1927. from pandas.core.arrays import PandasArray
  1928. return PandasArray(self.astype(str))._str_get_dummies(sep)
  1929. # The Series.cat accessor
  @delegate_names(
      delegate=Categorical, accessors=["categories", "ordered"], typ="property"
  )
  @delegate_names(
      delegate=Categorical,
      accessors=[
          "rename_categories",
          "reorder_categories",
          "add_categories",
          "remove_categories",
          "remove_unused_categories",
          "set_categories",
          "as_ordered",
          "as_unordered",
      ],
      typ="method",
  )
  class CategoricalAccessor(PandasDelegate, PandasObject, NoNewAttributesMixin):
      """
      Accessor object for categorical properties of the Series values.

      Delegated properties are read from, and delegated methods are called
      on, the underlying values of the wrapped object. Non-``None`` method
      results are returned as a ``Series`` carrying the original index and
      name.

      Parameters
      ----------
      data : Series or CategoricalIndex

      Examples
      --------
      >>> s = pd.Series(list("abbccc")).astype("category")
      >>> s
      0    a
      1    b
      2    b
      3    c
      4    c
      5    c
      dtype: category
      Categories (3, object): ['a', 'b', 'c']

      >>> s.cat.categories
      Index(['a', 'b', 'c'], dtype='object')

      >>> s.cat.rename_categories(list("cba"))
      0    c
      1    b
      2    b
      3    a
      4    a
      5    a
      dtype: category
      Categories (3, object): ['c', 'b', 'a']

      >>> s.cat.reorder_categories(list("cba"))
      0    a
      1    b
      2    b
      3    c
      4    c
      5    c
      dtype: category
      Categories (3, object): ['c', 'b', 'a']

      >>> s.cat.add_categories(["d", "e"])
      0    a
      1    b
      2    b
      3    c
      4    c
      5    c
      dtype: category
      Categories (5, object): ['a', 'b', 'c', 'd', 'e']

      >>> s.cat.remove_categories(["a", "c"])
      0    NaN
      1      b
      2      b
      3    NaN
      4    NaN
      5    NaN
      dtype: category
      Categories (1, object): ['b']

      >>> s1 = s.cat.add_categories(["d", "e"])
      >>> s1.cat.remove_unused_categories()
      0    a
      1    b
      2    b
      3    c
      4    c
      5    c
      dtype: category
      Categories (3, object): ['a', 'b', 'c']

      >>> s.cat.set_categories(list("abcde"))
      0    a
      1    b
      2    b
      3    c
      4    c
      5    c
      dtype: category
      Categories (5, object): ['a', 'b', 'c', 'd', 'e']

      >>> s.cat.as_ordered()
      0    a
      1    b
      2    b
      3    c
      4    c
      5    c
      dtype: category
      Categories (3, object): ['a' < 'b' < 'c']

      >>> s.cat.as_unordered()
      0    a
      1    b
      2    b
      3    c
      4    c
      5    c
      dtype: category
      Categories (3, object): ['a', 'b', 'c']
      """

      def __init__(self, data) -> None:
          # Validate first, then capture everything needed from ``data``
          # before the instance is frozen.
          self._validate(data)
          self._parent = data.values
          self._index = data.index
          self._name = data.name
          self._freeze()

      @staticmethod
      def _validate(data):
          """Raise AttributeError unless ``data`` has a categorical dtype."""
          if is_categorical_dtype(data.dtype):
              return
          raise AttributeError("Can only use .cat accessor with a 'category' dtype")

      def _delegate_property_get(self, name):
          """Read the delegated property ``name`` from the parent values."""
          return getattr(self._parent, name)

      def _delegate_property_set(self, name, new_values):
          """Assign ``new_values`` to the property ``name`` on the parent values."""
          return setattr(self._parent, name, new_values)

      @property
      def codes(self) -> Series:
          """
          Return Series of codes as well as the index.
          """
          from pandas import Series

          return Series(self._parent.codes, index=self._index)

      def _delegate_method(self, name, *args, **kwargs):
          """
          Call the delegated method ``name`` on the parent values.

          A ``None`` result is passed through as ``None``; any other result
          is wrapped in a Series with the stored index and name.
          """
          from pandas import Series

          outcome = getattr(self._parent, name)(*args, **kwargs)
          if outcome is None:
              return None
          return Series(outcome, index=self._index, name=self._name)
  2068. # utility routines
  2069. def _get_codes_for_values(values, categories: Index) -> np.ndarray:
  2070. """
  2071. utility routine to turn values into codes given the specified categories
  2072. If `values` is known to be a Categorical, use recode_for_categories instead.
  2073. """
  2074. if values.ndim > 1:
  2075. flat = values.ravel()
  2076. codes = _get_codes_for_values(flat, categories)
  2077. return codes.reshape(values.shape)
  2078. codes = categories.get_indexer_for(values)
  2079. return coerce_indexer_dtype(codes, categories)
  def recode_for_categories(
      codes: np.ndarray, old_categories, new_categories, copy: bool = True
  ) -> np.ndarray:
      """
      Convert a set of codes to a new set of categories.

      Parameters
      ----------
      codes : np.ndarray
      old_categories, new_categories : Index
      copy: bool, default True
          Whether to copy if the codes are unchanged.

      Returns
      -------
      new_codes : np.ndarray
          Codes referring to ``new_categories``; -1 where the old category
          is absent from ``new_categories``.

      Examples
      --------
      >>> old_cat = pd.Index(['b', 'a', 'c'])
      >>> new_cat = pd.Index(['a', 'b'])
      >>> codes = np.array([0, 1, 1, 2])
      >>> recode_for_categories(codes, old_cat, new_cat)
      array([ 1,  0,  0, -1], dtype=int8)
      """
      # With no old categories everything is null anyway, and with equal
      # categories there is nothing to recode: the codes stay as they are.
      if len(old_categories) == 0 or new_categories.equals(old_categories):
          return codes.copy() if copy else codes

      # Position of every old category within the new categories.
      old_to_new = coerce_indexer_dtype(
          new_categories.get_indexer(old_categories), new_categories
      )
      return take_nd(old_to_new, codes, fill_value=-1)
  def factorize_from_iterable(values) -> tuple[np.ndarray, Index]:
      """
      Factorize an input `values` into `categories` and `codes`. Preserves
      categorical dtype in `categories`.

      Parameters
      ----------
      values : list-like

      Returns
      -------
      codes : ndarray
      categories : Index
          If `values` has a categorical dtype, then `categories` is
          a CategoricalIndex keeping the categories and order of `values`.

      Raises
      ------
      TypeError
          If `values` is not list-like.
      """
      from pandas import CategoricalIndex

      if not is_list_like(values):
          raise TypeError("Input must be list-like")

      categories: Index
      if not is_categorical_dtype(values):
          # The value of ordered is irrelevant since we don't use cat as such,
          # but only the resulting categories, the order of which is independent
          # from ordered. Set ordered to False as default. See GH #15457
          plain = Categorical(values, ordered=False)
          return plain.codes, plain.categories

      values = extract_array(values)
      # The Categorical we want to build has the same categories
      # as values but its codes are by def [0, ..., len(n_categories) - 1]
      identity_codes = np.arange(len(values.categories), dtype=values.codes.dtype)
      categories = CategoricalIndex(
          Categorical.from_codes(identity_codes, dtype=values.dtype)
      )
      return values.codes, categories
  def factorize_from_iterables(iterables) -> tuple[list[np.ndarray], list[Index]]:
      """
      A higher-level wrapper over `factorize_from_iterable`.

      Each element of `iterables` is factorized on its own and the results
      are collected into two parallel lists.

      Parameters
      ----------
      iterables : list-like of list-likes

      Returns
      -------
      codes : list of ndarrays
      categories : list of Indexes

      Notes
      -----
      See `factorize_from_iterable` for more info.
      """
      if len(iterables) == 0:
          # For consistency, it should return two empty lists.
          return [], []

      factorized = [factorize_from_iterable(it) for it in iterables]
      all_codes = [codes for codes, _ in factorized]
      all_categories = [categories for _, categories in factorized]
      return all_codes, all_categories