  1. """
  2. Base and utility classes for pandas objects.
  3. """
  4. from __future__ import annotations
  5. import textwrap
  6. from typing import (
  7. TYPE_CHECKING,
  8. Any,
  9. Generic,
  10. Hashable,
  11. Iterator,
  12. Literal,
  13. TypeVar,
  14. cast,
  15. final,
  16. overload,
  17. )
  18. import numpy as np
  19. from pandas._config import using_copy_on_write
  20. from pandas._libs import lib
  21. from pandas._typing import (
  22. Axis,
  23. AxisInt,
  24. DtypeObj,
  25. IndexLabel,
  26. NDFrameT,
  27. Shape,
  28. npt,
  29. )
  30. from pandas.compat import PYPY
  31. from pandas.compat.numpy import function as nv
  32. from pandas.errors import AbstractMethodError
  33. from pandas.util._decorators import (
  34. cache_readonly,
  35. doc,
  36. )
  37. from pandas.core.dtypes.cast import can_hold_element
  38. from pandas.core.dtypes.common import (
  39. is_categorical_dtype,
  40. is_dict_like,
  41. is_extension_array_dtype,
  42. is_object_dtype,
  43. is_scalar,
  44. )
  45. from pandas.core.dtypes.generic import (
  46. ABCDataFrame,
  47. ABCIndex,
  48. ABCSeries,
  49. )
  50. from pandas.core.dtypes.missing import (
  51. isna,
  52. remove_na_arraylike,
  53. )
  54. from pandas.core import (
  55. algorithms,
  56. nanops,
  57. ops,
  58. )
  59. from pandas.core.accessor import DirNamesMixin
  60. from pandas.core.arraylike import OpsMixin
  61. from pandas.core.arrays import ExtensionArray
  62. from pandas.core.construction import (
  63. ensure_wrapped_if_datetimelike,
  64. extract_array,
  65. )
  66. if TYPE_CHECKING:
  67. from pandas._typing import (
  68. DropKeep,
  69. NumpySorter,
  70. NumpyValueArrayLike,
  71. ScalarLike_co,
  72. )
  73. from pandas import (
  74. Categorical,
  75. Index,
  76. Series,
  77. )
  78. _shared_docs: dict[str, str] = {}
  79. _indexops_doc_kwargs = {
  80. "klass": "IndexOpsMixin",
  81. "inplace": "",
  82. "unique": "IndexOpsMixin",
  83. "duplicated": "IndexOpsMixin",
  84. }
  85. _T = TypeVar("_T", bound="IndexOpsMixin")
  86. class PandasObject(DirNamesMixin):
  87. """
  88. Baseclass for various pandas objects.
  89. """
  90. # results from calls to methods decorated with cache_readonly get added to _cache
  91. _cache: dict[str, Any]
  92. @property
  93. def _constructor(self):
  94. """
  95. Class constructor (for this class it's just `__class__`.
  96. """
  97. return type(self)
  98. def __repr__(self) -> str:
  99. """
  100. Return a string representation for a particular object.
  101. """
  102. # Should be overwritten by base classes
  103. return object.__repr__(self)
  104. def _reset_cache(self, key: str | None = None) -> None:
  105. """
  106. Reset cached properties. If ``key`` is passed, only clears that key.
  107. """
  108. if not hasattr(self, "_cache"):
  109. return
  110. if key is None:
  111. self._cache.clear()
  112. else:
  113. self._cache.pop(key, None)
  114. def __sizeof__(self) -> int:
  115. """
  116. Generates the total memory usage for an object that returns
  117. either a value or Series of values
  118. """
  119. memory_usage = getattr(self, "memory_usage", None)
  120. if memory_usage:
  121. mem = memory_usage(deep=True) # pylint: disable=not-callable
  122. return int(mem if is_scalar(mem) else mem.sum())
  123. # no memory_usage attribute, so fall back to object's 'sizeof'
  124. return super().__sizeof__()
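
# Illustrative sketch (not part of the original source): because __sizeof__
# above defers to memory_usage(deep=True) when it exists, sys.getsizeof on a
# Series reflects the deep memory of its values, e.g.:
#
#   >>> import sys
#   >>> import pandas as pd
#   >>> s = pd.Series(["a", "bb", "ccc"])
#   >>> sys.getsizeof(s) >= s.memory_usage(deep=True)  # doctest: +SKIP
#   True
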
class NoNewAttributesMixin:
    """
    Mixin which prevents adding new attributes.

    Prevents additional attributes via xxx.attribute = "something" after a
    call to `self.__freeze()`. Mainly used to prevent the user from using
    wrong attributes on an accessor (`Series.cat/.str/.dt`).

    If you really want to add a new attribute at a later time, you need to use
    `object.__setattr__(self, key, value)`.
    """

    def _freeze(self) -> None:
        """
        Prevents setting additional attributes.
        """
        object.__setattr__(self, "__frozen", True)

    # prevent adding any attribute via s.xxx.new_attribute = ...
    def __setattr__(self, key: str, value) -> None:
        # _cache is used by a decorator
        # We need to check both 1.) cls.__dict__ and 2.) getattr(self, key)
        # because
        # 1.) getattr is false for attributes that raise errors
        # 2.) cls.__dict__ doesn't traverse into base classes
        if getattr(self, "__frozen", False) and not (
            key == "_cache"
            or key in type(self).__dict__
            or getattr(self, key, None) is not None
        ):
            raise AttributeError(f"You cannot add any new attribute '{key}'")
        object.__setattr__(self, key, value)
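
# Illustrative sketch (not part of the original source): a minimal subclass
# demonstrating the freeze behavior documented above.
#
#   >>> class Frozen(NoNewAttributesMixin):
#   ...     def __init__(self) -> None:
#   ...         self.allowed = 1  # fine: not frozen yet
#   ...         self._freeze()
#   >>> obj = Frozen()
#   >>> obj.new_attr = 2  # doctest: +SKIP
#   AttributeError: You cannot add any new attribute 'new_attr'
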
class SelectionMixin(Generic[NDFrameT]):
    """
    Mixin implementing the selection & aggregation interface on a group-like
    object; sub-classes need to define: obj, exclusions.
    """

    obj: NDFrameT
    _selection: IndexLabel | None = None
    exclusions: frozenset[Hashable]
    _internal_names = ["_cache", "__setstate__"]
    _internal_names_set = set(_internal_names)

    @final
    @property
    def _selection_list(self):
        if not isinstance(
            self._selection, (list, tuple, ABCSeries, ABCIndex, np.ndarray)
        ):
            return [self._selection]
        return self._selection

    @cache_readonly
    def _selected_obj(self):
        if self._selection is None or isinstance(self.obj, ABCSeries):
            return self.obj
        else:
            return self.obj[self._selection]

    @final
    @cache_readonly
    def ndim(self) -> int:
        return self._selected_obj.ndim

    @final
    @cache_readonly
    def _obj_with_exclusions(self):
        if isinstance(self.obj, ABCSeries):
            return self.obj

        if self._selection is not None:
            return self.obj._getitem_nocopy(self._selection_list)

        if len(self.exclusions) > 0:
            # equivalent to `self.obj.drop(self.exclusions, axis=1)`
            # but this avoids consolidating and making a copy
            # TODO: following GH#45287 can we now use .drop directly without
            #  making a copy?
            return self.obj._drop_axis(self.exclusions, axis=1, only_slice=True)
        else:
            return self.obj

    def __getitem__(self, key):
        if self._selection is not None:
            raise IndexError(f"Column(s) {self._selection} already selected")

        if isinstance(key, (list, tuple, ABCSeries, ABCIndex, np.ndarray)):
            if len(self.obj.columns.intersection(key)) != len(set(key)):
                bad_keys = list(set(key).difference(self.obj.columns))
                raise KeyError(f"Columns not found: {str(bad_keys)[1:-1]}")
            return self._gotitem(list(key), ndim=2)

        else:
            if key not in self.obj:
                raise KeyError(f"Column not found: {key}")
            ndim = self.obj[key].ndim
            return self._gotitem(key, ndim=ndim)

    def _gotitem(self, key, ndim: int, subset=None):
        """
        Sub-classes to define. Return a sliced object.

        Parameters
        ----------
        key : str / list of selections
        ndim : {1, 2}
            requested ndim of result
        subset : object, default None
            subset to act on
        """
        raise AbstractMethodError(self)

    def aggregate(self, func, *args, **kwargs):
        raise AbstractMethodError(self)

    agg = aggregate
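
# Illustrative sketch (not part of the original source): GroupBy objects are
# SelectionMixin subclasses, so column selection on a groupby goes through
# __getitem__ above, including its KeyError for missing columns.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": [1, 1, 2], "b": [3, 4, 5]})
#   >>> df.groupby("a")["b"].sum()   # scalar key -> _gotitem(..., ndim=1)
#   >>> df.groupby("a")["missing"]   # doctest: +SKIP
#   KeyError: 'Column not found: missing'
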
class IndexOpsMixin(OpsMixin):
    """
    Common ops mixin to support a unified interface / docs for Series / Index
    """

    # ndarray compatibility
    __array_priority__ = 1000
    _hidden_attrs: frozenset[str] = frozenset(
        ["tolist"]  # tolist is not deprecated, just suppressed in the __dir__
    )

    @property
    def dtype(self) -> DtypeObj:
        # must be defined here as a property for mypy
        raise AbstractMethodError(self)

    @property
    def _values(self) -> ExtensionArray | np.ndarray:
        # must be defined here as a property for mypy
        raise AbstractMethodError(self)

    @final
    def transpose(self: _T, *args, **kwargs) -> _T:
        """
        Return the transpose, which is by definition self.

        Returns
        -------
        %(klass)s
        """
        nv.validate_transpose(args, kwargs)
        return self

    T = property(
        transpose,
        doc="""
        Return the transpose, which is by definition self.
        """,
    )

    @property
    def shape(self) -> Shape:
        """
        Return a tuple of the shape of the underlying data.

        Examples
        --------
        >>> s = pd.Series([1, 2, 3])
        >>> s.shape
        (3,)
        """
        return self._values.shape

    def __len__(self) -> int:
        # We need this defined here for mypy
        raise AbstractMethodError(self)

    @property
    def ndim(self) -> Literal[1]:
        """
        Number of dimensions of the underlying data, by definition 1.
        """
        return 1

    @final
    def item(self):
        """
        Return the first element of the underlying data as a Python scalar.

        Returns
        -------
        scalar
            The first element of %(klass)s.

        Raises
        ------
        ValueError
            If the data is not length-1.
        """
        if len(self) == 1:
            return next(iter(self))
        raise ValueError("can only convert an array of size 1 to a Python scalar")

    @property
    def nbytes(self) -> int:
        """
        Return the number of bytes in the underlying data.
        """
        return self._values.nbytes

    @property
    def size(self) -> int:
        """
        Return the number of elements in the underlying data.
        """
        return len(self._values)

    @property
    def array(self) -> ExtensionArray:
        """
        The ExtensionArray of the data backing this Series or Index.

        Returns
        -------
        ExtensionArray
            An ExtensionArray of the values stored within. For extension
            types, this is the actual array. For NumPy native types, this
            is a thin (no copy) wrapper around :class:`numpy.ndarray`.

            ``.array`` differs from ``.values``, which may require converting
            the data to a different form.

        See Also
        --------
        Index.to_numpy : Similar method that always returns a NumPy array.
        Series.to_numpy : Similar method that always returns a NumPy array.

        Notes
        -----
        This table lays out the different array types for each extension
        dtype within pandas.

        ================== =============================
        dtype              array type
        ================== =============================
        category           Categorical
        period             PeriodArray
        interval           IntervalArray
        IntegerNA          IntegerArray
        string             StringArray
        boolean            BooleanArray
        datetime64[ns, tz] DatetimeArray
        ================== =============================

        For any 3rd-party extension types, the array type will be an
        ExtensionArray.

        For all remaining dtypes ``.array`` will be a
        :class:`arrays.NumpyExtensionArray` wrapping the actual ndarray
        stored within. If you absolutely need a NumPy array (possibly with
        copying / coercing data), then use :meth:`Series.to_numpy` instead.

        Examples
        --------
        For regular NumPy types like int, and float, a PandasArray
        is returned.

        >>> pd.Series([1, 2, 3]).array
        <PandasArray>
        [1, 2, 3]
        Length: 3, dtype: int64

        For extension types, like Categorical, the actual ExtensionArray
        is returned

        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.array
        ['a', 'b', 'a']
        Categories (2, object): ['a', 'b']
        """
        raise AbstractMethodError(self)

    @final
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
        **kwargs,
    ) -> np.ndarray:
        """
        A NumPy ndarray representing the values in this Series or Index.

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.
        na_value : Any, optional
            The value to use for missing values. The default value depends
            on `dtype` and the type of the array.
        **kwargs
            Additional keywords passed through to the ``to_numpy`` method
            of the underlying array (for extension arrays).

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.array : Get the actual data stored within.
        Index.array : Get the actual data stored within.
        DataFrame.to_numpy : Similar method for DataFrame.

        Notes
        -----
        The returned array will be the same up to equality (values equal
        in `self` will be equal in the returned array; likewise for values
        that are not equal). When `self` contains an ExtensionArray, the
        dtype may be different. For example, for a category-dtype Series,
        ``to_numpy()`` will return a NumPy array and the categorical dtype
        will be lost.

        For NumPy dtypes, this will be a reference to the actual data stored
        in this Series or Index (assuming ``copy=False``). Modifying the result
        in place will modify the data stored in the Series or Index (not that
        we recommend doing that).

        For extension types, ``to_numpy()`` *may* require copying data and
        coercing the result to a NumPy type (possibly object), which may be
        expensive. When you need a no-copy reference to the underlying data,
        :attr:`Series.array` should be used instead.

        This table lays out the different dtypes and default return types of
        ``to_numpy()`` for various dtypes within pandas.

        ================== ================================
        dtype              array type
        ================== ================================
        category[T]        ndarray[T] (same dtype as input)
        period             ndarray[object] (Periods)
        interval           ndarray[object] (Intervals)
        IntegerNA          ndarray[object]
        datetime64[ns]     datetime64[ns]
        datetime64[ns, tz] ndarray[object] (Timestamps)
        ================== ================================

        Examples
        --------
        >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a']))
        >>> ser.to_numpy()
        array(['a', 'b', 'a'], dtype=object)

        Specify the `dtype` to control how datetime-aware data is represented.
        Use ``dtype=object`` to return an ndarray of pandas :class:`Timestamp`
        objects, each with the correct ``tz``.

        >>> ser = pd.Series(pd.date_range('2000', periods=2, tz="CET"))
        >>> ser.to_numpy(dtype=object)
        array([Timestamp('2000-01-01 00:00:00+0100', tz='CET'),
               Timestamp('2000-01-02 00:00:00+0100', tz='CET')],
              dtype=object)

        Or ``dtype='datetime64[ns]'`` to return an ndarray of native
        datetime64 values. The values are converted to UTC and the timezone
        info is dropped.

        >>> ser.to_numpy(dtype="datetime64[ns]")
        ... # doctest: +ELLIPSIS
        array(['1999-12-31T23:00:00.000000000', '2000-01-01T23:00:00...'],
              dtype='datetime64[ns]')
        """
        if is_extension_array_dtype(self.dtype):
            return self.array.to_numpy(dtype, copy=copy, na_value=na_value, **kwargs)
        elif kwargs:
            bad_keys = list(kwargs.keys())[0]
            raise TypeError(
                f"to_numpy() got an unexpected keyword argument '{bad_keys}'"
            )

        if na_value is not lib.no_default:
            values = self._values
            if not can_hold_element(values, na_value):
                # if we can't hold the na_value asarray either makes a copy or we
                # error before modifying values. The asarray later on thus won't make
                # another copy
                values = np.asarray(values, dtype=dtype)
            else:
                values = values.copy()

            values[np.asanyarray(self.isna())] = na_value
        else:
            values = self._values

        result = np.asarray(values, dtype=dtype)

        if (copy and na_value is lib.no_default) or (
            not copy and using_copy_on_write()
        ):
            if np.shares_memory(self._values[:2], result[:2]):
                # Take slices to improve performance of check
                if using_copy_on_write() and not copy:
                    result = result.view()
                    result.flags.writeable = False
                else:
                    result = result.copy()

        return result
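
    # Illustrative sketch (not part of the original source): passing
    # ``na_value`` replaces missing entries in the returned ndarray; when the
    # existing values cannot hold the replacement, the code above converts
    # (and thereby copies) first.
    #
    #   >>> import numpy as np
    #   >>> import pandas as pd
    #   >>> pd.Series([1.0, np.nan]).to_numpy(na_value=0.0)  # doctest: +SKIP
    #   array([1., 0.])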
    @final
    @property
    def empty(self) -> bool:
        return not self.size

    def max(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs):
        """
        Return the maximum value of the Index.

        Parameters
        ----------
        axis : int, optional
            For compatibility with NumPy. Only 0 or None are allowed.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        scalar
            Maximum value.

        See Also
        --------
        Index.min : Return the minimum value in an Index.
        Series.max : Return the maximum value in a Series.
        DataFrame.max : Return the maximum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.max()
        3

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.max()
        'c'

        For a MultiIndex, the maximum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.max()
        ('b', 2)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_max(args, kwargs)
        return nanops.nanmax(self._values, skipna=skipna)

    @doc(op="max", oppose="min", value="largest")
    def argmax(
        self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
    ) -> int:
        """
        Return int position of the {value} value in the Series.

        If the {op}imum is achieved in multiple locations,
        the first row position is returned.

        Parameters
        ----------
        axis : {{None}}
            Unused. Parameter needed for compatibility with DataFrame.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        int
            Row position of the {op}imum value.

        See Also
        --------
        Series.arg{op} : Return position of the {op}imum value.
        Series.arg{oppose} : Return position of the {oppose}imum value.
        numpy.ndarray.arg{op} : Equivalent method for numpy arrays.
        Series.idxmax : Return index label of the maximum values.
        Series.idxmin : Return index label of the minimum values.

        Examples
        --------
        Consider dataset containing cereal calories

        >>> s = pd.Series({{'Corn Flakes': 100.0, 'Almond Delight': 110.0,
        ...                'Cinnamon Toast Crunch': 120.0, 'Cocoa Puff': 110.0}})
        >>> s
        Corn Flakes              100.0
        Almond Delight           110.0
        Cinnamon Toast Crunch    120.0
        Cocoa Puff               110.0
        dtype: float64

        >>> s.argmax()
        2
        >>> s.argmin()
        0

        The maximum cereal calories is the third element and
        the minimum cereal calories is the first element,
        since the series is zero-indexed.
        """
        delegate = self._values
        nv.validate_minmax_axis(axis)
        skipna = nv.validate_argmax_with_skipna(skipna, args, kwargs)

        if isinstance(delegate, ExtensionArray):
            if not skipna and delegate.isna().any():
                return -1
            else:
                return delegate.argmax()
        else:
            # error: Incompatible return value type (got "Union[int, ndarray]", expected
            # "int")
            return nanops.nanargmax(  # type: ignore[return-value]
                delegate, skipna=skipna
            )
    def min(self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs):
        """
        Return the minimum value of the Index.

        Parameters
        ----------
        axis : {None}
            Dummy argument for consistency with Series.
        skipna : bool, default True
            Exclude NA/null values when showing the result.
        *args, **kwargs
            Additional arguments and keywords for compatibility with NumPy.

        Returns
        -------
        scalar
            Minimum value.

        See Also
        --------
        Index.max : Return the maximum value of the object.
        Series.min : Return the minimum value in a Series.
        DataFrame.min : Return the minimum values in a DataFrame.

        Examples
        --------
        >>> idx = pd.Index([3, 2, 1])
        >>> idx.min()
        1

        >>> idx = pd.Index(['c', 'b', 'a'])
        >>> idx.min()
        'a'

        For a MultiIndex, the minimum is determined lexicographically.

        >>> idx = pd.MultiIndex.from_product([('a', 'b'), (2, 1)])
        >>> idx.min()
        ('a', 1)
        """
        nv.validate_minmax_axis(axis)
        nv.validate_min(args, kwargs)
        return nanops.nanmin(self._values, skipna=skipna)

    @doc(argmax, op="min", oppose="max", value="smallest")
    def argmin(
        self, axis: AxisInt | None = None, skipna: bool = True, *args, **kwargs
    ) -> int:
        delegate = self._values
        nv.validate_minmax_axis(axis)
        skipna = nv.validate_argmin_with_skipna(skipna, args, kwargs)

        if isinstance(delegate, ExtensionArray):
            if not skipna and delegate.isna().any():
                return -1
            else:
                return delegate.argmin()
        else:
            # error: Incompatible return value type (got "Union[int, ndarray]", expected
            # "int")
            return nanops.nanargmin(  # type: ignore[return-value]
                delegate, skipna=skipna
            )

    def tolist(self):
        """
        Return a list of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period)

        Returns
        -------
        list

        See Also
        --------
        numpy.ndarray.tolist : Return the array as an a.ndim-levels deep
            nested list of Python scalars.
        """
        return self._values.tolist()

    to_list = tolist

    def __iter__(self) -> Iterator:
        """
        Return an iterator of the values.

        These are each a scalar type, which is a Python scalar
        (for str, int, float) or a pandas scalar
        (for Timestamp/Timedelta/Interval/Period)

        Returns
        -------
        iterator
        """
        # We are explicitly making element iterators.
        if not isinstance(self._values, np.ndarray):
            # Check type instead of dtype to catch DTA/TDA
            return iter(self._values)
        else:
            return map(self._values.item, range(self._values.size))

    @cache_readonly
    def hasnans(self) -> bool:
        """
        Return True if there are any NaNs.

        Enables various performance speedups.

        Returns
        -------
        bool
        """
        # error: Item "bool" of "Union[bool, ndarray[Any, dtype[bool_]], NDFrame]"
        # has no attribute "any"
        return bool(isna(self).any())  # type: ignore[union-attr]

    def isna(self) -> npt.NDArray[np.bool_]:
        return isna(self._values)

    def _reduce(
        self,
        op,
        name: str,
        *,
        axis: Axis = 0,
        skipna: bool = True,
        numeric_only=None,
        filter_type=None,
        **kwds,
    ):
        """
        Perform the reduction type operation if we can.
        """
        func = getattr(self, name, None)
        if func is None:
            raise TypeError(
                f"{type(self).__name__} cannot perform the operation {name}"
            )
        return func(skipna=skipna, **kwds)
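
    # Illustrative sketch (not part of the original source): _reduce
    # dispatches on ``name`` to the bound reduction method, so ``op`` is
    # unused here, and objects lacking the named method raise TypeError.
    #
    #   >>> import pandas as pd
    #   >>> pd.Index([1, 2, 3])._reduce(None, "sum")  # doctest: +SKIP
    #   6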
    @final
    def _map_values(self, mapper, na_action=None):
        """
        An internal function that maps values using the input
        correspondence (which can be a dict, Series, or function).

        Parameters
        ----------
        mapper : function, dict, or Series
            The input correspondence object
        na_action : {None, 'ignore'}
            If 'ignore', propagate NA values, without passing them to the
            mapping function

        Returns
        -------
        Union[Index, MultiIndex], inferred
            The output of the mapping function applied to the index.
            If the function returns a tuple with more than one element
            a MultiIndex will be returned.
        """
        # we can fastpath dict/Series to an efficient map
        # as we know that we are not going to have to yield
        # python types
        if is_dict_like(mapper):
            if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
                # If a dictionary subclass defines a default value method,
                # convert mapper to a lookup function (GH #15999).
                dict_with_default = mapper
                mapper = lambda x: dict_with_default[
                    np.nan if isinstance(x, float) and np.isnan(x) else x
                ]
            else:
                # Dictionary does not have a default. Thus it's safe to
                # convert to a Series for efficiency.
                # we specify the keys here to handle the
                # possibility that they are tuples

                # The return value of mapping with an empty mapper is
                # expected to be pd.Series(np.nan, ...). As np.nan is
                # of dtype float64 the return value of this method should
                # be float64 as well
                from pandas import Series

                if len(mapper) == 0:
                    mapper = Series(mapper, dtype=np.float64)
                else:
                    mapper = Series(mapper)

        if isinstance(mapper, ABCSeries):
            if na_action not in (None, "ignore"):
                msg = (
                    "na_action must either be 'ignore' or None, "
                    f"{na_action} was passed"
                )
                raise ValueError(msg)

            if na_action == "ignore":
                mapper = mapper[mapper.index.notna()]

            # Since values were input this means we came from either
            # a dict or a series and mapper should be an index
            if is_categorical_dtype(self.dtype):
                # use the built in categorical series mapper which saves
                # time by mapping the categories instead of all values
                cat = cast("Categorical", self._values)
                return cat.map(mapper)

            values = self._values

            indexer = mapper.index.get_indexer(values)
            new_values = algorithms.take_nd(mapper._values, indexer)

            return new_values

        # we must convert to python types
        if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
            # GH#23179 some EAs do not have `map`
            values = self._values
            if na_action is not None:
                raise NotImplementedError
            map_f = lambda values, f: values.map(f)
        else:
            values = self._values.astype(object)
            if na_action == "ignore":
                map_f = lambda values, f: lib.map_infer_mask(
                    values, f, isna(values).view(np.uint8)
                )
            elif na_action is None:
                map_f = lib.map_infer
            else:
                msg = (
                    "na_action must either be 'ignore' or None, "
                    f"{na_action} was passed"
                )
                raise ValueError(msg)

        # mapper is a function
        new_values = map_f(values, mapper)

        return new_values
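
    # Illustrative sketch (not part of the original source): _map_values backs
    # the public Series.map / Index.map, so the dict -> Series fastpath and
    # ``na_action`` handling above are observable there.
    #
    #   >>> import numpy as np
    #   >>> import pandas as pd
    #   >>> s = pd.Series(["cat", "dog", np.nan])
    #   >>> s.map({"cat": "kitten"})              # dict fastpath; misses -> NaN
    #   >>> s.map(str.upper, na_action="ignore")  # NaN not passed to the function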
    @final
    def value_counts(
        self,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        bins=None,
        dropna: bool = True,
    ) -> Series:
        """
        Return a Series containing counts of unique values.

        The resulting object will be in descending order so that the
        first element is the most frequently-occurring element.
        Excludes NA values by default.

        Parameters
        ----------
        normalize : bool, default False
            If True then the object returned will contain the relative
            frequencies of the unique values.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        bins : int, optional
            Rather than count values, group them into half-open bins,
            a convenience for ``pd.cut``, only works with numeric data.
        dropna : bool, default True
            Don't include counts of NaN.

        Returns
        -------
        Series

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.count: Number of non-NA elements in a DataFrame.
        DataFrame.value_counts: Equivalent method on DataFrames.

        Examples
        --------
        >>> index = pd.Index([3, 1, 2, 3, 4, np.nan])
        >>> index.value_counts()
        3.0    2
        1.0    1
        2.0    1
        4.0    1
        Name: count, dtype: int64

        With `normalize` set to `True`, returns the relative frequency by
        dividing all values by the sum of values.

        >>> s = pd.Series([3, 1, 2, 3, 4, np.nan])
        >>> s.value_counts(normalize=True)
        3.0    0.4
        1.0    0.2
        2.0    0.2
        4.0    0.2
        Name: proportion, dtype: float64

        **bins**

        Bins can be useful for going from a continuous variable to a
        categorical variable; instead of counting unique
        apparitions of values, divide the index in the specified
        number of half-open bins.

        >>> s.value_counts(bins=3)
        (0.996, 2.0]    2
        (2.0, 3.0]      2
        (3.0, 4.0]      1
        Name: count, dtype: int64

        **dropna**

        With `dropna` set to `False` we can also see NaN index values.

        >>> s.value_counts(dropna=False)
        3.0    2
        1.0    1
        2.0    1
        4.0    1
        NaN    1
        Name: count, dtype: int64
        """
        return algorithms.value_counts(
            self,
            sort=sort,
            ascending=ascending,
            normalize=normalize,
            bins=bins,
            dropna=dropna,
        )

    def unique(self):
        values = self._values
        if not isinstance(values, np.ndarray):
            # i.e. ExtensionArray
            result = values.unique()
        else:
            result = algorithms.unique1d(values)
        return result

    @final
    def nunique(self, dropna: bool = True) -> int:
        """
        Return number of unique elements in the object.

        Excludes NA values by default.

        Parameters
        ----------
        dropna : bool, default True
            Don't include NaN in the count.

        Returns
        -------
        int

        See Also
        --------
        DataFrame.nunique: Method nunique for DataFrame.
        Series.count: Count non-NA/null observations in the Series.

        Examples
        --------
        >>> s = pd.Series([1, 3, 5, 7, 7])
        >>> s
        0    1
        1    3
        2    5
        3    7
        4    7
        dtype: int64

        >>> s.nunique()
        4
        """
        uniqs = self.unique()
        if dropna:
            uniqs = remove_na_arraylike(uniqs)
        return len(uniqs)

    @property
    def is_unique(self) -> bool:
        """
        Return boolean if values in the object are unique.

        Returns
        -------
        bool
        """
        return self.nunique(dropna=False) == len(self)

    @property
    def is_monotonic_increasing(self) -> bool:
        """
        Return boolean if values in the object are monotonically increasing.

        Returns
        -------
        bool
        """
        from pandas import Index

        return Index(self).is_monotonic_increasing

    @property
    def is_monotonic_decreasing(self) -> bool:
        """
        Return boolean if values in the object are monotonically decreasing.

        Returns
        -------
        bool
        """
        from pandas import Index

        return Index(self).is_monotonic_decreasing
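
    # Illustrative sketch (not part of the original source): both monotonic
    # properties delegate to Index and are non-strict (ties allowed).
    #
    #   >>> import pandas as pd
    #   >>> pd.Series([1, 2, 2]).is_monotonic_increasing
    #   True
    #   >>> pd.Series([1, 2, 2]).is_unique
    #   False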
    @final
    def _memory_usage(self, deep: bool = False) -> int:
        """
        Memory usage of the values.

        Parameters
        ----------
        deep : bool, default False
            Introspect the data deeply, interrogate
            `object` dtypes for system-level memory consumption.

        Returns
        -------
        bytes used

        See Also
        --------
        numpy.ndarray.nbytes : Total bytes consumed by the elements of the
            array.

        Notes
        -----
        Memory usage does not include memory consumed by elements that
        are not components of the array if deep=False or if used on PyPy
        """
        if hasattr(self.array, "memory_usage"):
            return self.array.memory_usage(  # pyright: ignore[reportGeneralTypeIssues]
                deep=deep,
            )

        v = self.array.nbytes
        if deep and is_object_dtype(self) and not PYPY:
            values = cast(np.ndarray, self._values)
            v += lib.memory_usage_of_objects(values)
        return v

    @doc(
        algorithms.factorize,
        values="",
        order="",
        size_hint="",
        sort=textwrap.dedent(
            """\
            sort : bool, default False
                Sort `uniques` and shuffle `codes` to maintain the
                relationship.
            """
        ),
    )
    def factorize(
        self,
        sort: bool = False,
        use_na_sentinel: bool = True,
    ) -> tuple[npt.NDArray[np.intp], Index]:
        codes, uniques = algorithms.factorize(
            self._values, sort=sort, use_na_sentinel=use_na_sentinel
        )
        if uniques.dtype == np.float16:
            uniques = uniques.astype(np.float32)

        if isinstance(self, ABCIndex):
            # preserve e.g. MultiIndex
            uniques = self._constructor(uniques)
        else:
            from pandas import Index

            uniques = Index(uniques)
        return codes, uniques
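
    # Illustrative sketch (not part of the original source): factorize returns
    # integer codes plus an Index of uniques; with use_na_sentinel=True the
    # code -1 marks missing values.
    #
    #   >>> import numpy as np
    #   >>> import pandas as pd
    #   >>> codes, uniques = pd.Series(["b", "a", "b", np.nan]).factorize()
    #   >>> codes
    #   array([ 0,  1,  0, -1])
    #   >>> uniques
    #   Index(['b', 'a'], dtype='object')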
    _shared_docs[
        "searchsorted"
    ] = """
        Find indices where elements should be inserted to maintain order.

        Find the indices into a sorted {klass} `self` such that, if the
        corresponding elements in `value` were inserted before the indices,
        the order of `self` would be preserved.

        .. note::

            The {klass} *must* be monotonically sorted, otherwise
            wrong locations will likely be returned. Pandas does *not*
            check this for you.

        Parameters
        ----------
        value : array-like or scalar
            Values to insert into `self`.
        side : {{'left', 'right'}}, optional
            If 'left', the index of the first suitable location found is given.
            If 'right', return the last such index. If there is no suitable
            index, return either 0 or N (where N is the length of `self`).
        sorter : 1-D array-like, optional
            Optional array of integer indices that sort `self` into ascending
            order. They are typically the result of ``np.argsort``.

        Returns
        -------
        int or array of int
            A scalar or array of insertion points with the
            same shape as `value`.

        See Also
        --------
        sort_values : Sort by the values along either axis.
        numpy.searchsorted : Similar method from NumPy.

        Notes
        -----
        Binary search is used to find the required insertion points.

        Examples
        --------
        >>> ser = pd.Series([1, 2, 3])
        >>> ser
        0    1
        1    2
        2    3
        dtype: int64

        >>> ser.searchsorted(4)
        3

        >>> ser.searchsorted([0, 4])
        array([0, 3])

        >>> ser.searchsorted([1, 3], side='left')
        array([0, 2])

        >>> ser.searchsorted([1, 3], side='right')
        array([1, 3])

        >>> ser = pd.Series(pd.to_datetime(['3/11/2000', '3/12/2000', '3/13/2000']))
        >>> ser
        0   2000-03-11
        1   2000-03-12
        2   2000-03-13
        dtype: datetime64[ns]

        >>> ser.searchsorted('3/14/2000')
        3

        >>> ser = pd.Categorical(
        ...     ['apple', 'bread', 'bread', 'cheese', 'milk'], ordered=True
        ... )
        >>> ser
        ['apple', 'bread', 'bread', 'cheese', 'milk']
        Categories (4, object): ['apple' < 'bread' < 'cheese' < 'milk']

        >>> ser.searchsorted('bread')
        1

        >>> ser.searchsorted(['bread'], side='right')
        array([3])

        If the values are not monotonically sorted, wrong locations
        may be returned:

        >>> ser = pd.Series([2, 1, 3])
        >>> ser
        0    2
        1    1
        2    3
        dtype: int64

        >>> ser.searchsorted(1)  # doctest: +SKIP
        0  # wrong result, correct would be 1
        """

    # This overload is needed so that the call to searchsorted in
    # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result

    @overload
    # The following ignore is also present in numpy/__init__.pyi
    # Possibly a mypy bug??
    # error: Overloaded function signatures 1 and 2 overlap with incompatible
    # return types [misc]
    def searchsorted(  # type: ignore[misc]
        self,
        value: ScalarLike_co,
        side: Literal["left", "right"] = ...,
        sorter: NumpySorter = ...,
    ) -> np.intp:
        ...

    @overload
    def searchsorted(
        self,
        value: npt.ArrayLike | ExtensionArray,
        side: Literal["left", "right"] = ...,
        sorter: NumpySorter = ...,
    ) -> npt.NDArray[np.intp]:
        ...

    @doc(_shared_docs["searchsorted"], klass="Index")
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if isinstance(value, ABCDataFrame):
            msg = (
                "Value must be 1-D array-like or scalar, "
                f"{type(value).__name__} is not supported"
            )
            raise ValueError(msg)

        values = self._values
        if not isinstance(values, np.ndarray):
            # Going through EA.searchsorted directly improves performance GH#38083
            return values.searchsorted(value, side=side, sorter=sorter)

        return algorithms.searchsorted(
            values,
            value,
            side=side,
            sorter=sorter,
        )

    def drop_duplicates(self, *, keep: DropKeep = "first"):
        duplicated = self._duplicated(keep=keep)
        # error: Value of type "IndexOpsMixin" is not indexable
        return self[~duplicated]  # type: ignore[index]

    @final
    def _duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
        return algorithms.duplicated(self._values, keep=keep)
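
    # Illustrative sketch (not part of the original source): drop_duplicates
    # masks with ~_duplicated, so keep="first" retains first occurrences while
    # keep=False drops every duplicated value entirely.
    #
    #   >>> import pandas as pd
    #   >>> pd.Series([1, 2, 2, 3]).drop_duplicates().tolist()
    #   [1, 2, 3]
    #   >>> pd.Series([1, 2, 2, 3]).drop_duplicates(keep=False).tolist()
    #   [1, 3]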
    def _arith_method(self, other, op):
        res_name = ops.get_op_result_name(self, other)

        lvalues = self._values
        rvalues = extract_array(other, extract_numpy=True, extract_range=True)
        rvalues = ops.maybe_prepare_scalar_for_op(rvalues, lvalues.shape)
        rvalues = ensure_wrapped_if_datetimelike(rvalues)

        with np.errstate(all="ignore"):
            result = ops.arithmetic_op(lvalues, rvalues, op)

        return self._construct_result(result, name=res_name)

    def _construct_result(self, result, name):
        """
        Construct an appropriately-wrapped result from the ArrayLike result
        of an arithmetic-like operation.
        """
        raise AbstractMethodError(self)