  1. """
  2. Generic data algorithms. This module is experimental at the moment and not
  3. intended for public consumption
  4. """
  5. from __future__ import annotations
  6. import operator
  7. from textwrap import dedent
  8. from typing import (
  9. TYPE_CHECKING,
  10. Literal,
  11. cast,
  12. )
  13. import warnings
  14. import numpy as np
  15. from pandas._libs import (
  16. algos,
  17. hashtable as htable,
  18. iNaT,
  19. lib,
  20. )
  21. from pandas._typing import (
  22. AnyArrayLike,
  23. ArrayLike,
  24. AxisInt,
  25. DtypeObj,
  26. TakeIndexer,
  27. npt,
  28. )
  29. from pandas.util._decorators import doc
  30. from pandas.util._exceptions import find_stack_level
  31. from pandas.core.dtypes.cast import (
  32. construct_1d_object_array_from_listlike,
  33. infer_dtype_from_array,
  34. np_find_common_type,
  35. )
  36. from pandas.core.dtypes.common import (
  37. ensure_float64,
  38. ensure_object,
  39. ensure_platform_int,
  40. is_array_like,
  41. is_bool_dtype,
  42. is_categorical_dtype,
  43. is_complex_dtype,
  44. is_extension_array_dtype,
  45. is_float_dtype,
  46. is_integer,
  47. is_integer_dtype,
  48. is_list_like,
  49. is_numeric_dtype,
  50. is_object_dtype,
  51. is_scalar,
  52. is_signed_integer_dtype,
  53. needs_i8_conversion,
  54. )
  55. from pandas.core.dtypes.concat import concat_compat
  56. from pandas.core.dtypes.dtypes import (
  57. BaseMaskedDtype,
  58. ExtensionDtype,
  59. PandasDtype,
  60. )
  61. from pandas.core.dtypes.generic import (
  62. ABCDatetimeArray,
  63. ABCExtensionArray,
  64. ABCIndex,
  65. ABCMultiIndex,
  66. ABCSeries,
  67. ABCTimedeltaArray,
  68. )
  69. from pandas.core.dtypes.missing import (
  70. isna,
  71. na_value_for_dtype,
  72. )
  73. from pandas.core.array_algos.take import take_nd
  74. from pandas.core.construction import (
  75. array as pd_array,
  76. ensure_wrapped_if_datetimelike,
  77. extract_array,
  78. )
  79. from pandas.core.indexers import validate_indices
  80. if TYPE_CHECKING:
  81. from pandas._typing import (
  82. NumpySorter,
  83. NumpyValueArrayLike,
  84. )
  85. from pandas import (
  86. Categorical,
  87. Index,
  88. Series,
  89. )
  90. from pandas.core.arrays import (
  91. BaseMaskedArray,
  92. ExtensionArray,
  93. )


# --------------- #
# dtype access    #
# --------------- #


def _ensure_data(values: ArrayLike) -> np.ndarray:
    """
    routine to ensure that our data is of the correct
    input dtype for lower-level routines

    This will coerce:
    - ints -> int64
    - uint -> uint64
    - bool -> uint8
    - datetimelike -> i8
    - datetime64tz -> i8 (in local tz)
    - categorical -> codes

    Parameters
    ----------
    values : np.ndarray or ExtensionArray

    Returns
    -------
    np.ndarray
    """
    if not isinstance(values, ABCMultiIndex):
        # extract_array would raise
        values = extract_array(values, extract_numpy=True)

    if is_object_dtype(values.dtype):
        return ensure_object(np.asarray(values))

    elif isinstance(values.dtype, BaseMaskedDtype):
        # i.e. BooleanArray, FloatingArray, IntegerArray
        values = cast("BaseMaskedArray", values)
        if not values._hasna:
            # No pd.NAs -> We can avoid an object-dtype cast (and copy) GH#41816
            # recurse to avoid re-implementing logic for e.g. bool -> uint8
            return _ensure_data(values._data)
        return np.asarray(values)

    elif is_categorical_dtype(values.dtype):
        # NB: cases that go through here should NOT be using _reconstruct_data
        # on the back-end.
        values = cast("Categorical", values)
        return values.codes

    elif is_bool_dtype(values.dtype):
        if isinstance(values, np.ndarray):
            # i.e. actually dtype == np.dtype("bool")
            return np.asarray(values).view("uint8")
        else:
            # e.g. Sparse[bool, False]  # TODO: no test cases get here
            return np.asarray(values).astype("uint8", copy=False)

    elif is_integer_dtype(values.dtype):
        return np.asarray(values)

    elif is_float_dtype(values.dtype):
        # Note: checking `values.dtype == "float128"` raises on Windows and 32bit
        # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]"
        # has no attribute "itemsize"
        if values.dtype.itemsize in [2, 12, 16]:  # type: ignore[union-attr]
            # we don't (yet) have float128 hashtable support
            return ensure_float64(values)
        return np.asarray(values)

    elif is_complex_dtype(values.dtype):
        return cast(np.ndarray, values)

    # datetimelike
    elif needs_i8_conversion(values.dtype):
        npvalues = values.view("i8")
        npvalues = cast(np.ndarray, npvalues)
        return npvalues

    # we have failed, return object
    values = np.asarray(values, dtype=object)
    return ensure_object(values)
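

# Illustrative sketch (not part of the module's API surface): a couple of the
# coercions listed in ``_ensure_data``'s docstring; the exact reprs assume a
# 64-bit platform.
#
#   >>> _ensure_data(np.array([True, False]))  # bool -> uint8
#   array([1, 0], dtype=uint8)
#   >>> _ensure_data(np.array(["2016-01-01"], dtype="M8[ns]"))  # datetimelike -> i8
#   array([1451606400000000000])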


def _reconstruct_data(
    values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
) -> ArrayLike:
    """
    reverse of _ensure_data

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
    dtype : np.dtype or ExtensionDtype
    original : AnyArrayLike

    Returns
    -------
    ExtensionArray or np.ndarray
    """
    if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
        # Catch DatetimeArray/TimedeltaArray
        return values

    if not isinstance(dtype, np.dtype):
        # i.e. ExtensionDtype; note we have ruled out above the possibility
        # that values.dtype == dtype
        cls = dtype.construct_array_type()
        values = cls._from_sequence(values, dtype=dtype)
    else:
        values = values.astype(dtype, copy=False)

    return values


def _ensure_arraylike(values) -> ArrayLike:
    """
    ensure that we are arraylike if not already
    """
    if not is_array_like(values):
        inferred = lib.infer_dtype(values, skipna=False)
        if inferred in ["mixed", "string", "mixed-integer"]:
            # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
            if isinstance(values, tuple):
                values = list(values)
            values = construct_1d_object_array_from_listlike(values)
        else:
            values = np.asarray(values)
    return values


_hashtables = {
    "complex128": htable.Complex128HashTable,
    "complex64": htable.Complex64HashTable,
    "float64": htable.Float64HashTable,
    "float32": htable.Float32HashTable,
    "uint64": htable.UInt64HashTable,
    "uint32": htable.UInt32HashTable,
    "uint16": htable.UInt16HashTable,
    "uint8": htable.UInt8HashTable,
    "int64": htable.Int64HashTable,
    "int32": htable.Int32HashTable,
    "int16": htable.Int16HashTable,
    "int8": htable.Int8HashTable,
    "string": htable.StringHashTable,
    "object": htable.PyObjectHashTable,
}


def _get_hashtable_algo(values: np.ndarray):
    """
    Parameters
    ----------
    values : np.ndarray

    Returns
    -------
    htable : HashTable subclass
    values : ndarray
    """
    values = _ensure_data(values)

    ndtype = _check_object_for_strings(values)
    hashtable = _hashtables[ndtype]
    return hashtable, values


def _check_object_for_strings(values: np.ndarray) -> str:
    """
    Check if we can use string hashtable instead of object hashtable.

    Parameters
    ----------
    values : ndarray

    Returns
    -------
    str
    """
    ndtype = values.dtype.name
    if ndtype == "object":
        # it's cheaper to use a String Hash Table than Object; we infer
        # including nulls because that is the only difference between
        # StringHashTable and ObjectHashtable
        if lib.infer_dtype(values, skipna=False) in ["string"]:
            ndtype = "string"
    return ndtype


# --------------- #
# top-level algos #
# --------------- #


def unique(values):
    """
    Return unique values based on a hash table.

    Uniques are returned in order of appearance. This does NOT sort.

    Significantly faster than numpy.unique for long enough sequences.
    Includes NA values.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    numpy.ndarray or ExtensionArray

        The return can be:

        * Index : when the input is an Index
        * Categorical : when the input is a Categorical dtype
        * ndarray : when the input is a Series/ndarray

    See Also
    --------
    Index.unique : Return unique values from an Index.
    Series.unique : Return unique values of Series object.

    Examples
    --------
    >>> pd.unique(pd.Series([2, 1, 3, 3]))
    array([2, 1, 3])

    >>> pd.unique(pd.Series([2] + [1] * 5))
    array([2, 1])

    >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
    array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')

    >>> pd.unique(
    ...     pd.Series(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    <DatetimeArray>
    ['2016-01-01 00:00:00-05:00']
    Length: 1, dtype: datetime64[ns, US/Eastern]

    >>> pd.unique(
    ...     pd.Index(
    ...         [
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...             pd.Timestamp("20160101", tz="US/Eastern"),
    ...         ]
    ...     )
    ... )
    DatetimeIndex(['2016-01-01 00:00:00-05:00'],
                  dtype='datetime64[ns, US/Eastern]',
                  freq=None)

    >>> pd.unique(list("baabc"))
    array(['b', 'a', 'c'], dtype=object)

    An unordered Categorical will return categories in the
    order of appearance.

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
    ['b', 'a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    An ordered Categorical preserves the category ordering.

    >>> pd.unique(
    ...     pd.Series(
    ...         pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
    ...     )
    ... )
    ['b', 'a', 'c']
    Categories (3, object): ['a' < 'b' < 'c']

    An array of tuples

    >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
    """
    return unique_with_mask(values)


def nunique_ints(values: ArrayLike) -> int:
    """
    Return the number of unique values for integer array-likes.

    Significantly faster than pandas.unique for long enough sequences.
    No checks are done to ensure input is integral.

    Parameters
    ----------
    values : 1d array-like

    Returns
    -------
    int : The number of unique values in ``values``
    """
    if len(values) == 0:
        return 0
    values = _ensure_data(values)
    # bincount requires intp
    result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
    return result


def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
    """See algorithms.unique for docs. Takes a mask for masked arrays."""
    values = _ensure_arraylike(values)
    if is_extension_array_dtype(values.dtype):
        # Dispatch to extension dtype's unique.
        return values.unique()

    original = values
    hashtable, values = _get_hashtable_algo(values)

    table = hashtable(len(values))
    if mask is None:
        uniques = table.unique(values)
        uniques = _reconstruct_data(uniques, original.dtype, original)
        return uniques

    else:
        uniques, mask = table.unique(values, mask=mask)
        uniques = _reconstruct_data(uniques, original.dtype, original)
        assert mask is not None  # for mypy
        return uniques, mask.astype("bool")


unique1d = unique


def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
    """
    Compute the isin boolean array.

    Parameters
    ----------
    comps : array-like
    values : array-like

    Returns
    -------
    ndarray[bool]
        Same length as `comps`.
    """
    if not is_list_like(comps):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(comps).__name__}]"
        )
    if not is_list_like(values):
        raise TypeError(
            "only list-like objects are allowed to be passed "
            f"to isin(), you passed a [{type(values).__name__}]"
        )

    if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
        orig_values = list(values)
        values = _ensure_arraylike(orig_values)

        if (
            len(values) > 0
            and is_numeric_dtype(values)
            and not is_signed_integer_dtype(comps)
        ):
            # GH#46485 Use object to avoid upcast to float64 later
            # TODO: Share with _find_common_type_compat
            values = construct_1d_object_array_from_listlike(orig_values)

    elif isinstance(values, ABCMultiIndex):
        # Avoid raising in extract_array
        values = np.array(values)
    else:
        values = extract_array(values, extract_numpy=True, extract_range=True)

    comps_array = _ensure_arraylike(comps)
    comps_array = extract_array(comps_array, extract_numpy=True)
    if not isinstance(comps_array, np.ndarray):
        # i.e. Extension Array
        return comps_array.isin(values)

    elif needs_i8_conversion(comps_array.dtype):
        # Dispatch to DatetimeLikeArrayMixin.isin
        return pd_array(comps_array).isin(values)
    elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps_array.dtype):
        # e.g. comps_array are integers and values are datetime64s
        return np.zeros(comps_array.shape, dtype=bool)
        # TODO: not quite right ... Sparse/Categorical
    elif needs_i8_conversion(values.dtype):
        return isin(comps_array, values.astype(object))

    elif isinstance(values.dtype, ExtensionDtype):
        return isin(np.asarray(comps_array), np.asarray(values))

    # GH16012
    # Ensure np.in1d doesn't get object types or it *may* throw an exception
    # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
    # in1d is faster for small sizes
    if (
        len(comps_array) > 1_000_000
        and len(values) <= 26
        and not is_object_dtype(comps_array)
    ):
        # If the values include nan we need to check for nan explicitly
        # since np.nan is not equal to np.nan
        if isna(values).any():

            def f(c, v):
                return np.logical_or(np.in1d(c, v), np.isnan(c))

        else:
            f = np.in1d

    else:
        common = np_find_common_type(values.dtype, comps_array.dtype)
        values = values.astype(common, copy=False)
        comps_array = comps_array.astype(common, copy=False)
        f = htable.ismember

    return f(comps_array, values)
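

# Hedged usage sketch: ``isin`` is normally reached through public wrappers
# such as ``Series.isin``, but the helper itself maps directly:
#
#   >>> isin(np.array([1, 2, 3]), np.array([2, 4]))
#   array([False,  True, False])
#
# The ``len(comps_array) > 1_000_000 and len(values) <= 26`` branch above only
# changes *how* the answer is computed (np.in1d vs. hashtable), not the result.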


def factorize_array(
    values: np.ndarray,
    use_na_sentinel: bool = True,
    size_hint: int | None = None,
    na_value: object = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[npt.NDArray[np.intp], np.ndarray]:
    """
    Factorize a numpy array to codes and uniques.

    This doesn't do any coercion of types or unboxing before factorization.

    Parameters
    ----------
    values : ndarray
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.
    size_hint : int, optional
        Passed through to the hashtable's 'get_labels' method
    na_value : object, optional
        A value in `values` to consider missing. Note: only use this
        parameter when you know that you don't have any values pandas would
        consider missing in the array (NaN for float data, iNaT for
        datetimes, etc.).
    mask : ndarray[bool], optional
        If not None, the mask is used as indicator for missing values
        (True = missing, False = valid) instead of `na_value` or
        condition "val != val".

    Returns
    -------
    codes : ndarray[np.intp]
    uniques : ndarray
    """
    original = values
    if values.dtype.kind in ["m", "M"]:
        # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
        # need to do the same to na_value. We are assuming here that the passed
        # na_value is an appropriately-typed NaT.
        # e.g. test_where_datetimelike_categorical
        na_value = iNaT

    hash_klass, values = _get_hashtable_algo(values)

    table = hash_klass(size_hint or len(values))
    uniques, codes = table.factorize(
        values,
        na_sentinel=-1,
        na_value=na_value,
        mask=mask,
        ignore_na=use_na_sentinel,
    )

    # re-cast e.g. i8->dt64/td64, uint8->bool
    uniques = _reconstruct_data(uniques, original.dtype, original)

    codes = ensure_platform_int(codes)
    return codes, uniques


@doc(
    values=dedent(
        """\
    values : sequence
        A 1-D sequence. Sequences that aren't pandas objects are
        coerced to ndarrays before factorization.
    """
    ),
    sort=dedent(
        """\
    sort : bool, default False
        Sort `uniques` and shuffle `codes` to maintain the
        relationship.
    """
    ),
    size_hint=dedent(
        """\
    size_hint : int, optional
        Hint to the hashtable sizer.
    """
    ),
)
def factorize(
    values,
    sort: bool = False,
    use_na_sentinel: bool = True,
    size_hint: int | None = None,
) -> tuple[np.ndarray, np.ndarray | Index]:
    """
    Encode the object as an enumerated type or categorical variable.

    This method is useful for obtaining a numeric representation of an
    array when all that matters is identifying distinct values. `factorize`
    is available as both a top-level function :func:`pandas.factorize`,
    and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.

    Parameters
    ----------
    {values}{sort}
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.

        .. versionadded:: 1.5.0
    {size_hint}\

    Returns
    -------
    codes : ndarray
        An integer ndarray that's an indexer into `uniques`.
        ``uniques.take(codes)`` will have the same values as `values`.
    uniques : ndarray, Index, or Categorical
        The unique valid values. When `values` is Categorical, `uniques`
        is a Categorical. When `values` is some other pandas object, an
        `Index` is returned. Otherwise, a 1-D ndarray is returned.

        .. note::

            Even if there's a missing value in `values`, `uniques` will
            *not* contain an entry for it.

    See Also
    --------
    cut : Discretize continuous-valued array.
    unique : Find the unique values in an array.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.factorize>` for more examples.

    Examples
    --------
    These examples all show factorize as a top-level method like
    ``pd.factorize(values)``. The results are identical for methods like
    :meth:`Series.factorize`.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
    >>> codes
    array([0, 0, 1, 2, 0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
    shuffled so that the relationship is maintained.

    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
    >>> codes
    array([1, 1, 0, 2, 1])
    >>> uniques
    array(['a', 'b', 'c'], dtype=object)

    When ``use_na_sentinel=True`` (the default), missing values are indicated in
    the `codes` with the sentinel value ``-1`` and missing values are not
    included in `uniques`.

    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
    >>> codes
    array([ 0, -1,  1,  2,  0])
    >>> uniques
    array(['b', 'a', 'c'], dtype=object)

    Thus far, we've only factorized lists (which are internally coerced to
    NumPy arrays). When factorizing pandas objects, the type of `uniques`
    will differ. For Categoricals, a `Categorical` is returned.

    >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    ['a', 'c']
    Categories (3, object): ['a', 'b', 'c']

    Notice that ``'b'`` is in ``uniques.categories``, despite not being
    present in ``cat.values``.

    For all other pandas objects, an Index of the appropriate type is
    returned.

    >>> cat = pd.Series(['a', 'a', 'c'])
    >>> codes, uniques = pd.factorize(cat)
    >>> codes
    array([0, 0, 1])
    >>> uniques
    Index(['a', 'c'], dtype='object')

    If NaN is in the values, and we want to include NaN in the uniques of the
    values, it can be achieved by setting ``use_na_sentinel=False``.

    >>> values = np.array([1, 2, 1, np.nan])
    >>> codes, uniques = pd.factorize(values)  # default: use_na_sentinel=True
    >>> codes
    array([ 0,  1,  0, -1])
    >>> uniques
    array([1., 2.])

    >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
    >>> codes
    array([0, 1, 0, 2])
    >>> uniques
    array([ 1.,  2., nan])
    """
    # Implementation notes: This method is responsible for 3 things
    # 1.) coercing data to array-like (ndarray, Index, extension array)
    # 2.) factorizing codes and uniques
    # 3.) Maybe boxing the uniques in an Index
    #
    # Step 2 is dispatched to extension types (like Categorical). They are
    # responsible only for factorization. All data coercion, sorting and boxing
    # should happen here.
    if isinstance(values, (ABCIndex, ABCSeries)):
        return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)

    values = _ensure_arraylike(values)
    original = values

    if (
        isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
        and values.freq is not None
    ):
        # The presence of 'freq' means we can fast-path sorting and know there
        # aren't NAs
        codes, uniques = values.factorize(sort=sort)
        return codes, uniques

    elif not isinstance(values, np.ndarray):
        # i.e. ExtensionArray
        codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)

    else:
        values = np.asarray(values)  # convert DTA/TDA/MultiIndex

        if not use_na_sentinel and is_object_dtype(values):
            # factorize can now handle differentiating various types of null values.
            # These can only occur when the array has object dtype.
            # However, for backwards compatibility we only use the null for the
            # provided dtype. This may be revisited in the future, see GH#48476.
            null_mask = isna(values)
            if null_mask.any():
                na_value = na_value_for_dtype(values.dtype, compat=False)
                # Don't modify (potentially user-provided) array
                values = np.where(null_mask, na_value, values)

        codes, uniques = factorize_array(
            values,
            use_na_sentinel=use_na_sentinel,
            size_hint=size_hint,
        )

    if sort and len(uniques) > 0:
        uniques, codes = safe_sort(
            uniques,
            codes,
            use_na_sentinel=use_na_sentinel,
            assume_unique=True,
            verify=False,
        )

    uniques = _reconstruct_data(uniques, original.dtype, original)

    return codes, uniques


def value_counts(
    values,
    sort: bool = True,
    ascending: bool = False,
    normalize: bool = False,
    bins=None,
    dropna: bool = True,
) -> Series:
    """
    Compute a histogram of the counts of non-null values.

    Parameters
    ----------
    values : ndarray (1-d)
    sort : bool, default True
        Sort by values
    ascending : bool, default False
        Sort in ascending order
    normalize : bool, default False
        If True then compute a relative histogram
    bins : integer, optional
        Rather than count values, group them into half-open bins,
        convenience for pd.cut, only works with numeric data
    dropna : bool, default True
        Don't include counts of NaN

    Returns
    -------
    Series
    """
    from pandas import (
        Index,
        Series,
    )

    index_name = getattr(values, "name", None)
    name = "proportion" if normalize else "count"

    if bins is not None:
        from pandas.core.reshape.tile import cut

        values = Series(values, copy=False)
        try:
            ii = cut(values, bins, include_lowest=True)
        except TypeError as err:
            raise TypeError("bins argument only works with numeric data.") from err

        # count, remove nulls (from the index), and use the bins as the index
        result = ii.value_counts(dropna=dropna)
        result.name = name
        result = result[result.index.notna()]
        result.index = result.index.astype("interval")
        result = result.sort_index()

        # if we are dropna and we have NO values
        if dropna and (result._values == 0).all():
            result = result.iloc[0:0]

        # normalizing is by len of all (regardless of dropna)
        counts = np.array([len(ii)])

    else:
        if is_extension_array_dtype(values):
            # handle Categorical and sparse
            result = Series(values, copy=False)._values.value_counts(dropna=dropna)
            result.name = name
            result.index.name = index_name
            counts = result._values
            if not isinstance(counts, np.ndarray):
                # e.g. ArrowExtensionArray
                counts = np.asarray(counts)

        elif isinstance(values, ABCMultiIndex):
            # GH49558
            levels = list(range(values.nlevels))
            result = (
                Series(index=values, name=name)
                .groupby(level=levels, dropna=dropna)
                .size()
            )
            result.index.names = values.names
            counts = result._values

        else:
            values = _ensure_arraylike(values)
            keys, counts = value_counts_arraylike(values, dropna)
            if keys.dtype == np.float16:
                keys = keys.astype(np.float32)

            # For backwards compatibility, we let Index do its normal type
            # inference, _except_ if it infers from object to bool.
            idx = Index(keys)
            if idx.dtype == bool and keys.dtype == object:
                idx = idx.astype(object)
            idx.name = index_name

            result = Series(counts, index=idx, name=name, copy=False)

    if sort:
        result = result.sort_values(ascending=ascending)

    if normalize:
        result = result / counts.sum()

    return result
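

# Hedged usage sketch (the exact Series repr may vary across pandas versions):
#
#   >>> value_counts(np.array([1, 1, 2]))
#   1    2
#   2    1
#   Name: count, dtype: int64
#   >>> value_counts(np.array([1, 1, 2]), normalize=True)
#   1    0.666667
#   2    0.333333
#   Name: proportion, dtype: float64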


# Called once from SparseArray, otherwise could be private
def value_counts_arraylike(
    values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
    """
    Parameters
    ----------
    values : np.ndarray
    dropna : bool
    mask : np.ndarray[bool] or None, default None

    Returns
    -------
    uniques : np.ndarray
    counts : np.ndarray[np.int64]
    """
    original = values
    values = _ensure_data(values)

    keys, counts = htable.value_count(values, dropna, mask=mask)

    if needs_i8_conversion(original.dtype):
        # datetime, timedelta, or period
        if dropna:
            mask = keys != iNaT
            keys, counts = keys[mask], counts[mask]

    res_keys = _reconstruct_data(keys, original.dtype, original)
    return res_keys, counts


def duplicated(
    values: ArrayLike, keep: Literal["first", "last", False] = "first"
) -> npt.NDArray[np.bool_]:
    """
    Return boolean ndarray denoting duplicate values.

    Parameters
    ----------
    values : np.ndarray, ExtensionArray or Series
        Array over which to check for duplicate values.
    keep : {'first', 'last', False}, default 'first'
        - ``first`` : Mark duplicates as ``True`` except for the first
          occurrence.
        - ``last`` : Mark duplicates as ``True`` except for the last
          occurrence.
        - False : Mark all duplicates as ``True``.

    Returns
    -------
    duplicated : ndarray[bool]
    """
    if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype):
        values = cast("BaseMaskedArray", values)
        return htable.duplicated(values._data, keep=keep, mask=values._mask)

    values = _ensure_data(values)
    return htable.duplicated(values, keep=keep)
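

# Hedged usage sketch of the ``keep`` options:
#
#   >>> duplicated(np.array([1, 1, 2, 1]), keep="first")
#   array([False,  True, False,  True])
#   >>> duplicated(np.array([1, 1, 2, 1]), keep=False)
#   array([ True,  True, False,  True])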


def mode(
    values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
) -> ArrayLike:
    """
    Returns the mode(s) of an array.

    Parameters
    ----------
    values : array-like
        Array over which to check for duplicate values.
    dropna : bool, default True
        Don't consider counts of NaN/NaT.
    mask : ndarray[bool], optional
        If not None, used as an indicator of missing values
        (True = missing, False = valid).

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    values = _ensure_arraylike(values)
    original = values

    if needs_i8_conversion(values.dtype):
        # Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
        values = ensure_wrapped_if_datetimelike(values)
        values = cast("ExtensionArray", values)
        return values._mode(dropna=dropna)

    values = _ensure_data(values)

    npresult = htable.mode(values, dropna=dropna, mask=mask)
    try:
        npresult = np.sort(npresult)
    except TypeError as err:
        warnings.warn(
            f"Unable to sort modes: {err}",
            stacklevel=find_stack_level(),
        )

    result = _reconstruct_data(npresult, original.dtype, original)
    return result


def rank(
    values: ArrayLike,
    axis: AxisInt = 0,
    method: str = "average",
    na_option: str = "keep",
    ascending: bool = True,
    pct: bool = False,
) -> npt.NDArray[np.float64]:
    """
    Rank the values along a given axis.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
        Array whose values will be ranked. The number of dimensions in this
        array must not exceed 2.
    axis : int, default 0
        Axis over which to perform rankings.
    method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
        The method by which tiebreaks are broken during the ranking.
    na_option : {'keep', 'top'}, default 'keep'
        The method by which NaNs are placed in the ranking.
        - ``keep``: rank each NaN value with a NaN ranking
        - ``top``: replace each NaN with either +/- inf so that they
          are ranked at the top
    ascending : bool, default True
        Whether or not the elements should be ranked in ascending order.
    pct : bool, default False
        Whether or not to display the returned rankings in integer form
        (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
    """
    is_datetimelike = needs_i8_conversion(values.dtype)
    values = _ensure_data(values)

    if values.ndim == 1:
        ranks = algos.rank_1d(
            values,
            is_datetimelike=is_datetimelike,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    elif values.ndim == 2:
        ranks = algos.rank_2d(
            values,
            axis=axis,
            is_datetimelike=is_datetimelike,
            ties_method=method,
            ascending=ascending,
            na_option=na_option,
            pct=pct,
        )
    else:
        raise TypeError("Arrays with ndim > 2 are not supported.")

    return ranks
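

# Hedged usage sketch: with the default ``method="average"``, tied values
# share the mean of the ranks they occupy.
#
#   >>> rank(np.array([3.0, 1.0, 2.0, 2.0]))
#   array([4. , 1. , 2.5, 2.5])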


def checked_add_with_arr(
    arr: npt.NDArray[np.int64],
    b: int | npt.NDArray[np.int64],
    arr_mask: npt.NDArray[np.bool_] | None = None,
    b_mask: npt.NDArray[np.bool_] | None = None,
) -> npt.NDArray[np.int64]:
    """
    Perform array addition that checks for underflow and overflow.

    Performs the addition of an int64 array and an int64 integer (or array)
    but checks that they do not result in overflow first. For elements that
    are indicated to be NaN, whether or not there is overflow for that element
    is automatically ignored.

    Parameters
    ----------
    arr : np.ndarray[int64] addend.
    b : array or scalar addend.
    arr_mask : np.ndarray[bool] or None, default None
        array indicating which elements to exclude from checking
    b_mask : np.ndarray[bool] or None, default None
        array or scalar indicating which element(s) to exclude from checking

    Returns
    -------
    sum : An array for elements x + b for each element x in arr if b is
          a scalar or an array for elements x + y for each element pair
          (x, y) in (arr, b).

    Raises
    ------
    OverflowError if any x + y exceeds the maximum or minimum int64 value.
    """
    # For performance reasons, we broadcast 'b' to the new array 'b2'
    # so that it has the same size as 'arr'.
    b2 = np.broadcast_to(b, arr.shape)
    if b_mask is not None:
        # We do the same broadcasting for b_mask as well.
        b2_mask = np.broadcast_to(b_mask, arr.shape)
    else:
        b2_mask = None

    # For elements that are NaN, regardless of their value, we should
    # ignore whether they overflow or not when doing the checked add.
    if arr_mask is not None and b2_mask is not None:
        not_nan = np.logical_not(arr_mask | b2_mask)
    elif arr_mask is not None:
        not_nan = np.logical_not(arr_mask)
    elif b_mask is not None:
        # error: Argument 1 to "__call__" of "_UFunc_Nin1_Nout1" has
        # incompatible type "Optional[ndarray[Any, dtype[bool_]]]";
        # expected "Union[_SupportsArray[dtype[Any]], _NestedSequence
        # [_SupportsArray[dtype[Any]]], bool, int, float, complex, str
        # , bytes, _NestedSequence[Union[bool, int, float, complex, str
        # , bytes]]]"
        not_nan = np.logical_not(b2_mask)  # type: ignore[arg-type]
    else:
        not_nan = np.empty(arr.shape, dtype=bool)
        not_nan.fill(True)

    # gh-14324: For each element in 'arr' and its corresponding element
    # in 'b2', we check the sign of the element in 'b2'. If it is positive,
    # we then check whether its sum with the element in 'arr' exceeds
    # np.iinfo(np.int64).max. If so, we have an overflow error. If it
    # is negative, we then check whether its sum with the element in
    # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
    # error as well.
    i8max = lib.i8max
    i8min = iNaT

    mask1 = b2 > 0
    mask2 = b2 < 0

    if not mask1.any():
        to_raise = ((i8min - b2 > arr) & not_nan).any()
    elif not mask2.any():
        to_raise = ((i8max - b2 < arr) & not_nan).any()
    else:
        to_raise = ((i8max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or (
            (i8min - b2[mask2] > arr[mask2]) & not_nan[mask2]
        ).any()

    if to_raise:
        raise OverflowError("Overflow in int64 addition")

    result = arr + b
    if arr_mask is not None or b2_mask is not None:
        np.putmask(result, ~not_nan, iNaT)

    return result
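

# Hedged usage sketch of the overflow check (i8max is 2**63 - 1):
#
#   >>> checked_add_with_arr(np.array([1, 2], dtype="i8"), 1)
#   array([2, 3])
#   >>> checked_add_with_arr(np.array([2**62], dtype="i8"), 2**62)
#   Traceback (most recent call last):
#       ...
#   OverflowError: Overflow in int64 addition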


# ---- #
# take #
# ---- #


def take(
    arr,
    indices: TakeIndexer,
    axis: AxisInt = 0,
    allow_fill: bool = False,
    fill_value=None,
):
    """
    Take elements from an array.

    Parameters
    ----------
    arr : array-like or scalar value
        Non array-likes (sequences/scalars without a dtype) are coerced
        to an ndarray.
    indices : sequence of int or one-dimensional np.ndarray of int
        Indices to be taken.
    axis : int, default 0
        The axis over which to select values.
    allow_fill : bool, default False
        How to handle negative values in `indices`.

        * False: negative values in `indices` indicate positional indices
          from the right (the default). This is similar to :func:`numpy.take`.

        * True: negative values in `indices` indicate
          missing values. These values are set to `fill_value`. Any other
          negative values raise a ``ValueError``.

    fill_value : any, optional
        Fill value to use for NA-indices when `allow_fill` is True.
        This may be ``None``, in which case the default NA value for
        the type (``self.dtype.na_value``) is used.

        For multi-dimensional `arr`, each *element* is filled with
        `fill_value`.

    Returns
    -------
    ndarray or ExtensionArray
        Same type as the input.

    Raises
    ------
    IndexError
        When `indices` is out of bounds for the array.
    ValueError
        When the indexer contains negative values other than ``-1``
        and `allow_fill` is True.

    Notes
    -----
    When `allow_fill` is False, `indices` may be whatever dimensionality
    is accepted by NumPy for `arr`.

    When `allow_fill` is True, `indices` should be 1-D.

    See Also
    --------
    numpy.take : Take elements from an array along an axis.

    Examples
    --------
    >>> import pandas as pd

    With the default ``allow_fill=False``, negative numbers indicate
    positional indices from the right.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1])
    array([10, 10, 30])

    Setting ``allow_fill=True`` will place `fill_value` in those positions.

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
    array([10., 10., nan])

    >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
    ...                        fill_value=-10)
    array([ 10,  10, -10])
    """
    if not is_array_like(arr):
        arr = np.asarray(arr)

    indices = np.asarray(indices, dtype=np.intp)

    if allow_fill:
        # Pandas style, -1 means NA
        validate_indices(indices, arr.shape[axis])
        result = take_nd(
            arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
        )
    else:
        # NumPy style
        result = arr.take(indices, axis=axis)
    return result


# ------------ #
# searchsorted #
# ------------ #


def searchsorted(
    arr: ArrayLike,
    value: NumpyValueArrayLike | ExtensionArray,
    side: Literal["left", "right"] = "left",
    sorter: NumpySorter = None,
) -> npt.NDArray[np.intp] | np.intp:
    """
    Find indices where elements should be inserted to maintain order.

    Find the indices into a sorted array `arr` such that, if the
    corresponding elements in `value` were inserted before the indices,
    the order of `arr` would be preserved.

    Assuming that `arr` is sorted:

    ======  ================================
    `side`  returned index `i` satisfies
    ======  ================================
    left    ``arr[i-1] < value <= arr[i]``
    right   ``arr[i-1] <= value < arr[i]``
    ======  ================================

    Parameters
    ----------
    arr : np.ndarray, ExtensionArray, Series
        Input array. If `sorter` is None, then it must be sorted in
        ascending order, otherwise `sorter` must be an array of indices
        that sort it.
    value : array-like or scalar
        Values to insert into `arr`.
    side : {'left', 'right'}, optional
        If 'left', the index of the first suitable location found is given.
        If 'right', return the last such index. If there is no suitable
        index, return either 0 or N (where N is the length of `arr`).
    sorter : 1-D array-like, optional
        Optional array of integer indices that sort `arr` into ascending
        order. They are typically the result of argsort.

    Returns
    -------
    array of ints or int
        If value is array-like, array of insertion points.
        If value is scalar, a single integer.

    See Also
    --------
    numpy.searchsorted : Similar method from NumPy.
    """
    if sorter is not None:
        sorter = ensure_platform_int(sorter)

    if (
        isinstance(arr, np.ndarray)
        and is_integer_dtype(arr.dtype)
        and (is_integer(value) or is_integer_dtype(value))
    ):
        # if `arr` and `value` have different dtypes, `arr` would be
        # recast by numpy, causing a slow search.
        # Before searching below, we therefore try to give `value` the
        # same dtype as `arr`, while guarding against integer overflows.
        iinfo = np.iinfo(arr.dtype.type)
        value_arr = np.array([value]) if is_scalar(value) else np.array(value)
        if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
            # value within bounds, so no overflow, so can convert value dtype
            # to dtype of arr
            dtype = arr.dtype
        else:
            dtype = value_arr.dtype

        if is_scalar(value):
            # We know that value is int
            value = cast(int, dtype.type(value))
        else:
            value = pd_array(cast(ArrayLike, value), dtype=dtype)
    else:
        # E.g. if `arr` is an array with dtype='datetime64[ns]'
        # and `value` is a pd.Timestamp, we may need to convert value
        arr = ensure_wrapped_if_datetimelike(arr)

    # Argument 1 to "searchsorted" of "ndarray" has incompatible type
    # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike"
    return arr.searchsorted(value, side=side, sorter=sorter)  # type: ignore[arg-type]
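

# Hedged usage sketch: for integer inputs the fast path above converts
# ``value`` to ``arr``'s dtype before delegating to ``ndarray.searchsorted``.
#
#   >>> searchsorted(np.array([1, 2, 3]), np.array([2, 3]))
#   array([1, 2])
#   >>> searchsorted(np.array([1, 2, 3]), np.array([2, 3]), side="right")
#   array([2, 3])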


# ---- #
# diff #
# ---- #


_diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}


def diff(arr, n: int, axis: AxisInt = 0):
    """
    difference of n between self,
    analogous to s - s.shift(n)

    Parameters
    ----------
    arr : ndarray or ExtensionArray
    n : int
        number of periods
    axis : {0, 1}
        axis to shift on

    Returns
    -------
    shifted
    """
    n = int(n)
    na = np.nan
    dtype = arr.dtype

    is_bool = is_bool_dtype(dtype)
    if is_bool:
        op = operator.xor
    else:
        op = operator.sub

    if isinstance(dtype, PandasDtype):
        # PandasArray cannot necessarily hold shifted versions of itself.
        arr = arr.to_numpy()
        dtype = arr.dtype

    if not isinstance(dtype, np.dtype):
        # i.e. ExtensionDtype
        if hasattr(arr, f"__{op.__name__}__"):
            if axis != 0:
                raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
            return op(arr, arr.shift(n))
        else:
            raise TypeError(
                f"{type(arr).__name__} has no 'diff' method. "
                "Convert to a suitable dtype prior to calling 'diff'."
            )

    is_timedelta = False
    if needs_i8_conversion(arr.dtype):
        dtype = np.int64
        arr = arr.view("i8")
        na = iNaT
        is_timedelta = True

    elif is_bool:
        # We have to cast in order to be able to hold np.nan
        dtype = np.object_

    elif is_integer_dtype(dtype):
        # We have to cast in order to be able to hold np.nan

        # int8, int16 are incompatible with float64,
        # see https://github.com/cython/cython/issues/2646
        if arr.dtype.name in ["int8", "int16"]:
            dtype = np.float32
        else:
            dtype = np.float64

    orig_ndim = arr.ndim
    if orig_ndim == 1:
        # reshape so we can always use algos.diff_2d
        arr = arr.reshape(-1, 1)
        # TODO: require axis == 0

    dtype = np.dtype(dtype)
    out_arr = np.empty(arr.shape, dtype=dtype)

    na_indexer = [slice(None)] * 2
    na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
    out_arr[tuple(na_indexer)] = na

    if arr.dtype.name in _diff_special:
        # TODO: can diff_2d dtype specialization troubles be fixed by defining
        # out_arr inside diff_2d?
        algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
    else:
        # To keep mypy happy, _res_indexer is a list while res_indexer is
        # a tuple, ditto for lag_indexer.
        _res_indexer = [slice(None)] * 2
        _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
        res_indexer = tuple(_res_indexer)

        _lag_indexer = [slice(None)] * 2
        _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
        lag_indexer = tuple(_lag_indexer)

        out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer])

    if is_timedelta:
        out_arr = out_arr.view("timedelta64[ns]")

    if orig_ndim == 1:
        out_arr = out_arr[:, 0]
    return out_arr
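

# Hedged usage sketch: integer input is cast so the leading NA fits, and bool
# input is diffed with ``xor`` into an object array.
#
#   >>> diff(np.array([1, 2, 4], dtype="int64"), 1)
#   array([nan,  1.,  2.])
#   >>> diff(np.array([True, False, True]), 1)
#   array([nan, True, True], dtype=object)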


# --------------------------------------------------------------------
# Helper functions

# Note: safe_sort is in algorithms.py instead of sorting.py because it is
# low-dependency, is used in this module, and uses private methods from
# this module.
def safe_sort(
    values,
    codes=None,
    use_na_sentinel: bool = True,
    assume_unique: bool = False,
    verify: bool = True,
) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]:
    """
    Sort ``values`` and reorder corresponding ``codes``.

    ``values`` should be unique if ``codes`` is not None.
    Safe for use with mixed types (int, str), orders ints before strs.

    Parameters
    ----------
    values : list-like
        Sequence; must be unique if ``codes`` is not None.
    codes : list_like, optional
        Indices to ``values``. All out of bound indices are treated as
        "not found" and will be masked with ``-1``.
    use_na_sentinel : bool, default True
        If True, the sentinel -1 will be used for NaN values. If False,
        NaN values will be encoded as non-negative integers and will not drop the
        NaN from the uniques of the values.
    assume_unique : bool, default False
        When True, ``values`` are assumed to be unique, which can speed up
        the calculation. Ignored when ``codes`` is None.
    verify : bool, default True
        Check if codes are out of bound for the values and put out of bound
        codes equal to ``-1``. If ``verify=False``, it is assumed there
        are no out of bound codes. Ignored when ``codes`` is None.

    Returns
    -------
    ordered : AnyArrayLike
        Sorted ``values``
    new_codes : ndarray
        Reordered ``codes``; returned when ``codes`` is not None.

    Raises
    ------
    TypeError
        * If ``values`` is not list-like or if ``codes`` is neither None
          nor list-like
        * If ``values`` cannot be sorted
    ValueError
        * If ``codes`` is not None and ``values`` contain duplicates.
    """
    if not is_list_like(values):
        raise TypeError(
            "Only list-like objects are allowed to be passed to safe_sort as values"
        )

    if not is_array_like(values):
        # don't convert to string types
        dtype, _ = infer_dtype_from_array(values)
        # error: Argument "dtype" to "asarray" has incompatible type "Union[dtype[Any],
        # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
        # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
        # _DTypeDict, Tuple[Any, Any]]]"
        values = np.asarray(values, dtype=dtype)  # type: ignore[arg-type]

    sorter = None
    ordered: AnyArrayLike

    if (
        not is_extension_array_dtype(values)
        and lib.infer_dtype(values, skipna=False) == "mixed-integer"
    ):
        ordered = _sort_mixed(values)
    else:
        try:
            sorter = values.argsort()
            ordered = values.take(sorter)
        except TypeError:
            # Previous sorters failed or were not applicable, try `_sort_mixed`
            # which would work, but which fails for special case of 1d arrays
            # with tuples.
            if values.size and isinstance(values[0], tuple):
                ordered = _sort_tuples(values)
            else:
                ordered = _sort_mixed(values)

    # codes:
    if codes is None:
        return ordered

    if not is_list_like(codes):
        raise TypeError(
            "Only list-like objects or None are allowed to "
            "be passed to safe_sort as codes"
        )
    codes = ensure_platform_int(np.asarray(codes))

    if not assume_unique and not len(unique(values)) == len(values):
        raise ValueError("values should be unique if codes is not None")

    if sorter is None:
        # mixed types
        hash_klass, values = _get_hashtable_algo(values)
        t = hash_klass(len(values))
        t.map_locations(values)
        sorter = ensure_platform_int(t.lookup(ordered))

    if use_na_sentinel:
        # take_nd is faster, but only works for na_sentinels of -1
        order2 = sorter.argsort()
        new_codes = take_nd(order2, codes, fill_value=-1)
        if verify:
            mask = (codes < -len(values)) | (codes >= len(values))
        else:
            mask = None
    else:
        reverse_indexer = np.empty(len(sorter), dtype=np.int_)
        reverse_indexer.put(sorter, np.arange(len(sorter)))
        # Out of bound indices will be masked with `-1` next, so we
        # may deal with them here without performance loss using `mode='wrap'`
        new_codes = reverse_indexer.take(codes, mode="wrap")

        if use_na_sentinel:
            mask = codes == -1
            if verify:
                mask = mask | (codes < -len(values)) | (codes >= len(values))

    if use_na_sentinel and mask is not None:
        np.putmask(new_codes, mask, -1)

    return ordered, ensure_platform_int(new_codes)
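

# Hedged usage sketch: sorting the uniques while keeping the codes pointing
# at the same underlying values.
#
#   >>> values = np.array([3, 1, 2])
#   >>> codes = np.array([0, 1, 2, 0])   # i.e. [3, 1, 2, 3]
#   >>> ordered, new_codes = safe_sort(values, codes)
#   >>> ordered
#   array([1, 2, 3])
#   >>> new_codes
#   array([2, 0, 1, 2])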


def _sort_mixed(values) -> AnyArrayLike:
    """order ints before strings before nulls in 1d arrays"""
    str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
    null_pos = np.array([isna(x) for x in values], dtype=bool)
    num_pos = ~str_pos & ~null_pos
    str_argsort = np.argsort(values[str_pos])
    num_argsort = np.argsort(values[num_pos])
    # convert boolean arrays to positional indices, then order by underlying values
    str_locs = str_pos.nonzero()[0].take(str_argsort)
    num_locs = num_pos.nonzero()[0].take(num_argsort)
    null_locs = null_pos.nonzero()[0]
    locs = np.concatenate([num_locs, str_locs, null_locs])
    return values.take(locs)
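

# Hedged usage sketch:
#
#   >>> _sort_mixed(np.array(["b", 1, None, "a", 2], dtype=object))
#   array([1, 2, 'a', 'b', None], dtype=object)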


def _sort_tuples(values: np.ndarray) -> np.ndarray:
    """
    Convert array of tuples (1d) to array of arrays (2d).

    We need to keep the columns separately as they contain different types and
    nans (can't use `np.sort` as it may fail when str and nan are mixed in a
    column as types cannot be compared).
    """
    from pandas.core.internals.construction import to_arrays
    from pandas.core.sorting import lexsort_indexer

    arrays, _ = to_arrays(values, None)
    indexer = lexsort_indexer(arrays, orders=True)
    return values[indexer]


def union_with_duplicates(
    lvals: ArrayLike | Index, rvals: ArrayLike | Index
) -> ArrayLike | Index:
    """
    Extracts the union from lvals and rvals with respect to duplicates and nans in
    both arrays.

    Parameters
    ----------
    lvals : np.ndarray or ExtensionArray
        Left values, which are ordered in front.
    rvals : np.ndarray or ExtensionArray
        Right values, ordered after lvals.

    Returns
    -------
    np.ndarray or ExtensionArray
        Containing the unsorted union of both arrays.

    Notes
    -----
    Caller is responsible for ensuring lvals.dtype == rvals.dtype.
    """
    from pandas import Series

    l_count = value_counts(lvals, dropna=False)
    r_count = value_counts(rvals, dropna=False)
    l_count, r_count = l_count.align(r_count, fill_value=0)
    final_count = np.maximum(l_count.values, r_count.values)
    final_count = Series(final_count, index=l_count.index, dtype="int", copy=False)
    if isinstance(lvals, ABCMultiIndex) and isinstance(rvals, ABCMultiIndex):
        unique_vals = lvals.append(rvals).unique()
    else:
        if isinstance(lvals, ABCIndex):
            lvals = lvals._values
        if isinstance(rvals, ABCIndex):
            rvals = rvals._values

        unique_vals = unique(concat_compat([lvals, rvals]))
        unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
    repeats = final_count.reindex(unique_vals).values
    return np.repeat(unique_vals, repeats)
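

# Hedged usage sketch: each value is repeated max(left count, right count)
# times, in order of first appearance.
#
#   >>> union_with_duplicates(np.array([1, 1, 2]), np.array([2, 2, 3]))
#   array([1, 1, 2, 2, 3])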