- """
- Generic data algorithms. This module is experimental at the moment and not
- intended for public consumption
- """
- from __future__ import annotations
- import operator
- from textwrap import dedent
- from typing import (
- TYPE_CHECKING,
- Literal,
- cast,
- )
- import warnings
- import numpy as np
- from pandas._libs import (
- algos,
- hashtable as htable,
- iNaT,
- lib,
- )
- from pandas._typing import (
- AnyArrayLike,
- ArrayLike,
- AxisInt,
- DtypeObj,
- TakeIndexer,
- npt,
- )
- from pandas.util._decorators import doc
- from pandas.util._exceptions import find_stack_level
- from pandas.core.dtypes.cast import (
- construct_1d_object_array_from_listlike,
- infer_dtype_from_array,
- np_find_common_type,
- )
- from pandas.core.dtypes.common import (
- ensure_float64,
- ensure_object,
- ensure_platform_int,
- is_array_like,
- is_bool_dtype,
- is_categorical_dtype,
- is_complex_dtype,
- is_extension_array_dtype,
- is_float_dtype,
- is_integer,
- is_integer_dtype,
- is_list_like,
- is_numeric_dtype,
- is_object_dtype,
- is_scalar,
- is_signed_integer_dtype,
- needs_i8_conversion,
- )
- from pandas.core.dtypes.concat import concat_compat
- from pandas.core.dtypes.dtypes import (
- BaseMaskedDtype,
- ExtensionDtype,
- PandasDtype,
- )
- from pandas.core.dtypes.generic import (
- ABCDatetimeArray,
- ABCExtensionArray,
- ABCIndex,
- ABCMultiIndex,
- ABCSeries,
- ABCTimedeltaArray,
- )
- from pandas.core.dtypes.missing import (
- isna,
- na_value_for_dtype,
- )
- from pandas.core.array_algos.take import take_nd
- from pandas.core.construction import (
- array as pd_array,
- ensure_wrapped_if_datetimelike,
- extract_array,
- )
- from pandas.core.indexers import validate_indices
- if TYPE_CHECKING:
- from pandas._typing import (
- NumpySorter,
- NumpyValueArrayLike,
- )
- from pandas import (
- Categorical,
- Index,
- Series,
- )
- from pandas.core.arrays import (
- BaseMaskedArray,
- ExtensionArray,
- )
- # --------------- #
- # dtype access #
- # --------------- #
- def _ensure_data(values: ArrayLike) -> np.ndarray:
- """
- routine to ensure that our data is of the correct
- input dtype for lower-level routines
- This will coerce:
- - ints -> int64
- - uint -> uint64
- - bool -> uint8
- - datetimelike -> i8
- - datetime64tz -> i8 (in local tz)
- - categorical -> codes
- Parameters
- ----------
- values : np.ndarray or ExtensionArray
- Returns
- -------
- np.ndarray
- """
- if not isinstance(values, ABCMultiIndex):
- # extract_array would raise
- values = extract_array(values, extract_numpy=True)
- if is_object_dtype(values.dtype):
- return ensure_object(np.asarray(values))
- elif isinstance(values.dtype, BaseMaskedDtype):
- # i.e. BooleanArray, FloatingArray, IntegerArray
- values = cast("BaseMaskedArray", values)
- if not values._hasna:
- # No pd.NAs -> We can avoid an object-dtype cast (and copy) GH#41816
- # recurse to avoid re-implementing logic for eg bool->uint8
- return _ensure_data(values._data)
- return np.asarray(values)
- elif is_categorical_dtype(values.dtype):
- # NB: cases that go through here should NOT be using _reconstruct_data
- # on the back-end.
- values = cast("Categorical", values)
- return values.codes
- elif is_bool_dtype(values.dtype):
- if isinstance(values, np.ndarray):
- # i.e. actually dtype == np.dtype("bool")
- return np.asarray(values).view("uint8")
- else:
- # e.g. Sparse[bool, False] # TODO: no test cases get here
- return np.asarray(values).astype("uint8", copy=False)
- elif is_integer_dtype(values.dtype):
- return np.asarray(values)
- elif is_float_dtype(values.dtype):
- # Note: checking `values.dtype == "float128"` raises on Windows and 32bit
- # error: Item "ExtensionDtype" of "Union[Any, ExtensionDtype, dtype[Any]]"
- # has no attribute "itemsize"
- if values.dtype.itemsize in [2, 12, 16]: # type: ignore[union-attr]
- # we don't (yet) have float128 hashtable support
- return ensure_float64(values)
- return np.asarray(values)
- elif is_complex_dtype(values.dtype):
- return cast(np.ndarray, values)
- # datetimelike
- elif needs_i8_conversion(values.dtype):
- npvalues = values.view("i8")
- npvalues = cast(np.ndarray, npvalues)
- return npvalues
- # we have failed, return object
- values = np.asarray(values, dtype=object)
- return ensure_object(values)
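- # A minimal sketch of the coercions above (doctest-style, illustrative only;
- # exact reprs assume a typical NumPy build):
- # >>> _ensure_data(np.array([True, False])).dtype
- # dtype('uint8')
- # >>> _ensure_data(np.array(["2016-01-01"], dtype="datetime64[ns]")).dtype
- # dtype('int64')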
- def _reconstruct_data(
- values: ArrayLike, dtype: DtypeObj, original: AnyArrayLike
- ) -> ArrayLike:
- """
- reverse of _ensure_data
- Parameters
- ----------
- values : np.ndarray or ExtensionArray
- dtype : np.dtype or ExtensionDtype
- original : AnyArrayLike
- Returns
- -------
- ExtensionArray or np.ndarray
- """
- if isinstance(values, ABCExtensionArray) and values.dtype == dtype:
- # Catch DatetimeArray/TimedeltaArray
- return values
- if not isinstance(dtype, np.dtype):
- # i.e. ExtensionDtype; note we have ruled out above the possibility
- # that values.dtype == dtype
- cls = dtype.construct_array_type()
- values = cls._from_sequence(values, dtype=dtype)
- else:
- values = values.astype(dtype, copy=False)
- return values
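- # Illustrative round trip with _ensure_data (a sketch, not exhaustive):
- # >>> codes = np.array([1, 0], dtype="uint8")
- # >>> _reconstruct_data(codes, np.dtype("bool"), codes)
- # array([ True, False])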
- def _ensure_arraylike(values) -> ArrayLike:
- """
- ensure that we are arraylike if not already
- """
- if not is_array_like(values):
- inferred = lib.infer_dtype(values, skipna=False)
- if inferred in ["mixed", "string", "mixed-integer"]:
- # "mixed-integer" to ensure we do not cast ["ss", 42] to str GH#22160
- if isinstance(values, tuple):
- values = list(values)
- values = construct_1d_object_array_from_listlike(values)
- else:
- values = np.asarray(values)
- return values
- _hashtables = {
- "complex128": htable.Complex128HashTable,
- "complex64": htable.Complex64HashTable,
- "float64": htable.Float64HashTable,
- "float32": htable.Float32HashTable,
- "uint64": htable.UInt64HashTable,
- "uint32": htable.UInt32HashTable,
- "uint16": htable.UInt16HashTable,
- "uint8": htable.UInt8HashTable,
- "int64": htable.Int64HashTable,
- "int32": htable.Int32HashTable,
- "int16": htable.Int16HashTable,
- "int8": htable.Int8HashTable,
- "string": htable.StringHashTable,
- "object": htable.PyObjectHashTable,
- }
- def _get_hashtable_algo(values: np.ndarray):
- """
- Parameters
- ----------
- values : np.ndarray
- Returns
- -------
- htable : HashTable subclass
- values : ndarray
- """
- values = _ensure_data(values)
- ndtype = _check_object_for_strings(values)
- hashtable = _hashtables[ndtype]
- return hashtable, values
- def _check_object_for_strings(values: np.ndarray) -> str:
- """
- Check if we can use string hashtable instead of object hashtable.
- Parameters
- ----------
- values : ndarray
- Returns
- -------
- str
- """
- ndtype = values.dtype.name
- if ndtype == "object":
- # it's cheaper to use a String Hash Table than Object; we infer
- # including nulls because that is the only difference between
- # StringHashTable and ObjectHashtable
- if lib.infer_dtype(values, skipna=False) in ["string"]:
- ndtype = "string"
- return ndtype
- # --------------- #
- # top-level algos #
- # --------------- #
- def unique(values):
- """
- Return unique values based on a hash table.
- Uniques are returned in order of appearance. This does NOT sort.
- Significantly faster than numpy.unique for long enough sequences.
- Includes NA values.
- Parameters
- ----------
- values : 1d array-like
- Returns
- -------
- numpy.ndarray or ExtensionArray
- The return can be:
- * Index : when the input is an Index
- * Categorical : when the input is a Categorical dtype
- * ndarray : when the input is a Series/ndarray
- Return numpy.ndarray or ExtensionArray.
- See Also
- --------
- Index.unique : Return unique values from an Index.
- Series.unique : Return unique values of Series object.
- Examples
- --------
- >>> pd.unique(pd.Series([2, 1, 3, 3]))
- array([2, 1, 3])
- >>> pd.unique(pd.Series([2] + [1] * 5))
- array([2, 1])
- >>> pd.unique(pd.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
- array(['2016-01-01T00:00:00.000000000'], dtype='datetime64[ns]')
- >>> pd.unique(
- ... pd.Series(
- ... [
- ... pd.Timestamp("20160101", tz="US/Eastern"),
- ... pd.Timestamp("20160101", tz="US/Eastern"),
- ... ]
- ... )
- ... )
- <DatetimeArray>
- ['2016-01-01 00:00:00-05:00']
- Length: 1, dtype: datetime64[ns, US/Eastern]
- >>> pd.unique(
- ... pd.Index(
- ... [
- ... pd.Timestamp("20160101", tz="US/Eastern"),
- ... pd.Timestamp("20160101", tz="US/Eastern"),
- ... ]
- ... )
- ... )
- DatetimeIndex(['2016-01-01 00:00:00-05:00'],
- dtype='datetime64[ns, US/Eastern]',
- freq=None)
- >>> pd.unique(list("baabc"))
- array(['b', 'a', 'c'], dtype=object)
- An unordered Categorical will return categories in the
- order of appearance.
- >>> pd.unique(pd.Series(pd.Categorical(list("baabc"))))
- ['b', 'a', 'c']
- Categories (3, object): ['a', 'b', 'c']
- >>> pd.unique(pd.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
- ['b', 'a', 'c']
- Categories (3, object): ['a', 'b', 'c']
- An ordered Categorical preserves the category ordering.
- >>> pd.unique(
- ... pd.Series(
- ... pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
- ... )
- ... )
- ['b', 'a', 'c']
- Categories (3, object): ['a' < 'b' < 'c']
- An array of tuples
- >>> pd.unique([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")])
- array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
- """
- return unique_with_mask(values)
- def nunique_ints(values: ArrayLike) -> int:
- """
- Return the number of unique values for integer array-likes.
- Significantly faster than pandas.unique for long enough sequences.
- No checks are done to ensure input is integral.
- Parameters
- ----------
- values : 1d array-like
- Returns
- -------
- int : The number of unique values in ``values``
- """
- if len(values) == 0:
- return 0
- values = _ensure_data(values)
- # bincount requires intp
- result = (np.bincount(values.ravel().astype("intp")) != 0).sum()
- return result
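- # Minimal sketch (input must already be integral, per the note above):
- # >>> nunique_ints(np.array([1, 2, 2, 3]))
- # 3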
- def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
- """See algorithms.unique for docs. Takes a mask for masked arrays."""
- values = _ensure_arraylike(values)
- if is_extension_array_dtype(values.dtype):
- # Dispatch to extension dtype's unique.
- return values.unique()
- original = values
- hashtable, values = _get_hashtable_algo(values)
- table = hashtable(len(values))
- if mask is None:
- uniques = table.unique(values)
- uniques = _reconstruct_data(uniques, original.dtype, original)
- return uniques
- else:
- uniques, mask = table.unique(values, mask=mask)
- uniques = _reconstruct_data(uniques, original.dtype, original)
- assert mask is not None # for mypy
- return uniques, mask.astype("bool")
- unique1d = unique
- def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]:
- """
- Compute the isin boolean array.
- Parameters
- ----------
- comps : array-like
- values : array-like
- Returns
- -------
- ndarray[bool]
- Same length as `comps`.
- """
- if not is_list_like(comps):
- raise TypeError(
- "only list-like objects are allowed to be passed "
- f"to isin(), you passed a [{type(comps).__name__}]"
- )
- if not is_list_like(values):
- raise TypeError(
- "only list-like objects are allowed to be passed "
- f"to isin(), you passed a [{type(values).__name__}]"
- )
- if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
- orig_values = list(values)
- values = _ensure_arraylike(orig_values)
- if (
- len(values) > 0
- and is_numeric_dtype(values)
- and not is_signed_integer_dtype(comps)
- ):
- # GH#46485 Use object to avoid upcast to float64 later
- # TODO: Share with _find_common_type_compat
- values = construct_1d_object_array_from_listlike(orig_values)
- elif isinstance(values, ABCMultiIndex):
- # Avoid raising in extract_array
- values = np.array(values)
- else:
- values = extract_array(values, extract_numpy=True, extract_range=True)
- comps_array = _ensure_arraylike(comps)
- comps_array = extract_array(comps_array, extract_numpy=True)
- if not isinstance(comps_array, np.ndarray):
- # i.e. Extension Array
- return comps_array.isin(values)
- elif needs_i8_conversion(comps_array.dtype):
- # Dispatch to DatetimeLikeArrayMixin.isin
- return pd_array(comps_array).isin(values)
- elif needs_i8_conversion(values.dtype) and not is_object_dtype(comps_array.dtype):
- # e.g. comps_array are integers and values are datetime64s
- return np.zeros(comps_array.shape, dtype=bool)
- # TODO: not quite right ... Sparse/Categorical
- elif needs_i8_conversion(values.dtype):
- return isin(comps_array, values.astype(object))
- elif isinstance(values.dtype, ExtensionDtype):
- return isin(np.asarray(comps_array), np.asarray(values))
- # GH16012
- # Ensure np.in1d doesn't get object types or it *may* throw an exception
- # Even though a hash table has O(1) lookup (vs. O(log n) in a sorted array),
- # np.in1d is faster for small sizes
- if (
- len(comps_array) > 1_000_000
- and len(values) <= 26
- and not is_object_dtype(comps_array)
- ):
- # If the values include nan we need to check for nan explicitly
- # since np.nan is not equal to np.nan
- if isna(values).any():
- def f(c, v):
- return np.logical_or(np.in1d(c, v), np.isnan(c))
- else:
- f = np.in1d
- else:
- common = np_find_common_type(values.dtype, comps_array.dtype)
- values = values.astype(common, copy=False)
- comps_array = comps_array.astype(common, copy=False)
- f = htable.ismember
- return f(comps_array, values)
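- # Illustrative call (a sketch; the branch taken depends on dtypes and sizes
- # as described above):
- # >>> isin(np.array([1, 2, 3]), np.array([2, 4]))
- # array([False,  True, False])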
- def factorize_array(
- values: np.ndarray,
- use_na_sentinel: bool = True,
- size_hint: int | None = None,
- na_value: object = None,
- mask: npt.NDArray[np.bool_] | None = None,
- ) -> tuple[npt.NDArray[np.intp], np.ndarray]:
- """
- Factorize a numpy array to codes and uniques.
- This doesn't do any coercion of types or unboxing before factorization.
- Parameters
- ----------
- values : ndarray
- use_na_sentinel : bool, default True
- If True, the sentinel -1 will be used for NaN values. If False,
- NaN values will be encoded as non-negative integers and NaN will not be
- dropped from the uniques of the values.
- size_hint : int, optional
- Passed through to the hashtable's 'get_labels' method
- na_value : object, optional
- A value in `values` to consider missing. Note: only use this
- parameter when you know that you don't have any values pandas would
- consider missing in the array (NaN for float data, iNaT for
- datetimes, etc.).
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- Returns
- -------
- codes : ndarray[np.intp]
- uniques : ndarray
- """
- original = values
- if values.dtype.kind in ["m", "M"]:
- # _get_hashtable_algo will cast dt64/td64 to i8 via _ensure_data, so we
- # need to do the same to na_value. We are assuming here that the passed
- # na_value is an appropriately-typed NaT.
- # e.g. test_where_datetimelike_categorical
- na_value = iNaT
- hash_klass, values = _get_hashtable_algo(values)
- table = hash_klass(size_hint or len(values))
- uniques, codes = table.factorize(
- values,
- na_sentinel=-1,
- na_value=na_value,
- mask=mask,
- ignore_na=use_na_sentinel,
- )
- # re-cast e.g. i8->dt64/td64, uint8->bool
- uniques = _reconstruct_data(uniques, original.dtype, original)
- codes = ensure_platform_int(codes)
- return codes, uniques
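- # Sketch of the low-level contract (no dtype coercion happens here):
- # >>> codes, uniques = factorize_array(np.array(["b", "a", "b"], dtype=object))
- # >>> codes
- # array([0, 1, 0])
- # >>> uniques
- # array(['b', 'a'], dtype=object)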
- @doc(
- values=dedent(
- """\
- values : sequence
- A 1-D sequence. Sequences that aren't pandas objects are
- coerced to ndarrays before factorization.
- """
- ),
- sort=dedent(
- """\
- sort : bool, default False
- Sort `uniques` and shuffle `codes` to maintain the
- relationship.
- """
- ),
- size_hint=dedent(
- """\
- size_hint : int, optional
- Hint to the hashtable sizer.
- """
- ),
- )
- def factorize(
- values,
- sort: bool = False,
- use_na_sentinel: bool = True,
- size_hint: int | None = None,
- ) -> tuple[np.ndarray, np.ndarray | Index]:
- """
- Encode the object as an enumerated type or categorical variable.
- This method is useful for obtaining a numeric representation of an
- array when all that matters is identifying distinct values. `factorize`
- is available as both a top-level function :func:`pandas.factorize`,
- and as a method :meth:`Series.factorize` and :meth:`Index.factorize`.
- Parameters
- ----------
- {values}{sort}
- use_na_sentinel : bool, default True
- If True, the sentinel -1 will be used for NaN values. If False,
- NaN values will be encoded as non-negative integers and NaN will not be
- dropped from the uniques of the values.
- .. versionadded:: 1.5.0
- {size_hint}\
- Returns
- -------
- codes : ndarray
- An integer ndarray that's an indexer into `uniques`.
- ``uniques.take(codes)`` will have the same values as `values`.
- uniques : ndarray, Index, or Categorical
- The unique valid values. When `values` is Categorical, `uniques`
- is a Categorical. When `values` is some other pandas object, an
- `Index` is returned. Otherwise, a 1-D ndarray is returned.
- .. note::
- Even if there's a missing value in `values`, `uniques` will
- *not* contain an entry for it.
- See Also
- --------
- cut : Discretize continuous-valued array.
- unique : Find the unique value in an array.
- Notes
- -----
- Reference :ref:`the user guide <reshaping.factorize>` for more examples.
- Examples
- --------
- These examples all show factorize as a top-level method like
- ``pd.factorize(values)``. The results are identical for methods like
- :meth:`Series.factorize`.
- >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
- >>> codes
- array([0, 0, 1, 2, 0])
- >>> uniques
- array(['b', 'a', 'c'], dtype=object)
- With ``sort=True``, the `uniques` will be sorted, and `codes` will be
- shuffled so that the relationship is maintained.
- >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
- >>> codes
- array([1, 1, 0, 2, 1])
- >>> uniques
- array(['a', 'b', 'c'], dtype=object)
- When ``use_na_sentinel=True`` (the default), missing values are indicated in
- the `codes` with the sentinel value ``-1`` and missing values are not
- included in `uniques`.
- >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
- >>> codes
- array([ 0, -1, 1, 2, 0])
- >>> uniques
- array(['b', 'a', 'c'], dtype=object)
- Thus far, we've only factorized lists (which are internally coerced to
- NumPy arrays). When factorizing pandas objects, the type of `uniques`
- will differ. For Categoricals, a `Categorical` is returned.
- >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
- >>> codes, uniques = pd.factorize(cat)
- >>> codes
- array([0, 0, 1])
- >>> uniques
- ['a', 'c']
- Categories (3, object): ['a', 'b', 'c']
- Notice that ``'b'`` is in ``uniques.categories``, despite not being
- present in ``cat.values``.
- For all other pandas objects, an Index of the appropriate type is
- returned.
- >>> cat = pd.Series(['a', 'a', 'c'])
- >>> codes, uniques = pd.factorize(cat)
- >>> codes
- array([0, 0, 1])
- >>> uniques
- Index(['a', 'c'], dtype='object')
- If NaN is in the values, and we want to include NaN in the uniques of the
- values, it can be achieved by setting ``use_na_sentinel=False``.
- >>> values = np.array([1, 2, 1, np.nan])
- >>> codes, uniques = pd.factorize(values) # default: use_na_sentinel=True
- >>> codes
- array([ 0, 1, 0, -1])
- >>> uniques
- array([1., 2.])
- >>> codes, uniques = pd.factorize(values, use_na_sentinel=False)
- >>> codes
- array([0, 1, 0, 2])
- >>> uniques
- array([ 1., 2., nan])
- """
- # Implementation notes: This method is responsible for 3 things
- # 1.) coercing data to array-like (ndarray, Index, extension array)
- # 2.) factorizing codes and uniques
- # 3.) Maybe boxing the uniques in an Index
- #
- # Step 2 is dispatched to extension types (like Categorical). They are
- # responsible only for factorization. All data coercion, sorting and boxing
- # should happen here.
- if isinstance(values, (ABCIndex, ABCSeries)):
- return values.factorize(sort=sort, use_na_sentinel=use_na_sentinel)
- values = _ensure_arraylike(values)
- original = values
- if (
- isinstance(values, (ABCDatetimeArray, ABCTimedeltaArray))
- and values.freq is not None
- ):
- # The presence of 'freq' means we can fast-path sorting and know there
- # aren't NAs
- codes, uniques = values.factorize(sort=sort)
- return codes, uniques
- elif not isinstance(values, np.ndarray):
- # i.e. ExtensionArray
- codes, uniques = values.factorize(use_na_sentinel=use_na_sentinel)
- else:
- values = np.asarray(values) # convert DTA/TDA/MultiIndex
- if not use_na_sentinel and is_object_dtype(values):
- # factorize can now handle differentiating various types of null values.
- # These can only occur when the array has object dtype.
- # However, for backwards compatibility we only use the null for the
- # provided dtype. This may be revisited in the future, see GH#48476.
- null_mask = isna(values)
- if null_mask.any():
- na_value = na_value_for_dtype(values.dtype, compat=False)
- # Don't modify (potentially user-provided) array
- values = np.where(null_mask, na_value, values)
- codes, uniques = factorize_array(
- values,
- use_na_sentinel=use_na_sentinel,
- size_hint=size_hint,
- )
- if sort and len(uniques) > 0:
- uniques, codes = safe_sort(
- uniques,
- codes,
- use_na_sentinel=use_na_sentinel,
- assume_unique=True,
- verify=False,
- )
- uniques = _reconstruct_data(uniques, original.dtype, original)
- return codes, uniques
- def value_counts(
- values,
- sort: bool = True,
- ascending: bool = False,
- normalize: bool = False,
- bins=None,
- dropna: bool = True,
- ) -> Series:
- """
- Compute a histogram of the counts of non-null values.
- Parameters
- ----------
- values : ndarray (1-d)
- sort : bool, default True
- Sort by values
- ascending : bool, default False
- Sort in ascending order
- normalize : bool, default False
- If True then compute a relative histogram
- bins : integer, optional
- Rather than count values, group them into half-open bins,
- a convenience for pd.cut; only works with numeric data
- dropna : bool, default True
- Don't include counts of NaN
- Returns
- -------
- Series
- """
- from pandas import (
- Index,
- Series,
- )
- index_name = getattr(values, "name", None)
- name = "proportion" if normalize else "count"
- if bins is not None:
- from pandas.core.reshape.tile import cut
- values = Series(values, copy=False)
- try:
- ii = cut(values, bins, include_lowest=True)
- except TypeError as err:
- raise TypeError("bins argument only works with numeric data.") from err
- # count, remove nulls (from the index), and cast the index to Interval dtype
- result = ii.value_counts(dropna=dropna)
- result.name = name
- result = result[result.index.notna()]
- result.index = result.index.astype("interval")
- result = result.sort_index()
- # if we are dropna and we have NO values
- if dropna and (result._values == 0).all():
- result = result.iloc[0:0]
- # normalizing is by len of all (regardless of dropna)
- counts = np.array([len(ii)])
- else:
- if is_extension_array_dtype(values):
- # handle Categorical and sparse,
- result = Series(values, copy=False)._values.value_counts(dropna=dropna)
- result.name = name
- result.index.name = index_name
- counts = result._values
- if not isinstance(counts, np.ndarray):
- # e.g. ArrowExtensionArray
- counts = np.asarray(counts)
- elif isinstance(values, ABCMultiIndex):
- # GH49558
- levels = list(range(values.nlevels))
- result = (
- Series(index=values, name=name)
- .groupby(level=levels, dropna=dropna)
- .size()
- )
- result.index.names = values.names
- counts = result._values
- else:
- values = _ensure_arraylike(values)
- keys, counts = value_counts_arraylike(values, dropna)
- if keys.dtype == np.float16:
- keys = keys.astype(np.float32)
- # For backwards compatibility, we let Index do its normal type
- # inference, _except_ for when it infers from object to bool.
- idx = Index(keys)
- if idx.dtype == bool and keys.dtype == object:
- idx = idx.astype(object)
- idx.name = index_name
- result = Series(counts, index=idx, name=name, copy=False)
- if sort:
- result = result.sort_values(ascending=ascending)
- if normalize:
- result = result / counts.sum()
- return result
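- # Illustrative call (a sketch; reprs assume a recent pandas, where counts
- # are sorted descending by default):
- # >>> value_counts(np.array([1, 1, 2]))
- # 1    2
- # 2    1
- # Name: count, dtype: int64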
- # Called once from SparseArray, otherwise could be private
- def value_counts_arraylike(
- values: np.ndarray, dropna: bool, mask: npt.NDArray[np.bool_] | None = None
- ) -> tuple[ArrayLike, npt.NDArray[np.int64]]:
- """
- Parameters
- ----------
- values : np.ndarray
- dropna : bool
- mask : np.ndarray[bool] or None, default None
- Returns
- -------
- uniques : np.ndarray
- counts : np.ndarray[np.int64]
- """
- original = values
- values = _ensure_data(values)
- keys, counts = htable.value_count(values, dropna, mask=mask)
- if needs_i8_conversion(original.dtype):
- # datetime, timedelta, or period
- if dropna:
- mask = keys != iNaT
- keys, counts = keys[mask], counts[mask]
- res_keys = _reconstruct_data(keys, original.dtype, original)
- return res_keys, counts
- def duplicated(
- values: ArrayLike, keep: Literal["first", "last", False] = "first"
- ) -> npt.NDArray[np.bool_]:
- """
- Return boolean ndarray denoting duplicate values.
- Parameters
- ----------
- values : nd.array, ExtensionArray or Series
- Array over which to check for duplicate values.
- keep : {'first', 'last', False}, default 'first'
- - ``first`` : Mark duplicates as ``True`` except for the first
- occurrence.
- - ``last`` : Mark duplicates as ``True`` except for the last
- occurrence.
- - False : Mark all duplicates as ``True``.
- Returns
- -------
- duplicated : ndarray[bool]
- """
- if hasattr(values, "dtype") and isinstance(values.dtype, BaseMaskedDtype):
- values = cast("BaseMaskedArray", values)
- return htable.duplicated(values._data, keep=keep, mask=values._mask)
- values = _ensure_data(values)
- return htable.duplicated(values, keep=keep)
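- # Minimal sketch of the keep semantics:
- # >>> duplicated(np.array([1, 1, 2]))
- # array([False,  True, False])
- # >>> duplicated(np.array([1, 1, 2]), keep=False)
- # array([ True,  True, False])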
- def mode(
- values: ArrayLike, dropna: bool = True, mask: npt.NDArray[np.bool_] | None = None
- ) -> ArrayLike:
- """
- Returns the mode(s) of an array.
- Parameters
- ----------
- values : array-like
- Array over which to check for duplicate values.
- dropna : bool, default True
- Don't consider counts of NaN/NaT.
- Returns
- -------
- np.ndarray or ExtensionArray
- """
- values = _ensure_arraylike(values)
- original = values
- if needs_i8_conversion(values.dtype):
- # Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
- values = ensure_wrapped_if_datetimelike(values)
- values = cast("ExtensionArray", values)
- return values._mode(dropna=dropna)
- values = _ensure_data(values)
- npresult = htable.mode(values, dropna=dropna, mask=mask)
- try:
- npresult = np.sort(npresult)
- except TypeError as err:
- warnings.warn(
- f"Unable to sort modes: {err}",
- stacklevel=find_stack_level(),
- )
- result = _reconstruct_data(npresult, original.dtype, original)
- return result
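- # Minimal sketch (the result is sorted when the values are sortable):
- # >>> mode(np.array([1, 2, 2, 3]))
- # array([2])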
- def rank(
- values: ArrayLike,
- axis: AxisInt = 0,
- method: str = "average",
- na_option: str = "keep",
- ascending: bool = True,
- pct: bool = False,
- ) -> npt.NDArray[np.float64]:
- """
- Rank the values along a given axis.
- Parameters
- ----------
- values : np.ndarray or ExtensionArray
- Array whose values will be ranked. The number of dimensions in this
- array must not exceed 2.
- axis : int, default 0
- Axis over which to perform rankings.
- method : {'average', 'min', 'max', 'first', 'dense'}, default 'average'
- The method by which tiebreaks are broken during the ranking.
- na_option : {'keep', 'top'}, default 'keep'
- The method by which NaNs are placed in the ranking.
- - ``keep``: rank each NaN value with a NaN ranking
- - ``top``: replace each NaN with +/- inf so that they are
- ranked at the top
- ascending : bool, default True
- Whether or not the elements should be ranked in ascending order.
- pct : bool, default False
- Whether or not to display the returned rankings in integer form
- (e.g. 1, 2, 3) or in percentile form (e.g. 0.333..., 0.666..., 1).
- """
- is_datetimelike = needs_i8_conversion(values.dtype)
- values = _ensure_data(values)
- if values.ndim == 1:
- ranks = algos.rank_1d(
- values,
- is_datetimelike=is_datetimelike,
- ties_method=method,
- ascending=ascending,
- na_option=na_option,
- pct=pct,
- )
- elif values.ndim == 2:
- ranks = algos.rank_2d(
- values,
- axis=axis,
- is_datetimelike=is_datetimelike,
- ties_method=method,
- ascending=ascending,
- na_option=na_option,
- pct=pct,
- )
- else:
- raise TypeError("Array with ndim > 2 are not supported.")
- return ranks
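- # Illustrative 1-D call (ties share the average rank under the default
- # method):
- # >>> rank(np.array([1, 3, 2, 3]))
- # array([1. , 3.5, 2. , 3.5])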
- def checked_add_with_arr(
- arr: npt.NDArray[np.int64],
- b: int | npt.NDArray[np.int64],
- arr_mask: npt.NDArray[np.bool_] | None = None,
- b_mask: npt.NDArray[np.bool_] | None = None,
- ) -> npt.NDArray[np.int64]:
- """
- Perform array addition that checks for underflow and overflow.
- Performs the addition of an int64 array and an int64 integer (or array)
- but checks that they do not result in overflow first. For elements that
- are indicated to be NaN, whether or not there is overflow for that element
- is automatically ignored.
- Parameters
- ----------
- arr : np.ndarray[int64] addend.
- b : array or scalar addend.
- arr_mask : np.ndarray[bool] or None, default None
- array indicating which elements to exclude from checking
- b_mask : np.ndarray[bool] or None, default None
- array or scalar indicating which element(s) to exclude from checking
- Returns
- -------
- sum : An array with elements x + b for each element x in arr if b is
- a scalar, or with elements x + y for each element pair
- (x, y) in (arr, b).
- Raises
- ------
- OverflowError if any x + y exceeds the maximum or minimum int64 value.
- """
- # For performance reasons, we broadcast 'b' to the new array 'b2'
- # so that it has the same size as 'arr'.
- b2 = np.broadcast_to(b, arr.shape)
- if b_mask is not None:
- # We do the same broadcasting for b_mask as well.
- b2_mask = np.broadcast_to(b_mask, arr.shape)
- else:
- b2_mask = None
- # For elements that are NaN, regardless of their value, we should
- # ignore whether they overflow or not when doing the checked add.
- if arr_mask is not None and b2_mask is not None:
- not_nan = np.logical_not(arr_mask | b2_mask)
- elif arr_mask is not None:
- not_nan = np.logical_not(arr_mask)
- elif b_mask is not None:
- # error: Argument 1 to "__call__" of "_UFunc_Nin1_Nout1" has
- # incompatible type "Optional[ndarray[Any, dtype[bool_]]]";
- # expected "Union[_SupportsArray[dtype[Any]], _NestedSequence
- # [_SupportsArray[dtype[Any]]], bool, int, float, complex, str
- # , bytes, _NestedSequence[Union[bool, int, float, complex, str
- # , bytes]]]"
- not_nan = np.logical_not(b2_mask) # type: ignore[arg-type]
- else:
- not_nan = np.empty(arr.shape, dtype=bool)
- not_nan.fill(True)
- # gh-14324: For each element in 'arr' and its corresponding element
- # in 'b2', we check the sign of the element in 'b2'. If it is positive,
- # we then check whether its sum with the element in 'arr' exceeds
- # np.iinfo(np.int64).max. If so, we have an overflow error. If it
- # is negative, we then check whether its sum with the element in
- # 'arr' exceeds np.iinfo(np.int64).min. If so, we have an overflow
- # error as well.
- i8max = lib.i8max
- i8min = iNaT
- mask1 = b2 > 0
- mask2 = b2 < 0
- if not mask1.any():
- to_raise = ((i8min - b2 > arr) & not_nan).any()
- elif not mask2.any():
- to_raise = ((i8max - b2 < arr) & not_nan).any()
- else:
- to_raise = ((i8max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() or (
- (i8min - b2[mask2] > arr[mask2]) & not_nan[mask2]
- ).any()
- if to_raise:
- raise OverflowError("Overflow in int64 addition")
- result = arr + b
- if arr_mask is not None or b2_mask is not None:
- np.putmask(result, ~not_nan, iNaT)
- return result
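- # Sketch of the overflow check (illustrative):
- # >>> checked_add_with_arr(np.array([1, 2], dtype="int64"), 1)
- # array([2, 3])
- # >>> checked_add_with_arr(np.array([np.iinfo(np.int64).max], dtype="int64"), 1)
- # Traceback (most recent call last):
- # ...
- # OverflowError: Overflow in int64 addition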
- # ---- #
- # take #
- # ---- #
- def take(
- arr,
- indices: TakeIndexer,
- axis: AxisInt = 0,
- allow_fill: bool = False,
- fill_value=None,
- ):
- """
- Take elements from an array.
- Parameters
- ----------
- arr : array-like or scalar value
- Non array-likes (sequences/scalars without a dtype) are coerced
- to an ndarray.
- indices : sequence of int or one-dimensional np.ndarray of int
- Indices to be taken.
- axis : int, default 0
- The axis over which to select values.
- allow_fill : bool, default False
- How to handle negative values in `indices`.
- * False: negative values in `indices` indicate positional indices
- from the right (the default). This is similar to :func:`numpy.take`.
- * True: negative values in `indices` indicate
- missing values. These values are set to `fill_value`. Any other
- negative values raise a ``ValueError``.
- fill_value : any, optional
- Fill value to use for NA-indices when `allow_fill` is True.
- This may be ``None``, in which case the default NA value for
- the type (``self.dtype.na_value``) is used.
- For multi-dimensional `arr`, each *element* is filled with
- `fill_value`.
- Returns
- -------
- ndarray or ExtensionArray
- Same type as the input.
- Raises
- ------
- IndexError
- When `indices` is out of bounds for the array.
- ValueError
- When the indexer contains negative values other than ``-1``
- and `allow_fill` is True.
- Notes
- -----
- When `allow_fill` is False, `indices` may be whatever dimensionality
- is accepted by NumPy for `arr`.
- When `allow_fill` is True, `indices` should be 1-D.
- See Also
- --------
- numpy.take : Take elements from an array along an axis.
- Examples
- --------
- >>> import pandas as pd
- With the default ``allow_fill=False``, negative numbers indicate
- positional indices from the right.
- >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1])
- array([10, 10, 30])
- Setting ``allow_fill=True`` will place `fill_value` in those positions.
- >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True)
- array([10., 10., nan])
- >>> pd.api.extensions.take(np.array([10, 20, 30]), [0, 0, -1], allow_fill=True,
- ... fill_value=-10)
- array([ 10, 10, -10])
- """
- if not is_array_like(arr):
- arr = np.asarray(arr)
- indices = np.asarray(indices, dtype=np.intp)
- if allow_fill:
- # Pandas style, -1 means NA
- validate_indices(indices, arr.shape[axis])
- result = take_nd(
- arr, indices, axis=axis, allow_fill=True, fill_value=fill_value
- )
- else:
- # NumPy style
- result = arr.take(indices, axis=axis)
- return result
- # ------------ #
- # searchsorted #
- # ------------ #
- def searchsorted(
- arr: ArrayLike,
- value: NumpyValueArrayLike | ExtensionArray,
- side: Literal["left", "right"] = "left",
- sorter: NumpySorter = None,
- ) -> npt.NDArray[np.intp] | np.intp:
- """
- Find indices where elements should be inserted to maintain order.
- Find the indices into a sorted array `arr` (a) such that, if the
- corresponding elements in `value` were inserted before the indices,
- the order of `arr` would be preserved.
- Assuming that `arr` is sorted:
- ====== ================================
- `side` returned index `i` satisfies
- ====== ================================
- left ``arr[i-1] < value <= arr[i]``
- right ``arr[i-1] <= value < arr[i]``
- ====== ================================
- Parameters
- ----------
- arr: np.ndarray, ExtensionArray, Series
- Input array. If `sorter` is None, then it must be sorted in
- ascending order, otherwise `sorter` must be an array of indices
- that sort it.
- value : array-like or scalar
- Values to insert into `arr`.
- side : {'left', 'right'}, optional
- If 'left', the index of the first suitable location found is given.
- If 'right', return the last such index. If there is no suitable
- index, return either 0 or N (where N is the length of `arr`).
- sorter : 1-D array-like, optional
- Optional array of integer indices that sort `arr` into ascending
- order. They are typically the result of argsort.
- Returns
- -------
- array of ints or int
- If value is array-like, array of insertion points.
- If value is scalar, a single integer.
- See Also
- --------
- numpy.searchsorted : Similar method from NumPy.
- """
- if sorter is not None:
- sorter = ensure_platform_int(sorter)
- if (
- isinstance(arr, np.ndarray)
- and is_integer_dtype(arr.dtype)
- and (is_integer(value) or is_integer_dtype(value))
- ):
- # if `arr` and `value` have different dtypes, `arr` would be
- # recast by numpy, causing a slow search.
- # Before searching below, we therefore try to give `value` the
- # same dtype as `arr`, while guarding against integer overflows.
- iinfo = np.iinfo(arr.dtype.type)
- value_arr = np.array([value]) if is_scalar(value) else np.array(value)
- if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all():
- # value within bounds, so no overflow, so can convert value dtype
- # to dtype of arr
- dtype = arr.dtype
- else:
- dtype = value_arr.dtype
- if is_scalar(value):
- # We know that value is int
- value = cast(int, dtype.type(value))
- else:
- value = pd_array(cast(ArrayLike, value), dtype=dtype)
- else:
- # E.g. if `arr` is an array with dtype='datetime64[ns]'
- # and `value` is a pd.Timestamp, we may need to convert value
- arr = ensure_wrapped_if_datetimelike(arr)
- # Argument 1 to "searchsorted" of "ndarray" has incompatible type
- # "Union[NumpyValueArrayLike, ExtensionArray]"; expected "NumpyValueArrayLike"
- return arr.searchsorted(value, side=side, sorter=sorter) # type: ignore[arg-type]
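- # Minimal sketch on a sorted integer array:
- # >>> searchsorted(np.array([1, 2, 3]), 2)
- # 1
- # >>> searchsorted(np.array([1, 2, 3]), 2, side="right")
- # 2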
- # ---- #
- # diff #
- # ---- #
- _diff_special = {"float64", "float32", "int64", "int32", "int16", "int8"}
- def diff(arr, n: int, axis: AxisInt = 0):
- """
- difference of n periods between consecutive elements,
- analogous to s - s.shift(n)
- Parameters
- ----------
- arr : ndarray or ExtensionArray
- n : int
- number of periods
- axis : {0, 1}
- axis to shift on
- Returns
- -------
- shifted
- """
- n = int(n)
- na = np.nan
- dtype = arr.dtype
- is_bool = is_bool_dtype(dtype)
- if is_bool:
- op = operator.xor
- else:
- op = operator.sub
- if isinstance(dtype, PandasDtype):
- # PandasArray cannot necessarily hold shifted versions of itself.
- arr = arr.to_numpy()
- dtype = arr.dtype
- if not isinstance(dtype, np.dtype):
- # i.e ExtensionDtype
- if hasattr(arr, f"__{op.__name__}__"):
- if axis != 0:
- raise ValueError(f"cannot diff {type(arr).__name__} on axis={axis}")
- return op(arr, arr.shift(n))
- else:
- raise TypeError(
- f"{type(arr).__name__} has no 'diff' method. "
- "Convert to a suitable dtype prior to calling 'diff'."
- )
- is_timedelta = False
- if needs_i8_conversion(arr.dtype):
- dtype = np.int64
- arr = arr.view("i8")
- na = iNaT
- is_timedelta = True
- elif is_bool:
- # We have to cast in order to be able to hold np.nan
- dtype = np.object_
- elif is_integer_dtype(dtype):
- # We have to cast in order to be able to hold np.nan
- # int8, int16 are incompatible with float64,
- # see https://github.com/cython/cython/issues/2646
- if arr.dtype.name in ["int8", "int16"]:
- dtype = np.float32
- else:
- dtype = np.float64
- orig_ndim = arr.ndim
- if orig_ndim == 1:
- # reshape so we can always use algos.diff_2d
- arr = arr.reshape(-1, 1)
- # TODO: require axis == 0
- dtype = np.dtype(dtype)
- out_arr = np.empty(arr.shape, dtype=dtype)
- na_indexer = [slice(None)] * 2
- na_indexer[axis] = slice(None, n) if n >= 0 else slice(n, None)
- out_arr[tuple(na_indexer)] = na
- if arr.dtype.name in _diff_special:
- # TODO: can diff_2d dtype specialization troubles be fixed by defining
- # out_arr inside diff_2d?
- algos.diff_2d(arr, out_arr, n, axis, datetimelike=is_timedelta)
- else:
- # To keep mypy happy, _res_indexer is a list while res_indexer is
- # a tuple, ditto for lag_indexer.
- _res_indexer = [slice(None)] * 2
- _res_indexer[axis] = slice(n, None) if n >= 0 else slice(None, n)
- res_indexer = tuple(_res_indexer)
- _lag_indexer = [slice(None)] * 2
- _lag_indexer[axis] = slice(None, -n) if n > 0 else slice(-n, None)
- lag_indexer = tuple(_lag_indexer)
- out_arr[res_indexer] = op(arr[res_indexer], arr[lag_indexer])
- if is_timedelta:
- out_arr = out_arr.view("timedelta64[ns]")
- if orig_ndim == 1:
- out_arr = out_arr[:, 0]
- return out_arr
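- # Illustrative 1-D call (integer input is upcast to float so the leading
- # period can hold NaN):
- # >>> diff(np.array([1, 3, 6]), 1)
- # array([nan,  2.,  3.])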
- # --------------------------------------------------------------------
- # Helper functions
- # Note: safe_sort is in algorithms.py instead of sorting.py because it is
- # low-dependency, is used in this module, and uses private methods from
- # this module.
- def safe_sort(
- values,
- codes=None,
- use_na_sentinel: bool = True,
- assume_unique: bool = False,
- verify: bool = True,
- ) -> AnyArrayLike | tuple[AnyArrayLike, np.ndarray]:
- """
- Sort ``values`` and reorder corresponding ``codes``.
- ``values`` should be unique if ``codes`` is not None.
- Safe for use with mixed types (int, str), orders ints before strs.
- Parameters
- ----------
- values : list-like
- Sequence; must be unique if ``codes`` is not None.
- codes : list-like, optional
- Indices to ``values``. All out of bound indices are treated as
- "not found" and will be masked with ``-1``.
- use_na_sentinel : bool, default True
- If True, the sentinel -1 will be used for NaN values. If False,
- NaN values will be encoded as non-negative integers and NaN will not be
- dropped from the uniques of the values.
- assume_unique : bool, default False
- When True, ``values`` are assumed to be unique, which can speed up
- the calculation. Ignored when ``codes`` is None.
- verify : bool, default True
- Check if codes are out of bound for the values and put out of bound
- codes equal to ``-1``. If ``verify=False``, it is assumed there
- are no out of bound codes. Ignored when ``codes`` is None.
- Returns
- -------
- ordered : AnyArrayLike
- Sorted ``values``
- new_codes : ndarray
- Reordered ``codes``; returned when ``codes`` is not None.
- Raises
- ------
- TypeError
- * If ``values`` is not list-like or if ``codes`` is neither None
- nor list-like
- * If ``values`` cannot be sorted
- ValueError
- * If ``codes`` is not None and ``values`` contain duplicates.
- """
- if not is_list_like(values):
- raise TypeError(
- "Only list-like objects are allowed to be passed to safe_sort as values"
- )
- if not is_array_like(values):
- # don't convert to string types
- dtype, _ = infer_dtype_from_array(values)
- # error: Argument "dtype" to "asarray" has incompatible type "Union[dtype[Any],
- # ExtensionDtype]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
- # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
- # _DTypeDict, Tuple[Any, Any]]]"
- values = np.asarray(values, dtype=dtype) # type: ignore[arg-type]
- sorter = None
- ordered: AnyArrayLike
- if (
- not is_extension_array_dtype(values)
- and lib.infer_dtype(values, skipna=False) == "mixed-integer"
- ):
- ordered = _sort_mixed(values)
- else:
- try:
- sorter = values.argsort()
- ordered = values.take(sorter)
- except TypeError:
- # The previous sorters failed or were not applicable; try `_sort_mixed`,
- # which works in general but fails for the special case of 1d arrays
- # of tuples.
- if values.size and isinstance(values[0], tuple):
- ordered = _sort_tuples(values)
- else:
- ordered = _sort_mixed(values)
- # codes:
- if codes is None:
- return ordered
- if not is_list_like(codes):
- raise TypeError(
- "Only list-like objects or None are allowed to "
- "be passed to safe_sort as codes"
- )
- codes = ensure_platform_int(np.asarray(codes))
- if not assume_unique and not len(unique(values)) == len(values):
- raise ValueError("values should be unique if codes is not None")
- if sorter is None:
- # mixed types
- hash_klass, values = _get_hashtable_algo(values)
- t = hash_klass(len(values))
- t.map_locations(values)
- sorter = ensure_platform_int(t.lookup(ordered))
- if use_na_sentinel:
- # take_nd is faster, but only works for na_sentinels of -1
- order2 = sorter.argsort()
- new_codes = take_nd(order2, codes, fill_value=-1)
- if verify:
- mask = (codes < -len(values)) | (codes >= len(values))
- else:
- mask = None
- else:
- reverse_indexer = np.empty(len(sorter), dtype=np.int_)
- reverse_indexer.put(sorter, np.arange(len(sorter)))
- # Out of bound indices will be masked with `-1` next, so we
- # may deal with them here without performance loss using `mode='wrap'`
- new_codes = reverse_indexer.take(codes, mode="wrap")
- if use_na_sentinel:
- mask = codes == -1
- if verify:
- mask = mask | (codes < -len(values)) | (codes >= len(values))
- if use_na_sentinel and mask is not None:
- np.putmask(new_codes, mask, -1)
- return ordered, ensure_platform_int(new_codes)
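- # Minimal sketch: values are sorted and codes are remapped to point into the
- # sorted order (illustrative reprs):
- # >>> safe_sort(np.array([3, 1, 2]), codes=np.array([0, 1, 2]))
- # (array([1, 2, 3]), array([2, 0, 1]))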
- def _sort_mixed(values) -> AnyArrayLike:
- """order ints before strings before nulls in 1d arrays"""
- str_pos = np.array([isinstance(x, str) for x in values], dtype=bool)
- null_pos = np.array([isna(x) for x in values], dtype=bool)
- num_pos = ~str_pos & ~null_pos
- str_argsort = np.argsort(values[str_pos])
- num_argsort = np.argsort(values[num_pos])
- # convert boolean arrays to positional indices, then order by underlying values
- str_locs = str_pos.nonzero()[0].take(str_argsort)
- num_locs = num_pos.nonzero()[0].take(num_argsort)
- null_locs = null_pos.nonzero()[0]
- locs = np.concatenate([num_locs, str_locs, null_locs])
- return values.take(locs)
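- # Illustrative ordering (ints, then strings, then nulls):
- # >>> _sort_mixed(np.array(["b", 1, None, "a", 2], dtype=object))
- # array([1, 2, 'a', 'b', None], dtype=object)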
- def _sort_tuples(values: np.ndarray) -> np.ndarray:
- """
- Convert array of tuples (1d) to array of arrays (2d).
- We need to keep the columns separately as they contain different types and
- nans (can't use `np.sort` as it may fail when str and nan are mixed in a
- column as types cannot be compared).
- """
- from pandas.core.internals.construction import to_arrays
- from pandas.core.sorting import lexsort_indexer
- arrays, _ = to_arrays(values, None)
- indexer = lexsort_indexer(arrays, orders=True)
- return values[indexer]
- def union_with_duplicates(
- lvals: ArrayLike | Index, rvals: ArrayLike | Index
- ) -> ArrayLike | Index:
- """
- Extracts the union from lvals and rvals with respect to duplicates and nans in
- both arrays.
- Parameters
- ----------
- lvals: np.ndarray or ExtensionArray
- left values, which are ordered in front.
- rvals: np.ndarray or ExtensionArray
- right values ordered after lvals.
- Returns
- -------
- np.ndarray or ExtensionArray
- Containing the unsorted union of both arrays.
- Notes
- -----
- Caller is responsible for ensuring lvals.dtype == rvals.dtype.
- """
- from pandas import Series
- l_count = value_counts(lvals, dropna=False)
- r_count = value_counts(rvals, dropna=False)
- l_count, r_count = l_count.align(r_count, fill_value=0)
- final_count = np.maximum(l_count.values, r_count.values)
- final_count = Series(final_count, index=l_count.index, dtype="int", copy=False)
- if isinstance(lvals, ABCMultiIndex) and isinstance(rvals, ABCMultiIndex):
- unique_vals = lvals.append(rvals).unique()
- else:
- if isinstance(lvals, ABCIndex):
- lvals = lvals._values
- if isinstance(rvals, ABCIndex):
- rvals = rvals._values
- unique_vals = unique(concat_compat([lvals, rvals]))
- unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
- repeats = final_count.reindex(unique_vals).values
- return np.repeat(unique_vals, repeats)
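- # Sketch: each value keeps the maximum multiplicity seen on either side,
- # in order of first appearance (illustrative):
- # >>> union_with_duplicates(np.array([1, 1, 2]), np.array([2, 2, 3]))
- # array([1, 1, 2, 2, 3])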