- cimport cython
- import numpy as np
- cimport numpy as cnp
- from numpy cimport (
- int64_t,
- intp_t,
- ndarray,
- uint8_t,
- uint64_t,
- )
- cnp.import_array()
- from pandas._libs cimport util
- from pandas._libs.hashtable cimport HashTable
- from pandas._libs.tslibs.nattype cimport c_NaT as NaT
- from pandas._libs.tslibs.np_datetime cimport (
- NPY_DATETIMEUNIT,
- get_unit_from_dtype,
- )
- from pandas._libs.tslibs.period cimport is_period_object
- from pandas._libs.tslibs.timedeltas cimport _Timedelta
- from pandas._libs.tslibs.timestamps cimport _Timestamp
- from pandas._libs import (
- algos,
- hashtable as _hash,
- )
- from pandas._libs.lib cimport eq_NA_compat
- from pandas._libs.missing cimport (
- C_NA,
- checknull,
- is_matching_na,
- )
- # Defines shift of MultiIndex codes to avoid negative codes (missing values)
- multiindex_nulls_shift = 2
- cdef bint is_definitely_invalid_key(object val):
- try:
- hash(val)
- except TypeError:
- return True
- return False
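- # Illustrative note (not in the original source): an unhashable key such as
- # a list is "definitely invalid" because hash([1, 2]) raises TypeError, so
- # is_definitely_invalid_key([1, 2]) returns True; hashable but absent keys
- # return False here and are only rejected later by get_loc.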
- cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
- """
- Return an ndarray[bool] of locations where val matches self.values.
- If val is not NA, this is equivalent to `self.values == val`.
- """
- # Caller is responsible for ensuring _check_type has already been called
- cdef:
- ndarray[uint8_t, ndim=1, cast=True] indexer
- Py_ssize_t i
- object item
- if values.descr.type_num == cnp.NPY_OBJECT:
- assert mask is None # no mask for object dtype
- # i.e. values.dtype == object
- if not checknull(val):
- indexer = eq_NA_compat(values, val)
- else:
- # We need to check for _matching_ NA values
- indexer = np.empty(len(values), dtype=np.uint8)
- for i in range(len(values)):
- item = values[i]
- indexer[i] = is_matching_na(item, val)
- else:
- if mask is not None:
- if val is C_NA:
- indexer = mask == 1
- else:
- indexer = (values == val) & ~mask
- else:
- if util.is_nan(val):
- indexer = np.isnan(values)
- else:
- indexer = values == val
- return indexer.view(bool)
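- # Worked example of the NA-matching branch above (illustrative values, not
- # from the source): with values = np.array([np.nan, None, 1.0], dtype=object)
- # and val = np.nan, only position 0 is flagged, because is_matching_na
- # distinguishes np.nan from None (and from NaT or Decimal("NaN")).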
- # Don't populate hash tables in monotonic indexes larger than this
- _SIZE_CUTOFF = 1_000_000
- cdef _unpack_bool_indexer(ndarray[uint8_t, ndim=1, cast=True] indexer, object val):
- """
- Possibly unpack a boolean mask to a single indexer.
- """
- # Returns ndarray[bool] or int
- cdef:
- ndarray[intp_t, ndim=1] found
- int count
- found = np.where(indexer)[0]
- count = len(found)
- if count > 1:
- return indexer
- if count == 1:
- return int(found[0])
- raise KeyError(val)
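- # e.g. (illustrative, not from the source): an indexer with exactly one
- # True unpacks to a single int position, an indexer with several True
- # values is returned as the boolean indexer itself, and an all-False
- # indexer raises KeyError(val).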
- @cython.freelist(32)
- cdef class IndexEngine:
- cdef readonly:
- ndarray values
- ndarray mask
- HashTable mapping
- bint over_size_threshold
- cdef:
- bint unique, monotonic_inc, monotonic_dec
- bint need_monotonic_check, need_unique_check
- object _np_type
- def __init__(self, ndarray values):
- self.values = values
- self.mask = None
- self.over_size_threshold = len(values) >= _SIZE_CUTOFF
- self.clear_mapping()
- self._np_type = values.dtype.type
- def __contains__(self, val: object) -> bool:
- hash(val)
- try:
- self.get_loc(val)
- except KeyError:
- return False
- return True
- cpdef get_loc(self, object val):
- # -> Py_ssize_t | slice | ndarray[bool]
- cdef:
- Py_ssize_t loc
- if is_definitely_invalid_key(val):
- raise TypeError(f"'{val}' is an invalid key")
- val = self._check_type(val)
- if self.over_size_threshold and self.is_monotonic_increasing:
- if not self.is_unique:
- return self._get_loc_duplicates(val)
- values = self.values
- loc = self._searchsorted_left(val)
- if loc >= len(values):
- raise KeyError(val)
- if values[loc] != val:
- raise KeyError(val)
- return loc
- self._ensure_mapping_populated()
- if not self.unique:
- return self._get_loc_duplicates(val)
- if self.mask is not None and val is C_NA:
- return self.mapping.get_na()
- try:
- return self.mapping.get_item(val)
- except OverflowError as err:
- # GH#41775 OverflowError e.g. if we are uint64 and val is -1
- # or if we are int64 and value is np.iinfo(np.int64).max+1
- # (the uint64 with -1 case should actually be excluded by _check_type)
- raise KeyError(val) from err
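- # Sketch of the get_loc return contract (assumed example, not from the
- # source): for monotonic values [1, 2, 2, 3], get_loc(1) -> 0 and
- # get_loc(2) -> slice(1, 3); with non-monotonic duplicates a boolean
- # ndarray is returned instead, and a missing key raises KeyError.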
- cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
- """
- See ObjectEngine._searchsorted_left.__doc__.
- """
- # Caller is responsible for ensuring _check_type has already been called
- loc = self.values.searchsorted(self._np_type(val), side="left")
- return loc
- cdef _get_loc_duplicates(self, object val):
- # -> Py_ssize_t | slice | ndarray[bool]
- cdef:
- Py_ssize_t diff, left, right
- if self.is_monotonic_increasing:
- values = self.values
- try:
- left = values.searchsorted(val, side="left")
- right = values.searchsorted(val, side="right")
- except TypeError:
- # e.g. GH#29189 get_loc(None) with a Float64Index
- # 2021-09-29 Now only reached for object-dtype
- raise KeyError(val)
- diff = right - left
- if diff == 0:
- raise KeyError(val)
- elif diff == 1:
- return left
- else:
- return slice(left, right)
- return self._maybe_get_bool_indexer(val)
- cdef _maybe_get_bool_indexer(self, object val):
- # Returns ndarray[bool] or int
- cdef:
- ndarray[uint8_t, ndim=1, cast=True] indexer
- indexer = _get_bool_indexer(self.values, val, self.mask)
- return _unpack_bool_indexer(indexer, val)
- def sizeof(self, deep: bool = False) -> int:
- """ return the sizeof our mapping """
- if not self.is_mapping_populated:
- return 0
- return self.mapping.sizeof(deep=deep)
- def __sizeof__(self) -> int:
- return self.sizeof()
- @property
- def is_unique(self) -> bool:
- if self.need_unique_check:
- self._do_unique_check()
- return self.unique == 1
- cdef _do_unique_check(self):
- self._ensure_mapping_populated()
- @property
- def is_monotonic_increasing(self) -> bool:
- if self.need_monotonic_check:
- self._do_monotonic_check()
- return self.monotonic_inc == 1
- @property
- def is_monotonic_decreasing(self) -> bool:
- if self.need_monotonic_check:
- self._do_monotonic_check()
- return self.monotonic_dec == 1
- cdef _do_monotonic_check(self):
- cdef:
- bint is_unique
- if self.mask is not None and np.any(self.mask):
- self.monotonic_inc = 0
- self.monotonic_dec = 0
- else:
- try:
- values = self.values
- self.monotonic_inc, self.monotonic_dec, is_unique = \
- self._call_monotonic(values)
- except TypeError:
- self.monotonic_inc = 0
- self.monotonic_dec = 0
- is_unique = 0
- self.need_monotonic_check = 0
- # we can only be sure of uniqueness if is_unique=1
- if is_unique:
- self.unique = 1
- self.need_unique_check = 0
- cdef _call_monotonic(self, values):
- return algos.is_monotonic(values, timelike=False)
- cdef _make_hash_table(self, Py_ssize_t n):
- raise NotImplementedError # pragma: no cover
- cdef _check_type(self, object val):
- hash(val)
- return val
- @property
- def is_mapping_populated(self) -> bool:
- return self.mapping is not None
- cdef _ensure_mapping_populated(self):
- # this populates the mapping
- # if it's not already populated
- # also satisfies the need_unique_check
- if not self.is_mapping_populated:
- values = self.values
- self.mapping = self._make_hash_table(len(values))
- self.mapping.map_locations(values, self.mask)
- if len(self.mapping) == len(values):
- self.unique = 1
- self.need_unique_check = 0
- def clear_mapping(self):
- self.mapping = None
- self.need_monotonic_check = 1
- self.need_unique_check = 1
- self.unique = 0
- self.monotonic_inc = 0
- self.monotonic_dec = 0
- def get_indexer(self, ndarray values) -> np.ndarray:
- self._ensure_mapping_populated()
- return self.mapping.lookup(values)
- def get_indexer_non_unique(self, ndarray targets):
- """
- Return an indexer suitable for taking from a non-unique index.
- Return the labels in the same order as the target, and a missing
- indexer into the targets (which corresponds to the -1 indices in
- the results).
- Returns
- -------
- indexer : np.ndarray[np.intp]
- missing : np.ndarray[np.intp]
- """
- cdef:
- ndarray values
- ndarray[intp_t] result, missing
- set stargets, remaining_stargets, found_nas
- dict d = {}
- object val
- Py_ssize_t count = 0, count_missing = 0
- Py_ssize_t i, j, n, n_t, n_alloc, start, end
- bint check_na_values = False
- values = self.values
- stargets = set(targets)
- n = len(values)
- n_t = len(targets)
- if n > 10_000:
- n_alloc = 10_000
- else:
- n_alloc = n
- result = np.empty(n_alloc, dtype=np.intp)
- missing = np.empty(n_t, dtype=np.intp)
- # map each starget to its position in the index
- if (
- stargets and
- len(stargets) < 5 and
- not any([checknull(t) for t in stargets]) and
- self.is_monotonic_increasing
- ):
- # if there are few enough stargets and the index is monotonically
- # increasing, then use binary search for each starget
- remaining_stargets = set()
- for starget in stargets:
- try:
- start = values.searchsorted(starget, side="left")
- end = values.searchsorted(starget, side="right")
- except TypeError: # e.g. if we tried to search for string in int array
- remaining_stargets.add(starget)
- else:
- if start != end:
- d[starget] = list(range(start, end))
- stargets = remaining_stargets
- if stargets:
- # otherwise, map by iterating through all items in the index
- # short-circuit na check
- if values.dtype == object:
- check_na_values = True
- # keep track of nas in values
- found_nas = set()
- for i in range(n):
- val = values[i]
- # GH#43870
- # handle lookup for nas
- # (i.e. np.nan, float("NaN"), Decimal("NaN"), dt64nat, td64nat)
- if check_na_values and checknull(val):
- match = [na for na in found_nas if is_matching_na(val, na)]
- # matching na not found
- if not len(match):
- found_nas.add(val)
- # add na to stargets to utilize `in` for stargets/d lookup
- match_stargets = [
- x for x in stargets if is_matching_na(val, x)
- ]
- if len(match_stargets):
- # add our 'standardized' na
- stargets.add(val)
- # matching na found
- else:
- assert len(match) == 1
- val = match[0]
- if val in stargets:
- if val not in d:
- d[val] = []
- d[val].append(i)
- for i in range(n_t):
- val = targets[i]
- # ensure there are nas in values before looking for a matching na
- if check_na_values and checknull(val):
- match = [na for na in found_nas if is_matching_na(val, na)]
- if len(match):
- assert len(match) == 1
- val = match[0]
- # found
- if val in d:
- key = val
- for j in d[key]:
- # realloc if needed
- if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
- result[count] = j
- count += 1
- # value not found
- else:
- if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
- result[count] = -1
- count += 1
- missing[count_missing] = i
- count_missing += 1
- return result[0:count], missing[0:count_missing]
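- # Illustrative example (not from the source): values=[1, 2, 2] and
- # targets=[2, 9] give indexer=[1, 2, -1] and missing=[1]: both positions
- # of the duplicated 2 are emitted, then -1 for the absent 9, whose
- # position in `targets` is recorded in `missing`.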
- cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
- # GH#1757 ndarray.searchsorted is not safe to use with array of tuples
- # (treats a tuple `val` as a sequence of keys instead of a single key),
- # so we implement something similar.
- # This is equivalent to the stdlib's bisect.bisect_left
- cdef:
- Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1
- object pval
- if hi == 0 or (hi > 0 and val > values[hi]):
- return len(values)
- while lo < hi:
- mid = (lo + hi) // 2
- pval = values[mid]
- if val < pval:
- hi = mid
- elif val > pval:
- lo = mid + 1
- else:
- while mid > 0 and val == values[mid - 1]:
- mid -= 1
- return mid
- if val <= values[mid]:
- return mid
- else:
- return mid + 1
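- # Why searchsorted is unsafe here (illustrative): for an object ndarray of
- # tuples, arr.searchsorted((1, 2)) treats the tuple as two separate keys
- # and returns an array of two positions, whereas _bin_search treats it as
- # one key, like bisect.bisect_left(list_of_tuples, (1, 2)).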
- cdef class ObjectEngine(IndexEngine):
- """
- Index Engine for use with object-dtype Index, namely the base class Index.
- """
- cdef _make_hash_table(self, Py_ssize_t n):
- return _hash.PyObjectHashTable(n)
- cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
- """
- Using values.searchsorted here would treat a tuple `val` as a sequence
- instead of a single key, so we use a different implementation.
- """
- try:
- loc = _bin_search(self.values, val)
- except TypeError as err:
- raise KeyError(val) from err
- return loc
- cdef class DatetimeEngine(Int64Engine):
- cdef:
- NPY_DATETIMEUNIT _creso
- def __init__(self, ndarray values):
- super().__init__(values.view("i8"))
- self._creso = get_unit_from_dtype(values.dtype)
- cdef int64_t _unbox_scalar(self, scalar) except? -1:
- # NB: caller is responsible for ensuring tzawareness compat
- # before we get here
- if scalar is NaT:
- return NaT._value
- elif isinstance(scalar, _Timestamp):
- if scalar._creso == self._creso:
- return scalar._value
- else:
- # Note: caller is responsible for catching potential ValueError
- # from _as_creso
- return (
- (<_Timestamp>scalar)._as_creso(self._creso, round_ok=False)._value
- )
- raise TypeError(scalar)
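- # e.g. (illustrative): looking up a microsecond-unit Timestamp in a
- # nanosecond-resolution engine converts it via _as_creso(..., round_ok=False),
- # which raises ValueError when the value cannot be represented exactly in
- # the engine's resolution.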
- def __contains__(self, val: object) -> bool:
- # We assume before we get here:
- # - val is hashable
- try:
- self._unbox_scalar(val)
- except ValueError:
- return False
- try:
- self.get_loc(val)
- return True
- except KeyError:
- return False
- cdef _call_monotonic(self, values):
- return algos.is_monotonic(values, timelike=True)
- cpdef get_loc(self, object val):
- # NB: the caller is responsible for ensuring that we are called
- # with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine)
- cdef:
- Py_ssize_t loc
- if is_definitely_invalid_key(val):
- raise TypeError(f"'{val}' is an invalid key")
- try:
- conv = self._unbox_scalar(val)
- except (TypeError, ValueError) as err:
- raise KeyError(val) from err
- # Welcome to the spaghetti factory
- if self.over_size_threshold and self.is_monotonic_increasing:
- if not self.is_unique:
- return self._get_loc_duplicates(conv)
- values = self.values
- loc = values.searchsorted(conv, side="left")
- if loc == len(values) or values[loc] != conv:
- raise KeyError(val)
- return loc
- self._ensure_mapping_populated()
- if not self.unique:
- return self._get_loc_duplicates(conv)
- try:
- return self.mapping.get_item(conv)
- except KeyError:
- raise KeyError(val)
- cdef class TimedeltaEngine(DatetimeEngine):
- cdef int64_t _unbox_scalar(self, scalar) except? -1:
- if scalar is NaT:
- return NaT._value
- elif isinstance(scalar, _Timedelta):
- if scalar._creso == self._creso:
- return scalar._value
- else:
- # Note: caller is responsible for catching potential ValueError
- # from _as_creso
- return (
- (<_Timedelta>scalar)._as_creso(self._creso, round_ok=False)._value
- )
- raise TypeError(scalar)
- cdef class PeriodEngine(Int64Engine):
- cdef int64_t _unbox_scalar(self, scalar) except? -1:
- if scalar is NaT:
- return scalar._value
- if is_period_object(scalar):
- # NB: we assume that we have the correct freq here.
- return scalar.ordinal
- raise TypeError(scalar)
- cpdef get_loc(self, object val):
- # NB: the caller is responsible for ensuring that we are called
- # with either a Period or NaT
- cdef:
- int64_t conv
- try:
- conv = self._unbox_scalar(val)
- except TypeError:
- raise KeyError(val)
- return Int64Engine.get_loc(self, conv)
- cdef _call_monotonic(self, values):
- return algos.is_monotonic(values, timelike=True)
- cdef class BaseMultiIndexCodesEngine:
- """
- Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
- represent each label in a MultiIndex as an integer, by juxtaposing the bits
- encoding each level, with appropriate offsets.
- For instance: if 3 levels have respectively 3, 6 and 1 possible values,
- then their labels can be represented using respectively 2, 3 and 1 bits,
- as follows:
- _ _ _ _____ _ __ __ __
- |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
- — — — ————— — —— —— ——
- |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
- — — — ————— — —— —— ——
- |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
- ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
- and the resulting unsigned integer representation will be:
- _ _ _ _____ _ __ __ __ __ __ __
- |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
- ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾
- Offsets are calculated at initialization, labels are transformed by method
- _codes_to_ints.
- Keys are located by first locating each component against the respective
- level, then locating (the integer representation of) codes.
- """
- def __init__(self, object levels, object labels,
- ndarray[uint64_t, ndim=1] offsets):
- """
- Parameters
- ----------
- levels : list-like of numpy arrays
- Levels of the MultiIndex.
- labels : list-like of numpy arrays of integer dtype
- Labels of the MultiIndex.
- offsets : numpy array of uint64 dtype
- Pre-calculated offsets, one for each level of the index.
- """
- self.levels = levels
- self.offsets = offsets
- # Transform labels into a single array, and add 2 so that we are working
- # with positive integers (-1 for NaN becomes 1). This enables us to
- # differentiate between values that are missing in other and matching
- # NaNs. We will set values that are not found to 0 later:
- labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
- codes = labels_arr.astype("uint64", copy=False)
- self.level_has_nans = [-1 in lab for lab in labels]
- # Map each codes combination in the index to an integer unambiguously
- # (no collisions possible), based on the "offsets", which describe the
- # number of bits to switch labels for each level:
- lab_ints = self._codes_to_ints(codes)
- # Initialize underlying index (e.g. libindex.UInt64Engine) with
- # integers representing labels: we will use its get_loc and get_indexer
- self._base.__init__(self, lab_ints)
- def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
- raise NotImplementedError("Implemented by subclass") # pragma: no cover
- def _extract_level_codes(self, target) -> np.ndarray:
- """
- Map the requested list of (tuple) keys to their integer representations
- for searching in the underlying integer index.
- Parameters
- ----------
- target : MultiIndex
- Returns
- -------
- int_keys : 1-dimensional array of dtype uint64 or object
- Integers representing one combination each
- """
- zt = [target._get_level_values(i) for i in range(target.nlevels)]
- level_codes = []
- for i, (lev, codes) in enumerate(zip(self.levels, zt)):
- result = lev.get_indexer_for(codes) + 1
- result[result > 0] += 1
- if self.level_has_nans[i] and codes.hasnans:
- result[codes.isna()] += 1
- level_codes.append(result)
- return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)
- def get_indexer(self, target: np.ndarray) -> np.ndarray:
- """
- Returns an array giving the positions of each value of `target` in
- `self.values`, where -1 represents a value in `target` which does not
- appear in `self.values`
- Parameters
- ----------
- target : np.ndarray
- Returns
- -------
- np.ndarray[intp_t, ndim=1] of the indexer of `target` into
- `self.values`
- """
- return self._base.get_indexer(self, target)
- def get_indexer_with_fill(self, ndarray target, ndarray values,
- str method, object limit) -> np.ndarray:
- """
- Returns an array giving the positions of each value of `target` in
- `values`, where -1 represents a value in `target` which does not
- appear in `values`
- If `method` is "backfill" then the position for a value in `target`
- which does not appear in `values` is that of the next greater value
- in `values` (if one exists), and -1 if there is no such value.
- Similarly, if the method is "pad" then the position for a value in
- `target` which does not appear in `values` is that of the next smaller
- value in `values` (if one exists), and -1 if there is no such value.
- Parameters
- ----------
- target : ndarray[object] of tuples
- need not be sorted, but all must have the same length, which must be
- the same as the length of all tuples in `values`
- values : ndarray[object] of tuples
- must be sorted and all have the same length. Should be the set of
- the MultiIndex's values.
- method : str
- "backfill" or "pad"
- limit : int or None
- if provided, limit the number of fills to this value
- Returns
- -------
- np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
- filled with the `method` (and optionally `limit`) specified
- """
- assert method in ("backfill", "pad")
- cdef:
- int64_t i, j, next_code
- int64_t num_values, num_target_values
- ndarray[int64_t, ndim=1] target_order
- ndarray[object, ndim=1] target_values
- ndarray[int64_t, ndim=1] new_codes, new_target_codes
- ndarray[intp_t, ndim=1] sorted_indexer
- target_order = np.argsort(target).astype("int64")
- target_values = target[target_order]
- num_values, num_target_values = len(values), len(target_values)
- new_codes, new_target_codes = (
- np.empty((num_values,)).astype("int64"),
- np.empty((num_target_values,)).astype("int64"),
- )
- # `values` and `target_values` are both sorted, so we walk through them
- # and memoize the (ordered) set of indices in the (implicit)
- # merged-and-sorted list of the two which belong to each of them
- # the effect of this is to create a factorization for the (sorted)
- # merger of the index values, where `new_codes` and `new_target_codes`
- # are the subset of the factors which appear in `values` and `target`,
- # respectively
- i, j, next_code = 0, 0, 0
- while i < num_values and j < num_target_values:
- val, target_val = values[i], target_values[j]
- if val <= target_val:
- new_codes[i] = next_code
- i += 1
- if target_val <= val:
- new_target_codes[j] = next_code
- j += 1
- next_code += 1
- # at this point, at least one should have reached the end
- # the remaining values of the other should be added to the end
- assert i == num_values or j == num_target_values
- while i < num_values:
- new_codes[i] = next_code
- i += 1
- next_code += 1
- while j < num_target_values:
- new_target_codes[j] = next_code
- j += 1
- next_code += 1
- # get the indexer, and undo the sorting of `target.values`
- algo = algos.backfill if method == "backfill" else algos.pad
- sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
- return sorted_indexer[np.argsort(target_order)]
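- # Illustrative walk-through (not from the source): values=[(1,), (3,)] and
- # target_values=[(2,)] factorize over the merged order to new_codes=[0, 2]
- # and new_target_codes=[1]; "backfill" then maps the target to position 1
- # (the next greater value) and "pad" maps it to position 0.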
- def get_loc(self, object key):
- if is_definitely_invalid_key(key):
- raise TypeError(f"'{key}' is an invalid key")
- if not isinstance(key, tuple):
- raise KeyError(key)
- try:
- indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift
- for lev, v in zip(self.levels, key)]
- except KeyError:
- raise KeyError(key)
- # Transform indices into single integer:
- lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))
- return self._base.get_loc(self, lab_int)
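- # e.g. (illustrative): key=("a", np.nan) with "a" at position 0 of its
- # level gives indices=[0 + 2, 1] (NaN maps to 1), which _codes_to_ints
- # packs into the single integer looked up in the underlying engine.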
- def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray:
- indexer = self._base.get_indexer_non_unique(self, target)
- return indexer
- def __contains__(self, val: object) -> bool:
- # We assume before we get here:
- # - val is hashable
- # Default __contains__ looks in the underlying mapping, which in this
- # case only contains integer representations.
- try:
- self.get_loc(val)
- return True
- except (KeyError, TypeError, ValueError):
- return False
- # Generated from template.
- include "index_class_helper.pxi"
- cdef class BoolEngine(UInt8Engine):
- cdef _check_type(self, object val):
- if not util.is_bool_object(val):
- raise KeyError(val)
- return <uint8_t>val
- cdef class MaskedBoolEngine(MaskedUInt8Engine):
- cdef _check_type(self, object val):
- if val is C_NA:
- return val
- if not util.is_bool_object(val):
- raise KeyError(val)
- return <uint8_t>val
- @cython.internal
- @cython.freelist(32)
- cdef class SharedEngine:
- cdef readonly:
- object values # ExtensionArray
- bint over_size_threshold
- cdef:
- bint unique, monotonic_inc, monotonic_dec
- bint need_monotonic_check, need_unique_check
- def __contains__(self, val: object) -> bool:
- # We assume before we get here:
- # - val is hashable
- try:
- self.get_loc(val)
- return True
- except KeyError:
- return False
- def clear_mapping(self):
- # for compat with IndexEngine
- pass
- @property
- def is_unique(self) -> bool:
- if self.need_unique_check:
- arr = self.values.unique()
- self.unique = len(arr) == len(self.values)
- self.need_unique_check = False
- return self.unique
- cdef _do_monotonic_check(self):
- raise NotImplementedError
- @property
- def is_monotonic_increasing(self) -> bool:
- if self.need_monotonic_check:
- self._do_monotonic_check()
- return self.monotonic_inc == 1
- @property
- def is_monotonic_decreasing(self) -> bool:
- if self.need_monotonic_check:
- self._do_monotonic_check()
- return self.monotonic_dec == 1
- cdef _call_monotonic(self, values):
- return algos.is_monotonic(values, timelike=False)
- def sizeof(self, deep: bool = False) -> int:
- """ return the sizeof our mapping """
- return 0
- def __sizeof__(self) -> int:
- return self.sizeof()
- cdef _check_type(self, object obj):
- raise NotImplementedError
- cpdef get_loc(self, object val):
- # -> Py_ssize_t | slice | ndarray[bool]
- cdef:
- Py_ssize_t loc
- if is_definitely_invalid_key(val):
- raise TypeError(f"'{val}' is an invalid key")
- self._check_type(val)
- if self.over_size_threshold and self.is_monotonic_increasing:
- if not self.is_unique:
- return self._get_loc_duplicates(val)
- values = self.values
- loc = self._searchsorted_left(val)
- if loc >= len(values):
- raise KeyError(val)
- if values[loc] != val:
- raise KeyError(val)
- return loc
- # both the unique and non-unique cases resolve via _get_loc_duplicates
- return self._get_loc_duplicates(val)
- cdef _get_loc_duplicates(self, object val):
- # -> Py_ssize_t | slice | ndarray[bool]
- cdef:
- Py_ssize_t diff
- if self.is_monotonic_increasing:
- values = self.values
- try:
- left = values.searchsorted(val, side="left")
- right = values.searchsorted(val, side="right")
- except TypeError:
- # e.g. GH#29189 get_loc(None) with a Float64Index
- raise KeyError(val)
- diff = right - left
- if diff == 0:
- raise KeyError(val)
- elif diff == 1:
- return left
- else:
- return slice(left, right)
- return self._maybe_get_bool_indexer(val)
- cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
- """
- See ObjectEngine._searchsorted_left.__doc__.
- """
- try:
- loc = self.values.searchsorted(val, side="left")
- except TypeError as err:
- # GH#35788 e.g. val=None with float64 values
- raise KeyError(val) from err
- return loc
- cdef ndarray _get_bool_indexer(self, val):
- raise NotImplementedError
- cdef _maybe_get_bool_indexer(self, object val):
- # Returns ndarray[bool] or int
- cdef:
- ndarray[uint8_t, ndim=1, cast=True] indexer
- indexer = self._get_bool_indexer(val)
- return _unpack_bool_indexer(indexer, val)
- def get_indexer(self, values) -> np.ndarray:
- # values : type(self.values)
- # Note: we only get here with self.is_unique
- cdef:
- Py_ssize_t i, N = len(values)
- res = np.empty(N, dtype=np.intp)
- for i in range(N):
- val = values[i]
- try:
- loc = self.get_loc(val)
- # Because we are unique, loc should always be an integer
- except KeyError:
- loc = -1
- else:
- assert util.is_integer_object(loc), (loc, val)
- res[i] = loc
- return res
- def get_indexer_non_unique(self, targets):
- """
- Return an indexer suitable for taking from a non-unique index.
- Return the labels in the same order as the target, and a missing
- indexer into the targets (which corresponds to the -1 indices in
- the results).
- Parameters
- ----------
- targets : type(self.values)
- Returns
- -------
- indexer : np.ndarray[np.intp]
- missing : np.ndarray[np.intp]
- """
- cdef:
- Py_ssize_t i, N = len(targets)
- indexer = []
- missing = []
- # See also IntervalIndex.get_indexer_pointwise
- for i in range(N):
- val = targets[i]
- try:
- locs = self.get_loc(val)
- except KeyError:
- locs = np.array([-1], dtype=np.intp)
- missing.append(i)
- else:
- if isinstance(locs, slice):
- # Only needed for get_indexer_non_unique
- locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
- elif util.is_integer_object(locs):
- locs = np.array([locs], dtype=np.intp)
- else:
- assert locs.dtype.kind == "b"
- locs = locs.nonzero()[0]
- indexer.append(locs)
- try:
- indexer = np.concatenate(indexer, dtype=np.intp)
- except TypeError:
- # numpy<1.20 doesn't accept dtype keyword
- indexer = np.concatenate(indexer).astype(np.intp, copy=False)
- missing = np.array(missing, dtype=np.intp)
- return indexer, missing
- cdef class ExtensionEngine(SharedEngine):
- def __init__(self, values: "ExtensionArray"):
- self.values = values
- self.over_size_threshold = len(values) >= _SIZE_CUTOFF
- self.need_monotonic_check = True
- self.need_unique_check = True
- cdef _do_monotonic_check(self):
- cdef:
- bint is_unique
- values = self.values
- if values._hasna:
- self.monotonic_inc = 0
- self.monotonic_dec = 0
- nunique = len(values.unique())
- self.unique = nunique == len(values)
- self.need_unique_check = 0
- return
- try:
- ranks = values._rank()
- except TypeError:
- self.monotonic_inc = 0
- self.monotonic_dec = 0
- is_unique = 0
- else:
- self.monotonic_inc, self.monotonic_dec, is_unique = \
- self._call_monotonic(ranks)
- self.need_monotonic_check = 0
- # we can only be sure of uniqueness if is_unique=1
- if is_unique:
- self.unique = 1
- self.need_unique_check = 0
- cdef ndarray _get_bool_indexer(self, val):
- if checknull(val):
- return self.values.isna()
- try:
- return self.values == val
- except TypeError:
- # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool]
- try:
- return (self.values == val).to_numpy(dtype=bool, na_value=False)
- except (TypeError, AttributeError) as err:
- # e.g. (self.values == val) returned a bool
- # see test_get_loc_generator[string[pyarrow]]
- # e.g. self.value == val raises TypeError bc generator has no len
- # see test_get_loc_generator[string[python]]
- raise KeyError(val) from err
- cdef _check_type(self, object val):
- hash(val)
- cdef class MaskedIndexEngine(IndexEngine):
- def __init__(self, object values):
- super().__init__(self._get_data(values))
- self.mask = self._get_mask(values)
- def _get_data(self, object values) -> np.ndarray:
- if hasattr(values, "_mask"):
- return values._data
- # We are an ArrowExtensionArray
- # Set 1 as na_value to avoid ending up with NA and an object array
- # TODO: Remove when arrow engine is implemented
- return values.to_numpy(na_value=1, dtype=values.dtype.numpy_dtype)
- def _get_mask(self, object values) -> np.ndarray:
- if hasattr(values, "_mask"):
- return values._mask
- # We are an ArrowExtensionArray
- return values.isna()
- def get_indexer(self, object values) -> np.ndarray:
- self._ensure_mapping_populated()
- return self.mapping.lookup(self._get_data(values), self._get_mask(values))
- def get_indexer_non_unique(self, object targets):
- """
- Return an indexer suitable for taking from a non-unique index.
- Return the labels in the same order as the target, and a missing
- indexer into the targets (which corresponds to the -1 indices in
- the results).
- Returns
- -------
- indexer : np.ndarray[np.intp]
- missing : np.ndarray[np.intp]
- """
- # TODO: Unify with parent class
- cdef:
- ndarray values, mask, target_vals, target_mask
- ndarray[intp_t] result, missing
- set stargets
- list na_pos
- dict d = {}
- object val
- Py_ssize_t count = 0, count_missing = 0
- Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx
- target_vals = self._get_data(targets)
- target_mask = self._get_mask(targets)
- values = self.values
- assert not values.dtype == object # go through object path instead
- mask = self.mask
- stargets = set(target_vals[~target_mask])
- n = len(values)
- n_t = len(target_vals)
- if n > 10_000:
- n_alloc = 10_000
- else:
- n_alloc = n
- result = np.empty(n_alloc, dtype=np.intp)
- missing = np.empty(n_t, dtype=np.intp)
- # map each starget to its position in the index
- if (
- stargets and
- len(stargets) < 5 and
- not np.any(target_mask) and
- self.is_monotonic_increasing
- ):
- # if there are few enough stargets and the index is monotonically
- # increasing, then use binary search for each starget
- for starget in stargets:
- start = values.searchsorted(starget, side="left")
- end = values.searchsorted(starget, side="right")
- if start != end:
- d[starget] = list(range(start, end))
- stargets = set()
- if stargets:
- # otherwise, map by iterating through all items in the index
- na_pos = []
- for i in range(n):
- val = values[i]
- if mask[i]:
- na_pos.append(i)
- else:
- if val in stargets:
- if val not in d:
- d[val] = []
- d[val].append(i)
- for i in range(n_t):
- val = target_vals[i]
- if target_mask[i]:
- if na_pos:
- for na_idx in na_pos:
- # realloc if needed
- if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
- result[count] = na_idx
- count += 1
- continue
- elif val in d:
- # found
- key = val
- for j in d[key]:
- # realloc if needed
- if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
- result[count] = j
- count += 1
- continue
- # value not found
- if count >= n_alloc:
- n_alloc += 10_000
- result = np.resize(result, n_alloc)
- result[count] = -1
- count += 1
- missing[count_missing] = i
- count_missing += 1
- return result[0:count], missing[0:count_missing]
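- # Illustrative example of the masked path above (not from the source): for
- # masked values [1, NA, 2] and targets [NA, 3], every stored NA position is
- # emitted for the NA target, then -1 for the absent 3, giving
- # indexer=[1, -1] and missing=[1].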
|