# index.pyx
  1. cimport cython
  2. import numpy as np
  3. cimport numpy as cnp
  4. from numpy cimport (
  5. int64_t,
  6. intp_t,
  7. ndarray,
  8. uint8_t,
  9. uint64_t,
  10. )
  11. cnp.import_array()
  12. from pandas._libs cimport util
  13. from pandas._libs.hashtable cimport HashTable
  14. from pandas._libs.tslibs.nattype cimport c_NaT as NaT
  15. from pandas._libs.tslibs.np_datetime cimport (
  16. NPY_DATETIMEUNIT,
  17. get_unit_from_dtype,
  18. )
  19. from pandas._libs.tslibs.period cimport is_period_object
  20. from pandas._libs.tslibs.timedeltas cimport _Timedelta
  21. from pandas._libs.tslibs.timestamps cimport _Timestamp
  22. from pandas._libs import (
  23. algos,
  24. hashtable as _hash,
  25. )
  26. from pandas._libs.lib cimport eq_NA_compat
  27. from pandas._libs.missing cimport (
  28. C_NA,
  29. checknull,
  30. is_matching_na,
  31. )
  32. # Defines shift of MultiIndex codes to avoid negative codes (missing values)
  33. multiindex_nulls_shift = 2
  34. cdef bint is_definitely_invalid_key(object val):
  35. try:
  36. hash(val)
  37. except TypeError:
  38. return True
  39. return False
cdef ndarray _get_bool_indexer(ndarray values, object val, ndarray mask = None):
    """
    Return a ndarray[bool] of locations where val matches self.values.

    If val is not NA, this is equivalent to `self.values == val`

    Parameters
    ----------
    values : ndarray
    val : object
        The key being looked up.
    mask : ndarray or None, default None
        NA mask for masked (nullable) dtypes; must be None for object dtype.
    """
    # Caller is responsible for ensuring _check_type has already been called
    cdef:
        ndarray[uint8_t, ndim=1, cast=True] indexer
        Py_ssize_t i
        object item
    if values.descr.type_num == cnp.NPY_OBJECT:
        assert mask is None  # no mask for object dtype
        # i.e. values.dtype == object
        if not checknull(val):
            indexer = eq_NA_compat(values, val)
        else:
            # We need to check for _matching_ NA values, so a plain
            #  equality check is not enough; compare element-wise.
            indexer = np.empty(len(values), dtype=np.uint8)
            for i in range(len(values)):
                item = values[i]
                indexer[i] = is_matching_na(item, val)
    else:
        if mask is not None:
            # masked (nullable) dtype: NA positions live in the mask
            if val is C_NA:
                indexer = mask == 1
            else:
                # exclude masked-out positions from equality matches
                indexer = (values == val) & ~mask
        else:
            if util.is_nan(val):
                # NaN != NaN, so use isnan rather than ==
                indexer = np.isnan(values)
            else:
                indexer = values == val
    return indexer.view(bool)
# Don't populate hash tables in monotonic indexes larger than this;
#  above this size, get_loc falls back to binary search on the sorted
#  values instead of building a hash mapping.
_SIZE_CUTOFF = 1_000_000
  75. cdef _unpack_bool_indexer(ndarray[uint8_t, ndim=1, cast=True] indexer, object val):
  76. """
  77. Possibly unpack a boolean mask to a single indexer.
  78. """
  79. # Returns ndarray[bool] or int
  80. cdef:
  81. ndarray[intp_t, ndim=1] found
  82. int count
  83. found = np.where(indexer)[0]
  84. count = len(found)
  85. if count > 1:
  86. return indexer
  87. if count == 1:
  88. return int(found[0])
  89. raise KeyError(val)
  90. @cython.freelist(32)
  91. cdef class IndexEngine:
  92. cdef readonly:
  93. ndarray values
  94. ndarray mask
  95. HashTable mapping
  96. bint over_size_threshold
  97. cdef:
  98. bint unique, monotonic_inc, monotonic_dec
  99. bint need_monotonic_check, need_unique_check
  100. object _np_type
  101. def __init__(self, ndarray values):
  102. self.values = values
  103. self.mask = None
  104. self.over_size_threshold = len(values) >= _SIZE_CUTOFF
  105. self.clear_mapping()
  106. self._np_type = values.dtype.type
  107. def __contains__(self, val: object) -> bool:
  108. hash(val)
  109. try:
  110. self.get_loc(val)
  111. except KeyError:
  112. return False
  113. return True
  114. cpdef get_loc(self, object val):
  115. # -> Py_ssize_t | slice | ndarray[bool]
  116. cdef:
  117. Py_ssize_t loc
  118. if is_definitely_invalid_key(val):
  119. raise TypeError(f"'{val}' is an invalid key")
  120. val = self._check_type(val)
  121. if self.over_size_threshold and self.is_monotonic_increasing:
  122. if not self.is_unique:
  123. return self._get_loc_duplicates(val)
  124. values = self.values
  125. loc = self._searchsorted_left(val)
  126. if loc >= len(values):
  127. raise KeyError(val)
  128. if values[loc] != val:
  129. raise KeyError(val)
  130. return loc
  131. self._ensure_mapping_populated()
  132. if not self.unique:
  133. return self._get_loc_duplicates(val)
  134. if self.mask is not None and val is C_NA:
  135. return self.mapping.get_na()
  136. try:
  137. return self.mapping.get_item(val)
  138. except OverflowError as err:
  139. # GH#41775 OverflowError e.g. if we are uint64 and val is -1
  140. # or if we are int64 and value is np.iinfo(np.int64).max+1
  141. # (the uint64 with -1 case should actually be excluded by _check_type)
  142. raise KeyError(val) from err
  143. cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
  144. """
  145. See ObjectEngine._searchsorted_left.__doc__.
  146. """
  147. # Caller is responsible for ensuring _check_type has already been called
  148. loc = self.values.searchsorted(self._np_type(val), side="left")
  149. return loc
  150. cdef _get_loc_duplicates(self, object val):
  151. # -> Py_ssize_t | slice | ndarray[bool]
  152. cdef:
  153. Py_ssize_t diff, left, right
  154. if self.is_monotonic_increasing:
  155. values = self.values
  156. try:
  157. left = values.searchsorted(val, side="left")
  158. right = values.searchsorted(val, side="right")
  159. except TypeError:
  160. # e.g. GH#29189 get_loc(None) with a Float64Index
  161. # 2021-09-29 Now only reached for object-dtype
  162. raise KeyError(val)
  163. diff = right - left
  164. if diff == 0:
  165. raise KeyError(val)
  166. elif diff == 1:
  167. return left
  168. else:
  169. return slice(left, right)
  170. return self._maybe_get_bool_indexer(val)
  171. cdef _maybe_get_bool_indexer(self, object val):
  172. # Returns ndarray[bool] or int
  173. cdef:
  174. ndarray[uint8_t, ndim=1, cast=True] indexer
  175. indexer = _get_bool_indexer(self.values, val, self.mask)
  176. return _unpack_bool_indexer(indexer, val)
  177. def sizeof(self, deep: bool = False) -> int:
  178. """ return the sizeof our mapping """
  179. if not self.is_mapping_populated:
  180. return 0
  181. return self.mapping.sizeof(deep=deep)
  182. def __sizeof__(self) -> int:
  183. return self.sizeof()
  184. @property
  185. def is_unique(self) -> bool:
  186. if self.need_unique_check:
  187. self._do_unique_check()
  188. return self.unique == 1
  189. cdef _do_unique_check(self):
  190. self._ensure_mapping_populated()
  191. @property
  192. def is_monotonic_increasing(self) -> bool:
  193. if self.need_monotonic_check:
  194. self._do_monotonic_check()
  195. return self.monotonic_inc == 1
  196. @property
  197. def is_monotonic_decreasing(self) -> bool:
  198. if self.need_monotonic_check:
  199. self._do_monotonic_check()
  200. return self.monotonic_dec == 1
  201. cdef _do_monotonic_check(self):
  202. cdef:
  203. bint is_unique
  204. if self.mask is not None and np.any(self.mask):
  205. self.monotonic_inc = 0
  206. self.monotonic_dec = 0
  207. else:
  208. try:
  209. values = self.values
  210. self.monotonic_inc, self.monotonic_dec, is_unique = \
  211. self._call_monotonic(values)
  212. except TypeError:
  213. self.monotonic_inc = 0
  214. self.monotonic_dec = 0
  215. is_unique = 0
  216. self.need_monotonic_check = 0
  217. # we can only be sure of uniqueness if is_unique=1
  218. if is_unique:
  219. self.unique = 1
  220. self.need_unique_check = 0
  221. cdef _call_monotonic(self, values):
  222. return algos.is_monotonic(values, timelike=False)
  223. cdef _make_hash_table(self, Py_ssize_t n):
  224. raise NotImplementedError # pragma: no cover
  225. cdef _check_type(self, object val):
  226. hash(val)
  227. return val
  228. @property
  229. def is_mapping_populated(self) -> bool:
  230. return self.mapping is not None
  231. cdef _ensure_mapping_populated(self):
  232. # this populates the mapping
  233. # if its not already populated
  234. # also satisfies the need_unique_check
  235. if not self.is_mapping_populated:
  236. values = self.values
  237. self.mapping = self._make_hash_table(len(values))
  238. self.mapping.map_locations(values, self.mask)
  239. if len(self.mapping) == len(values):
  240. self.unique = 1
  241. self.need_unique_check = 0
  242. def clear_mapping(self):
  243. self.mapping = None
  244. self.need_monotonic_check = 1
  245. self.need_unique_check = 1
  246. self.unique = 0
  247. self.monotonic_inc = 0
  248. self.monotonic_dec = 0
  249. def get_indexer(self, ndarray values) -> np.ndarray:
  250. self._ensure_mapping_populated()
  251. return self.mapping.lookup(values)
  252. def get_indexer_non_unique(self, ndarray targets):
  253. """
  254. Return an indexer suitable for taking from a non unique index
  255. return the labels in the same order as the target
  256. and a missing indexer into the targets (which correspond
  257. to the -1 indices in the results
  258. Returns
  259. -------
  260. indexer : np.ndarray[np.intp]
  261. missing : np.ndarray[np.intp]
  262. """
  263. cdef:
  264. ndarray values
  265. ndarray[intp_t] result, missing
  266. set stargets, remaining_stargets, found_nas
  267. dict d = {}
  268. object val
  269. Py_ssize_t count = 0, count_missing = 0
  270. Py_ssize_t i, j, n, n_t, n_alloc, start, end
  271. bint check_na_values = False
  272. values = self.values
  273. stargets = set(targets)
  274. n = len(values)
  275. n_t = len(targets)
  276. if n > 10_000:
  277. n_alloc = 10_000
  278. else:
  279. n_alloc = n
  280. result = np.empty(n_alloc, dtype=np.intp)
  281. missing = np.empty(n_t, dtype=np.intp)
  282. # map each starget to its position in the index
  283. if (
  284. stargets and
  285. len(stargets) < 5 and
  286. not any([checknull(t) for t in stargets]) and
  287. self.is_monotonic_increasing
  288. ):
  289. # if there are few enough stargets and the index is monotonically
  290. # increasing, then use binary search for each starget
  291. remaining_stargets = set()
  292. for starget in stargets:
  293. try:
  294. start = values.searchsorted(starget, side="left")
  295. end = values.searchsorted(starget, side="right")
  296. except TypeError: # e.g. if we tried to search for string in int array
  297. remaining_stargets.add(starget)
  298. else:
  299. if start != end:
  300. d[starget] = list(range(start, end))
  301. stargets = remaining_stargets
  302. if stargets:
  303. # otherwise, map by iterating through all items in the index
  304. # short-circuit na check
  305. if values.dtype == object:
  306. check_na_values = True
  307. # keep track of nas in values
  308. found_nas = set()
  309. for i in range(n):
  310. val = values[i]
  311. # GH#43870
  312. # handle lookup for nas
  313. # (ie. np.nan, float("NaN"), Decimal("NaN"), dt64nat, td64nat)
  314. if check_na_values and checknull(val):
  315. match = [na for na in found_nas if is_matching_na(val, na)]
  316. # matching na not found
  317. if not len(match):
  318. found_nas.add(val)
  319. # add na to stargets to utilize `in` for stargets/d lookup
  320. match_stargets = [
  321. x for x in stargets if is_matching_na(val, x)
  322. ]
  323. if len(match_stargets):
  324. # add our 'standardized' na
  325. stargets.add(val)
  326. # matching na found
  327. else:
  328. assert len(match) == 1
  329. val = match[0]
  330. if val in stargets:
  331. if val not in d:
  332. d[val] = []
  333. d[val].append(i)
  334. for i in range(n_t):
  335. val = targets[i]
  336. # ensure there are nas in values before looking for a matching na
  337. if check_na_values and checknull(val):
  338. match = [na for na in found_nas if is_matching_na(val, na)]
  339. if len(match):
  340. assert len(match) == 1
  341. val = match[0]
  342. # found
  343. if val in d:
  344. key = val
  345. for j in d[key]:
  346. # realloc if needed
  347. if count >= n_alloc:
  348. n_alloc += 10_000
  349. result = np.resize(result, n_alloc)
  350. result[count] = j
  351. count += 1
  352. # value not found
  353. else:
  354. if count >= n_alloc:
  355. n_alloc += 10_000
  356. result = np.resize(result, n_alloc)
  357. result[count] = -1
  358. count += 1
  359. missing[count_missing] = i
  360. count_missing += 1
  361. return result[0:count], missing[0:count_missing]
cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
    # GH#1757 ndarray.searchsorted is not safe to use with array of tuples
    #  (treats a tuple `val` as a sequence of keys instead of a single key),
    #  so we implement something similar.
    # This is equivalent to the stdlib's bisect.bisect_left
    cdef:
        Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1
        object pval

    # NOTE(review): when len(values) == 1 (hi == 0) this returns len(values)
    #  even if val == values[0]. Callers appear to reach this only for very
    #  large arrays (see _SIZE_CUTOFF usage in get_loc), so the one-element
    #  case looks unreachable in practice -- confirm before relying on it.
    if hi == 0 or (hi > 0 and val > values[hi]):
        # val is beyond the last element -> insertion point is the end
        return len(values)

    while lo < hi:
        mid = (lo + hi) // 2
        pval = values[mid]
        if val < pval:
            hi = mid
        elif val > pval:
            lo = mid + 1
        else:
            # exact hit: walk left to the first occurrence (bisect_left)
            while mid > 0 and val == values[mid - 1]:
                mid -= 1
            return mid

    # loop ended without an exact hit; return the insertion point
    if val <= values[mid]:
        return mid
    else:
        return mid + 1
cdef class ObjectEngine(IndexEngine):
    """
    Index Engine for use with object-dtype Index, namely the base class Index.
    """
    cdef _make_hash_table(self, Py_ssize_t n):
        return _hash.PyObjectHashTable(n)

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        # using values.searchsorted here would treat a tuple `val` as a sequence
        #  instead of a single key, so we use a different implementation
        try:
            loc = _bin_search(self.values, val)
        except TypeError as err:
            # unorderable comparison against the stored values -> missing key
            raise KeyError(val) from err
        return loc
cdef class DatetimeEngine(Int64Engine):
    """
    Engine for datetime64 indexes: operates on the integer (i8) view of the
    values, unboxing Timestamp/NaT keys to i8 before delegating lookups.
    """
    cdef:
        NPY_DATETIMEUNIT _creso  # resolution (unit) of the original dtype

    def __init__(self, ndarray values):
        # store the i8 view; all lookups are done in integer space
        super().__init__(values.view("i8"))
        self._creso = get_unit_from_dtype(values.dtype)

    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        # NB: caller is responsible for ensuring tzawareness compat
        #  before we get here
        if scalar is NaT:
            return NaT._value
        elif isinstance(scalar, _Timestamp):
            if scalar._creso == self._creso:
                return scalar._value
            else:
                # Note: caller is responsible for catching potential ValueError
                #  from _as_creso
                return (
                    (<_Timestamp>scalar)._as_creso(self._creso, round_ok=False)._value
                )
        raise TypeError(scalar)

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        try:
            self._unbox_scalar(val)
        except ValueError:
            # e.g. resolution conversion failed -> cannot be present
            return False

        try:
            self.get_loc(val)
            return True
        except KeyError:
            return False

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=True)

    cpdef get_loc(self, object val):
        # NB: the caller is responsible for ensuring that we are called
        #  with either a Timestamp or NaT (Timedelta or NaT for TimedeltaEngine)
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        try:
            conv = self._unbox_scalar(val)
        except (TypeError, ValueError) as err:
            raise KeyError(val) from err

        # Welcome to the spaghetti factory
        if self.over_size_threshold and self.is_monotonic_increasing:
            if not self.is_unique:
                return self._get_loc_duplicates(conv)
            values = self.values

            loc = values.searchsorted(conv, side="left")

            if loc == len(values) or values[loc] != conv:
                raise KeyError(val)
            return loc

        self._ensure_mapping_populated()
        if not self.unique:
            return self._get_loc_duplicates(conv)

        try:
            return self.mapping.get_item(conv)
        except KeyError:
            # re-raise with the original (un-unboxed) key for a clearer message
            raise KeyError(val)
cdef class TimedeltaEngine(DatetimeEngine):
    """
    Engine for timedelta64 indexes; same i8-based lookup as DatetimeEngine,
    but unboxes Timedelta (rather than Timestamp) keys.
    """
    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        if scalar is NaT:
            return NaT._value
        elif isinstance(scalar, _Timedelta):
            if scalar._creso == self._creso:
                return scalar._value
            else:
                # Note: caller is responsible for catching potential ValueError
                #  from _as_creso
                return (
                    (<_Timedelta>scalar)._as_creso(self._creso, round_ok=False)._value
                )
        raise TypeError(scalar)
cdef class PeriodEngine(Int64Engine):
    """
    Engine for Period indexes: unboxes Period/NaT keys to their int64
    ordinal before delegating to the Int64Engine lookup.
    """
    cdef int64_t _unbox_scalar(self, scalar) except? -1:
        if scalar is NaT:
            return scalar._value
        if is_period_object(scalar):
            # NB: we assume that we have the correct freq here.
            return scalar.ordinal
        raise TypeError(scalar)

    cpdef get_loc(self, object val):
        # NB: the caller is responsible for ensuring that we are called
        #  with either a Period or NaT
        cdef:
            int64_t conv

        try:
            conv = self._unbox_scalar(val)
        except TypeError:
            raise KeyError(val)

        return Int64Engine.get_loc(self, conv)

    cdef _call_monotonic(self, values):
        return algos.is_monotonic(values, timelike=True)
cdef class BaseMultiIndexCodesEngine:
    """
    Base class for MultiIndexUIntEngine and MultiIndexPyIntEngine, which
    represent each label in a MultiIndex as an integer, by juxtaposing the bits
    encoding each level, with appropriate offsets.

    For instance: if 3 levels have respectively 3, 6 and 1 possible values,
    then their labels can be represented using respectively 2, 3 and 1 bits,
    as follows:
     _ _ _ _____ _ __ __ __
    |0|0|0| ... |0| 0|a1|a0| -> offset 0 (first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0|b2|b1|b0| -> offset 2 (bits required for first level)
     — — — ————— — —— —— ——
    |0|0|0| ... |0| 0| 0|c0| -> offset 5 (bits required for first two levels)
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾
    and the resulting unsigned integer representation will be:
     _ _ _ _____ _ __ __ __ __ __ __
    |0|0|0| ... |0|c0|b2|b1|b0|a1|a0|
     ‾ ‾ ‾ ‾‾‾‾‾ ‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾ ‾‾

    Offsets are calculated at initialization, labels are transformed by method
    _codes_to_ints.

    Keys are located by first locating each component against the respective
    level, then locating (the integer representation of) codes.
    """
    def __init__(self, object levels, object labels,
                 ndarray[uint64_t, ndim=1] offsets):
        """
        Parameters
        ----------
        levels : list-like of numpy arrays
            Levels of the MultiIndex.
        labels : list-like of numpy arrays of integer dtype
            Labels of the MultiIndex.
        offsets : numpy array of uint64 dtype
            Pre-calculated offsets, one for each level of the index.
        """
        self.levels = levels
        self.offsets = offsets

        # Transform labels in a single array, and add 2 so that we are working
        #  with positive integers (-1 for NaN becomes 1). This enables us to
        #  differentiate between values that are missing in other and matching
        #  NaNs. We will set values that are not found to 0 later:
        labels_arr = np.array(labels, dtype="int64").T + multiindex_nulls_shift
        codes = labels_arr.astype("uint64", copy=False)
        self.level_has_nans = [-1 in lab for lab in labels]

        # Map each codes combination in the index to an integer unambiguously
        # (no collisions possible), based on the "offsets", which describe the
        # number of bits to switch labels for each level:
        lab_ints = self._codes_to_ints(codes)

        # Initialize underlying index (e.g. libindex.UInt64Engine) with
        # integers representing labels: we will use its get_loc and get_indexer
        self._base.__init__(self, lab_ints)

    def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
        # Bit-packing strategy differs per subclass (uint64 vs Python ints)
        raise NotImplementedError("Implemented by subclass")  # pragma: no cover

    def _extract_level_codes(self, target) -> np.ndarray:
        """
        Map the requested list of (tuple) keys to their integer representations
        for searching in the underlying integer index.

        Parameters
        ----------
        target : MultiIndex

        Returns
        -------
        int_keys : 1-dimensional array of dtype uint64 or object
            Integers representing one combination each
        """
        zt = [target._get_level_values(i) for i in range(target.nlevels)]
        level_codes = []
        for i, (lev, codes) in enumerate(zip(self.levels, zt)):
            # shift so that "not found" (-1) becomes 0 and found values are >= 2,
            #  mirroring the multiindex_nulls_shift applied in __init__
            result = lev.get_indexer_for(codes) + 1
            result[result > 0] += 1
            if self.level_has_nans[i] and codes.hasnans:
                # NA targets map to code 1 (the shifted -1 slot)
                result[codes.isna()] += 1
            level_codes.append(result)
        return self._codes_to_ints(np.array(level_codes, dtype="uint64").T)

    def get_indexer(self, target: np.ndarray) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `self.values`, where -1 represents a value in `target` which does not
        appear in `self.values`

        Parameters
        ----------
        target : np.ndarray

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into
        `self.values`
        """
        return self._base.get_indexer(self, target)

    def get_indexer_with_fill(self, ndarray target, ndarray values,
                              str method, object limit) -> np.ndarray:
        """
        Returns an array giving the positions of each value of `target` in
        `values`, where -1 represents a value in `target` which does not
        appear in `values`

        If `method` is "backfill" then the position for a value in `target`
        which does not appear in `values` is that of the next greater value
        in `values` (if one exists), and -1 if there is no such value.

        Similarly, if the method is "pad" then the position for a value in
        `target` which does not appear in `values` is that of the next smaller
        value in `values` (if one exists), and -1 if there is no such value.

        Parameters
        ----------
        target: ndarray[object] of tuples
            need not be sorted, but all must have the same length, which must be
            the same as the length of all tuples in `values`
        values : ndarray[object] of tuples
            must be sorted and all have the same length. Should be the set of
            the MultiIndex's values.
        method: string
            "backfill" or "pad"
        limit: int or None
            if provided, limit the number of fills to this value

        Returns
        -------
        np.ndarray[intp_t, ndim=1] of the indexer of `target` into `values`,
        filled with the `method` (and optionally `limit`) specified
        """
        assert method in ("backfill", "pad")
        cdef:
            int64_t i, j, next_code
            int64_t num_values, num_target_values
            ndarray[int64_t, ndim=1] target_order
            ndarray[object, ndim=1] target_values
            ndarray[int64_t, ndim=1] new_codes, new_target_codes
            ndarray[intp_t, ndim=1] sorted_indexer

        target_order = np.argsort(target).astype("int64")
        target_values = target[target_order]
        num_values, num_target_values = len(values), len(target_values)
        new_codes, new_target_codes = (
            np.empty((num_values,)).astype("int64"),
            np.empty((num_target_values,)).astype("int64"),
        )

        # `values` and `target_values` are both sorted, so we walk through them
        # and memoize the (ordered) set of indices in the (implicit) merged-and
        # sorted list of the two which belong to each of them
        # the effect of this is to create a factorization for the (sorted)
        # merger of the index values, where `new_codes` and `new_target_codes`
        # are the subset of the factors which appear in `values` and `target`,
        # respectively
        i, j, next_code = 0, 0, 0
        while i < num_values and j < num_target_values:
            val, target_val = values[i], target_values[j]
            if val <= target_val:
                new_codes[i] = next_code
                i += 1
            if target_val <= val:
                new_target_codes[j] = next_code
                j += 1
            next_code += 1

        # at this point, at least one should have reached the end
        # the remaining values of the other should be added to the end
        assert i == num_values or j == num_target_values
        while i < num_values:
            new_codes[i] = next_code
            i += 1
            next_code += 1
        while j < num_target_values:
            new_target_codes[j] = next_code
            j += 1
            next_code += 1

        # get the indexer, and undo the sorting of `target.values`
        algo = algos.backfill if method == "backfill" else algos.pad
        sorted_indexer = algo(new_codes, new_target_codes, limit=limit)
        return sorted_indexer[np.argsort(target_order)]

    def get_loc(self, object key):
        if is_definitely_invalid_key(key):
            raise TypeError(f"'{key}' is an invalid key")
        if not isinstance(key, tuple):
            raise KeyError(key)
        try:
            # NA components map to code 1; others use the level's position
            #  shifted by multiindex_nulls_shift (see __init__)
            indices = [1 if checknull(v) else lev.get_loc(v) + multiindex_nulls_shift
                       for lev, v in zip(self.levels, key)]
        except KeyError:
            raise KeyError(key)

        # Transform indices into single integer:
        lab_int = self._codes_to_ints(np.array(indices, dtype="uint64"))

        return self._base.get_loc(self, lab_int)

    def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray:
        indexer = self._base.get_indexer_non_unique(self, target)

        return indexer

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        # Default __contains__ looks in the underlying mapping,
        #  which in this case only contains integer representations.
        try:
            self.get_loc(val)
            return True
        except (KeyError, TypeError, ValueError):
            return False
  688. # Generated from template.
  689. include "index_class_helper.pxi"
cdef class BoolEngine(UInt8Engine):
    """Engine for bool-dtype indexes; only actual bools are valid keys."""
    cdef _check_type(self, object val):
        if not util.is_bool_object(val):
            raise KeyError(val)
        return <uint8_t>val
cdef class MaskedBoolEngine(MaskedUInt8Engine):
    """Engine for nullable-bool indexes; accepts pd.NA in addition to bools."""
    cdef _check_type(self, object val):
        if val is C_NA:
            # NA is a valid key for masked dtypes; pass it through unchanged
            return val
        if not util.is_bool_object(val):
            raise KeyError(val)
        return <uint8_t>val
@cython.internal
@cython.freelist(32)
cdef class SharedEngine:
    """
    Base class for engines that look keys up pointwise in an ExtensionArray
    (``self.values``) rather than building a hashtable mapping.

    Monotonicity and uniqueness are computed lazily and cached; the
    ``need_*_check`` flags mark the cached values as stale.
    """
    cdef readonly:
        object values  # ExtensionArray
        bint over_size_threshold  # set by subclass __init__ (len >= cutoff)

    cdef:
        # Cached properties; only meaningful once the corresponding
        # need_*_check flag has been cleared.
        bint unique, monotonic_inc, monotonic_dec
        bint need_monotonic_check, need_unique_check

    def __contains__(self, val: object) -> bool:
        # We assume before we get here:
        #  - val is hashable
        # Membership is defined as "get_loc does not raise KeyError".
        try:
            self.get_loc(val)
            return True
        except KeyError:
            return False

    def clear_mapping(self):
        # for compat with IndexEngine: there is no hashtable to clear here
        pass

    @property
    def is_unique(self) -> bool:
        # Lazily compute uniqueness by comparing len(unique) to len(values).
        if self.need_unique_check:
            arr = self.values.unique()
            self.unique = len(arr) == len(self.values)
            self.need_unique_check = False
        return self.unique

    cdef _do_monotonic_check(self):
        # Subclasses set monotonic_inc / monotonic_dec (and may also
        # settle uniqueness as a by-product).
        raise NotImplementedError

    @property
    def is_monotonic_increasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()
        return self.monotonic_inc == 1

    @property
    def is_monotonic_decreasing(self) -> bool:
        if self.need_monotonic_check:
            self._do_monotonic_check()
        return self.monotonic_dec == 1

    cdef _call_monotonic(self, values):
        # Returns (is_monotonic_inc, is_monotonic_dec, is_unique).
        return algos.is_monotonic(values, timelike=False)

    def sizeof(self, deep: bool = False) -> int:
        """ return the sizeof our mapping """
        # No hashtable is kept, so the mapping itself occupies no memory.
        return 0

    def __sizeof__(self) -> int:
        return self.sizeof()

    cdef _check_type(self, object obj):
        # Subclasses validate/convert the key, raising KeyError if invalid.
        raise NotImplementedError

    cpdef get_loc(self, object val):
        """
        Locate ``val`` in ``self.values``.

        Returns Py_ssize_t for a unique match, slice for a contiguous run
        of duplicates, or ndarray[bool] for scattered duplicates.

        Raises TypeError for definitely-invalid keys, KeyError if absent.
        """
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t loc

        if is_definitely_invalid_key(val):
            raise TypeError(f"'{val}' is an invalid key")

        self._check_type(val)

        if self.over_size_threshold and self.is_monotonic_increasing:
            if not self.is_unique:
                return self._get_loc_duplicates(val)

            values = self.values

            # Large, unique, monotonic: binary search beats a linear scan.
            loc = self._searchsorted_left(val)
            if loc >= len(values):
                raise KeyError(val)
            if values[loc] != val:
                raise KeyError(val)
            return loc

        # NOTE(review): both branches below call the same method, so the
        # ``self.unique`` test is redundant dead weight — kept as-is.
        if not self.unique:
            return self._get_loc_duplicates(val)

        return self._get_loc_duplicates(val)

    cdef _get_loc_duplicates(self, object val):
        # -> Py_ssize_t | slice | ndarray[bool]
        cdef:
            Py_ssize_t diff

        if self.is_monotonic_increasing:
            # Duplicates of val form one contiguous run in a monotonic
            # index; find its edges with two binary searches.
            values = self.values
            try:
                left = values.searchsorted(val, side="left")
                right = values.searchsorted(val, side="right")
            except TypeError:
                # e.g. GH#29189 get_loc(None) with a Float64Index
                raise KeyError(val)

            diff = right - left
            if diff == 0:
                raise KeyError(val)
            elif diff == 1:
                return left
            else:
                return slice(left, right)

        # Non-monotonic: fall back to a full boolean mask.
        return self._maybe_get_bool_indexer(val)

    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
        """
        See ObjectEngine._searchsorted_left.__doc__.
        """
        try:
            loc = self.values.searchsorted(val, side="left")
        except TypeError as err:
            # GH#35788 e.g. val=None with float64 values
            raise KeyError(val)
        return loc

    cdef ndarray _get_bool_indexer(self, val):
        # Subclasses return a boolean mask of positions equal to val.
        raise NotImplementedError

    cdef _maybe_get_bool_indexer(self, object val):
        # Returns ndarray[bool] or int
        cdef:
            ndarray[uint8_t, ndim=1, cast=True] indexer

        indexer = self._get_bool_indexer(val)
        # Collapses a single-True mask to an integer; raises KeyError
        # if the mask is all-False.
        return _unpack_bool_indexer(indexer, val)

    def get_indexer(self, values) -> np.ndarray:
        # values : type(self.values)
        # Note: we only get here with self.is_unique
        cdef:
            Py_ssize_t i, N = len(values)

        res = np.empty(N, dtype=np.intp)

        for i in range(N):
            val = values[i]
            try:
                loc = self.get_loc(val)
                # Because we are unique, loc should always be an integer
            except KeyError:
                # Missing values are signalled with -1.
                loc = -1
            else:
                assert util.is_integer_object(loc), (loc, val)
            res[i] = loc

        return res

    def get_indexer_non_unique(self, targets):
        """
        Return an indexer suitable for taking from a non unique index
        return the labels in the same order as the target
        and a missing indexer into the targets (which correspond
        to the -1 indices in the results

        Parameters
        ----------
        targets : type(self.values)

        Returns
        -------
        indexer : np.ndarray[np.intp]
        missing : np.ndarray[np.intp]
        """
        cdef:
            Py_ssize_t i, N = len(targets)

        indexer = []
        missing = []

        # See also IntervalIndex.get_indexer_pointwise
        for i in range(N):
            val = targets[i]
            try:
                locs = self.get_loc(val)
            except KeyError:
                locs = np.array([-1], dtype=np.intp)
                missing.append(i)
            else:
                # Normalize each get_loc result to an intp array of
                # positions before concatenating.
                if isinstance(locs, slice):
                    # Only needed for get_indexer_non_unique
                    locs = np.arange(locs.start, locs.stop, locs.step, dtype=np.intp)
                elif util.is_integer_object(locs):
                    locs = np.array([locs], dtype=np.intp)
                else:
                    assert locs.dtype.kind == "b"
                    locs = locs.nonzero()[0]

            indexer.append(locs)

        try:
            indexer = np.concatenate(indexer, dtype=np.intp)
        except TypeError:
            # numpy<1.20 doesn't accept dtype keyword
            indexer = np.concatenate(indexer).astype(np.intp, copy=False)
        missing = np.array(missing, dtype=np.intp)

        return indexer, missing
  868. cdef class ExtensionEngine(SharedEngine):
  869. def __init__(self, values: "ExtensionArray"):
  870. self.values = values
  871. self.over_size_threshold = len(values) >= _SIZE_CUTOFF
  872. self.need_unique_check = True
  873. self.need_monotonic_check = True
  874. self.need_unique_check = True
  875. cdef _do_monotonic_check(self):
  876. cdef:
  877. bint is_unique
  878. values = self.values
  879. if values._hasna:
  880. self.monotonic_inc = 0
  881. self.monotonic_dec = 0
  882. nunique = len(values.unique())
  883. self.unique = nunique == len(values)
  884. self.need_unique_check = 0
  885. return
  886. try:
  887. ranks = values._rank()
  888. except TypeError:
  889. self.monotonic_inc = 0
  890. self.monotonic_dec = 0
  891. is_unique = 0
  892. else:
  893. self.monotonic_inc, self.monotonic_dec, is_unique = \
  894. self._call_monotonic(ranks)
  895. self.need_monotonic_check = 0
  896. # we can only be sure of uniqueness if is_unique=1
  897. if is_unique:
  898. self.unique = 1
  899. self.need_unique_check = 0
  900. cdef ndarray _get_bool_indexer(self, val):
  901. if checknull(val):
  902. return self.values.isna()
  903. try:
  904. return self.values == val
  905. except TypeError:
  906. # e.g. if __eq__ returns a BooleanArray instead of ndarray[bool]
  907. try:
  908. return (self.values == val).to_numpy(dtype=bool, na_value=False)
  909. except (TypeError, AttributeError) as err:
  910. # e.g. (self.values == val) returned a bool
  911. # see test_get_loc_generator[string[pyarrow]]
  912. # e.g. self.value == val raises TypeError bc generator has no len
  913. # see test_get_loc_generator[string[python]]
  914. raise KeyError from err
  915. cdef _check_type(self, object val):
  916. hash(val)
  917. cdef class MaskedIndexEngine(IndexEngine):
  918. def __init__(self, object values):
  919. super().__init__(self._get_data(values))
  920. self.mask = self._get_mask(values)
  921. def _get_data(self, object values) -> np.ndarray:
  922. if hasattr(values, "_mask"):
  923. return values._data
  924. # We are an ArrowExtensionArray
  925. # Set 1 as na_value to avoid ending up with NA and an object array
  926. # TODO: Remove when arrow engine is implemented
  927. return values.to_numpy(na_value=1, dtype=values.dtype.numpy_dtype)
  928. def _get_mask(self, object values) -> np.ndarray:
  929. if hasattr(values, "_mask"):
  930. return values._mask
  931. # We are an ArrowExtensionArray
  932. return values.isna()
  933. def get_indexer(self, object values) -> np.ndarray:
  934. self._ensure_mapping_populated()
  935. return self.mapping.lookup(self._get_data(values), self._get_mask(values))
  936. def get_indexer_non_unique(self, object targets):
  937. """
  938. Return an indexer suitable for taking from a non unique index
  939. return the labels in the same order as the target
  940. and a missing indexer into the targets (which correspond
  941. to the -1 indices in the results
  942. Returns
  943. -------
  944. indexer : np.ndarray[np.intp]
  945. missing : np.ndarray[np.intp]
  946. """
  947. # TODO: Unify with parent class
  948. cdef:
  949. ndarray values, mask, target_vals, target_mask
  950. ndarray[intp_t] result, missing
  951. set stargets
  952. list na_pos
  953. dict d = {}
  954. object val
  955. Py_ssize_t count = 0, count_missing = 0
  956. Py_ssize_t i, j, n, n_t, n_alloc, start, end, na_idx
  957. target_vals = self._get_data(targets)
  958. target_mask = self._get_mask(targets)
  959. values = self.values
  960. assert not values.dtype == object # go through object path instead
  961. mask = self.mask
  962. stargets = set(target_vals[~target_mask])
  963. n = len(values)
  964. n_t = len(target_vals)
  965. if n > 10_000:
  966. n_alloc = 10_000
  967. else:
  968. n_alloc = n
  969. result = np.empty(n_alloc, dtype=np.intp)
  970. missing = np.empty(n_t, dtype=np.intp)
  971. # map each starget to its position in the index
  972. if (
  973. stargets and
  974. len(stargets) < 5 and
  975. not np.any(target_mask) and
  976. self.is_monotonic_increasing
  977. ):
  978. # if there are few enough stargets and the index is monotonically
  979. # increasing, then use binary search for each starget
  980. for starget in stargets:
  981. start = values.searchsorted(starget, side="left")
  982. end = values.searchsorted(starget, side="right")
  983. if start != end:
  984. d[starget] = list(range(start, end))
  985. stargets = set()
  986. if stargets:
  987. # otherwise, map by iterating through all items in the index
  988. na_pos = []
  989. for i in range(n):
  990. val = values[i]
  991. if mask[i]:
  992. na_pos.append(i)
  993. else:
  994. if val in stargets:
  995. if val not in d:
  996. d[val] = []
  997. d[val].append(i)
  998. for i in range(n_t):
  999. val = target_vals[i]
  1000. if target_mask[i]:
  1001. if na_pos:
  1002. for na_idx in na_pos:
  1003. # realloc if needed
  1004. if count >= n_alloc:
  1005. n_alloc += 10_000
  1006. result = np.resize(result, n_alloc)
  1007. result[count] = na_idx
  1008. count += 1
  1009. continue
  1010. elif val in d:
  1011. # found
  1012. key = val
  1013. for j in d[key]:
  1014. # realloc if needed
  1015. if count >= n_alloc:
  1016. n_alloc += 10_000
  1017. result = np.resize(result, n_alloc)
  1018. result[count] = j
  1019. count += 1
  1020. continue
  1021. # value not found
  1022. if count >= n_alloc:
  1023. n_alloc += 10_000
  1024. result = np.resize(result, n_alloc)
  1025. result[count] = -1
  1026. count += 1
  1027. missing[count_missing] = i
  1028. count_missing += 1
  1029. return result[0:count], missing[0:count_missing]