hashtable_class_helper.pxi.in

  1. """
  2. Template for each `dtype` helper function for hashtable
  3. WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
  4. """
  5. {{py:
  6. # name
  7. complex_types = ['complex64',
  8. 'complex128']
  9. }}
  10. {{for name in complex_types}}
  11. cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
  12. cdef kh{{name}}_t res
  13. res.real = val.real
  14. res.imag = val.imag
  15. return res
  16. {{endfor}}
{{py:

# name
c_types = ['khcomplex128_t',
           'khcomplex64_t',
           'float64_t',
           'float32_t',
           'int64_t',
           'int32_t',
           'int16_t',
           'int8_t',
           'uint64_t',
           'uint32_t',
           'uint16_t',
           'uint8_t']
}}

{{for c_type in c_types}}

cdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
    {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }}
    return val.real != val.real or val.imag != val.imag
    {{elif c_type in {'float64_t', 'float32_t'} }}
    return val != val
    {{else}}
    return False
    {{endif}}

{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }}
# are_equivalent_{{c_type}} is cimported via khash.pxd
{{else}}
cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil:
    return val1 == val2
{{endif}}

{{endfor}}
{{py:

# name
cimported_types = ['complex64',
                   'complex128',
                   'float32',
                   'float64',
                   'int8',
                   'int16',
                   'int32',
                   'int64',
                   'pymap',
                   'str',
                   'strbox',
                   'uint8',
                   'uint16',
                   'uint32',
                   'uint64']
}}

{{for name in cimported_types}}
from pandas._libs.khash cimport (
    kh_destroy_{{name}},
    kh_exist_{{name}},
    kh_get_{{name}},
    kh_init_{{name}},
    kh_put_{{name}},
    kh_resize_{{name}},
)
{{endfor}}
# ----------------------------------------------------------------------
# VectorData
# ----------------------------------------------------------------------

from pandas._libs.tslibs.util cimport get_c_string
from pandas._libs.missing cimport C_NA

{{py:

# name, dtype, c_type
# the generated StringVector is not actually used
# but is included for completeness (rather ObjectVector is used
# for uniques in hashtables)

dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
          ('Complex64', 'complex64', 'khcomplex64_t'),
          ('Float64', 'float64', 'float64_t'),
          ('Float32', 'float32', 'float32_t'),
          ('Int64', 'int64', 'int64_t'),
          ('Int32', 'int32', 'int32_t'),
          ('Int16', 'int16', 'int16_t'),
          ('Int8', 'int8', 'int8_t'),
          ('String', 'string', 'char *'),
          ('UInt64', 'uint64', 'uint64_t'),
          ('UInt32', 'uint32', 'uint32_t'),
          ('UInt16', 'uint16', 'uint16_t'),
          ('UInt8', 'uint8', 'uint8_t')]
}}
{{for name, dtype, c_type in dtypes}}

{{if dtype != 'int64'}}

# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
# by IntervalTree

ctypedef struct {{name}}VectorData:
    {{c_type}} *data
    Py_ssize_t n, m

{{endif}}

@cython.wraparound(False)
@cython.boundscheck(False)
cdef void append_data_{{dtype}}({{name}}VectorData *data,
                                {{c_type}} x) nogil:

    data.data[data.n] = x
    data.n += 1

{{endfor}}
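
# The append_data_* helpers above write without bounds checking; callers are
# expected to check needs_resize() (defined below) and grow the backing buffer
# before appending.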
ctypedef fused vector_data:
    Int64VectorData
    Int32VectorData
    Int16VectorData
    Int8VectorData
    UInt64VectorData
    UInt32VectorData
    UInt16VectorData
    UInt8VectorData
    Float64VectorData
    Float32VectorData
    Complex128VectorData
    Complex64VectorData
    StringVectorData

cdef bint needs_resize(vector_data *data) nogil:
    return data.n == data.m
# ----------------------------------------------------------------------
# Vector
# ----------------------------------------------------------------------

cdef class Vector:
    # cdef readonly:
    #    bint external_view_exists

    def __cinit__(self):
        self.external_view_exists = False

{{py:

# name, dtype, c_type
dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
          ('Complex64', 'complex64', 'khcomplex64_t'),
          ('Float64', 'float64', 'float64_t'),
          ('UInt64', 'uint64', 'uint64_t'),
          ('Int64', 'int64', 'int64_t'),
          ('Float32', 'float32', 'float32_t'),
          ('UInt32', 'uint32', 'uint32_t'),
          ('Int32', 'int32', 'int32_t'),
          ('UInt16', 'uint16', 'uint16_t'),
          ('Int16', 'int16', 'int16_t'),
          ('UInt8', 'uint8', 'uint8_t'),
          ('Int8', 'int8', 'int8_t')]
}}
{{for name, dtype, c_type in dtypes}}

cdef class {{name}}Vector(Vector):

    # For int64 we have to put this declaration in the .pxd file;
    # Int64Vector is the only one we need exposed for other cython files.
    {{if dtype != 'int64'}}
    cdef:
        {{name}}VectorData *data
        ndarray ao
    {{endif}}

    def __cinit__(self):
        self.data = <{{name}}VectorData *>PyMem_Malloc(
            sizeof({{name}}VectorData))
        if not self.data:
            raise MemoryError()
        self.data.n = 0
        self.data.m = _INIT_VEC_CAP
        self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
        self.data.data = <{{c_type}}*>self.ao.data

    cdef resize(self):
        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
        self.ao.resize(self.data.m, refcheck=False)
        self.data.data = <{{c_type}}*>self.ao.data

    def __dealloc__(self):
        if self.data is not NULL:
            PyMem_Free(self.data)
            self.data = NULL

    def __len__(self) -> int:
        return self.data.n

    cpdef ndarray to_array(self):
        if self.data.m != self.data.n:
            if self.external_view_exists:
                # should never happen
                raise ValueError("should have raised on append()")
            self.ao.resize(self.data.n, refcheck=False)
            self.data.m = self.data.n
        self.external_view_exists = True
        return self.ao

    cdef void append(self, {{c_type}} x):

        if needs_resize(self.data):
            if self.external_view_exists:
                raise ValueError("external reference but "
                                 "Vector.resize() needed")
            self.resize()

        append_data_{{dtype}}(self.data, x)

    cdef extend(self, const {{c_type}}[:] x):
        for i in range(len(x)):
            self.append(x[i])

{{endfor}}
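
# Illustrative only (not generated code): from another Cython module a typed
# vector is typically used as
#
#     cdef Int64Vector vec = Int64Vector()
#     vec.append(42)
#     arr = vec.to_array()  # hands the buffer over as an ndarray; once an
#                           # external view exists, appends that would need a
#                           # resize raise ValueError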
cdef class StringVector(Vector):

    cdef:
        StringVectorData *data

    def __cinit__(self):
        self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
        if not self.data:
            raise MemoryError()
        self.data.n = 0
        self.data.m = _INIT_VEC_CAP
        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
        if not self.data.data:
            raise MemoryError()

    cdef resize(self):
        cdef:
            char **orig_data
            Py_ssize_t i, m

        m = self.data.m
        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)

        orig_data = self.data.data
        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
        if not self.data.data:
            raise MemoryError()
        for i in range(m):
            self.data.data[i] = orig_data[i]

    def __dealloc__(self):
        if self.data is not NULL:
            if self.data.data is not NULL:
                free(self.data.data)
            PyMem_Free(self.data)
            self.data = NULL

    def __len__(self) -> int:
        return self.data.n

    cpdef ndarray[object, ndim=1] to_array(self):
        cdef:
            ndarray ao
            Py_ssize_t n
            object val

        ao = np.empty(self.data.n, dtype=object)
        for i in range(self.data.n):
            val = self.data.data[i]
            ao[i] = val
        self.external_view_exists = True
        self.data.m = self.data.n
        return ao

    cdef void append(self, char *x):

        if needs_resize(self.data):
            self.resize()

        append_data_string(self.data, x)

    cdef extend(self, ndarray[object] x):
        for i in range(len(x)):
            self.append(x[i])
cdef class ObjectVector(Vector):

    cdef:
        PyObject **data
        Py_ssize_t n, m
        ndarray ao

    def __cinit__(self):
        self.n = 0
        self.m = _INIT_VEC_CAP
        self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
        self.data = <PyObject**>self.ao.data

    def __len__(self) -> int:
        return self.n

    cdef append(self, object obj):
        if self.n == self.m:
            if self.external_view_exists:
                raise ValueError("external reference but "
                                 "Vector.resize() needed")
            self.m = max(self.m * 2, _INIT_VEC_CAP)
            self.ao.resize(self.m, refcheck=False)
            self.data = <PyObject**>self.ao.data

        Py_INCREF(obj)
        self.data[self.n] = <PyObject*>obj
        self.n += 1

    cpdef ndarray[object, ndim=1] to_array(self):
        if self.m != self.n:
            if self.external_view_exists:
                raise ValueError("should have raised on append()")
            self.ao.resize(self.n, refcheck=False)
            self.m = self.n
        self.external_view_exists = True
        return self.ao

    cdef extend(self, ndarray[object] x):
        for i in range(len(x)):
            self.append(x[i])
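
# ObjectVector writes raw PyObject* pointers directly into the buffer of an
# object-dtype ndarray; the Py_INCREF in append() accounts for the reference
# that the ndarray slot then owns.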
# ----------------------------------------------------------------------
# HashTable
# ----------------------------------------------------------------------

cdef class HashTable:

    pass

{{py:

# name, dtype, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
          ('Float64', 'float64', 'float64_t', ''),
          ('UInt64', 'uint64', 'uint64_t', ''),
          ('Int64', 'int64', 'int64_t', ''),
          ('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
          ('Float32', 'float32', 'float32_t', ''),
          ('UInt32', 'uint32', 'uint32_t', ''),
          ('Int32', 'int32', 'int32_t', ''),
          ('UInt16', 'uint16', 'uint16_t', ''),
          ('Int16', 'int16', 'int16_t', ''),
          ('UInt8', 'uint8', 'uint8_t', ''),
          ('Int8', 'int8', 'int8_t', '')]
}}
{{for name, dtype, c_type, to_c_type in dtypes}}

cdef class {{name}}HashTable(HashTable):

    def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
        self.table = kh_init_{{dtype}}()
        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
        kh_resize_{{dtype}}(self.table, size_hint)

        self.uses_mask = uses_mask
        self.na_position = -1

    def __len__(self) -> int:
        return self.table.size + (0 if self.na_position == -1 else 1)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_{{dtype}}(self.table)
            self.table = NULL

    def __contains__(self, object key) -> bool:
        # The caller is responsible for checking for compatible NA values in
        # case of masked arrays.
        cdef:
            khiter_t k
            {{c_type}} ckey

        if self.uses_mask and checknull(key):
            return -1 != self.na_position

        ckey = {{to_c_type}}(key)
        k = kh_get_{{dtype}}(self.table, ckey)
        return k != self.table.n_buckets

    def sizeof(self, deep: bool = False) -> int:
        """ return the size of my table in bytes """
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) +  # keys
                                            sizeof(Py_ssize_t))  # vals
        return overhead + for_flags + for_pairs

    def get_state(self) -> dict[str, int]:
        """ Returns info about the state of the hashtable. """
        return {
            'n_buckets' : self.table.n_buckets,
            'size' : self.table.size,
            'n_occupied' : self.table.n_occupied,
            'upper_bound' : self.table.upper_bound,
        }
    cpdef get_item(self, {{dtype}}_t val):
        """Extracts the position of val from the hashtable.

        Parameters
        ----------
        val : Scalar
            The value that is looked up in the hashtable

        Returns
        -------
        The position of the requested integer.
        """

        # Used in core.sorting, IndexEngine.get_loc
        # Caller is responsible for checking for pd.NA
        cdef:
            khiter_t k
            {{c_type}} cval

        cval = {{to_c_type}}(val)
        k = kh_get_{{dtype}}(self.table, cval)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef get_na(self):
        """Extracts the position of na_value from the hashtable.

        Returns
        -------
        The position of the last na value.
        """

        if not self.uses_mask:
            raise NotImplementedError

        if self.na_position == -1:
            raise KeyError("NA")

        return self.na_position
    cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
        # Used in libjoin
        # Caller is responsible for checking for pd.NA
        cdef:
            khiter_t k
            int ret = 0
            {{c_type}} ckey

        ckey = {{to_c_type}}(key)
        k = kh_put_{{dtype}}(self.table, ckey, &ret)
        if kh_exist_{{dtype}}(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    cpdef set_na(self, Py_ssize_t val):
        # Caller is responsible for checking for pd.NA
        cdef:
            khiter_t k
            int ret = 0
            {{c_type}} ckey

        if not self.uses_mask:
            raise NotImplementedError

        self.na_position = val
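
    # When uses_mask=True, NA is never stored in the khash table itself; its
    # location is tracked separately in self.na_position (set by set_na /
    # map_locations and returned by get_na / lookup).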
    {{if dtype == "int64" }}
    # We only use this for int64, can reduce build size and make .pyi
    #  more accurate by only implementing it for int64
    @cython.boundscheck(False)
    def map_keys_to_values(
        self, const {{dtype}}_t[:] keys, const int64_t[:] values
    ) -> None:
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{c_type}} key
            khiter_t k

        with nogil:
            for i in range(n):
                key = {{to_c_type}}(keys[i])
                k = kh_put_{{dtype}}(self.table, key, &ret)
                self.table.vals[k] = <Py_ssize_t>values[i]
    {{endif}}
    @cython.boundscheck(False)
    def map_locations(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> None:
        # Used in libindex, safe_sort
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{c_type}} val
            khiter_t k
            int8_t na_position = self.na_position

        if self.uses_mask and mask is None:
            raise NotImplementedError  # pragma: no cover

        with nogil:
            if self.uses_mask:
                for i in range(n):
                    if mask[i]:
                        na_position = i
                    else:
                        val = {{to_c_type}}(values[i])
                        k = kh_put_{{dtype}}(self.table, val, &ret)
                        self.table.vals[k] = i
            else:
                for i in range(n):
                    val = {{to_c_type}}(values[i])
                    k = kh_put_{{dtype}}(self.table, val, &ret)
                    self.table.vals[k] = i
        self.na_position = na_position
    @cython.boundscheck(False)
    def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray:
        # -> np.ndarray[np.intp]
        # Used in safe_sort, IndexEngine.get_indexer
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{c_type}} val
            khiter_t k
            intp_t[::1] locs = np.empty(n, dtype=np.intp)
            int8_t na_position = self.na_position

        if self.uses_mask and mask is None:
            raise NotImplementedError  # pragma: no cover

        with nogil:
            for i in range(n):
                if self.uses_mask and mask[i]:
                    locs[i] = na_position
                else:
                    val = {{to_c_type}}(values[i])
                    k = kh_get_{{dtype}}(self.table, val)
                    if k != self.table.n_buckets:
                        locs[i] = self.table.vals[k]
                    else:
                        locs[i] = -1

        return np.asarray(locs)
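
    # Illustrative only (not generated code): the typical map/lookup pattern is
    #
    #     table = {{name}}HashTable()
    #     table.map_locations(left_values)      # key -> position (last wins)
    #     indexer = table.lookup(right_values)  # np.intp positions, -1 if missing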
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                object mask=None, bint return_inverse=False, bint use_result_mask=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        uniques : {{name}}Vector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        ignore_na : bool, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        mask : ndarray[bool], optional
            If not None, the mask is used as indicator for missing values
            (True = missing, False = valid) instead of `na_value` or
            condition "val != val".
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        use_result_mask : bool, default False
            Whether to create a result mask for the unique values. Not supported
            with return_inverse=True.

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse=True)
            The labels from values to uniques
        result_mask : ndarray[bool], if use_result_mask is true
            The mask for the result values.
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            intp_t[::1] labels
            int ret = 0
            {{c_type}} val, na_value2
            khiter_t k
            {{name}}VectorData *ud
            UInt8Vector result_mask
            UInt8VectorData *rmd
            bint use_na_value, use_mask, seen_na = False
            uint8_t[:] mask_values

        if return_inverse:
            labels = np.empty(n, dtype=np.intp)
        ud = uniques.data
        use_na_value = na_value is not None
        use_mask = mask is not None
        if not use_mask and use_result_mask:
            raise NotImplementedError  # pragma: no cover

        if use_result_mask and return_inverse:
            raise NotImplementedError  # pragma: no cover

        result_mask = UInt8Vector()
        rmd = result_mask.data

        if use_mask:
            mask_values = mask.view("uint8")

        if use_na_value:
            # We need this na_value2 because we want to allow users
            # to *optionally* specify an NA sentinel *of the correct* type.
            # We use None, to make it optional, which requires `object` type
            # for the parameter. To please the compiler, we use na_value2,
            # which is only used if it's *specified*.
            na_value2 = {{to_c_type}}(na_value)
        else:
            na_value2 = {{to_c_type}}(0)

        with nogil:
            for i in range(n):
                val = {{to_c_type}}(values[i])

                if ignore_na and use_mask:
                    if mask_values[i]:
                        labels[i] = na_sentinel
                        continue
                elif ignore_na and (
                    is_nan_{{c_type}}(val) or
                    (use_na_value and are_equivalent_{{c_type}}(val, na_value2))
                ):
                    # if missing values do not count as unique values (i.e. if
                    #  ignore_na is True), skip the hashtable entry for them,
                    #  and replace the corresponding label with na_sentinel
                    labels[i] = na_sentinel
                    continue
                elif not ignore_na and use_result_mask:
                    if mask_values[i]:
                        if seen_na:
                            continue

                        seen_na = True
                        if needs_resize(ud):
                            with gil:
                                if uniques.external_view_exists:
                                    raise ValueError("external reference to "
                                                     "uniques held, but "
                                                     "Vector.resize() needed")
                                uniques.resize()
                                if result_mask.external_view_exists:
                                    raise ValueError("external reference to "
                                                     "result_mask held, but "
                                                     "Vector.resize() needed")
                                result_mask.resize()
                        append_data_{{dtype}}(ud, val)
                        append_data_uint8(rmd, 1)
                        continue

                k = kh_get_{{dtype}}(self.table, val)

                if k == self.table.n_buckets:
                    # k hasn't been seen yet
                    k = kh_put_{{dtype}}(self.table, val, &ret)

                    if needs_resize(ud):
                        with gil:
                            if uniques.external_view_exists:
                                raise ValueError("external reference to "
                                                 "uniques held, but "
                                                 "Vector.resize() needed")
                            uniques.resize()

                            if use_result_mask:
                                if result_mask.external_view_exists:
                                    raise ValueError("external reference to "
                                                     "result_mask held, but "
                                                     "Vector.resize() needed")
                                result_mask.resize()
                    append_data_{{dtype}}(ud, val)
                    if use_result_mask:
                        append_data_uint8(rmd, 0)

                    if return_inverse:
                        self.table.vals[k] = count
                        labels[i] = count
                    count += 1
                elif return_inverse:
                    # k falls into a previous bucket
                    # only relevant in case we need to construct the inverse
                    idx = self.table.vals[k]
                    labels[i] = idx

        if return_inverse:
            return uniques.to_array(), labels.base  # .base -> underlying ndarray
        if use_result_mask:
            return uniques.to_array(), result_mask.to_array()
        return uniques.to_array()
    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        mask : ndarray[bool], optional
            If not None, the mask is used as indicator for missing values
            (True = missing, False = valid) instead of `na_value` or
            condition "val != val".

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse)
            The labels from values to uniques
        result_mask : ndarray[bool], if mask is given as input
            The mask for the result values.
        """
        uniques = {{name}}Vector()
        use_result_mask = True if mask is not None else False
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
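
    # Illustrative only (not generated code): with a mask, unique() also
    # returns a result mask marking the single NA entry (if any), e.g.
    #
    #     uniques, result_mask = table.unique(values, mask=mask)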
    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None, ignore_na=True):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        mask : ndarray[bool], optional
            If not None, the mask is used as indicator for missing values
            (True = missing, False = valid) instead of `na_value` or
            condition "val != val".

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[intp_t]
            The labels from values to uniques
        """
        uniques_vector = {{name}}Vector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=ignore_na, mask=mask,
                            return_inverse=True)
    def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None, object mask=None):
        # -> np.ndarray[np.intp]
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True, mask=mask)
        return labels
    {{if dtype == 'int64'}}
    @cython.boundscheck(False)
    def get_labels_groupby(
        self, const {{dtype}}_t[:] values
    ) -> tuple[ndarray, ndarray]:
        # tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]]
        cdef:
            Py_ssize_t i, n = len(values)
            intp_t[::1] labels
            Py_ssize_t idx, count = 0
            int ret = 0
            {{c_type}} val
            khiter_t k
            {{name}}Vector uniques = {{name}}Vector()
            {{name}}VectorData *ud

        labels = np.empty(n, dtype=np.intp)
        ud = uniques.data

        with nogil:
            for i in range(n):
                val = {{to_c_type}}(values[i])

                # specific for groupby
                if val < 0:
                    labels[i] = -1
                    continue

                k = kh_get_{{dtype}}(self.table, val)
                if k != self.table.n_buckets:
                    idx = self.table.vals[k]
                    labels[i] = idx
                else:
                    k = kh_put_{{dtype}}(self.table, val, &ret)
                    self.table.vals[k] = count

                    if needs_resize(ud):
                        with gil:
                            uniques.resize()
                    append_data_{{dtype}}(ud, val)
                    labels[i] = count
                    count += 1

        arr_uniques = uniques.to_array()

        return np.asarray(labels), arr_uniques
    {{endif}}
cdef class {{name}}Factorizer(Factorizer):
    cdef public:
        {{name}}HashTable table
        {{name}}Vector uniques

    def __cinit__(self, size_hint: int):
        self.table = {{name}}HashTable(size_hint)
        self.uniques = {{name}}Vector()

    def factorize(self, const {{c_type}}[:] values,
                  na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
        """
        Returns
        -------
        ndarray[intp_t]

        Examples
        --------
        Factorize values with nans replaced by na_sentinel

        >>> fac = {{name}}Factorizer(3)
        >>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20)
        array([0, 1, 2])
        """
        cdef:
            ndarray[intp_t] labels
        if self.uniques.external_view_exists:
            uniques = {{name}}Vector()
            uniques.extend(self.uniques.to_array())
            self.uniques = uniques
        labels = self.table.get_labels(values, self.uniques,
                                       self.count, na_sentinel,
                                       na_value=na_value, mask=mask)
        self.count = len(self.uniques)
        return labels

{{endfor}}
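
# StringHashTable keys the khash table on C strings obtained via get_c_string,
# i.e. pointers to the UTF-8 buffers cached on the Python str objects, so the
# originating strings must stay alive for as long as the table references them.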
cdef class StringHashTable(HashTable):
    # these by-definition *must* be strings
    # or a sentinel np.nan / None missing value
    na_string_sentinel = '__nan__'

    def __init__(self, int64_t size_hint=1):
        self.table = kh_init_str()
        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
        kh_resize_str(self.table, size_hint)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_str(self.table)
            self.table = NULL

    def sizeof(self, deep: bool = False) -> int:
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs = self.table.n_buckets * (sizeof(char *) +  # keys
                                            sizeof(Py_ssize_t))  # vals
        return overhead + for_flags + for_pairs

    def get_state(self) -> dict[str, int]:
        """ Returns info about the state of the hashtable. """
        return {
            'n_buckets' : self.table.n_buckets,
            'size' : self.table.size,
            'n_occupied' : self.table.n_occupied,
            'upper_bound' : self.table.upper_bound,
        }

    cpdef get_item(self, str val):
        cdef:
            khiter_t k
            const char *v
        v = get_c_string(val)

        k = kh_get_str(self.table, v)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef set_item(self, str key, Py_ssize_t val):
        cdef:
            khiter_t k
            int ret = 0
            const char *v

        v = get_c_string(key)

        k = kh_put_str(self.table, v, &ret)
        if kh_exist_str(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    @cython.boundscheck(False)
    def get_indexer(self, ndarray[object] values) -> ndarray:
        # -> np.ndarray[np.intp]
        cdef:
            Py_ssize_t i, n = len(values)
            ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
            intp_t *resbuf = <intp_t*>labels.data
            khiter_t k
            kh_str_t *table = self.table
            const char *v
            const char **vecs

        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]
            v = get_c_string(val)
            vecs[i] = v

        with nogil:
            for i in range(n):
                k = kh_get_str(table, vecs[i])
                if k != table.n_buckets:
                    resbuf[i] = table.vals[k]
                else:
                    resbuf[i] = -1

        free(vecs)
        return labels
    @cython.boundscheck(False)
    def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
        # -> np.ndarray[np.intp]
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            const char *v
            khiter_t k
            intp_t[::1] locs = np.empty(n, dtype=np.intp)

        # these by-definition *must* be strings
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]
            if isinstance(val, str):
                # GH#31499 if we have a np.str_ get_c_string won't recognize
                # it as a str, even though isinstance does.
                v = get_c_string(<str>val)
            else:
                v = get_c_string(self.na_string_sentinel)
            vecs[i] = v

        with nogil:
            for i in range(n):
                v = vecs[i]
                k = kh_get_str(self.table, v)
                if k != self.table.n_buckets:
                    locs[i] = self.table.vals[k]
                else:
                    locs[i] = -1

        free(vecs)
        return np.asarray(locs)

    @cython.boundscheck(False)
    def map_locations(self, ndarray[object] values, object mask = None) -> None:
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            const char *v
            const char **vecs
            khiter_t k

        # these by-definition *must* be strings
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]
            if isinstance(val, str):
                # GH#31499 if we have a np.str_ get_c_string won't recognize
                # it as a str, even though isinstance does.
                v = get_c_string(<str>val)
            else:
                v = get_c_string(self.na_string_sentinel)
            vecs[i] = v

        with nogil:
            for i in range(n):
                v = vecs[i]
                k = kh_put_str(self.table, v, &ret)
                self.table.vals[k] = i
        free(vecs)
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, ndarray[object] values, ObjectVector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        uniques : ObjectVector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then any value
            that is not a string is considered missing. If na_value is
            not None, then _additionally_ any value "val" satisfying
            val == na_value is considered missing.
        ignore_na : bool, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse=True)
            The labels from values to uniques
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            intp_t[::1] labels
            int64_t[::1] uindexer
            int ret = 0
            object val
            const char *v
            const char **vecs
            khiter_t k
            bint use_na_value

        if return_inverse:
            labels = np.zeros(n, dtype=np.intp)
        uindexer = np.empty(n, dtype=np.int64)
        use_na_value = na_value is not None

        # assign pointers and pre-filter out missing (if ignore_na)
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]

            if (ignore_na
                    and (not isinstance(val, str)
                         or (use_na_value and val == na_value))):
                # if missing values do not count as unique values (i.e. if
                #  ignore_na is True), we can skip the actual value, and
                #  replace the label with na_sentinel directly
                labels[i] = na_sentinel
            else:
                # if ignore_na is False, we also stringify NaN/None/etc.
                try:
                    v = get_c_string(<str>val)
                except UnicodeEncodeError:
                    v = get_c_string(<str>repr(val))
                vecs[i] = v

        # compute
        with nogil:
            for i in range(n):
                if ignore_na and labels[i] == na_sentinel:
                    # skip entries for ignored missing values (see above)
                    continue

                v = vecs[i]
                k = kh_get_str(self.table, v)
                if k == self.table.n_buckets:
                    # k hasn't been seen yet
                    k = kh_put_str(self.table, v, &ret)
                    uindexer[count] = i
                    if return_inverse:
                        self.table.vals[k] = count
                        labels[i] = count
                    count += 1
                elif return_inverse:
                    # k falls into a previous bucket
                    # only relevant in case we need to construct the inverse
                    idx = self.table.vals[k]
                    labels[i] = idx

        free(vecs)

        # uniques
        for i in range(count):
            uniques.append(values[uindexer[i]])

        if return_inverse:
            return uniques.to_array(), labels.base  # .base -> underlying ndarray
        return uniques.to_array()
    def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        mask : ndarray[bool], optional
            Not yet implemented for StringHashTable

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse)
            The labels from values to uniques
        """
        uniques = ObjectVector()
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse)

    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None, ignore_na=True):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then any value
            that is not a string is considered missing. If na_value is
            not None, then _additionally_ any value "val" satisfying
            val == na_value is considered missing.
        mask : ndarray[bool], optional
            Not yet implemented for StringHashTable.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp]
            The labels from values to uniques
        """
        uniques_vector = ObjectVector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=ignore_na,
                            return_inverse=True)

    def get_labels(self, ndarray[object] values, ObjectVector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
        # -> np.ndarray[np.intp]
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True)
        return labels
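
# PyObjectHashTable keys on arbitrary Python objects; the explicit hash(key)
# calls below surface TypeError for unhashable keys before the khash table is
# touched.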
cdef class PyObjectHashTable(HashTable):

    def __init__(self, int64_t size_hint=1):
        self.table = kh_init_pymap()
        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
        kh_resize_pymap(self.table, size_hint)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_pymap(self.table)
            self.table = NULL

    def __len__(self) -> int:
        return self.table.size

    def __contains__(self, object key) -> bool:
        cdef:
            khiter_t k
        hash(key)

        k = kh_get_pymap(self.table, <PyObject*>key)
        return k != self.table.n_buckets

    def sizeof(self, deep: bool = False) -> int:
        """ return the size of my table in bytes """
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs = self.table.n_buckets * (sizeof(PyObject *) +  # keys
                                            sizeof(Py_ssize_t))  # vals
        return overhead + for_flags + for_pairs

    def get_state(self) -> dict[str, int]:
        """
        Returns info about the current state of the hashtable, like its size,
        number of buckets and so on.
        """
        return {
            'n_buckets' : self.table.n_buckets,
            'size' : self.table.size,
            'n_occupied' : self.table.n_occupied,
            'upper_bound' : self.table.upper_bound,
        }
    cpdef get_item(self, object val):
        cdef:
            khiter_t k

        k = kh_get_pymap(self.table, <PyObject*>val)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef set_item(self, object key, Py_ssize_t val):
        cdef:
            khiter_t k
            int ret = 0
            char* buf

        hash(key)

        k = kh_put_pymap(self.table, <PyObject*>key, &ret)
        if kh_exist_pymap(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    def map_locations(self, ndarray[object] values, object mask = None) -> None:
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            khiter_t k

        for i in range(n):
            val = values[i]
            hash(val)

            k = kh_put_pymap(self.table, <PyObject*>val, &ret)
            self.table.vals[k] = i

    def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
        # -> np.ndarray[np.intp]
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            khiter_t k
            intp_t[::1] locs = np.empty(n, dtype=np.intp)

        for i in range(n):
            val = values[i]
            hash(val)

            k = kh_get_pymap(self.table, <PyObject*>val)
            if k != self.table.n_buckets:
                locs[i] = self.table.vals[k]
            else:
                locs[i] = -1

        return np.asarray(locs)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, ndarray[object] values, ObjectVector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        uniques : ObjectVector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then None _plus_
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        ignore_na : bool, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse=True)
            The labels from values to uniques
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            intp_t[::1] labels
            int ret = 0
            object val
            khiter_t k
            bint use_na_value

        if return_inverse:
            labels = np.empty(n, dtype=np.intp)
        use_na_value = na_value is not None

        for i in range(n):
            val = values[i]
            hash(val)

            if ignore_na and (
                checknull(val)
                or (use_na_value and val == na_value)
            ):
                # if missing values do not count as unique values (i.e. if
                #  ignore_na is True), skip the hashtable entry for them, and
                #  replace the corresponding label with na_sentinel
                labels[i] = na_sentinel
                continue

            k = kh_get_pymap(self.table, <PyObject*>val)
            if k == self.table.n_buckets:
                # k hasn't been seen yet
                k = kh_put_pymap(self.table, <PyObject*>val, &ret)
                uniques.append(val)
                if return_inverse:
                    self.table.vals[k] = count
                    labels[i] = count
                count += 1
            elif return_inverse:
                # k falls into a previous bucket
                # only relevant in case we need to construct the inverse
                idx = self.table.vals[k]
                labels[i] = idx

        if return_inverse:
            return uniques.to_array(), labels.base  # .base -> underlying ndarray
        return uniques.to_array()

    def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        mask : ndarray[bool], optional
            Not yet implemented for PyObjectHashTable

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse)
            The labels from values to uniques
        """
        uniques = ObjectVector()
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse)

    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None, ignore_na=True):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then None _plus_
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        mask : ndarray[bool], optional
            Not yet implemented for PyObjectHashTable.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t]
            The labels from values to uniques
        """
        uniques_vector = ObjectVector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=ignore_na,
                            return_inverse=True)

    def get_labels(self, ndarray[object] values, ObjectVector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
        # -> np.ndarray[np.intp]
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True)
        return labels