123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506 |
- """
- Template for each `dtype` helper function for hashtable
- WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
- """
- {{py:
- # name
- complex_types = ['complex64',
- 'complex128']
- }}
- {{for name in complex_types}}
- cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
- cdef kh{{name}}_t res
- res.real = val.real
- res.imag = val.imag
- return res
- {{endfor}}
- {{py:
- # name
- c_types = ['khcomplex128_t',
- 'khcomplex64_t',
- 'float64_t',
- 'float32_t',
- 'int64_t',
- 'int32_t',
- 'int16_t',
- 'int8_t',
- 'uint64_t',
- 'uint32_t',
- 'uint16_t',
- 'uint8_t']
- }}
- {{for c_type in c_types}}
- cdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
- {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }}
- return val.real != val.real or val.imag != val.imag
- {{elif c_type in {'float64_t', 'float32_t'} }}
- return val != val
- {{else}}
- return False
- {{endif}}
- {{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }}
- # are_equivalent_{{c_type}} is cimported via khash.pxd
- {{else}}
- cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil:
- return val1 == val2
- {{endif}}
- {{endfor}}
- {{py:
- # name
- cimported_types = ['complex64',
- 'complex128',
- 'float32',
- 'float64',
- 'int8',
- 'int16',
- 'int32',
- 'int64',
- 'pymap',
- 'str',
- 'strbox',
- 'uint8',
- 'uint16',
- 'uint32',
- 'uint64']
- }}
- {{for name in cimported_types}}
- from pandas._libs.khash cimport (
- kh_destroy_{{name}},
- kh_exist_{{name}},
- kh_get_{{name}},
- kh_init_{{name}},
- kh_put_{{name}},
- kh_resize_{{name}},
- )
- {{endfor}}
- # ----------------------------------------------------------------------
- # VectorData
- # ----------------------------------------------------------------------
- from pandas._libs.tslibs.util cimport get_c_string
- from pandas._libs.missing cimport C_NA
- {{py:
- # name, dtype, c_type
- # the generated StringVector is not actually used
- # but is included for completeness (rather ObjectVector is used
- # for uniques in hashtables)
- dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
- ('Complex64', 'complex64', 'khcomplex64_t'),
- ('Float64', 'float64', 'float64_t'),
- ('Float32', 'float32', 'float32_t'),
- ('Int64', 'int64', 'int64_t'),
- ('Int32', 'int32', 'int32_t'),
- ('Int16', 'int16', 'int16_t'),
- ('Int8', 'int8', 'int8_t'),
- ('String', 'string', 'char *'),
- ('UInt64', 'uint64', 'uint64_t'),
- ('UInt32', 'uint32', 'uint32_t'),
- ('UInt16', 'uint16', 'uint16_t'),
- ('UInt8', 'uint8', 'uint8_t')]
- }}
- {{for name, dtype, c_type in dtypes}}
- {{if dtype != 'int64'}}
- # Int64VectorData is defined in the .pxd file because it is needed (indirectly)
- # by IntervalTree
- ctypedef struct {{name}}VectorData:
- {{c_type}} *data
- Py_ssize_t n, m
- {{endif}}
- @cython.wraparound(False)
- @cython.boundscheck(False)
- cdef void append_data_{{dtype}}({{name}}VectorData *data,
- {{c_type}} x) nogil:
- data.data[data.n] = x
- data.n += 1
- {{endfor}}
- ctypedef fused vector_data:
- Int64VectorData
- Int32VectorData
- Int16VectorData
- Int8VectorData
- UInt64VectorData
- UInt32VectorData
- UInt16VectorData
- UInt8VectorData
- Float64VectorData
- Float32VectorData
- Complex128VectorData
- Complex64VectorData
- StringVectorData
- cdef bint needs_resize(vector_data *data) nogil:
- return data.n == data.m
- # ----------------------------------------------------------------------
- # Vector
- # ----------------------------------------------------------------------
- cdef class Vector:
- # cdef readonly:
- # bint external_view_exists
- def __cinit__(self):
- self.external_view_exists = False
- {{py:
- # name, dtype, c_type
- dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
- ('Complex64', 'complex64', 'khcomplex64_t'),
- ('Float64', 'float64', 'float64_t'),
- ('UInt64', 'uint64', 'uint64_t'),
- ('Int64', 'int64', 'int64_t'),
- ('Float32', 'float32', 'float32_t'),
- ('UInt32', 'uint32', 'uint32_t'),
- ('Int32', 'int32', 'int32_t'),
- ('UInt16', 'uint16', 'uint16_t'),
- ('Int16', 'int16', 'int16_t'),
- ('UInt8', 'uint8', 'uint8_t'),
- ('Int8', 'int8', 'int8_t')]
- }}
- {{for name, dtype, c_type in dtypes}}
- cdef class {{name}}Vector(Vector):
- # For int64 we have to put this declaration in the .pxd file;
- # Int64Vector is the only one we need exposed for other cython files.
- {{if dtype != 'int64'}}
- cdef:
- {{name}}VectorData *data
- ndarray ao
- {{endif}}
- def __cinit__(self):
- self.data = <{{name}}VectorData *>PyMem_Malloc(
- sizeof({{name}}VectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
- self.data.data = <{{c_type}}*>self.ao.data
- cdef resize(self):
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- self.ao.resize(self.data.m, refcheck=False)
- self.data.data = <{{c_type}}*>self.ao.data
- def __dealloc__(self):
- if self.data is not NULL:
- PyMem_Free(self.data)
- self.data = NULL
- def __len__(self) -> int:
- return self.data.n
- cpdef ndarray to_array(self):
- if self.data.m != self.data.n:
- if self.external_view_exists:
- # should never happen
- raise ValueError("should have raised on append()")
- self.ao.resize(self.data.n, refcheck=False)
- self.data.m = self.data.n
- self.external_view_exists = True
- return self.ao
- cdef void append(self, {{c_type}} x):
- if needs_resize(self.data):
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.resize()
- append_data_{{dtype}}(self.data, x)
- cdef extend(self, const {{c_type}}[:] x):
- for i in range(len(x)):
- self.append(x[i])
- {{endfor}}
- cdef class StringVector(Vector):
- cdef:
- StringVectorData *data
- def __cinit__(self):
- self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
- if not self.data:
- raise MemoryError()
- self.data.n = 0
- self.data.m = _INIT_VEC_CAP
- self.data.data = <char **>malloc(self.data.m * sizeof(char *))
- if not self.data.data:
- raise MemoryError()
- cdef resize(self):
- cdef:
- char **orig_data
- Py_ssize_t i, m
- m = self.data.m
- self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
- orig_data = self.data.data
- self.data.data = <char **>malloc(self.data.m * sizeof(char *))
- if not self.data.data:
- raise MemoryError()
- for i in range(m):
- self.data.data[i] = orig_data[i]
- def __dealloc__(self):
- if self.data is not NULL:
- if self.data.data is not NULL:
- free(self.data.data)
- PyMem_Free(self.data)
- self.data = NULL
- def __len__(self) -> int:
- return self.data.n
- cpdef ndarray[object, ndim=1] to_array(self):
- cdef:
- ndarray ao
- Py_ssize_t n
- object val
- ao = np.empty(self.data.n, dtype=object)
- for i in range(self.data.n):
- val = self.data.data[i]
- ao[i] = val
- self.external_view_exists = True
- self.data.m = self.data.n
- return ao
- cdef void append(self, char *x):
- if needs_resize(self.data):
- self.resize()
- append_data_string(self.data, x)
- cdef extend(self, ndarray[object] x):
- for i in range(len(x)):
- self.append(x[i])
- cdef class ObjectVector(Vector):
- cdef:
- PyObject **data
- Py_ssize_t n, m
- ndarray ao
- def __cinit__(self):
- self.n = 0
- self.m = _INIT_VEC_CAP
- self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
- self.data = <PyObject**>self.ao.data
- def __len__(self) -> int:
- return self.n
- cdef append(self, object obj):
- if self.n == self.m:
- if self.external_view_exists:
- raise ValueError("external reference but "
- "Vector.resize() needed")
- self.m = max(self.m * 2, _INIT_VEC_CAP)
- self.ao.resize(self.m, refcheck=False)
- self.data = <PyObject**>self.ao.data
- Py_INCREF(obj)
- self.data[self.n] = <PyObject*>obj
- self.n += 1
- cpdef ndarray[object, ndim=1] to_array(self):
- if self.m != self.n:
- if self.external_view_exists:
- raise ValueError("should have raised on append()")
- self.ao.resize(self.n, refcheck=False)
- self.m = self.n
- self.external_view_exists = True
- return self.ao
- cdef extend(self, ndarray[object] x):
- for i in range(len(x)):
- self.append(x[i])
- # ----------------------------------------------------------------------
- # HashTable
- # ----------------------------------------------------------------------
- cdef class HashTable:
- pass
- {{py:
- # name, dtype, c_type, to_c_type
- dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
- ('Float64', 'float64', 'float64_t', ''),
- ('UInt64', 'uint64', 'uint64_t', ''),
- ('Int64', 'int64', 'int64_t', ''),
- ('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
- ('Float32', 'float32', 'float32_t', ''),
- ('UInt32', 'uint32', 'uint32_t', ''),
- ('Int32', 'int32', 'int32_t', ''),
- ('UInt16', 'uint16', 'uint16_t', ''),
- ('Int16', 'int16', 'int16_t', ''),
- ('UInt8', 'uint8', 'uint8_t', ''),
- ('Int8', 'int8', 'int8_t', '')]
- }}
- {{for name, dtype, c_type, to_c_type in dtypes}}
- cdef class {{name}}HashTable(HashTable):
- def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
- self.table = kh_init_{{dtype}}()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_{{dtype}}(self.table, size_hint)
- self.uses_mask = uses_mask
- self.na_position = -1
- def __len__(self) -> int:
- return self.table.size + (0 if self.na_position == -1 else 1)
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_{{dtype}}(self.table)
- self.table = NULL
- def __contains__(self, object key) -> bool:
- # The caller is responsible to check for compatible NA values in case
- # of masked arrays.
- cdef:
- khiter_t k
- {{c_type}} ckey
- if self.uses_mask and checknull(key):
- return -1 != self.na_position
- ckey = {{to_c_type}}(key)
- k = kh_get_{{dtype}}(self.table, ckey)
- return k != self.table.n_buckets
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
- def get_state(self) -> dict[str, int]:
- """ returns infos about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
- cpdef get_item(self, {{dtype}}_t val):
- """Extracts the position of val from the hashtable.
- Parameters
- ----------
- val : Scalar
- The value that is looked up in the hashtable
- Returns
- -------
- The position of the requested integer.
- """
- # Used in core.sorting, IndexEngine.get_loc
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- {{c_type}} cval
- cval = {{to_c_type}}(val)
- k = kh_get_{{dtype}}(self.table, cval)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
- cpdef get_na(self):
- """Extracts the position of na_value from the hashtable.
- Returns
- -------
- The position of the last na value.
- """
- if not self.uses_mask:
- raise NotImplementedError
- if self.na_position == -1:
- raise KeyError("NA")
- return self.na_position
- cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
- # Used in libjoin
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- {{c_type}} ckey
- ckey = {{to_c_type}}(key)
- k = kh_put_{{dtype}}(self.table, ckey, &ret)
- if kh_exist_{{dtype}}(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
- cpdef set_na(self, Py_ssize_t val):
- # Caller is responsible for checking for pd.NA
- cdef:
- khiter_t k
- int ret = 0
- {{c_type}} ckey
- if not self.uses_mask:
- raise NotImplementedError
- self.na_position = val
- {{if dtype == "int64" }}
- # We only use this for int64, can reduce build size and make .pyi
- # more accurate by only implementing it for int64
- @cython.boundscheck(False)
- def map_keys_to_values(
- self, const {{dtype}}_t[:] keys, const int64_t[:] values
- ) -> None:
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- {{c_type}} key
- khiter_t k
- with nogil:
- for i in range(n):
- key = {{to_c_type}}(keys[i])
- k = kh_put_{{dtype}}(self.table, key, &ret)
- self.table.vals[k] = <Py_ssize_t>values[i]
- {{endif}}
- @cython.boundscheck(False)
- def map_locations(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> None:
- # Used in libindex, safe_sort
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- {{c_type}} val
- khiter_t k
- int8_t na_position = self.na_position
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
- with nogil:
- if self.uses_mask:
- for i in range(n):
- if mask[i]:
- na_position = i
- else:
- val= {{to_c_type}}(values[i])
- k = kh_put_{{dtype}}(self.table, val, &ret)
- self.table.vals[k] = i
- else:
- for i in range(n):
- val= {{to_c_type}}(values[i])
- k = kh_put_{{dtype}}(self.table, val, &ret)
- self.table.vals[k] = i
- self.na_position = na_position
- @cython.boundscheck(False)
- def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # Used in safe_sort, IndexEngine.get_indexer
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- {{c_type}} val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- int8_t na_position = self.na_position
- if self.uses_mask and mask is None:
- raise NotImplementedError # pragma: no cover
- with nogil:
- for i in range(n):
- if self.uses_mask and mask[i]:
- locs[i] = na_position
- else:
- val = {{to_c_type}}(values[i])
- k = kh_get_{{dtype}}(self.table, val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
- return np.asarray(locs)
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- object mask=None, bint return_inverse=False, bint use_result_mask=False):
- """
- Calculate unique values and labels (no sorting!)
- Parameters
- ----------
- values : ndarray[{{dtype}}]
- Array of values of which unique will be calculated
- uniques : {{name}}Vector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- use_result_mask: bool, default False
- Whether to create a result mask for the unique values. Not supported
- with return_inverse=True.
- Returns
- -------
- uniques : ndarray[{{dtype}}]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- result_mask: ndarray[bool], if use_result_mask is true
- The mask for the result values.
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- {{c_type}} val, na_value2
- khiter_t k
- {{name}}VectorData *ud
- UInt8Vector result_mask
- UInt8VectorData *rmd
- bint use_na_value, use_mask, seen_na = False
- uint8_t[:] mask_values
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- use_na_value = na_value is not None
- use_mask = mask is not None
- if not use_mask and use_result_mask:
- raise NotImplementedError # pragma: no cover
- if use_result_mask and return_inverse:
- raise NotImplementedError # pragma: no cover
- result_mask = UInt8Vector()
- rmd = result_mask.data
- if use_mask:
- mask_values = mask.view("uint8")
- if use_na_value:
- # We need this na_value2 because we want to allow users
- # to *optionally* specify an NA sentinel *of the correct* type.
- # We use None, to make it optional, which requires `object` type
- # for the parameter. To please the compiler, we use na_value2,
- # which is only used if it's *specified*.
- na_value2 = {{to_c_type}}(na_value)
- else:
- na_value2 = {{to_c_type}}(0)
- with nogil:
- for i in range(n):
- val = {{to_c_type}}(values[i])
- if ignore_na and use_mask:
- if mask_values[i]:
- labels[i] = na_sentinel
- continue
- elif ignore_na and (
- is_nan_{{c_type}}(val) or
- (use_na_value and are_equivalent_{{c_type}}(val, na_value2))
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them,
- # and replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- elif not ignore_na and use_result_mask:
- if mask_values[i]:
- if seen_na:
- continue
- seen_na = True
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_{{dtype}}(ud, val)
- append_data_uint8(rmd, 1)
- continue
- k = kh_get_{{dtype}}(self.table, val)
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_{{dtype}}(self.table, val, &ret)
- if needs_resize(ud):
- with gil:
- if uniques.external_view_exists:
- raise ValueError("external reference to "
- "uniques held, but "
- "Vector.resize() needed")
- uniques.resize()
- if use_result_mask:
- if result_mask.external_view_exists:
- raise ValueError("external reference to "
- "result_mask held, but "
- "Vector.resize() needed")
- result_mask.resize()
- append_data_{{dtype}}(ud, val)
- if use_result_mask:
- append_data_uint8(rmd, 0)
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- if use_result_mask:
- return uniques.to_array(), result_mask.to_array()
- return uniques.to_array()
- def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
- Parameters
- ----------
- values : ndarray[{{dtype}}]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- Returns
- -------
- uniques : ndarray[{{dtype}}]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- result_mask: ndarray[bool], if mask is given as input
- The mask for the result values.
- """
- uniques = {{name}}Vector()
- use_result_mask = True if mask is not None else False
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
- def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
- Parameters
- ----------
- values : ndarray[{{dtype}}]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- If not None, the mask is used as indicator for missing values
- (True = missing, False = valid) instead of `na_value` or
- condition "val != val".
- Returns
- -------
- uniques : ndarray[{{dtype}}]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = {{name}}Vector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na, mask=mask,
- return_inverse=True)
- def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True, mask=mask)
- return labels
- {{if dtype == 'int64'}}
- @cython.boundscheck(False)
- def get_labels_groupby(
- self, const {{dtype}}_t[:] values
- ) -> tuple[ndarray, ndarray]:
- # tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]]
- cdef:
- Py_ssize_t i, n = len(values)
- intp_t[::1] labels
- Py_ssize_t idx, count = 0
- int ret = 0
- {{c_type}} val
- khiter_t k
- {{name}}Vector uniques = {{name}}Vector()
- {{name}}VectorData *ud
- labels = np.empty(n, dtype=np.intp)
- ud = uniques.data
- with nogil:
- for i in range(n):
- val = {{to_c_type}}(values[i])
- # specific for groupby
- if val < 0:
- labels[i] = -1
- continue
- k = kh_get_{{dtype}}(self.table, val)
- if k != self.table.n_buckets:
- idx = self.table.vals[k]
- labels[i] = idx
- else:
- k = kh_put_{{dtype}}(self.table, val, &ret)
- self.table.vals[k] = count
- if needs_resize(ud):
- with gil:
- uniques.resize()
- append_data_{{dtype}}(ud, val)
- labels[i] = count
- count += 1
- arr_uniques = uniques.to_array()
- return np.asarray(labels), arr_uniques
- {{endif}}
- cdef class {{name}}Factorizer(Factorizer):
- cdef public:
- {{name}}HashTable table
- {{name}}Vector uniques
- def __cinit__(self, size_hint: int):
- self.table = {{name}}HashTable(size_hint)
- self.uniques = {{name}}Vector()
- def factorize(self, const {{c_type}}[:] values,
- na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
- """
- Returns
- -------
- ndarray[intp_t]
- Examples
- --------
- Factorize values with nans replaced by na_sentinel
- >>> fac = {{name}}Factorizer(3)
- >>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20)
- array([0, 1, 2])
- """
- cdef:
- ndarray[intp_t] labels
- if self.uniques.external_view_exists:
- uniques = {{name}}Vector()
- uniques.extend(self.uniques.to_array())
- self.uniques = uniques
- labels = self.table.get_labels(values, self.uniques,
- self.count, na_sentinel,
- na_value=na_value, mask=mask)
- self.count = len(self.uniques)
- return labels
- {{endfor}}
- cdef class StringHashTable(HashTable):
- # these by-definition *must* be strings
- # or a sentinel np.nan / None missing value
- na_string_sentinel = '__nan__'
- def __init__(self, int64_t size_hint=1):
- self.table = kh_init_str()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_str(self.table, size_hint)
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_str(self.table)
- self.table = NULL
- def sizeof(self, deep: bool = False) -> int:
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(char *) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
- def get_state(self) -> dict[str, int]:
- """ returns infos about the state of the hashtable"""
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
- cpdef get_item(self, str val):
- cdef:
- khiter_t k
- const char *v
- v = get_c_string(val)
- k = kh_get_str(self.table, v)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
- cpdef set_item(self, str key, Py_ssize_t val):
- cdef:
- khiter_t k
- int ret = 0
- const char *v
- v = get_c_string(key)
- k = kh_put_str(self.table, v, &ret)
- if kh_exist_str(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
- @cython.boundscheck(False)
- def get_indexer(self, ndarray[object] values) -> ndarray:
- # -> np.ndarray[np.intp]
- cdef:
- Py_ssize_t i, n = len(values)
- ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
- intp_t *resbuf = <intp_t*>labels.data
- khiter_t k
- kh_str_t *table = self.table
- const char *v
- const char **vecs
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
- v = get_c_string(val)
- vecs[i] = v
- with nogil:
- for i in range(n):
- k = kh_get_str(table, vecs[i])
- if k != table.n_buckets:
- resbuf[i] = table.vals[k]
- else:
- resbuf[i] = -1
- free(vecs)
- return labels
- @cython.boundscheck(False)
- def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- const char *v
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- # these by-definition *must* be strings
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
- if isinstance(val, str):
- # GH#31499 if we have a np.str_ get_c_string won't recognize
- # it as a str, even though isinstance does.
- v = get_c_string(<str>val)
- else:
- v = get_c_string(self.na_string_sentinel)
- vecs[i] = v
- with nogil:
- for i in range(n):
- v = vecs[i]
- k = kh_get_str(self.table, v)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
- free(vecs)
- return np.asarray(locs)
- @cython.boundscheck(False)
- def map_locations(self, ndarray[object] values, object mask = None) -> None:
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- const char *v
- const char **vecs
- khiter_t k
- # these by-definition *must* be strings
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
- if isinstance(val, str):
- # GH#31499 if we have a np.str_ get_c_string won't recognize
- # it as a str, even though isinstance does.
- v = get_c_string(<str>val)
- else:
- v = get_c_string(self.na_string_sentinel)
- vecs[i] = v
- with nogil:
- for i in range(n):
- v = vecs[i]
- k = kh_put_str(self.table, v, &ret)
- self.table.vals[k] = i
- free(vecs)
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- bint return_inverse=False):
- """
- Calculate unique values and labels (no sorting!)
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- uniques : ObjectVector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then any value
- that is not a string is considered missing. If na_value is
- not None, then _additionally_ any value "val" satisfying
- val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int64_t[::1] uindexer
- int ret = 0
- object val
- const char *v
- const char **vecs
- khiter_t k
- bint use_na_value
- if return_inverse:
- labels = np.zeros(n, dtype=np.intp)
- uindexer = np.empty(n, dtype=np.int64)
- use_na_value = na_value is not None
- # assign pointers and pre-filter out missing (if ignore_na)
- vecs = <const char **>malloc(n * sizeof(char *))
- for i in range(n):
- val = values[i]
- if (ignore_na
- and (not isinstance(val, str)
- or (use_na_value and val == na_value))):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), we can skip the actual value, and
- # replace the label with na_sentinel directly
- labels[i] = na_sentinel
- else:
- # if ignore_na is False, we also stringify NaN/None/etc.
- try:
- v = get_c_string(<str>val)
- except UnicodeEncodeError:
- v = get_c_string(<str>repr(val))
- vecs[i] = v
- # compute
- with nogil:
- for i in range(n):
- if ignore_na and labels[i] == na_sentinel:
- # skip entries for ignored missing values (see above)
- continue
- v = vecs[i]
- k = kh_get_str(self.table, v)
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_str(self.table, v, &ret)
- uindexer[count] = i
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
- free(vecs)
- # uniques
- for i in range(count):
- uniques.append(values[uindexer[i]])
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- return uniques.to_array()
- def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- Not yet implemented for StringHashTable
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- """
- uniques = ObjectVector()
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse)
- def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then any value
- that is not a string is considered missing. If na_value is
- not None, then _additionally_ any value "val" satisfying
- val == na_value is considered missing.
- mask : ndarray[bool], optional
- Not yet implemented for StringHashTable.
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp]
- The labels from values to uniques
- """
- uniques_vector = ObjectVector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na,
- return_inverse=True)
- def get_labels(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True)
- return labels
- cdef class PyObjectHashTable(HashTable):
- def __init__(self, int64_t size_hint=1):
- self.table = kh_init_pymap()
- size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
- kh_resize_pymap(self.table, size_hint)
- def __dealloc__(self):
- if self.table is not NULL:
- kh_destroy_pymap(self.table)
- self.table = NULL
- def __len__(self) -> int:
- return self.table.size
- def __contains__(self, object key) -> bool:
- cdef:
- khiter_t k
- hash(key)
- k = kh_get_pymap(self.table, <PyObject*>key)
- return k != self.table.n_buckets
- def sizeof(self, deep: bool = False) -> int:
- """ return the size of my table in bytes """
- overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
- for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
- for_pairs = self.table.n_buckets * (sizeof(PyObject *) + # keys
- sizeof(Py_ssize_t)) # vals
- return overhead + for_flags + for_pairs
- def get_state(self) -> dict[str, int]:
- """
- returns infos about the current state of the hashtable like size,
- number of buckets and so on.
- """
- return {
- 'n_buckets' : self.table.n_buckets,
- 'size' : self.table.size,
- 'n_occupied' : self.table.n_occupied,
- 'upper_bound' : self.table.upper_bound,
- }
- cpdef get_item(self, object val):
- cdef:
- khiter_t k
- k = kh_get_pymap(self.table, <PyObject*>val)
- if k != self.table.n_buckets:
- return self.table.vals[k]
- else:
- raise KeyError(val)
- cpdef set_item(self, object key, Py_ssize_t val):
- cdef:
- khiter_t k
- int ret = 0
- char* buf
- hash(key)
- k = kh_put_pymap(self.table, <PyObject*>key, &ret)
- if kh_exist_pymap(self.table, k):
- self.table.vals[k] = val
- else:
- raise KeyError(key)
- def map_locations(self, ndarray[object] values, object mask = None) -> None:
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- khiter_t k
- for i in range(n):
- val = values[i]
- hash(val)
- k = kh_put_pymap(self.table, <PyObject*>val, &ret)
- self.table.vals[k] = i
- def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
- # -> np.ndarray[np.intp]
- # mask not yet implemented
- cdef:
- Py_ssize_t i, n = len(values)
- int ret = 0
- object val
- khiter_t k
- intp_t[::1] locs = np.empty(n, dtype=np.intp)
- for i in range(n):
- val = values[i]
- hash(val)
- k = kh_get_pymap(self.table, <PyObject*>val)
- if k != self.table.n_buckets:
- locs[i] = self.table.vals[k]
- else:
- locs[i] = -1
- return np.asarray(locs)
- @cython.boundscheck(False)
- @cython.wraparound(False)
- def _unique(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None, bint ignore_na=False,
- bint return_inverse=False):
- """
- Calculate unique values and labels (no sorting!)
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- uniques : ObjectVector
- Vector into which uniques will be written
- count_prior : Py_ssize_t, default 0
- Number of existing entries in uniques
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then None _plus_
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- ignore_na : bool, default False
- Whether NA-values should be ignored for calculating the uniques. If
- True, the labels corresponding to missing values will be set to
- na_sentinel.
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse=True)
- The labels from values to uniques
- """
- cdef:
- Py_ssize_t i, idx, count = count_prior, n = len(values)
- intp_t[::1] labels
- int ret = 0
- object val
- khiter_t k
- bint use_na_value
- if return_inverse:
- labels = np.empty(n, dtype=np.intp)
- use_na_value = na_value is not None
- for i in range(n):
- val = values[i]
- hash(val)
- if ignore_na and (
- checknull(val)
- or (use_na_value and val == na_value)
- ):
- # if missing values do not count as unique values (i.e. if
- # ignore_na is True), skip the hashtable entry for them, and
- # replace the corresponding label with na_sentinel
- labels[i] = na_sentinel
- continue
- k = kh_get_pymap(self.table, <PyObject*>val)
- if k == self.table.n_buckets:
- # k hasn't been seen yet
- k = kh_put_pymap(self.table, <PyObject*>val, &ret)
- uniques.append(val)
- if return_inverse:
- self.table.vals[k] = count
- labels[i] = count
- count += 1
- elif return_inverse:
- # k falls into a previous bucket
- # only relevant in case we need to construct the inverse
- idx = self.table.vals[k]
- labels[i] = idx
- if return_inverse:
- return uniques.to_array(), labels.base # .base -> underlying ndarray
- return uniques.to_array()
- def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
- """
- Calculate unique values and labels (no sorting!)
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- return_inverse : bool, default False
- Whether the mapping of the original array values to their location
- in the vector of uniques should be returned.
- mask : ndarray[bool], optional
- Not yet implemented for PyObjectHashTable
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t] (if return_inverse)
- The labels from values to uniques
- """
- uniques = ObjectVector()
- return self._unique(values, uniques, ignore_na=False,
- return_inverse=return_inverse)
- def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
- object na_value=None, object mask=None, ignore_na=True):
- """
- Calculate unique values and labels (no sorting!)
- Missing values are not included in the "uniques" for this method.
- The labels for any missing values will be set to "na_sentinel"
- Parameters
- ----------
- values : ndarray[object]
- Array of values of which unique will be calculated
- na_sentinel : Py_ssize_t, default -1
- Sentinel value used for all NA-values in inverse
- na_value : object, default None
- Value to identify as missing. If na_value is None, then None _plus_
- any value "val" satisfying val != val is considered missing.
- If na_value is not None, then _additionally_, any value "val"
- satisfying val == na_value is considered missing.
- mask : ndarray[bool], optional
- Not yet implemented for PyObjectHashTable.
- Returns
- -------
- uniques : ndarray[object]
- Unique values of input, not sorted
- labels : ndarray[intp_t]
- The labels from values to uniques
- """
- uniques_vector = ObjectVector()
- return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
- na_value=na_value, ignore_na=ignore_na,
- return_inverse=True)
- def get_labels(self, ndarray[object] values, ObjectVector uniques,
- Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
- object na_value=None):
- # -> np.ndarray[np.intp]
- _, labels = self._unique(values, uniques, count_prior=count_prior,
- na_sentinel=na_sentinel, na_value=na_value,
- ignore_na=True, return_inverse=True)
- return labels
|