hashtable_class_helper.pxi.in

  1. """
  2. Template for each `dtype` helper function for hashtable
  3. WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
  4. """
  5. {{py:
  6. # name
  7. complex_types = ['complex64',
  8. 'complex128']
  9. }}
  10. {{for name in complex_types}}
  11. cdef kh{{name}}_t to_kh{{name}}_t({{name}}_t val) nogil:
  12. cdef kh{{name}}_t res
  13. res.real = val.real
  14. res.imag = val.imag
  15. return res
  16. {{endfor}}
{{py:

# name
c_types = ['khcomplex128_t',
           'khcomplex64_t',
           'float64_t',
           'float32_t',
           'int64_t',
           'int32_t',
           'int16_t',
           'int8_t',
           'uint64_t',
           'uint32_t',
           'uint16_t',
           'uint8_t']
}}

{{for c_type in c_types}}

cdef bint is_nan_{{c_type}}({{c_type}} val) nogil:
    {{if c_type in {'khcomplex128_t', 'khcomplex64_t'} }}
    return val.real != val.real or val.imag != val.imag
    {{elif c_type in {'float64_t', 'float32_t'} }}
    return val != val
    {{else}}
    return False
    {{endif}}

{{if c_type in {'khcomplex128_t', 'khcomplex64_t', 'float64_t', 'float32_t'} }}
# are_equivalent_{{c_type}} is cimported via khash.pxd
{{else}}
cdef bint are_equivalent_{{c_type}}({{c_type}} val1, {{c_type}} val2) nogil:
    return val1 == val2
{{endif}}

{{endfor}}
{{py:

# name
cimported_types = ['complex64',
                   'complex128',
                   'float32',
                   'float64',
                   'int8',
                   'int16',
                   'int32',
                   'int64',
                   'pymap',
                   'str',
                   'strbox',
                   'uint8',
                   'uint16',
                   'uint32',
                   'uint64']
}}

{{for name in cimported_types}}
from pandas._libs.khash cimport (
    kh_destroy_{{name}},
    kh_exist_{{name}},
    kh_get_{{name}},
    kh_init_{{name}},
    kh_put_{{name}},
    kh_resize_{{name}},
)
{{endfor}}
# ----------------------------------------------------------------------
# VectorData
# ----------------------------------------------------------------------

from pandas._libs.tslibs.util cimport get_c_string
from pandas._libs.missing cimport C_NA

{{py:

# name, dtype, c_type
# the generated StringVector is not actually used
# but is included for completeness (rather ObjectVector is used
# for uniques in hashtables)

dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
          ('Complex64', 'complex64', 'khcomplex64_t'),
          ('Float64', 'float64', 'float64_t'),
          ('Float32', 'float32', 'float32_t'),
          ('Int64', 'int64', 'int64_t'),
          ('Int32', 'int32', 'int32_t'),
          ('Int16', 'int16', 'int16_t'),
          ('Int8', 'int8', 'int8_t'),
          ('String', 'string', 'char *'),
          ('UInt64', 'uint64', 'uint64_t'),
          ('UInt32', 'uint32', 'uint32_t'),
          ('UInt16', 'uint16', 'uint16_t'),
          ('UInt8', 'uint8', 'uint8_t')]
}}
{{for name, dtype, c_type in dtypes}}

{{if dtype != 'int64'}}

# Int64VectorData is defined in the .pxd file because it is needed (indirectly)
# by IntervalTree

ctypedef struct {{name}}VectorData:
    {{c_type}} *data
    Py_ssize_t n, m

{{endif}}

@cython.wraparound(False)
@cython.boundscheck(False)
cdef void append_data_{{dtype}}({{name}}VectorData *data,
                                {{c_type}} x) nogil:

    data.data[data.n] = x
    data.n += 1

{{endfor}}
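
# The append_data_* helpers above write without bounds checking; callers are
# expected to check needs_resize() (defined below) and grow the backing buffer
# before appending.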
ctypedef fused vector_data:
    Int64VectorData
    Int32VectorData
    Int16VectorData
    Int8VectorData
    UInt64VectorData
    UInt32VectorData
    UInt16VectorData
    UInt8VectorData
    Float64VectorData
    Float32VectorData
    Complex128VectorData
    Complex64VectorData
    StringVectorData

cdef bint needs_resize(vector_data *data) nogil:
    return data.n == data.m
# ----------------------------------------------------------------------
# Vector
# ----------------------------------------------------------------------

cdef class Vector:
    # cdef readonly:
    #    bint external_view_exists

    def __cinit__(self):
        self.external_view_exists = False

{{py:

# name, dtype, c_type
dtypes = [('Complex128', 'complex128', 'khcomplex128_t'),
          ('Complex64', 'complex64', 'khcomplex64_t'),
          ('Float64', 'float64', 'float64_t'),
          ('UInt64', 'uint64', 'uint64_t'),
          ('Int64', 'int64', 'int64_t'),
          ('Float32', 'float32', 'float32_t'),
          ('UInt32', 'uint32', 'uint32_t'),
          ('Int32', 'int32', 'int32_t'),
          ('UInt16', 'uint16', 'uint16_t'),
          ('Int16', 'int16', 'int16_t'),
          ('UInt8', 'uint8', 'uint8_t'),
          ('Int8', 'int8', 'int8_t')]
}}
{{for name, dtype, c_type in dtypes}}

cdef class {{name}}Vector(Vector):

    # For int64 we have to put this declaration in the .pxd file;
    # Int64Vector is the only one we need exposed for other cython files.
    {{if dtype != 'int64'}}
    cdef:
        {{name}}VectorData *data
        ndarray ao
    {{endif}}

    def __cinit__(self):
        self.data = <{{name}}VectorData *>PyMem_Malloc(
            sizeof({{name}}VectorData))
        if not self.data:
            raise MemoryError()
        self.data.n = 0
        self.data.m = _INIT_VEC_CAP
        self.ao = np.empty(self.data.m, dtype=np.{{dtype}})
        self.data.data = <{{c_type}}*>self.ao.data

    cdef resize(self):
        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)
        self.ao.resize(self.data.m, refcheck=False)
        self.data.data = <{{c_type}}*>self.ao.data

    def __dealloc__(self):
        if self.data is not NULL:
            PyMem_Free(self.data)
            self.data = NULL

    def __len__(self) -> int:
        return self.data.n

    cpdef ndarray to_array(self):
        if self.data.m != self.data.n:
            if self.external_view_exists:
                # should never happen
                raise ValueError("should have raised on append()")
            self.ao.resize(self.data.n, refcheck=False)
            self.data.m = self.data.n
        self.external_view_exists = True
        return self.ao

    cdef void append(self, {{c_type}} x):

        if needs_resize(self.data):
            if self.external_view_exists:
                raise ValueError("external reference but "
                                 "Vector.resize() needed")
            self.resize()

        append_data_{{dtype}}(self.data, x)

    cdef extend(self, const {{c_type}}[:] x):
        for i in range(len(x)):
            self.append(x[i])

{{endfor}}
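
# Illustrative only (not generated code): from another Cython module a typed
# vector is typically used as
#
#     cdef Int64Vector vec = Int64Vector()
#     vec.append(42)
#     arr = vec.to_array()  # hands the buffer over as an ndarray; once an
#                           # external view exists, appends that would need a
#                           # resize raise ValueError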
cdef class StringVector(Vector):

    cdef:
        StringVectorData *data

    def __cinit__(self):
        self.data = <StringVectorData *>PyMem_Malloc(sizeof(StringVectorData))
        if not self.data:
            raise MemoryError()
        self.data.n = 0
        self.data.m = _INIT_VEC_CAP
        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
        if not self.data.data:
            raise MemoryError()

    cdef resize(self):
        cdef:
            char **orig_data
            Py_ssize_t i, m

        m = self.data.m
        self.data.m = max(self.data.m * 4, _INIT_VEC_CAP)

        orig_data = self.data.data
        self.data.data = <char **>malloc(self.data.m * sizeof(char *))
        if not self.data.data:
            raise MemoryError()
        for i in range(m):
            self.data.data[i] = orig_data[i]

    def __dealloc__(self):
        if self.data is not NULL:
            if self.data.data is not NULL:
                free(self.data.data)
            PyMem_Free(self.data)
            self.data = NULL

    def __len__(self) -> int:
        return self.data.n

    cpdef ndarray[object, ndim=1] to_array(self):
        cdef:
            ndarray ao
            Py_ssize_t n
            object val

        ao = np.empty(self.data.n, dtype=object)
        for i in range(self.data.n):
            val = self.data.data[i]
            ao[i] = val
        self.external_view_exists = True
        self.data.m = self.data.n
        return ao

    cdef void append(self, char *x):

        if needs_resize(self.data):
            self.resize()

        append_data_string(self.data, x)

    cdef extend(self, ndarray[object] x):
        for i in range(len(x)):
            self.append(x[i])
cdef class ObjectVector(Vector):

    cdef:
        PyObject **data
        Py_ssize_t n, m
        ndarray ao

    def __cinit__(self):
        self.n = 0
        self.m = _INIT_VEC_CAP
        self.ao = np.empty(_INIT_VEC_CAP, dtype=object)
        self.data = <PyObject**>self.ao.data

    def __len__(self) -> int:
        return self.n

    cdef append(self, object obj):
        if self.n == self.m:
            if self.external_view_exists:
                raise ValueError("external reference but "
                                 "Vector.resize() needed")
            self.m = max(self.m * 2, _INIT_VEC_CAP)
            self.ao.resize(self.m, refcheck=False)
            self.data = <PyObject**>self.ao.data

        Py_INCREF(obj)
        self.data[self.n] = <PyObject*>obj
        self.n += 1

    cpdef ndarray[object, ndim=1] to_array(self):
        if self.m != self.n:
            if self.external_view_exists:
                raise ValueError("should have raised on append()")
            self.ao.resize(self.n, refcheck=False)
            self.m = self.n
        self.external_view_exists = True
        return self.ao

    cdef extend(self, ndarray[object] x):
        for i in range(len(x)):
            self.append(x[i])
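
# ObjectVector writes raw PyObject* pointers directly into the buffer of an
# object-dtype ndarray; the Py_INCREF in append() accounts for the reference
# that the ndarray slot then owns.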
# ----------------------------------------------------------------------
# HashTable
# ----------------------------------------------------------------------

cdef class HashTable:

    pass

{{py:

# name, dtype, c_type, to_c_type
dtypes = [('Complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
          ('Float64', 'float64', 'float64_t', ''),
          ('UInt64', 'uint64', 'uint64_t', ''),
          ('Int64', 'int64', 'int64_t', ''),
          ('Complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
          ('Float32', 'float32', 'float32_t', ''),
          ('UInt32', 'uint32', 'uint32_t', ''),
          ('Int32', 'int32', 'int32_t', ''),
          ('UInt16', 'uint16', 'uint16_t', ''),
          ('Int16', 'int16', 'int16_t', ''),
          ('UInt8', 'uint8', 'uint8_t', ''),
          ('Int8', 'int8', 'int8_t', '')]
}}
{{for name, dtype, c_type, to_c_type in dtypes}}

cdef class {{name}}HashTable(HashTable):

    def __cinit__(self, int64_t size_hint=1, bint uses_mask=False):
        self.table = kh_init_{{dtype}}()
        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
        kh_resize_{{dtype}}(self.table, size_hint)

        self.uses_mask = uses_mask
        self.na_position = -1

    def __len__(self) -> int:
        return self.table.size + (0 if self.na_position == -1 else 1)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_{{dtype}}(self.table)
            self.table = NULL

    def __contains__(self, object key) -> bool:
        # The caller is responsible for checking for compatible NA values in
        # case of masked arrays.
        cdef:
            khiter_t k
            {{c_type}} ckey

        if self.uses_mask and checknull(key):
            return -1 != self.na_position

        ckey = {{to_c_type}}(key)
        k = kh_get_{{dtype}}(self.table, ckey)
        return k != self.table.n_buckets

    def sizeof(self, deep: bool = False) -> int:
        """ return the size of my table in bytes """
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs = self.table.n_buckets * (sizeof({{dtype}}_t) +  # keys
                                            sizeof(Py_ssize_t))  # vals
        return overhead + for_flags + for_pairs

    def get_state(self) -> dict[str, int]:
        """ Returns info about the state of the hashtable. """
        return {
            'n_buckets' : self.table.n_buckets,
            'size' : self.table.size,
            'n_occupied' : self.table.n_occupied,
            'upper_bound' : self.table.upper_bound,
        }
    cpdef get_item(self, {{dtype}}_t val):
        """Extracts the position of val from the hashtable.

        Parameters
        ----------
        val : Scalar
            The value that is looked up in the hashtable

        Returns
        -------
        The position of the requested integer.
        """

        # Used in core.sorting, IndexEngine.get_loc
        # Caller is responsible for checking for pd.NA
        cdef:
            khiter_t k
            {{c_type}} cval

        cval = {{to_c_type}}(val)
        k = kh_get_{{dtype}}(self.table, cval)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef get_na(self):
        """Extracts the position of na_value from the hashtable.

        Returns
        -------
        The position of the last na value.
        """

        if not self.uses_mask:
            raise NotImplementedError

        if self.na_position == -1:
            raise KeyError("NA")

        return self.na_position
    cpdef set_item(self, {{dtype}}_t key, Py_ssize_t val):
        # Used in libjoin
        # Caller is responsible for checking for pd.NA
        cdef:
            khiter_t k
            int ret = 0
            {{c_type}} ckey

        ckey = {{to_c_type}}(key)
        k = kh_put_{{dtype}}(self.table, ckey, &ret)
        if kh_exist_{{dtype}}(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    cpdef set_na(self, Py_ssize_t val):
        # Caller is responsible for checking for pd.NA
        cdef:
            khiter_t k
            int ret = 0
            {{c_type}} ckey

        if not self.uses_mask:
            raise NotImplementedError

        self.na_position = val
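
    # When uses_mask=True, NA is never stored in the khash table itself; its
    # location is tracked separately in self.na_position (set by set_na /
    # map_locations and returned by get_na / lookup).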
    {{if dtype == "int64" }}
    # We only use this for int64, can reduce build size and make .pyi
    #  more accurate by only implementing it for int64
    @cython.boundscheck(False)
    def map_keys_to_values(
        self, const {{dtype}}_t[:] keys, const int64_t[:] values
    ) -> None:
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{c_type}} key
            khiter_t k

        with nogil:
            for i in range(n):
                key = {{to_c_type}}(keys[i])
                k = kh_put_{{dtype}}(self.table, key, &ret)
                self.table.vals[k] = <Py_ssize_t>values[i]
    {{endif}}
    @cython.boundscheck(False)
    def map_locations(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> None:
        # Used in libindex, safe_sort
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{c_type}} val
            khiter_t k
            int8_t na_position = self.na_position

        if self.uses_mask and mask is None:
            raise NotImplementedError  # pragma: no cover

        with nogil:
            if self.uses_mask:
                for i in range(n):
                    if mask[i]:
                        na_position = i
                    else:
                        val = {{to_c_type}}(values[i])
                        k = kh_put_{{dtype}}(self.table, val, &ret)
                        self.table.vals[k] = i
            else:
                for i in range(n):
                    val = {{to_c_type}}(values[i])
                    k = kh_put_{{dtype}}(self.table, val, &ret)
                    self.table.vals[k] = i
        self.na_position = na_position
    @cython.boundscheck(False)
    def lookup(self, const {{dtype}}_t[:] values, const uint8_t[:] mask = None) -> ndarray:
        # -> np.ndarray[np.intp]
        # Used in safe_sort, IndexEngine.get_indexer
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            {{c_type}} val
            khiter_t k
            intp_t[::1] locs = np.empty(n, dtype=np.intp)
            int8_t na_position = self.na_position

        if self.uses_mask and mask is None:
            raise NotImplementedError  # pragma: no cover

        with nogil:
            for i in range(n):
                if self.uses_mask and mask[i]:
                    locs[i] = na_position
                else:
                    val = {{to_c_type}}(values[i])
                    k = kh_get_{{dtype}}(self.table, val)
                    if k != self.table.n_buckets:
                        locs[i] = self.table.vals[k]
                    else:
                        locs[i] = -1

        return np.asarray(locs)
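
    # Illustrative only (not generated code): the typical map/lookup pattern is
    #
    #     table = {{name}}HashTable()
    #     table.map_locations(left_values)      # key -> position (last wins)
    #     indexer = table.lookup(right_values)  # np.intp positions, -1 if missing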
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                object mask=None, bint return_inverse=False, bint use_result_mask=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        uniques : {{name}}Vector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        ignore_na : bool, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        mask : ndarray[bool], optional
            If not None, the mask is used as indicator for missing values
            (True = missing, False = valid) instead of `na_value` or
            condition "val != val".
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        use_result_mask : bool, default False
            Whether to create a result mask for the unique values. Not supported
            with return_inverse=True.

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse=True)
            The labels from values to uniques
        result_mask : ndarray[bool], if use_result_mask is true
            The mask for the result values.
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            intp_t[::1] labels
            int ret = 0
            {{c_type}} val, na_value2
            khiter_t k
            {{name}}VectorData *ud
            UInt8Vector result_mask
            UInt8VectorData *rmd
            bint use_na_value, use_mask, seen_na = False
            uint8_t[:] mask_values

        if return_inverse:
            labels = np.empty(n, dtype=np.intp)
        ud = uniques.data
        use_na_value = na_value is not None
        use_mask = mask is not None
        if not use_mask and use_result_mask:
            raise NotImplementedError  # pragma: no cover

        if use_result_mask and return_inverse:
            raise NotImplementedError  # pragma: no cover

        result_mask = UInt8Vector()
        rmd = result_mask.data

        if use_mask:
            mask_values = mask.view("uint8")

        if use_na_value:
            # We need this na_value2 because we want to allow users
            # to *optionally* specify an NA sentinel *of the correct* type.
            # We use None, to make it optional, which requires `object` type
            # for the parameter. To please the compiler, we use na_value2,
            # which is only used if it's *specified*.
            na_value2 = {{to_c_type}}(na_value)
        else:
            na_value2 = {{to_c_type}}(0)

        with nogil:
            for i in range(n):
                val = {{to_c_type}}(values[i])

                if ignore_na and use_mask:
                    if mask_values[i]:
                        labels[i] = na_sentinel
                        continue
                elif ignore_na and (
                    is_nan_{{c_type}}(val) or
                    (use_na_value and are_equivalent_{{c_type}}(val, na_value2))
                ):
                    # if missing values do not count as unique values (i.e. if
                    #  ignore_na is True), skip the hashtable entry for them,
                    #  and replace the corresponding label with na_sentinel
                    labels[i] = na_sentinel
                    continue
                elif not ignore_na and use_result_mask:
                    if mask_values[i]:
                        if seen_na:
                            continue

                        seen_na = True
                        if needs_resize(ud):
                            with gil:
                                if uniques.external_view_exists:
                                    raise ValueError("external reference to "
                                                     "uniques held, but "
                                                     "Vector.resize() needed")
                                uniques.resize()
                                if result_mask.external_view_exists:
                                    raise ValueError("external reference to "
                                                     "result_mask held, but "
                                                     "Vector.resize() needed")
                                result_mask.resize()
                        append_data_{{dtype}}(ud, val)
                        append_data_uint8(rmd, 1)
                        continue

                k = kh_get_{{dtype}}(self.table, val)

                if k == self.table.n_buckets:
                    # k hasn't been seen yet
                    k = kh_put_{{dtype}}(self.table, val, &ret)

                    if needs_resize(ud):
                        with gil:
                            if uniques.external_view_exists:
                                raise ValueError("external reference to "
                                                 "uniques held, but "
                                                 "Vector.resize() needed")
                            uniques.resize()

                            if use_result_mask:
                                if result_mask.external_view_exists:
                                    raise ValueError("external reference to "
                                                     "result_mask held, but "
                                                     "Vector.resize() needed")
                                result_mask.resize()
                    append_data_{{dtype}}(ud, val)
                    if use_result_mask:
                        append_data_uint8(rmd, 0)

                    if return_inverse:
                        self.table.vals[k] = count
                        labels[i] = count
                    count += 1
                elif return_inverse:
                    # k falls into a previous bucket
                    # only relevant in case we need to construct the inverse
                    idx = self.table.vals[k]
                    labels[i] = idx

        if return_inverse:
            return uniques.to_array(), labels.base  # .base -> underlying ndarray
        if use_result_mask:
            return uniques.to_array(), result_mask.to_array()
        return uniques.to_array()
    def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        mask : ndarray[bool], optional
            If not None, the mask is used as indicator for missing values
            (True = missing, False = valid) instead of `na_value` or
            condition "val != val".

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse)
            The labels from values to uniques
        result_mask : ndarray[bool], if mask is given as input
            The mask for the result values.
        """
        uniques = {{name}}Vector()
        use_result_mask = True if mask is not None else False
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse, mask=mask, use_result_mask=use_result_mask)
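
    # Illustrative only (not generated code): with a mask, unique() also
    # returns a result mask marking the single NA entry (if any), e.g.
    #
    #     uniques, result_mask = table.unique(values, mask=mask)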
    def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None, ignore_na=True):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[{{dtype}}]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        mask : ndarray[bool], optional
            If not None, the mask is used as indicator for missing values
            (True = missing, False = valid) instead of `na_value` or
            condition "val != val".

        Returns
        -------
        uniques : ndarray[{{dtype}}]
            Unique values of input, not sorted
        labels : ndarray[intp_t]
            The labels from values to uniques
        """
        uniques_vector = {{name}}Vector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=ignore_na, mask=mask,
                            return_inverse=True)
    def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None, object mask=None):
        # -> np.ndarray[np.intp]
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True, mask=mask)
        return labels
    {{if dtype == 'int64'}}
    @cython.boundscheck(False)
    def get_labels_groupby(
        self, const {{dtype}}_t[:] values
    ) -> tuple[ndarray, ndarray]:
        # tuple[np.ndarray[np.intp], np.ndarray[{{dtype}}]]
        cdef:
            Py_ssize_t i, n = len(values)
            intp_t[::1] labels
            Py_ssize_t idx, count = 0
            int ret = 0
            {{c_type}} val
            khiter_t k
            {{name}}Vector uniques = {{name}}Vector()
            {{name}}VectorData *ud

        labels = np.empty(n, dtype=np.intp)
        ud = uniques.data

        with nogil:
            for i in range(n):
                val = {{to_c_type}}(values[i])

                # specific for groupby
                if val < 0:
                    labels[i] = -1
                    continue

                k = kh_get_{{dtype}}(self.table, val)
                if k != self.table.n_buckets:
                    idx = self.table.vals[k]
                    labels[i] = idx
                else:
                    k = kh_put_{{dtype}}(self.table, val, &ret)
                    self.table.vals[k] = count

                    if needs_resize(ud):
                        with gil:
                            uniques.resize()
                    append_data_{{dtype}}(ud, val)
                    labels[i] = count
                    count += 1

        arr_uniques = uniques.to_array()

        return np.asarray(labels), arr_uniques
    {{endif}}
cdef class {{name}}Factorizer(Factorizer):
    cdef public:
        {{name}}HashTable table
        {{name}}Vector uniques

    def __cinit__(self, size_hint: int):
        self.table = {{name}}HashTable(size_hint)
        self.uniques = {{name}}Vector()

    def factorize(self, const {{c_type}}[:] values,
                  na_sentinel=-1, na_value=None, object mask=None) -> np.ndarray:
        """
        Returns
        -------
        ndarray[intp_t]

        Examples
        --------
        Factorize values with nans replaced by na_sentinel

        >>> fac = {{name}}Factorizer(3)
        >>> fac.factorize(np.array([1,2,3], dtype="{{dtype}}"), na_sentinel=20)
        array([0, 1, 2])
        """
        cdef:
            ndarray[intp_t] labels
        if self.uniques.external_view_exists:
            uniques = {{name}}Vector()
            uniques.extend(self.uniques.to_array())
            self.uniques = uniques
        labels = self.table.get_labels(values, self.uniques,
                                       self.count, na_sentinel,
                                       na_value=na_value, mask=mask)
        self.count = len(self.uniques)
        return labels

{{endfor}}
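
# StringHashTable keys the khash table on C strings obtained via get_c_string,
# i.e. pointers to the UTF-8 buffers cached on the Python str objects, so the
# originating strings must stay alive for as long as the table references them.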
cdef class StringHashTable(HashTable):
    # these by-definition *must* be strings
    # or a sentinel np.nan / None missing value
    na_string_sentinel = '__nan__'

    def __init__(self, int64_t size_hint=1):
        self.table = kh_init_str()
        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
        kh_resize_str(self.table, size_hint)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_str(self.table)
            self.table = NULL

    def sizeof(self, deep: bool = False) -> int:
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs = self.table.n_buckets * (sizeof(char *) +  # keys
                                            sizeof(Py_ssize_t))  # vals
        return overhead + for_flags + for_pairs

    def get_state(self) -> dict[str, int]:
        """ Returns info about the state of the hashtable. """
        return {
            'n_buckets' : self.table.n_buckets,
            'size' : self.table.size,
            'n_occupied' : self.table.n_occupied,
            'upper_bound' : self.table.upper_bound,
        }

    cpdef get_item(self, str val):
        cdef:
            khiter_t k
            const char *v
        v = get_c_string(val)

        k = kh_get_str(self.table, v)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef set_item(self, str key, Py_ssize_t val):
        cdef:
            khiter_t k
            int ret = 0
            const char *v

        v = get_c_string(key)

        k = kh_put_str(self.table, v, &ret)
        if kh_exist_str(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    @cython.boundscheck(False)
    def get_indexer(self, ndarray[object] values) -> ndarray:
        # -> np.ndarray[np.intp]
        cdef:
            Py_ssize_t i, n = len(values)
            ndarray[intp_t] labels = np.empty(n, dtype=np.intp)
            intp_t *resbuf = <intp_t*>labels.data
            khiter_t k
            kh_str_t *table = self.table
            const char *v
            const char **vecs

        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]
            v = get_c_string(val)
            vecs[i] = v

        with nogil:
            for i in range(n):
                k = kh_get_str(table, vecs[i])
                if k != table.n_buckets:
                    resbuf[i] = table.vals[k]
                else:
                    resbuf[i] = -1

        free(vecs)
        return labels
    @cython.boundscheck(False)
    def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
        # -> np.ndarray[np.intp]
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            const char *v
            khiter_t k
            intp_t[::1] locs = np.empty(n, dtype=np.intp)

        # these by-definition *must* be strings
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]
            if isinstance(val, str):
                # GH#31499 if we have a np.str_ get_c_string won't recognize
                # it as a str, even though isinstance does.
                v = get_c_string(<str>val)
            else:
                v = get_c_string(self.na_string_sentinel)
            vecs[i] = v

        with nogil:
            for i in range(n):
                v = vecs[i]
                k = kh_get_str(self.table, v)
                if k != self.table.n_buckets:
                    locs[i] = self.table.vals[k]
                else:
                    locs[i] = -1

        free(vecs)
        return np.asarray(locs)

    @cython.boundscheck(False)
    def map_locations(self, ndarray[object] values, object mask = None) -> None:
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            const char *v
            const char **vecs
            khiter_t k

        # these by-definition *must* be strings
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]
            if isinstance(val, str):
                # GH#31499 if we have a np.str_ get_c_string won't recognize
                # it as a str, even though isinstance does.
                v = get_c_string(<str>val)
            else:
                v = get_c_string(self.na_string_sentinel)
            vecs[i] = v

        with nogil:
            for i in range(n):
                v = vecs[i]
                k = kh_put_str(self.table, v, &ret)
                self.table.vals[k] = i
        free(vecs)
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, ndarray[object] values, ObjectVector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        uniques : ObjectVector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then any value
            that is not a string is considered missing. If na_value is
            not None, then _additionally_ any value "val" satisfying
            val == na_value is considered missing.
        ignore_na : bool, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse=True)
            The labels from values to uniques
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            intp_t[::1] labels
            int64_t[::1] uindexer
            int ret = 0
            object val
            const char *v
            const char **vecs
            khiter_t k
            bint use_na_value

        if return_inverse:
            labels = np.zeros(n, dtype=np.intp)
        uindexer = np.empty(n, dtype=np.int64)
        use_na_value = na_value is not None

        # assign pointers and pre-filter out missing (if ignore_na)
        vecs = <const char **>malloc(n * sizeof(char *))
        for i in range(n):
            val = values[i]

            if (ignore_na
                    and (not isinstance(val, str)
                         or (use_na_value and val == na_value))):
                # if missing values do not count as unique values (i.e. if
                #  ignore_na is True), we can skip the actual value, and
                #  replace the label with na_sentinel directly
                labels[i] = na_sentinel
            else:
                # if ignore_na is False, we also stringify NaN/None/etc.
                try:
                    v = get_c_string(<str>val)
                except UnicodeEncodeError:
                    v = get_c_string(<str>repr(val))
                vecs[i] = v

        # compute
        with nogil:
            for i in range(n):
                if ignore_na and labels[i] == na_sentinel:
                    # skip entries for ignored missing values (see above)
                    continue

                v = vecs[i]
                k = kh_get_str(self.table, v)
                if k == self.table.n_buckets:
                    # k hasn't been seen yet
                    k = kh_put_str(self.table, v, &ret)
                    uindexer[count] = i
                    if return_inverse:
                        self.table.vals[k] = count
                        labels[i] = count
                    count += 1
                elif return_inverse:
                    # k falls into a previous bucket
                    # only relevant in case we need to construct the inverse
                    idx = self.table.vals[k]
                    labels[i] = idx

        free(vecs)

        # uniques
        for i in range(count):
            uniques.append(values[uindexer[i]])

        if return_inverse:
            return uniques.to_array(), labels.base  # .base -> underlying ndarray
        return uniques.to_array()
    def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        mask : ndarray[bool], optional
            Not yet implemented for StringHashTable

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse)
            The labels from values to uniques
        """
        uniques = ObjectVector()
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse)

    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None, ignore_na=True):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then any value
            that is not a string is considered missing. If na_value is
            not None, then _additionally_ any value "val" satisfying
            val == na_value is considered missing.
        mask : ndarray[bool], optional
            Not yet implemented for StringHashTable.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp]
            The labels from values to uniques
        """
        uniques_vector = ObjectVector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=ignore_na,
                            return_inverse=True)

    def get_labels(self, ndarray[object] values, ObjectVector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
        # -> np.ndarray[np.intp]
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True)
        return labels
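
# PyObjectHashTable keys on arbitrary Python objects; the explicit hash(key)
# calls below surface TypeError for unhashable keys before the khash table is
# touched.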
cdef class PyObjectHashTable(HashTable):

    def __init__(self, int64_t size_hint=1):
        self.table = kh_init_pymap()
        size_hint = min(kh_needed_n_buckets(size_hint), SIZE_HINT_LIMIT)
        kh_resize_pymap(self.table, size_hint)

    def __dealloc__(self):
        if self.table is not NULL:
            kh_destroy_pymap(self.table)
            self.table = NULL

    def __len__(self) -> int:
        return self.table.size

    def __contains__(self, object key) -> bool:
        cdef:
            khiter_t k
        hash(key)

        k = kh_get_pymap(self.table, <PyObject*>key)
        return k != self.table.n_buckets

    def sizeof(self, deep: bool = False) -> int:
        """ return the size of my table in bytes """
        overhead = 4 * sizeof(uint32_t) + 3 * sizeof(uint32_t*)
        for_flags = max(1, self.table.n_buckets >> 5) * sizeof(uint32_t)
        for_pairs = self.table.n_buckets * (sizeof(PyObject *) +  # keys
                                            sizeof(Py_ssize_t))  # vals
        return overhead + for_flags + for_pairs

    def get_state(self) -> dict[str, int]:
        """
        Returns info about the current state of the hashtable, like its size,
        number of buckets and so on.
        """
        return {
            'n_buckets' : self.table.n_buckets,
            'size' : self.table.size,
            'n_occupied' : self.table.n_occupied,
            'upper_bound' : self.table.upper_bound,
        }
    cpdef get_item(self, object val):
        cdef:
            khiter_t k

        k = kh_get_pymap(self.table, <PyObject*>val)
        if k != self.table.n_buckets:
            return self.table.vals[k]
        else:
            raise KeyError(val)

    cpdef set_item(self, object key, Py_ssize_t val):
        cdef:
            khiter_t k
            int ret = 0
            char* buf

        hash(key)

        k = kh_put_pymap(self.table, <PyObject*>key, &ret)
        if kh_exist_pymap(self.table, k):
            self.table.vals[k] = val
        else:
            raise KeyError(key)

    def map_locations(self, ndarray[object] values, object mask = None) -> None:
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            khiter_t k

        for i in range(n):
            val = values[i]
            hash(val)

            k = kh_put_pymap(self.table, <PyObject*>val, &ret)
            self.table.vals[k] = i

    def lookup(self, ndarray[object] values, object mask = None) -> ndarray:
        # -> np.ndarray[np.intp]
        # mask not yet implemented
        cdef:
            Py_ssize_t i, n = len(values)
            int ret = 0
            object val
            khiter_t k
            intp_t[::1] locs = np.empty(n, dtype=np.intp)

        for i in range(n):
            val = values[i]
            hash(val)

            k = kh_get_pymap(self.table, <PyObject*>val)
            if k != self.table.n_buckets:
                locs[i] = self.table.vals[k]
            else:
                locs[i] = -1

        return np.asarray(locs)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def _unique(self, ndarray[object] values, ObjectVector uniques,
                Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                object na_value=None, bint ignore_na=False,
                bint return_inverse=False):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        uniques : ObjectVector
            Vector into which uniques will be written
        count_prior : Py_ssize_t, default 0
            Number of existing entries in uniques
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then None _plus_
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        ignore_na : bool, default False
            Whether NA-values should be ignored for calculating the uniques. If
            True, the labels corresponding to missing values will be set to
            na_sentinel.
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse=True)
            The labels from values to uniques
        """
        cdef:
            Py_ssize_t i, idx, count = count_prior, n = len(values)
            intp_t[::1] labels
            int ret = 0
            object val
            khiter_t k
            bint use_na_value

        if return_inverse:
            labels = np.empty(n, dtype=np.intp)
        use_na_value = na_value is not None

        for i in range(n):
            val = values[i]
            hash(val)

            if ignore_na and (
                checknull(val)
                or (use_na_value and val == na_value)
            ):
                # if missing values do not count as unique values (i.e. if
                #  ignore_na is True), skip the hashtable entry for them, and
                #  replace the corresponding label with na_sentinel
                labels[i] = na_sentinel
                continue

            k = kh_get_pymap(self.table, <PyObject*>val)
            if k == self.table.n_buckets:
                # k hasn't been seen yet
                k = kh_put_pymap(self.table, <PyObject*>val, &ret)
                uniques.append(val)
                if return_inverse:
                    self.table.vals[k] = count
                    labels[i] = count
                count += 1
            elif return_inverse:
                # k falls into a previous bucket
                # only relevant in case we need to construct the inverse
                idx = self.table.vals[k]
                labels[i] = idx

        if return_inverse:
            return uniques.to_array(), labels.base  # .base -> underlying ndarray
        return uniques.to_array()

    def unique(self, ndarray[object] values, bint return_inverse=False, object mask=None):
        """
        Calculate unique values and labels (no sorting!)

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        return_inverse : bool, default False
            Whether the mapping of the original array values to their location
            in the vector of uniques should be returned.
        mask : ndarray[bool], optional
            Not yet implemented for PyObjectHashTable

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t] (if return_inverse)
            The labels from values to uniques
        """
        uniques = ObjectVector()
        return self._unique(values, uniques, ignore_na=False,
                            return_inverse=return_inverse)

    def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
                  object na_value=None, object mask=None, ignore_na=True):
        """
        Calculate unique values and labels (no sorting!)

        Missing values are not included in the "uniques" for this method.
        The labels for any missing values will be set to "na_sentinel"

        Parameters
        ----------
        values : ndarray[object]
            Array of values of which unique will be calculated
        na_sentinel : Py_ssize_t, default -1
            Sentinel value used for all NA-values in inverse
        na_value : object, default None
            Value to identify as missing. If na_value is None, then None _plus_
            any value "val" satisfying val != val is considered missing.
            If na_value is not None, then _additionally_, any value "val"
            satisfying val == na_value is considered missing.
        mask : ndarray[bool], optional
            Not yet implemented for PyObjectHashTable.

        Returns
        -------
        uniques : ndarray[object]
            Unique values of input, not sorted
        labels : ndarray[intp_t]
            The labels from values to uniques
        """
        uniques_vector = ObjectVector()
        return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
                            na_value=na_value, ignore_na=ignore_na,
                            return_inverse=True)

    def get_labels(self, ndarray[object] values, ObjectVector uniques,
                   Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1,
                   object na_value=None):
        # -> np.ndarray[np.intp]
        _, labels = self._unique(values, uniques, count_prior=count_prior,
                                 na_sentinel=na_sentinel, na_value=na_value,
                                 ignore_na=True, return_inverse=True)
        return labels