123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484 |
- """
- Template for each `dtype` helper function for hashtable
- WARNING: DO NOT edit .pxi FILE directly, .pxi is generated from .pxi.in
- """
{{py:

# One entry per generated specialization:
#   (name, dtype, ttype, c_type, to_c_type)
#
#   name      - prefix of the ``<name>Vector`` class used to collect keys
#   dtype     - suffix of the generated function names (``value_count_<dtype>``)
#               and, for non-object dtypes, of the ``<dtype>_t`` memoryview type
#   ttype     - khash table flavour (``kh_<ttype>_t``, ``kh_init_<ttype>``, ...)
#   c_type    - C type used for the local holding a single value
#   to_c_type - conversion prefix applied to a value before it is handed to
#               khash (empty when no conversion is needed)
dtypes = [
    ('Complex128', 'complex128', 'complex128', 'khcomplex128_t', 'to_khcomplex128_t'),
    ('Complex64', 'complex64', 'complex64', 'khcomplex64_t', 'to_khcomplex64_t'),
    ('Float64', 'float64', 'float64', 'float64_t', ''),
    ('Float32', 'float32', 'float32', 'float32_t', ''),
    ('UInt64', 'uint64', 'uint64', 'uint64_t', ''),
    ('UInt32', 'uint32', 'uint32', 'uint32_t', ''),
    ('UInt16', 'uint16', 'uint16', 'uint16_t', ''),
    ('UInt8', 'uint8', 'uint8', 'uint8_t', ''),
    ('Object', 'object', 'pymap', 'object', '<PyObject*>'),
    ('Int64', 'int64', 'int64', 'int64_t', ''),
    ('Int32', 'int32', 'int32', 'int32_t', ''),
    ('Int16', 'int16', 'int16', 'int16_t', ''),
    ('Int8', 'int8', 'int8', 'int8_t', ''),
]

}}
- {{for name, dtype, ttype, c_type, to_c_type in dtypes}}
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef value_count_{{dtype}}(ndarray[{{dtype}}] values, bint dropna, const uint8_t[:] mask=None):
{{else}}
cdef value_count_{{dtype}}(const {{dtype}}_t[:] values, bint dropna, const uint8_t[:] mask=None):
{{endif}}
    """
    Count how often each distinct entry of `values` occurs.

    Parameters
    ----------
    values : {{dtype}} ndarray
    dropna : bint
        If True, missing entries are skipped. For the object variant an
        entry is missing when ``checknull`` says so; otherwise it is missing
        when ``mask[i]`` is set (if a mask is given) or when
        ``is_nan_{{c_type}}`` reports it.
    mask : const uint8_t[:], optional
        Per-element missing-value flags, only consulted when `dropna` is True.

    Returns
    -------
    tuple
        ``(keys, counts)`` where ``keys`` holds the distinct values in the
        order they were first seen and ``counts`` is the int64 array of the
        matching occurrence counts.

    Raises
    ------
    NotImplementedError
        If a mask is passed together with ``dropna=False`` or, for the
        object variant, whenever a mask is passed.
    """
    cdef:
        Py_ssize_t i = 0
        Py_ssize_t n = len(values)
        kh_{{ttype}}_t *table

        # Don't use Py_ssize_t, since table.n_buckets is unsigned
        khiter_t k

        {{c_type}} val

        int ret = 0
        bint uses_mask = mask is not None
        bint isna_entry = False

    if uses_mask and not dropna:
        raise NotImplementedError("uses_mask not implemented with dropna=False")

    {{if dtype == 'object'}}
    # Reject masks *before* the table is allocated: raising after
    # kh_init_{{ttype}}() would leak the table, since nothing destroys it on
    # the error path.
    if uses_mask:
        raise NotImplementedError("uses_mask not implemented with object dtype")
    {{endif}}

    # we track the order in which keys are first seen (GH39009),
    # khash-map isn't insertion-ordered, thus:
    #    table maps keys to counts
    #    result_keys remembers the original order of keys

    result_keys = {{name}}Vector()
    table = kh_init_{{ttype}}()

    {{if dtype == 'object'}}
    kh_resize_{{ttype}}(table, n // 10)

    for i in range(n):
        val = values[i]
        if not dropna or not checknull(val):
            k = kh_get_{{ttype}}(table, {{to_c_type}}val)
            if k != table.n_buckets:
                # key already present: bump its count
                table.vals[k] += 1
            else:
                # first sighting: insert with count 1 and remember the order
                k = kh_put_{{ttype}}(table, {{to_c_type}}val, &ret)
                table.vals[k] = 1
                result_keys.append(val)
    {{else}}
    kh_resize_{{ttype}}(table, n)

    for i in range(n):
        val = {{to_c_type}}(values[i])

        if dropna:
            if uses_mask:
                isna_entry = mask[i]
            else:
                isna_entry = is_nan_{{c_type}}(val)

        if not dropna or not isna_entry:
            k = kh_get_{{ttype}}(table, val)
            if k != table.n_buckets:
                # key already present: bump its count
                table.vals[k] += 1
            else:
                # first sighting: insert with count 1 and remember the order
                k = kh_put_{{ttype}}(table, val, &ret)
                table.vals[k] = 1
                result_keys.append(val)
    {{endif}}

    # collect counts in the order corresponding to result_keys:
    cdef:
        int64_t[::1] result_counts = np.empty(table.size, dtype=np.int64)

    for i in range(table.size):
        {{if dtype == 'object'}}
        k = kh_get_{{ttype}}(table, result_keys.data[i])
        {{else}}
        k = kh_get_{{ttype}}(table, result_keys.data.data[i])
        {{endif}}
        result_counts[i] = table.vals[k]

    kh_destroy_{{ttype}}(table)

    return result_keys.to_array(), result_counts.base
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef duplicated_{{dtype}}(ndarray[{{dtype}}] values, object keep='first', const uint8_t[:] mask=None):
{{else}}
cdef duplicated_{{dtype}}(const {{dtype}}_t[:] values, object keep='first', const uint8_t[:] mask=None):
{{endif}}
    """
    Flag the entries of `values` that are duplicates.

    Parameters
    ----------
    values : {{dtype}} ndarray
    keep : {'first', 'last', False}, default 'first'
        - 'first' : scan front to back; every occurrence after the first one
          is flagged.
        - 'last' : scan back to front; every occurrence before the last one
          is flagged.
        - False : every occurrence of a value that appears more than once is
          flagged.
    mask : const uint8_t[:], optional
        Entries with ``mask[i]`` set are not looked up in the hash table;
        they are treated as one group of missing values and de-duplicated
        among themselves following the same `keep` rule.

    Returns
    -------
    ndarray of bool, same length as `values`

    Raises
    ------
    ValueError
        If `keep` is not one of 'first', 'last' or False.
    """
    cdef:
        int ret = 0
        {{if dtype != 'object'}}
        {{c_type}} value
        {{else}}
        PyObject* value
        {{endif}}
        Py_ssize_t i, n = len(values), first_na = -1
        khiter_t k
        kh_{{ttype}}_t *table
        ndarray[uint8_t, ndim=1, cast=True] out = np.empty(n, dtype='bool')
        bint seen_na = False, uses_mask = mask is not None
        bint seen_multiple_na = False

    # Validate before the hash table exists: raising after kh_init would leak
    # the table because nothing destroys it on the error path.
    if keep not in ('last', 'first', False):
        raise ValueError('keep must be either "first", "last" or False')

    table = kh_init_{{ttype}}()
    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n), SIZE_HINT_LIMIT))

    {{for cond, keep in [('if', '"last"'), ('elif', '"first"')]}}
    {{cond}} keep == {{keep}}:
        # ``if True:`` stands in for ``with nogil:`` in the object variant so
        # both variants share the same indentation.
        {{if dtype == 'object'}}
        if True:
        {{else}}
        with nogil:
        {{endif}}
            {{if keep == '"last"'}}
            for i in range(n - 1, -1, -1):
            {{else}}
            for i in range(n):
            {{endif}}
                if uses_mask and mask[i]:
                    # only the first masked entry met in scan order is kept
                    if seen_na:
                        out[i] = True
                    else:
                        out[i] = False
                        seen_na = True
                else:
                    value = {{to_c_type}}(values[i])
                    kh_put_{{ttype}}(table, value, &ret)
                    # ret == 0 is taken to mean the key was already in the table
                    out[i] = ret == 0
    {{endfor}}

    else:
        # keep=False: table.vals remembers the index of the first occurrence
        # so it can be flagged retroactively once a repeat shows up.
        {{if dtype == 'object'}}
        if True:
        {{else}}
        with nogil:
        {{endif}}
            for i in range(n):
                if uses_mask and mask[i]:
                    if not seen_na:
                        first_na = i
                        seen_na = True
                        out[i] = 0
                    elif not seen_multiple_na:
                        # second masked entry: flag it and the first one
                        out[i] = 1
                        out[first_na] = 1
                        seen_multiple_na = True
                    else:
                        out[i] = 1

                else:
                    value = {{to_c_type}}(values[i])
                    k = kh_get_{{ttype}}(table, value)
                    if k != table.n_buckets:
                        out[table.vals[k]] = 1
                        out[i] = 1
                    else:
                        k = kh_put_{{ttype}}(table, value, &ret)
                        table.vals[k] = i
                        out[i] = 0

    kh_destroy_{{ttype}}(table)
    return out
- # ----------------------------------------------------------------------
- # Membership
- # ----------------------------------------------------------------------
@cython.wraparound(False)
@cython.boundscheck(False)
{{if dtype == 'object'}}
cdef ismember_{{dtype}}(ndarray[{{c_type}}] arr, ndarray[{{c_type}}] values):
{{else}}
cdef ismember_{{dtype}}(const {{dtype}}_t[:] arr, const {{dtype}}_t[:] values):
{{endif}}
    """
    Element-wise test of whether each entry of `arr` occurs in `values`.

    Parameters
    ----------
    arr : {{dtype}} ndarray
        Entries to look up.
    values : {{dtype}} ndarray
        Entries that make up the set being tested against.

    Returns
    -------
    boolean ndarray with the same length as `arr`
    """
    cdef:
        Py_ssize_t i
        Py_ssize_t n_values = len(values)
        Py_ssize_t n_arr = len(arr)
        khiter_t slot
        int ret = 0
        ndarray[uint8_t] result

        {{if dtype == "object"}}
        PyObject* item
        {{else}}
        {{c_type}} item
        {{endif}}

        kh_{{ttype}}_t *table = kh_init_{{ttype}}()

    # Step 1: load every entry of `values` into the hash table.
    kh_resize_{{ttype}}(table, n_values)

    # ``if True:`` stands in for ``with nogil:`` in the object variant so
    # both variants share the same indentation.
    {{if dtype == 'object'}}
    if True:
    {{else}}
    with nogil:
    {{endif}}
        for i in range(n_values):
            item = {{to_c_type}}(values[i])
            kh_put_{{ttype}}(table, item, &ret)

    # Step 2: probe the table once per entry of `arr`.
    result = np.empty(n_arr, dtype=np.uint8)

    {{if dtype == 'object'}}
    if True:
    {{else}}
    with nogil:
    {{endif}}
        for i in range(n_arr):
            item = {{to_c_type}}(arr[i])
            slot = kh_get_{{ttype}}(table, item)
            # kh_get yields table.n_buckets when the key is absent
            result[i] = (slot != table.n_buckets)

    kh_destroy_{{ttype}}(table)
    return result.view(np.bool_)
- # ----------------------------------------------------------------------
- # Mode Computations
- # ----------------------------------------------------------------------
- {{endfor}}
# Element types accepted by the value_count, duplicated, ismember and mode
# entry points in this file: everything in numeric_object_t plus the two
# complex types.
ctypedef fused htfunc_t:
    numeric_object_t
    complex128_t
    complex64_t
cpdef value_count(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
    """
    Forward to the ``value_count_<dtype>`` helper matching the element type
    of `values` and return its result unchanged.

    Raises
    ------
    TypeError
        If no helper matches the element type.
    """
    # floating point
    if htfunc_t is float64_t:
        return value_count_float64(values, dropna, mask=mask)
    elif htfunc_t is float32_t:
        return value_count_float32(values, dropna, mask=mask)

    # complex
    elif htfunc_t is complex128_t:
        return value_count_complex128(values, dropna, mask=mask)
    elif htfunc_t is complex64_t:
        return value_count_complex64(values, dropna, mask=mask)

    # signed integers
    elif htfunc_t is int64_t:
        return value_count_int64(values, dropna, mask=mask)
    elif htfunc_t is int32_t:
        return value_count_int32(values, dropna, mask=mask)
    elif htfunc_t is int16_t:
        return value_count_int16(values, dropna, mask=mask)
    elif htfunc_t is int8_t:
        return value_count_int8(values, dropna, mask=mask)

    # unsigned integers
    elif htfunc_t is uint64_t:
        return value_count_uint64(values, dropna, mask=mask)
    elif htfunc_t is uint32_t:
        return value_count_uint32(values, dropna, mask=mask)
    elif htfunc_t is uint16_t:
        return value_count_uint16(values, dropna, mask=mask)
    elif htfunc_t is uint8_t:
        return value_count_uint8(values, dropna, mask=mask)

    # Python objects
    elif htfunc_t is object:
        return value_count_object(values, dropna, mask=mask)

    else:
        raise TypeError(values.dtype)
cpdef duplicated(ndarray[htfunc_t] values, object keep="first", const uint8_t[:] mask=None):
    """
    Forward to the ``duplicated_<dtype>`` helper matching the element type
    of `values` and return its result unchanged.

    Raises
    ------
    TypeError
        If no helper matches the element type.
    """
    # floating point
    if htfunc_t is float64_t:
        return duplicated_float64(values, keep, mask=mask)
    elif htfunc_t is float32_t:
        return duplicated_float32(values, keep, mask=mask)

    # complex
    elif htfunc_t is complex128_t:
        return duplicated_complex128(values, keep, mask=mask)
    elif htfunc_t is complex64_t:
        return duplicated_complex64(values, keep, mask=mask)

    # signed integers
    elif htfunc_t is int64_t:
        return duplicated_int64(values, keep, mask=mask)
    elif htfunc_t is int32_t:
        return duplicated_int32(values, keep, mask=mask)
    elif htfunc_t is int16_t:
        return duplicated_int16(values, keep, mask=mask)
    elif htfunc_t is int8_t:
        return duplicated_int8(values, keep, mask=mask)

    # unsigned integers
    elif htfunc_t is uint64_t:
        return duplicated_uint64(values, keep, mask=mask)
    elif htfunc_t is uint32_t:
        return duplicated_uint32(values, keep, mask=mask)
    elif htfunc_t is uint16_t:
        return duplicated_uint16(values, keep, mask=mask)
    elif htfunc_t is uint8_t:
        return duplicated_uint8(values, keep, mask=mask)

    # Python objects
    elif htfunc_t is object:
        return duplicated_object(values, keep, mask=mask)

    else:
        raise TypeError(values.dtype)
cpdef ismember(ndarray[htfunc_t] arr, ndarray[htfunc_t] values):
    """
    Forward to the ``ismember_<dtype>`` helper matching the shared element
    type of `arr` and `values` and return its result unchanged.

    Raises
    ------
    TypeError
        If no helper matches the element type.
    """
    # floating point
    if htfunc_t is float64_t:
        return ismember_float64(arr, values)
    elif htfunc_t is float32_t:
        return ismember_float32(arr, values)

    # complex
    elif htfunc_t is complex128_t:
        return ismember_complex128(arr, values)
    elif htfunc_t is complex64_t:
        return ismember_complex64(arr, values)

    # signed integers
    elif htfunc_t is int64_t:
        return ismember_int64(arr, values)
    elif htfunc_t is int32_t:
        return ismember_int32(arr, values)
    elif htfunc_t is int16_t:
        return ismember_int16(arr, values)
    elif htfunc_t is int8_t:
        return ismember_int8(arr, values)

    # unsigned integers
    elif htfunc_t is uint64_t:
        return ismember_uint64(arr, values)
    elif htfunc_t is uint32_t:
        return ismember_uint32(arr, values)
    elif htfunc_t is uint16_t:
        return ismember_uint16(arr, values)
    elif htfunc_t is uint8_t:
        return ismember_uint8(arr, values)

    # Python objects
    elif htfunc_t is object:
        return ismember_object(arr, values)

    else:
        raise TypeError(values.dtype)
@cython.wraparound(False)
@cython.boundscheck(False)
def mode(ndarray[htfunc_t] values, bint dropna, const uint8_t[:] mask=None):
    """
    Return the most frequent value(s) of `values`.

    The counting is delegated to ``value_count``; `dropna` and `mask` are
    passed through to it unchanged. Every key whose count equals the largest
    count is returned, in the order ``value_count`` reported the keys. An
    input that yields no keys gives an empty result.

    Returns
    -------
    ndarray with the dtype of `values`
    """
    # TODO(cython3): use const htfunc_t[:]

    cdef:
        ndarray[htfunc_t] keys
        ndarray[htfunc_t] modes

        int64_t[::1] counts
        int64_t count, max_count = -1
        Py_ssize_t n_keys, idx, last = 0

    keys, counts = value_count(values, dropna, mask=mask)
    n_keys = len(keys)

    # Worst case every key is a mode, so size the buffer for all of them.
    modes = np.empty(n_keys, dtype=values.dtype)

    # Both branches run the same scan; only the non-object one can release
    # the GIL.
    if htfunc_t is not object:
        with nogil:
            for idx in range(n_keys):
                count = counts[idx]
                if count < max_count:
                    continue
                if count > max_count:
                    # new best count: restart the list of modes
                    max_count = count
                    last = 0
                else:
                    # tie with the current best: append
                    last += 1
                modes[last] = keys[idx]
    else:
        for idx in range(n_keys):
            count = counts[idx]
            if count < max_count:
                continue
            if count > max_count:
                # new best count: restart the list of modes
                max_count = count
                last = 0
            else:
                # tie with the current best: append
                last += 1
            modes[last] = keys[idx]

    return modes[:last + 1]
{{py:

# Integer label types for which _unique_label_indices_<dtype> is generated.
# Each entry: (name, dtype, ttype, c_type)
dtypes = [
    ('Int64', 'int64', 'int64', 'int64_t'),
    ('Int32', 'int32', 'int32', 'int32_t'),
]

}}
- {{for name, dtype, ttype, c_type in dtypes}}
@cython.wraparound(False)
@cython.boundscheck(False)
def _unique_label_indices_{{dtype}}(const {{c_type}}[:] labels) -> ndarray:
    """
    Positions at which each distinct label first appears, with the label -1
    left out. The positions are ordered by label value, i.e. equivalent to:
        np.unique(labels, return_index=True)[1]
    """
    cdef:
        int ret = 0
        Py_ssize_t pos, n_labels = len(labels)
        kh_{{ttype}}_t *table = kh_init_{{ttype}}()
        {{name}}Vector first_seen = {{name}}Vector()
        ndarray[{{c_type}}, ndim=1] arr
        {{name}}VectorData *vec_data = first_seen.data

    kh_resize_{{ttype}}(table, min(kh_needed_n_buckets(n_labels), SIZE_HINT_LIMIT))

    with nogil:
        for pos in range(n_labels):
            kh_put_{{ttype}}(table, labels[pos], &ret)
            if ret == 0:
                # nothing is recorded for this position (only ret != 0 is)
                continue
            if needs_resize(vec_data):
                # growing the vector goes through a Python-level method,
                # so the GIL is re-acquired for it
                with gil:
                    first_seen.resize()
            append_data_{{ttype}}(vec_data, pos)

    kh_destroy_{{ttype}}(table)

    # Reorder the recorded positions by the label found at each of them.
    arr = first_seen.to_array()
    sorter = np.asarray(labels)[arr].argsort()
    arr = arr[sorter]

    # After sorting, a -1 label (if present) sits in front; drop it.
    if arr.size != 0 and labels[arr[0]] == -1:
        return arr[1:]
    return arr
- {{endfor}}
|