hashtable.pyx 2.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125
  1. cimport cython
  2. from cpython.mem cimport (
  3. PyMem_Free,
  4. PyMem_Malloc,
  5. )
  6. from cpython.ref cimport (
  7. Py_INCREF,
  8. PyObject,
  9. )
  10. from libc.stdlib cimport (
  11. free,
  12. malloc,
  13. )
  14. import numpy as np
  15. cimport numpy as cnp
  16. from numpy cimport ndarray
  17. cnp.import_array()
  18. from pandas._libs cimport util
  19. from pandas._libs.dtypes cimport numeric_object_t
  20. from pandas._libs.khash cimport (
  21. KHASH_TRACE_DOMAIN,
  22. are_equivalent_float32_t,
  23. are_equivalent_float64_t,
  24. are_equivalent_khcomplex64_t,
  25. are_equivalent_khcomplex128_t,
  26. kh_needed_n_buckets,
  27. kh_python_hash_equal,
  28. kh_python_hash_func,
  29. khiter_t,
  30. )
  31. from pandas._libs.missing cimport checknull
  32. def get_hashtable_trace_domain():
  33. return KHASH_TRACE_DOMAIN
  34. def object_hash(obj):
  35. return kh_python_hash_func(obj)
  36. def objects_are_equal(a, b):
  37. return kh_python_hash_equal(a, b)
  38. cdef int64_t NPY_NAT = util.get_nat()
  39. SIZE_HINT_LIMIT = (1 << 20) + 7
  40. cdef Py_ssize_t _INIT_VEC_CAP = 128
  41. include "hashtable_class_helper.pxi"
  42. include "hashtable_func_helper.pxi"
  43. # map derived hash-map types onto basic hash-map types:
  44. if np.dtype(np.intp) == np.dtype(np.int64):
  45. IntpHashTable = Int64HashTable
  46. unique_label_indices = _unique_label_indices_int64
  47. elif np.dtype(np.intp) == np.dtype(np.int32):
  48. IntpHashTable = Int32HashTable
  49. unique_label_indices = _unique_label_indices_int32
  50. else:
  51. raise ValueError(np.dtype(np.intp))
  52. cdef class Factorizer:
  53. cdef readonly:
  54. Py_ssize_t count
  55. def __cinit__(self, size_hint: int):
  56. self.count = 0
  57. def get_count(self) -> int:
  58. return self.count
  59. def factorize(self, values, na_sentinel=-1, na_value=None, mask=None) -> np.ndarray:
  60. raise NotImplementedError
  61. cdef class ObjectFactorizer(Factorizer):
  62. cdef public:
  63. PyObjectHashTable table
  64. ObjectVector uniques
  65. def __cinit__(self, size_hint: int):
  66. self.table = PyObjectHashTable(size_hint)
  67. self.uniques = ObjectVector()
  68. def factorize(
  69. self, ndarray[object] values, na_sentinel=-1, na_value=None, mask=None
  70. ) -> np.ndarray:
  71. """
  72. Returns
  73. -------
  74. np.ndarray[np.intp]
  75. Examples
  76. --------
  77. Factorize values with nans replaced by na_sentinel
  78. >>> fac = ObjectFactorizer(3)
  79. >>> fac.factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
  80. array([ 0, 1, 20])
  81. """
  82. cdef:
  83. ndarray[intp_t] labels
  84. if mask is not None:
  85. raise NotImplementedError("mask not supported for ObjectFactorizer.")
  86. if self.uniques.external_view_exists:
  87. uniques = ObjectVector()
  88. uniques.extend(self.uniques.to_array())
  89. self.uniques = uniques
  90. labels = self.table.get_labels(values, self.uniques,
  91. self.count, na_sentinel, na_value)
  92. self.count = len(self.uniques)
  93. return labels