hashing.py

  1. """
  2. data hash pandas / numpy objects
  3. """
from __future__ import annotations

import itertools
from typing import (
    TYPE_CHECKING,
    Hashable,
    Iterable,
    Iterator,
    cast,
)

import numpy as np

from pandas._libs import lib
from pandas._libs.hashing import hash_object_array
from pandas._typing import (
    ArrayLike,
    npt,
)

from pandas.core.dtypes.common import (
    is_categorical_dtype,
    is_list_like,
)
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCExtensionArray,
    ABCIndex,
    ABCMultiIndex,
    ABCSeries,
)

if TYPE_CHECKING:
    from pandas import (
        Categorical,
        DataFrame,
        Index,
        MultiIndex,
        Series,
    )

# 16 byte long hashing key
_default_hash_key = "0123456789123456"


def combine_hash_arrays(
    arrays: Iterator[np.ndarray], num_items: int
) -> npt.NDArray[np.uint64]:
    """
    Parameters
    ----------
    arrays : Iterator[np.ndarray]
    num_items : int

    Returns
    -------
    np.ndarray[uint64]

    Should be the same as CPython's tupleobject.c
    """
    try:
        first = next(arrays)
    except StopIteration:
        return np.array([], dtype=np.uint64)

    arrays = itertools.chain([first], arrays)

    mult = np.uint64(1000003)
    out = np.zeros_like(first) + np.uint64(0x345678)
    last_i = 0
    for i, a in enumerate(arrays):
        inverse_i = num_items - i
        out ^= a
        out *= mult
        mult += np.uint64(82520 + inverse_i + inverse_i)
        last_i = i
    assert last_i + 1 == num_items, "Fed in wrong num_items"
    out += np.uint64(97531)
    return out
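

# Illustrative sketch (not part of pandas): combine_hash_arrays folds an
# iterator of equal-length uint64 arrays into a single array, mirroring
# CPython's tuple hash. The _demo_ function below is a hypothetical addition
# for exposition only; it is defined but never called at import time.
def _demo_combine_hash_arrays() -> None:
    a = np.array([1, 2, 3], dtype=np.uint64)
    b = np.array([4, 5, 6], dtype=np.uint64)
    # num_items must match the number of arrays the iterator yields,
    # otherwise the assertion inside combine_hash_arrays fires.
    combined = combine_hash_arrays(iter([a, b]), num_items=2)
    assert combined.dtype == np.uint64 and combined.shape == a.shape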


def hash_pandas_object(
    obj: Index | DataFrame | Series,
    index: bool = True,
    encoding: str = "utf8",
    hash_key: str | None = _default_hash_key,
    categorize: bool = True,
) -> Series:
    """
    Return a data hash of the Index/Series/DataFrame.

    Parameters
    ----------
    obj : Index, Series, or DataFrame
    index : bool, default True
        Include the index in the hash (if Series/DataFrame).
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash key for the string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    Series of uint64, same length as the object
    """
    from pandas import Series

    if hash_key is None:
        hash_key = _default_hash_key

    if isinstance(obj, ABCMultiIndex):
        return Series(hash_tuples(obj, encoding, hash_key), dtype="uint64", copy=False)

    elif isinstance(obj, ABCIndex):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        ser = Series(h, index=obj, dtype="uint64", copy=False)

    elif isinstance(obj, ABCSeries):
        h = hash_array(obj._values, encoding, hash_key, categorize).astype(
            "uint64", copy=False
        )
        if index:
            index_iter = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            arrays = itertools.chain([h], index_iter)
            h = combine_hash_arrays(arrays, 2)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)

    elif isinstance(obj, ABCDataFrame):
        hashes = (
            hash_array(series._values, encoding, hash_key, categorize)
            for _, series in obj.items()
        )
        num_items = len(obj.columns)
        if index:
            index_hash_generator = (
                hash_pandas_object(
                    obj.index,
                    index=False,
                    encoding=encoding,
                    hash_key=hash_key,
                    categorize=categorize,
                )._values
                for _ in [None]
            )
            num_items += 1

            # keep `hashes` specifically a generator to keep mypy happy
            _hashes = itertools.chain(hashes, index_hash_generator)
            hashes = (x for x in _hashes)
        h = combine_hash_arrays(hashes, num_items)

        ser = Series(h, index=obj.index, dtype="uint64", copy=False)

    else:
        raise TypeError(f"Unexpected type for hashing {type(obj)}")

    return ser
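

# Illustrative sketch (not part of pandas): hashing a Series with and without
# its index. Hashes are deterministic for a fixed hash_key, and including the
# index changes the per-row hashes. The _demo_ name is a hypothetical addition
# for exposition only.
def _demo_hash_pandas_object() -> None:
    import pandas as pd

    ser = pd.Series(["a", "b", "c"])
    with_index = hash_pandas_object(ser, index=True)
    without_index = hash_pandas_object(ser, index=False)
    assert with_index.dtype == np.uint64
    # Same input and key -> same hashes.
    assert with_index.equals(hash_pandas_object(ser, index=True))
    # The result keeps the original row labels; only the values differ.
    assert (with_index.index == without_index.index).all()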


def hash_tuples(
    vals: MultiIndex | Iterable[tuple[Hashable, ...]],
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
) -> npt.NDArray[np.uint64]:
    """
    Hash a MultiIndex / listlike-of-tuples efficiently.

    Parameters
    ----------
    vals : MultiIndex or listlike-of-tuples
    encoding : str, default 'utf8'
    hash_key : str, default _default_hash_key

    Returns
    -------
    ndarray[np.uint64] of hashed values
    """
    if not is_list_like(vals):
        raise TypeError("must be convertible to a list-of-tuples")

    from pandas import (
        Categorical,
        MultiIndex,
    )

    if not isinstance(vals, ABCMultiIndex):
        mi = MultiIndex.from_tuples(vals)
    else:
        mi = vals

    # create a list-of-Categoricals
    cat_vals = [
        Categorical(mi.codes[level], mi.levels[level], ordered=False, fastpath=True)
        for level in range(mi.nlevels)
    ]

    # hash the list-of-ndarrays
    hashes = (
        _hash_categorical(cat, encoding=encoding, hash_key=hash_key) for cat in cat_vals
    )
    h = combine_hash_arrays(hashes, len(cat_vals))

    return h
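

# Illustrative sketch (not part of pandas): hash_tuples converts a list of
# tuples to a MultiIndex, hashes each level as a Categorical, and combines the
# per-level hashes, so it agrees with hashing the equivalent MultiIndex
# directly. The _demo_ name is a hypothetical addition for exposition only.
def _demo_hash_tuples() -> None:
    import pandas as pd

    tuples = [("a", 1), ("a", 2), ("b", 1)]
    from_tuples = hash_tuples(tuples)
    from_mi = hash_tuples(pd.MultiIndex.from_tuples(tuples))
    assert (from_tuples == from_mi).all()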


def _hash_categorical(
    cat: Categorical, encoding: str, hash_key: str
) -> npt.NDArray[np.uint64]:
    """
    Hash a Categorical by hashing its categories, and then mapping the codes
    to the hashes.

    Parameters
    ----------
    cat : Categorical
    encoding : str
    hash_key : str

    Returns
    -------
    ndarray[np.uint64] of hashed values, same size as len(cat)
    """
    # Convert ExtensionArrays to ndarrays
    values = np.asarray(cat.categories._values)
    hashed = hash_array(values, encoding, hash_key, categorize=False)

    # we have uint64, as we don't directly support missing values
    # we don't want to use take_nd which will coerce to float
    # instead, directly construct the result with a
    # max(np.uint64) as the missing value indicator
    #
    # TODO: GH 15362

    mask = cat.isna()
    if len(hashed):
        result = hashed.take(cat.codes)
    else:
        result = np.zeros(len(mask), dtype="uint64")

    if mask.any():
        result[mask] = lib.u8max

    return result
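

# Illustrative sketch (not part of pandas): only the (usually small) set of
# categories is hashed; the codes then gather the per-category hashes, and
# missing values map to the maximum uint64 sentinel. The _demo_ name is a
# hypothetical addition for exposition only.
def _demo_hash_categorical() -> None:
    import pandas as pd

    cat = pd.Categorical(["x", "y", "x", None])
    hashed = _hash_categorical(cat, encoding="utf8", hash_key=_default_hash_key)
    # Equal categories share a hash; NaN gets the max-uint64 sentinel.
    assert hashed[0] == hashed[2]
    assert hashed[3] == np.iinfo(np.uint64).max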


def hash_array(
    vals: ArrayLike,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    Given a 1d array, return an array of deterministic integers.

    Parameters
    ----------
    vals : ndarray or ExtensionArray
    encoding : str, default 'utf8'
        Encoding for data & key when strings.
    hash_key : str, default _default_hash_key
        Hash key for the string key to encode.
    categorize : bool, default True
        Whether to first categorize object arrays before hashing. This is more
        efficient when the array contains duplicate values.

    Returns
    -------
    ndarray[np.uint64, ndim=1]
        Hashed values, same length as vals.
    """
    if not hasattr(vals, "dtype"):
        raise TypeError("must pass an ndarray-like")
    dtype = vals.dtype

    # For categoricals, we hash the categories, then remap the codes to the
    # hash values. (This check is above the complex check so that we don't ask
    # numpy if categorical is a subdtype of complex, as it will choke).
    if is_categorical_dtype(dtype):
        vals = cast("Categorical", vals)
        return _hash_categorical(vals, encoding, hash_key)

    elif isinstance(vals, ABCExtensionArray):
        vals, _ = vals._values_for_factorize()

    elif not isinstance(vals, np.ndarray):
        # GH#42003
        raise TypeError(
            "hash_array requires np.ndarray or ExtensionArray, not "
            f"{type(vals).__name__}. Use hash_pandas_object instead."
        )

    return _hash_ndarray(vals, encoding, hash_key, categorize)
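

# Illustrative sketch (not part of pandas): hash_array accepts a bare ndarray
# or ExtensionArray; plain Python lists are rejected. The _demo_ name is a
# hypothetical addition for exposition only.
def _demo_hash_array() -> None:
    values = np.array(["a", "b", "a"], dtype=object)
    # categorize=True factorizes first, so duplicate values hash identically
    # while each distinct value is only hashed once.
    hashed = hash_array(values, categorize=True)
    assert hashed.dtype == np.uint64 and hashed[0] == hashed[2]
    try:
        hash_array(["a", "b"])  # type: ignore[arg-type]
    except TypeError:
        pass  # lists lack a dtype attribute and raise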


def _hash_ndarray(
    vals: np.ndarray,
    encoding: str = "utf8",
    hash_key: str = _default_hash_key,
    categorize: bool = True,
) -> npt.NDArray[np.uint64]:
    """
    See hash_array.__doc__.
    """
    dtype = vals.dtype

    # we'll be working with everything as 64-bit values, so handle this
    # 128-bit value early
    if np.issubdtype(dtype, np.complex128):
        return hash_array(np.real(vals)) + 23 * hash_array(np.imag(vals))

    # First, turn whatever array this is into unsigned 64-bit ints, if we can
    # manage it.
    elif dtype == bool:
        vals = vals.astype("u8")
    elif issubclass(dtype.type, (np.datetime64, np.timedelta64)):
        vals = vals.view("i8").astype("u8", copy=False)
    elif issubclass(dtype.type, np.number) and dtype.itemsize <= 8:
        vals = vals.view(f"u{vals.dtype.itemsize}").astype("u8")
    else:
        # With repeated values, it's MUCH faster to categorize object dtypes,
        # then hash and rename categories. We allow skipping the categorization
        # when the values are known/likely to be unique.
        if categorize:
            from pandas import (
                Categorical,
                Index,
                factorize,
            )

            codes, categories = factorize(vals, sort=False)
            cat = Categorical(codes, Index(categories), ordered=False, fastpath=True)
            return _hash_categorical(cat, encoding, hash_key)

        try:
            vals = hash_object_array(vals, hash_key, encoding)
        except TypeError:
            # we have mixed types
            vals = hash_object_array(
                vals.astype(str).astype(object), hash_key, encoding
            )

    # Then, redistribute these 64-bit ints within the space of 64-bit ints
    vals ^= vals >> 30
    vals *= np.uint64(0xBF58476D1CE4E5B9)
    vals ^= vals >> 27
    vals *= np.uint64(0x94D049BB133111EB)
    vals ^= vals >> 31
    return vals
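

# Illustrative sketch (not part of pandas): the closing xor-shift/multiply
# steps in _hash_ndarray resemble the splitmix64 finalizer, redistributing
# bits so that numerically adjacent inputs get very different hashes. The
# _demo_ name is a hypothetical addition for exposition only.
def _demo_hash_ndarray_mixing() -> None:
    vals = np.arange(4, dtype=np.int64)
    hashed = _hash_ndarray(vals)
    # Deterministic: re-hashing the same input reproduces the output.
    assert (hashed == _hash_ndarray(vals)).all()
    # Well-mixed: consecutive integers do not hash to consecutive values.
    assert not (np.diff(hashed) == 1).all()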