utils.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555
  1. """
  2. Low-dependency indexing utilities.
  3. """
  4. from __future__ import annotations
  5. from typing import (
  6. TYPE_CHECKING,
  7. Any,
  8. )
  9. import numpy as np
  10. from pandas._typing import AnyArrayLike
  11. from pandas.core.dtypes.common import (
  12. is_array_like,
  13. is_bool_dtype,
  14. is_extension_array_dtype,
  15. is_integer,
  16. is_integer_dtype,
  17. is_list_like,
  18. )
  19. from pandas.core.dtypes.generic import (
  20. ABCIndex,
  21. ABCSeries,
  22. )
  23. if TYPE_CHECKING:
  24. from pandas.core.frame import DataFrame
  25. from pandas.core.indexes.base import Index
  26. # -----------------------------------------------------------
  27. # Indexer Identification
  28. def is_valid_positional_slice(slc: slice) -> bool:
  29. """
  30. Check if a slice object can be interpreted as a positional indexer.
  31. Parameters
  32. ----------
  33. slc : slice
  34. Returns
  35. -------
  36. bool
  37. Notes
  38. -----
  39. A valid positional slice may also be interpreted as a label-based slice
  40. depending on the index being sliced.
  41. """
  42. def is_int_or_none(val):
  43. return val is None or is_integer(val)
  44. return (
  45. is_int_or_none(slc.start)
  46. and is_int_or_none(slc.stop)
  47. and is_int_or_none(slc.step)
  48. )
  49. def is_list_like_indexer(key) -> bool:
  50. """
  51. Check if we have a list-like indexer that is *not* a NamedTuple.
  52. Parameters
  53. ----------
  54. key : object
  55. Returns
  56. -------
  57. bool
  58. """
  59. # allow a list_like, but exclude NamedTuples which can be indexers
  60. return is_list_like(key) and not (isinstance(key, tuple) and type(key) is not tuple)
  61. def is_scalar_indexer(indexer, ndim: int) -> bool:
  62. """
  63. Return True if we are all scalar indexers.
  64. Parameters
  65. ----------
  66. indexer : object
  67. ndim : int
  68. Number of dimensions in the object being indexed.
  69. Returns
  70. -------
  71. bool
  72. """
  73. if ndim == 1 and is_integer(indexer):
  74. # GH37748: allow indexer to be an integer for Series
  75. return True
  76. if isinstance(indexer, tuple) and len(indexer) == ndim:
  77. return all(is_integer(x) for x in indexer)
  78. return False
  79. def is_empty_indexer(indexer) -> bool:
  80. """
  81. Check if we have an empty indexer.
  82. Parameters
  83. ----------
  84. indexer : object
  85. Returns
  86. -------
  87. bool
  88. """
  89. if is_list_like(indexer) and not len(indexer):
  90. return True
  91. if not isinstance(indexer, tuple):
  92. indexer = (indexer,)
  93. return any(isinstance(idx, np.ndarray) and len(idx) == 0 for idx in indexer)
  94. # -----------------------------------------------------------
  95. # Indexer Validation
  96. def check_setitem_lengths(indexer, value, values) -> bool:
  97. """
  98. Validate that value and indexer are the same length.
  99. An special-case is allowed for when the indexer is a boolean array
  100. and the number of true values equals the length of ``value``. In
  101. this case, no exception is raised.
  102. Parameters
  103. ----------
  104. indexer : sequence
  105. Key for the setitem.
  106. value : array-like
  107. Value for the setitem.
  108. values : array-like
  109. Values being set into.
  110. Returns
  111. -------
  112. bool
  113. Whether this is an empty listlike setting which is a no-op.
  114. Raises
  115. ------
  116. ValueError
  117. When the indexer is an ndarray or list and the lengths don't match.
  118. """
  119. no_op = False
  120. if isinstance(indexer, (np.ndarray, list)):
  121. # We can ignore other listlikes because they are either
  122. # a) not necessarily 1-D indexers, e.g. tuple
  123. # b) boolean indexers e.g. BoolArray
  124. if is_list_like(value):
  125. if len(indexer) != len(value) and values.ndim == 1:
  126. # boolean with truth values == len of the value is ok too
  127. if isinstance(indexer, list):
  128. indexer = np.array(indexer)
  129. if not (
  130. isinstance(indexer, np.ndarray)
  131. and indexer.dtype == np.bool_
  132. and indexer.sum() == len(value)
  133. ):
  134. raise ValueError(
  135. "cannot set using a list-like indexer "
  136. "with a different length than the value"
  137. )
  138. if not len(indexer):
  139. no_op = True
  140. elif isinstance(indexer, slice):
  141. if is_list_like(value):
  142. if len(value) != length_of_indexer(indexer, values) and values.ndim == 1:
  143. # In case of two dimensional value is used row-wise and broadcasted
  144. raise ValueError(
  145. "cannot set using a slice indexer with a "
  146. "different length than the value"
  147. )
  148. if not len(value):
  149. no_op = True
  150. return no_op
  151. def validate_indices(indices: np.ndarray, n: int) -> None:
  152. """
  153. Perform bounds-checking for an indexer.
  154. -1 is allowed for indicating missing values.
  155. Parameters
  156. ----------
  157. indices : ndarray
  158. n : int
  159. Length of the array being indexed.
  160. Raises
  161. ------
  162. ValueError
  163. Examples
  164. --------
  165. >>> validate_indices(np.array([1, 2]), 3) # OK
  166. >>> validate_indices(np.array([1, -2]), 3)
  167. Traceback (most recent call last):
  168. ...
  169. ValueError: negative dimensions are not allowed
  170. >>> validate_indices(np.array([1, 2, 3]), 3)
  171. Traceback (most recent call last):
  172. ...
  173. IndexError: indices are out-of-bounds
  174. >>> validate_indices(np.array([-1, -1]), 0) # OK
  175. >>> validate_indices(np.array([0, 1]), 0)
  176. Traceback (most recent call last):
  177. ...
  178. IndexError: indices are out-of-bounds
  179. """
  180. if len(indices):
  181. min_idx = indices.min()
  182. if min_idx < -1:
  183. msg = f"'indices' contains values less than allowed ({min_idx} < -1)"
  184. raise ValueError(msg)
  185. max_idx = indices.max()
  186. if max_idx >= n:
  187. raise IndexError("indices are out-of-bounds")
  188. # -----------------------------------------------------------
  189. # Indexer Conversion
  190. def maybe_convert_indices(indices, n: int, verify: bool = True) -> np.ndarray:
  191. """
  192. Attempt to convert indices into valid, positive indices.
  193. If we have negative indices, translate to positive here.
  194. If we have indices that are out-of-bounds, raise an IndexError.
  195. Parameters
  196. ----------
  197. indices : array-like
  198. Array of indices that we are to convert.
  199. n : int
  200. Number of elements in the array that we are indexing.
  201. verify : bool, default True
  202. Check that all entries are between 0 and n - 1, inclusive.
  203. Returns
  204. -------
  205. array-like
  206. An array-like of positive indices that correspond to the ones
  207. that were passed in initially to this function.
  208. Raises
  209. ------
  210. IndexError
  211. One of the converted indices either exceeded the number of,
  212. elements (specified by `n`), or was still negative.
  213. """
  214. if isinstance(indices, list):
  215. indices = np.array(indices)
  216. if len(indices) == 0:
  217. # If `indices` is empty, np.array will return a float,
  218. # and will cause indexing errors.
  219. return np.empty(0, dtype=np.intp)
  220. mask = indices < 0
  221. if mask.any():
  222. indices = indices.copy()
  223. indices[mask] += n
  224. if verify:
  225. mask = (indices >= n) | (indices < 0)
  226. if mask.any():
  227. raise IndexError("indices are out-of-bounds")
  228. return indices
  229. # -----------------------------------------------------------
  230. # Unsorted
  231. def length_of_indexer(indexer, target=None) -> int:
  232. """
  233. Return the expected length of target[indexer]
  234. Returns
  235. -------
  236. int
  237. """
  238. if target is not None and isinstance(indexer, slice):
  239. target_len = len(target)
  240. start = indexer.start
  241. stop = indexer.stop
  242. step = indexer.step
  243. if start is None:
  244. start = 0
  245. elif start < 0:
  246. start += target_len
  247. if stop is None or stop > target_len:
  248. stop = target_len
  249. elif stop < 0:
  250. stop += target_len
  251. if step is None:
  252. step = 1
  253. elif step < 0:
  254. start, stop = stop + 1, start + 1
  255. step = -step
  256. return (stop - start + step - 1) // step
  257. elif isinstance(indexer, (ABCSeries, ABCIndex, np.ndarray, list)):
  258. if isinstance(indexer, list):
  259. indexer = np.array(indexer)
  260. if indexer.dtype == bool:
  261. # GH#25774
  262. return indexer.sum()
  263. return len(indexer)
  264. elif isinstance(indexer, range):
  265. return (indexer.stop - indexer.start) // indexer.step
  266. elif not is_list_like_indexer(indexer):
  267. return 1
  268. raise AssertionError("cannot find the length of the indexer")
  269. def disallow_ndim_indexing(result) -> None:
  270. """
  271. Helper function to disallow multi-dimensional indexing on 1D Series/Index.
  272. GH#27125 indexer like idx[:, None] expands dim, but we cannot do that
  273. and keep an index, so we used to return ndarray, which was deprecated
  274. in GH#30588.
  275. """
  276. if np.ndim(result) > 1:
  277. raise ValueError(
  278. "Multi-dimensional indexing (e.g. `obj[:, None]`) is no longer "
  279. "supported. Convert to a numpy array before indexing instead."
  280. )
  281. def unpack_1tuple(tup):
  282. """
  283. If we have a length-1 tuple/list that contains a slice, unpack to just
  284. the slice.
  285. Notes
  286. -----
  287. The list case is deprecated.
  288. """
  289. if len(tup) == 1 and isinstance(tup[0], slice):
  290. # if we don't have a MultiIndex, we may still be able to handle
  291. # a 1-tuple. see test_1tuple_without_multiindex
  292. if isinstance(tup, list):
  293. # GH#31299
  294. raise ValueError(
  295. "Indexing with a single-item list containing a "
  296. "slice is not allowed. Pass a tuple instead.",
  297. )
  298. return tup[0]
  299. return tup
  300. def check_key_length(columns: Index, key, value: DataFrame) -> None:
  301. """
  302. Checks if a key used as indexer has the same length as the columns it is
  303. associated with.
  304. Parameters
  305. ----------
  306. columns : Index The columns of the DataFrame to index.
  307. key : A list-like of keys to index with.
  308. value : DataFrame The value to set for the keys.
  309. Raises
  310. ------
  311. ValueError: If the length of key is not equal to the number of columns in value
  312. or if the number of columns referenced by key is not equal to number
  313. of columns.
  314. """
  315. if columns.is_unique:
  316. if len(value.columns) != len(key):
  317. raise ValueError("Columns must be same length as key")
  318. else:
  319. # Missing keys in columns are represented as -1
  320. if len(columns.get_indexer_non_unique(key)[0]) != len(value.columns):
  321. raise ValueError("Columns must be same length as key")
  322. def unpack_tuple_and_ellipses(item: tuple):
  323. """
  324. Possibly unpack arr[..., n] to arr[n]
  325. """
  326. if len(item) > 1:
  327. # Note: we are assuming this indexing is being done on a 1D arraylike
  328. if item[0] is Ellipsis:
  329. item = item[1:]
  330. elif item[-1] is Ellipsis:
  331. item = item[:-1]
  332. if len(item) > 1:
  333. raise IndexError("too many indices for array.")
  334. item = item[0]
  335. return item
  336. # -----------------------------------------------------------
  337. # Public indexer validation
  338. def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
  339. """
  340. Check if `indexer` is a valid array indexer for `array`.
  341. For a boolean mask, `array` and `indexer` are checked to have the same
  342. length. The dtype is validated, and if it is an integer or boolean
  343. ExtensionArray, it is checked if there are missing values present, and
  344. it is converted to the appropriate numpy array. Other dtypes will raise
  345. an error.
  346. Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed
  347. through as is.
  348. Parameters
  349. ----------
  350. array : array-like
  351. The array that is being indexed (only used for the length).
  352. indexer : array-like or list-like
  353. The array-like that's used to index. List-like input that is not yet
  354. a numpy array or an ExtensionArray is converted to one. Other input
  355. types are passed through as is.
  356. Returns
  357. -------
  358. numpy.ndarray
  359. The validated indexer as a numpy array that can be used to index.
  360. Raises
  361. ------
  362. IndexError
  363. When the lengths don't match.
  364. ValueError
  365. When `indexer` cannot be converted to a numpy ndarray to index
  366. (e.g. presence of missing values).
  367. See Also
  368. --------
  369. api.types.is_bool_dtype : Check if `key` is of boolean dtype.
  370. Examples
  371. --------
  372. When checking a boolean mask, a boolean ndarray is returned when the
  373. arguments are all valid.
  374. >>> mask = pd.array([True, False])
  375. >>> arr = pd.array([1, 2])
  376. >>> pd.api.indexers.check_array_indexer(arr, mask)
  377. array([ True, False])
  378. An IndexError is raised when the lengths don't match.
  379. >>> mask = pd.array([True, False, True])
  380. >>> pd.api.indexers.check_array_indexer(arr, mask)
  381. Traceback (most recent call last):
  382. ...
  383. IndexError: Boolean index has wrong length: 3 instead of 2.
  384. NA values in a boolean array are treated as False.
  385. >>> mask = pd.array([True, pd.NA])
  386. >>> pd.api.indexers.check_array_indexer(arr, mask)
  387. array([ True, False])
  388. A numpy boolean mask will get passed through (if the length is correct):
  389. >>> mask = np.array([True, False])
  390. >>> pd.api.indexers.check_array_indexer(arr, mask)
  391. array([ True, False])
  392. Similarly for integer indexers, an integer ndarray is returned when it is
  393. a valid indexer, otherwise an error is (for integer indexers, a matching
  394. length is not required):
  395. >>> indexer = pd.array([0, 2], dtype="Int64")
  396. >>> arr = pd.array([1, 2, 3])
  397. >>> pd.api.indexers.check_array_indexer(arr, indexer)
  398. array([0, 2])
  399. >>> indexer = pd.array([0, pd.NA], dtype="Int64")
  400. >>> pd.api.indexers.check_array_indexer(arr, indexer)
  401. Traceback (most recent call last):
  402. ...
  403. ValueError: Cannot index with an integer indexer containing NA values
  404. For non-integer/boolean dtypes, an appropriate error is raised:
  405. >>> indexer = np.array([0., 2.], dtype="float64")
  406. >>> pd.api.indexers.check_array_indexer(arr, indexer)
  407. Traceback (most recent call last):
  408. ...
  409. IndexError: arrays used as indices must be of integer or boolean type
  410. """
  411. from pandas.core.construction import array as pd_array
  412. # whatever is not an array-like is returned as-is (possible valid array
  413. # indexers that are not array-like: integer, slice, Ellipsis, None)
  414. # In this context, tuples are not considered as array-like, as they have
  415. # a specific meaning in indexing (multi-dimensional indexing)
  416. if is_list_like(indexer):
  417. if isinstance(indexer, tuple):
  418. return indexer
  419. else:
  420. return indexer
  421. # convert list-likes to array
  422. if not is_array_like(indexer):
  423. indexer = pd_array(indexer)
  424. if len(indexer) == 0:
  425. # empty list is converted to float array by pd.array
  426. indexer = np.array([], dtype=np.intp)
  427. dtype = indexer.dtype
  428. if is_bool_dtype(dtype):
  429. if is_extension_array_dtype(dtype):
  430. indexer = indexer.to_numpy(dtype=bool, na_value=False)
  431. else:
  432. indexer = np.asarray(indexer, dtype=bool)
  433. # GH26658
  434. if len(indexer) != len(array):
  435. raise IndexError(
  436. f"Boolean index has wrong length: "
  437. f"{len(indexer)} instead of {len(array)}"
  438. )
  439. elif is_integer_dtype(dtype):
  440. try:
  441. indexer = np.asarray(indexer, dtype=np.intp)
  442. except ValueError as err:
  443. raise ValueError(
  444. "Cannot index with an integer indexer containing NA values"
  445. ) from err
  446. else:
  447. raise IndexError("arrays used as indices must be of integer or boolean type")
  448. return indexer