123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555 |
- """
- Low-dependency indexing utilities.
- """
- from __future__ import annotations
- from typing import (
- TYPE_CHECKING,
- Any,
- )
- import numpy as np
- from pandas._typing import AnyArrayLike
- from pandas.core.dtypes.common import (
- is_array_like,
- is_bool_dtype,
- is_extension_array_dtype,
- is_integer,
- is_integer_dtype,
- is_list_like,
- )
- from pandas.core.dtypes.generic import (
- ABCIndex,
- ABCSeries,
- )
- if TYPE_CHECKING:
- from pandas.core.frame import DataFrame
- from pandas.core.indexes.base import Index
- # -----------------------------------------------------------
- # Indexer Identification
def is_valid_positional_slice(slc: slice) -> bool:
    """
    Report whether a slice could be used as a positional indexer.

    Parameters
    ----------
    slc : slice
        The slice to inspect.

    Returns
    -------
    bool
        True when each of ``start``, ``stop`` and ``step`` is either
        ``None`` or an integer.

    Notes
    -----
    Passing this check does not rule out a label-based interpretation;
    that depends on the index being sliced.
    """
    return all(
        bound is None or is_integer(bound)
        for bound in (slc.start, slc.stop, slc.step)
    )
def is_list_like_indexer(key) -> bool:
    """
    Report whether ``key`` is a list-like indexer other than a NamedTuple.

    Parameters
    ----------
    key : object
        Candidate indexer.

    Returns
    -------
    bool
        True for list-likes, except instances of ``tuple`` subclasses.
    """
    if not is_list_like(key):
        return False

    # Plain tuples count as list-like; tuple subclasses (NamedTuples) are
    # excluded because they can act as indexers in their own right.
    is_tuple_subclass = isinstance(key, tuple) and type(key) is not tuple
    return not is_tuple_subclass
def is_scalar_indexer(indexer, ndim: int) -> bool:
    """
    Report whether ``indexer`` selects a single element.

    Parameters
    ----------
    indexer : object
        Candidate indexer.
    ndim : int
        Number of dimensions in the object being indexed.

    Returns
    -------
    bool
        True for a tuple of exactly ``ndim`` integers, or for a bare
        integer when ``ndim`` is 1.
    """
    if isinstance(indexer, tuple) and len(indexer) == ndim:
        for component in indexer:
            if not is_integer(component):
                return False
        return True

    # GH37748: allow indexer to be an integer for Series
    return bool(ndim == 1 and is_integer(indexer))
def is_empty_indexer(indexer) -> bool:
    """
    Report whether ``indexer`` selects nothing.

    Parameters
    ----------
    indexer : object
        Candidate indexer; may be a tuple of per-axis indexers.

    Returns
    -------
    bool
        True when ``indexer`` is a zero-length list-like, or when it is (or
        contains, for a tuple) a zero-length ``numpy.ndarray``.
    """
    if is_list_like(indexer) and len(indexer) == 0:
        return True

    # Treat a non-tuple indexer as a one-element tuple so a single loop
    # covers both cases.
    components = indexer if isinstance(indexer, tuple) else (indexer,)
    for component in components:
        if isinstance(component, np.ndarray) and len(component) == 0:
            return True
    return False
- # -----------------------------------------------------------
- # Indexer Validation
def check_setitem_lengths(indexer, value, values) -> bool:
    """
    Validate that ``value`` and ``indexer`` have matching lengths.

    A special case is allowed when the indexer is a boolean array whose
    number of True entries equals ``len(value)``; no exception is raised
    then.

    Parameters
    ----------
    indexer : sequence
        Key for the setitem.
    value : array-like
        Value for the setitem.
    values : array-like
        Values being set into.

    Returns
    -------
    bool
        Whether this is an empty listlike setting which is a no-op.

    Raises
    ------
    ValueError
        When the indexer is an ndarray, list or slice, ``values`` is 1-D,
        and the lengths don't match.
    """
    if isinstance(indexer, (np.ndarray, list)):
        # Other listlikes are ignored because they are either
        #  a) not necessarily 1-D indexers, e.g. tuple
        #  b) boolean indexers e.g. BoolArray
        if not is_list_like(value):
            return False

        if len(indexer) != len(value) and values.ndim == 1:
            if isinstance(indexer, list):
                indexer = np.array(indexer)
            # A boolean mask whose True count equals len(value) is fine.
            mask_matches = (
                indexer.dtype == np.bool_ and indexer.sum() == len(value)
            )
            if not mask_matches:
                raise ValueError(
                    "cannot set using a list-like indexer "
                    "with a different length than the value"
                )
        return not len(indexer)

    if isinstance(indexer, slice):
        if not is_list_like(value):
            return False

        if len(value) != length_of_indexer(indexer, values) and values.ndim == 1:
            # For 2-D values the value is applied row-wise and broadcast,
            # so only the 1-D case is checked.
            raise ValueError(
                "cannot set using a slice indexer with a "
                "different length than the value"
            )
        return not len(value)

    return False
def validate_indices(indices: np.ndarray, n: int) -> None:
    """
    Bounds-check an indexer against an array of length ``n``.

    -1 is accepted because it is used to mark missing values.

    Parameters
    ----------
    indices : ndarray
        Positions to validate.
    n : int
        Length of the array being indexed.

    Raises
    ------
    ValueError
        If any entry is smaller than -1.
    IndexError
        If any entry is greater than or equal to ``n``.

    Examples
    --------
    >>> validate_indices(np.array([1, 2]), 3)  # OK

    >>> validate_indices(np.array([1, -2]), 3)
    Traceback (most recent call last):
        ...
    ValueError: 'indices' contains values less than allowed (-2 < -1)

    >>> validate_indices(np.array([1, 2, 3]), 3)
    Traceback (most recent call last):
        ...
    IndexError: indices are out-of-bounds

    >>> validate_indices(np.array([-1, -1]), 0)  # OK

    >>> validate_indices(np.array([0, 1]), 0)
    Traceback (most recent call last):
        ...
    IndexError: indices are out-of-bounds
    """
    if not len(indices):
        # Nothing to check; min()/max() would fail on an empty array.
        return

    min_idx = indices.min()
    if min_idx < -1:
        raise ValueError(
            f"'indices' contains values less than allowed ({min_idx} < -1)"
        )

    if indices.max() >= n:
        raise IndexError("indices are out-of-bounds")
- # -----------------------------------------------------------
- # Indexer Conversion
def maybe_convert_indices(indices, n: int, verify: bool = True) -> np.ndarray:
    """
    Translate possibly-negative positions into non-negative ones.

    Negative entries are shifted by ``n``. With ``verify`` enabled, any
    entry that is still outside ``[0, n)`` afterwards raises.

    Parameters
    ----------
    indices : array-like
        Array of indices that we are to convert.
    n : int
        Number of elements in the array that we are indexing.
    verify : bool, default True
        Check that all entries are between 0 and n - 1, inclusive.

    Returns
    -------
    array-like
        Positions equivalent to the ones passed in. The input is returned
        unchanged (not copied) when it contains no negative entries.

    Raises
    ------
    IndexError
        One of the converted indices either exceeded the number of
        elements (specified by `n`), or was still negative.
    """
    if isinstance(indices, list):
        if len(indices) == 0:
            # np.array([]) would be float64, which breaks indexing, so an
            # empty intp array is returned instead.
            return np.empty(0, dtype=np.intp)
        indices = np.array(indices)

    negative = indices < 0
    if negative.any():
        # Copy before shifting so the caller's array is left untouched.
        indices = indices.copy()
        indices[negative] += n

    if verify and ((indices >= n) | (indices < 0)).any():
        raise IndexError("indices are out-of-bounds")
    return indices
- # -----------------------------------------------------------
- # Unsorted
def length_of_indexer(indexer, target=None) -> int:
    """
    Return the expected length of ``target[indexer]``.

    Parameters
    ----------
    indexer : slice, Series, Index, ndarray, list, range or scalar
        The indexer whose result length is wanted.
    target : sized object, optional
        The object being indexed. Only consulted (via ``len``) when
        ``indexer`` is a slice; a slice without a target is handled like
        any other non-list-like indexer.

    Returns
    -------
    int
        Number of elements the indexer selects.

    Raises
    ------
    AssertionError
        If the length cannot be determined for this type of indexer.
    """
    if target is not None and isinstance(indexer, slice):
        # slice.indices applies Python's own clipping and negative-index
        # rules, so open-ended negative-step slices and out-of-range
        # bounds give the same length real indexing would produce.
        return len(range(*indexer.indices(len(target))))
    elif isinstance(indexer, (ABCSeries, ABCIndex, np.ndarray, list)):
        if isinstance(indexer, list):
            indexer = np.array(indexer)

        if indexer.dtype == bool:
            # GH#25774: a boolean mask selects one element per True entry.
            return indexer.sum()
        return len(indexer)
    elif isinstance(indexer, range):
        # len() rounds up for steps that do not divide the span evenly and
        # never goes negative, unlike (stop - start) // step.
        return len(indexer)
    elif not is_list_like_indexer(indexer):
        return 1
    raise AssertionError("cannot find the length of the indexer")
def disallow_ndim_indexing(result) -> None:
    """
    Reject indexing results that have more than one dimension.

    Helper for 1D Series/Index. GH#27125: an indexer like ``idx[:, None]``
    expands the dimension, which cannot be done while keeping an index.
    Returning an ndarray in that case was deprecated in GH#30588.

    Parameters
    ----------
    result : object
        The result of an indexing operation.

    Raises
    ------
    ValueError
        If ``np.ndim(result)`` is greater than 1.
    """
    if np.ndim(result) <= 1:
        return

    raise ValueError(
        "Multi-dimensional indexing (e.g. `obj[:, None]`) is no longer "
        "supported. Convert to a numpy array before indexing instead."
    )
def unpack_1tuple(tup):
    """
    Unwrap a length-1 tuple that holds a single slice.

    Parameters
    ----------
    tup : tuple or list
        The key to inspect.

    Returns
    -------
    object
        The contained slice when ``tup`` is a length-1 tuple wrapping a
        slice, otherwise ``tup`` unchanged.

    Raises
    ------
    ValueError
        If ``tup`` is a single-item list containing a slice (GH#31299).
    """
    if len(tup) != 1:
        return tup

    only = tup[0]
    if not isinstance(only, slice):
        return tup

    # Even without a MultiIndex a 1-tuple may still be handled;
    # see test_1tuple_without_multiindex
    if isinstance(tup, list):
        # GH#31299
        raise ValueError(
            "Indexing with a single-item list containing a "
            "slice is not allowed. Pass a tuple instead.",
        )
    return only
def check_key_length(columns: Index, key, value: DataFrame) -> None:
    """
    Verify that ``key`` refers to as many columns as ``value`` provides.

    Parameters
    ----------
    columns : Index
        The columns of the DataFrame to index.
    key : list-like
        Keys to index with.
    value : DataFrame
        The value to set for the keys.

    Raises
    ------
    ValueError
        If ``columns`` is unique and ``len(key)`` differs from the number
        of columns in ``value``, or if ``columns`` is not unique and the
        number of positions matched by ``key`` differs from the number of
        columns in ``value``.
    """
    if columns.is_unique:
        n_targets = len(key)
    else:
        # Missing keys in columns are represented as -1
        n_targets = len(columns.get_indexer_non_unique(key)[0])

    if n_targets != len(value.columns):
        raise ValueError("Columns must be same length as key")
def unpack_tuple_and_ellipses(item: tuple):
    """
    Possibly unpack ``arr[..., n]`` to ``arr[n]``.

    Parameters
    ----------
    item : tuple
        The tuple key to reduce.

    Returns
    -------
    object
        The single element left after dropping one leading (or otherwise
        one trailing) Ellipsis.

    Raises
    ------
    IndexError
        If more than one element remains after the Ellipsis is dropped.
    """
    if len(item) >= 2:
        # Note: we are assuming this indexing is being done on a 1D arraylike
        if item[0] is ...:
            item = item[1:]
        elif item[-1] is ...:
            item = item[:-1]

    if len(item) >= 2:
        raise IndexError("too many indices for array.")

    return item[0]
- # -----------------------------------------------------------
- # Public indexer validation
def check_array_indexer(array: AnyArrayLike, indexer: Any) -> Any:
    """
    Validate ``indexer`` as an array indexer for ``array``.

    A boolean mask must have the same length as ``array``. Integer and
    boolean ExtensionArray indexers are converted to the matching numpy
    array; for integer indexers, missing values make that conversion fail.
    Any other dtype raises.

    Non-array indexers (integer, slice, Ellipsis, tuples, ..) are passed
    through as is.

    Parameters
    ----------
    array : array-like
        The array that is being indexed (only used for the length).
    indexer : array-like or list-like
        The array-like that's used to index. List-like input that is not yet
        a numpy array or an ExtensionArray is converted to one. Other input
        types are passed through as is.

    Returns
    -------
    numpy.ndarray
        The validated indexer as a numpy array that can be used to index.

    Raises
    ------
    IndexError
        When the lengths don't match, or when the indexer has a dtype that
        is neither integer nor boolean.
    ValueError
        When `indexer` cannot be converted to a numpy ndarray to index
        (e.g. presence of missing values).

    See Also
    --------
    api.types.is_bool_dtype : Check if `key` is of boolean dtype.

    Examples
    --------
    When checking a boolean mask, a boolean ndarray is returned when the
    arguments are all valid.

    >>> mask = pd.array([True, False])
    >>> arr = pd.array([1, 2])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    array([ True, False])

    An IndexError is raised when the lengths don't match.

    >>> mask = pd.array([True, False, True])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    Traceback (most recent call last):
    ...
    IndexError: Boolean index has wrong length: 3 instead of 2.

    NA values in a boolean array are treated as False.

    >>> mask = pd.array([True, pd.NA])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    array([ True, False])

    A numpy boolean mask will get passed through (if the length is correct):

    >>> mask = np.array([True, False])
    >>> pd.api.indexers.check_array_indexer(arr, mask)
    array([ True, False])

    Similarly for integer indexers, an integer ndarray is returned when it is
    a valid indexer, otherwise an error is raised (for integer indexers, a
    matching length is not required):

    >>> indexer = pd.array([0, 2], dtype="Int64")
    >>> arr = pd.array([1, 2, 3])
    >>> pd.api.indexers.check_array_indexer(arr, indexer)
    array([0, 2])

    >>> indexer = pd.array([0, pd.NA], dtype="Int64")
    >>> pd.api.indexers.check_array_indexer(arr, indexer)
    Traceback (most recent call last):
    ...
    ValueError: Cannot index with an integer indexer containing NA values

    For non-integer/boolean dtypes, an appropriate error is raised:

    >>> indexer = np.array([0., 2.], dtype="float64")
    >>> pd.api.indexers.check_array_indexer(arr, indexer)
    Traceback (most recent call last):
    ...
    IndexError: arrays used as indices must be of integer or boolean type
    """
    # Anything that is not list-like is returned untouched (integer, slice,
    # Ellipsis, None). Tuples are list-like but are passed through as well,
    # because in indexing they mean multi-dimensional indexing.
    if not is_list_like(indexer) or isinstance(indexer, tuple):
        return indexer

    # convert list-likes to array
    if not is_array_like(indexer):
        from pandas.core.construction import array as pd_array

        indexer = pd_array(indexer)
        if len(indexer) == 0:
            # empty list is converted to float array by pd.array
            indexer = np.array([], dtype=np.intp)

    dtype = indexer.dtype

    if is_bool_dtype(dtype):
        if is_extension_array_dtype(dtype):
            # NA entries in a masked boolean array count as False.
            indexer = indexer.to_numpy(dtype=bool, na_value=False)
        else:
            indexer = np.asarray(indexer, dtype=bool)

        # GH26658
        if len(indexer) != len(array):
            raise IndexError(
                f"Boolean index has wrong length: "
                f"{len(indexer)} instead of {len(array)}"
            )
        return indexer

    if is_integer_dtype(dtype):
        try:
            return np.asarray(indexer, dtype=np.intp)
        except ValueError as err:
            raise ValueError(
                "Cannot index with an integer indexer containing NA values"
            ) from err

    raise IndexError("arrays used as indices must be of integer or boolean type")
|