123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150 |
- """
- Methods used by Block.replace and related methods.
- """
- from __future__ import annotations
- import operator
- import re
- from typing import (
- Any,
- Pattern,
- )
- import numpy as np
- from pandas._typing import (
- ArrayLike,
- Scalar,
- npt,
- )
- from pandas.core.dtypes.common import (
- is_re,
- is_re_compilable,
- is_scalar,
- )
- from pandas.core.dtypes.missing import isna
def should_use_regex(regex: bool, to_replace: Any) -> bool:
    """
    Determine if `to_replace` ought to be handled as a regular expression.

    Parameters
    ----------
    regex : bool
        Whether the caller requested regex handling.
    to_replace : Any
        Candidate pattern.

    Returns
    -------
    bool
        Truthy only when regex handling is in effect, `to_replace` can be
        compiled, and the compiled pattern is not the empty string.
    """
    # An already-compiled pattern turns regex handling on regardless of the
    # flag that was passed in.
    wants_regex = True if is_re(to_replace) else regex

    # The chain short-circuits left to right, so re.compile is only reached
    # for objects that is_re_compilable has accepted. An empty pattern is
    # never treated as a regex.
    return (
        wants_regex
        and is_re_compilable(to_replace)
        and re.compile(to_replace).pattern != ""
    )
- def compare_or_regex_search(
- a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: npt.NDArray[np.bool_]
- ) -> ArrayLike:
- """
- Compare two array-like inputs of the same shape or two scalar values
- Calls operator.eq or re.search, depending on regex argument. If regex is
- True, perform an element-wise regex matching.
- Parameters
- ----------
- a : array-like
- b : scalar or regex pattern
- regex : bool
- mask : np.ndarray[bool]
- Returns
- -------
- mask : array-like of bool
- """
- if isna(b):
- return ~mask
- def _check_comparison_types(
- result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern
- ):
- """
- Raises an error if the two arrays (a,b) cannot be compared.
- Otherwise, returns the comparison result as expected.
- """
- if is_scalar(result) and isinstance(a, np.ndarray):
- type_names = [type(a).__name__, type(b).__name__]
- type_names[0] = f"ndarray(dtype={a.dtype})"
- raise TypeError(
- f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
- )
- if not regex or not should_use_regex(regex, b):
- # TODO: should use missing.mask_missing?
- op = lambda x: operator.eq(x, b)
- else:
- op = np.vectorize(
- lambda x: bool(re.search(b, x))
- if isinstance(x, str) and isinstance(b, (str, Pattern))
- else False
- )
- # GH#32621 use mask to avoid comparing to NAs
- if isinstance(a, np.ndarray):
- a = a[mask]
- result = op(a)
- if isinstance(result, np.ndarray) and mask is not None:
- # The shape of the mask can differ to that of the result
- # since we may compare only a subset of a's or b's elements
- tmp = np.zeros(mask.shape, dtype=np.bool_)
- np.place(tmp, mask, result)
- result = tmp
- _check_comparison_types(result, a, b)
- return result
def replace_regex(
    values: ArrayLike, rx: re.Pattern, value, mask: npt.NDArray[np.bool_] | None
) -> None:
    """
    Apply a regex replacement to the string elements of `values`.

    Parameters
    ----------
    values : ArrayLike
        Object dtype.
    rx : re.Pattern
    value : Any
        Replacement. A string is substituted for the matched portion of each
        element; anything else (numeric, nan, object) replaces the whole
        matching element.
    mask : np.ndarray[bool], optional
        If given, only the selected positions are processed.

    Notes
    -----
    Alters values in-place.
    """
    if isna(value) or not isinstance(value, str):
        # Non-string replacement: a matching element is swapped out entirely.
        def _substitute(text):
            return value if rx.search(text) is not None else text

    else:
        # String replacement: only the matched portion is rewritten.
        def _substitute(text):
            return rx.sub(value, text)

    def _replace_one(elem):
        # Anything that is not a string (including nulls) passes through
        # unchanged, as does everything when rx is not a compiled pattern.
        if is_re(rx) and isinstance(elem, str):
            return _substitute(elem)
        return elem

    vectorized = np.vectorize(_replace_one, otypes=[np.object_])

    if mask is None:
        values[:] = vectorized(values)
    else:
        values[mask] = vectorized(values[mask])
|