replace.py 3.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150
  1. """
  2. Methods used by Block.replace and related methods.
  3. """
  4. from __future__ import annotations
  5. import operator
  6. import re
  7. from typing import (
  8. Any,
  9. Pattern,
  10. )
  11. import numpy as np
  12. from pandas._typing import (
  13. ArrayLike,
  14. Scalar,
  15. npt,
  16. )
  17. from pandas.core.dtypes.common import (
  18. is_re,
  19. is_re_compilable,
  20. is_scalar,
  21. )
  22. from pandas.core.dtypes.missing import isna
  23. def should_use_regex(regex: bool, to_replace: Any) -> bool:
  24. """
  25. Decide whether to treat `to_replace` as a regular expression.
  26. """
  27. if is_re(to_replace):
  28. regex = True
  29. regex = regex and is_re_compilable(to_replace)
  30. # Don't use regex if the pattern is empty.
  31. regex = regex and re.compile(to_replace).pattern != ""
  32. return regex
  33. def compare_or_regex_search(
  34. a: ArrayLike, b: Scalar | Pattern, regex: bool, mask: npt.NDArray[np.bool_]
  35. ) -> ArrayLike:
  36. """
  37. Compare two array-like inputs of the same shape or two scalar values
  38. Calls operator.eq or re.search, depending on regex argument. If regex is
  39. True, perform an element-wise regex matching.
  40. Parameters
  41. ----------
  42. a : array-like
  43. b : scalar or regex pattern
  44. regex : bool
  45. mask : np.ndarray[bool]
  46. Returns
  47. -------
  48. mask : array-like of bool
  49. """
  50. if isna(b):
  51. return ~mask
  52. def _check_comparison_types(
  53. result: ArrayLike | bool, a: ArrayLike, b: Scalar | Pattern
  54. ):
  55. """
  56. Raises an error if the two arrays (a,b) cannot be compared.
  57. Otherwise, returns the comparison result as expected.
  58. """
  59. if is_scalar(result) and isinstance(a, np.ndarray):
  60. type_names = [type(a).__name__, type(b).__name__]
  61. type_names[0] = f"ndarray(dtype={a.dtype})"
  62. raise TypeError(
  63. f"Cannot compare types {repr(type_names[0])} and {repr(type_names[1])}"
  64. )
  65. if not regex or not should_use_regex(regex, b):
  66. # TODO: should use missing.mask_missing?
  67. op = lambda x: operator.eq(x, b)
  68. else:
  69. op = np.vectorize(
  70. lambda x: bool(re.search(b, x))
  71. if isinstance(x, str) and isinstance(b, (str, Pattern))
  72. else False
  73. )
  74. # GH#32621 use mask to avoid comparing to NAs
  75. if isinstance(a, np.ndarray):
  76. a = a[mask]
  77. result = op(a)
  78. if isinstance(result, np.ndarray) and mask is not None:
  79. # The shape of the mask can differ to that of the result
  80. # since we may compare only a subset of a's or b's elements
  81. tmp = np.zeros(mask.shape, dtype=np.bool_)
  82. np.place(tmp, mask, result)
  83. result = tmp
  84. _check_comparison_types(result, a, b)
  85. return result
  86. def replace_regex(
  87. values: ArrayLike, rx: re.Pattern, value, mask: npt.NDArray[np.bool_] | None
  88. ) -> None:
  89. """
  90. Parameters
  91. ----------
  92. values : ArrayLike
  93. Object dtype.
  94. rx : re.Pattern
  95. value : Any
  96. mask : np.ndarray[bool], optional
  97. Notes
  98. -----
  99. Alters values in-place.
  100. """
  101. # deal with replacing values with objects (strings) that match but
  102. # whose replacement is not a string (numeric, nan, object)
  103. if isna(value) or not isinstance(value, str):
  104. def re_replacer(s):
  105. if is_re(rx) and isinstance(s, str):
  106. return value if rx.search(s) is not None else s
  107. else:
  108. return s
  109. else:
  110. # value is guaranteed to be a string here, s can be either a string
  111. # or null if it's null it gets returned
  112. def re_replacer(s):
  113. if is_re(rx) and isinstance(s, str):
  114. return rx.sub(value, s)
  115. else:
  116. return s
  117. f = np.vectorize(re_replacer, otypes=[np.object_])
  118. if mask is None:
  119. values[:] = f(values)
  120. else:
  121. values[mask] = f(values[mask])