object_array.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498
  1. from __future__ import annotations
  2. import functools
  3. import re
  4. import sys
  5. import textwrap
  6. from typing import (
  7. TYPE_CHECKING,
  8. Callable,
  9. Literal,
  10. )
  11. import unicodedata
  12. import numpy as np
  13. from pandas._libs import lib
  14. import pandas._libs.missing as libmissing
  15. import pandas._libs.ops as libops
  16. from pandas._typing import (
  17. NpDtype,
  18. Scalar,
  19. )
  20. from pandas.core.dtypes.common import is_scalar
  21. from pandas.core.dtypes.missing import isna
  22. from pandas.core.strings.base import BaseStringArrayMethods
  23. if TYPE_CHECKING:
  24. from pandas import Series
  25. class ObjectStringArrayMixin(BaseStringArrayMethods):
  26. """
  27. String Methods operating on object-dtype ndarrays.
  28. """
  29. _str_na_value = np.nan
  30. def __len__(self) -> int:
  31. # For typing, _str_map relies on the object being sized.
  32. raise NotImplementedError
  33. def _str_map(
  34. self, f, na_value=None, dtype: NpDtype | None = None, convert: bool = True
  35. ):
  36. """
  37. Map a callable over valid elements of the array.
  38. Parameters
  39. ----------
  40. f : Callable
  41. A function to call on each non-NA element.
  42. na_value : Scalar, optional
  43. The value to set for NA values. Might also be used for the
  44. fill value if the callable `f` raises an exception.
  45. This defaults to ``self._str_na_value`` which is ``np.nan``
  46. for object-dtype and Categorical and ``pd.NA`` for StringArray.
  47. dtype : Dtype, optional
  48. The dtype of the result array.
  49. convert : bool, default True
  50. Whether to call `maybe_convert_objects` on the resulting ndarray
  51. """
  52. if dtype is None:
  53. dtype = np.dtype("object")
  54. if na_value is None:
  55. na_value = self._str_na_value
  56. if not len(self):
  57. return np.array([], dtype=dtype)
  58. arr = np.asarray(self, dtype=object)
  59. mask = isna(arr)
  60. map_convert = convert and not np.all(mask)
  61. try:
  62. result = lib.map_infer_mask(arr, f, mask.view(np.uint8), map_convert)
  63. except (TypeError, AttributeError) as err:
  64. # Reraise the exception if callable `f` got wrong number of args.
  65. # The user may want to be warned by this, instead of getting NaN
  66. p_err = (
  67. r"((takes)|(missing)) (?(2)from \d+ to )?\d+ "
  68. r"(?(3)required )positional arguments?"
  69. )
  70. if len(err.args) >= 1 and re.search(p_err, err.args[0]):
  71. # FIXME: this should be totally avoidable
  72. raise err
  73. def g(x):
  74. # This type of fallback behavior can be removed once
  75. # we remove object-dtype .str accessor.
  76. try:
  77. return f(x)
  78. except (TypeError, AttributeError):
  79. return na_value
  80. return self._str_map(g, na_value=na_value, dtype=dtype)
  81. if not isinstance(result, np.ndarray):
  82. return result
  83. if na_value is not np.nan:
  84. np.putmask(result, mask, na_value)
  85. if convert and result.dtype == object:
  86. result = lib.maybe_convert_objects(result)
  87. return result
  88. def _str_count(self, pat, flags: int = 0):
  89. regex = re.compile(pat, flags=flags)
  90. f = lambda x: len(regex.findall(x))
  91. return self._str_map(f, dtype="int64")
  92. def _str_pad(
  93. self,
  94. width,
  95. side: Literal["left", "right", "both"] = "left",
  96. fillchar: str = " ",
  97. ):
  98. if side == "left":
  99. f = lambda x: x.rjust(width, fillchar)
  100. elif side == "right":
  101. f = lambda x: x.ljust(width, fillchar)
  102. elif side == "both":
  103. f = lambda x: x.center(width, fillchar)
  104. else: # pragma: no cover
  105. raise ValueError("Invalid side")
  106. return self._str_map(f)
  107. def _str_contains(
  108. self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
  109. ):
  110. if regex:
  111. if not case:
  112. flags |= re.IGNORECASE
  113. pat = re.compile(pat, flags=flags)
  114. f = lambda x: pat.search(x) is not None
  115. else:
  116. if case:
  117. f = lambda x: pat in x
  118. else:
  119. upper_pat = pat.upper()
  120. f = lambda x: upper_pat in x.upper()
  121. return self._str_map(f, na, dtype=np.dtype("bool"))
  122. def _str_startswith(self, pat, na=None):
  123. f = lambda x: x.startswith(pat)
  124. return self._str_map(f, na_value=na, dtype=np.dtype(bool))
  125. def _str_endswith(self, pat, na=None):
  126. f = lambda x: x.endswith(pat)
  127. return self._str_map(f, na_value=na, dtype=np.dtype(bool))
  128. def _str_replace(
  129. self,
  130. pat: str | re.Pattern,
  131. repl: str | Callable,
  132. n: int = -1,
  133. case: bool = True,
  134. flags: int = 0,
  135. regex: bool = True,
  136. ):
  137. if case is False:
  138. # add case flag, if provided
  139. flags |= re.IGNORECASE
  140. if regex or flags or callable(repl):
  141. if not isinstance(pat, re.Pattern):
  142. if regex is False:
  143. pat = re.escape(pat)
  144. pat = re.compile(pat, flags=flags)
  145. n = n if n >= 0 else 0
  146. f = lambda x: pat.sub(repl=repl, string=x, count=n)
  147. else:
  148. f = lambda x: x.replace(pat, repl, n)
  149. return self._str_map(f, dtype=str)
  150. def _str_repeat(self, repeats):
  151. if is_scalar(repeats):
  152. def scalar_rep(x):
  153. try:
  154. return bytes.__mul__(x, repeats)
  155. except TypeError:
  156. return str.__mul__(x, repeats)
  157. return self._str_map(scalar_rep, dtype=str)
  158. else:
  159. from pandas.core.arrays.string_ import BaseStringArray
  160. def rep(x, r):
  161. if x is libmissing.NA:
  162. return x
  163. try:
  164. return bytes.__mul__(x, r)
  165. except TypeError:
  166. return str.__mul__(x, r)
  167. repeats = np.asarray(repeats, dtype=object)
  168. result = libops.vec_binop(np.asarray(self), repeats, rep)
  169. if isinstance(self, BaseStringArray):
  170. # Not going through map, so we have to do this here.
  171. result = type(self)._from_sequence(result)
  172. return result
  173. def _str_match(
  174. self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
  175. ):
  176. if not case:
  177. flags |= re.IGNORECASE
  178. regex = re.compile(pat, flags=flags)
  179. f = lambda x: regex.match(x) is not None
  180. return self._str_map(f, na_value=na, dtype=np.dtype(bool))
  181. def _str_fullmatch(
  182. self,
  183. pat: str | re.Pattern,
  184. case: bool = True,
  185. flags: int = 0,
  186. na: Scalar | None = None,
  187. ):
  188. if not case:
  189. flags |= re.IGNORECASE
  190. regex = re.compile(pat, flags=flags)
  191. f = lambda x: regex.fullmatch(x) is not None
  192. return self._str_map(f, na_value=na, dtype=np.dtype(bool))
  193. def _str_encode(self, encoding, errors: str = "strict"):
  194. f = lambda x: x.encode(encoding, errors=errors)
  195. return self._str_map(f, dtype=object)
  196. def _str_find(self, sub, start: int = 0, end=None):
  197. return self._str_find_(sub, start, end, side="left")
  198. def _str_rfind(self, sub, start: int = 0, end=None):
  199. return self._str_find_(sub, start, end, side="right")
  200. def _str_find_(self, sub, start, end, side):
  201. if side == "left":
  202. method = "find"
  203. elif side == "right":
  204. method = "rfind"
  205. else: # pragma: no cover
  206. raise ValueError("Invalid side")
  207. if end is None:
  208. f = lambda x: getattr(x, method)(sub, start)
  209. else:
  210. f = lambda x: getattr(x, method)(sub, start, end)
  211. return self._str_map(f, dtype="int64")
  212. def _str_findall(self, pat, flags: int = 0):
  213. regex = re.compile(pat, flags=flags)
  214. return self._str_map(regex.findall, dtype="object")
  215. def _str_get(self, i):
  216. def f(x):
  217. if isinstance(x, dict):
  218. return x.get(i)
  219. elif len(x) > i >= -len(x):
  220. return x[i]
  221. return self._str_na_value
  222. return self._str_map(f)
  223. def _str_index(self, sub, start: int = 0, end=None):
  224. if end:
  225. f = lambda x: x.index(sub, start, end)
  226. else:
  227. f = lambda x: x.index(sub, start, end)
  228. return self._str_map(f, dtype="int64")
  229. def _str_rindex(self, sub, start: int = 0, end=None):
  230. if end:
  231. f = lambda x: x.rindex(sub, start, end)
  232. else:
  233. f = lambda x: x.rindex(sub, start, end)
  234. return self._str_map(f, dtype="int64")
  235. def _str_join(self, sep):
  236. return self._str_map(sep.join)
  237. def _str_partition(self, sep, expand):
  238. result = self._str_map(lambda x: x.partition(sep), dtype="object")
  239. return result
  240. def _str_rpartition(self, sep, expand):
  241. return self._str_map(lambda x: x.rpartition(sep), dtype="object")
  242. def _str_len(self):
  243. return self._str_map(len, dtype="int64")
  244. def _str_slice(self, start=None, stop=None, step=None):
  245. obj = slice(start, stop, step)
  246. return self._str_map(lambda x: x[obj])
  247. def _str_slice_replace(self, start=None, stop=None, repl=None):
  248. if repl is None:
  249. repl = ""
  250. def f(x):
  251. if x[start:stop] == "":
  252. local_stop = start
  253. else:
  254. local_stop = stop
  255. y = ""
  256. if start is not None:
  257. y += x[:start]
  258. y += repl
  259. if stop is not None:
  260. y += x[local_stop:]
  261. return y
  262. return self._str_map(f)
  263. def _str_split(
  264. self,
  265. pat: str | re.Pattern | None = None,
  266. n=-1,
  267. expand: bool = False,
  268. regex: bool | None = None,
  269. ):
  270. if pat is None:
  271. if n is None or n == 0:
  272. n = -1
  273. f = lambda x: x.split(pat, n)
  274. else:
  275. new_pat: str | re.Pattern
  276. if regex is True or isinstance(pat, re.Pattern):
  277. new_pat = re.compile(pat)
  278. elif regex is False:
  279. new_pat = pat
  280. # regex is None so link to old behavior #43563
  281. else:
  282. if len(pat) == 1:
  283. new_pat = pat
  284. else:
  285. new_pat = re.compile(pat)
  286. if isinstance(new_pat, re.Pattern):
  287. if n is None or n == -1:
  288. n = 0
  289. f = lambda x: new_pat.split(x, maxsplit=n)
  290. else:
  291. if n is None or n == 0:
  292. n = -1
  293. f = lambda x: x.split(pat, n)
  294. return self._str_map(f, dtype=object)
  295. def _str_rsplit(self, pat=None, n=-1):
  296. if n is None or n == 0:
  297. n = -1
  298. f = lambda x: x.rsplit(pat, n)
  299. return self._str_map(f, dtype="object")
  300. def _str_translate(self, table):
  301. return self._str_map(lambda x: x.translate(table))
  302. def _str_wrap(self, width, **kwargs):
  303. kwargs["width"] = width
  304. tw = textwrap.TextWrapper(**kwargs)
  305. return self._str_map(lambda s: "\n".join(tw.wrap(s)))
  306. def _str_get_dummies(self, sep: str = "|"):
  307. from pandas import Series
  308. arr = Series(self).fillna("")
  309. try:
  310. arr = sep + arr + sep
  311. except (TypeError, NotImplementedError):
  312. arr = sep + arr.astype(str) + sep
  313. tags: set[str] = set()
  314. for ts in Series(arr, copy=False).str.split(sep):
  315. tags.update(ts)
  316. tags2 = sorted(tags - {""})
  317. dummies = np.empty((len(arr), len(tags2)), dtype=np.int64)
  318. def _isin(test_elements: str, element: str) -> bool:
  319. return element in test_elements
  320. for i, t in enumerate(tags2):
  321. pat = sep + t + sep
  322. dummies[:, i] = lib.map_infer(
  323. arr.to_numpy(), functools.partial(_isin, element=pat)
  324. )
  325. return dummies, tags2
  326. def _str_upper(self):
  327. return self._str_map(lambda x: x.upper())
  328. def _str_isalnum(self):
  329. return self._str_map(str.isalnum, dtype="bool")
  330. def _str_isalpha(self):
  331. return self._str_map(str.isalpha, dtype="bool")
  332. def _str_isdecimal(self):
  333. return self._str_map(str.isdecimal, dtype="bool")
  334. def _str_isdigit(self):
  335. return self._str_map(str.isdigit, dtype="bool")
  336. def _str_islower(self):
  337. return self._str_map(str.islower, dtype="bool")
  338. def _str_isnumeric(self):
  339. return self._str_map(str.isnumeric, dtype="bool")
  340. def _str_isspace(self):
  341. return self._str_map(str.isspace, dtype="bool")
  342. def _str_istitle(self):
  343. return self._str_map(str.istitle, dtype="bool")
  344. def _str_isupper(self):
  345. return self._str_map(str.isupper, dtype="bool")
  346. def _str_capitalize(self):
  347. return self._str_map(str.capitalize)
  348. def _str_casefold(self):
  349. return self._str_map(str.casefold)
  350. def _str_title(self):
  351. return self._str_map(str.title)
  352. def _str_swapcase(self):
  353. return self._str_map(str.swapcase)
  354. def _str_lower(self):
  355. return self._str_map(str.lower)
  356. def _str_normalize(self, form):
  357. f = lambda x: unicodedata.normalize(form, x)
  358. return self._str_map(f)
  359. def _str_strip(self, to_strip=None):
  360. return self._str_map(lambda x: x.strip(to_strip))
  361. def _str_lstrip(self, to_strip=None):
  362. return self._str_map(lambda x: x.lstrip(to_strip))
  363. def _str_rstrip(self, to_strip=None):
  364. return self._str_map(lambda x: x.rstrip(to_strip))
  365. def _str_removeprefix(self, prefix: str) -> Series:
  366. # outstanding question on whether to use native methods for users on Python 3.9+
  367. # https://github.com/pandas-dev/pandas/pull/39226#issuecomment-836719770,
  368. # in which case we could do return self._str_map(str.removeprefix)
  369. def removeprefix(text: str) -> str:
  370. if text.startswith(prefix):
  371. return text[len(prefix) :]
  372. return text
  373. return self._str_map(removeprefix)
  374. def _str_removesuffix(self, suffix: str) -> Series:
  375. if sys.version_info < (3, 9):
  376. # NOTE pyupgrade will remove this when we run it with --py39-plus
  377. # so don't remove the unnecessary `else` statement below
  378. from pandas.util._str_methods import removesuffix
  379. return self._str_map(functools.partial(removesuffix, suffix=suffix))
  380. else:
  381. return self._str_map(lambda x: x.removesuffix(suffix))
  382. def _str_extract(self, pat: str, flags: int = 0, expand: bool = True):
  383. regex = re.compile(pat, flags=flags)
  384. na_value = self._str_na_value
  385. if not expand:
  386. def g(x):
  387. m = regex.search(x)
  388. return m.groups()[0] if m else na_value
  389. return self._str_map(g, convert=False)
  390. empty_row = [na_value] * regex.groups
  391. def f(x):
  392. if not isinstance(x, str):
  393. return empty_row
  394. m = regex.search(x)
  395. if m:
  396. return [na_value if item is None else item for item in m.groups()]
  397. else:
  398. return empty_row
  399. return [f(val) for val in np.asarray(self)]