# string_.py

from __future__ import annotations

from typing import (
    TYPE_CHECKING,
    Literal,
)

import numpy as np

from pandas._config import get_option

from pandas._libs import (
    lib,
    missing as libmissing,
)
from pandas._libs.arrays import NDArrayBacked
from pandas._typing import (
    AxisInt,
    Dtype,
    Scalar,
    npt,
    type_t,
)
from pandas.compat import pa_version_under7p0
from pandas.compat.numpy import function as nv
from pandas.util._decorators import doc

from pandas.core.dtypes.base import (
    ExtensionDtype,
    StorageExtensionDtype,
    register_extension_dtype,
)
from pandas.core.dtypes.common import (
    is_array_like,
    is_bool_dtype,
    is_dtype_equal,
    is_integer_dtype,
    is_object_dtype,
    is_string_dtype,
    pandas_dtype,
)

from pandas.core import ops
from pandas.core.array_algos import masked_reductions
from pandas.core.arrays import (
    ExtensionArray,
    FloatingArray,
    IntegerArray,
)
from pandas.core.arrays.floating import FloatingDtype
from pandas.core.arrays.integer import IntegerDtype
from pandas.core.arrays.numpy_ import PandasArray
from pandas.core.construction import extract_array
from pandas.core.indexers import check_array_indexer
from pandas.core.dtypes.missing import isna

if TYPE_CHECKING:
    import pyarrow

    from pandas._typing import (
        NumpySorter,
        NumpyValueArrayLike,
    )

    from pandas import Series


@register_extension_dtype
class StringDtype(StorageExtensionDtype):
    """
    Extension dtype for string data.

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    storage : {"python", "pyarrow"}, optional
        If not given, the value of ``pd.options.mode.string_storage``.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    string[python]

    >>> pd.StringDtype(storage="pyarrow")
    string[pyarrow]
    """

    name = "string"

    #: StringDtype().na_value uses pandas.NA
    @property
    def na_value(self) -> libmissing.NAType:
        return libmissing.NA

    _metadata = ("storage",)

    def __init__(self, storage=None) -> None:
        if storage is None:
            storage = get_option("mode.string_storage")
        if storage not in {"python", "pyarrow"}:
            raise ValueError(
                f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
            )
        if storage == "pyarrow" and pa_version_under7p0:
            raise ImportError(
                "pyarrow>=7.0.0 is required for PyArrow backed StringArray."
            )
        self.storage = storage

    @property
    def type(self) -> type[str]:
        return str

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a StringDtype from a string.

        Parameters
        ----------
        string : str
            The type of the name. The storage type will be taken from `string`.
            Valid options and their storage types are

            ========================== ==============================================
            string                     result storage
            ========================== ==============================================
            ``'string'``               pd.options.mode.string_storage, default python
            ``'string[python]'``       python
            ``'string[pyarrow]'``      pyarrow
            ========================== ==============================================

        Returns
        -------
        StringDtype

        Raises
        ------
        TypeError
            If the string is not a valid option.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string == "string":
            return cls()
        elif string == "string[python]":
            return cls(storage="python")
        elif string == "string[pyarrow]":
            return cls(storage="pyarrow")
        else:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    # https://github.com/pandas-dev/pandas/issues/36126
    # error: Signature of "construct_array_type" incompatible with supertype
    # "ExtensionDtype"
    def construct_array_type(  # type: ignore[override]
        self,
    ) -> type_t[BaseStringArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays.string_arrow import ArrowStringArray

        if self.storage == "python":
            return StringArray
        else:
            return ArrowStringArray

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseStringArray:
        """
        Construct StringArray from pyarrow Array/ChunkedArray.
        """
        if self.storage == "pyarrow":
            from pandas.core.arrays.string_arrow import ArrowStringArray

            return ArrowStringArray(array)
        else:
            import pyarrow

            if isinstance(array, pyarrow.Array):
                chunks = [array]
            else:
                # pyarrow.ChunkedArray
                chunks = array.chunks

            results = []
            for arr in chunks:
                # using _from_sequence to ensure None is converted to NA
                str_arr = StringArray._from_sequence(np.array(arr))
                results.append(str_arr)

        if results:
            return StringArray._concat_same_type(results)
        else:
            return StringArray(np.array([], dtype="object"))


class BaseStringArray(ExtensionArray):
    """
    Mixin class for StringArray, ArrowStringArray.
    """

    @doc(ExtensionArray.tolist)
    def tolist(self):
        if self.ndim > 1:
            return [x.tolist() for x in self]
        return list(self.to_numpy())


class StringArray(BaseStringArray, PandasArray):
    """
    Extension array for string data.

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings
           or nan-likes (``None``, ``np.nan``, ``NA``).
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

        .. versionchanged:: 1.5.0

           StringArray now accepts array-likes containing
           nan-likes (``None``, ``np.nan``) for the ``values`` parameter
           in addition to strings and :attr:`pandas.NA`

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    :func:`pandas.array`
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <PandasArray>
    ['1', 1]
    Length: 2, dtype: object

    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the PandasArray hack
    _typ = "extension"

    def __init__(self, values, copy: bool = False) -> None:
        values = extract_array(values)

        super().__init__(values, copy=copy)
        if not isinstance(values, type(self)):
            self._validate()
        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))

    def _validate(self):
        """Validate that we only store NA or strings."""
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )
        # Check to see if need to convert Na values to pd.NA
        if self._ndarray.ndim > 2:
            # Ravel if ndims > 2 b/c no cythonized version available
            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
        else:
            lib.convert_nans_to_NA(self._ndarray)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "python"

        from pandas.core.arrays.masked import BaseMaskedArray

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            result[na_values] = libmissing.NA

        else:
            if hasattr(scalars, "type"):
                # pyarrow array
                scalars = np.array(scalars)
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)

        # Manually creating new array avoids the validation step in the __init__, so is
        # faster. Refactor need for validation?
        new_string_array = cls.__new__(cls)
        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))

        return new_string_array

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @classmethod
    def _empty(cls, shape, dtype) -> StringArray:
        values = np.empty(shape, dtype=object)
        values[:] = libmissing.NA
        return cls(values).astype(dtype, copy=False)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        values = self._ndarray.copy()
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        arr = self._ndarray.copy()

        mask = self.isna()
        arr[mask] = None
        return arr, None

    def __setitem__(self, key, value):
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract PandasArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = libmissing.NA
            elif not isinstance(value, str):
                raise TypeError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise TypeError("Must provide strings.")

            mask = isna(value)
            if mask.any():
                value = value.copy()
                value[isna(value)] = libmissing.NA

        super().__setitem__(key, value)

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        # the super() method NDArrayBackedExtensionArray._putmask uses
        # np.putmask which doesn't properly handle None/pd.NA, so using the
        # base class implementation that uses __setitem__
        ExtensionArray._putmask(self, mask, value)

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if is_dtype_equal(dtype, self.dtype):
            if copy:
                return self.copy()
            return self

        elif isinstance(dtype, IntegerDtype):
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype.numpy_dtype)
            return IntegerArray(values, mask, copy=False)
        elif isinstance(dtype, FloatingDtype):
            arr = self.copy()
            mask = self.isna()
            arr[mask] = "0"
            values = arr.astype(dtype.numpy_dtype)
            return FloatingArray(values, mask, copy=False)
        elif isinstance(dtype, ExtensionDtype):
            # Skip the PandasArray.astype method
            return ExtensionArray.astype(self, dtype, copy)
        elif np.issubdtype(dtype, np.floating):
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype)
            values[mask] = np.nan
            return values

        return super().astype(dtype, copy)

    def _reduce(
        self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
    ):
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna, axis=axis)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_min((), kwargs)
        result = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_max((), kwargs)
        result = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def value_counts(self, dropna: bool = True) -> Series:
        from pandas import value_counts

        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
        result.index = result.index.astype(self.dtype)
        return result

    def memory_usage(self, deep: bool = False) -> int:
        result = self._ndarray.nbytes
        if deep:
            return result + lib.memory_usage_of_objects(self._ndarray)
        return result

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        return super().searchsorted(value=value, side=side, sorter=sorter)

    def _cmp_method(self, other, op):
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            other = np.asarray(other)
            other = other[valid]

        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
            result[mask] = libmissing.NA
            result[valid] = op(self._ndarray[valid], other)
            return StringArray(result)
        else:
            # logical
            result = np.zeros(len(self._ndarray), dtype="bool")
            result[valid] = op(self._ndarray[valid], other)
            return BooleanArray(result, mask)

    _arith_method = _cmp_method

    # ------------------------------------------------------------------------
    # String methods interface

    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "PandasArray" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype(storage="python")
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray] | type[BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                na_value = 1
            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not na_value_is_na:
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))
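

# ---------------------------------------------------------------------------
# Illustrative usage (a minimal sketch, not part of the original module): the
# public entry point to the python-backed StringArray defined above is
# ``pd.array`` with ``dtype="string"``. The demo below only uses public pandas
# API (pd.array, comparison operators, astype) and assumes pandas is installed
# with ``pd.options.mode.string_storage`` left at its default of "python".
if __name__ == "__main__":
    import pandas as pd

    # StringArray._from_sequence converts None / np.nan to pd.NA
    arr = pd.array(["a", None, "c"], dtype="string")
    print(arr.dtype)  # string[python]

    # _cmp_method: comparisons return a BooleanArray, propagating NA
    print(arr == "a")  # [True, <NA>, False]

    # astype with an IntegerDtype goes through the masked IntegerArray path
    nums = pd.array(["1", None, "3"], dtype="string").astype("Int64")
    print(nums)  # [1, <NA>, 3]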