123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412 |
- from __future__ import annotations
- import re
- from typing import (
- Callable,
- Union,
- )
- import numpy as np
- from pandas._libs import (
- lib,
- missing as libmissing,
- )
- from pandas._typing import (
- Dtype,
- Scalar,
- npt,
- )
- from pandas.compat import pa_version_under7p0
- from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_dtype_equal,
- is_integer_dtype,
- is_object_dtype,
- is_scalar,
- is_string_dtype,
- pandas_dtype,
- )
- from pandas.core.dtypes.missing import isna
- from pandas.core.arrays.arrow import ArrowExtensionArray
- from pandas.core.arrays.boolean import BooleanDtype
- from pandas.core.arrays.integer import Int64Dtype
- from pandas.core.arrays.numeric import NumericDtype
- from pandas.core.arrays.string_ import (
- BaseStringArray,
- StringDtype,
- )
- from pandas.core.strings.object_array import ObjectStringArrayMixin
- if not pa_version_under7p0:
- import pyarrow as pa
- import pyarrow.compute as pc
- from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
- ArrowStringScalarOrNAT = Union[str, libmissing.NAType]
- def _chk_pyarrow_available() -> None:
- if pa_version_under7p0:
- msg = "pyarrow>=7.0.0 is required for PyArrow backed ArrowExtensionArray."
- raise ImportError(msg)
- # TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
- # ObjectStringArrayMixin because we want to have the object-dtype based methods as
- # fallback for the ones that pyarrow doesn't yet support
- class ArrowStringArray(ObjectStringArrayMixin, ArrowExtensionArray, BaseStringArray):
- """
- Extension array for string data in a ``pyarrow.ChunkedArray``.
- .. versionadded:: 1.2.0
- .. warning::
- ArrowStringArray is considered experimental. The implementation and
- parts of the API may change without warning.
- Parameters
- ----------
- values : pyarrow.Array or pyarrow.ChunkedArray
- The array of data.
- Attributes
- ----------
- None
- Methods
- -------
- None
- See Also
- --------
- :func:`pandas.array`
- The recommended function for creating a ArrowStringArray.
- Series.str
- The string methods are available on Series backed by
- a ArrowStringArray.
- Notes
- -----
- ArrowStringArray returns a BooleanArray for comparison methods.
- Examples
- --------
- >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string[pyarrow]")
- <ArrowStringArray>
- ['This is', 'some text', <NA>, 'data.']
- Length: 4, dtype: string
- """
- # error: Incompatible types in assignment (expression has type "StringDtype",
- # base class "ArrowExtensionArray" defined the type as "ArrowDtype")
- _dtype: StringDtype # type: ignore[assignment]
- def __init__(self, values) -> None:
- super().__init__(values)
- self._dtype = StringDtype(storage="pyarrow")
- if not pa.types.is_string(self._data.type):
- raise ValueError(
- "ArrowStringArray requires a PyArrow (chunked) array of string type"
- )
- def __len__(self) -> int:
- """
- Length of this array.
- Returns
- -------
- length : int
- """
- return len(self._data)
- @classmethod
- def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False):
- from pandas.core.arrays.masked import BaseMaskedArray
- _chk_pyarrow_available()
- if dtype and not (isinstance(dtype, str) and dtype == "string"):
- dtype = pandas_dtype(dtype)
- assert isinstance(dtype, StringDtype) and dtype.storage == "pyarrow"
- if isinstance(scalars, BaseMaskedArray):
- # avoid costly conversion to object dtype in ensure_string_array and
- # numerical issues with Float32Dtype
- na_values = scalars._mask
- result = scalars._data
- result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
- return cls(pa.array(result, mask=na_values, type=pa.string()))
- elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
- return cls(pc.cast(scalars, pa.string()))
- # convert non-na-likes to str
- result = lib.ensure_string_array(scalars, copy=copy)
- return cls(pa.array(result, type=pa.string(), from_pandas=True))
- @classmethod
- def _from_sequence_of_strings(
- cls, strings, dtype: Dtype | None = None, copy: bool = False
- ):
- return cls._from_sequence(strings, dtype=dtype, copy=copy)
- @property
- def dtype(self) -> StringDtype: # type: ignore[override]
- """
- An instance of 'string[pyarrow]'.
- """
- return self._dtype
- def insert(self, loc: int, item) -> ArrowStringArray:
- if not isinstance(item, str) and item is not libmissing.NA:
- raise TypeError("Scalar must be NA or str")
- return super().insert(loc, item)
- def _maybe_convert_setitem_value(self, value):
- """Maybe convert value to be pyarrow compatible."""
- if is_scalar(value):
- if isna(value):
- value = None
- elif not isinstance(value, str):
- raise TypeError("Scalar must be NA or str")
- else:
- value = np.array(value, dtype=object, copy=True)
- value[isna(value)] = None
- for v in value:
- if not (v is None or isinstance(v, str)):
- raise TypeError("Scalar must be NA or str")
- return super()._maybe_convert_setitem_value(value)
- def isin(self, values) -> npt.NDArray[np.bool_]:
- value_set = [
- pa_scalar.as_py()
- for pa_scalar in [pa.scalar(value, from_pandas=True) for value in values]
- if pa_scalar.type in (pa.string(), pa.null())
- ]
- # short-circuit to return all False array.
- if not len(value_set):
- return np.zeros(len(self), dtype=bool)
- result = pc.is_in(self._data, value_set=pa.array(value_set))
- # pyarrow 2.0.0 returned nulls, so we explicily specify dtype to convert nulls
- # to False
- return np.array(result, dtype=np.bool_)
- def astype(self, dtype, copy: bool = True):
- dtype = pandas_dtype(dtype)
- if is_dtype_equal(dtype, self.dtype):
- if copy:
- return self.copy()
- return self
- elif isinstance(dtype, NumericDtype):
- data = self._data.cast(pa.from_numpy_dtype(dtype.numpy_dtype))
- return dtype.__from_arrow__(data)
- elif isinstance(dtype, np.dtype) and np.issubdtype(dtype, np.floating):
- return self.to_numpy(dtype=dtype, na_value=np.nan)
- return super().astype(dtype, copy=copy)
- # ------------------------------------------------------------------------
- # String methods interface
- # error: Incompatible types in assignment (expression has type "NAType",
- # base class "ObjectStringArrayMixin" defined the type as "float")
- _str_na_value = libmissing.NA # type: ignore[assignment]
- def _str_map(
- self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
- ):
- # TODO: de-duplicate with StringArray method. This method is moreless copy and
- # paste.
- from pandas.arrays import (
- BooleanArray,
- IntegerArray,
- )
- if dtype is None:
- dtype = self.dtype
- if na_value is None:
- na_value = self.dtype.na_value
- mask = isna(self)
- arr = np.asarray(self)
- if is_integer_dtype(dtype) or is_bool_dtype(dtype):
- constructor: type[IntegerArray] | type[BooleanArray]
- if is_integer_dtype(dtype):
- constructor = IntegerArray
- else:
- constructor = BooleanArray
- na_value_is_na = isna(na_value)
- if na_value_is_na:
- na_value = 1
- result = lib.map_infer_mask(
- arr,
- f,
- mask.view("uint8"),
- convert=False,
- na_value=na_value,
- # error: Argument 1 to "dtype" has incompatible type
- # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
- # "Type[object]"
- dtype=np.dtype(dtype), # type: ignore[arg-type]
- )
- if not na_value_is_na:
- mask[:] = False
- return constructor(result, mask)
- elif is_string_dtype(dtype) and not is_object_dtype(dtype):
- # i.e. StringDtype
- result = lib.map_infer_mask(
- arr, f, mask.view("uint8"), convert=False, na_value=na_value
- )
- result = pa.array(result, mask=mask, type=pa.string(), from_pandas=True)
- return type(self)(result)
- else:
- # This is when the result type is object. We reach this when
- # -> We know the result type is truly object (e.g. .encode returns bytes
- # or .findall returns a list).
- # -> We don't know the result type. E.g. `.get` can return anything.
- return lib.map_infer_mask(arr, f, mask.view("uint8"))
- def _str_contains(
- self, pat, case: bool = True, flags: int = 0, na=np.nan, regex: bool = True
- ):
- if flags:
- fallback_performancewarning()
- return super()._str_contains(pat, case, flags, na, regex)
- if regex:
- if case is False:
- fallback_performancewarning()
- return super()._str_contains(pat, case, flags, na, regex)
- else:
- result = pc.match_substring_regex(self._data, pat)
- else:
- if case:
- result = pc.match_substring(self._data, pat)
- else:
- result = pc.match_substring(pc.utf8_upper(self._data), pat.upper())
- result = BooleanDtype().__from_arrow__(result)
- if not isna(na):
- result[isna(result)] = bool(na)
- return result
- def _str_startswith(self, pat: str, na=None):
- pat = f"^{re.escape(pat)}"
- return self._str_contains(pat, na=na, regex=True)
- def _str_endswith(self, pat: str, na=None):
- pat = f"{re.escape(pat)}$"
- return self._str_contains(pat, na=na, regex=True)
- def _str_replace(
- self,
- pat: str | re.Pattern,
- repl: str | Callable,
- n: int = -1,
- case: bool = True,
- flags: int = 0,
- regex: bool = True,
- ):
- if isinstance(pat, re.Pattern) or callable(repl) or not case or flags:
- fallback_performancewarning()
- return super()._str_replace(pat, repl, n, case, flags, regex)
- func = pc.replace_substring_regex if regex else pc.replace_substring
- result = func(self._data, pattern=pat, replacement=repl, max_replacements=n)
- return type(self)(result)
- def _str_match(
- self, pat: str, case: bool = True, flags: int = 0, na: Scalar | None = None
- ):
- if not pat.startswith("^"):
- pat = f"^{pat}"
- return self._str_contains(pat, case, flags, na, regex=True)
- def _str_fullmatch(
- self, pat, case: bool = True, flags: int = 0, na: Scalar | None = None
- ):
- if not pat.endswith("$") or pat.endswith("//$"):
- pat = f"{pat}$"
- return self._str_match(pat, case, flags, na)
- def _str_isalnum(self):
- result = pc.utf8_is_alnum(self._data)
- return BooleanDtype().__from_arrow__(result)
- def _str_isalpha(self):
- result = pc.utf8_is_alpha(self._data)
- return BooleanDtype().__from_arrow__(result)
- def _str_isdecimal(self):
- result = pc.utf8_is_decimal(self._data)
- return BooleanDtype().__from_arrow__(result)
- def _str_isdigit(self):
- result = pc.utf8_is_digit(self._data)
- return BooleanDtype().__from_arrow__(result)
- def _str_islower(self):
- result = pc.utf8_is_lower(self._data)
- return BooleanDtype().__from_arrow__(result)
- def _str_isnumeric(self):
- result = pc.utf8_is_numeric(self._data)
- return BooleanDtype().__from_arrow__(result)
- def _str_isspace(self):
- result = pc.utf8_is_space(self._data)
- return BooleanDtype().__from_arrow__(result)
- def _str_istitle(self):
- result = pc.utf8_is_title(self._data)
- return BooleanDtype().__from_arrow__(result)
- def _str_isupper(self):
- result = pc.utf8_is_upper(self._data)
- return BooleanDtype().__from_arrow__(result)
- def _str_len(self):
- result = pc.utf8_length(self._data)
- return Int64Dtype().__from_arrow__(result)
- def _str_lower(self):
- return type(self)(pc.utf8_lower(self._data))
- def _str_upper(self):
- return type(self)(pc.utf8_upper(self._data))
- def _str_strip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_trim_whitespace(self._data)
- else:
- result = pc.utf8_trim(self._data, characters=to_strip)
- return type(self)(result)
- def _str_lstrip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_ltrim_whitespace(self._data)
- else:
- result = pc.utf8_ltrim(self._data, characters=to_strip)
- return type(self)(result)
- def _str_rstrip(self, to_strip=None):
- if to_strip is None:
- result = pc.utf8_rtrim_whitespace(self._data)
- else:
- result = pc.utf8_rtrim(self._data, characters=to_strip)
- return type(self)(result)
|