123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608 |
- from __future__ import annotations
- from typing import (
- TYPE_CHECKING,
- Literal,
- )
- import numpy as np
- from pandas._config import get_option
- from pandas._libs import (
- lib,
- missing as libmissing,
- )
- from pandas._libs.arrays import NDArrayBacked
- from pandas._typing import (
- AxisInt,
- Dtype,
- Scalar,
- npt,
- type_t,
- )
- from pandas.compat import pa_version_under7p0
- from pandas.compat.numpy import function as nv
- from pandas.util._decorators import doc
- from pandas.core.dtypes.base import (
- ExtensionDtype,
- StorageExtensionDtype,
- register_extension_dtype,
- )
- from pandas.core.dtypes.common import (
- is_array_like,
- is_bool_dtype,
- is_dtype_equal,
- is_integer_dtype,
- is_object_dtype,
- is_string_dtype,
- pandas_dtype,
- )
- from pandas.core import ops
- from pandas.core.array_algos import masked_reductions
- from pandas.core.arrays import (
- ExtensionArray,
- FloatingArray,
- IntegerArray,
- )
- from pandas.core.arrays.floating import FloatingDtype
- from pandas.core.arrays.integer import IntegerDtype
- from pandas.core.arrays.numpy_ import PandasArray
- from pandas.core.construction import extract_array
- from pandas.core.indexers import check_array_indexer
- from pandas.core.missing import isna
- if TYPE_CHECKING:
- import pyarrow
- from pandas._typing import (
- NumpySorter,
- NumpyValueArrayLike,
- )
- from pandas import Series
@register_extension_dtype
class StringDtype(StorageExtensionDtype):
    """
    Extension dtype for string data.

    .. warning::

       StringDtype is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    storage : {"python", "pyarrow"}, optional
        If not given, the value of ``pd.options.mode.string_storage``.

    Attributes
    ----------
    None

    Methods
    -------
    None

    Examples
    --------
    >>> pd.StringDtype()
    string[python]

    >>> pd.StringDtype(storage="pyarrow")
    string[pyarrow]
    """

    # Name under which this dtype is registered (``pandas_dtype("string")``).
    name = "string"

    #: StringDtype().na_value uses pandas.NA
    @property
    def na_value(self) -> libmissing.NAType:
        return libmissing.NA

    # Only ``storage`` participates in equality/hash of the dtype.
    _metadata = ("storage",)

    def __init__(self, storage=None) -> None:
        # Default storage comes from the global option so users can opt
        # into pyarrow-backed strings globally.
        if storage is None:
            storage = get_option("mode.string_storage")
        if storage not in {"python", "pyarrow"}:
            raise ValueError(
                f"Storage must be 'python' or 'pyarrow'. Got {storage} instead."
            )
        if storage == "pyarrow" and pa_version_under7p0:
            raise ImportError(
                "pyarrow>=7.0.0 is required for PyArrow backed StringArray."
            )
        self.storage = storage

    @property
    def type(self) -> type[str]:
        # Scalar type of the elements.
        return str

    @classmethod
    def construct_from_string(cls, string):
        """
        Construct a StringDtype from a string.

        Parameters
        ----------
        string : str
            The type of the name. The storage type will be taking from `string`.
            Valid options and their storage types are

            ========================== ==============================================
            string                     result storage
            ========================== ==============================================
            ``'string'``               pd.options.mode.string_storage, default python
            ``'string[python]'``       python
            ``'string[pyarrow]'``      pyarrow
            ========================== ==============================================

        Returns
        -------
        StringDtype

        Raise
        -----
        TypeError
            If the string is not a valid option.
        """
        if not isinstance(string, str):
            raise TypeError(
                f"'construct_from_string' expects a string, got {type(string)}"
            )
        if string == "string":
            return cls()
        elif string == "string[python]":
            return cls(storage="python")
        elif string == "string[pyarrow]":
            return cls(storage="pyarrow")
        else:
            raise TypeError(f"Cannot construct a '{cls.__name__}' from '{string}'")

    # https://github.com/pandas-dev/pandas/issues/36126
    # error: Signature of "construct_array_type" incompatible with supertype
    # "ExtensionDtype"
    def construct_array_type(  # type: ignore[override]
        self,
    ) -> type_t[BaseStringArray]:
        """
        Return the array type associated with this dtype.

        Returns
        -------
        type
        """
        from pandas.core.arrays.string_arrow import ArrowStringArray

        if self.storage == "python":
            return StringArray
        else:
            return ArrowStringArray

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseStringArray:
        """
        Construct StringArray from pyarrow Array/ChunkedArray.
        """
        if self.storage == "pyarrow":
            from pandas.core.arrays.string_arrow import ArrowStringArray

            return ArrowStringArray(array)
        else:
            import pyarrow

            if isinstance(array, pyarrow.Array):
                chunks = [array]
            else:
                # pyarrow.ChunkedArray
                chunks = array.chunks

            results = []
            for arr in chunks:
                # using _from_sequence to ensure None is converted to NA
                str_arr = StringArray._from_sequence(np.array(arr))
                results.append(str_arr)

        if results:
            return StringArray._concat_same_type(results)
        else:
            # No chunks: return an empty python-backed StringArray.
            return StringArray(np.array([], dtype="object"))
class BaseStringArray(ExtensionArray):
    """
    Mixin class for StringArray, ArrowStringArray.
    """

    @doc(ExtensionArray.tolist)
    def tolist(self):
        # One-dimensional data converts directly via the numpy materialization.
        if self.ndim <= 1:
            return list(self.to_numpy())
        # Multi-dimensional: convert each sub-array recursively so the
        # nesting of the result mirrors the array's shape.
        return [sub.tolist() for sub in self]
class StringArray(BaseStringArray, PandasArray):
    """
    Extension array for string data.

    .. warning::

       StringArray is considered experimental. The implementation and
       parts of the API may change without warning.

    Parameters
    ----------
    values : array-like
        The array of data.

        .. warning::

           Currently, this expects an object-dtype ndarray
           where the elements are Python strings
           or nan-likes (``None``, ``np.nan``, ``NA``).
           This may change without warning in the future. Use
           :meth:`pandas.array` with ``dtype="string"`` for a stable way of
           creating a `StringArray` from any sequence.

        .. versionchanged:: 1.5.0

           StringArray now accepts array-likes containing
           nan-likes(``None``, ``np.nan``) for the ``values`` parameter
           in addition to strings and :attr:`pandas.NA`

    copy : bool, default False
        Whether to copy the array of data.

    Attributes
    ----------
    None

    Methods
    -------
    None

    See Also
    --------
    :func:`pandas.array`
        The recommended function for creating a StringArray.
    Series.str
        The string methods are available on Series backed by
        a StringArray.

    Notes
    -----
    StringArray returns a BooleanArray for comparison methods.

    Examples
    --------
    >>> pd.array(['This is', 'some text', None, 'data.'], dtype="string")
    <StringArray>
    ['This is', 'some text', <NA>, 'data.']
    Length: 4, dtype: string

    Unlike arrays instantiated with ``dtype="object"``, ``StringArray``
    will convert the values to strings.

    >>> pd.array(['1', 1], dtype="object")
    <PandasArray>
    ['1', 1]
    Length: 2, dtype: object
    >>> pd.array(['1', 1], dtype="string")
    <StringArray>
    ['1', '1']
    Length: 2, dtype: string

    However, instantiating StringArrays directly with non-strings will raise an error.

    For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:

    >>> pd.array(["a", None, "c"], dtype="string") == "a"
    <BooleanArray>
    [True, <NA>, False]
    Length: 3, dtype: boolean
    """

    # undo the PandasArray hack
    _typ = "extension"

    def __init__(self, values, copy: bool = False) -> None:
        values = extract_array(values)

        super().__init__(values, copy=copy)
        # Skip validation when wrapping an existing StringArray — it has
        # already been validated.
        if not isinstance(values, type(self)):
            self._validate()
        NDArrayBacked.__init__(self, self._ndarray, StringDtype(storage="python"))

    def _validate(self):
        """Validate that we only store NA or strings."""
        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
            raise ValueError("StringArray requires a sequence of strings or pandas.NA")
        if self._ndarray.dtype != "object":
            raise ValueError(
                "StringArray requires a sequence of strings or pandas.NA. Got "
                f"'{self._ndarray.dtype}' dtype instead."
            )
        # Check to see if need to convert Na values to pd.NA
        if self._ndarray.ndim > 2:
            # Ravel if ndims > 2 b/c no cythonized version available
            lib.convert_nans_to_NA(self._ndarray.ravel("K"))
        else:
            lib.convert_nans_to_NA(self._ndarray)

    @classmethod
    def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = False):
        if dtype and not (isinstance(dtype, str) and dtype == "string"):
            dtype = pandas_dtype(dtype)
            assert isinstance(dtype, StringDtype) and dtype.storage == "python"

        from pandas.core.arrays.masked import BaseMaskedArray

        if isinstance(scalars, BaseMaskedArray):
            # avoid costly conversion to object dtype
            na_values = scalars._mask
            result = scalars._data
            result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
            result[na_values] = libmissing.NA

        else:
            if hasattr(scalars, "type"):
                # pyarrow array
                scalars = np.array(scalars)
            # convert non-na-likes to str, and nan-likes to StringDtype().na_value
            result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)

        # Manually creating new array avoids the validation step in the __init__, so is
        # faster. Refactor need for validation?
        new_string_array = cls.__new__(cls)
        NDArrayBacked.__init__(new_string_array, result, StringDtype(storage="python"))

        return new_string_array

    @classmethod
    def _from_sequence_of_strings(
        cls, strings, *, dtype: Dtype | None = None, copy: bool = False
    ):
        return cls._from_sequence(strings, dtype=dtype, copy=copy)

    @classmethod
    def _empty(cls, shape, dtype) -> StringArray:
        # All-NA object array; astype normalizes to the requested string dtype.
        values = np.empty(shape, dtype=object)
        values[:] = libmissing.NA
        return cls(values).astype(dtype, copy=False)

    def __arrow_array__(self, type=None):
        """
        Convert myself into a pyarrow Array.
        """
        import pyarrow as pa

        if type is None:
            type = pa.string()

        values = self._ndarray.copy()
        # pyarrow understands None (not pd.NA) as the missing marker.
        values[self.isna()] = None
        return pa.array(values, type=type, from_pandas=True)

    def _values_for_factorize(self):
        arr = self._ndarray.copy()
        mask = self.isna()
        # Use None as the sentinel so factorize groups missing values together.
        arr[mask] = None
        return arr, None

    def __setitem__(self, key, value):
        value = extract_array(value, extract_numpy=True)
        if isinstance(value, type(self)):
            # extract_array doesn't extract PandasArray subclasses
            value = value._ndarray

        key = check_array_indexer(self, key)
        scalar_key = lib.is_scalar(key)
        scalar_value = lib.is_scalar(value)
        if scalar_key and not scalar_value:
            raise ValueError("setting an array element with a sequence.")

        # validate new items
        if scalar_value:
            if isna(value):
                value = libmissing.NA
            elif not isinstance(value, str):
                raise TypeError(
                    f"Cannot set non-string value '{value}' into a StringArray."
                )
        else:
            if not is_array_like(value):
                value = np.asarray(value, dtype=object)
            if len(value) and not lib.is_string_array(value, skipna=True):
                raise TypeError("Must provide strings.")

            # Normalize any nan-likes in the incoming values to pd.NA.
            mask = isna(value)
            if mask.any():
                value = value.copy()
                value[isna(value)] = libmissing.NA

        super().__setitem__(key, value)

    def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
        # the super() method NDArrayBackedExtensionArray._putmask uses
        # np.putmask which doesn't properly handle None/pd.NA, so using the
        # base class implementation that uses __setitem__
        ExtensionArray._putmask(self, mask, value)

    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)

        if is_dtype_equal(dtype, self.dtype):
            if copy:
                return self.copy()
            return self

        elif isinstance(dtype, IntegerDtype):
            # Fill NAs with a dummy 0 so the numpy cast succeeds; the mask
            # preserves which positions were missing.
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype.numpy_dtype)
            return IntegerArray(values, mask, copy=False)
        elif isinstance(dtype, FloatingDtype):
            # Same dummy-fill trick; "0" because elements must stay strings
            # until the recursive astype below does the numeric cast.
            arr = self.copy()
            mask = self.isna()
            arr[mask] = "0"
            values = arr.astype(dtype.numpy_dtype)
            return FloatingArray(values, mask, copy=False)
        elif isinstance(dtype, ExtensionDtype):
            # Skip the PandasArray.astype method
            return ExtensionArray.astype(self, dtype, copy)
        elif np.issubdtype(dtype, np.floating):
            # Plain numpy float result: NAs become np.nan.
            arr = self._ndarray.copy()
            mask = self.isna()
            arr[mask] = 0
            values = arr.astype(dtype)
            values[mask] = np.nan
            return values

        return super().astype(dtype, copy)

    def _reduce(
        self, name: str, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs
    ):
        # Only ordering-based reductions are defined for strings.
        if name in ["min", "max"]:
            return getattr(self, name)(skipna=skipna, axis=axis)

        raise TypeError(f"Cannot perform reduction '{name}' with string dtype")

    def min(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_min((), kwargs)
        result = masked_reductions.min(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(self, axis=None, skipna: bool = True, **kwargs) -> Scalar:
        nv.validate_max((), kwargs)
        result = masked_reductions.max(
            values=self.to_numpy(), mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def value_counts(self, dropna: bool = True) -> Series:
        from pandas import value_counts

        # Counts are nullable Int64; the index keeps the string dtype.
        result = value_counts(self._ndarray, dropna=dropna).astype("Int64")
        result.index = result.index.astype(self.dtype)
        return result

    def memory_usage(self, deep: bool = False) -> int:
        result = self._ndarray.nbytes
        if deep:
            # Include the memory held by the boxed Python string objects.
            return result + lib.memory_usage_of_objects(self._ndarray)
        return result

    @doc(ExtensionArray.searchsorted)
    def searchsorted(
        self,
        value: NumpyValueArrayLike | ExtensionArray,
        side: Literal["left", "right"] = "left",
        sorter: NumpySorter = None,
    ) -> npt.NDArray[np.intp] | np.intp:
        if self._hasna:
            raise ValueError(
                "searchsorted requires array to be sorted, which is impossible "
                "with NAs present."
            )
        return super().searchsorted(value=value, side=side, sorter=sorter)

    def _cmp_method(self, other, op):
        from pandas.arrays import BooleanArray

        if isinstance(other, StringArray):
            other = other._ndarray

        mask = isna(self) | isna(other)
        valid = ~mask

        if not lib.is_scalar(other):
            if len(other) != len(self):
                # prevent improper broadcasting when other is 2D
                raise ValueError(
                    f"Lengths of operands do not match: {len(self)} != {len(other)}"
                )

            other = np.asarray(other)
            # Operate only on the positions where both sides are non-NA.
            other = other[valid]

        if op.__name__ in ops.ARITHMETIC_BINOPS:
            result = np.empty_like(self._ndarray, dtype="object")
            result[mask] = libmissing.NA
            result[valid] = op(self._ndarray[valid], other)
            return StringArray(result)
        else:
            # logical
            result = np.zeros(len(self._ndarray), dtype="bool")
            result[valid] = op(self._ndarray[valid], other)
            return BooleanArray(result, mask)

    # String arithmetic (e.g. +) shares the masked implementation above.
    _arith_method = _cmp_method

    # ------------------------------------------------------------------------
    # String methods interface
    # error: Incompatible types in assignment (expression has type "NAType",
    # base class "PandasArray" defined the type as "float")
    _str_na_value = libmissing.NA  # type: ignore[assignment]

    def _str_map(
        self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True
    ):
        from pandas.arrays import BooleanArray

        if dtype is None:
            dtype = StringDtype(storage="python")
        if na_value is None:
            na_value = self.dtype.na_value

        mask = isna(self)
        arr = np.asarray(self)

        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
            constructor: type[IntegerArray] | type[BooleanArray]
            if is_integer_dtype(dtype):
                constructor = IntegerArray
            else:
                constructor = BooleanArray

            na_value_is_na = isna(na_value)
            if na_value_is_na:
                # Placeholder valid for both int and bool results; the mask
                # marks these positions as missing in the constructed array.
                na_value = 1

            result = lib.map_infer_mask(
                arr,
                f,
                mask.view("uint8"),
                convert=False,
                na_value=na_value,
                # error: Argument 1 to "dtype" has incompatible type
                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
                # "Type[object]"
                dtype=np.dtype(dtype),  # type: ignore[arg-type]
            )

            if not na_value_is_na:
                # Caller supplied a concrete fill value, so nothing is missing.
                mask[:] = False

            return constructor(result, mask)

        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
            # i.e. StringDtype
            result = lib.map_infer_mask(
                arr, f, mask.view("uint8"), convert=False, na_value=na_value
            )
            return StringArray(result)
        else:
            # This is when the result type is object. We reach this when
            # -> We know the result type is truly object (e.g. .encode returns bytes
            #    or .findall returns a list).
            # -> We don't know the result type. E.g. `.get` can return anything.
            return lib.map_infer_mask(arr, f, mask.view("uint8"))
|