123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291 |
- from __future__ import annotations
- import numbers
- from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Mapping,
- TypeVar,
- )
- import numpy as np
- from pandas._libs import (
- lib,
- missing as libmissing,
- )
- from pandas._typing import (
- Dtype,
- DtypeObj,
- npt,
- )
- from pandas.errors import AbstractMethodError
- from pandas.util._decorators import cache_readonly
- from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_float_dtype,
- is_integer_dtype,
- is_object_dtype,
- is_string_dtype,
- pandas_dtype,
- )
- from pandas.core.arrays.masked import (
- BaseMaskedArray,
- BaseMaskedDtype,
- )
- if TYPE_CHECKING:
- import pyarrow
- T = TypeVar("T", bound="NumericArray")
class NumericDtype(BaseMaskedDtype):
    """
    Shared dtype behaviour for the masked integer and floating dtypes.

    Subclasses are expected to provide ``_default_np_dtype`` and ``_checker``
    and to implement ``_str_to_dtype_mapping`` and ``_safe_cast``, which only
    raise ``AbstractMethodError`` here.
    """

    _default_np_dtype: np.dtype
    _checker: Callable[[Any], bool]  # is_foo_dtype

    def __repr__(self) -> str:
        return f"{self.name}Dtype()"

    @cache_readonly
    def is_signed_integer(self) -> bool:
        """Whether ``self.kind`` is ``"i"``."""
        return self.kind == "i"

    @cache_readonly
    def is_unsigned_integer(self) -> bool:
        """Whether ``self.kind`` is ``"u"``."""
        return self.kind == "u"

    @property
    def _is_numeric(self) -> bool:
        """Always True for this dtype family."""
        return True

    def __from_arrow__(
        self, array: pyarrow.Array | pyarrow.ChunkedArray
    ) -> BaseMaskedArray:
        """
        Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.

        Raises
        ------
        TypeError
            If the arrow type differs from this dtype and does not round-trip
            to an integer, unsigned integer or float pandas dtype.
        """
        import pyarrow

        from pandas.core.arrays.arrow._arrow_utils import (
            pyarrow_array_to_numpy_and_mask,
        )

        arr_cls = self.construct_array_type()
        target_type = pyarrow.from_numpy_dtype(self.type)

        if not array.type.equals(target_type):
            # test_from_arrow_type_error raise for string, but allow
            #  through itemsize conversion GH#31896
            roundtrip = pandas_dtype(array.type.to_pandas_dtype())
            if roundtrip.kind not in ("i", "u", "f"):
                # Could allow "c" or potentially disallow float<->int
                #  conversion, but at the moment we specifically test that
                #  uint<->int works
                raise TypeError(
                    f"Expected array of {self} type, got {array.type} instead"
                )
            array = array.cast(target_type)

        # A plain Array is handled as a single chunk; anything else is
        # treated as a pyarrow.ChunkedArray.
        chunks = [array] if isinstance(array, pyarrow.Array) else array.chunks

        pieces = []
        for chunk in chunks:
            data, validity = pyarrow_array_to_numpy_and_mask(
                chunk, dtype=self.numpy_dtype
            )
            # The helper's second result is inverted to form the array mask.
            pieces.append(arr_cls(data.copy(), ~validity, copy=False))

        if len(pieces) == 1:
            # avoid additional copy in _concat_same_type
            return pieces[0]
        if pieces:
            return arr_cls._concat_same_type(pieces)
        return arr_cls(
            np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_)
        )

    @classmethod
    def _str_to_dtype_mapping(cls) -> Mapping[str, NumericDtype]:
        """Return the numpy-dtype-name -> dtype mapping; abstract here."""
        raise AbstractMethodError(cls)

    @classmethod
    def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
        """
        Convert a string representation or a numpy dtype to NumericDtype.

        Raises
        ------
        ValueError
            If the numpy dtype name is not a key of ``_str_to_dtype_mapping``.
        """
        if isinstance(dtype, str) and dtype.startswith(("Int", "UInt", "Float")):
            # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
            # https://github.com/numpy/numpy/pull/7476
            dtype = dtype.lower()

        if isinstance(dtype, NumericDtype):
            return dtype

        lookup = cls._str_to_dtype_mapping()
        try:
            return lookup[str(np.dtype(dtype))]
        except KeyError as err:
            raise ValueError(f"invalid dtype specified {dtype}") from err

    @classmethod
    def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
        """
        Safely cast the values to the given dtype.

        "safe" in this context means the casting is lossless.
        """
        raise AbstractMethodError(cls)
def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype):
    """
    Coerce ``values`` (and optionally ``mask``) to a data/mask ndarray pair.

    Parameters
    ----------
    values : array-like or instance of ``dtype_cls.construct_array_type()``
        The input data.
    mask : ndarray or None
        If None, a mask is computed from ``values``; otherwise it must have
        the same length as ``values`` and be one-dimensional.
    dtype : None or anything accepted by ``dtype_cls._standardize_dtype``
        Requested dtype. When None and ``values`` has a ``dtype`` attribute
        accepted by ``dtype_cls._checker``, that dtype is used.
    copy : bool
        Whether the input should be copied.
    dtype_cls : type[NumericDtype]
        Dtype class providing ``_checker``, ``_standardize_dtype``,
        ``construct_array_type`` and ``_safe_cast``.
    default_dtype : np.dtype
        Numpy dtype used when no dtype is given or inferred.

    Returns
    -------
    tuple
        ``(values, mask, dtype, inferred_type)``. ``inferred_type`` is the
        result of ``lib.infer_dtype`` when it was computed, else None. On the
        fast path for an existing masked array, ``dtype`` is the standardized
        dtype (or None); otherwise it is ``default_dtype`` or ``dtype.type``.

    Raises
    ------
    TypeError
        If ``values`` cannot be converted to the requested kind, or if
        ``values`` or ``mask`` is not one-dimensional.
    """
    checker = dtype_cls._checker

    inferred_type = None

    if dtype is None and hasattr(values, "dtype"):
        if checker(values.dtype):
            dtype = values.dtype

    if dtype is not None:
        dtype = dtype_cls._standardize_dtype(dtype)

    cls = dtype_cls.construct_array_type()
    if isinstance(values, cls):
        # Already a masked array of the target class: reuse its buffers.
        values, mask = values._data, values._mask
        if dtype is not None:
            values = values.astype(dtype.numpy_dtype, copy=False)

        if copy:
            values = values.copy()
            mask = mask.copy()
        return values, mask, dtype, inferred_type

    original = values
    # ``np.array(..., copy=False)`` raises on NumPy >= 2 whenever a copy
    # cannot be avoided (e.g. list input); ``np.asarray`` keeps the intended
    # "copy only if needed" behaviour on every NumPy version.
    values = np.array(values, copy=True) if copy else np.asarray(values)
    if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
        inferred_type = lib.infer_dtype(values, skipna=True)
        if inferred_type == "boolean" and dtype is None:
            name = dtype_cls.__name__.strip("_")
            raise TypeError(f"{values.dtype} cannot be converted to {name}")

    elif is_bool_dtype(values) and checker(dtype):
        # Booleans are only accepted when a matching dtype was requested.
        if copy:
            values = np.array(values, dtype=default_dtype, copy=True)
        else:
            values = np.asarray(values, dtype=default_dtype)

    elif not (is_integer_dtype(values) or is_float_dtype(values)):
        name = dtype_cls.__name__.strip("_")
        raise TypeError(f"{values.dtype} cannot be converted to {name}")

    if values.ndim != 1:
        raise TypeError("values must be a 1D list-like")

    if mask is None:
        if is_integer_dtype(values):
            # fastpath
            mask = np.zeros(len(values), dtype=np.bool_)
        else:
            mask = libmissing.is_numeric_na(values)
    else:
        assert len(mask) == len(values)

    if mask.ndim != 1:
        raise TypeError("mask must be a 1D list-like")

    # infer dtype if needed
    if dtype is None:
        dtype = default_dtype
    else:
        dtype = dtype.type

    if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0:
        if mask.all():
            # Everything is masked: the data values are irrelevant.
            values = np.ones(values.shape, dtype=dtype)
        else:
            idx = np.nanargmax(values)
            if int(values[idx]) != original[idx]:
                # We have ints that lost precision during the cast.
                inferred_type = lib.infer_dtype(original, skipna=True)
                if (
                    inferred_type not in ["floating", "mixed-integer-float"]
                    and not mask.any()
                ):
                    values = np.asarray(original, dtype=dtype)
                else:
                    values = np.asarray(original, dtype="object")

    # we copy as need to coerce here
    if mask.any():
        values = values.copy()
        values[mask] = cls._internal_fill_value
    if inferred_type in ("string", "unicode"):
        # casts from str are always safe since they raise
        # a ValueError if the str cannot be parsed into a float
        values = values.astype(dtype, copy=copy)
    else:
        values = dtype_cls._safe_cast(values, dtype, copy=False)

    return values, mask, dtype, inferred_type
class NumericArray(BaseMaskedArray):
    """
    Base class for IntegerArray and FloatingArray.

    Subclasses set ``_dtype_cls`` to the matching ``NumericDtype`` subclass;
    it supplies the dtype checker, the default numpy dtype and the
    numpy-name -> dtype mapping used below.
    """

    _dtype_cls: type[NumericDtype]

    def __init__(
        self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
    ) -> None:
        """
        Validate ``values`` and delegate to the masked-array constructor.

        Raises
        ------
        TypeError
            If ``values`` is not a numpy array accepted by
            ``_dtype_cls._checker``, or if its dtype is ``np.float16``.
        """
        checker = self._dtype_cls._checker
        if not (isinstance(values, np.ndarray) and checker(values.dtype)):
            # ``kind`` is an instance-level property of the dtype, so reading
            # it from the dtype *class* never compares equal to "f" and the
            # message always said "integer". The class-level default numpy
            # dtype carries the kind reliably.
            descr = (
                "floating"
                if self._dtype_cls._default_np_dtype.kind == "f"
                else "integer"
            )
            raise TypeError(
                f"values should be {descr} numpy array. Use "
                "the 'pd.array' function instead"
            )
        if values.dtype == np.float16:
            # If we don't raise here, then accessing self.dtype would raise
            raise TypeError("FloatingArray does not support np.float16 dtype.")

        super().__init__(values, mask, copy=copy)

    @cache_readonly
    def dtype(self) -> NumericDtype:
        """Dtype looked up from the numpy dtype name of the underlying data."""
        mapping = self._dtype_cls._str_to_dtype_mapping()
        return mapping[str(self._data.dtype)]

    @classmethod
    def _coerce_to_array(
        cls, value, *, dtype: DtypeObj, copy: bool = False
    ) -> tuple[np.ndarray, np.ndarray]:
        """
        Coerce ``value`` to a ``(values, mask)`` pair of ndarrays.

        Delegates to ``_coerce_to_data_and_mask`` with no precomputed mask and
        discards the dtype and inferred-type parts of its result.
        """
        dtype_cls = cls._dtype_cls
        default_dtype = dtype_cls._default_np_dtype
        mask = None
        values, mask, _, _ = _coerce_to_data_and_mask(
            value, mask, dtype, copy, dtype_cls, default_dtype
        )
        return values, mask

    @classmethod
    def _from_sequence_of_strings(
        cls: type[T], strings, *, dtype: Dtype | None = None, copy: bool = False
    ) -> T:
        """
        Build an array from strings by parsing them with ``to_numeric``.

        Parsing uses ``errors="raise"`` and the ``"numpy_nullable"`` dtype
        backend; the parsed values are passed to ``_from_sequence``.
        """
        # Imported here rather than at module level, as in the original.
        from pandas.core.tools.numeric import to_numeric

        scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
        return cls._from_sequence(scalars, dtype=dtype, copy=copy)

    _HANDLED_TYPES = (np.ndarray, numbers.Number)
|