numeric.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. from __future__ import annotations
  2. import numbers
  3. from typing import (
  4. TYPE_CHECKING,
  5. Any,
  6. Callable,
  7. Mapping,
  8. TypeVar,
  9. )
  10. import numpy as np
  11. from pandas._libs import (
  12. lib,
  13. missing as libmissing,
  14. )
  15. from pandas._typing import (
  16. Dtype,
  17. DtypeObj,
  18. npt,
  19. )
  20. from pandas.errors import AbstractMethodError
  21. from pandas.util._decorators import cache_readonly
  22. from pandas.core.dtypes.common import (
  23. is_bool_dtype,
  24. is_float_dtype,
  25. is_integer_dtype,
  26. is_object_dtype,
  27. is_string_dtype,
  28. pandas_dtype,
  29. )
  30. from pandas.core.arrays.masked import (
  31. BaseMaskedArray,
  32. BaseMaskedDtype,
  33. )
  34. if TYPE_CHECKING:
  35. import pyarrow
# Type variable bound to NumericArray; it annotates classmethods (see
# ``NumericArray._from_sequence_of_strings``) that return an instance of the
# class they were called on.
T = TypeVar("T", bound="NumericArray")
  37. class NumericDtype(BaseMaskedDtype):
  38. _default_np_dtype: np.dtype
  39. _checker: Callable[[Any], bool] # is_foo_dtype
  40. def __repr__(self) -> str:
  41. return f"{self.name}Dtype()"
  42. @cache_readonly
  43. def is_signed_integer(self) -> bool:
  44. return self.kind == "i"
  45. @cache_readonly
  46. def is_unsigned_integer(self) -> bool:
  47. return self.kind == "u"
  48. @property
  49. def _is_numeric(self) -> bool:
  50. return True
  51. def __from_arrow__(
  52. self, array: pyarrow.Array | pyarrow.ChunkedArray
  53. ) -> BaseMaskedArray:
  54. """
  55. Construct IntegerArray/FloatingArray from pyarrow Array/ChunkedArray.
  56. """
  57. import pyarrow
  58. from pandas.core.arrays.arrow._arrow_utils import (
  59. pyarrow_array_to_numpy_and_mask,
  60. )
  61. array_class = self.construct_array_type()
  62. pyarrow_type = pyarrow.from_numpy_dtype(self.type)
  63. if not array.type.equals(pyarrow_type):
  64. # test_from_arrow_type_error raise for string, but allow
  65. # through itemsize conversion GH#31896
  66. rt_dtype = pandas_dtype(array.type.to_pandas_dtype())
  67. if rt_dtype.kind not in ["i", "u", "f"]:
  68. # Could allow "c" or potentially disallow float<->int conversion,
  69. # but at the moment we specifically test that uint<->int works
  70. raise TypeError(
  71. f"Expected array of {self} type, got {array.type} instead"
  72. )
  73. array = array.cast(pyarrow_type)
  74. if isinstance(array, pyarrow.Array):
  75. chunks = [array]
  76. else:
  77. # pyarrow.ChunkedArray
  78. chunks = array.chunks
  79. results = []
  80. for arr in chunks:
  81. data, mask = pyarrow_array_to_numpy_and_mask(arr, dtype=self.numpy_dtype)
  82. num_arr = array_class(data.copy(), ~mask, copy=False)
  83. results.append(num_arr)
  84. if not results:
  85. return array_class(
  86. np.array([], dtype=self.numpy_dtype), np.array([], dtype=np.bool_)
  87. )
  88. elif len(results) == 1:
  89. # avoid additional copy in _concat_same_type
  90. return results[0]
  91. else:
  92. return array_class._concat_same_type(results)
  93. @classmethod
  94. def _str_to_dtype_mapping(cls) -> Mapping[str, NumericDtype]:
  95. raise AbstractMethodError(cls)
  96. @classmethod
  97. def _standardize_dtype(cls, dtype: NumericDtype | str | np.dtype) -> NumericDtype:
  98. """
  99. Convert a string representation or a numpy dtype to NumericDtype.
  100. """
  101. if isinstance(dtype, str) and (dtype.startswith(("Int", "UInt", "Float"))):
  102. # Avoid DeprecationWarning from NumPy about np.dtype("Int64")
  103. # https://github.com/numpy/numpy/pull/7476
  104. dtype = dtype.lower()
  105. if not isinstance(dtype, NumericDtype):
  106. mapping = cls._str_to_dtype_mapping()
  107. try:
  108. dtype = mapping[str(np.dtype(dtype))]
  109. except KeyError as err:
  110. raise ValueError(f"invalid dtype specified {dtype}") from err
  111. return dtype
  112. @classmethod
  113. def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
  114. """
  115. Safely cast the values to the given dtype.
  116. "safe" in this context means the casting is lossless.
  117. """
  118. raise AbstractMethodError(cls)
  119. def _coerce_to_data_and_mask(values, mask, dtype, copy, dtype_cls, default_dtype):
  120. checker = dtype_cls._checker
  121. inferred_type = None
  122. if dtype is None and hasattr(values, "dtype"):
  123. if checker(values.dtype):
  124. dtype = values.dtype
  125. if dtype is not None:
  126. dtype = dtype_cls._standardize_dtype(dtype)
  127. cls = dtype_cls.construct_array_type()
  128. if isinstance(values, cls):
  129. values, mask = values._data, values._mask
  130. if dtype is not None:
  131. values = values.astype(dtype.numpy_dtype, copy=False)
  132. if copy:
  133. values = values.copy()
  134. mask = mask.copy()
  135. return values, mask, dtype, inferred_type
  136. original = values
  137. values = np.array(values, copy=copy)
  138. inferred_type = None
  139. if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
  140. inferred_type = lib.infer_dtype(values, skipna=True)
  141. if inferred_type == "boolean" and dtype is None:
  142. name = dtype_cls.__name__.strip("_")
  143. raise TypeError(f"{values.dtype} cannot be converted to {name}")
  144. elif is_bool_dtype(values) and checker(dtype):
  145. values = np.array(values, dtype=default_dtype, copy=copy)
  146. elif not (is_integer_dtype(values) or is_float_dtype(values)):
  147. name = dtype_cls.__name__.strip("_")
  148. raise TypeError(f"{values.dtype} cannot be converted to {name}")
  149. if values.ndim != 1:
  150. raise TypeError("values must be a 1D list-like")
  151. if mask is None:
  152. if is_integer_dtype(values):
  153. # fastpath
  154. mask = np.zeros(len(values), dtype=np.bool_)
  155. else:
  156. mask = libmissing.is_numeric_na(values)
  157. else:
  158. assert len(mask) == len(values)
  159. if mask.ndim != 1:
  160. raise TypeError("mask must be a 1D list-like")
  161. # infer dtype if needed
  162. if dtype is None:
  163. dtype = default_dtype
  164. else:
  165. dtype = dtype.type
  166. if is_integer_dtype(dtype) and is_float_dtype(values.dtype) and len(values) > 0:
  167. if mask.all():
  168. values = np.ones(values.shape, dtype=dtype)
  169. else:
  170. idx = np.nanargmax(values)
  171. if int(values[idx]) != original[idx]:
  172. # We have ints that lost precision during the cast.
  173. inferred_type = lib.infer_dtype(original, skipna=True)
  174. if (
  175. inferred_type not in ["floating", "mixed-integer-float"]
  176. and not mask.any()
  177. ):
  178. values = np.array(original, dtype=dtype, copy=False)
  179. else:
  180. values = np.array(original, dtype="object", copy=False)
  181. # we copy as need to coerce here
  182. if mask.any():
  183. values = values.copy()
  184. values[mask] = cls._internal_fill_value
  185. if inferred_type in ("string", "unicode"):
  186. # casts from str are always safe since they raise
  187. # a ValueError if the str cannot be parsed into a float
  188. values = values.astype(dtype, copy=copy)
  189. else:
  190. values = dtype_cls._safe_cast(values, dtype, copy=False)
  191. return values, mask, dtype, inferred_type
  192. class NumericArray(BaseMaskedArray):
  193. """
  194. Base class for IntegerArray and FloatingArray.
  195. """
  196. _dtype_cls: type[NumericDtype]
  197. def __init__(
  198. self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
  199. ) -> None:
  200. checker = self._dtype_cls._checker
  201. if not (isinstance(values, np.ndarray) and checker(values.dtype)):
  202. descr = (
  203. "floating"
  204. if self._dtype_cls.kind == "f" # type: ignore[comparison-overlap]
  205. else "integer"
  206. )
  207. raise TypeError(
  208. f"values should be {descr} numpy array. Use "
  209. "the 'pd.array' function instead"
  210. )
  211. if values.dtype == np.float16:
  212. # If we don't raise here, then accessing self.dtype would raise
  213. raise TypeError("FloatingArray does not support np.float16 dtype.")
  214. super().__init__(values, mask, copy=copy)
  215. @cache_readonly
  216. def dtype(self) -> NumericDtype:
  217. mapping = self._dtype_cls._str_to_dtype_mapping()
  218. return mapping[str(self._data.dtype)]
  219. @classmethod
  220. def _coerce_to_array(
  221. cls, value, *, dtype: DtypeObj, copy: bool = False
  222. ) -> tuple[np.ndarray, np.ndarray]:
  223. dtype_cls = cls._dtype_cls
  224. default_dtype = dtype_cls._default_np_dtype
  225. mask = None
  226. values, mask, _, _ = _coerce_to_data_and_mask(
  227. value, mask, dtype, copy, dtype_cls, default_dtype
  228. )
  229. return values, mask
  230. @classmethod
  231. def _from_sequence_of_strings(
  232. cls: type[T], strings, *, dtype: Dtype | None = None, copy: bool = False
  233. ) -> T:
  234. from pandas.core.tools.numeric import to_numeric
  235. scalars = to_numeric(strings, errors="raise", dtype_backend="numpy_nullable")
  236. return cls._from_sequence(scalars, dtype=dtype, copy=copy)
  237. _HANDLED_TYPES = (np.ndarray, numbers.Number)