numpy_.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476
  1. from __future__ import annotations
  2. import numpy as np
  3. from pandas._libs import lib
  4. from pandas._libs.tslibs import (
  5. get_unit_from_dtype,
  6. is_supported_unit,
  7. )
  8. from pandas._typing import (
  9. AxisInt,
  10. Dtype,
  11. NpDtype,
  12. Scalar,
  13. npt,
  14. )
  15. from pandas.compat.numpy import function as nv
  16. from pandas.core.dtypes.astype import astype_array
  17. from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
  18. from pandas.core.dtypes.common import (
  19. is_dtype_equal,
  20. pandas_dtype,
  21. )
  22. from pandas.core.dtypes.dtypes import PandasDtype
  23. from pandas.core.dtypes.missing import isna
  24. from pandas.core import (
  25. arraylike,
  26. nanops,
  27. ops,
  28. )
  29. from pandas.core.arraylike import OpsMixin
  30. from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
  31. from pandas.core.construction import ensure_wrapped_if_datetimelike
  32. from pandas.core.strings.object_array import ObjectStringArrayMixin
class PandasArray(
    OpsMixin,
    NDArrayBackedExtensionArray,
    ObjectStringArrayMixin,
):
    """
    A pandas ExtensionArray for NumPy data.

    This is mostly for internal compatibility, and is not especially
    useful on its own.

    Parameters
    ----------
    values : ndarray
        The NumPy ndarray to wrap. Must be 1-dimensional.
    copy : bool, default False
        Whether to copy `values`.

    Attributes
    ----------
    None

    Methods
    -------
    None
    """

    # If you're wondering why pd.Series(cls) doesn't put the array in an
    # ExtensionBlock, search for `ABCPandasArray`. We check for
    # that _typ to ensure that users don't unnecessarily use EAs inside
    # pandas internals, which turns off things like block consolidation.
    _typ = "npy_extension"
    # Ensure numpy defers binary ops to this class's reflected methods.
    __array_priority__ = 1000
    _ndarray: np.ndarray
    _dtype: PandasDtype
    _internal_fill_value = np.nan

    # ------------------------------------------------------------------------
    # Constructors

    def __init__(self, values: np.ndarray | PandasArray, copy: bool = False) -> None:
        """
        Wrap a NumPy ndarray (or unwrap another PandasArray).

        Raises
        ------
        ValueError
            If ``values`` is not an ndarray, or is 0-dimensional.
        """
        if isinstance(values, type(self)):
            # Unwrap so we never nest PandasArray inside PandasArray.
            values = values._ndarray
        if not isinstance(values, np.ndarray):
            raise ValueError(
                f"'values' must be a NumPy array, not {type(values).__name__}"
            )

        if values.ndim == 0:
            # Technically we support 2, but do not advertise that fact.
            raise ValueError("PandasArray must be 1-dimensional.")

        if copy:
            values = values.copy()

        dtype = PandasDtype(values.dtype)
        super().__init__(values, dtype)

    @classmethod
    def _from_sequence(
        cls, scalars, *, dtype: Dtype | None = None, copy: bool = False
    ) -> PandasArray:
        """Construct a PandasArray from a sequence of scalars."""
        if isinstance(dtype, PandasDtype):
            # Unwrap to the raw numpy dtype so np.asarray accepts it.
            dtype = dtype._dtype

        # error: Argument "dtype" to "asarray" has incompatible type
        # "Union[ExtensionDtype, str, dtype[Any], dtype[floating[_64Bit]], Type[object],
        # None]"; expected "Union[dtype[Any], None, type, _SupportsDType, str,
        # Union[Tuple[Any, int], Tuple[Any, Union[int, Sequence[int]]], List[Any],
        # _DTypeDict, Tuple[Any, Any]]]"
        result = np.asarray(scalars, dtype=dtype)  # type: ignore[arg-type]
        if (
            result.ndim > 1
            and not hasattr(scalars, "dtype")
            and (dtype is None or dtype == object)
        ):
            # e.g. list-of-tuples: keep the tuples as objects in a 1-d array
            # instead of letting numpy broadcast them into a 2-d array.
            result = construct_1d_object_array_from_listlike(scalars)

        if copy and result is scalars:
            # np.asarray returned the input unchanged; honor the copy request.
            result = result.copy()

        return cls(result)

    def _from_backing_data(self, arr: np.ndarray) -> PandasArray:
        # Re-wrap a raw ndarray produced from our backing data.
        return type(self)(arr)

    # ------------------------------------------------------------------------
    # Data

    @property
    def dtype(self) -> PandasDtype:
        """The PandasDtype wrapping the underlying numpy dtype."""
        return self._dtype

    # ------------------------------------------------------------------------
    # NumPy Array Interface

    def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
        # Expose the backing ndarray to numpy (zero-copy when dtype matches).
        return np.asarray(self._ndarray, dtype=dtype)

    def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
        # Lightly modified version of
        # https://numpy.org/doc/stable/reference/generated/numpy.lib.mixins.NDArrayOperatorsMixin.html
        # The primary modification is not boxing scalar return values
        # in PandasArray, since pandas' ExtensionArrays are 1-d.
        out = kwargs.get("out", ())

        # First give pandas' dunder ops (e.g. __add__) a chance to handle it.
        result = ops.maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        if "out" in kwargs:
            # e.g. test_ufunc_unary
            return arraylike.dispatch_ufunc_with_out(
                self, ufunc, method, *inputs, **kwargs
            )

        if method == "reduce":
            result = arraylike.dispatch_reduction_ufunc(
                self, ufunc, method, *inputs, **kwargs
            )
            if result is not NotImplemented:
                # e.g. tests.series.test_ufunc.TestNumpyReductions
                return result

        # Defer to the implementation of the ufunc on unwrapped values.
        inputs = tuple(x._ndarray if isinstance(x, PandasArray) else x for x in inputs)
        if out:
            kwargs["out"] = tuple(
                x._ndarray if isinstance(x, PandasArray) else x for x in out
            )
        result = getattr(ufunc, method)(*inputs, **kwargs)

        if ufunc.nout > 1:
            # multiple return values; re-box array-like results
            return tuple(type(self)(x) for x in result)
        elif method == "at":
            # no return value
            return None
        elif method == "reduce":
            if isinstance(result, np.ndarray):
                # e.g. test_np_reduce_2d
                return type(self)(result)

            # e.g. test_np_max_nested_tuples
            return result
        else:
            # one return value; re-box array-like results
            return type(self)(result)

    # ------------------------------------------------------------------------
    # Pandas ExtensionArray Interface

    def astype(self, dtype, copy: bool = True):
        """
        Cast to the given dtype, returning ``self`` (or a copy) when the
        dtype is unchanged.
        """
        dtype = pandas_dtype(dtype)

        if is_dtype_equal(dtype, self.dtype):
            if copy:
                return self.copy()
            return self

        result = astype_array(self._ndarray, dtype=dtype, copy=copy)
        return result

    def isna(self) -> np.ndarray:
        """Boolean ndarray indicating missing values, per pandas.isna."""
        return isna(self._ndarray)

    def _validate_scalar(self, fill_value):
        # Validate/normalize a scalar fill value before use.
        if fill_value is None:
            # Primarily for subclasses
            fill_value = self.dtype.na_value
        return fill_value

    def _values_for_factorize(self) -> tuple[np.ndarray, float | None]:
        # int/uint/bool dtypes cannot hold NaN, so no sentinel is needed.
        if self.dtype.kind in ["i", "u", "b"]:
            fv = None
        else:
            fv = np.nan
        return self._ndarray, fv

    # ------------------------------------------------------------------------
    # Reductions
    #
    # Each reduction validates the numpy-compat kwargs (out/keepdims/...)
    # via nv, delegates to the matching nanops function (which honors
    # skipna), then wraps the result for the requested axis.

    def any(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return whether any element evaluates truthy, via nanops.nanany."""
        nv.validate_any((), {"out": out, "keepdims": keepdims})
        result = nanops.nanany(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def all(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return whether all elements evaluate truthy, via nanops.nanall."""
        nv.validate_all((), {"out": out, "keepdims": keepdims})
        result = nanops.nanall(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def min(
        self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        """Return the minimum, via nanops.nanmin."""
        nv.validate_min((), kwargs)
        result = nanops.nanmin(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def max(
        self, *, axis: AxisInt | None = None, skipna: bool = True, **kwargs
    ) -> Scalar:
        """Return the maximum, via nanops.nanmax."""
        nv.validate_max((), kwargs)
        result = nanops.nanmax(
            values=self._ndarray, axis=axis, mask=self.isna(), skipna=skipna
        )
        return self._wrap_reduction_result(axis, result)

    def sum(
        self,
        *,
        axis: AxisInt | None = None,
        skipna: bool = True,
        min_count: int = 0,
        **kwargs,
    ) -> Scalar:
        """Return the sum, via nanops.nansum (NA if fewer than min_count values)."""
        nv.validate_sum((), kwargs)
        result = nanops.nansum(
            self._ndarray, axis=axis, skipna=skipna, min_count=min_count
        )
        return self._wrap_reduction_result(axis, result)

    def prod(
        self,
        *,
        axis: AxisInt | None = None,
        skipna: bool = True,
        min_count: int = 0,
        **kwargs,
    ) -> Scalar:
        """Return the product, via nanops.nanprod (NA if fewer than min_count values)."""
        nv.validate_prod((), kwargs)
        result = nanops.nanprod(
            self._ndarray, axis=axis, skipna=skipna, min_count=min_count
        )
        return self._wrap_reduction_result(axis, result)

    def mean(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return the mean, via nanops.nanmean."""
        nv.validate_mean((), {"dtype": dtype, "out": out, "keepdims": keepdims})
        result = nanops.nanmean(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def median(
        self,
        *,
        axis: AxisInt | None = None,
        out=None,
        overwrite_input: bool = False,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return the median, via nanops.nanmedian."""
        nv.validate_median(
            (), {"out": out, "overwrite_input": overwrite_input, "keepdims": keepdims}
        )
        result = nanops.nanmedian(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def std(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return the standard deviation, via nanops.nanstd."""
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="std"
        )
        result = nanops.nanstd(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def var(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return the variance, via nanops.nanvar."""
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="var"
        )
        result = nanops.nanvar(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def sem(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        ddof: int = 1,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return the standard error of the mean, via nanops.nansem."""
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="sem"
        )
        result = nanops.nansem(self._ndarray, axis=axis, skipna=skipna, ddof=ddof)
        return self._wrap_reduction_result(axis, result)

    def kurt(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return the kurtosis, via nanops.nankurt."""
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="kurt"
        )
        result = nanops.nankurt(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    def skew(
        self,
        *,
        axis: AxisInt | None = None,
        dtype: NpDtype | None = None,
        out=None,
        keepdims: bool = False,
        skipna: bool = True,
    ):
        """Return the skewness, via nanops.nanskew."""
        nv.validate_stat_ddof_func(
            (), {"dtype": dtype, "out": out, "keepdims": keepdims}, fname="skew"
        )
        result = nanops.nanskew(self._ndarray, axis=axis, skipna=skipna)
        return self._wrap_reduction_result(axis, result)

    # ------------------------------------------------------------------------
    # Additional Methods

    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert to an ndarray, optionally replacing missing values with
        ``na_value`` and/or casting to ``dtype``.
        """
        mask = self.isna()
        if na_value is not lib.no_default and mask.any():
            # Replace NAs on a copy so the backing array is never mutated.
            result = self._ndarray.copy()
            result[mask] = na_value
        else:
            result = self._ndarray

        result = np.asarray(result, dtype=dtype)

        if copy and result is self._ndarray:
            # np.asarray was zero-copy; honor the copy request explicitly.
            result = result.copy()

        return result

    # ------------------------------------------------------------------------
    # Ops

    def __invert__(self) -> PandasArray:
        return type(self)(~self._ndarray)

    def __neg__(self) -> PandasArray:
        return type(self)(-self._ndarray)

    def __pos__(self) -> PandasArray:
        return type(self)(+self._ndarray)

    def __abs__(self) -> PandasArray:
        return type(self)(abs(self._ndarray))

    def _cmp_method(self, other, op):
        """
        Shared implementation for comparison *and* arithmetic dunders
        (see ``_arith_method`` alias below): unwrap ``other``, dispatch to
        the pandas array op, and re-box ndarray results.
        """
        if isinstance(other, PandasArray):
            other = other._ndarray

        other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
        pd_op = ops.get_array_op(op)
        other = ensure_wrapped_if_datetimelike(other)
        # Suppress numpy warnings (e.g. divide-by-zero); pandas ops define
        # their own semantics for these cases.
        with np.errstate(all="ignore"):
            result = pd_op(self._ndarray, other)

        if op is divmod or op is ops.rdivmod:
            # divmod returns a 2-tuple; wrap each half independently.
            a, b = result
            if isinstance(a, np.ndarray):
                # for e.g. op vs TimedeltaArray, we may already
                # have an ExtensionArray, in which case we do not wrap
                return self._wrap_ndarray_result(a), self._wrap_ndarray_result(b)
            return a, b

        if isinstance(result, np.ndarray):
            # for e.g. multiplication vs TimedeltaArray, we may already
            # have an ExtensionArray, in which case we do not wrap
            return self._wrap_ndarray_result(result)
        return result

    # Arithmetic dunders reuse the same unwrap/dispatch/re-box logic.
    _arith_method = _cmp_method

    def _wrap_ndarray_result(self, result: np.ndarray):
        # If we have timedelta64[ns] result, return a TimedeltaArray instead
        # of a PandasArray
        if result.dtype.kind == "m" and is_supported_unit(
            get_unit_from_dtype(result.dtype)
        ):
            from pandas.core.arrays import TimedeltaArray

            return TimedeltaArray._simple_new(result, dtype=result.dtype)
        return type(self)(result)

    # ------------------------------------------------------------------------
    # String methods interface
    # Sentinel used by ObjectStringArrayMixin for missing string values.
    _str_na_value = np.nan