12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391 |
- from __future__ import annotations
- from typing import (
- TYPE_CHECKING,
- Any,
- Iterator,
- Literal,
- Sequence,
- TypeVar,
- overload,
- )
- import warnings
- import numpy as np
- from pandas._libs import (
- lib,
- missing as libmissing,
- )
- from pandas._libs.tslibs import (
- get_unit_from_dtype,
- is_supported_unit,
- )
- from pandas._typing import (
- ArrayLike,
- AstypeArg,
- AxisInt,
- DtypeObj,
- NpDtype,
- PositionalIndexer,
- Scalar,
- ScalarIndexer,
- SequenceIndexer,
- Shape,
- npt,
- )
- from pandas.errors import AbstractMethodError
- from pandas.util._decorators import doc
- from pandas.util._validators import validate_fillna_kwargs
- from pandas.core.dtypes.base import ExtensionDtype
- from pandas.core.dtypes.common import (
- is_bool,
- is_bool_dtype,
- is_datetime64_dtype,
- is_dtype_equal,
- is_float_dtype,
- is_integer_dtype,
- is_list_like,
- is_object_dtype,
- is_scalar,
- is_string_dtype,
- pandas_dtype,
- )
- from pandas.core.dtypes.dtypes import BaseMaskedDtype
- from pandas.core.dtypes.inference import is_array_like
- from pandas.core.dtypes.missing import (
- array_equivalent,
- is_valid_na_for_dtype,
- isna,
- notna,
- )
- from pandas.core import (
- algorithms as algos,
- arraylike,
- missing,
- nanops,
- ops,
- )
- from pandas.core.algorithms import (
- factorize_array,
- isin,
- take,
- )
- from pandas.core.array_algos import (
- masked_accumulations,
- masked_reductions,
- )
- from pandas.core.array_algos.quantile import quantile_with_mask
- from pandas.core.arraylike import OpsMixin
- from pandas.core.arrays import ExtensionArray
- from pandas.core.construction import ensure_wrapped_if_datetimelike
- from pandas.core.indexers import check_array_indexer
- from pandas.core.ops import invalid_comparison
- if TYPE_CHECKING:
- from pandas import Series
- from pandas.core.arrays import BooleanArray
- from pandas._typing import (
- NumpySorter,
- NumpyValueArrayLike,
- )
- from pandas.compat.numpy import function as nv
- BaseMaskedArrayT = TypeVar("BaseMaskedArrayT", bound="BaseMaskedArray")
- class BaseMaskedArray(OpsMixin, ExtensionArray):
- """
- Base class for masked arrays (which use _data and _mask to store the data).
- numpy based
- """
- # The value used to fill '_data' to avoid upcasting
- _internal_fill_value: Scalar
- # our underlying data and mask are each ndarrays
- _data: np.ndarray
- _mask: npt.NDArray[np.bool_]
- # Fill values used for any/all
- _truthy_value = Scalar # bool(_truthy_value) = True
- _falsey_value = Scalar # bool(_falsey_value) = False
- def __init__(
- self, values: np.ndarray, mask: npt.NDArray[np.bool_], copy: bool = False
- ) -> None:
- # values is supposed to already be validated in the subclass
- if not (isinstance(mask, np.ndarray) and mask.dtype == np.bool_):
- raise TypeError(
- "mask should be boolean numpy array. Use "
- "the 'pd.array' function instead"
- )
- if values.shape != mask.shape:
- raise ValueError("values.shape must match mask.shape")
- if copy:
- values = values.copy()
- mask = mask.copy()
- self._data = values
- self._mask = mask
- @classmethod
- def _from_sequence(
- cls: type[BaseMaskedArrayT], scalars, *, dtype=None, copy: bool = False
- ) -> BaseMaskedArrayT:
- values, mask = cls._coerce_to_array(scalars, dtype=dtype, copy=copy)
- return cls(values, mask)
@property
def dtype(self) -> BaseMaskedDtype:
    """
    The masked dtype of this array.

    The base class provides no implementation and raises
    ``AbstractMethodError``.
    """
    raise AbstractMethodError(self)
- @overload
- def __getitem__(self, item: ScalarIndexer) -> Any:
- ...
- @overload
- def __getitem__(self: BaseMaskedArrayT, item: SequenceIndexer) -> BaseMaskedArrayT:
- ...
- def __getitem__(
- self: BaseMaskedArrayT, item: PositionalIndexer
- ) -> BaseMaskedArrayT | Any:
- item = check_array_indexer(self, item)
- newmask = self._mask[item]
- if is_bool(newmask):
- # This is a scalar indexing
- if newmask:
- return self.dtype.na_value
- return self._data[item]
- return type(self)(self._data[item], newmask)
@doc(ExtensionArray.fillna)
def fillna(
    self: BaseMaskedArrayT, value=None, method=None, limit=None
) -> BaseMaskedArrayT:
    # The user-facing docstring is supplied by the @doc decorator above, so
    # this method is documented with comments only.
    #
    # Returns a new array; ``self`` is not modified (every path works on a
    # copy). Raises ValueError when an array-like ``value`` does not have
    # the same length as ``self``.
    value, method = validate_fillna_kwargs(value, method)

    mask = self._mask

    if is_array_like(value):
        if len(value) != len(self):
            # The second fragment must not start with a space: the first one
            # already ends with one, and the original message rendered as
            # "Got (n)  expected m" with a doubled space.
            raise ValueError(
                f"Length of 'value' does not match. Got ({len(value)}) "
                f"expected {len(self)}"
            )
        # Only the entries lining up with missing positions are needed.
        value = value[mask]

    if mask.any():
        if method is not None:
            # Method-based fill operates on transposed copies of the data and
            # mask; the fill function's return value is unused, so it is
            # presumably applied in place.
            func = missing.get_fill_func(method, ndim=self.ndim)
            npvalues = self._data.copy().T
            new_mask = mask.copy().T
            func(npvalues, limit=limit, mask=new_mask)
            return type(self)(npvalues.T, new_mask.T)
        else:
            # fill with value
            new_values = self.copy()
            new_values[mask] = value
    else:
        # Nothing is missing: hand back an unmodified copy.
        new_values = self.copy()
    return new_values
@classmethod
def _coerce_to_array(
    cls, values, *, dtype: DtypeObj, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
    """
    Convert ``values`` into a pair of ndarrays.

    The base class provides no implementation and raises
    ``AbstractMethodError``. Callers in this class unpack the result as
    ``(values, mask)``.
    """
    raise AbstractMethodError(cls)
- def _validate_setitem_value(self, value):
- """
- Check if we have a scalar that we can cast losslessly.
- Raises
- ------
- TypeError
- """
- kind = self.dtype.kind
- # TODO: get this all from np_can_hold_element?
- if kind == "b":
- if lib.is_bool(value):
- return value
- elif kind == "f":
- if lib.is_integer(value) or lib.is_float(value):
- return value
- else:
- if lib.is_integer(value) or (lib.is_float(value) and value.is_integer()):
- return value
- # TODO: unsigned checks
- # Note: without the "str" here, the f-string rendering raises in
- # py38 builds.
- raise TypeError(f"Invalid value '{str(value)}' for dtype {self.dtype}")
- def __setitem__(self, key, value) -> None:
- key = check_array_indexer(self, key)
- if is_scalar(value):
- if is_valid_na_for_dtype(value, self.dtype):
- self._mask[key] = True
- else:
- value = self._validate_setitem_value(value)
- self._data[key] = value
- self._mask[key] = False
- return
- value, mask = self._coerce_to_array(value, dtype=self.dtype)
- self._data[key] = value
- self._mask[key] = mask
- def __iter__(self) -> Iterator:
- if self.ndim == 1:
- if not self._hasna:
- for val in self._data:
- yield val
- else:
- na_value = self.dtype.na_value
- for isna_, val in zip(self._mask, self._data):
- if isna_:
- yield na_value
- else:
- yield val
- else:
- for i in range(len(self)):
- yield self[i]
def __len__(self) -> int:
    """Return the length of the underlying ``_data`` array."""
    return len(self._data)
@property
def shape(self) -> Shape:
    """Shape of the underlying ``_data`` array."""
    return self._data.shape
@property
def ndim(self) -> int:
    """Number of dimensions of the underlying ``_data`` array."""
    return self._data.ndim
def swapaxes(self: BaseMaskedArrayT, axis1, axis2) -> BaseMaskedArrayT:
    """
    Return a new array of the same type with ``axis1`` and ``axis2``
    interchanged in both the data and the mask.
    """
    return type(self)(
        self._data.swapaxes(axis1, axis2),
        self._mask.swapaxes(axis1, axis2),
    )
def delete(self: BaseMaskedArrayT, loc, axis: AxisInt = 0) -> BaseMaskedArrayT:
    """
    Return a new array of the same type with the entries at ``loc`` removed
    along ``axis`` from both the data and the mask.
    """
    return type(self)(
        np.delete(self._data, loc, axis=axis),
        np.delete(self._mask, loc, axis=axis),
    )
def reshape(self: BaseMaskedArrayT, *args, **kwargs) -> BaseMaskedArrayT:
    """
    Return a new array of the same type whose data and mask are both
    reshaped; all arguments are forwarded to ``ndarray.reshape``.
    """
    return type(self)(
        self._data.reshape(*args, **kwargs),
        self._mask.reshape(*args, **kwargs),
    )
def ravel(self: BaseMaskedArrayT, *args, **kwargs) -> BaseMaskedArrayT:
    """
    Return a flattened array of the same type; all arguments are forwarded
    to ``ndarray.ravel`` for both the data and the mask.
    """
    # TODO: need to make sure we have the same order for data/mask
    return type(self)(
        self._data.ravel(*args, **kwargs),
        self._mask.ravel(*args, **kwargs),
    )
@property
def T(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    """Return a new array of the same type wrapping ``_data.T`` and ``_mask.T``."""
    return type(self)(self._data.T, self._mask.T)
def round(self, decimals: int = 0, *args, **kwargs):
    """
    Round each value in the array to the given number of decimals.

    Parameters
    ----------
    decimals : int, default 0
        Number of decimal places to round to. If decimals is negative,
        it specifies the number of positions to the left of the decimal point.
    *args, **kwargs
        Additional arguments and keywords have no effect but might be
        accepted for compatibility with NumPy.

    Returns
    -------
    NumericArray
        Rounded values of the NumericArray.

    See Also
    --------
    numpy.around : Round values of an np.array.
    DataFrame.round : Round values of a DataFrame.
    Series.round : Round values of a Series.
    """
    nv.validate_round(args, kwargs)
    rounded = np.round(self._data, decimals=decimals, **kwargs)
    mask_copy = self._mask.copy()

    # Usually we'll get same type as self, but ndarray[bool] casts to float
    return self._maybe_mask_result(rounded, mask_copy)
- # ------------------------------------------------------------------
- # Unary Methods
- def __invert__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
- return type(self)(~self._data, self._mask.copy())
- def __neg__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
- return type(self)(-self._data, self._mask.copy())
def __pos__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    """Unary plus: return the result of ``self.copy()``."""
    return self.copy()
- def __abs__(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
- return type(self)(abs(self._data), self._mask.copy())
- # ------------------------------------------------------------------
def to_numpy(
    self,
    dtype: npt.DTypeLike | None = None,
    copy: bool = False,
    na_value: object = lib.no_default,
) -> np.ndarray:
    """
    Convert to a NumPy Array.

    By default converts to an object-dtype NumPy array. Specify the `dtype` and
    `na_value` keywords to customize the conversion.

    Parameters
    ----------
    dtype : dtype, default object
        The numpy dtype to convert to.
    copy : bool, default False
        Whether to ensure that the returned value is not a view on
        the array. Note that ``copy=False`` does not *ensure* that
        ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
        a copy is made, even if not strictly necessary. When missing
        values are present a copy is always made, regardless of `copy`.
    na_value : scalar, optional
        Scalar missing value indicator to use in numpy array. Defaults
        to the native missing value indicator of this array (pd.NA).

    Returns
    -------
    numpy.ndarray

    Raises
    ------
    ValueError
        If missing values are present, `na_value` is left at pd.NA, and
        `dtype` is neither an object nor a string dtype.

    Examples
    --------
    An object-dtype is the default result

    >>> a = pd.array([True, False, pd.NA], dtype="boolean")
    >>> a.to_numpy()
    array([True, False, <NA>], dtype=object)

    When no missing values are present, an equivalent dtype can be used.

    >>> pd.array([True, False], dtype="boolean").to_numpy(dtype="bool")
    array([ True, False])
    >>> pd.array([1, 2], dtype="Int64").to_numpy("int64")
    array([1, 2])

    However, requesting such dtype will raise a ValueError if
    missing values are present and the default missing value :attr:`NA`
    is used.

    >>> a = pd.array([True, False, pd.NA], dtype="boolean")
    >>> a
    <BooleanArray>
    [True, False, <NA>]
    Length: 3, dtype: boolean

    >>> a.to_numpy(dtype="bool")
    Traceback (most recent call last):
    ...
    ValueError: cannot convert to 'bool'-dtype NumPy array with missing values. Specify an appropriate 'na_value' for this dtype.

    Specify a valid `na_value` instead

    >>> a.to_numpy(dtype="bool", na_value=False)
    array([ True, False, False])
    """
    if na_value is lib.no_default:
        na_value = libmissing.NA
    if dtype is None:
        dtype = object

    has_na = self._hasna
    if (
        has_na
        and na_value is libmissing.NA
        and not is_object_dtype(dtype)
        and not is_string_dtype(dtype)
    ):
        raise ValueError(
            f"cannot convert to '{dtype}'-dtype NumPy array "
            "with missing values. Specify an appropriate 'na_value' "
            "for this dtype."
        )

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=RuntimeWarning)
        if has_na:
            # don't pass copy to astype -> always need a copy since we are
            # mutating the result below
            data = self._data.astype(dtype)
        else:
            data = self._data.astype(dtype, copy=copy)

    if has_na:
        data[self._mask] = na_value
    return data
@doc(ExtensionArray.tolist)
def tolist(self):
    # Docstring is supplied by the @doc decorator.
    if self.ndim <= 1:
        # With missing values fall back to to_numpy's default dtype;
        # otherwise keep the dtype of the underlying data.
        target_dtype = self._data.dtype if not self._hasna else None
        return self.to_numpy(dtype=target_dtype).tolist()
    return [row.tolist() for row in self]
@overload
def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray:
    ...

@overload
def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray:
    ...

@overload
def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike:
    ...

def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
    """
    Cast to ``dtype``.

    The target is resolved in this order:

    1. Same dtype as ``self``: return ``self`` (or a copy when ``copy``).
    2. Another masked dtype: cast ``_data`` to its numpy dtype and build the
       target's array type from the cast data and the mask.
    3. Any other extension dtype: defer to that array type's
       ``_from_sequence``.
    4. A numpy dtype: go through ``to_numpy`` with a dtype-appropriate
       ``na_value``.

    Raises
    ------
    ValueError
        When casting to an integer or bool numpy dtype while missing values
        are present.
    """
    dtype = pandas_dtype(dtype)

    if is_dtype_equal(dtype, self.dtype):
        if copy:
            return self.copy()
        return self

    # if we are astyping to another nullable masked dtype, we can fastpath
    if isinstance(dtype, BaseMaskedDtype):
        # TODO deal with NaNs for FloatingArray case
        with warnings.catch_warnings():
            warnings.filterwarnings("ignore", category=RuntimeWarning)
            # TODO: Is rounding what we want long term?
            data = self._data.astype(dtype.numpy_dtype, copy=copy)
        # mask is copied depending on whether the data was copied, and
        # not directly depending on the `copy` keyword
        mask = self._mask if data is self._data else self._mask.copy()
        cls = dtype.construct_array_type()
        return cls(data, mask, copy=False)

    if isinstance(dtype, ExtensionDtype):
        eacls = dtype.construct_array_type()
        return eacls._from_sequence(self, dtype=dtype, copy=copy)

    na_value: float | np.datetime64 | lib.NoDefault

    # coerce
    if is_float_dtype(dtype):
        # In astype, we consider dtype=float to also mean na_value=np.nan
        na_value = np.nan
    elif is_datetime64_dtype(dtype):
        na_value = np.datetime64("NaT")
    else:
        # Let to_numpy apply its own default missing-value marker.
        na_value = lib.no_default

    # to_numpy will also raise, but we get somewhat nicer exception messages here
    if is_integer_dtype(dtype) and self._hasna:
        raise ValueError("cannot convert NA to integer")
    if is_bool_dtype(dtype) and self._hasna:
        # careful: astype_nansafe converts np.nan to True
        raise ValueError("cannot convert float NaN to bool")

    data = self.to_numpy(dtype=dtype, na_value=na_value, copy=copy)
    return data
- __array_priority__ = 1000 # higher than ndarray so ops dispatch to us
def __array__(self, dtype: NpDtype | None = None) -> np.ndarray:
    """
    NumPy array interface: return the values as an ndarray.

    Delegates to ``to_numpy(dtype=dtype)``. When ``dtype`` is None the
    result is an object array, which preserves our scalar values.
    """
    return self.to_numpy(dtype=dtype)
- _HANDLED_TYPES: tuple[type, ...]
def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
    """
    NumPy ufunc protocol hook.

    Dispatch order: reject unsupported operand types, try the matching
    dunder operator, handle an explicit ``out=``, try the reduction
    dispatcher for ``reduce``, and finally apply the ufunc to the raw data
    and re-wrap the result with the combined mask.
    """
    # For MaskedArray inputs, we apply the ufunc to ._data
    # and mask the result.

    out = kwargs.get("out", ())

    # Bail out (letting NumPy try other operands) if any input or output is
    # of a type we do not handle.
    for x in inputs + out:
        if not isinstance(x, self._HANDLED_TYPES + (BaseMaskedArray,)):
            return NotImplemented

    # for binary ops, use our custom dunder methods
    result = ops.maybe_dispatch_ufunc_to_dunder_op(
        self, ufunc, method, *inputs, **kwargs
    )
    if result is not NotImplemented:
        return result

    if "out" in kwargs:
        # e.g. test_ufunc_with_out
        return arraylike.dispatch_ufunc_with_out(
            self, ufunc, method, *inputs, **kwargs
        )

    if method == "reduce":
        result = arraylike.dispatch_reduction_ufunc(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

    # Union of the masks of all masked inputs; the raw data arrays are what
    # actually get passed to the ufunc.
    mask = np.zeros(len(self), dtype=bool)
    inputs2 = []
    for x in inputs:
        if isinstance(x, BaseMaskedArray):
            mask |= x._mask
            inputs2.append(x._data)
        else:
            inputs2.append(x)

    def reconstruct(x):
        # Wrap a raw ndarray result in the masked array type matching its
        # dtype. Scalar `x` is not handled here: "reduce" results are
        # returned by a separate branch below and never reach this helper.
        from pandas.core.arrays import (
            BooleanArray,
            FloatingArray,
            IntegerArray,
        )

        if is_bool_dtype(x.dtype):
            m = mask.copy()
            return BooleanArray(x, m)
        elif is_integer_dtype(x.dtype):
            m = mask.copy()
            return IntegerArray(x, m)
        elif is_float_dtype(x.dtype):
            m = mask.copy()
            if x.dtype == np.float16:
                # reached in e.g. np.sqrt on BooleanArray
                # we don't support float16
                x = x.astype(np.float32)
            return FloatingArray(x, m)
        else:
            # Any other dtype stays a plain ndarray; masked slots get NaN.
            x[mask] = np.nan
        return x

    result = getattr(ufunc, method)(*inputs2, **kwargs)
    if ufunc.nout > 1:
        # e.g. np.divmod
        return tuple(reconstruct(x) for x in result)
    elif method == "reduce":
        # e.g. np.add.reduce; test_ufunc_reduce_raises
        if self._mask.any():
            return self._na_value
        return result
    else:
        return reconstruct(result)
def __arrow_array__(self, type=None):
    """
    Convert myself into a pyarrow Array.

    ``_data`` and ``_mask`` are passed to ``pyarrow.array`` together with
    the optional ``type``. pyarrow is imported lazily inside the method.
    """
    import pyarrow as pa

    return pa.array(self._data, mask=self._mask, type=type)
@property
def _hasna(self) -> bool:
    """Whether any position is masked; computed as ``self._mask.any()``."""
    # Note: this is expensive right now! The hope is that we can
    # make this faster by having an optional mask, but not have to change
    # source code using it..

    # error: Incompatible return value type (got "bool_", expected "bool")
    return self._mask.any()  # type: ignore[return-value]
- def _propagate_mask(
- self, mask: npt.NDArray[np.bool_] | None, other
- ) -> npt.NDArray[np.bool_]:
- if mask is None:
- mask = self._mask.copy() # TODO: need test for BooleanArray needing a copy
- if other is libmissing.NA:
- # GH#45421 don't alter inplace
- mask = mask | True
- elif is_list_like(other) and len(other) == len(mask):
- mask = mask | isna(other)
- else:
- mask = self._mask | mask
- # Incompatible return value type (got "Optional[ndarray[Any, dtype[bool_]]]",
- # expected "ndarray[Any, dtype[bool_]]")
- return mask # type: ignore[return-value]
def _arith_method(self, other, op):
    """
    Apply the arithmetic operator ``op`` between ``self`` and ``other``.

    The operation runs on the raw data; the result mask is derived from the
    operands' masks (with special cases for ``pow``/``rpow``) and both are
    combined by ``_maybe_mask_result``.

    Raises
    ------
    NotImplementedError
        If ``other`` is list-like with more than one dimension, or for
        division/power operators between a bool-kind array and NA.
    """
    op_name = op.__name__
    omask = None

    if (
        not hasattr(other, "dtype")
        and is_list_like(other)
        and len(other) == len(self)
    ):
        # Try inferring masked dtype instead of casting to object
        inferred_dtype = lib.infer_dtype(other, skipna=True)
        if inferred_dtype == "integer":
            from pandas.core.arrays import IntegerArray

            other = IntegerArray._from_sequence(other)
        elif inferred_dtype in ["floating", "mixed-integer-float"]:
            from pandas.core.arrays import FloatingArray

            other = FloatingArray._from_sequence(other)

        elif inferred_dtype in ["boolean"]:
            from pandas.core.arrays import BooleanArray

            other = BooleanArray._from_sequence(other)

    if isinstance(other, BaseMaskedArray):
        # Split a masked operand into its raw data and mask.
        other, omask = other._data, other._mask

    elif is_list_like(other):
        if not isinstance(other, ExtensionArray):
            other = np.asarray(other)
        if other.ndim > 1:
            raise NotImplementedError("can only perform ops with 1-d structures")

    # We wrap the non-masked arithmetic logic used for numpy dtypes
    # in Series/Index arithmetic ops.
    other = ops.maybe_prepare_scalar_for_op(other, (len(self),))
    pd_op = ops.get_array_op(op)
    other = ensure_wrapped_if_datetimelike(other)

    if op_name in {"pow", "rpow"} and isinstance(other, np.bool_):
        # Avoid DeprecationWarning: In future, it will be an error
        # for 'np.bool_' scalars to be interpreted as an index
        # e.g. test_array_scalar_like_equivalence
        other = bool(other)

    mask = self._propagate_mask(omask, other)

    if other is libmissing.NA:
        # Placeholder data: only its dtype matters here, since the mask
        # computed above is all-True for an NA operand.
        result = np.ones_like(self._data)
        if self.dtype.kind == "b":
            if op_name in {
                "floordiv",
                "rfloordiv",
                "pow",
                "rpow",
                "truediv",
                "rtruediv",
            }:
                # GH#41165 Try to match non-masked Series behavior
                # This is still imperfect GH#46043
                raise NotImplementedError(
                    f"operator '{op_name}' not implemented for bool dtypes"
                )
            if op_name in {"mod", "rmod"}:
                dtype = "int8"
            else:
                dtype = "bool"
            result = result.astype(dtype)
        elif "truediv" in op_name and self.dtype.kind != "f":
            # The actual data here doesn't matter since the mask
            # will be all-True, but since this is division, we want
            # to end up with floating dtype.
            result = result.astype(np.float64)
    else:
        # Make sure we do this before the "pow" mask checks
        # to get an expected exception message on shape mismatch.
        if self.dtype.kind in ["i", "u"] and op_name in ["floordiv", "mod"]:
            # TODO(GH#30188) ATM we don't match the behavior of non-masked
            # types with respect to floordiv-by-zero
            pd_op = op

        with np.errstate(all="ignore"):
            result = pd_op(self._data, other)

    if op_name == "pow":
        # 1 ** x is 1.
        mask = np.where((self._data == 1) & ~self._mask, False, mask)
        # x ** 0 is 1.
        if omask is not None:
            mask = np.where((other == 0) & ~omask, False, mask)
        elif other is not libmissing.NA:
            mask = np.where(other == 0, False, mask)

    elif op_name == "rpow":
        # 1 ** x is 1.
        if omask is not None:
            mask = np.where((other == 1) & ~omask, False, mask)
        elif other is not libmissing.NA:
            mask = np.where(other == 1, False, mask)
        # x ** 0 is 1.
        mask = np.where((self._data == 0) & ~self._mask, False, mask)

    return self._maybe_mask_result(result, mask)
- _logical_method = _arith_method
def _cmp_method(self, other, op) -> BooleanArray:
    """
    Apply the comparison operator ``op`` between ``self`` and ``other``.

    Returns a ``BooleanArray`` holding the element-wise result together
    with the mask propagated from both operands.

    Raises
    ------
    NotImplementedError
        If ``other`` is list-like with more than one dimension.
    ValueError
        If ``other`` is list-like and its length differs from ours.
    """
    from pandas.core.arrays import BooleanArray

    mask = None

    if isinstance(other, BaseMaskedArray):
        # Split a masked operand into its raw data and mask.
        other, mask = other._data, other._mask

    elif is_list_like(other):
        other = np.asarray(other)
        if other.ndim > 1:
            raise NotImplementedError("can only perform ops with 1-d structures")
        if len(self) != len(other):
            raise ValueError("Lengths must match to compare")

    if other is libmissing.NA:
        # numpy does not handle pd.NA well as "other" scalar (it returns
        # a scalar False instead of an array)
        # This may be fixed by NA.__array_ufunc__. Revisit this check
        # once that's implemented.
        result = np.zeros(self._data.shape, dtype="bool")
        mask = np.ones(self._data.shape, dtype="bool")
    else:
        with warnings.catch_warnings():
            # numpy may show a FutureWarning or DeprecationWarning:
            # elementwise comparison failed; returning scalar instead,
            # but in the future will perform elementwise comparison
            # before returning NotImplemented. We fall back to the correct
            # behavior today, so that should be fine to ignore.
            warnings.filterwarnings("ignore", "elementwise", FutureWarning)
            warnings.filterwarnings("ignore", "elementwise", DeprecationWarning)
            with np.errstate(all="ignore"):
                # Call the ndarray's own comparison dunder, e.g. __eq__.
                method = getattr(self._data, f"__{op.__name__}__")
                result = method(other)

            if result is NotImplemented:
                result = invalid_comparison(self._data, other, op)

    mask = self._propagate_mask(mask, other)
    return BooleanArray(result, mask, copy=False)
def _maybe_mask_result(self, result, mask):
    """
    Wrap a raw operation result in the array type matching its dtype.

    Parameters
    ----------
    result : array-like or tuple[array-like]
    mask : array-like bool

    Returns
    -------
    FloatingArray, BooleanArray, IntegerArray, TimedeltaArray, array-like,
    or a 2-tuple of those when ``result`` is a tuple. Results that are not
    wrapped in a masked array get their masked positions overwritten with
    NaT (timedelta) or NaN (anything else) in place.
    """
    if isinstance(result, tuple):
        # i.e. divmod
        div, mod = result
        return (
            self._maybe_mask_result(div, mask),
            self._maybe_mask_result(mod, mask),
        )

    if is_float_dtype(result.dtype):
        from pandas.core.arrays import FloatingArray

        return FloatingArray(result, mask, copy=False)

    elif is_bool_dtype(result.dtype):
        from pandas.core.arrays import BooleanArray

        return BooleanArray(result, mask, copy=False)

    elif (
        isinstance(result.dtype, np.dtype)
        and result.dtype.kind == "m"
        and is_supported_unit(get_unit_from_dtype(result.dtype))
    ):
        # e.g. test_numeric_arr_mul_tdscalar_numexpr_path
        from pandas.core.arrays import TimedeltaArray

        if not isinstance(result, TimedeltaArray):
            result = TimedeltaArray._simple_new(result, dtype=result.dtype)

        result[mask] = result.dtype.type("NaT")
        return result

    elif is_integer_dtype(result.dtype):
        from pandas.core.arrays import IntegerArray

        return IntegerArray(result, mask, copy=False)

    else:
        # No masked container for this dtype: mark masked slots with NaN.
        result[mask] = np.nan
        return result
def isna(self) -> np.ndarray:
    """Return a copy of the boolean mask."""
    return self._mask.copy()
@property
def _na_value(self):
    """The missing-value scalar, taken from ``self.dtype.na_value``."""
    return self.dtype.na_value
@property
def nbytes(self) -> int:
    """Combined ``nbytes`` of the data array and the mask array."""
    return self._data.nbytes + self._mask.nbytes
- @classmethod
- def _concat_same_type(
- cls: type[BaseMaskedArrayT],
- to_concat: Sequence[BaseMaskedArrayT],
- axis: AxisInt = 0,
- ) -> BaseMaskedArrayT:
- data = np.concatenate([x._data for x in to_concat], axis=axis)
- mask = np.concatenate([x._mask for x in to_concat], axis=axis)
- return cls(data, mask)
- def take(
- self: BaseMaskedArrayT,
- indexer,
- *,
- allow_fill: bool = False,
- fill_value: Scalar | None = None,
- axis: AxisInt = 0,
- ) -> BaseMaskedArrayT:
- # we always fill with 1 internally
- # to avoid upcasting
- data_fill_value = self._internal_fill_value if isna(fill_value) else fill_value
- result = take(
- self._data,
- indexer,
- fill_value=data_fill_value,
- allow_fill=allow_fill,
- axis=axis,
- )
- mask = take(
- self._mask, indexer, fill_value=True, allow_fill=allow_fill, axis=axis
- )
- # if we are filling
- # we only fill where the indexer is null
- # not existing missing values
- # TODO(jreback) what if we have a non-na float as a fill value?
- if allow_fill and notna(fill_value):
- fill_mask = np.asarray(indexer) == -1
- result[fill_mask] = fill_value
- mask = mask ^ fill_mask
- return type(self)(result, mask, copy=False)
- # error: Return type "BooleanArray" of "isin" incompatible with return type
- # "ndarray" in supertype "ExtensionArray"
- def isin(self, values) -> BooleanArray: # type: ignore[override]
- from pandas.core.arrays import BooleanArray
- # algorithms.isin will eventually convert values to an ndarray, so no extra
- # cost to doing it here first
- values_arr = np.asarray(values)
- result = isin(self._data, values_arr)
- if self._hasna:
- values_have_NA = is_object_dtype(values_arr.dtype) and any(
- val is self.dtype.na_value for val in values_arr
- )
- # For now, NA does not propagate so set result according to presence of NA,
- # see https://github.com/pandas-dev/pandas/pull/38379 for some discussion
- result[self._mask] = values_have_NA
- mask = np.zeros(self._data.shape, dtype=bool)
- return BooleanArray(result, mask, copy=False)
def copy(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    """Return a new array of the same type holding copies of data and mask."""
    return type(self)(self._data.copy(), self._mask.copy(), copy=False)
def unique(self: BaseMaskedArrayT) -> BaseMaskedArrayT:
    """
    Compute the BaseMaskedArray of unique values.

    The computation is delegated to ``algos.unique_with_mask``, which is
    given both the data and the mask and returns a ``(uniques, mask)`` pair.

    Returns
    -------
    uniques : BaseMaskedArray
    """
    uniques, mask = algos.unique_with_mask(self._data, self._mask)
    return type(self)(uniques, mask, copy=False)
@doc(ExtensionArray.searchsorted)
def searchsorted(
    self,
    value: NumpyValueArrayLike | ExtensionArray,
    side: Literal["left", "right"] = "left",
    sorter: NumpySorter = None,
) -> npt.NDArray[np.intp] | np.intp:
    # Docstring is supplied by the @doc decorator.
    # Raises ValueError when any value is missing.
    if self._hasna:
        raise ValueError(
            "searchsorted requires array to be sorted, which is impossible "
            "with NAs present."
        )

    needle = value.astype(object) if isinstance(value, ExtensionArray) else value

    # Base class searchsorted would cast to object, which is *much* slower.
    return self._data.searchsorted(needle, side=side, sorter=sorter)
@doc(ExtensionArray.factorize)
def factorize(
    self,
    use_na_sentinel: bool = True,
) -> tuple[np.ndarray, ExtensionArray]:
    # Docstring is supplied by the @doc decorator.
    arr = self._data
    mask = self._mask

    # Use a sentinel for na; recode and add NA to uniques if necessary below
    codes, uniques = factorize_array(arr, use_na_sentinel=True, mask=mask)

    # check that factorize_array correctly preserves dtype.
    assert uniques.dtype == self.dtype.numpy_dtype, (uniques.dtype, self.dtype)

    has_na = mask.any()
    if use_na_sentinel or not has_na:
        size = len(uniques)
    else:
        # Make room for an NA value
        size = len(uniques) + 1
    uniques_mask = np.zeros(size, dtype=bool)
    if not use_na_sentinel and has_na:
        # Position of the first missing value in the original array.
        na_index = mask.argmax()
        # Insert na with the proper code
        if na_index == 0:
            na_code = np.intp(0)
        else:
            # NA gets the code right after the largest code that appears
            # before the first missing value.
            # mypy error: Slice index must be an integer or None
            # https://github.com/python/mypy/issues/2410
            na_code = codes[:na_index].max() + 1  # type: ignore[misc]
        # Shift existing codes up to make room, then replace the sentinel.
        codes[codes >= na_code] += 1
        codes[codes == -1] = na_code
        # dummy value for uniques; not used since uniques_mask will be True
        uniques = np.insert(uniques, na_code, 0)
        uniques_mask[na_code] = True
    uniques_ea = type(self)(uniques, uniques_mask)

    return codes, uniques_ea
@doc(ExtensionArray._values_for_argsort)
def _values_for_argsort(self) -> np.ndarray:
    # Docstring is supplied by the @doc decorator.
    # Returns the raw data array itself (not a copy); the mask is not
    # consulted here.
    return self._data
def value_counts(self, dropna: bool = True) -> Series:
    """
    Returns a Series containing counts of each unique value.

    Parameters
    ----------
    dropna : bool, default True
        Don't include counts of missing values.

    Returns
    -------
    counts : Series
        Named "count". With ``dropna=False`` one extra entry is appended
        whose index label is the dtype's ``na_value`` and whose value is
        the number of masked positions.

    See Also
    --------
    Series.value_counts
    """
    from pandas import (
        Index,
        Series,
    )
    from pandas.arrays import IntegerArray

    # Missing values are always dropped here; their count is added back
    # from the mask below when requested.
    keys, value_counts = algos.value_counts_arraylike(
        self._data, dropna=True, mask=self._mask
    )

    if dropna:
        res = Series(value_counts, index=keys, name="count", copy=False)
        res.index = res.index.astype(self.dtype)
        res = res.astype("Int64")
        return res

    # if we want nans, count the mask
    counts = np.empty(len(value_counts) + 1, dtype="int64")
    counts[:-1] = value_counts
    counts[-1] = self._mask.sum()

    index = Index(keys, dtype=self.dtype).insert(len(keys), self.dtype.na_value)
    index = index.astype(self.dtype)

    # The counts themselves are never missing, hence the all-False mask.
    mask = np.zeros(len(counts), dtype="bool")
    counts_array = IntegerArray(counts, mask)

    return Series(counts_array, index=index, name="count", copy=False)
@doc(ExtensionArray.equals)
def equals(self, other) -> bool:
    # NOTE: the user-facing docstring is injected by the ``@doc`` decorator,
    # so this method is documented with comments only.

    # Exact same class and same dtype are prerequisites for equality.
    if type(self) != type(other) or other.dtype != self.dtype:
        return False

    # GH#44382 if e.g. self[1] is np.nan and other[1] is pd.NA, we are NOT
    # equal: the masks have to match position by position.
    if not np.array_equal(self._mask, other._mask):
        return False

    # Only the unmasked entries are compared; whatever sits under the mask
    # is ignored.
    ours = self._data[~self._mask]
    theirs = other._data[~other._mask]
    return array_equivalent(ours, theirs, dtype_equal=True)
def _quantile(
    self, qs: npt.NDArray[np.float64], interpolation: str
) -> BaseMaskedArray:
    """
    Compute quantiles by dispatching to ``quantile_with_mask``.

    This is needed because we do not have ``_from_factorized``.

    Parameters
    ----------
    qs : np.ndarray[float64]
        Quantiles, forwarded to ``quantile_with_mask``.
    interpolation : str
        Interpolation method, forwarded to ``quantile_with_mask``.

    Returns
    -------
    BaseMaskedArray
        Built by ``_maybe_mask_result``; its mask is all-True when every
        element of ``self`` is NA and all-False otherwise.

    Raises
    ------
    NotImplementedError
        If the array has NAs and is 2-dimensional.

    Notes
    -----
    We assume that all impacted cases are 1D-only.
    """
    res = quantile_with_mask(
        self._data,
        mask=self._mask,
        # TODO(GH#40932): na_value_for_dtype(self.dtype.numpy_dtype)
        #  instead of np.nan
        fill_value=np.nan,
        qs=qs,
        interpolation=interpolation,
    )

    # Our result mask is all-False unless we are all-NA, in which
    # case it is all-True.
    out_mask = np.zeros(res.shape, dtype=bool)

    if self._hasna:
        if self.ndim == 2:
            # I think this should be out_mask=self.isna().all(axis=1)
            #  but am holding off until we have tests
            raise NotImplementedError

        if self.isna().all():
            out_mask = np.ones(res.shape, dtype=bool)
            if is_integer_dtype(self.dtype):
                # We try to maintain int dtype if possible for not all-na case
                # as well
                res = np.zeros(res.shape, dtype=self.dtype.numpy_dtype)

    return self._maybe_mask_result(res, mask=out_mask)
- # ------------------------------------------------------------------
- # Reductions
def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
    """
    Dispatch the reduction called ``name``.

    Parameters
    ----------
    name : str
        Reduction name.  ``any``, ``all``, ``min``, ``max``, ``sum``,
        ``prod``, ``mean``, ``var`` and ``std`` are served by the method of
        the same name on this object; every other name is looked up as
        ``nan<name>`` in ``nanops``.
    skipna : bool, default True
        Forwarded to the selected implementation.
    **kwargs
        Forwarded to the selected implementation.

    Returns
    -------
    The reduction result; on the ``nanops`` path a NaN result is replaced
    by ``libmissing.NA``.
    """
    own_reductions = {
        "any",
        "all",
        "min",
        "max",
        "sum",
        "prod",
        "mean",
        "var",
        "std",
    }
    if name in own_reductions:
        return getattr(self, name)(skipna=skipna, **kwargs)

    # median, skew, kurt, sem
    nan_op = getattr(nanops, f"nan{name}")
    outcome = nan_op(self._data, axis=0, skipna=skipna, mask=self._mask, **kwargs)

    return libmissing.NA if np.isnan(outcome) else outcome
def _wrap_reduction_result(self, name: str, result, skipna, **kwargs):
    """
    Attach a mask to an array-valued reduction result.

    Parameters
    ----------
    name : str
        Name of the reduction; not used by this method.
    result : scalar or np.ndarray
        Raw reduction result.
    skipna : bool
        Selects how the output mask is derived from ``self._mask``.
    **kwargs
        Must contain ``"axis"`` whenever ``result`` is an ndarray.

    Returns
    -------
    ``result`` unchanged when it is not an ndarray; otherwise whatever
    ``_maybe_mask_result`` builds from ``result`` and the reduced mask.
    """
    if not isinstance(result, np.ndarray):
        return result

    axis = kwargs["axis"]
    # With skipna we only retain the mask for all-NA rows/columns; without
    # it, a single NA along the axis is enough to mask the output.
    collapse = self._mask.all if skipna else self._mask.any
    return self._maybe_mask_result(result, collapse(axis=axis))
def sum(
    self,
    *,
    skipna: bool = True,
    min_count: int = 0,
    axis: AxisInt | None = 0,
    **kwargs,
):
    """
    Reduce the array with ``masked_reductions.sum``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``masked_reductions.sum``.
    min_count : int, default 0
        Forwarded to ``masked_reductions.sum``.
    axis : int or None, default 0
        Axis handed to the reduction and to ``_wrap_reduction_result``.
    **kwargs
        Checked with ``nv.validate_sum``.  An ``out`` entry is accepted
        only when it is None, and is discarded.

    Returns
    -------
    The reduction result after ``_wrap_reduction_result``.

    Raises
    ------
    NotImplementedError
        If a non-None ``out`` gets past ``nv.validate_sum``.
    """
    nv.validate_sum((), kwargs)

    # TODO: do this in validate_sum?
    # np.sum passes ``out``; see test_floating_array_numpy_sum
    if kwargs.get("out") is not None:
        raise NotImplementedError
    kwargs.pop("out", None)

    total = masked_reductions.sum(
        self._data, self._mask, skipna=skipna, min_count=min_count, axis=axis
    )
    return self._wrap_reduction_result(
        "sum", total, skipna=skipna, axis=axis, **kwargs
    )
def prod(
    self,
    *,
    skipna: bool = True,
    min_count: int = 0,
    axis: AxisInt | None = 0,
    **kwargs,
):
    """
    Reduce the array with ``masked_reductions.prod``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``masked_reductions.prod``.
    min_count : int, default 0
        Forwarded to ``masked_reductions.prod``.
    axis : int or None, default 0
        Axis handed to the reduction and to ``_wrap_reduction_result``.
    **kwargs
        Checked with ``nv.validate_prod`` and passed on to
        ``_wrap_reduction_result``.

    Returns
    -------
    The reduction result after ``_wrap_reduction_result``.
    """
    nv.validate_prod((), kwargs)

    product = masked_reductions.prod(
        self._data, self._mask, skipna=skipna, min_count=min_count, axis=axis
    )
    return self._wrap_reduction_result(
        "prod", product, skipna=skipna, axis=axis, **kwargs
    )
def mean(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
    """
    Reduce the array with ``masked_reductions.mean``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``masked_reductions.mean``.
    axis : int or None, default 0
        Axis handed to the reduction and to ``_wrap_reduction_result``.
    **kwargs
        Checked with ``nv.validate_mean`` and passed on to
        ``_wrap_reduction_result``.

    Returns
    -------
    The reduction result after ``_wrap_reduction_result``.
    """
    nv.validate_mean((), kwargs)

    average = masked_reductions.mean(
        self._data, self._mask, skipna=skipna, axis=axis
    )
    return self._wrap_reduction_result(
        "mean", average, skipna=skipna, axis=axis, **kwargs
    )
def var(
    self, *, skipna: bool = True, axis: AxisInt | None = 0, ddof: int = 1, **kwargs
):
    """
    Reduce the array with ``masked_reductions.var``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``masked_reductions.var``.
    axis : int or None, default 0
        Axis handed to the reduction and to ``_wrap_reduction_result``.
    ddof : int, default 1
        Forwarded to ``masked_reductions.var``.
    **kwargs
        Checked with ``nv.validate_stat_ddof_func`` and passed on to
        ``_wrap_reduction_result``.

    Returns
    -------
    The reduction result after ``_wrap_reduction_result``.
    """
    nv.validate_stat_ddof_func((), kwargs, fname="var")

    variance = masked_reductions.var(
        self._data, self._mask, skipna=skipna, axis=axis, ddof=ddof
    )
    return self._wrap_reduction_result(
        "var", variance, skipna=skipna, axis=axis, **kwargs
    )
def std(
    self, *, skipna: bool = True, axis: AxisInt | None = 0, ddof: int = 1, **kwargs
):
    """
    Reduce the array with ``masked_reductions.std``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``masked_reductions.std``.
    axis : int or None, default 0
        Axis handed to the reduction and to ``_wrap_reduction_result``.
    ddof : int, default 1
        Forwarded to ``masked_reductions.std``.
    **kwargs
        Checked with ``nv.validate_stat_ddof_func`` and passed on to
        ``_wrap_reduction_result``.

    Returns
    -------
    The reduction result after ``_wrap_reduction_result``.
    """
    nv.validate_stat_ddof_func((), kwargs, fname="std")

    deviation = masked_reductions.std(
        self._data, self._mask, skipna=skipna, axis=axis, ddof=ddof
    )
    return self._wrap_reduction_result(
        "std", deviation, skipna=skipna, axis=axis, **kwargs
    )
def min(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
    """
    Reduce the array with ``masked_reductions.min``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``masked_reductions.min``.
    axis : int or None, default 0
        Axis handed to the reduction and to ``_wrap_reduction_result``.
    **kwargs
        Checked with ``nv.validate_min``; otherwise unused.

    Returns
    -------
    The reduction result after ``_wrap_reduction_result``.
    """
    nv.validate_min((), kwargs)
    result = masked_reductions.min(
        self._data,
        self._mask,
        skipna=skipna,
        axis=axis,
    )
    # Route the result through _wrap_reduction_result like sum/prod/mean/
    # var/std do, so an ndarray result gets a mask attached instead of being
    # returned bare.  Non-ndarray results pass through unchanged.
    return self._wrap_reduction_result("min", result, skipna=skipna, axis=axis)
def max(self, *, skipna: bool = True, axis: AxisInt | None = 0, **kwargs):
    """
    Reduce the array with ``masked_reductions.max``.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``masked_reductions.max``.
    axis : int or None, default 0
        Axis handed to the reduction and to ``_wrap_reduction_result``.
    **kwargs
        Checked with ``nv.validate_max``; otherwise unused.

    Returns
    -------
    The reduction result after ``_wrap_reduction_result``.
    """
    nv.validate_max((), kwargs)
    result = masked_reductions.max(
        self._data,
        self._mask,
        skipna=skipna,
        axis=axis,
    )
    # Route the result through _wrap_reduction_result like sum/prod/mean/
    # var/std do, so an ndarray result gets a mask attached instead of being
    # returned bare.  Non-ndarray results pass through unchanged.
    return self._wrap_reduction_result("max", result, skipna=skipna, axis=axis)
def any(self, *, skipna: bool = True, **kwargs):
    """
    Return whether any element is truthy.

    Returns False unless there is at least one element that is truthy.
    By default, NAs are skipped. If ``skipna=False`` is specified and
    missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
    is used as for logical operations.

    .. versionchanged:: 1.4.0

    Parameters
    ----------
    skipna : bool, default True
        Exclude NA values. If the entire array is NA and `skipna` is
        True, then the result will be False, as for an empty array.
        If `skipna` is False, the result will still be True if there is
        at least one element that is truthy, otherwise NA will be returned
        if there are NA's present.
    **kwargs : any, default None
        Additional keywords have no effect but might be accepted for
        compatibility with NumPy.

    Returns
    -------
    bool or :attr:`pandas.NA`

    See Also
    --------
    numpy.any : Numpy version of this method.
    BaseMaskedArray.all : Return whether all elements are truthy.

    Examples
    --------
    The result indicates whether any element is truthy (and by default
    skips NAs):

    >>> pd.array([True, False, True]).any()
    True
    >>> pd.array([True, False, pd.NA]).any()
    True
    >>> pd.array([False, False, pd.NA]).any()
    False
    >>> pd.array([], dtype="boolean").any()
    False
    >>> pd.array([pd.NA], dtype="boolean").any()
    False
    >>> pd.array([pd.NA], dtype="Float64").any()
    False

    With ``skipna=False``, the result can be NA if this is logically
    required (whether ``pd.NA`` is True or False influences the result):

    >>> pd.array([True, False, pd.NA]).any(skipna=False)
    True
    >>> pd.array([1, 0, pd.NA]).any(skipna=False)
    True
    >>> pd.array([False, False, pd.NA]).any(skipna=False)
    <NA>
    >>> pd.array([0, 0, pd.NA]).any(skipna=False)
    <NA>
    """
    # ``axis`` is accepted but dropped before validation.
    kwargs.pop("axis", None)
    nv.validate_any((), kwargs)

    # Work on a copy in which every masked slot holds the falsey value, so
    # missing entries cannot make the reduction True.
    filled = self._data.copy()
    # mypy reports the third argument to "putmask" as an incompatible
    # "object"; the value is a plain scalar at runtime.
    np.putmask(filled, self._mask, self._falsey_value)  # type: ignore[arg-type]
    found = filled.any()

    # The plain answer stands when NAs are skipped, when a truthy element
    # was found, or when there is nothing missing (or nothing at all).
    if skipna or found or len(self) == 0 or not self._mask.any():
        return found
    return self.dtype.na_value
def all(self, *, skipna: bool = True, **kwargs):
    """
    Return whether all elements are truthy.

    Returns True unless there is at least one element that is falsey.
    By default, NAs are skipped. If ``skipna=False`` is specified and
    missing values are present, similar :ref:`Kleene logic <boolean.kleene>`
    is used as for logical operations.

    .. versionchanged:: 1.4.0

    Parameters
    ----------
    skipna : bool, default True
        Exclude NA values. If the entire array is NA and `skipna` is
        True, then the result will be True, as for an empty array.
        If `skipna` is False, the result will still be False if there is
        at least one element that is falsey, otherwise NA will be returned
        if there are NA's present.
    **kwargs : any, default None
        Additional keywords have no effect but might be accepted for
        compatibility with NumPy.

    Returns
    -------
    bool or :attr:`pandas.NA`

    See Also
    --------
    numpy.all : Numpy version of this method.
    BaseMaskedArray.any : Return whether any element is truthy.

    Examples
    --------
    The result indicates whether all elements are truthy (and by default
    skips NAs):

    >>> pd.array([True, True, pd.NA]).all()
    True
    >>> pd.array([1, 1, pd.NA]).all()
    True
    >>> pd.array([True, False, pd.NA]).all()
    False
    >>> pd.array([], dtype="boolean").all()
    True
    >>> pd.array([pd.NA], dtype="boolean").all()
    True
    >>> pd.array([pd.NA], dtype="Float64").all()
    True

    With ``skipna=False``, the result can be NA if this is logically
    required (whether ``pd.NA`` is True or False influences the result):

    >>> pd.array([True, True, pd.NA]).all(skipna=False)
    <NA>
    >>> pd.array([1, 1, pd.NA]).all(skipna=False)
    <NA>
    >>> pd.array([True, False, pd.NA]).all(skipna=False)
    False
    >>> pd.array([1, 0, pd.NA]).all(skipna=False)
    False
    """
    # ``axis`` is accepted but dropped before validation.
    kwargs.pop("axis", None)
    nv.validate_all((), kwargs)

    # Work on a copy in which every masked slot holds the truthy value, so
    # missing entries cannot make the reduction False.
    filled = self._data.copy()
    # mypy reports the third argument to "putmask" as an incompatible
    # "object"; the value is a plain scalar at runtime.
    np.putmask(filled, self._mask, self._truthy_value)  # type: ignore[arg-type]
    everything = filled.all()

    # The plain answer stands when NAs are skipped, when a falsey element
    # was found, or when there is nothing missing (or nothing at all).
    if skipna or not everything or len(self) == 0 or not self._mask.any():
        return everything
    return self.dtype.na_value
def _accumulate(
    self, name: str, *, skipna: bool = True, **kwargs
) -> BaseMaskedArray:
    """
    Run the accumulation called ``name`` from ``masked_accumulations``.

    Parameters
    ----------
    name : str
        Attribute of ``masked_accumulations`` to call.
    skipna : bool, default True
        Forwarded to the accumulation function.
    **kwargs
        Forwarded to the accumulation function.

    Returns
    -------
    BaseMaskedArray
        New array of the same type as ``self``, built (``copy=False``) from
        the data and mask the accumulation function returns.
    """
    accumulate = getattr(masked_accumulations, name)
    new_data, new_mask = accumulate(
        self._data, self._mask, skipna=skipna, **kwargs
    )
    return type(self)(new_data, new_mask, copy=False)
|