123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513 |
- from decimal import Decimal
- import numbers
- from sys import maxsize
- cimport cython
- from cpython.datetime cimport (
- date,
- time,
- timedelta,
- )
- from cython cimport Py_ssize_t
- import numpy as np
- cimport numpy as cnp
- from numpy cimport (
- flatiter,
- float64_t,
- int64_t,
- ndarray,
- uint8_t,
- )
- cnp.import_array()
- from pandas._libs cimport util
- from pandas._libs.tslibs.nattype cimport (
- c_NaT as NaT,
- checknull_with_nat,
- is_dt64nat,
- is_td64nat,
- )
- from pandas._libs.tslibs.np_datetime cimport (
- get_datetime64_unit,
- get_datetime64_value,
- get_timedelta64_value,
- )
- from pandas._libs.ops_dispatch import maybe_dispatch_ufunc_to_dunder_op
# Module-level C constants shared by the NA helpers in this file.

# Positive / negative infinity, compared against when `inf_as_na` is requested.
cdef float64_t INF = <float64_t>np.inf
cdef float64_t NEGINF = -INF

# Value returned by util.get_nat(); the raw datetime64/timedelta64 values are
# compared against it to detect NaT.
cdef int64_t NPY_NAT = util.get_nat()

# Consulted by NAType.__hash__ to pick a hash value that fits the platform.
cdef bint is_32bit = maxsize <= 2 ** 32

cdef type cDecimal = Decimal  # for faster isinstance checks
cpdef bint check_na_tuples_nonequal(object left, object right):
    """
    Detect equal-length tuples where NA appears on only one side of a position.

    This case has to be handled explicitly because the regular comparison
    fails earlier with an ambiguous boolean value.

    Parameters
    ----------
    left: Any
    right: Any

    Returns
    -------
    True if both arguments are tuples of the same length and, at some
    position, exactly one of the two elements is NA. False otherwise.
    """
    if not (isinstance(left, tuple) and isinstance(right, tuple)):
        return False

    if len(left) != len(right):
        return False

    for lhs, rhs in zip(left, right):
        # Exactly one of the paired elements is the NA singleton.
        if (lhs is C_NA) != (rhs is C_NA):
            return True

    return False
cpdef bint is_matching_na(object left, object right, bint nan_matches_none=False):
    """
    Tell whether two scalars are both NA values of the same kind.

    Parameters
    ----------
    left : Any
    right : Any
    nan_matches_none : bool, default False
        For backwards compatibility, consider NaN as matching None.

    Returns
    -------
    bool
    """
    # Every branch below returns, so the checks are written as guard clauses.
    if left is None:
        return (nan_matches_none and util.is_nan(right)) or right is None

    if left is C_NA:
        return right is C_NA

    if left is NaT:
        return right is NaT

    if util.is_float_object(left):
        if not util.is_nan(left):
            return False
        if nan_matches_none and right is None:
            return True
        return util.is_float_object(right) and util.is_nan(right)

    if util.is_complex_object(left):
        if not util.is_nan(left):
            return False
        return util.is_complex_object(right) and util.is_nan(right)

    if util.is_datetime64_object(left):
        if get_datetime64_value(left) != NPY_NAT:
            return False
        # Only inspect `right` as a datetime64 once its type is confirmed.
        if not util.is_datetime64_object(right):
            return False
        return (
            get_datetime64_value(right) == NPY_NAT
            and get_datetime64_unit(left) == get_datetime64_unit(right)
        )

    if util.is_timedelta64_object(left):
        if get_timedelta64_value(left) != NPY_NAT:
            return False
        # Only inspect `right` as a timedelta64 once its type is confirmed.
        if not util.is_timedelta64_object(right):
            return False
        return (
            get_timedelta64_value(right) == NPY_NAT
            and get_datetime64_unit(left) == get_datetime64_unit(right)
        )

    if is_decimal_na(left):
        return is_decimal_na(right)

    return False
cpdef bint checknull(object val, bint inf_as_na=False):
    """
    Return whether the input is NA-like, defined here as any of:
     - None
     - nan
     - NaT
     - np.datetime64 representation of NaT
     - np.timedelta64 representation of NaT
     - NA
     - Decimal("NaN")

    Parameters
    ----------
    val : object
    inf_as_na : bool, default False
        Whether to treat INF and -INF as NA values.

    Returns
    -------
    bool
    """
    if val is None or val is NaT or val is C_NA:
        return True

    if util.is_float_object(val) or util.is_complex_object(val):
        # NaN is the only value that compares unequal to itself.
        if val != val:
            return True
        if not inf_as_na:
            return False
        return val == INF or val == NEGINF

    if util.is_timedelta64_object(val):
        return get_timedelta64_value(val) == NPY_NAT

    if util.is_datetime64_object(val):
        return get_datetime64_value(val) == NPY_NAT

    return is_decimal_na(val)
cdef bint is_decimal_na(object val):
    """
    Check whether `val` is a decimal.Decimal that is not equal to itself,
    i.e. Decimal("NaN").
    """
    if not isinstance(val, cDecimal):
        return False
    return val != val
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef ndarray[uint8_t] isnaobj(ndarray arr, bint inf_as_na=False):
    """
    Build a boolean mask flagging the NA-like elements of an array.

    Each element is classified with `checknull`, i.e. it counts as NA when it
    is any of:
     - None
     - nan
     - NaT
     - np.datetime64 representation of NaT
     - np.timedelta64 representation of NaT
     - NA
     - Decimal("NaN")

    Parameters
    ----------
    arr : ndarray
    inf_as_na : bool, default False
        Forwarded to `checknull` for every element.

    Returns
    -------
    result : ndarray (dtype=np.bool_)
        Allocated with the same shape as `arr`.
    """
    cdef:
        Py_ssize_t pos, count = arr.size
        object item
        bint flag
        ndarray mask = np.empty((<object>arr).shape, dtype=np.uint8)
        flatiter src_it = cnp.PyArray_IterNew(arr)
        flatiter dst_it = cnp.PyArray_IterNew(mask)

    for pos in range(count):
        # PyArray_GETITEM together with PyArray_ITER_NEXT is a faster
        # equivalent of fetching the element by index.
        item = cnp.PyArray_GETITEM(arr, cnp.PyArray_ITER_DATA(src_it))
        flag = checknull(item, inf_as_na=inf_as_na)

        # Write the flag through the output iterator's data pointer.
        (<uint8_t *>(cnp.PyArray_ITER_DATA(dst_it)))[0] = <uint8_t>flag

        # Advance both iterators in lockstep.
        cnp.PyArray_ITER_NEXT(src_it)
        cnp.PyArray_ITER_NEXT(dst_it)

    return mask.view(np.bool_)
def isposinf_scalar(val: object) -> bool:
    """
    Check whether `val` is a float object equal to positive infinity.
    """
    if not util.is_float_object(val):
        return False
    return val == INF
def isneginf_scalar(val: object) -> bool:
    """
    Check whether `val` is a float object equal to negative infinity.
    """
    if not util.is_float_object(val):
        return False
    return val == NEGINF
cdef bint is_null_datetime64(v):
    # Determine if we have a null for a datetime (or integer versions),
    # excluding np.timedelta64('nat'): accepted when either
    # checknull_with_nat or is_dt64nat reports True.
    return checknull_with_nat(v) or is_dt64nat(v)
cdef bint is_null_timedelta64(v):
    # Determine if we have a null for a timedelta (or integer versions),
    # excluding np.datetime64('nat'): accepted when either
    # checknull_with_nat or is_td64nat reports True.
    return checknull_with_nat(v) or is_td64nat(v)
cdef bint checknull_with_nat_and_na(object obj):
    # See GH#32214
    # Same as checknull_with_nat, additionally accepting the NA singleton.
    if checknull_with_nat(obj):
        return True
    return obj is C_NA
@cython.wraparound(False)
@cython.boundscheck(False)
def is_float_nan(values: ndarray) -> ndarray:
    """
    Flag the elements of `values` for which `util.is_nan` is True.

    Returns
    -------
    ndarray[bool]
        One entry per element of `values`, indexed along its first axis.
    """
    cdef:
        ndarray[uint8_t] mask
        Py_ssize_t idx, count

    count = len(values)
    # Start from all-False and only flip the positions holding a nan.
    mask = np.zeros(count, dtype=np.uint8)

    for idx in range(count):
        if util.is_nan(values[idx]):
            mask[idx] = 1

    return mask.view(bool)
@cython.wraparound(False)
@cython.boundscheck(False)
def is_numeric_na(values: ndarray) -> ndarray:
    """
    Check for NA values consistent with IntegerArray/FloatingArray.

    Similar to a vectorized is_valid_na_for_dtype restricted to numeric dtypes.

    Returns
    -------
    ndarray[bool]

    Raises
    ------
    TypeError
        If an element is null according to `checknull` but is not None, NA,
        nan or Decimal("NaN").
    """
    cdef:
        ndarray[uint8_t] result
        Py_ssize_t i, N
        object val

    N = len(values)
    result = np.zeros(N, dtype=np.uint8)

    for i in range(N):
        val = values[i]

        if not checknull(val):
            continue

        # Null, but only a subset of null values is acceptable here.
        if not (
            val is None or val is C_NA or util.is_nan(val) or is_decimal_na(val)
        ):
            raise TypeError(f"'values' contains non-numeric NA {val}")

        result[i] = 1

    return result.view(bool)
- # -----------------------------------------------------------------------------
- # Implementation of NA singleton
def _create_binary_propagating_op(name, is_divmod=False):
    """
    Build a binary dunder method for NAType that propagates NA.

    Parameters
    ----------
    name : str
        Dunder name assigned to the generated method (e.g. "__add__").
    is_divmod : bool, default False
        If True, the generated method returns a pair of results instead of a
        single one.

    Returns
    -------
    function
        Method taking ``(self, other)`` and returning NA, an object array
        filled with NA, or NotImplemented.
    """
    is_cmp = name.strip("_") in ("eq", "ne", "le", "lt", "ge", "gt")

    def method(self, other):
        # Zero-dimensional arrays (empty shape) are handled like scalars,
        # since the setitem on the output array in the array branch below
        # won't work for NumPy scalars.
        scalar_like = (
            other is C_NA
            or isinstance(other, (str, bytes, numbers.Number, np.bool_))
            or (util.is_array(other) and not other.shape)
        )
        if scalar_like:
            return (NA, NA) if is_divmod else NA

        if util.is_array(other):
            out = np.empty(other.shape, dtype=object)
            out[:] = NA
            # divmod gets two independent arrays.
            return (out, out.copy()) if is_divmod else out

        if is_cmp and isinstance(other, (date, time, timedelta)):
            return NA

        return NotImplemented

    method.__name__ = name
    return method
def _create_unary_propagating_op(name: str):
    """
    Build a unary dunder method for NAType that always returns NA.

    Parameters
    ----------
    name : str
        Dunder name assigned to the generated method (e.g. "__neg__").
    """
    def method(self):
        # The operand is ignored; the result is NA regardless.
        return NA

    method.__name__ = name

    return method
cdef class C_NAType:
    """
    Empty extension-type base class that NAType derives from.
    """
class NAType(C_NAType):
    """
    Missing value indicator NA ("not available").

    .. warning::

       Experimental: the behaviour of NA can still change without warning.

    pandas defines NA as a singleton that marks missing data. It is used in
    certain new extension dtypes (currently the "string" dtype).
    """

    _instance = None

    def __new__(cls, *args, **kwargs):
        # Singleton: create the instance on first use and hand back the
        # same object afterwards.
        instance = NAType._instance
        if instance is None:
            instance = C_NAType.__new__(cls, *args, **kwargs)
            NAType._instance = instance
        return instance

    def __repr__(self) -> str:
        return "<NA>"

    def __format__(self, format_spec) -> str:
        text = self.__repr__()
        try:
            return text.__format__(format_spec)
        except ValueError:
            # Format specs that are not valid for a string fall back to the
            # plain representation.
            return text

    def __bool__(self):
        raise TypeError("boolean value of NA is ambiguous")

    def __hash__(self):
        # GH 30013: Ensure hash is large enough to avoid hash collisions with integers
        if is_32bit:
            exponent = 31
        else:
            exponent = 61
        return 2 ** exponent - 1

    def __reduce__(self):
        return "NA"

    # Binary arithmetic and comparison ops -> propagate

    __add__ = _create_binary_propagating_op("__add__")
    __radd__ = _create_binary_propagating_op("__radd__")
    __sub__ = _create_binary_propagating_op("__sub__")
    __rsub__ = _create_binary_propagating_op("__rsub__")
    __mul__ = _create_binary_propagating_op("__mul__")
    __rmul__ = _create_binary_propagating_op("__rmul__")
    __matmul__ = _create_binary_propagating_op("__matmul__")
    __rmatmul__ = _create_binary_propagating_op("__rmatmul__")
    __truediv__ = _create_binary_propagating_op("__truediv__")
    __rtruediv__ = _create_binary_propagating_op("__rtruediv__")
    __floordiv__ = _create_binary_propagating_op("__floordiv__")
    __rfloordiv__ = _create_binary_propagating_op("__rfloordiv__")
    __mod__ = _create_binary_propagating_op("__mod__")
    __rmod__ = _create_binary_propagating_op("__rmod__")
    __divmod__ = _create_binary_propagating_op("__divmod__", is_divmod=True)
    __rdivmod__ = _create_binary_propagating_op("__rdivmod__", is_divmod=True)
    # __lshift__ and __rshift__ are not implemented

    __eq__ = _create_binary_propagating_op("__eq__")
    __ne__ = _create_binary_propagating_op("__ne__")
    __le__ = _create_binary_propagating_op("__le__")
    __lt__ = _create_binary_propagating_op("__lt__")
    __gt__ = _create_binary_propagating_op("__gt__")
    __ge__ = _create_binary_propagating_op("__ge__")

    # Unary ops

    __neg__ = _create_unary_propagating_op("__neg__")
    __pos__ = _create_unary_propagating_op("__pos__")
    __abs__ = _create_unary_propagating_op("__abs__")
    __invert__ = _create_unary_propagating_op("__invert__")

    # pow is special-cased: a zero exponent gives 1, and so does a base of 1

    def __pow__(self, other):
        if other is C_NA:
            return NA
        if isinstance(other, (numbers.Number, np.bool_)):
            # Returning positive one is correct for both +0 and -0.
            return type(other)(1) if other == 0 else NA
        if util.is_array(other):
            return np.where(other == 0, other.dtype.type(1), NA)
        return NotImplemented

    def __rpow__(self, other):
        if other is C_NA:
            return NA
        if isinstance(other, (numbers.Number, np.bool_)):
            # A base equal to 1 is returned unchanged.
            return other if other == 1 else NA
        if util.is_array(other):
            return np.where(other == 1, other, NA)
        return NotImplemented

    # Logical ops using Kleene logic

    def __and__(self, other):
        if other is False:
            return False
        if other is True or other is C_NA:
            return NA
        return NotImplemented

    __rand__ = __and__

    def __or__(self, other):
        if other is True:
            return True
        if other is False or other is C_NA:
            return NA
        return NotImplemented

    __ror__ = __or__

    def __xor__(self, other):
        if other is False or other is True or other is C_NA:
            return NA
        return NotImplemented

    __rxor__ = __xor__

    __array_priority__ = 1000
    _HANDLED_TYPES = (np.ndarray, numbers.Number, str, np.bool_)

    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
        accepted = self._HANDLED_TYPES + (NAType,)
        if not all(isinstance(x, accepted) for x in inputs):
            return NotImplemented

        if method != "__call__":
            raise ValueError(f"ufunc method '{method}' not supported for NA")

        result = maybe_dispatch_ufunc_to_dunder_op(
            self, ufunc, method, *inputs, **kwargs
        )
        if result is not NotImplemented:
            return result

        # For a NumPy ufunc that's not a binop, like np.logaddexp:
        # broadcast the inputs and take the array standing in for NA.
        index = [i for i, x in enumerate(inputs) if x is NA][0]
        result = np.broadcast_arrays(*inputs)[index]
        if result.ndim == 0:
            result = result.item()
        if ufunc.nout > 1:
            result = (NA,) * ufunc.nout

        return result
# The single NAType instance, bound to two module-level names.
C_NA = NAType()   # C-visible
NA = C_NA         # Python-visible
|