12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767 |
- from __future__ import annotations
- import functools
- import itertools
- import operator
- from typing import (
- Any,
- Callable,
- cast,
- )
- import warnings
- import numpy as np
- from pandas._config import get_option
- from pandas._libs import (
- NaT,
- NaTType,
- iNaT,
- lib,
- )
- from pandas._typing import (
- ArrayLike,
- AxisInt,
- CorrelationMethod,
- Dtype,
- DtypeObj,
- F,
- Scalar,
- Shape,
- npt,
- )
- from pandas.compat._optional import import_optional_dependency
- from pandas.util._exceptions import find_stack_level
- from pandas.core.dtypes.common import (
- is_any_int_dtype,
- is_bool_dtype,
- is_complex,
- is_datetime64_any_dtype,
- is_float,
- is_float_dtype,
- is_integer,
- is_integer_dtype,
- is_numeric_dtype,
- is_object_dtype,
- is_scalar,
- is_timedelta64_dtype,
- needs_i8_conversion,
- pandas_dtype,
- )
- from pandas.core.dtypes.dtypes import PeriodDtype
- from pandas.core.dtypes.missing import (
- isna,
- na_value_for_dtype,
- notna,
- )
- from pandas.core.construction import extract_array
# Optional accelerator module; ``errors="warn"`` is passed so that ``bn`` may
# end up as ``None`` rather than the import raising.
bn = import_optional_dependency("bottleneck", errors="warn")
_BOTTLENECK_INSTALLED = bn is not None
_USE_BOTTLENECK = False


def set_use_bottleneck(v: bool = True) -> None:
    """
    Switch the bottleneck fast path on or off.

    Parameters
    ----------
    v : bool, default True
        Desired state. The request is ignored when bottleneck could not be
        imported, so the flag then keeps its current value.
    """
    global _USE_BOTTLENECK
    if not _BOTTLENECK_INSTALLED:
        return
    _USE_BOTTLENECK = v


# Initialise the flag from the user-facing option at import time.
set_use_bottleneck(get_option("compute.use_bottleneck"))
class disallow:
    """
    Decorator factory that rejects reductions on the given dtypes.

    Parameters
    ----------
    *dtypes : Dtype
        Dtypes (anything ``pandas_dtype`` accepts) whose scalar ``type`` must
        not appear among the arguments of the decorated function.
    """

    def __init__(self, *dtypes: Dtype) -> None:
        super().__init__()
        self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)

    def check(self, obj) -> bool:
        """Return True if ``obj`` has a ``dtype`` whose type is disallowed."""
        return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes)

    def __call__(self, f: F) -> F:
        """
        Wrap ``f`` so that disallowed dtypes raise ``TypeError``.

        Raises (from the wrapper)
        -------------------------
        TypeError
            If any positional or keyword argument has a disallowed dtype, or
            if ``f`` raises ``ValueError`` and the first positional argument
            is object dtype.
        """

        @functools.wraps(f)
        def _f(*args, **kwargs):
            candidates = itertools.chain(args, kwargs.values())
            if any(self.check(obj) for obj in candidates):
                f_name = f.__name__.replace("nan", "")
                raise TypeError(
                    f"reduction operation '{f_name}' not allowed for this dtype"
                )
            try:
                # invalid-value floating point warnings are silenced for the
                # duration of the reduction
                with np.errstate(invalid="ignore"):
                    return f(*args, **kwargs)
            except ValueError as e:
                # we want to transform an object array
                # ValueError message to the more typical TypeError
                # e.g. this is normally a disallowed function on
                # object arrays that contain strings
                # Guard on ``args``: a keyword-only call must re-raise the
                # original ValueError instead of failing with IndexError.
                if args and is_object_dtype(args[0]):
                    raise TypeError(e) from e
                raise

        return cast(F, _f)
class bottleneck_switch:
    """
    Decorator factory that routes a reduction to bottleneck when possible.

    Parameters
    ----------
    name : str, optional
        Name of the bottleneck function to look up; defaults to the name of
        the decorated function.
    **kwargs
        Default keyword arguments injected into every call unless the caller
        supplies them explicitly.
    """

    def __init__(self, name=None, **kwargs) -> None:
        self.name = name
        self.kwargs = kwargs

    def __call__(self, alt: F) -> F:
        """Wrap ``alt`` (the fallback implementation) with the switch."""
        bn_name = self.name or alt.__name__

        try:
            bn_func = getattr(bn, bn_name)
        except (AttributeError, NameError):  # pragma: no cover
            bn_func = None

        @functools.wraps(alt)
        def f(
            values: np.ndarray,
            *,
            axis: AxisInt | None = None,
            skipna: bool = True,
            **kwds,
        ):
            # fill in the decorator-level defaults without overriding the caller
            for key, default in self.kwargs.items():
                kwds.setdefault(key, default)

            if values.size == 0 and kwds.get("min_count") is None:
                # We are empty, returning NA for our type
                # Only applies for the default `min_count` of None
                # since that affects how empty arrays are handled.
                # TODO(GH-18976) update all the nanops methods to
                # correctly handle empty inputs and remove this check.
                # It *may* just be `var`
                return _na_for_min_count(values, axis)

            use_bottleneck = (
                # the lookup above may have failed; never call ``None``
                bn_func is not None
                and _USE_BOTTLENECK
                and skipna
                and _bn_ok_dtype(values.dtype, bn_name)
                # an explicit mask can only be honoured by the fallback
                and kwds.get("mask", None) is None
            )
            if use_bottleneck:
                # `mask` is not recognised by bottleneck, would raise
                # TypeError if called
                kwds.pop("mask", None)
                result = bn_func(values, axis=axis, **kwds)
                if not _has_infs(result):
                    return result
                # prefer to treat inf/-inf as NA, but must compute the func
                # twice :(

            return alt(values, axis=axis, skipna=skipna, **kwds)

        return cast(F, f)
def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
    """
    Decide whether the bottleneck function ``name`` may be used for ``dtype``.

    Returns False for object dtype, for dtypes needing i8 conversion, and for
    the sum/prod/mean reductions regardless of dtype.
    """
    # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
    if is_object_dtype(dtype) or needs_i8_conversion(dtype):
        return False

    # GH 42878
    # Bottleneck uses naive summation leading to O(n) loss of precision
    # unlike numpy which implements pairwise summation, which has O(log(n)) loss
    # crossref: https://github.com/pydata/bottleneck/issues/379

    # GH 15507
    # bottleneck does not properly upcast during the sum
    # so can overflow

    # GH 9422
    # further we also want to preserve NaN when all elements
    # are NaN, unlike bottleneck/numpy which consider this
    # to be 0
    excluded = ("nansum", "nanprod", "nanmean")
    return name not in excluded
def _has_infs(result) -> bool:
    """Return whether ``result`` (array or scalar) contains an infinity."""
    if isinstance(result, np.ndarray) and result.dtype in ("f8", "f4"):
        # Note: outside of an nanops-specific test, we always have
        #  result.ndim == 1, so there is no risk of this ravel making a copy.
        flat = result.ravel("K")
        return lib.has_infs(flat)

    try:
        return np.isinf(result).any()
    except (TypeError, NotImplementedError):
        # if it doesn't support infs, then it can't have infs
        return False
def _get_fill_value(
    dtype: DtypeObj, fill_value: Scalar | None = None, fill_value_typ=None
):
    """
    Return the fill value to use for ``dtype``.

    An explicit ``fill_value`` always wins. Otherwise NA-capable dtypes get
    NaN (or +/-inf when ``fill_value_typ`` is given), and the remaining
    dtypes get ``lib.i8max`` for ``"+inf"`` and ``iNaT`` for anything else.
    """
    if fill_value is not None:
        return fill_value

    wants_pos_inf = fill_value_typ == "+inf"

    if _na_ok_dtype(dtype):
        if fill_value_typ is None:
            return np.nan
        return np.inf if wants_pos_inf else -np.inf

    # need the max int here
    return lib.i8max if wants_pos_inf else iNaT
def _maybe_get_mask(
    values: np.ndarray, skipna: bool, mask: npt.NDArray[np.bool_] | None
) -> npt.NDArray[np.bool_] | None:
    """
    Compute a missing-value mask only when one is actually needed.

    A mask supplied by the caller is returned untouched. Otherwise no mask is
    produced for boolean or integer dtypes, which cannot hold NaN. For the
    remaining dtypes ``isna(values)`` is computed when ``skipna`` is True or
    when the dtype needs i8 conversion; in every other case ``None`` is
    returned.

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    mask : Optional[ndarray]
        nan-mask if known

    Returns
    -------
    Optional[np.ndarray[bool]]
    """
    if mask is not None:
        return mask

    dtype = values.dtype
    if is_bool_dtype(dtype) or is_integer_dtype(dtype):
        # Boolean data cannot contain nulls, so signal via mask being None
        return None

    if skipna or needs_i8_conversion(dtype):
        return isna(values)

    return None
def _get_values(
    values: np.ndarray,
    skipna: bool,
    fill_value: Any = None,
    fill_value_typ: str | None = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[np.ndarray, npt.NDArray[np.bool_] | None, np.dtype, np.dtype, Any]:
    """
    Prepare ``values`` for a reduction.

    Returns the (possibly filled) values, the mask, the original dtype, an
    accumulator dtype and the fill value that was used. Missing slots are
    replaced only when ``skipna`` is True, a mask is available and it flags at
    least one element; in that case the returned values are a new array.

    Parameters
    ----------
    values : ndarray
        input array to potentially compute mask for
    skipna : bool
        boolean for whether NaNs should be skipped
    fill_value : Any
        value to fill NaNs with
    fill_value_typ : str
        Set to '+inf' or '-inf' to handle dtype-specific infinities
    mask : Optional[np.ndarray[bool]]
        nan-mask if known

    Returns
    -------
    values : ndarray
        Potential copy of input value array
    mask : Optional[ndarray[bool]]
        Mask for values, if deemed necessary to compute
    dtype : np.dtype
        dtype for values
    dtype_max : np.dtype
        platform independent dtype
    fill_value : Any
        fill value used
    """
    # _get_values is only called from within nanops, and in all cases
    #  with scalar fill_value.  This guarantee is important for the
    #  np.where call below
    assert is_scalar(fill_value)
    # error: Incompatible types in assignment (expression has type "Union[Any,
    # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
    values = extract_array(values, extract_numpy=True)  # type: ignore[assignment]

    mask = _maybe_get_mask(values, skipna, mask)
    dtype = values.dtype

    datetimelike = needs_i8_conversion(dtype)
    if datetimelike:
        # changing timedelta64/datetime64 to int64 needs to happen after
        #  finding `mask` above
        values = np.asarray(values.view("i8"))

    # get our fill value (in case we need to provide an alternative
    # dtype for it)
    fill_value = _get_fill_value(
        dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
    )

    needs_fill = skipna and mask is not None and fill_value is not None
    if needs_fill and mask.any():
        if datetimelike or _na_ok_dtype(dtype):
            values = values.copy()
            np.putmask(values, mask, fill_value)
        else:
            # np.where will promote if needed
            values = np.where(~mask, values, fill_value)

    # return a platform independent precision dtype
    if is_integer_dtype(dtype) or is_bool_dtype(dtype):
        dtype_max = np.dtype(np.int64)
    elif is_float_dtype(dtype):
        dtype_max = np.dtype(np.float64)
    else:
        dtype_max = dtype

    return values, mask, dtype, dtype_max, fill_value
def _na_ok_dtype(dtype: DtypeObj) -> bool:
    """
    Return True unless ``dtype`` needs i8 conversion or is a NumPy integer.
    """
    is_plain_integer = issubclass(dtype.type, np.integer)
    return not (needs_i8_conversion(dtype) or is_plain_integer)
def _wrap_results(result, dtype: np.dtype, fill_value=None):
    """
    Cast a reduction result back to a datetime64/timedelta64 ``dtype``.

    ``NaT`` and results for any other dtype are returned unchanged.

    Raises
    ------
    ValueError
        If a scalar timedelta result exceeds ``lib.i8max`` in magnitude.
    """
    if result is NaT:
        return result

    is_array = isinstance(result, np.ndarray)

    if is_datetime64_any_dtype(dtype):
        if fill_value is None:
            # GH#24293
            fill_value = iNaT

        if is_array:
            # If we have float dtype, taking a view will give the wrong result
            return result.astype(dtype)

        assert not isna(fill_value), "Expected non-null fill_value"
        if result == fill_value:
            result = np.nan

        if isna(result):
            wrapped = np.datetime64("NaT", "ns").astype(dtype)
        else:
            wrapped = np.int64(result).view(dtype)
        # retain original unit
        return wrapped.astype(dtype, copy=False)

    if is_timedelta64_dtype(dtype):
        if is_array:
            return result.astype("m8[ns]").view(dtype)

        if result == fill_value or np.isnan(result):
            return np.timedelta64("NaT").astype(dtype)

        if np.fabs(result) > lib.i8max:
            # raise if we have a timedelta64[ns] which is too large
            raise ValueError("overflow in timedelta operation")

        # return a timedelta64 with the original unit
        return np.int64(result).astype(dtype, copy=False)

    return result
def _datetimelike_compat(func: F) -> F:
    """
    Decorator adding datetime64/timedelta64 handling around a reduction.

    For inputs whose dtype kind is "m" or "M" a mask is computed when none is
    given, the raw result is wrapped back into the input dtype, and with
    ``skipna=False`` masked positions are turned into NaT. Other inputs are
    passed straight through to ``func``.
    """

    @functools.wraps(func)
    def new_func(
        values: np.ndarray,
        *,
        axis: AxisInt | None = None,
        skipna: bool = True,
        mask: npt.NDArray[np.bool_] | None = None,
        **kwargs,
    ):
        is_dt_or_td = values.dtype.kind in ("m", "M")
        if is_dt_or_td and mask is None:
            mask = isna(values)

        result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
        if not is_dt_or_td:
            return result

        result = _wrap_results(result, values.dtype, fill_value=iNaT)
        if skipna:
            return result

        assert mask is not None  # checked above
        return _mask_datetimelike_result(result, axis, mask, values)

    return cast(F, new_func)
def _na_for_min_count(values: np.ndarray, axis: AxisInt | None) -> Scalar | np.ndarray:
    """
    Build the "all missing" result of reducing ``values``.

    Parameters
    ----------
    values : ndarray
    axis : int or None
        axis for the reduction, required if values.ndim > 1.

    Returns
    -------
    result : scalar or ndarray
        A missing-value scalar when ``values`` is 1-D or ``axis`` is None,
        otherwise an array of missing values shaped like ``values`` with
        ``axis`` removed.
    """
    # we either return np.nan or pd.NaT
    if is_numeric_dtype(values):
        values = values.astype("float64")
    fill_value = na_value_for_dtype(values.dtype)

    if values.ndim == 1 or axis is None:
        return fill_value

    leading = values.shape[:axis]
    trailing = values.shape[axis + 1 :]
    return np.full(leading + trailing, fill_value, dtype=values.dtype)
def maybe_operate_rowwise(func: F) -> F:
    """
    Decorator that reduces very wide 2-D arrays one row at a time.

    NumPy operations on C-contiguous ndarrays with axis=1 can be
    very slow if axis 1 >> axis 0, so for such inputs ``func`` is applied to
    each row and the per-row results are collected into an array.
    """

    @functools.wraps(func)
    def newfunc(values: np.ndarray, *, axis: AxisInt | None = None, **kwargs):
        go_rowwise = (
            axis == 1
            and values.ndim == 2
            and values.flags["C_CONTIGUOUS"]
            # only takes this path for wide arrays (long dataframes), for threshold see
            # https://github.com/pandas-dev/pandas/pull/43311#issuecomment-974891737
            and (values.shape[1] / 1000) > values.shape[0]
            and values.dtype != object
            and values.dtype != bool
        )
        if not go_rowwise:
            return func(values, axis=axis, **kwargs)

        rows = list(values)
        if kwargs.get("mask") is None:
            return np.array([func(row, **kwargs) for row in rows])

        # hand each row the matching row of the mask
        mask = kwargs.pop("mask")
        return np.array(
            [func(row, mask=mask[idx], **kwargs) for idx, row in enumerate(rows)]
        )

    return cast(F, newfunc)
def nanany(
    values: np.ndarray,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    mask: npt.NDArray[np.bool_] | None = None,
) -> bool:
    """
    Report whether any element along ``axis`` is truthy.

    Missing values are filled with ``False`` when ``skipna`` is True.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : bool

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, 2])
    >>> nanops.nanany(s)
    True

    >>> from pandas.core import nanops
    >>> s = pd.Series([np.nan])
    >>> nanops.nanany(s)
    False
    """
    is_non_timedelta_i8 = (
        needs_i8_conversion(values.dtype) and values.dtype.kind != "m"
    )
    if is_non_timedelta_i8:
        # GH#34479
        warnings.warn(
            "'any' with datetime64 dtypes is deprecated and will raise in a "
            "future version. Use (obj != pd.Timestamp(0)).any() instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    filled = _get_values(values, skipna, fill_value=False, mask=mask)[0]

    # For object type, any won't necessarily return
    # boolean values (numpy/numpy#4352)
    if is_object_dtype(filled):
        filled = filled.astype(bool)

    # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
    # "bool")
    return filled.any(axis)  # type: ignore[return-value]
def nanall(
    values: np.ndarray,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    mask: npt.NDArray[np.bool_] | None = None,
) -> bool:
    """
    Report whether every element along ``axis`` is truthy.

    Missing values are filled with ``True`` when ``skipna`` is True.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : bool

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanall(s)
    True

    >>> from pandas.core import nanops
    >>> s = pd.Series([1, 0])
    >>> nanops.nanall(s)
    False
    """
    is_non_timedelta_i8 = (
        needs_i8_conversion(values.dtype) and values.dtype.kind != "m"
    )
    if is_non_timedelta_i8:
        # GH#34479
        warnings.warn(
            "'all' with datetime64 dtypes is deprecated and will raise in a "
            "future version. Use (obj != pd.Timestamp(0)).all() instead.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )

    filled = _get_values(values, skipna, fill_value=True, mask=mask)[0]

    # For object type, all won't necessarily return
    # boolean values (numpy/numpy#4352)
    if is_object_dtype(filled):
        filled = filled.astype(bool)

    # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
    # "bool")
    return filled.all(axis)  # type: ignore[return-value]
@disallow("M8")
@_datetimelike_compat
@maybe_operate_rowwise
def nansum(
    values: np.ndarray,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    min_count: int = 0,
    mask: npt.NDArray[np.bool_] | None = None,
) -> float:
    """
    Add up the elements along an axis, treating missing values as zero.

    Parameters
    ----------
    values : ndarray[dtype]
    axis : int, optional
    skipna : bool, default True
    min_count: int, default 0
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : dtype

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nansum(s)
    3.0
    """
    values, mask, dtype, dtype_max, _ = _get_values(
        values, skipna, fill_value=0, mask=mask
    )

    # pick the accumulator: floats keep their precision, timedeltas are
    # summed as float64, everything else uses the widened dtype
    if is_float_dtype(dtype):
        dtype_sum = dtype
    elif is_timedelta64_dtype(dtype):
        dtype_sum = np.dtype(np.float64)
    else:
        dtype_sum = dtype_max

    total = values.sum(axis, dtype=dtype_sum)
    return _maybe_null_out(total, axis, mask, values.shape, min_count=min_count)
def _mask_datetimelike_result(
    result: np.ndarray | np.datetime64 | np.timedelta64,
    axis: AxisInt | None,
    mask: npt.NDArray[np.bool_],
    orig_values: np.ndarray,
) -> np.ndarray | np.datetime64 | np.timedelta64 | NaTType:
    """
    Turn results that touched a masked element into NaT.

    A scalar ``result`` becomes NaT (in the dtype of ``orig_values``) when
    ``mask`` has any True entry. An array ``result`` is viewed as that dtype
    and set to NaT wherever ``mask`` has a True along ``axis``.
    """
    if not isinstance(result, np.ndarray):
        if mask.any():
            return np.int64(iNaT).view(orig_values.dtype)
        return result

    # we need to apply the mask
    result = result.astype("i8").view(orig_values.dtype)
    hit_by_na = mask.any(axis=axis)
    # error: Unsupported target for indexed assignment ("Union[ndarray[Any, Any],
    # datetime64, timedelta64]")
    result[hit_by_na] = iNaT  # type: ignore[index]
    return result
@disallow(PeriodDtype)
@bottleneck_switch()
@_datetimelike_compat
def nanmean(
    values: np.ndarray,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    mask: npt.NDArray[np.bool_] | None = None,
) -> float:
    """
    Average the elements along an axis, leaving missing values out.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, 2, np.nan])
    >>> nanops.nanmean(s)
    1.5
    """
    values, mask, dtype, dtype_max, _ = _get_values(
        values, skipna, fill_value=0, mask=mask
    )

    float64 = np.dtype(np.float64)
    dtype_count = float64
    # not using needs_i8_conversion because that includes period
    if dtype.kind in ["m", "M"] or is_integer_dtype(dtype):
        dtype_sum = float64
    elif is_float_dtype(dtype):
        dtype_sum = dtype
        dtype_count = dtype
    else:
        dtype_sum = dtype_max

    count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
    the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))

    if axis is None or not getattr(the_sum, "ndim", False):
        # scalar result: an empty selection has no mean
        return the_sum / count if count > 0 else np.nan

    count = cast(np.ndarray, count)
    with np.errstate(all="ignore"):
        # suppress division by zero warnings
        the_mean = the_sum / count
    no_obs = count == 0
    if no_obs.any():
        the_mean[no_obs] = np.nan
    return the_mean
@bottleneck_switch()
def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask=None):
    """
    Compute the median along an axis while ignoring missing values.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Raises
    ------
    TypeError
        If the values cannot be converted to float64.

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, np.nan, 2, 2])
    >>> nanops.nanmedian(s)
    2.0
    """

    def get_median(x, _mask=None):
        # ``_mask`` flags missing entries; flip it so it selects valid ones
        if _mask is None:
            _mask = notna(x)
        else:
            _mask = ~_mask
        if not skipna and not _mask.all():
            return np.nan
        with warnings.catch_warnings():
            # Suppress RuntimeWarning about All-NaN slice
            warnings.filterwarnings(
                "ignore", "All-NaN slice encountered", RuntimeWarning
            )
            res = np.nanmedian(x[_mask])
        return res

    values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask, fill_value=0)
    if not is_float_dtype(values.dtype):
        try:
            values = values.astype("f8")
        except ValueError as err:
            # e.g. "could not convert string to float: 'a'"
            raise TypeError(str(err)) from err
    if mask is not None:
        # _get_values was asked to fill missing slots with 0; put NaN back so
        # that np.nanmedian skips them instead of counting zeros
        values[mask] = np.nan

    notempty = values.size

    # an array from a frame
    if values.ndim > 1 and axis is not None:
        # there's a non-empty array to apply over otherwise numpy raises
        if notempty:
            if not skipna:
                res = np.apply_along_axis(get_median, axis, values)

            else:
                # fastpath for the skipna case
                with warnings.catch_warnings():
                    # Suppress RuntimeWarning about All-NaN slice
                    warnings.filterwarnings(
                        "ignore", "All-NaN slice encountered", RuntimeWarning
                    )
                    res = np.nanmedian(values, axis)

        else:
            # must return the correct shape, but median is not defined for the
            # empty set so return nans of shape "everything but the passed axis"
            # since "axis" is where the reduction would occur if we had a nonempty
            # array
            # np.float64 rather than the np.float_ alias, which NumPy 2.0 removed
            res = get_empty_reduction_result(values.shape, axis, np.float64, np.nan)

    else:
        # otherwise return a scalar value
        res = get_median(values, mask) if notempty else np.nan
    return _wrap_results(res, dtype)
def get_empty_reduction_result(
    shape: tuple[int, ...],
    axis: AxisInt,
    dtype: np.dtype | type[np.floating],
    fill_value: Any,
) -> np.ndarray:
    """
    Build the result of reducing an empty ndarray along ``axis``.

    Parameters
    ----------
    shape : Tuple[int]
    axis : int
    dtype : np.dtype
    fill_value : Any

    Returns
    -------
    np.ndarray
        Array of ``fill_value`` whose shape is ``shape`` without the entry at
        position ``axis``.
    """
    keep = np.arange(len(shape)) != axis
    reduced_shape = np.array(shape)[keep]
    out = np.empty(reduced_shape, dtype=dtype)
    out.fill(fill_value)
    return out
def _get_counts_nanvar(
    values_shape: Shape,
    mask: npt.NDArray[np.bool_] | None,
    axis: AxisInt | None,
    ddof: int,
    dtype: np.dtype = np.dtype(np.float64),
) -> tuple[float | np.ndarray, float | np.ndarray]:
    """
    Count the non-null values along an axis and derive the variance divisor.

    Wherever the count does not exceed ``ddof`` both the count and the
    divisor are replaced by NaN.

    Parameters
    ----------
    values_shape : Tuple[int, ...]
        shape tuple from values ndarray, used if mask is None
    mask : Optional[ndarray[bool]]
        locations in values that should be considered missing
    axis : Optional[int]
        axis to count along
    ddof : int
        degrees of freedom
    dtype : type, optional
        type to use for count

    Returns
    -------
    count : int, np.nan or np.ndarray
    d : int, np.nan or np.ndarray
    """
    count = _get_counts(values_shape, mask, axis, dtype=dtype)
    d = count - dtype.type(ddof)

    # always return NaN, never inf
    if is_scalar(count):
        if count <= ddof:
            return np.nan, np.nan
        return count, d

    # count is not narrowed by is_scalar check
    count = cast(np.ndarray, count)
    too_few = count <= ddof
    if too_few.any():
        np.putmask(d, too_few, np.nan)
        np.putmask(count, too_few, np.nan)
    return count, d
@bottleneck_switch(ddof=1)
def nanstd(
    values,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    ddof: int = 1,
    mask=None,
):
    """
    Standard deviation along an axis, leaving missing values out.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nanstd(s)
    1.0
    """
    # datetime64[ns] input is reinterpreted as timedelta64[ns] before use
    if values.dtype == "M8[ns]":
        values = values.view("m8[ns]")

    orig_dtype = values.dtype
    values, mask, _, _, _ = _get_values(values, skipna, mask=mask)

    variance = nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
    return _wrap_results(np.sqrt(variance), orig_dtype)
@disallow("M8", "m8")
@bottleneck_switch(ddof=1)
def nanvar(
    values,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    ddof: int = 1,
    mask=None,
):
    """
    Variance along an axis, leaving missing values out.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nanvar(s)
    1.0
    """
    values = extract_array(values, extract_numpy=True)
    dtype = values.dtype
    mask = _maybe_get_mask(values, skipna, mask)

    if is_any_int_dtype(dtype):
        values = values.astype("f8")
        if mask is not None:
            values[mask] = np.nan

    count_args = (values.shape, mask, axis, ddof)
    if is_float_dtype(values.dtype):
        count, d = _get_counts_nanvar(*count_args, values.dtype)
    else:
        count, d = _get_counts_nanvar(*count_args)

    if skipna and mask is not None:
        # zero out missing slots on a copy so they do not affect the sums
        values = values.copy()
        np.putmask(values, mask, 0)

    # xref GH10242
    # Compute variance via two-pass algorithm, which is stable against
    # cancellation errors and relatively accurate for small numbers of
    # observations.
    #
    # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
    total = _ensure_numeric(values.sum(axis=axis, dtype=np.float64))
    avg = total / count
    if axis is not None:
        avg = np.expand_dims(avg, axis)

    sqr = _ensure_numeric((avg - values) ** 2)
    if mask is not None:
        np.putmask(sqr, mask, 0)
    result = sqr.sum(axis=axis, dtype=np.float64) / d

    # Return variance as np.float64 (the datatype used in the accumulator),
    # unless we were dealing with a float array, in which case use the same
    # precision as the original values array.
    if not is_float_dtype(dtype):
        return result
    return result.astype(dtype, copy=False)
@disallow("M8", "m8")
def nansem(
    values: np.ndarray,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    ddof: int = 1,
    mask: npt.NDArray[np.bool_] | None = None,
) -> float:
    """
    Compute the standard error of the mean along the given axis, ignoring NaNs.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    ddof : int, default 1
        Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
        where N represents the number of elements.
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, np.nan, 2, 3])
    >>> nanops.nansem(s)
    0.5773502691896258
    """
    reduce_kwargs = {"axis": axis, "skipna": skipna, "ddof": ddof}

    # This checks if non-numeric-like data is passed with numeric_only=False
    # and raises a TypeError otherwise; the value itself is discarded.
    nanvar(values, mask=mask, **reduce_kwargs)

    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")

    if not skipna:
        # Any missing value poisons the result when NaNs are not skipped.
        if mask is not None and mask.any():
            return np.nan

    count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
    var = nanvar(values, mask=mask, **reduce_kwargs)
    std = np.sqrt(var)
    root_count = np.sqrt(count)
    return std / root_count
def _nanminmax(meth, fill_value_typ):
    """
    Build a NaN-aware ``min``/``max`` reduction.

    Parameters
    ----------
    meth : str
        Name of the ndarray method to call on the prepared values
        (used as ``getattr(values, meth)``).
    fill_value_typ : str
        Forwarded to ``_get_values`` to select the fill value for missing
        entries.

    Returns
    -------
    callable
        The decorated reduction function.
    """

    @bottleneck_switch(name=f"nan{meth}")
    @_datetimelike_compat
    def reduction(
        values: np.ndarray,
        *,
        axis: AxisInt | None = None,
        skipna: bool = True,
        mask: npt.NDArray[np.bool_] | None = None,
    ) -> Dtype:
        values, mask, _, dtype_max, _ = _get_values(
            values, skipna, fill_value_typ=fill_value_typ, mask=mask
        )

        empty_along_axis = axis is not None and values.shape[axis] == 0
        if not (empty_along_axis or values.size == 0):
            # Regular case: reduce, then null out positions that were all-NA.
            result = getattr(values, meth)(axis)
            return _maybe_null_out(result, axis, mask, values.shape)

        # Nothing to reduce over: produce an all-NaN result of the right shape
        # when possible, otherwise fall back to a scalar NaN.
        try:
            result = getattr(values, meth)(axis, dtype=dtype_max)
            result.fill(np.nan)
        except (AttributeError, TypeError, ValueError):
            result = np.nan
        return result

    return reduction
# NaN-aware min/max reductions; the second argument is the ``fill_value_typ``
# handed to ``_get_values`` for the missing entries.
nanmin = _nanminmax("min", "+inf")
nanmax = _nanminmax("max", "-inf")
@disallow("O")
def nanargmax(
    values: np.ndarray,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    mask: npt.NDArray[np.bool_] | None = None,
) -> int | np.ndarray:
    """
    Return the position of the maximum along an axis, ignoring NaNs.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : int or ndarray[int]
        The index/indices of max value in specified axis or -1 in the NA case

    Examples
    --------
    >>> from pandas.core import nanops
    >>> arr = np.array([1, 2, 3, np.nan, 4])
    >>> nanops.nanargmax(arr)
    4

    >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
    >>> arr[2:, 2] = np.nan
    >>> arr
    array([[ 0.,  1.,  2.],
           [ 3.,  4.,  5.],
           [ 6.,  7., nan],
           [ 9., 10., nan]])
    >>> nanops.nanargmax(arr, axis=1)
    array([2, 2, 1, 1])
    """
    # Missing entries are always filled here (skipna=True is passed to
    # ``_get_values``); the caller's ``skipna`` only affects the null-out step.
    filled, mask, _, _, _ = _get_values(
        values, True, fill_value_typ="-inf", mask=mask
    )
    # error: Need type annotation for 'positions'
    positions = filled.argmax(axis)  # type: ignore[var-annotated]
    return _maybe_arg_null_out(positions, axis, mask, skipna)
@disallow("O")
def nanargmin(
    values: np.ndarray,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    mask: npt.NDArray[np.bool_] | None = None,
) -> int | np.ndarray:
    """
    Return the position of the minimum along an axis, ignoring NaNs.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : int or ndarray[int]
        The index/indices of min value in specified axis or -1 in the NA case

    Examples
    --------
    >>> from pandas.core import nanops
    >>> arr = np.array([1, 2, 3, np.nan, 4])
    >>> nanops.nanargmin(arr)
    0

    >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
    >>> arr[2:, 0] = np.nan
    >>> arr
    array([[ 0.,  1.,  2.],
           [ 3.,  4.,  5.],
           [nan,  7.,  8.],
           [nan, 10., 11.]])
    >>> nanops.nanargmin(arr, axis=1)
    array([0, 0, 1, 1])
    """
    # Missing entries are always filled here (skipna=True is passed to
    # ``_get_values``); the caller's ``skipna`` only affects the null-out step.
    filled, mask, _, _, _ = _get_values(
        values, True, fill_value_typ="+inf", mask=mask
    )
    # error: Need type annotation for 'positions'
    positions = filled.argmin(axis)  # type: ignore[var-annotated]
    return _maybe_arg_null_out(positions, axis, mask, skipna)
@disallow("M8", "m8")
@maybe_operate_rowwise
def nanskew(
    values: np.ndarray,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    mask: npt.NDArray[np.bool_] | None = None,
) -> float:
    """
    Compute the sample skewness.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G1. The algorithm computes this coefficient directly
    from the second and third central moment.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, np.nan, 1, 2])
    >>> nanops.nanskew(s)
    1.7320508075688787
    """
    # error: Incompatible types in assignment (expression has type "Union[Any,
    # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
    values = extract_array(values, extract_numpy=True)  # type: ignore[assignment]
    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")
        count = _get_counts(values.shape, mask, axis)
    else:
        count = _get_counts(values.shape, mask, axis, dtype=values.dtype)

    if skipna and mask is not None:
        # Zero the missing entries on a copy so they do not affect the sums.
        values = values.copy()
        np.putmask(values, mask, 0)
    elif not skipna and mask is not None and mask.any():
        return np.nan

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna and mask is not None:
        # Masked positions would otherwise contribute ``-mean`` to the moments.
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted**2
    adjusted3 = adjusted2 * adjusted
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m3 = adjusted3.sum(axis, dtype=np.float64)

    # floating point error
    #
    # #18044 in _libs/windows.pyx calc_skew follow this behavior
    # to fix the fperr to treat m2 <1e-14 as zero
    m2 = _zero_out_fperr(m2)
    m3 = _zero_out_fperr(m3)

    with np.errstate(invalid="ignore", divide="ignore"):
        result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)

    # ``values`` is guaranteed to be a float array at this point.
    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype, copy=False)

    if isinstance(result, np.ndarray):
        result = np.where(m2 == 0, 0, result)
        result[count < 3] = np.nan
    else:
        # Return a zero of the working float dtype rather than a Python int,
        # so the scalar result type matches the documented float return.
        result = dtype.type(0) if m2 == 0 else result
        if count < 3:
            return np.nan

    return result
@disallow("M8", "m8")
@maybe_operate_rowwise
def nankurt(
    values: np.ndarray,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    mask: npt.NDArray[np.bool_] | None = None,
) -> float:
    """
    Compute the sample excess kurtosis.

    The statistic computed here is the adjusted Fisher-Pearson standardized
    moment coefficient G2, computed directly from the second and fourth
    central moment.

    Parameters
    ----------
    values : ndarray
    axis : int, optional
    skipna : bool, default True
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    result : float64
        Unless input is a float array, in which case use the same
        precision as the input array.

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, np.nan, 1, 3, 2])
    >>> nanops.nankurt(s)
    -1.2892561983471076
    """
    # error: Incompatible types in assignment (expression has type "Union[Any,
    # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
    values = extract_array(values, extract_numpy=True)  # type: ignore[assignment]
    mask = _maybe_get_mask(values, skipna, mask)
    if not is_float_dtype(values.dtype):
        values = values.astype("f8")
        count = _get_counts(values.shape, mask, axis)
    else:
        count = _get_counts(values.shape, mask, axis, dtype=values.dtype)

    if skipna and mask is not None:
        # Zero the missing entries on a copy so they do not affect the sums.
        values = values.copy()
        np.putmask(values, mask, 0)
    elif not skipna and mask is not None and mask.any():
        return np.nan

    mean = values.sum(axis, dtype=np.float64) / count
    if axis is not None:
        mean = np.expand_dims(mean, axis)

    adjusted = values - mean
    if skipna and mask is not None:
        # Masked positions would otherwise contribute ``-mean`` to the moments.
        np.putmask(adjusted, mask, 0)
    adjusted2 = adjusted**2
    adjusted4 = adjusted2**2
    m2 = adjusted2.sum(axis, dtype=np.float64)
    m4 = adjusted4.sum(axis, dtype=np.float64)

    with np.errstate(invalid="ignore", divide="ignore"):
        adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
        numerator = count * (count + 1) * (count - 1) * m4
        denominator = (count - 2) * (count - 3) * m2**2

    # floating point error
    #
    # #18044 in _libs/windows.pyx calc_kurt follow this behavior
    # to fix the fperr to treat denom <1e-14 as zero
    numerator = _zero_out_fperr(numerator)
    denominator = _zero_out_fperr(denominator)

    if not isinstance(denominator, np.ndarray):
        # if ``denom`` is a scalar, check these corner cases first before
        # doing division
        if count < 4:
            return np.nan
        if denominator == 0:
            # Return a zero of the working float dtype rather than a Python
            # int, so the scalar result matches the documented float return.
            return values.dtype.type(0)

    with np.errstate(invalid="ignore", divide="ignore"):
        result = numerator / denominator - adj

    dtype = values.dtype
    if is_float_dtype(dtype):
        result = result.astype(dtype, copy=False)

    if isinstance(result, np.ndarray):
        result = np.where(denominator == 0, 0, result)
        result[count < 4] = np.nan

    return result
@disallow("M8", "m8")
@maybe_operate_rowwise
def nanprod(
    values: np.ndarray,
    *,
    axis: AxisInt | None = None,
    skipna: bool = True,
    min_count: int = 0,
    mask: npt.NDArray[np.bool_] | None = None,
) -> float:
    """
    Compute the product along the given axis, treating NaNs as 1.

    Parameters
    ----------
    values : ndarray[dtype]
    axis : int, optional
    skipna : bool, default True
    min_count: int, default 0
    mask : ndarray[bool], optional
        nan-mask if known

    Returns
    -------
    Dtype
        The product of all elements on a given axis. ( NaNs are treated as 1)

    Examples
    --------
    >>> from pandas.core import nanops
    >>> s = pd.Series([1, 2, 3, np.nan])
    >>> nanops.nanprod(s)
    6.0
    """
    mask = _maybe_get_mask(values, skipna, mask)

    should_fill = skipna and mask is not None
    if should_fill:
        # Replace missing entries with the multiplicative identity on a copy.
        values = values.copy()
        values[mask] = 1

    product = values.prod(axis)
    # error: Incompatible return value type (got "Union[ndarray, float]", expected
    # "float")
    return _maybe_null_out(  # type: ignore[return-value]
        product, axis, mask, values.shape, min_count=min_count
    )
- def _maybe_arg_null_out(
- result: np.ndarray,
- axis: AxisInt | None,
- mask: npt.NDArray[np.bool_] | None,
- skipna: bool,
- ) -> np.ndarray | int:
- # helper function for nanargmin/nanargmax
- if mask is None:
- return result
- if axis is None or not getattr(result, "ndim", False):
- if skipna:
- if mask.all():
- return -1
- else:
- if mask.any():
- return -1
- else:
- if skipna:
- na_mask = mask.all(axis)
- else:
- na_mask = mask.any(axis)
- if na_mask.any():
- result[na_mask] = -1
- return result
- def _get_counts(
- values_shape: Shape,
- mask: npt.NDArray[np.bool_] | None,
- axis: AxisInt | None,
- dtype: np.dtype = np.dtype(np.float64),
- ) -> float | np.ndarray:
- """
- Get the count of non-null values along an axis
- Parameters
- ----------
- values_shape : tuple of int
- shape tuple from values ndarray, used if mask is None
- mask : Optional[ndarray[bool]]
- locations in values that should be considered missing
- axis : Optional[int]
- axis to count along
- dtype : type, optional
- type to use for count
- Returns
- -------
- count : scalar or array
- """
- if axis is None:
- if mask is not None:
- n = mask.size - mask.sum()
- else:
- n = np.prod(values_shape)
- return dtype.type(n)
- if mask is not None:
- count = mask.shape[axis] - mask.sum(axis)
- else:
- count = values_shape[axis]
- if is_scalar(count):
- return dtype.type(count)
- return count.astype(dtype, copy=False)
- def _maybe_null_out(
- result: np.ndarray | float | NaTType,
- axis: AxisInt | None,
- mask: npt.NDArray[np.bool_] | None,
- shape: tuple[int, ...],
- min_count: int = 1,
- ) -> np.ndarray | float | NaTType:
- """
- Returns
- -------
- Dtype
- The product of all elements on a given axis. ( NaNs are treated as 1)
- """
- if mask is None and min_count == 0:
- # nothing to check; short-circuit
- return result
- if axis is not None and isinstance(result, np.ndarray):
- if mask is not None:
- null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
- else:
- # we have no nulls, kept mask=None in _maybe_get_mask
- below_count = shape[axis] - min_count < 0
- new_shape = shape[:axis] + shape[axis + 1 :]
- null_mask = np.broadcast_to(below_count, new_shape)
- if np.any(null_mask):
- if is_numeric_dtype(result):
- if np.iscomplexobj(result):
- result = result.astype("c16")
- elif not is_float_dtype(result):
- result = result.astype("f8", copy=False)
- result[null_mask] = np.nan
- else:
- # GH12941, use None to auto cast null
- result[null_mask] = None
- elif result is not NaT:
- if check_below_min_count(shape, mask, min_count):
- result_dtype = getattr(result, "dtype", None)
- if is_float_dtype(result_dtype):
- # error: Item "None" of "Optional[Any]" has no attribute "type"
- result = result_dtype.type("nan") # type: ignore[union-attr]
- else:
- result = np.nan
- return result
def check_below_min_count(
    shape: tuple[int, ...], mask: npt.NDArray[np.bool_] | None, min_count: int
) -> bool:
    """
    Check for the `min_count` keyword.

    Returns True if below `min_count` (when a missing value should be
    returned from the reduction).

    Parameters
    ----------
    shape : tuple
        The shape of the values (`values.shape`).
    mask : ndarray[bool] or None
        Boolean numpy array (typically of same shape as `shape`) or None.
    min_count : int
        Keyword passed through from sum/prod call.

    Returns
    -------
    bool
    """
    if min_count <= 0:
        # No minimum requested: never below the threshold.
        return False

    if mask is None:
        # no missing values, only check size
        non_nulls = np.prod(shape)
    else:
        non_nulls = mask.size - mask.sum()
    return bool(non_nulls < min_count)
- def _zero_out_fperr(arg):
- # #18044 reference this behavior to fix rolling skew/kurt issue
- if isinstance(arg, np.ndarray):
- with np.errstate(invalid="ignore"):
- return np.where(np.abs(arg) < 1e-14, 0, arg)
- else:
- return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
@disallow("M8", "m8")
def nancorr(
    a: np.ndarray,
    b: np.ndarray,
    *,
    method: CorrelationMethod = "pearson",
    min_periods: int | None = None,
) -> float:
    """
    Compute the correlation of ``a`` and ``b`` over their pairwise non-NA entries.

    Parameters
    ----------
    a, b : ndarray
        Must have the same length.
    method : {"pearson", "kendall", "spearman"} or callable, default "pearson"
    min_periods : int, optional
        Minimum number of valid pairs; treated as 1 when None.

    Returns
    -------
    float
        NaN when fewer than ``min_periods`` valid pairs remain.

    Raises
    ------
    AssertionError
        If ``a`` and ``b`` differ in length.
    """
    if len(a) != len(b):
        raise AssertionError("Operands to nancorr must have same size")

    required = 1 if min_periods is None else min_periods

    # Keep only the positions where both operands are present.
    both_present = notna(a) & notna(b)
    if not both_present.all():
        a, b = a[both_present], b[both_present]

    if len(a) < required:
        return np.nan

    corr_func = get_corr_func(method)
    return corr_func(a, b)
def get_corr_func(
    method: CorrelationMethod,
) -> Callable[[np.ndarray, np.ndarray], float]:
    """
    Resolve a correlation method to a two-argument function.

    Parameters
    ----------
    method : {"kendall", "spearman", "pearson"} or callable
        A callable is returned unchanged.

    Returns
    -------
    callable
        Function taking two arrays and returning the correlation.

    Raises
    ------
    ValueError
        If ``method`` is neither a known name nor callable.
    """
    if method == "kendall":
        # scipy is imported lazily, only when this method is requested.
        from scipy.stats import kendalltau

        def func(a, b):
            tau = kendalltau(a, b)
            return tau[0]

        return func

    if method == "spearman":
        from scipy.stats import spearmanr

        def func(a, b):
            rho = spearmanr(a, b)
            return rho[0]

        return func

    if method == "pearson":

        def func(a, b):
            matrix = np.corrcoef(a, b)
            return matrix[0, 1]

        return func

    if callable(method):
        return method

    raise ValueError(
        f"Unknown method '{method}', expected one of "
        "'kendall', 'spearman', 'pearson', or callable"
    )
@disallow("M8", "m8")
def nancov(
    a: np.ndarray,
    b: np.ndarray,
    *,
    min_periods: int | None = None,
    ddof: int | None = 1,
) -> float:
    """
    Compute the covariance of ``a`` and ``b`` over their pairwise non-NA entries.

    Parameters
    ----------
    a, b : ndarray
        Must have the same length.
    min_periods : int, optional
        Minimum number of valid pairs; treated as 1 when None.
    ddof : int, optional, default 1
        Forwarded to ``np.cov``.

    Returns
    -------
    float
        NaN when fewer than ``min_periods`` valid pairs remain.

    Raises
    ------
    AssertionError
        If ``a`` and ``b`` differ in length.
    """
    if len(a) != len(b):
        raise AssertionError("Operands to nancov must have same size")

    required = 1 if min_periods is None else min_periods

    # Keep only the positions where both operands are present.
    both_present = notna(a) & notna(b)
    if not both_present.all():
        a, b = a[both_present], b[both_present]

    if len(a) < required:
        return np.nan

    cov_matrix = np.cov(a, b, ddof=ddof)
    return cov_matrix[0, 1]
- def _ensure_numeric(x):
- if isinstance(x, np.ndarray):
- if is_integer_dtype(x) or is_bool_dtype(x):
- x = x.astype(np.float64)
- elif is_object_dtype(x):
- try:
- x = x.astype(np.complex128)
- except (TypeError, ValueError):
- try:
- x = x.astype(np.float64)
- except ValueError as err:
- # GH#29941 we get here with object arrays containing strs
- raise TypeError(f"Could not convert {x} to numeric") from err
- else:
- if not np.any(np.imag(x)):
- x = x.real
- elif not (is_float(x) or is_integer(x) or is_complex(x)):
- try:
- x = float(x)
- except (TypeError, ValueError):
- # e.g. "1+1j" or "foo"
- try:
- x = complex(x)
- except ValueError as err:
- # e.g. "foo"
- raise TypeError(f"Could not convert {x} to numeric") from err
- return x
- # NA-friendly array comparisons
def make_nancomp(op):
    """
    Wrap a binary comparison so that positions with an NA operand yield NaN.

    Parameters
    ----------
    op : callable
        Binary comparison such as ``operator.gt``.

    Returns
    -------
    callable
        ``f(x, y)`` returning ``op(x, y)``.  When any position is NA in ``x``
        or ``y``, a boolean result is first cast to object dtype and NaN is
        written at those positions.
    """

    def f(x, y):
        na_positions = isna(x) | isna(y)

        # Comparisons involving NaN may emit floating point warnings.
        with np.errstate(all="ignore"):
            result = op(x, y)

        if not na_positions.any():
            return result

        if is_bool_dtype(result):
            # A boolean array cannot hold NaN.
            result = result.astype("O")
        np.putmask(result, na_positions, np.nan)
        return result

    return f
# NA-aware versions of the six rich comparison operators.
nangt, nange, nanlt, nanle, naneq, nanne = (
    make_nancomp(operator.gt),
    make_nancomp(operator.ge),
    make_nancomp(operator.lt),
    make_nancomp(operator.le),
    make_nancomp(operator.eq),
    make_nancomp(operator.ne),
)
def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike:
    """
    Cumulative function with skipna support.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
    accum_func : {np.cumprod, np.maximum.accumulate, np.cumsum, np.minimum.accumulate}
    skipna : bool

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    # For each supported function: the value substituted for missing entries
    # before accumulating, and the value written back at those positions.
    fill_values = {
        np.cumprod: (1.0, np.nan),
        np.maximum.accumulate: (-np.inf, np.nan),
        np.cumsum: (0.0, np.nan),
        np.minimum.accumulate: (np.inf, np.nan),
    }
    mask_a, mask_b = fill_values[accum_func]

    # This should go through ea interface
    assert values.dtype.kind not in ["m", "M"]

    # We will be applying this function to block values
    if not skipna or issubclass(values.dtype.type, (np.integer, np.bool_)):
        # No masking is applied for integer/bool values or when skipna=False.
        return accum_func(values, axis=0)

    vals = values.copy()
    mask = isna(vals)
    vals[mask] = mask_a
    result = accum_func(vals, axis=0)
    result[mask] = mask_b
    return result
|