nanops.py 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767
  1. from __future__ import annotations
  2. import functools
  3. import itertools
  4. import operator
  5. from typing import (
  6. Any,
  7. Callable,
  8. cast,
  9. )
  10. import warnings
  11. import numpy as np
  12. from pandas._config import get_option
  13. from pandas._libs import (
  14. NaT,
  15. NaTType,
  16. iNaT,
  17. lib,
  18. )
  19. from pandas._typing import (
  20. ArrayLike,
  21. AxisInt,
  22. CorrelationMethod,
  23. Dtype,
  24. DtypeObj,
  25. F,
  26. Scalar,
  27. Shape,
  28. npt,
  29. )
  30. from pandas.compat._optional import import_optional_dependency
  31. from pandas.util._exceptions import find_stack_level
  32. from pandas.core.dtypes.common import (
  33. is_any_int_dtype,
  34. is_bool_dtype,
  35. is_complex,
  36. is_datetime64_any_dtype,
  37. is_float,
  38. is_float_dtype,
  39. is_integer,
  40. is_integer_dtype,
  41. is_numeric_dtype,
  42. is_object_dtype,
  43. is_scalar,
  44. is_timedelta64_dtype,
  45. needs_i8_conversion,
  46. pandas_dtype,
  47. )
  48. from pandas.core.dtypes.dtypes import PeriodDtype
  49. from pandas.core.dtypes.missing import (
  50. isna,
  51. na_value_for_dtype,
  52. notna,
  53. )
  54. from pandas.core.construction import extract_array
  55. bn = import_optional_dependency("bottleneck", errors="warn")
  56. _BOTTLENECK_INSTALLED = bn is not None
  57. _USE_BOTTLENECK = False
  58. def set_use_bottleneck(v: bool = True) -> None:
  59. # set/unset to use bottleneck
  60. global _USE_BOTTLENECK
  61. if _BOTTLENECK_INSTALLED:
  62. _USE_BOTTLENECK = v
  63. set_use_bottleneck(get_option("compute.use_bottleneck"))
  64. class disallow:
  65. def __init__(self, *dtypes: Dtype) -> None:
  66. super().__init__()
  67. self.dtypes = tuple(pandas_dtype(dtype).type for dtype in dtypes)
  68. def check(self, obj) -> bool:
  69. return hasattr(obj, "dtype") and issubclass(obj.dtype.type, self.dtypes)
  70. def __call__(self, f: F) -> F:
  71. @functools.wraps(f)
  72. def _f(*args, **kwargs):
  73. obj_iter = itertools.chain(args, kwargs.values())
  74. if any(self.check(obj) for obj in obj_iter):
  75. f_name = f.__name__.replace("nan", "")
  76. raise TypeError(
  77. f"reduction operation '{f_name}' not allowed for this dtype"
  78. )
  79. try:
  80. with np.errstate(invalid="ignore"):
  81. return f(*args, **kwargs)
  82. except ValueError as e:
  83. # we want to transform an object array
  84. # ValueError message to the more typical TypeError
  85. # e.g. this is normally a disallowed function on
  86. # object arrays that contain strings
  87. if is_object_dtype(args[0]):
  88. raise TypeError(e) from e
  89. raise
  90. return cast(F, _f)
  91. class bottleneck_switch:
  92. def __init__(self, name=None, **kwargs) -> None:
  93. self.name = name
  94. self.kwargs = kwargs
  95. def __call__(self, alt: F) -> F:
  96. bn_name = self.name or alt.__name__
  97. try:
  98. bn_func = getattr(bn, bn_name)
  99. except (AttributeError, NameError): # pragma: no cover
  100. bn_func = None
  101. @functools.wraps(alt)
  102. def f(
  103. values: np.ndarray,
  104. *,
  105. axis: AxisInt | None = None,
  106. skipna: bool = True,
  107. **kwds,
  108. ):
  109. if len(self.kwargs) > 0:
  110. for k, v in self.kwargs.items():
  111. if k not in kwds:
  112. kwds[k] = v
  113. if values.size == 0 and kwds.get("min_count") is None:
  114. # We are empty, returning NA for our type
  115. # Only applies for the default `min_count` of None
  116. # since that affects how empty arrays are handled.
  117. # TODO(GH-18976) update all the nanops methods to
  118. # correctly handle empty inputs and remove this check.
  119. # It *may* just be `var`
  120. return _na_for_min_count(values, axis)
  121. if _USE_BOTTLENECK and skipna and _bn_ok_dtype(values.dtype, bn_name):
  122. if kwds.get("mask", None) is None:
  123. # `mask` is not recognised by bottleneck, would raise
  124. # TypeError if called
  125. kwds.pop("mask", None)
  126. result = bn_func(values, axis=axis, **kwds)
  127. # prefer to treat inf/-inf as NA, but must compute the func
  128. # twice :(
  129. if _has_infs(result):
  130. result = alt(values, axis=axis, skipna=skipna, **kwds)
  131. else:
  132. result = alt(values, axis=axis, skipna=skipna, **kwds)
  133. else:
  134. result = alt(values, axis=axis, skipna=skipna, **kwds)
  135. return result
  136. return cast(F, f)
  137. def _bn_ok_dtype(dtype: DtypeObj, name: str) -> bool:
  138. # Bottleneck chokes on datetime64, PeriodDtype (or and EA)
  139. if not is_object_dtype(dtype) and not needs_i8_conversion(dtype):
  140. # GH 42878
  141. # Bottleneck uses naive summation leading to O(n) loss of precision
  142. # unlike numpy which implements pairwise summation, which has O(log(n)) loss
  143. # crossref: https://github.com/pydata/bottleneck/issues/379
  144. # GH 15507
  145. # bottleneck does not properly upcast during the sum
  146. # so can overflow
  147. # GH 9422
  148. # further we also want to preserve NaN when all elements
  149. # are NaN, unlike bottleneck/numpy which consider this
  150. # to be 0
  151. return name not in ["nansum", "nanprod", "nanmean"]
  152. return False
  153. def _has_infs(result) -> bool:
  154. if isinstance(result, np.ndarray):
  155. if result.dtype in ("f8", "f4"):
  156. # Note: outside of an nanops-specific test, we always have
  157. # result.ndim == 1, so there is no risk of this ravel making a copy.
  158. return lib.has_infs(result.ravel("K"))
  159. try:
  160. return np.isinf(result).any()
  161. except (TypeError, NotImplementedError):
  162. # if it doesn't support infs, then it can't have infs
  163. return False
  164. def _get_fill_value(
  165. dtype: DtypeObj, fill_value: Scalar | None = None, fill_value_typ=None
  166. ):
  167. """return the correct fill value for the dtype of the values"""
  168. if fill_value is not None:
  169. return fill_value
  170. if _na_ok_dtype(dtype):
  171. if fill_value_typ is None:
  172. return np.nan
  173. else:
  174. if fill_value_typ == "+inf":
  175. return np.inf
  176. else:
  177. return -np.inf
  178. else:
  179. if fill_value_typ == "+inf":
  180. # need the max int here
  181. return lib.i8max
  182. else:
  183. return iNaT
  184. def _maybe_get_mask(
  185. values: np.ndarray, skipna: bool, mask: npt.NDArray[np.bool_] | None
  186. ) -> npt.NDArray[np.bool_] | None:
  187. """
  188. Compute a mask if and only if necessary.
  189. This function will compute a mask iff it is necessary. Otherwise,
  190. return the provided mask (potentially None) when a mask does not need to be
  191. computed.
  192. A mask is never necessary if the values array is of boolean or integer
  193. dtypes, as these are incapable of storing NaNs. If passing a NaN-capable
  194. dtype that is interpretable as either boolean or integer data (eg,
  195. timedelta64), a mask must be provided.
  196. If the skipna parameter is False, a new mask will not be computed.
  197. The mask is computed using isna() by default. Setting invert=True selects
  198. notna() as the masking function.
  199. Parameters
  200. ----------
  201. values : ndarray
  202. input array to potentially compute mask for
  203. skipna : bool
  204. boolean for whether NaNs should be skipped
  205. mask : Optional[ndarray]
  206. nan-mask if known
  207. Returns
  208. -------
  209. Optional[np.ndarray[bool]]
  210. """
  211. if mask is None:
  212. if is_bool_dtype(values.dtype) or is_integer_dtype(values.dtype):
  213. # Boolean data cannot contain nulls, so signal via mask being None
  214. return None
  215. if skipna or needs_i8_conversion(values.dtype):
  216. mask = isna(values)
  217. return mask
  218. def _get_values(
  219. values: np.ndarray,
  220. skipna: bool,
  221. fill_value: Any = None,
  222. fill_value_typ: str | None = None,
  223. mask: npt.NDArray[np.bool_] | None = None,
  224. ) -> tuple[np.ndarray, npt.NDArray[np.bool_] | None, np.dtype, np.dtype, Any]:
  225. """
  226. Utility to get the values view, mask, dtype, dtype_max, and fill_value.
  227. If both mask and fill_value/fill_value_typ are not None and skipna is True,
  228. the values array will be copied.
  229. For input arrays of boolean or integer dtypes, copies will only occur if a
  230. precomputed mask, a fill_value/fill_value_typ, and skipna=True are
  231. provided.
  232. Parameters
  233. ----------
  234. values : ndarray
  235. input array to potentially compute mask for
  236. skipna : bool
  237. boolean for whether NaNs should be skipped
  238. fill_value : Any
  239. value to fill NaNs with
  240. fill_value_typ : str
  241. Set to '+inf' or '-inf' to handle dtype-specific infinities
  242. mask : Optional[np.ndarray[bool]]
  243. nan-mask if known
  244. Returns
  245. -------
  246. values : ndarray
  247. Potential copy of input value array
  248. mask : Optional[ndarray[bool]]
  249. Mask for values, if deemed necessary to compute
  250. dtype : np.dtype
  251. dtype for values
  252. dtype_max : np.dtype
  253. platform independent dtype
  254. fill_value : Any
  255. fill value used
  256. """
  257. # In _get_values is only called from within nanops, and in all cases
  258. # with scalar fill_value. This guarantee is important for the
  259. # np.where call below
  260. assert is_scalar(fill_value)
  261. # error: Incompatible types in assignment (expression has type "Union[Any,
  262. # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
  263. values = extract_array(values, extract_numpy=True) # type: ignore[assignment]
  264. mask = _maybe_get_mask(values, skipna, mask)
  265. dtype = values.dtype
  266. datetimelike = False
  267. if needs_i8_conversion(values.dtype):
  268. # changing timedelta64/datetime64 to int64 needs to happen after
  269. # finding `mask` above
  270. values = np.asarray(values.view("i8"))
  271. datetimelike = True
  272. dtype_ok = _na_ok_dtype(dtype)
  273. # get our fill value (in case we need to provide an alternative
  274. # dtype for it)
  275. fill_value = _get_fill_value(
  276. dtype, fill_value=fill_value, fill_value_typ=fill_value_typ
  277. )
  278. if skipna and (mask is not None) and (fill_value is not None):
  279. if mask.any():
  280. if dtype_ok or datetimelike:
  281. values = values.copy()
  282. np.putmask(values, mask, fill_value)
  283. else:
  284. # np.where will promote if needed
  285. values = np.where(~mask, values, fill_value)
  286. # return a platform independent precision dtype
  287. dtype_max = dtype
  288. if is_integer_dtype(dtype) or is_bool_dtype(dtype):
  289. dtype_max = np.dtype(np.int64)
  290. elif is_float_dtype(dtype):
  291. dtype_max = np.dtype(np.float64)
  292. return values, mask, dtype, dtype_max, fill_value
  293. def _na_ok_dtype(dtype: DtypeObj) -> bool:
  294. if needs_i8_conversion(dtype):
  295. return False
  296. return not issubclass(dtype.type, np.integer)
  297. def _wrap_results(result, dtype: np.dtype, fill_value=None):
  298. """wrap our results if needed"""
  299. if result is NaT:
  300. pass
  301. elif is_datetime64_any_dtype(dtype):
  302. if fill_value is None:
  303. # GH#24293
  304. fill_value = iNaT
  305. if not isinstance(result, np.ndarray):
  306. assert not isna(fill_value), "Expected non-null fill_value"
  307. if result == fill_value:
  308. result = np.nan
  309. if isna(result):
  310. result = np.datetime64("NaT", "ns").astype(dtype)
  311. else:
  312. result = np.int64(result).view(dtype)
  313. # retain original unit
  314. result = result.astype(dtype, copy=False)
  315. else:
  316. # If we have float dtype, taking a view will give the wrong result
  317. result = result.astype(dtype)
  318. elif is_timedelta64_dtype(dtype):
  319. if not isinstance(result, np.ndarray):
  320. if result == fill_value or np.isnan(result):
  321. result = np.timedelta64("NaT").astype(dtype)
  322. elif np.fabs(result) > lib.i8max:
  323. # raise if we have a timedelta64[ns] which is too large
  324. raise ValueError("overflow in timedelta operation")
  325. else:
  326. # return a timedelta64 with the original unit
  327. result = np.int64(result).astype(dtype, copy=False)
  328. else:
  329. result = result.astype("m8[ns]").view(dtype)
  330. return result
  331. def _datetimelike_compat(func: F) -> F:
  332. """
  333. If we have datetime64 or timedelta64 values, ensure we have a correct
  334. mask before calling the wrapped function, then cast back afterwards.
  335. """
  336. @functools.wraps(func)
  337. def new_func(
  338. values: np.ndarray,
  339. *,
  340. axis: AxisInt | None = None,
  341. skipna: bool = True,
  342. mask: npt.NDArray[np.bool_] | None = None,
  343. **kwargs,
  344. ):
  345. orig_values = values
  346. datetimelike = values.dtype.kind in ["m", "M"]
  347. if datetimelike and mask is None:
  348. mask = isna(values)
  349. result = func(values, axis=axis, skipna=skipna, mask=mask, **kwargs)
  350. if datetimelike:
  351. result = _wrap_results(result, orig_values.dtype, fill_value=iNaT)
  352. if not skipna:
  353. assert mask is not None # checked above
  354. result = _mask_datetimelike_result(result, axis, mask, orig_values)
  355. return result
  356. return cast(F, new_func)
  357. def _na_for_min_count(values: np.ndarray, axis: AxisInt | None) -> Scalar | np.ndarray:
  358. """
  359. Return the missing value for `values`.
  360. Parameters
  361. ----------
  362. values : ndarray
  363. axis : int or None
  364. axis for the reduction, required if values.ndim > 1.
  365. Returns
  366. -------
  367. result : scalar or ndarray
  368. For 1-D values, returns a scalar of the correct missing type.
  369. For 2-D values, returns a 1-D array where each element is missing.
  370. """
  371. # we either return np.nan or pd.NaT
  372. if is_numeric_dtype(values):
  373. values = values.astype("float64")
  374. fill_value = na_value_for_dtype(values.dtype)
  375. if values.ndim == 1:
  376. return fill_value
  377. elif axis is None:
  378. return fill_value
  379. else:
  380. result_shape = values.shape[:axis] + values.shape[axis + 1 :]
  381. return np.full(result_shape, fill_value, dtype=values.dtype)
  382. def maybe_operate_rowwise(func: F) -> F:
  383. """
  384. NumPy operations on C-contiguous ndarrays with axis=1 can be
  385. very slow if axis 1 >> axis 0.
  386. Operate row-by-row and concatenate the results.
  387. """
  388. @functools.wraps(func)
  389. def newfunc(values: np.ndarray, *, axis: AxisInt | None = None, **kwargs):
  390. if (
  391. axis == 1
  392. and values.ndim == 2
  393. and values.flags["C_CONTIGUOUS"]
  394. # only takes this path for wide arrays (long dataframes), for threshold see
  395. # https://github.com/pandas-dev/pandas/pull/43311#issuecomment-974891737
  396. and (values.shape[1] / 1000) > values.shape[0]
  397. and values.dtype != object
  398. and values.dtype != bool
  399. ):
  400. arrs = list(values)
  401. if kwargs.get("mask") is not None:
  402. mask = kwargs.pop("mask")
  403. results = [
  404. func(arrs[i], mask=mask[i], **kwargs) for i in range(len(arrs))
  405. ]
  406. else:
  407. results = [func(x, **kwargs) for x in arrs]
  408. return np.array(results)
  409. return func(values, axis=axis, **kwargs)
  410. return cast(F, newfunc)
  411. def nanany(
  412. values: np.ndarray,
  413. *,
  414. axis: AxisInt | None = None,
  415. skipna: bool = True,
  416. mask: npt.NDArray[np.bool_] | None = None,
  417. ) -> bool:
  418. """
  419. Check if any elements along an axis evaluate to True.
  420. Parameters
  421. ----------
  422. values : ndarray
  423. axis : int, optional
  424. skipna : bool, default True
  425. mask : ndarray[bool], optional
  426. nan-mask if known
  427. Returns
  428. -------
  429. result : bool
  430. Examples
  431. --------
  432. >>> from pandas.core import nanops
  433. >>> s = pd.Series([1, 2])
  434. >>> nanops.nanany(s)
  435. True
  436. >>> from pandas.core import nanops
  437. >>> s = pd.Series([np.nan])
  438. >>> nanops.nanany(s)
  439. False
  440. """
  441. if needs_i8_conversion(values.dtype) and values.dtype.kind != "m":
  442. # GH#34479
  443. warnings.warn(
  444. "'any' with datetime64 dtypes is deprecated and will raise in a "
  445. "future version. Use (obj != pd.Timestamp(0)).any() instead.",
  446. FutureWarning,
  447. stacklevel=find_stack_level(),
  448. )
  449. values, _, _, _, _ = _get_values(values, skipna, fill_value=False, mask=mask)
  450. # For object type, any won't necessarily return
  451. # boolean values (numpy/numpy#4352)
  452. if is_object_dtype(values):
  453. values = values.astype(bool)
  454. # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
  455. # "bool")
  456. return values.any(axis) # type: ignore[return-value]
  457. def nanall(
  458. values: np.ndarray,
  459. *,
  460. axis: AxisInt | None = None,
  461. skipna: bool = True,
  462. mask: npt.NDArray[np.bool_] | None = None,
  463. ) -> bool:
  464. """
  465. Check if all elements along an axis evaluate to True.
  466. Parameters
  467. ----------
  468. values : ndarray
  469. axis : int, optional
  470. skipna : bool, default True
  471. mask : ndarray[bool], optional
  472. nan-mask if known
  473. Returns
  474. -------
  475. result : bool
  476. Examples
  477. --------
  478. >>> from pandas.core import nanops
  479. >>> s = pd.Series([1, 2, np.nan])
  480. >>> nanops.nanall(s)
  481. True
  482. >>> from pandas.core import nanops
  483. >>> s = pd.Series([1, 0])
  484. >>> nanops.nanall(s)
  485. False
  486. """
  487. if needs_i8_conversion(values.dtype) and values.dtype.kind != "m":
  488. # GH#34479
  489. warnings.warn(
  490. "'all' with datetime64 dtypes is deprecated and will raise in a "
  491. "future version. Use (obj != pd.Timestamp(0)).all() instead.",
  492. FutureWarning,
  493. stacklevel=find_stack_level(),
  494. )
  495. values, _, _, _, _ = _get_values(values, skipna, fill_value=True, mask=mask)
  496. # For object type, all won't necessarily return
  497. # boolean values (numpy/numpy#4352)
  498. if is_object_dtype(values):
  499. values = values.astype(bool)
  500. # error: Incompatible return value type (got "Union[bool_, ndarray]", expected
  501. # "bool")
  502. return values.all(axis) # type: ignore[return-value]
  503. @disallow("M8")
  504. @_datetimelike_compat
  505. @maybe_operate_rowwise
  506. def nansum(
  507. values: np.ndarray,
  508. *,
  509. axis: AxisInt | None = None,
  510. skipna: bool = True,
  511. min_count: int = 0,
  512. mask: npt.NDArray[np.bool_] | None = None,
  513. ) -> float:
  514. """
  515. Sum the elements along an axis ignoring NaNs
  516. Parameters
  517. ----------
  518. values : ndarray[dtype]
  519. axis : int, optional
  520. skipna : bool, default True
  521. min_count: int, default 0
  522. mask : ndarray[bool], optional
  523. nan-mask if known
  524. Returns
  525. -------
  526. result : dtype
  527. Examples
  528. --------
  529. >>> from pandas.core import nanops
  530. >>> s = pd.Series([1, 2, np.nan])
  531. >>> nanops.nansum(s)
  532. 3.0
  533. """
  534. values, mask, dtype, dtype_max, _ = _get_values(
  535. values, skipna, fill_value=0, mask=mask
  536. )
  537. dtype_sum = dtype_max
  538. if is_float_dtype(dtype):
  539. dtype_sum = dtype
  540. elif is_timedelta64_dtype(dtype):
  541. dtype_sum = np.dtype(np.float64)
  542. the_sum = values.sum(axis, dtype=dtype_sum)
  543. the_sum = _maybe_null_out(the_sum, axis, mask, values.shape, min_count=min_count)
  544. return the_sum
  545. def _mask_datetimelike_result(
  546. result: np.ndarray | np.datetime64 | np.timedelta64,
  547. axis: AxisInt | None,
  548. mask: npt.NDArray[np.bool_],
  549. orig_values: np.ndarray,
  550. ) -> np.ndarray | np.datetime64 | np.timedelta64 | NaTType:
  551. if isinstance(result, np.ndarray):
  552. # we need to apply the mask
  553. result = result.astype("i8").view(orig_values.dtype)
  554. axis_mask = mask.any(axis=axis)
  555. # error: Unsupported target for indexed assignment ("Union[ndarray[Any, Any],
  556. # datetime64, timedelta64]")
  557. result[axis_mask] = iNaT # type: ignore[index]
  558. else:
  559. if mask.any():
  560. return np.int64(iNaT).view(orig_values.dtype)
  561. return result
  562. @disallow(PeriodDtype)
  563. @bottleneck_switch()
  564. @_datetimelike_compat
  565. def nanmean(
  566. values: np.ndarray,
  567. *,
  568. axis: AxisInt | None = None,
  569. skipna: bool = True,
  570. mask: npt.NDArray[np.bool_] | None = None,
  571. ) -> float:
  572. """
  573. Compute the mean of the element along an axis ignoring NaNs
  574. Parameters
  575. ----------
  576. values : ndarray
  577. axis : int, optional
  578. skipna : bool, default True
  579. mask : ndarray[bool], optional
  580. nan-mask if known
  581. Returns
  582. -------
  583. float
  584. Unless input is a float array, in which case use the same
  585. precision as the input array.
  586. Examples
  587. --------
  588. >>> from pandas.core import nanops
  589. >>> s = pd.Series([1, 2, np.nan])
  590. >>> nanops.nanmean(s)
  591. 1.5
  592. """
  593. values, mask, dtype, dtype_max, _ = _get_values(
  594. values, skipna, fill_value=0, mask=mask
  595. )
  596. dtype_sum = dtype_max
  597. dtype_count = np.dtype(np.float64)
  598. # not using needs_i8_conversion because that includes period
  599. if dtype.kind in ["m", "M"]:
  600. dtype_sum = np.dtype(np.float64)
  601. elif is_integer_dtype(dtype):
  602. dtype_sum = np.dtype(np.float64)
  603. elif is_float_dtype(dtype):
  604. dtype_sum = dtype
  605. dtype_count = dtype
  606. count = _get_counts(values.shape, mask, axis, dtype=dtype_count)
  607. the_sum = _ensure_numeric(values.sum(axis, dtype=dtype_sum))
  608. if axis is not None and getattr(the_sum, "ndim", False):
  609. count = cast(np.ndarray, count)
  610. with np.errstate(all="ignore"):
  611. # suppress division by zero warnings
  612. the_mean = the_sum / count
  613. ct_mask = count == 0
  614. if ct_mask.any():
  615. the_mean[ct_mask] = np.nan
  616. else:
  617. the_mean = the_sum / count if count > 0 else np.nan
  618. return the_mean
  619. @bottleneck_switch()
  620. def nanmedian(values, *, axis: AxisInt | None = None, skipna: bool = True, mask=None):
  621. """
  622. Parameters
  623. ----------
  624. values : ndarray
  625. axis : int, optional
  626. skipna : bool, default True
  627. mask : ndarray[bool], optional
  628. nan-mask if known
  629. Returns
  630. -------
  631. result : float
  632. Unless input is a float array, in which case use the same
  633. precision as the input array.
  634. Examples
  635. --------
  636. >>> from pandas.core import nanops
  637. >>> s = pd.Series([1, np.nan, 2, 2])
  638. >>> nanops.nanmedian(s)
  639. 2.0
  640. """
  641. def get_median(x, _mask=None):
  642. if _mask is None:
  643. _mask = notna(x)
  644. else:
  645. _mask = ~_mask
  646. if not skipna and not _mask.all():
  647. return np.nan
  648. with warnings.catch_warnings():
  649. # Suppress RuntimeWarning about All-NaN slice
  650. warnings.filterwarnings(
  651. "ignore", "All-NaN slice encountered", RuntimeWarning
  652. )
  653. res = np.nanmedian(x[_mask])
  654. return res
  655. values, mask, dtype, _, _ = _get_values(values, skipna, mask=mask, fill_value=0)
  656. if not is_float_dtype(values.dtype):
  657. try:
  658. values = values.astype("f8")
  659. except ValueError as err:
  660. # e.g. "could not convert string to float: 'a'"
  661. raise TypeError(str(err)) from err
  662. if mask is not None:
  663. values[mask] = np.nan
  664. notempty = values.size
  665. # an array from a frame
  666. if values.ndim > 1 and axis is not None:
  667. # there's a non-empty array to apply over otherwise numpy raises
  668. if notempty:
  669. if not skipna:
  670. res = np.apply_along_axis(get_median, axis, values)
  671. else:
  672. # fastpath for the skipna case
  673. with warnings.catch_warnings():
  674. # Suppress RuntimeWarning about All-NaN slice
  675. warnings.filterwarnings(
  676. "ignore", "All-NaN slice encountered", RuntimeWarning
  677. )
  678. res = np.nanmedian(values, axis)
  679. else:
  680. # must return the correct shape, but median is not defined for the
  681. # empty set so return nans of shape "everything but the passed axis"
  682. # since "axis" is where the reduction would occur if we had a nonempty
  683. # array
  684. res = get_empty_reduction_result(values.shape, axis, np.float_, np.nan)
  685. else:
  686. # otherwise return a scalar value
  687. res = get_median(values, mask) if notempty else np.nan
  688. return _wrap_results(res, dtype)
  689. def get_empty_reduction_result(
  690. shape: tuple[int, ...],
  691. axis: AxisInt,
  692. dtype: np.dtype | type[np.floating],
  693. fill_value: Any,
  694. ) -> np.ndarray:
  695. """
  696. The result from a reduction on an empty ndarray.
  697. Parameters
  698. ----------
  699. shape : Tuple[int]
  700. axis : int
  701. dtype : np.dtype
  702. fill_value : Any
  703. Returns
  704. -------
  705. np.ndarray
  706. """
  707. shp = np.array(shape)
  708. dims = np.arange(len(shape))
  709. ret = np.empty(shp[dims != axis], dtype=dtype)
  710. ret.fill(fill_value)
  711. return ret
  712. def _get_counts_nanvar(
  713. values_shape: Shape,
  714. mask: npt.NDArray[np.bool_] | None,
  715. axis: AxisInt | None,
  716. ddof: int,
  717. dtype: np.dtype = np.dtype(np.float64),
  718. ) -> tuple[float | np.ndarray, float | np.ndarray]:
  719. """
  720. Get the count of non-null values along an axis, accounting
  721. for degrees of freedom.
  722. Parameters
  723. ----------
  724. values_shape : Tuple[int, ...]
  725. shape tuple from values ndarray, used if mask is None
  726. mask : Optional[ndarray[bool]]
  727. locations in values that should be considered missing
  728. axis : Optional[int]
  729. axis to count along
  730. ddof : int
  731. degrees of freedom
  732. dtype : type, optional
  733. type to use for count
  734. Returns
  735. -------
  736. count : int, np.nan or np.ndarray
  737. d : int, np.nan or np.ndarray
  738. """
  739. count = _get_counts(values_shape, mask, axis, dtype=dtype)
  740. d = count - dtype.type(ddof)
  741. # always return NaN, never inf
  742. if is_scalar(count):
  743. if count <= ddof:
  744. count = np.nan
  745. d = np.nan
  746. else:
  747. # count is not narrowed by is_scalar check
  748. count = cast(np.ndarray, count)
  749. mask = count <= ddof
  750. if mask.any():
  751. np.putmask(d, mask, np.nan)
  752. np.putmask(count, mask, np.nan)
  753. return count, d
  754. @bottleneck_switch(ddof=1)
  755. def nanstd(
  756. values,
  757. *,
  758. axis: AxisInt | None = None,
  759. skipna: bool = True,
  760. ddof: int = 1,
  761. mask=None,
  762. ):
  763. """
  764. Compute the standard deviation along given axis while ignoring NaNs
  765. Parameters
  766. ----------
  767. values : ndarray
  768. axis : int, optional
  769. skipna : bool, default True
  770. ddof : int, default 1
  771. Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
  772. where N represents the number of elements.
  773. mask : ndarray[bool], optional
  774. nan-mask if known
  775. Returns
  776. -------
  777. result : float
  778. Unless input is a float array, in which case use the same
  779. precision as the input array.
  780. Examples
  781. --------
  782. >>> from pandas.core import nanops
  783. >>> s = pd.Series([1, np.nan, 2, 3])
  784. >>> nanops.nanstd(s)
  785. 1.0
  786. """
  787. if values.dtype == "M8[ns]":
  788. values = values.view("m8[ns]")
  789. orig_dtype = values.dtype
  790. values, mask, _, _, _ = _get_values(values, skipna, mask=mask)
  791. result = np.sqrt(nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask))
  792. return _wrap_results(result, orig_dtype)
  793. @disallow("M8", "m8")
  794. @bottleneck_switch(ddof=1)
  795. def nanvar(
  796. values,
  797. *,
  798. axis: AxisInt | None = None,
  799. skipna: bool = True,
  800. ddof: int = 1,
  801. mask=None,
  802. ):
  803. """
  804. Compute the variance along given axis while ignoring NaNs
  805. Parameters
  806. ----------
  807. values : ndarray
  808. axis : int, optional
  809. skipna : bool, default True
  810. ddof : int, default 1
  811. Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
  812. where N represents the number of elements.
  813. mask : ndarray[bool], optional
  814. nan-mask if known
  815. Returns
  816. -------
  817. result : float
  818. Unless input is a float array, in which case use the same
  819. precision as the input array.
  820. Examples
  821. --------
  822. >>> from pandas.core import nanops
  823. >>> s = pd.Series([1, np.nan, 2, 3])
  824. >>> nanops.nanvar(s)
  825. 1.0
  826. """
  827. values = extract_array(values, extract_numpy=True)
  828. dtype = values.dtype
  829. mask = _maybe_get_mask(values, skipna, mask)
  830. if is_any_int_dtype(dtype):
  831. values = values.astype("f8")
  832. if mask is not None:
  833. values[mask] = np.nan
  834. if is_float_dtype(values.dtype):
  835. count, d = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
  836. else:
  837. count, d = _get_counts_nanvar(values.shape, mask, axis, ddof)
  838. if skipna and mask is not None:
  839. values = values.copy()
  840. np.putmask(values, mask, 0)
  841. # xref GH10242
  842. # Compute variance via two-pass algorithm, which is stable against
  843. # cancellation errors and relatively accurate for small numbers of
  844. # observations.
  845. #
  846. # See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
  847. avg = _ensure_numeric(values.sum(axis=axis, dtype=np.float64)) / count
  848. if axis is not None:
  849. avg = np.expand_dims(avg, axis)
  850. sqr = _ensure_numeric((avg - values) ** 2)
  851. if mask is not None:
  852. np.putmask(sqr, mask, 0)
  853. result = sqr.sum(axis=axis, dtype=np.float64) / d
  854. # Return variance as np.float64 (the datatype used in the accumulator),
  855. # unless we were dealing with a float array, in which case use the same
  856. # precision as the original values array.
  857. if is_float_dtype(dtype):
  858. result = result.astype(dtype, copy=False)
  859. return result
  860. @disallow("M8", "m8")
  861. def nansem(
  862. values: np.ndarray,
  863. *,
  864. axis: AxisInt | None = None,
  865. skipna: bool = True,
  866. ddof: int = 1,
  867. mask: npt.NDArray[np.bool_] | None = None,
  868. ) -> float:
  869. """
  870. Compute the standard error in the mean along given axis while ignoring NaNs
  871. Parameters
  872. ----------
  873. values : ndarray
  874. axis : int, optional
  875. skipna : bool, default True
  876. ddof : int, default 1
  877. Delta Degrees of Freedom. The divisor used in calculations is N - ddof,
  878. where N represents the number of elements.
  879. mask : ndarray[bool], optional
  880. nan-mask if known
  881. Returns
  882. -------
  883. result : float64
  884. Unless input is a float array, in which case use the same
  885. precision as the input array.
  886. Examples
  887. --------
  888. >>> from pandas.core import nanops
  889. >>> s = pd.Series([1, np.nan, 2, 3])
  890. >>> nanops.nansem(s)
  891. 0.5773502691896258
  892. """
  893. # This checks if non-numeric-like data is passed with numeric_only=False
  894. # and raises a TypeError otherwise
  895. nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
  896. mask = _maybe_get_mask(values, skipna, mask)
  897. if not is_float_dtype(values.dtype):
  898. values = values.astype("f8")
  899. if not skipna and mask is not None and mask.any():
  900. return np.nan
  901. count, _ = _get_counts_nanvar(values.shape, mask, axis, ddof, values.dtype)
  902. var = nanvar(values, axis=axis, skipna=skipna, ddof=ddof, mask=mask)
  903. return np.sqrt(var) / np.sqrt(count)
  904. def _nanminmax(meth, fill_value_typ):
  905. @bottleneck_switch(name=f"nan{meth}")
  906. @_datetimelike_compat
  907. def reduction(
  908. values: np.ndarray,
  909. *,
  910. axis: AxisInt | None = None,
  911. skipna: bool = True,
  912. mask: npt.NDArray[np.bool_] | None = None,
  913. ) -> Dtype:
  914. values, mask, dtype, dtype_max, fill_value = _get_values(
  915. values, skipna, fill_value_typ=fill_value_typ, mask=mask
  916. )
  917. if (axis is not None and values.shape[axis] == 0) or values.size == 0:
  918. try:
  919. result = getattr(values, meth)(axis, dtype=dtype_max)
  920. result.fill(np.nan)
  921. except (AttributeError, TypeError, ValueError):
  922. result = np.nan
  923. else:
  924. result = getattr(values, meth)(axis)
  925. result = _maybe_null_out(result, axis, mask, values.shape)
  926. return result
  927. return reduction
# Module-level min/max reductions, both built by the ``_nanminmax`` factory.
# They differ only in the ndarray method that is invoked and in the
# ``fill_value_typ`` that the factory forwards to ``_get_values``.
nanmin = _nanminmax("min", fill_value_typ="+inf")
nanmax = _nanminmax("max", fill_value_typ="-inf")
  930. @disallow("O")
  931. def nanargmax(
  932. values: np.ndarray,
  933. *,
  934. axis: AxisInt | None = None,
  935. skipna: bool = True,
  936. mask: npt.NDArray[np.bool_] | None = None,
  937. ) -> int | np.ndarray:
  938. """
  939. Parameters
  940. ----------
  941. values : ndarray
  942. axis : int, optional
  943. skipna : bool, default True
  944. mask : ndarray[bool], optional
  945. nan-mask if known
  946. Returns
  947. -------
  948. result : int or ndarray[int]
  949. The index/indices of max value in specified axis or -1 in the NA case
  950. Examples
  951. --------
  952. >>> from pandas.core import nanops
  953. >>> arr = np.array([1, 2, 3, np.nan, 4])
  954. >>> nanops.nanargmax(arr)
  955. 4
  956. >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
  957. >>> arr[2:, 2] = np.nan
  958. >>> arr
  959. array([[ 0., 1., 2.],
  960. [ 3., 4., 5.],
  961. [ 6., 7., nan],
  962. [ 9., 10., nan]])
  963. >>> nanops.nanargmax(arr, axis=1)
  964. array([2, 2, 1, 1])
  965. """
  966. values, mask, _, _, _ = _get_values(values, True, fill_value_typ="-inf", mask=mask)
  967. # error: Need type annotation for 'result'
  968. result = values.argmax(axis) # type: ignore[var-annotated]
  969. result = _maybe_arg_null_out(result, axis, mask, skipna)
  970. return result
  971. @disallow("O")
  972. def nanargmin(
  973. values: np.ndarray,
  974. *,
  975. axis: AxisInt | None = None,
  976. skipna: bool = True,
  977. mask: npt.NDArray[np.bool_] | None = None,
  978. ) -> int | np.ndarray:
  979. """
  980. Parameters
  981. ----------
  982. values : ndarray
  983. axis : int, optional
  984. skipna : bool, default True
  985. mask : ndarray[bool], optional
  986. nan-mask if known
  987. Returns
  988. -------
  989. result : int or ndarray[int]
  990. The index/indices of min value in specified axis or -1 in the NA case
  991. Examples
  992. --------
  993. >>> from pandas.core import nanops
  994. >>> arr = np.array([1, 2, 3, np.nan, 4])
  995. >>> nanops.nanargmin(arr)
  996. 0
  997. >>> arr = np.array(range(12), dtype=np.float64).reshape(4, 3)
  998. >>> arr[2:, 0] = np.nan
  999. >>> arr
  1000. array([[ 0., 1., 2.],
  1001. [ 3., 4., 5.],
  1002. [nan, 7., 8.],
  1003. [nan, 10., 11.]])
  1004. >>> nanops.nanargmin(arr, axis=1)
  1005. array([0, 0, 1, 1])
  1006. """
  1007. values, mask, _, _, _ = _get_values(values, True, fill_value_typ="+inf", mask=mask)
  1008. # error: Need type annotation for 'result'
  1009. result = values.argmin(axis) # type: ignore[var-annotated]
  1010. result = _maybe_arg_null_out(result, axis, mask, skipna)
  1011. return result
  1012. @disallow("M8", "m8")
  1013. @maybe_operate_rowwise
  1014. def nanskew(
  1015. values: np.ndarray,
  1016. *,
  1017. axis: AxisInt | None = None,
  1018. skipna: bool = True,
  1019. mask: npt.NDArray[np.bool_] | None = None,
  1020. ) -> float:
  1021. """
  1022. Compute the sample skewness.
  1023. The statistic computed here is the adjusted Fisher-Pearson standardized
  1024. moment coefficient G1. The algorithm computes this coefficient directly
  1025. from the second and third central moment.
  1026. Parameters
  1027. ----------
  1028. values : ndarray
  1029. axis : int, optional
  1030. skipna : bool, default True
  1031. mask : ndarray[bool], optional
  1032. nan-mask if known
  1033. Returns
  1034. -------
  1035. result : float64
  1036. Unless input is a float array, in which case use the same
  1037. precision as the input array.
  1038. Examples
  1039. --------
  1040. >>> from pandas.core import nanops
  1041. >>> s = pd.Series([1, np.nan, 1, 2])
  1042. >>> nanops.nanskew(s)
  1043. 1.7320508075688787
  1044. """
  1045. # error: Incompatible types in assignment (expression has type "Union[Any,
  1046. # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
  1047. values = extract_array(values, extract_numpy=True) # type: ignore[assignment]
  1048. mask = _maybe_get_mask(values, skipna, mask)
  1049. if not is_float_dtype(values.dtype):
  1050. values = values.astype("f8")
  1051. count = _get_counts(values.shape, mask, axis)
  1052. else:
  1053. count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
  1054. if skipna and mask is not None:
  1055. values = values.copy()
  1056. np.putmask(values, mask, 0)
  1057. elif not skipna and mask is not None and mask.any():
  1058. return np.nan
  1059. mean = values.sum(axis, dtype=np.float64) / count
  1060. if axis is not None:
  1061. mean = np.expand_dims(mean, axis)
  1062. adjusted = values - mean
  1063. if skipna and mask is not None:
  1064. np.putmask(adjusted, mask, 0)
  1065. adjusted2 = adjusted**2
  1066. adjusted3 = adjusted2 * adjusted
  1067. m2 = adjusted2.sum(axis, dtype=np.float64)
  1068. m3 = adjusted3.sum(axis, dtype=np.float64)
  1069. # floating point error
  1070. #
  1071. # #18044 in _libs/windows.pyx calc_skew follow this behavior
  1072. # to fix the fperr to treat m2 <1e-14 as zero
  1073. m2 = _zero_out_fperr(m2)
  1074. m3 = _zero_out_fperr(m3)
  1075. with np.errstate(invalid="ignore", divide="ignore"):
  1076. result = (count * (count - 1) ** 0.5 / (count - 2)) * (m3 / m2**1.5)
  1077. dtype = values.dtype
  1078. if is_float_dtype(dtype):
  1079. result = result.astype(dtype, copy=False)
  1080. if isinstance(result, np.ndarray):
  1081. result = np.where(m2 == 0, 0, result)
  1082. result[count < 3] = np.nan
  1083. else:
  1084. result = 0 if m2 == 0 else result
  1085. if count < 3:
  1086. return np.nan
  1087. return result
  1088. @disallow("M8", "m8")
  1089. @maybe_operate_rowwise
  1090. def nankurt(
  1091. values: np.ndarray,
  1092. *,
  1093. axis: AxisInt | None = None,
  1094. skipna: bool = True,
  1095. mask: npt.NDArray[np.bool_] | None = None,
  1096. ) -> float:
  1097. """
  1098. Compute the sample excess kurtosis
  1099. The statistic computed here is the adjusted Fisher-Pearson standardized
  1100. moment coefficient G2, computed directly from the second and fourth
  1101. central moment.
  1102. Parameters
  1103. ----------
  1104. values : ndarray
  1105. axis : int, optional
  1106. skipna : bool, default True
  1107. mask : ndarray[bool], optional
  1108. nan-mask if known
  1109. Returns
  1110. -------
  1111. result : float64
  1112. Unless input is a float array, in which case use the same
  1113. precision as the input array.
  1114. Examples
  1115. --------
  1116. >>> from pandas.core import nanops
  1117. >>> s = pd.Series([1, np.nan, 1, 3, 2])
  1118. >>> nanops.nankurt(s)
  1119. -1.2892561983471076
  1120. """
  1121. # error: Incompatible types in assignment (expression has type "Union[Any,
  1122. # Union[ExtensionArray, ndarray]]", variable has type "ndarray")
  1123. values = extract_array(values, extract_numpy=True) # type: ignore[assignment]
  1124. mask = _maybe_get_mask(values, skipna, mask)
  1125. if not is_float_dtype(values.dtype):
  1126. values = values.astype("f8")
  1127. count = _get_counts(values.shape, mask, axis)
  1128. else:
  1129. count = _get_counts(values.shape, mask, axis, dtype=values.dtype)
  1130. if skipna and mask is not None:
  1131. values = values.copy()
  1132. np.putmask(values, mask, 0)
  1133. elif not skipna and mask is not None and mask.any():
  1134. return np.nan
  1135. mean = values.sum(axis, dtype=np.float64) / count
  1136. if axis is not None:
  1137. mean = np.expand_dims(mean, axis)
  1138. adjusted = values - mean
  1139. if skipna and mask is not None:
  1140. np.putmask(adjusted, mask, 0)
  1141. adjusted2 = adjusted**2
  1142. adjusted4 = adjusted2**2
  1143. m2 = adjusted2.sum(axis, dtype=np.float64)
  1144. m4 = adjusted4.sum(axis, dtype=np.float64)
  1145. with np.errstate(invalid="ignore", divide="ignore"):
  1146. adj = 3 * (count - 1) ** 2 / ((count - 2) * (count - 3))
  1147. numerator = count * (count + 1) * (count - 1) * m4
  1148. denominator = (count - 2) * (count - 3) * m2**2
  1149. # floating point error
  1150. #
  1151. # #18044 in _libs/windows.pyx calc_kurt follow this behavior
  1152. # to fix the fperr to treat denom <1e-14 as zero
  1153. numerator = _zero_out_fperr(numerator)
  1154. denominator = _zero_out_fperr(denominator)
  1155. if not isinstance(denominator, np.ndarray):
  1156. # if ``denom`` is a scalar, check these corner cases first before
  1157. # doing division
  1158. if count < 4:
  1159. return np.nan
  1160. if denominator == 0:
  1161. return 0
  1162. with np.errstate(invalid="ignore", divide="ignore"):
  1163. result = numerator / denominator - adj
  1164. dtype = values.dtype
  1165. if is_float_dtype(dtype):
  1166. result = result.astype(dtype, copy=False)
  1167. if isinstance(result, np.ndarray):
  1168. result = np.where(denominator == 0, 0, result)
  1169. result[count < 4] = np.nan
  1170. return result
  1171. @disallow("M8", "m8")
  1172. @maybe_operate_rowwise
  1173. def nanprod(
  1174. values: np.ndarray,
  1175. *,
  1176. axis: AxisInt | None = None,
  1177. skipna: bool = True,
  1178. min_count: int = 0,
  1179. mask: npt.NDArray[np.bool_] | None = None,
  1180. ) -> float:
  1181. """
  1182. Parameters
  1183. ----------
  1184. values : ndarray[dtype]
  1185. axis : int, optional
  1186. skipna : bool, default True
  1187. min_count: int, default 0
  1188. mask : ndarray[bool], optional
  1189. nan-mask if known
  1190. Returns
  1191. -------
  1192. Dtype
  1193. The product of all elements on a given axis. ( NaNs are treated as 1)
  1194. Examples
  1195. --------
  1196. >>> from pandas.core import nanops
  1197. >>> s = pd.Series([1, 2, 3, np.nan])
  1198. >>> nanops.nanprod(s)
  1199. 6.0
  1200. """
  1201. mask = _maybe_get_mask(values, skipna, mask)
  1202. if skipna and mask is not None:
  1203. values = values.copy()
  1204. values[mask] = 1
  1205. result = values.prod(axis)
  1206. # error: Incompatible return value type (got "Union[ndarray, float]", expected
  1207. # "float")
  1208. return _maybe_null_out( # type: ignore[return-value]
  1209. result, axis, mask, values.shape, min_count=min_count
  1210. )
  1211. def _maybe_arg_null_out(
  1212. result: np.ndarray,
  1213. axis: AxisInt | None,
  1214. mask: npt.NDArray[np.bool_] | None,
  1215. skipna: bool,
  1216. ) -> np.ndarray | int:
  1217. # helper function for nanargmin/nanargmax
  1218. if mask is None:
  1219. return result
  1220. if axis is None or not getattr(result, "ndim", False):
  1221. if skipna:
  1222. if mask.all():
  1223. return -1
  1224. else:
  1225. if mask.any():
  1226. return -1
  1227. else:
  1228. if skipna:
  1229. na_mask = mask.all(axis)
  1230. else:
  1231. na_mask = mask.any(axis)
  1232. if na_mask.any():
  1233. result[na_mask] = -1
  1234. return result
  1235. def _get_counts(
  1236. values_shape: Shape,
  1237. mask: npt.NDArray[np.bool_] | None,
  1238. axis: AxisInt | None,
  1239. dtype: np.dtype = np.dtype(np.float64),
  1240. ) -> float | np.ndarray:
  1241. """
  1242. Get the count of non-null values along an axis
  1243. Parameters
  1244. ----------
  1245. values_shape : tuple of int
  1246. shape tuple from values ndarray, used if mask is None
  1247. mask : Optional[ndarray[bool]]
  1248. locations in values that should be considered missing
  1249. axis : Optional[int]
  1250. axis to count along
  1251. dtype : type, optional
  1252. type to use for count
  1253. Returns
  1254. -------
  1255. count : scalar or array
  1256. """
  1257. if axis is None:
  1258. if mask is not None:
  1259. n = mask.size - mask.sum()
  1260. else:
  1261. n = np.prod(values_shape)
  1262. return dtype.type(n)
  1263. if mask is not None:
  1264. count = mask.shape[axis] - mask.sum(axis)
  1265. else:
  1266. count = values_shape[axis]
  1267. if is_scalar(count):
  1268. return dtype.type(count)
  1269. return count.astype(dtype, copy=False)
  1270. def _maybe_null_out(
  1271. result: np.ndarray | float | NaTType,
  1272. axis: AxisInt | None,
  1273. mask: npt.NDArray[np.bool_] | None,
  1274. shape: tuple[int, ...],
  1275. min_count: int = 1,
  1276. ) -> np.ndarray | float | NaTType:
  1277. """
  1278. Returns
  1279. -------
  1280. Dtype
  1281. The product of all elements on a given axis. ( NaNs are treated as 1)
  1282. """
  1283. if mask is None and min_count == 0:
  1284. # nothing to check; short-circuit
  1285. return result
  1286. if axis is not None and isinstance(result, np.ndarray):
  1287. if mask is not None:
  1288. null_mask = (mask.shape[axis] - mask.sum(axis) - min_count) < 0
  1289. else:
  1290. # we have no nulls, kept mask=None in _maybe_get_mask
  1291. below_count = shape[axis] - min_count < 0
  1292. new_shape = shape[:axis] + shape[axis + 1 :]
  1293. null_mask = np.broadcast_to(below_count, new_shape)
  1294. if np.any(null_mask):
  1295. if is_numeric_dtype(result):
  1296. if np.iscomplexobj(result):
  1297. result = result.astype("c16")
  1298. elif not is_float_dtype(result):
  1299. result = result.astype("f8", copy=False)
  1300. result[null_mask] = np.nan
  1301. else:
  1302. # GH12941, use None to auto cast null
  1303. result[null_mask] = None
  1304. elif result is not NaT:
  1305. if check_below_min_count(shape, mask, min_count):
  1306. result_dtype = getattr(result, "dtype", None)
  1307. if is_float_dtype(result_dtype):
  1308. # error: Item "None" of "Optional[Any]" has no attribute "type"
  1309. result = result_dtype.type("nan") # type: ignore[union-attr]
  1310. else:
  1311. result = np.nan
  1312. return result
  1313. def check_below_min_count(
  1314. shape: tuple[int, ...], mask: npt.NDArray[np.bool_] | None, min_count: int
  1315. ) -> bool:
  1316. """
  1317. Check for the `min_count` keyword. Returns True if below `min_count` (when
  1318. missing value should be returned from the reduction).
  1319. Parameters
  1320. ----------
  1321. shape : tuple
  1322. The shape of the values (`values.shape`).
  1323. mask : ndarray[bool] or None
  1324. Boolean numpy array (typically of same shape as `shape`) or None.
  1325. min_count : int
  1326. Keyword passed through from sum/prod call.
  1327. Returns
  1328. -------
  1329. bool
  1330. """
  1331. if min_count > 0:
  1332. if mask is None:
  1333. # no missing values, only check size
  1334. non_nulls = np.prod(shape)
  1335. else:
  1336. non_nulls = mask.size - mask.sum()
  1337. if non_nulls < min_count:
  1338. return True
  1339. return False
  1340. def _zero_out_fperr(arg):
  1341. # #18044 reference this behavior to fix rolling skew/kurt issue
  1342. if isinstance(arg, np.ndarray):
  1343. with np.errstate(invalid="ignore"):
  1344. return np.where(np.abs(arg) < 1e-14, 0, arg)
  1345. else:
  1346. return arg.dtype.type(0) if np.abs(arg) < 1e-14 else arg
  1347. @disallow("M8", "m8")
  1348. def nancorr(
  1349. a: np.ndarray,
  1350. b: np.ndarray,
  1351. *,
  1352. method: CorrelationMethod = "pearson",
  1353. min_periods: int | None = None,
  1354. ) -> float:
  1355. """
  1356. a, b: ndarrays
  1357. """
  1358. if len(a) != len(b):
  1359. raise AssertionError("Operands to nancorr must have same size")
  1360. if min_periods is None:
  1361. min_periods = 1
  1362. valid = notna(a) & notna(b)
  1363. if not valid.all():
  1364. a = a[valid]
  1365. b = b[valid]
  1366. if len(a) < min_periods:
  1367. return np.nan
  1368. f = get_corr_func(method)
  1369. return f(a, b)
  1370. def get_corr_func(
  1371. method: CorrelationMethod,
  1372. ) -> Callable[[np.ndarray, np.ndarray], float]:
  1373. if method == "kendall":
  1374. from scipy.stats import kendalltau
  1375. def func(a, b):
  1376. return kendalltau(a, b)[0]
  1377. return func
  1378. elif method == "spearman":
  1379. from scipy.stats import spearmanr
  1380. def func(a, b):
  1381. return spearmanr(a, b)[0]
  1382. return func
  1383. elif method == "pearson":
  1384. def func(a, b):
  1385. return np.corrcoef(a, b)[0, 1]
  1386. return func
  1387. elif callable(method):
  1388. return method
  1389. raise ValueError(
  1390. f"Unknown method '{method}', expected one of "
  1391. "'kendall', 'spearman', 'pearson', or callable"
  1392. )
  1393. @disallow("M8", "m8")
  1394. def nancov(
  1395. a: np.ndarray,
  1396. b: np.ndarray,
  1397. *,
  1398. min_periods: int | None = None,
  1399. ddof: int | None = 1,
  1400. ) -> float:
  1401. if len(a) != len(b):
  1402. raise AssertionError("Operands to nancov must have same size")
  1403. if min_periods is None:
  1404. min_periods = 1
  1405. valid = notna(a) & notna(b)
  1406. if not valid.all():
  1407. a = a[valid]
  1408. b = b[valid]
  1409. if len(a) < min_periods:
  1410. return np.nan
  1411. return np.cov(a, b, ddof=ddof)[0, 1]
  1412. def _ensure_numeric(x):
  1413. if isinstance(x, np.ndarray):
  1414. if is_integer_dtype(x) or is_bool_dtype(x):
  1415. x = x.astype(np.float64)
  1416. elif is_object_dtype(x):
  1417. try:
  1418. x = x.astype(np.complex128)
  1419. except (TypeError, ValueError):
  1420. try:
  1421. x = x.astype(np.float64)
  1422. except ValueError as err:
  1423. # GH#29941 we get here with object arrays containing strs
  1424. raise TypeError(f"Could not convert {x} to numeric") from err
  1425. else:
  1426. if not np.any(np.imag(x)):
  1427. x = x.real
  1428. elif not (is_float(x) or is_integer(x) or is_complex(x)):
  1429. try:
  1430. x = float(x)
  1431. except (TypeError, ValueError):
  1432. # e.g. "1+1j" or "foo"
  1433. try:
  1434. x = complex(x)
  1435. except ValueError as err:
  1436. # e.g. "foo"
  1437. raise TypeError(f"Could not convert {x} to numeric") from err
  1438. return x
  1439. # NA-friendly array comparisons
  1440. def make_nancomp(op):
  1441. def f(x, y):
  1442. xmask = isna(x)
  1443. ymask = isna(y)
  1444. mask = xmask | ymask
  1445. with np.errstate(all="ignore"):
  1446. result = op(x, y)
  1447. if mask.any():
  1448. if is_bool_dtype(result):
  1449. result = result.astype("O")
  1450. np.putmask(result, mask, np.nan)
  1451. return result
  1452. return f
# NA-aware comparison functions, one for each comparison in ``operator``,
# all built by ``make_nancomp``.
nangt = make_nancomp(operator.gt)
nange = make_nancomp(operator.ge)
nanlt = make_nancomp(operator.lt)
nanle = make_nancomp(operator.le)
naneq = make_nancomp(operator.eq)
nanne = make_nancomp(operator.ne)
  1459. def na_accum_func(values: ArrayLike, accum_func, *, skipna: bool) -> ArrayLike:
  1460. """
  1461. Cumulative function with skipna support.
  1462. Parameters
  1463. ----------
  1464. values : np.ndarray or ExtensionArray
  1465. accum_func : {np.cumprod, np.maximum.accumulate, np.cumsum, np.minimum.accumulate}
  1466. skipna : bool
  1467. Returns
  1468. -------
  1469. np.ndarray or ExtensionArray
  1470. """
  1471. mask_a, mask_b = {
  1472. np.cumprod: (1.0, np.nan),
  1473. np.maximum.accumulate: (-np.inf, np.nan),
  1474. np.cumsum: (0.0, np.nan),
  1475. np.minimum.accumulate: (np.inf, np.nan),
  1476. }[accum_func]
  1477. # This should go through ea interface
  1478. assert values.dtype.kind not in ["m", "M"]
  1479. # We will be applying this function to block values
  1480. if skipna and not issubclass(values.dtype.type, (np.integer, np.bool_)):
  1481. vals = values.copy()
  1482. mask = isna(vals)
  1483. vals[mask] = mask_a
  1484. result = accum_func(vals, axis=0)
  1485. result[mask] = mask_b
  1486. else:
  1487. result = accum_func(values, axis=0)
  1488. return result