missing.py 30 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030
  1. """
  2. Routines for filling missing data.
  3. """
  4. from __future__ import annotations
  5. from functools import (
  6. partial,
  7. wraps,
  8. )
  9. from typing import (
  10. TYPE_CHECKING,
  11. Any,
  12. cast,
  13. )
  14. import numpy as np
  15. from pandas._libs import (
  16. NaT,
  17. algos,
  18. lib,
  19. )
  20. from pandas._typing import (
  21. ArrayLike,
  22. Axis,
  23. AxisInt,
  24. F,
  25. npt,
  26. )
  27. from pandas.compat._optional import import_optional_dependency
  28. from pandas.core.dtypes.cast import infer_dtype_from
  29. from pandas.core.dtypes.common import (
  30. is_array_like,
  31. is_numeric_v_string_like,
  32. is_object_dtype,
  33. needs_i8_conversion,
  34. )
  35. from pandas.core.dtypes.missing import (
  36. is_valid_na_for_dtype,
  37. isna,
  38. na_value_for_dtype,
  39. )
  40. if TYPE_CHECKING:
  41. from pandas import Index
  42. def check_value_size(value, mask: npt.NDArray[np.bool_], length: int):
  43. """
  44. Validate the size of the values passed to ExtensionArray.fillna.
  45. """
  46. if is_array_like(value):
  47. if len(value) != length:
  48. raise ValueError(
  49. f"Length of 'value' does not match. Got ({len(value)}) "
  50. f" expected {length}"
  51. )
  52. value = value[mask]
  53. return value
def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]:
    """
    Return a masking array of same size/shape as arr
    with entries equaling any member of values_to_mask set to True

    Parameters
    ----------
    arr : ArrayLike
    values_to_mask: list, tuple, or scalar

    Returns
    -------
    np.ndarray[bool]
    """
    # When called from Block.replace/replace_list, values_to_mask is a scalar
    # known to be holdable by arr.
    # When called from Series._single_replace, values_to_mask is tuple or list
    dtype, values_to_mask = infer_dtype_from(values_to_mask)
    # mypy: "dtype" may be an ExtensionDtype, which np.array's signature
    # does not advertise accepting.
    values_to_mask = np.array(values_to_mask, dtype=dtype)  # type: ignore[arg-type]

    potential_na = False
    if is_object_dtype(arr):
        # pre-compute mask to avoid comparison to NA
        potential_na = True
        arr_mask = ~isna(arr)

    # Split the targets into NA and non-NA; NA targets are handled once at
    # the end via isna(arr) rather than by elementwise comparison.
    na_mask = isna(values_to_mask)
    nonna = values_to_mask[~na_mask]

    # GH 21977
    mask = np.zeros(arr.shape, dtype=bool)
    for x in nonna:
        if is_numeric_v_string_like(arr, x):
            # GH#29553 prevent numpy deprecation warnings
            pass
        else:
            if potential_na:
                # compare only the non-NA positions so NA values never hit ==
                new_mask = np.zeros(arr.shape, dtype=np.bool_)
                new_mask[arr_mask] = arr[arr_mask] == x
            else:
                new_mask = arr == x

                if not isinstance(new_mask, np.ndarray):
                    # usually BooleanArray
                    new_mask = new_mask.to_numpy(dtype=bool, na_value=False)
            mask |= new_mask

    if na_mask.any():
        # any NA target matches every NA entry of arr
        mask |= isna(arr)

    return mask
  101. def clean_fill_method(method: str | None, allow_nearest: bool = False):
  102. # asfreq is compat for resampling
  103. if method in [None, "asfreq"]:
  104. return None
  105. if isinstance(method, str):
  106. method = method.lower()
  107. if method == "ffill":
  108. method = "pad"
  109. elif method == "bfill":
  110. method = "backfill"
  111. valid_methods = ["pad", "backfill"]
  112. expecting = "pad (ffill) or backfill (bfill)"
  113. if allow_nearest:
  114. valid_methods.append("nearest")
  115. expecting = "pad (ffill), backfill (bfill) or nearest"
  116. if method not in valid_methods:
  117. raise ValueError(f"Invalid fill method. Expecting {expecting}. Got {method}")
  118. return method
# interpolation methods that dispatch to np.interp
NP_METHODS = ["linear", "time", "index", "values"]

# interpolation methods that dispatch to _interpolate_scipy_wrapper;
# clean_interp_method validates against NP_METHODS + SP_METHODS
SP_METHODS = [
    "nearest",
    "zero",
    "slinear",
    "quadratic",
    "cubic",
    "barycentric",
    "krogh",
    "spline",
    "polynomial",
    "from_derivatives",
    "piecewise_polynomial",
    "pchip",
    "akima",
    "cubicspline",
]
  138. def clean_interp_method(method: str, index: Index, **kwargs) -> str:
  139. order = kwargs.get("order")
  140. if method in ("spline", "polynomial") and order is None:
  141. raise ValueError("You must specify the order of the spline or polynomial.")
  142. valid = NP_METHODS + SP_METHODS
  143. if method not in valid:
  144. raise ValueError(f"method must be one of {valid}. Got '{method}' instead.")
  145. if method in ("krogh", "piecewise_polynomial", "pchip"):
  146. if not index.is_monotonic_increasing:
  147. raise ValueError(
  148. f"{method} interpolation requires that the index be monotonic."
  149. )
  150. return method
  151. def find_valid_index(
  152. values, *, how: str, is_valid: npt.NDArray[np.bool_]
  153. ) -> int | None:
  154. """
  155. Retrieves the index of the first valid value.
  156. Parameters
  157. ----------
  158. values : ndarray or ExtensionArray
  159. how : {'first', 'last'}
  160. Use this parameter to change between the first or last valid index.
  161. is_valid: np.ndarray
  162. Mask to find na_values.
  163. Returns
  164. -------
  165. int or None
  166. """
  167. assert how in ["first", "last"]
  168. if len(values) == 0: # early stop
  169. return None
  170. if values.ndim == 2:
  171. is_valid = is_valid.any(axis=1) # reduce axis 1
  172. if how == "first":
  173. idxpos = is_valid[::].argmax()
  174. elif how == "last":
  175. idxpos = len(values) - 1 - is_valid[::-1].argmax()
  176. chk_notna = is_valid[idxpos]
  177. if not chk_notna:
  178. return None
  179. # Incompatible return value type (got "signedinteger[Any]",
  180. # expected "Optional[int]")
  181. return idxpos # type: ignore[return-value]
def interpolate_array_2d(
    data: np.ndarray,
    method: str = "pad",
    axis: AxisInt = 0,
    index: Index | None = None,
    limit: int | None = None,
    limit_direction: str = "forward",
    limit_area: str | None = None,
    fill_value: Any | None = None,
    coerce: bool = False,
    downcast: str | None = None,
    **kwargs,
) -> None:
    """
    Wrapper to dispatch to either interpolate_2d or _interpolate_2d_with_fill.

    Pad/backfill-style methods go through interpolate_2d; anything else is
    treated as a genuine interpolation method, requires ``index``, and goes
    through _interpolate_2d_with_fill.

    Notes
    -----
    Alters 'data' in-place.
    ``coerce`` and ``downcast`` are accepted but not used in this function.
    """
    try:
        m = clean_fill_method(method)
    except ValueError:
        # not a pad/backfill alias -> fall through to interpolation
        m = None

    if m is not None:
        if fill_value is not None:
            # similar to validate_fillna_kwargs
            raise ValueError("Cannot pass both fill_value and method")

        interpolate_2d(
            data,
            method=m,
            axis=axis,
            limit=limit,
            limit_area=limit_area,
        )
    else:
        assert index is not None  # for mypy

        _interpolate_2d_with_fill(
            data=data,
            index=index,
            axis=axis,
            method=method,
            limit=limit,
            limit_direction=limit_direction,
            limit_area=limit_area,
            fill_value=fill_value,
            **kwargs,
        )
def _interpolate_2d_with_fill(
    data: np.ndarray,  # floating dtype
    index: Index,
    axis: AxisInt,
    method: str = "linear",
    limit: int | None = None,
    limit_direction: str = "forward",
    limit_area: str | None = None,
    fill_value: Any | None = None,
    **kwargs,
) -> None:
    """
    Column-wise application of _interpolate_1d.

    Notes
    -----
    Alters 'data' in-place.

    The signature does differ from _interpolate_1d because it only
    includes what is needed for Block.interpolate.
    """
    # validate the interp method
    clean_interp_method(method, index, **kwargs)

    if is_valid_na_for_dtype(fill_value, data.dtype):
        fill_value = na_value_for_dtype(data.dtype, compat=False)

    if method == "time":
        # "time" is only meaningful with a datetime-like index; after the
        # check it is handled identically to "values"
        if not needs_i8_conversion(index.dtype):
            raise ValueError(
                "time-weighted interpolation only works "
                "on Series or DataFrames with a "
                "DatetimeIndex"
            )
        method = "values"

    valid_limit_directions = ["forward", "backward", "both"]
    limit_direction = limit_direction.lower()
    if limit_direction not in valid_limit_directions:
        raise ValueError(
            "Invalid limit_direction: expecting one of "
            f"{valid_limit_directions}, got '{limit_direction}'."
        )

    if limit_area is not None:
        valid_limit_areas = ["inside", "outside"]
        limit_area = limit_area.lower()
        if limit_area not in valid_limit_areas:
            raise ValueError(
                f"Invalid limit_area: expecting one of {valid_limit_areas}, got "
                f"{limit_area}."
            )

    # default limit is unlimited GH #16282
    limit = algos.validate_limit(nobs=None, limit=limit)

    indices = _index_to_interp_indices(index, method)

    def func(yvalues: np.ndarray) -> None:
        # process 1-d slices in the axis direction
        _interpolate_1d(
            indices=indices,
            yvalues=yvalues,
            method=method,
            limit=limit,
            limit_direction=limit_direction,
            limit_area=limit_area,
            fill_value=fill_value,
            bounds_error=False,
            **kwargs,
        )

    # mypy: apply_along_axis's stub rejects a Callable returning None;
    # func fills each slice in place so the return value is irrelevant.
    np.apply_along_axis(func, axis, data)  # type: ignore[arg-type]
  298. def _index_to_interp_indices(index: Index, method: str) -> np.ndarray:
  299. """
  300. Convert Index to ndarray of indices to pass to NumPy/SciPy.
  301. """
  302. xarr = index._values
  303. if needs_i8_conversion(xarr.dtype):
  304. # GH#1646 for dt64tz
  305. xarr = xarr.view("i8")
  306. if method == "linear":
  307. inds = xarr
  308. inds = cast(np.ndarray, inds)
  309. else:
  310. inds = np.asarray(xarr)
  311. if method in ("values", "index"):
  312. if inds.dtype == np.object_:
  313. inds = lib.maybe_convert_objects(inds)
  314. return inds
def _interpolate_1d(
    indices: np.ndarray,
    yvalues: np.ndarray,
    method: str | None = "linear",
    limit: int | None = None,
    limit_direction: str = "forward",
    limit_area: str | None = None,
    fill_value: Any | None = None,
    bounds_error: bool = False,
    order: int | None = None,
    **kwargs,
) -> None:
    """
    Logic for the 1-d interpolation. The input
    indices and yvalues will each be 1-d arrays of the same length.

    Bounds_error is currently hardcoded to False since non-scipy ones don't
    take it as an argument.

    Notes
    -----
    Fills 'yvalues' in-place.
    """
    invalid = isna(yvalues)
    valid = ~invalid

    if not valid.any():
        # no anchor points to interpolate from
        return

    if valid.all():
        # nothing missing, nothing to do
        return

    # These are sets of index pointers to invalid values... i.e. {0, 1, etc...
    all_nans = set(np.flatnonzero(invalid))

    first_valid_index = find_valid_index(yvalues, how="first", is_valid=valid)
    if first_valid_index is None:  # defensive: no valid entries (handled above)
        first_valid_index = 0
    start_nans = set(range(first_valid_index))

    last_valid_index = find_valid_index(yvalues, how="last", is_valid=valid)
    if last_valid_index is None:  # defensive: no valid entries (handled above)
        last_valid_index = len(yvalues)
    end_nans = set(range(1 + last_valid_index, len(valid)))

    # Like the sets above, preserve_nans contains indices of invalid values,
    # but in this case, it is the final set of indices that need to be
    # preserved as NaN after the interpolation.

    # For example if limit_direction='forward' then preserve_nans will
    # contain indices of NaNs at the beginning of the series, and NaNs that
    # are more than 'limit' away from the prior non-NaN.

    # set preserve_nans based on direction using _interp_limit
    preserve_nans: list | set
    if limit_direction == "forward":
        preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0))
    elif limit_direction == "backward":
        preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit))
    else:
        # both directions... just use _interp_limit
        preserve_nans = set(_interp_limit(invalid, limit, limit))

    # if limit_area is set, add either mid or outside indices
    # to preserve_nans GH #16284
    if limit_area == "inside":
        # preserve NaNs on the outside
        preserve_nans |= start_nans | end_nans
    elif limit_area == "outside":
        # preserve NaNs on the inside
        mid_nans = all_nans - start_nans - end_nans
        preserve_nans |= mid_nans

    # sort preserve_nans and convert to list
    preserve_nans = sorted(preserve_nans)

    is_datetimelike = needs_i8_conversion(yvalues.dtype)

    if is_datetimelike:
        # operate on the i8 view; writes below flow through to the caller
        yvalues = yvalues.view("i8")

    if method in NP_METHODS:
        # np.interp requires sorted X values, #21037
        indexer = np.argsort(indices[valid])
        yvalues[invalid] = np.interp(
            indices[invalid], indices[valid][indexer], yvalues[valid][indexer]
        )
    else:
        yvalues[invalid] = _interpolate_scipy_wrapper(
            indices[valid],
            yvalues[valid],
            indices[invalid],
            method=method,
            fill_value=fill_value,
            bounds_error=bounds_error,
            order=order,
            **kwargs,
        )

    if is_datetimelike:
        # re-mask positions that the limit/limit_area rules keep missing
        yvalues[preserve_nans] = NaT.value
    else:
        yvalues[preserve_nans] = np.nan
    return
def _interpolate_scipy_wrapper(
    x,
    y,
    new_x,
    method,
    fill_value=None,
    bounds_error: bool = False,
    order=None,
    **kwargs,
):
    """
    Passed off to scipy.interpolate.interp1d. method is scipy's kind.
    Returns an array interpolated at new_x.  Add any new methods to
    the list in _clean_interp_method.
    """
    extra = f"{method} interpolation requires SciPy."
    import_optional_dependency("scipy", extra=extra)
    from scipy import interpolate

    new_x = np.asarray(new_x)

    # ignores some kwargs that could be passed along.
    alt_methods = {
        "barycentric": interpolate.barycentric_interpolate,
        "krogh": interpolate.krogh_interpolate,
        "from_derivatives": _from_derivatives,
        "piecewise_polynomial": _from_derivatives,
    }

    if getattr(x, "_is_all_dates", False):
        # GH 5975, scipy.interp1d can't handle datetime64s
        x, new_x = x._values.astype("i8"), new_x.astype("i8")

    # only register the method actually requested
    if method == "pchip":
        alt_methods["pchip"] = interpolate.pchip_interpolate
    elif method == "akima":
        alt_methods["akima"] = _akima_interpolate
    elif method == "cubicspline":
        alt_methods["cubicspline"] = _cubicspline_interpolate

    interp1d_methods = [
        "nearest",
        "zero",
        "slinear",
        "quadratic",
        "cubic",
        "polynomial",
    ]
    if method in interp1d_methods:
        if method == "polynomial":
            # interp1d accepts an integer "kind" meaning polynomial order
            method = order
        terp = interpolate.interp1d(
            x, y, kind=method, fill_value=fill_value, bounds_error=bounds_error
        )
        new_y = terp(new_x)
    elif method == "spline":
        # GH #10633, #24014
        if isna(order) or (order <= 0):
            raise ValueError(
                f"order needs to be specified and greater than 0; got order: {order}"
            )
        terp = interpolate.UnivariateSpline(x, y, k=order, **kwargs)
        new_y = terp(new_x)
    else:
        # GH 7295: need to be able to write for some reason
        # in some circumstances: check all three
        if not x.flags.writeable:
            x = x.copy()
        if not y.flags.writeable:
            y = y.copy()
        if not new_x.flags.writeable:
            new_x = new_x.copy()
        method = alt_methods[method]
        new_y = method(x, y, new_x, **kwargs)
    return new_y
def _from_derivatives(
    xi, yi, x, order=None, der: int | list[int] | None = 0, extrapolate: bool = False
):
    """
    Convenience function for interpolate.BPoly.from_derivatives.

    Construct a piecewise polynomial in the Bernstein basis, compatible
    with the specified values and derivatives at breakpoints.

    Parameters
    ----------
    xi : array-like
        sorted 1D array of x-coordinates
    yi : array-like or list of array-likes
        yi[i][j] is the j-th derivative known at xi[i]
    order: None or int or array-like of ints. Default: None.
        Specifies the degree of local polynomials. If not None, some
        derivatives are ignored.
    der : int or list
        How many derivatives to extract; None for all potentially nonzero
        derivatives (that is a number equal to the number of points), or a
        list of derivatives to extract. This number includes the function
        value as 0th derivative.
        (Currently unused by this wrapper; kept for signature compatibility.)
    extrapolate : bool, optional
        Whether to extrapolate to out-of-bounds points based on first and last
        intervals, or to return NaNs. Default: False.

    See Also
    --------
    scipy.interpolate.BPoly.from_derivatives

    Returns
    -------
    y : scalar or array-like
        The result, of length R or length M or M by R.
    """
    from scipy import interpolate

    # return the method for compat with scipy version & backwards compat
    method = interpolate.BPoly.from_derivatives
    m = method(xi, yi.reshape(-1, 1), orders=order, extrapolate=extrapolate)

    return m(x)
  510. def _akima_interpolate(xi, yi, x, der: int | list[int] | None = 0, axis: AxisInt = 0):
  511. """
  512. Convenience function for akima interpolation.
  513. xi and yi are arrays of values used to approximate some function f,
  514. with ``yi = f(xi)``.
  515. See `Akima1DInterpolator` for details.
  516. Parameters
  517. ----------
  518. xi : array-like
  519. A sorted list of x-coordinates, of length N.
  520. yi : array-like
  521. A 1-D array of real values. `yi`'s length along the interpolation
  522. axis must be equal to the length of `xi`. If N-D array, use axis
  523. parameter to select correct axis.
  524. x : scalar or array-like
  525. Of length M.
  526. der : int, optional
  527. How many derivatives to extract; None for all potentially
  528. nonzero derivatives (that is a number equal to the number
  529. of points), or a list of derivatives to extract. This number
  530. includes the function value as 0th derivative.
  531. axis : int, optional
  532. Axis in the yi array corresponding to the x-coordinate values.
  533. See Also
  534. --------
  535. scipy.interpolate.Akima1DInterpolator
  536. Returns
  537. -------
  538. y : scalar or array-like
  539. The result, of length R or length M or M by R,
  540. """
  541. from scipy import interpolate
  542. P = interpolate.Akima1DInterpolator(xi, yi, axis=axis)
  543. return P(x, nu=der)
def _cubicspline_interpolate(
    xi,
    yi,
    x,
    axis: AxisInt = 0,
    bc_type: str | tuple[Any, Any] = "not-a-knot",
    extrapolate=None,
):
    """
    Convenience function for cubic spline data interpolator.

    See `scipy.interpolate.CubicSpline` for details.

    Parameters
    ----------
    xi : array-like, shape (n,)
        1-d array containing values of the independent variable.
        Values must be real, finite and in strictly increasing order.
    yi : array-like
        Array containing values of the dependent variable. It can have
        arbitrary number of dimensions, but the length along ``axis``
        (see below) must match the length of ``x``. Values must be finite.
    x : scalar or array-like, shape (m,)
    axis : int, optional
        Axis along which `y` is assumed to be varying. Meaning that for
        ``x[i]`` the corresponding values are ``np.take(y, i, axis=axis)``.
        Default is 0.
    bc_type : string or 2-tuple, optional
        Boundary condition type. Two additional equations, given by the
        boundary conditions, are required to determine all coefficients of
        polynomials on each segment [2]_.

        If `bc_type` is a string, then the specified condition will be applied
        at both ends of a spline. Available conditions are:

        * 'not-a-knot' (default): The first and second segment at a curve end
          are the same polynomial. It is a good default when there is no
          information on boundary conditions.
        * 'periodic': The interpolated function is assumed to be periodic
          of period ``x[-1] - x[0]``. The first and last value of `y` must be
          identical: ``y[0] == y[-1]``. This boundary condition will result in
          ``y'[0] == y'[-1]`` and ``y''[0] == y''[-1]``.
        * 'clamped': The first derivative at curves ends are zero. Assuming
          a 1D `y`, ``bc_type=((1, 0.0), (1, 0.0))`` is the same condition.
        * 'natural': The second derivative at curve ends are zero. Assuming
          a 1D `y`, ``bc_type=((2, 0.0), (2, 0.0))`` is the same condition.

        If `bc_type` is a 2-tuple, the first and the second value will be
        applied at the curve start and end respectively. The tuple values can
        be one of the previously mentioned strings (except 'periodic') or a
        tuple `(order, deriv_values)` allowing to specify arbitrary
        derivatives at curve ends:

        * `order`: the derivative order, 1 or 2.
        * `deriv_value`: array-like containing derivative values, shape must
          be the same as `y`, excluding ``axis`` dimension. For example, if
          `y` is 1D, then `deriv_value` must be a scalar. If `y` is 3D with
          the shape (n0, n1, n2) and axis=2, then `deriv_value` must be 2D
          and have the shape (n0, n1).
    extrapolate : {bool, 'periodic', None}, optional
        If bool, determines whether to extrapolate to out-of-bounds points
        based on first and last intervals, or to return NaNs. If 'periodic',
        periodic extrapolation is used. If None (default), ``extrapolate`` is
        set to 'periodic' for ``bc_type='periodic'`` and to True otherwise.

    See Also
    --------
    scipy.interpolate.CubicHermiteSpline

    Returns
    -------
    y : scalar or array-like
        The result, of shape (m,)

    References
    ----------
    .. [1] `Cubic Spline Interpolation
            <https://en.wikiversity.org/wiki/Cubic_Spline_Interpolation>`_
            on Wikiversity.
    .. [2] Carl de Boor, "A Practical Guide to Splines", Springer-Verlag, 1978.
    """
    from scipy import interpolate

    P = interpolate.CubicSpline(
        xi, yi, axis=axis, bc_type=bc_type, extrapolate=extrapolate
    )

    return P(x)
def _interpolate_with_limit_area(
    values: np.ndarray, method: str, limit: int | None, limit_area: str | None
) -> None:
    """
    Apply interpolation and limit_area logic to values along a to-be-specified axis.

    Parameters
    ----------
    values: np.ndarray
        Input array.
    method: str
        Interpolation method. Could be "bfill" or "pad"
    limit: int, optional
        Index limit on interpolation.
    limit_area: str
        Limit area for interpolation. Can be "inside" or "outside"

    Notes
    -----
    Modifies values in-place.
    """
    invalid = isna(values)
    is_valid = ~invalid

    if not invalid.all():
        # locate the span of valid data; everything outside it is
        # leading/trailing NA
        first = find_valid_index(values, how="first", is_valid=is_valid)
        if first is None:
            first = 0
        last = find_valid_index(values, how="last", is_valid=is_valid)
        if last is None:
            last = len(values)

        # fill everything first, then re-mask the region excluded by limit_area
        interpolate_2d(
            values,
            method=method,
            limit=limit,
        )

        if limit_area == "inside":
            # keep only fills between the first and last valid values
            invalid[first : last + 1] = False
        elif limit_area == "outside":
            # keep only fills before/after the valid span
            invalid[:first] = invalid[last + 1 :] = False

        values[invalid] = np.nan
def interpolate_2d(
    values: np.ndarray,
    method: str = "pad",
    axis: Axis = 0,
    limit: int | None = None,
    limit_area: str | None = None,
) -> None:
    """
    Perform an actual interpolation of values; 1-d input is made 2-d
    if needed and filled in place.

    Parameters
    ----------
    values: np.ndarray
        Input array.
    method: str, default "pad"
        Interpolation method. Could be "bfill" or "pad"
    axis: 0 or 1
        Interpolation axis
    limit: int, optional
        Index limit on interpolation.
    limit_area: str, optional
        Limit area for interpolation. Can be "inside" or "outside"

    Notes
    -----
    Modifies values in-place.
    """
    if limit_area is not None:
        # delegate the limit_area handling to the 1-d helper, slice by slice
        np.apply_along_axis(
            # mypy: apply_along_axis's stub rejects a partial returning None
            partial(  # type: ignore[arg-type]
                _interpolate_with_limit_area,
                method=method,
                limit=limit,
                limit_area=limit_area,
            ),
            # mypy: axis is typed Union[str, int]; an int is expected here
            axis,  # type: ignore[arg-type]
            values,
        )
        return

    transf = (lambda x: x) if axis == 0 else (lambda x: x.T)

    # reshape a 1 dim if needed
    if values.ndim == 1:
        if axis != 0:  # pragma: no cover
            raise AssertionError("cannot interpolate on a ndim == 1 with axis != 0")
        values = values.reshape(tuple((1,) + values.shape))

    method = clean_fill_method(method)
    tvalues = transf(values)

    # _pad_2d and _backfill_2d both modify tvalues inplace
    if method == "pad":
        _pad_2d(tvalues, limit=limit)
    else:
        _backfill_2d(tvalues, limit=limit)

    return
  721. def _fillna_prep(
  722. values, mask: npt.NDArray[np.bool_] | None = None
  723. ) -> npt.NDArray[np.bool_]:
  724. # boilerplate for _pad_1d, _backfill_1d, _pad_2d, _backfill_2d
  725. if mask is None:
  726. mask = isna(values)
  727. mask = mask.view(np.uint8)
  728. return mask
  729. def _datetimelike_compat(func: F) -> F:
  730. """
  731. Wrapper to handle datetime64 and timedelta64 dtypes.
  732. """
  733. @wraps(func)
  734. def new_func(values, limit=None, mask=None):
  735. if needs_i8_conversion(values.dtype):
  736. if mask is None:
  737. # This needs to occur before casting to int64
  738. mask = isna(values)
  739. result, mask = func(values.view("i8"), limit=limit, mask=mask)
  740. return result.view(values.dtype), mask
  741. return func(values, limit=limit, mask=mask)
  742. return cast(F, new_func)
@_datetimelike_compat
def _pad_1d(
    values: np.ndarray,
    limit: int | None = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
    """Forward-fill a 1-d array in place via algos.pad_inplace; returns (values, mask)."""
    mask = _fillna_prep(values, mask)
    algos.pad_inplace(values, mask, limit=limit)
    return values, mask
@_datetimelike_compat
def _backfill_1d(
    values: np.ndarray,
    limit: int | None = None,
    mask: npt.NDArray[np.bool_] | None = None,
) -> tuple[np.ndarray, npt.NDArray[np.bool_]]:
    """Backward-fill a 1-d array in place via algos.backfill_inplace; returns (values, mask)."""
    mask = _fillna_prep(values, mask)
    algos.backfill_inplace(values, mask, limit=limit)
    return values, mask
@_datetimelike_compat
def _pad_2d(values: np.ndarray, limit=None, mask: npt.NDArray[np.bool_] | None = None):
    """Forward-fill a 2-d array in place via algos.pad_2d_inplace; returns (values, mask)."""
    mask = _fillna_prep(values, mask)

    # only call the cython routine when no dimension is empty
    if np.all(values.shape):
        algos.pad_2d_inplace(values, mask, limit=limit)
    else:
        # for test coverage
        pass
    return values, mask
@_datetimelike_compat
def _backfill_2d(values, limit=None, mask: npt.NDArray[np.bool_] | None = None):
    """Backward-fill a 2-d array in place via algos.backfill_2d_inplace; returns (values, mask)."""
    mask = _fillna_prep(values, mask)

    # only call the cython routine when no dimension is empty
    if np.all(values.shape):
        algos.backfill_2d_inplace(values, mask, limit=limit)
    else:
        # for test coverage
        pass
    return values, mask
# method name -> 1-d fill routine, used by get_fill_func for ndim == 1
_fill_methods = {"pad": _pad_1d, "backfill": _backfill_1d}


def get_fill_func(method, ndim: int = 1):
    """
    Look up the pad/backfill routine for a method name and dimensionality.

    ``method`` is normalized via clean_fill_method, so the "ffill"/"bfill"
    aliases are accepted.
    """
    method = clean_fill_method(method)
    if ndim == 1:
        return _fill_methods[method]
    return {"pad": _pad_2d, "backfill": _backfill_2d}[method]
def clean_reindex_fill_method(method) -> str | None:
    """Normalize a reindex fill method, additionally allowing "nearest"."""
    return clean_fill_method(method, allow_nearest=True)
def _interp_limit(invalid: npt.NDArray[np.bool_], fw_limit, bw_limit):
    """
    Get indexers of values that won't be filled
    because they exceed the limits.

    Parameters
    ----------
    invalid : np.ndarray[bool]
    fw_limit : int or None
        forward limit to index
    bw_limit : int or None
        backward limit to index

    Returns
    -------
    set of indexers

    Notes
    -----
    This is equivalent to the more readable, but slower

    .. code-block:: python

        def _interp_limit(invalid, fw_limit, bw_limit):
            for x in np.where(invalid)[0]:
                if invalid[max(0, x - fw_limit):x + bw_limit + 1].all():
                    yield x
    """
    # handle forward first; the backward direction is the same except
    # 1. operate on the reversed array
    # 2. subtract the returned indices from N - 1
    N = len(invalid)
    f_idx = set()
    b_idx = set()

    def inner(invalid, limit):
        limit = min(limit, N)
        # windowed[i] is True when invalid[i : i + limit + 1] is all-invalid;
        # adding `limit` shifts to the index at the end of each such run.
        windowed = _rolling_window(invalid, limit + 1).all(1)
        # the second set picks up an all-invalid leading prefix shorter than
        # the first full window
        idx = set(np.where(windowed)[0] + limit) | set(
            np.where((~invalid[: limit + 1]).cumsum() == 0)[0]
        )
        return idx

    if fw_limit is not None:
        if fw_limit == 0:
            # limit 0: every invalid position exceeds the forward limit
            f_idx = set(np.where(invalid)[0])
        else:
            f_idx = inner(invalid, fw_limit)

    if bw_limit is not None:
        if bw_limit == 0:
            # then we don't even need to care about backwards
            # just use forwards
            return f_idx
        else:
            # reverse, reuse the forward logic, then map indices back
            b_idx_inv = list(inner(invalid[::-1], bw_limit))
            b_idx = set(N - 1 - np.asarray(b_idx_inv))
            if fw_limit == 0:
                return b_idx

    # a value is preserved only if it exceeds both limits
    return f_idx & b_idx
  839. def _rolling_window(a: npt.NDArray[np.bool_], window: int) -> npt.NDArray[np.bool_]:
  840. """
  841. [True, True, False, True, False], 2 ->
  842. [
  843. [True, True],
  844. [True, False],
  845. [False, True],
  846. [True, False],
  847. ]
  848. """
  849. # https://stackoverflow.com/a/6811241
  850. shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
  851. strides = a.strides + (a.strides[-1],)
  852. return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)