12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816 |
- from __future__ import annotations
- from textwrap import dedent
- from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- )
- from pandas._typing import (
- Axis,
- QuantileInterpolation,
- WindowingRankType,
- )
- if TYPE_CHECKING:
- from pandas import DataFrame, Series
- from pandas.core.generic import NDFrame
- from pandas.util._decorators import doc
- from pandas.core.indexers.objects import (
- BaseIndexer,
- ExpandingIndexer,
- GroupbyIndexer,
- )
- from pandas.core.window.doc import (
- _shared_docs,
- create_section_header,
- kwargs_numeric_only,
- numba_notes,
- template_header,
- template_returns,
- template_see_also,
- window_agg_numba_parameters,
- window_apply_parameters,
- )
- from pandas.core.window.rolling import (
- BaseWindowGroupby,
- RollingAndExpandingMixin,
- )
class Expanding(RollingAndExpandingMixin):
    """
    Provide expanding window calculations.

    Parameters
    ----------
    min_periods : int, default 1
        Minimum number of observations in window required to have a value;
        otherwise, result is ``np.nan``.

    axis : int or str, default 0
        If ``0`` or ``'index'``, roll across the rows.

        If ``1`` or ``'columns'``, roll across the columns.

        For `Series` this parameter is unused and defaults to 0.

    method : str {'single', 'table'}, default 'single'
        Execute the expanding operation per single column or row (``'single'``)
        or over the entire object (``'table'``).

        This argument is only implemented when specifying ``engine='numba'``
        in the method call.

        .. versionadded:: 1.3.0

    Returns
    -------
    ``Expanding`` subclass

    See Also
    --------
    rolling : Provides rolling window calculations.
    ewm : Provides exponential weighted functions.

    Notes
    -----
    See :ref:`Windowing Operations <window.expanding>` for further usage details
    and examples.

    Examples
    --------
    >>> df = pd.DataFrame({"B": [0, 1, 2, np.nan, 4]})
    >>> df
         B
    0  0.0
    1  1.0
    2  2.0
    3  NaN
    4  4.0

    **min_periods**

    Expanding sum with 1 vs 3 observations needed to calculate a value.

    >>> df.expanding(1).sum()
         B
    0  0.0
    1  1.0
    2  3.0
    3  3.0
    4  7.0
    >>> df.expanding(3).sum()
         B
    0  NaN
    1  NaN
    2  3.0
    3  3.0
    4  7.0
    """

    # Constructor arguments tracked as attributes of the window object.
    _attributes: list[str] = ["min_periods", "axis", "method"]

    def __init__(
        self,
        obj: NDFrame,
        min_periods: int = 1,
        axis: Axis = 0,
        method: str = "single",
        selection=None,
    ) -> None:
        """
        Forward all construction arguments, by keyword, to the parent class.
        """
        super().__init__(
            obj=obj,
            min_periods=min_periods,
            axis=axis,
            method=method,
            selection=selection,
        )

    def _get_window_indexer(self) -> BaseIndexer:
        """
        Return an indexer class that will compute the window start and end bounds
        """
        return ExpandingIndexer()

    # NOTE: the methods below are thin wrappers that delegate to the parent
    # implementation. Their docstrings are assembled by ``@doc``; an own
    # docstring on the function would be prepended to the assembled text, so
    # they are annotated with ``#`` comments only.

    @doc(
        _shared_docs["aggregate"],
        see_also=dedent(
            """
        See Also
        --------
        pandas.DataFrame.aggregate : Similar DataFrame method.
        pandas.Series.aggregate : Similar Series method.
        """
        ),
        examples=dedent(
            """
        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6], "C": [7, 8, 9]})
        >>> df
           A  B  C
        0  1  4  7
        1  2  5  8
        2  3  6  9

        >>> df.expanding().agg("mean")
             A    B    C
        0  1.0  4.0  7.0
        1  1.5  4.5  7.5
        2  2.0  5.0  8.0
        """
        ),
        klass="Series/DataFrame",
        axis="",
    )
    def aggregate(self, func, *args, **kwargs):
        # Delegates unchanged; positional and keyword extras are passed through.
        return super().aggregate(func, *args, **kwargs)

    # Short alias for ``aggregate``.
    agg = aggregate

    @doc(
        template_header,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="count of non NaN observations",
        agg_method="count",
    )
    def count(self, numeric_only: bool = False):
        # Delegates to the parent ``count``.
        return super().count(numeric_only=numeric_only)

    @doc(
        template_header,
        create_section_header("Parameters"),
        window_apply_parameters,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="custom aggregation function",
        agg_method="apply",
    )
    def apply(
        self,
        func: Callable[..., Any],
        raw: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
        args: tuple[Any, ...] | None = None,
        kwargs: dict[str, Any] | None = None,
    ):
        # Delegates to the parent ``apply``; every option is forwarded by keyword.
        return super().apply(
            func,
            raw=raw,
            engine=engine,
            engine_kwargs=engine_kwargs,
            args=args,
            kwargs=kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        window_agg_numba_parameters(),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="sum",
        agg_method="sum",
    )
    def sum(
        self,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        # Delegates to the parent ``sum``.
        return super().sum(
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        window_agg_numba_parameters(),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="maximum",
        agg_method="max",
    )
    def max(
        self,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        # Delegates to the parent ``max``.
        return super().max(
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        window_agg_numba_parameters(),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="minimum",
        agg_method="min",
    )
    def min(
        self,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        # Delegates to the parent ``min``.
        return super().min(
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        window_agg_numba_parameters(),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="mean",
        agg_method="mean",
    )
    def mean(
        self,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        # Delegates to the parent ``mean``.
        return super().mean(
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        window_agg_numba_parameters(),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        numba_notes[:-1],
        window_method="expanding",
        aggregation_description="median",
        agg_method="median",
    )
    def median(
        self,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        # Delegates to the parent ``median``.
        return super().median(
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.\n
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        window_agg_numba_parameters("1.4"),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "numpy.std : Equivalent method for NumPy array.\n",
        template_see_also,
        create_section_header("Notes"),
        dedent(
            """
        The default ``ddof`` of 1 used in :meth:`Series.std` is different
        than the default ``ddof`` of 0 in :func:`numpy.std`.

        A minimum of one period is required for the calculation.\n
        """
        ).replace("\n", "", 1),
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])

        >>> s.expanding(3).std()
        0         NaN
        1         NaN
        2    0.577350
        3    0.957427
        4    0.894427
        5    0.836660
        6    0.786796
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="standard deviation",
        agg_method="std",
    )
    def std(
        self,
        ddof: int = 1,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        # Delegates to the parent ``std``.
        return super().std(
            ddof=ddof,
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.\n
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        window_agg_numba_parameters("1.4"),
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "numpy.var : Equivalent method for NumPy array.\n",
        template_see_also,
        create_section_header("Notes"),
        dedent(
            """
        The default ``ddof`` of 1 used in :meth:`Series.var` is different
        than the default ``ddof`` of 0 in :func:`numpy.var`.

        A minimum of one period is required for the calculation.\n
        """
        ).replace("\n", "", 1),
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([5, 5, 6, 7, 5, 5, 5])

        >>> s.expanding(3).var()
        0         NaN
        1         NaN
        2    0.333333
        3    0.916667
        4    0.800000
        5    0.700000
        6    0.619048
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="variance",
        agg_method="var",
    )
    def var(
        self,
        ddof: int = 1,
        numeric_only: bool = False,
        engine: str | None = None,
        engine_kwargs: dict[str, bool] | None = None,
    ):
        # Delegates to the parent ``var``.
        return super().var(
            ddof=ddof,
            numeric_only=numeric_only,
            engine=engine,
            engine_kwargs=engine_kwargs,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.\n
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Notes"),
        "A minimum of one period is required for the calculation.\n\n",
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([0, 1, 2, 3])

        >>> s.expanding().sem()
        0         NaN
        1    0.707107
        2    0.707107
        3    0.745356
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="standard error of mean",
        agg_method="sem",
    )
    def sem(self, ddof: int = 1, numeric_only: bool = False):
        # Delegates to the parent ``sem``.
        return super().sem(ddof=ddof, numeric_only=numeric_only)

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "scipy.stats.skew : Third moment of a probability density.\n",
        template_see_also,
        create_section_header("Notes"),
        "A minimum of three periods is required for the calculation.\n",
        window_method="expanding",
        aggregation_description="unbiased skewness",
        agg_method="skew",
    )
    def skew(self, numeric_only: bool = False):
        # Delegates to the parent ``skew``.
        return super().skew(numeric_only=numeric_only)

    @doc(
        template_header,
        create_section_header("Parameters"),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        "scipy.stats.kurtosis : Reference SciPy method.\n",
        template_see_also,
        create_section_header("Notes"),
        "A minimum of four periods is required for the calculation.\n\n",
        create_section_header("Examples"),
        dedent(
            """
        The example below will show an expanding calculation with a minimum of
        four periods matching the equivalent function call using `scipy.stats`.

        >>> arr = [1, 2, 3, 4, 999]
        >>> import scipy.stats
        >>> print(f"{{scipy.stats.kurtosis(arr[:-1], bias=False):.6f}}")
        -1.200000
        >>> print(f"{{scipy.stats.kurtosis(arr, bias=False):.6f}}")
        4.999874
        >>> s = pd.Series(arr)
        >>> s.expanding(4).kurt()
        0         NaN
        1         NaN
        2         NaN
        3   -1.200000
        4    4.999874
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="Fisher's definition of kurtosis without bias",
        agg_method="kurt",
    )
    def kurt(self, numeric_only: bool = False):
        # Delegates to the parent ``kurt``.
        return super().kurt(numeric_only=numeric_only)

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        quantile : float
            Quantile to compute. 0 <= quantile <= 1.
        interpolation : {{'linear', 'lower', 'higher', 'midpoint', 'nearest'}}
            This optional parameter specifies the interpolation method to use,
            when the desired quantile lies between two data points `i` and `j`:

                * linear: `i + (j - i) * fraction`, where `fraction` is the
                  fractional part of the index surrounded by `i` and `j`.
                * lower: `i`.
                * higher: `j`.
                * nearest: `i` or `j` whichever is nearest.
                * midpoint: (`i` + `j`) / 2.
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="quantile",
        agg_method="quantile",
    )
    def quantile(
        self,
        quantile: float,
        interpolation: QuantileInterpolation = "linear",
        numeric_only: bool = False,
    ):
        # Delegates to the parent ``quantile``.
        return super().quantile(
            quantile=quantile,
            interpolation=interpolation,
            numeric_only=numeric_only,
        )

    @doc(
        template_header,
        ".. versionadded:: 1.4.0 \n\n",
        create_section_header("Parameters"),
        dedent(
            """
        method : {{'average', 'min', 'max'}}, default 'average'
            How to rank the group of records that have the same value (i.e. ties):

            * average: average rank of the group
            * min: lowest rank in the group
            * max: highest rank in the group

        ascending : bool, default True
            Whether or not the elements should be ranked in ascending order.
        pct : bool, default False
            Whether or not to display the returned rankings in percentile
            form.
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also,
        create_section_header("Examples"),
        dedent(
            """
        >>> s = pd.Series([1, 4, 2, 3, 5, 3])
        >>> s.expanding().rank()
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        4    5.0
        5    3.5
        dtype: float64

        >>> s.expanding().rank(method="max")
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        4    5.0
        5    4.0
        dtype: float64

        >>> s.expanding().rank(method="min")
        0    1.0
        1    2.0
        2    2.0
        3    3.0
        4    5.0
        5    3.0
        dtype: float64
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="rank",
        agg_method="rank",
    )
    def rank(
        self,
        method: WindowingRankType = "average",
        ascending: bool = True,
        pct: bool = False,
        numeric_only: bool = False,
    ):
        # Delegates to the parent ``rank``.
        return super().rank(
            method=method,
            ascending=ascending,
            pct=pct,
            numeric_only=numeric_only,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        other : Series or DataFrame, optional
            If not supplied then will default to self and produce pairwise
            output.
        pairwise : bool, default None
            If False then only matching columns between self and other will be
            used and the output will be a DataFrame.
            If True then all pairwise combinations will be calculated and the
            output will be a MultiIndexed DataFrame in the case of DataFrame
            inputs. In the case of missing elements, only complete pairwise
            observations will be used.
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        template_see_also[:-1],
        window_method="expanding",
        aggregation_description="sample covariance",
        agg_method="cov",
    )
    def cov(
        self,
        other: DataFrame | Series | None = None,
        pairwise: bool | None = None,
        ddof: int = 1,
        numeric_only: bool = False,
    ):
        # Delegates to the parent ``cov``.
        return super().cov(
            other=other,
            pairwise=pairwise,
            ddof=ddof,
            numeric_only=numeric_only,
        )

    @doc(
        template_header,
        create_section_header("Parameters"),
        dedent(
            """
        other : Series or DataFrame, optional
            If not supplied then will default to self and produce pairwise
            output.
        pairwise : bool, default None
            If False then only matching columns between self and other will be
            used and the output will be a DataFrame.
            If True then all pairwise combinations will be calculated and the
            output will be a MultiIndexed DataFrame in the case of DataFrame
            inputs. In the case of missing elements, only complete pairwise
            observations will be used.
        ddof : int, default 1
            Delta Degrees of Freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.
        """
        ).replace("\n", "", 1),
        kwargs_numeric_only,
        create_section_header("Returns"),
        template_returns,
        create_section_header("See Also"),
        dedent(
            """
        cov : Similar method to calculate covariance.
        numpy.corrcoef : NumPy Pearson's correlation calculation.
        """
        ).replace("\n", "", 1),
        template_see_also,
        create_section_header("Notes"),
        dedent(
            """
        This function uses Pearson's definition of correlation
        (https://en.wikipedia.org/wiki/Pearson_correlation_coefficient).

        When `other` is not specified, the output will be self correlation (e.g.
        all 1's), except for :class:`~pandas.DataFrame` inputs with `pairwise`
        set to `True`.

        Function will return ``NaN`` for correlations of equal valued sequences;
        this is the result of a 0/0 division error.

        When `pairwise` is set to `False`, only matching columns between `self` and
        `other` will be used.

        When `pairwise` is set to `True`, the output will be a MultiIndex DataFrame
        with the original index on the first level, and the `other` DataFrame
        columns on the second level.

        In the case of missing elements, only complete pairwise observations
        will be used.
        """
        ).replace("\n", "", 1),
        window_method="expanding",
        aggregation_description="correlation",
        agg_method="corr",
    )
    def corr(
        self,
        other: DataFrame | Series | None = None,
        pairwise: bool | None = None,
        ddof: int = 1,
        numeric_only: bool = False,
    ):
        # Delegates to the parent ``corr``; ``ddof`` is forwarded as well.
        return super().corr(
            other=other,
            pairwise=pairwise,
            ddof=ddof,
            numeric_only=numeric_only,
        )
class ExpandingGroupby(BaseWindowGroupby, Expanding):
    """
    Provide an expanding groupby implementation.
    """

    # Attribute names of both parents, with ``Expanding``'s listed first.
    _attributes = Expanding._attributes + BaseWindowGroupby._attributes

    def _get_window_indexer(self) -> GroupbyIndexer:
        """
        Return an indexer class that will compute the window start and end bounds

        The indexer is built from the grouper's ``indices`` and uses
        ``ExpandingIndexer`` (passed as a class, not an instance) as its
        window indexer.

        Returns
        -------
        GroupbyIndexer
        """
        return GroupbyIndexer(
            groupby_indices=self._grouper.indices,
            window_indexer=ExpandingIndexer,
        )
|