- from __future__ import annotations
- import copy
- from textwrap import dedent
- from typing import (
- TYPE_CHECKING,
- Callable,
- Hashable,
- Literal,
- cast,
- final,
- no_type_check,
- )
- import warnings
- import numpy as np
- from pandas._libs import lib
- from pandas._libs.tslibs import (
- BaseOffset,
- IncompatibleFrequency,
- NaT,
- Period,
- Timedelta,
- Timestamp,
- to_offset,
- )
- from pandas._typing import (
- AnyArrayLike,
- Axis,
- AxisInt,
- Frequency,
- IndexLabel,
- NDFrameT,
- QuantileInterpolation,
- T,
- TimedeltaConvertibleTypes,
- TimeGrouperOrigin,
- TimestampConvertibleTypes,
- npt,
- )
- from pandas.compat.numpy import function as nv
- from pandas.errors import AbstractMethodError
- from pandas.util._decorators import (
- Appender,
- Substitution,
- doc,
- )
- from pandas.util._exceptions import find_stack_level
- from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
- )
- import pandas.core.algorithms as algos
- from pandas.core.apply import ResamplerWindowApply
- from pandas.core.base import PandasObject
- import pandas.core.common as com
- from pandas.core.generic import (
- NDFrame,
- _shared_docs,
- )
- from pandas.core.groupby.generic import SeriesGroupBy
- from pandas.core.groupby.groupby import (
- BaseGroupBy,
- GroupBy,
- _pipe_template,
- get_groupby,
- )
- from pandas.core.groupby.grouper import Grouper
- from pandas.core.groupby.ops import BinGrouper
- from pandas.core.indexes.datetimes import (
- DatetimeIndex,
- date_range,
- )
- from pandas.core.indexes.period import (
- PeriodIndex,
- period_range,
- )
- from pandas.core.indexes.timedeltas import (
- TimedeltaIndex,
- timedelta_range,
- )
- from pandas.tseries.frequencies import (
- is_subperiod,
- is_superperiod,
- )
- from pandas.tseries.offsets import (
- Day,
- Tick,
- )
- if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Index,
- Series,
- )
- _shared_docs_kwargs: dict[str, str] = {}
- class Resampler(BaseGroupBy, PandasObject):
- """
- Class for resampling datetimelike data, a groupby-like operation.
- See aggregate, transform, and apply functions on this object.
- It's easiest to obtain a Resampler via obj.resample(...).
- Parameters
- ----------
- obj : Series or DataFrame
- groupby : TimeGrouper
- axis : int, default 0
- kind : str or None
- 'period', 'timestamp' to override default index treatment
- Returns
- -------
- a Resampler of the appropriate type
- Notes
- -----
- After resampling, see aggregate, apply, and transform functions.
- """
- grouper: BinGrouper
- _timegrouper: TimeGrouper
- binner: DatetimeIndex | TimedeltaIndex | PeriodIndex # depends on subclass
- exclusions: frozenset[Hashable] = frozenset() # for SelectionMixin compat
- _internal_names_set = {"obj", "ax", "_indexer"}
- # attributes forwarded to the underlying TimeGrouper (the groupby descriptor); see __getattr__
- _attributes = [
- "freq",
- "axis",
- "closed",
- "label",
- "convention",
- "kind",
- "origin",
- "offset",
- ]
- def __init__(
- self,
- obj: NDFrame,
- timegrouper: TimeGrouper,
- axis: Axis = 0,
- kind=None,
- *,
- gpr_index: Index,
- group_keys: bool = False,
- selection=None,
- ) -> None:
- self._timegrouper = timegrouper
- self.keys = None
- self.sort = True
- self.axis = obj._get_axis_number(axis)
- self.kind = kind
- self.group_keys = group_keys
- self.as_index = True
- self.obj, self.ax, self._indexer = self._timegrouper._set_grouper(
- self._convert_obj(obj), sort=True, gpr_index=gpr_index
- )
- self.binner, self.grouper = self._get_binner()
- self._selection = selection
- if self._timegrouper.key is not None:
- self.exclusions = frozenset([self._timegrouper.key])
- else:
- self.exclusions = frozenset()
- def __str__(self) -> str:
- """
- Provide a nice str repr of our resampler object.
- """
- attrs = (
- f"{k}={getattr(self._timegrouper, k)}"
- for k in self._attributes
- if getattr(self._timegrouper, k, None) is not None
- )
- return f"{type(self).__name__} [{', '.join(attrs)}]"
- def __getattr__(self, attr: str):
- if attr in self._internal_names_set:
- return object.__getattribute__(self, attr)
- if attr in self._attributes:
- return getattr(self._timegrouper, attr)
- if attr in self.obj:
- return self[attr]
- return object.__getattribute__(self, attr)
- @property
- def _from_selection(self) -> bool:
- """
- Is the resampling from a DataFrame column or MultiIndex level.
- """
- # upsampling and PeriodIndex resampling do not work
- # with selection, this state used to catch and raise an error
- return self._timegrouper is not None and (
- self._timegrouper.key is not None or self._timegrouper.level is not None
- )
- def _convert_obj(self, obj: NDFrameT) -> NDFrameT:
- """
- Provide any conversions for the object needed to correctly handle resampling.
- Parameters
- ----------
- obj : Series or DataFrame
- Returns
- -------
- Series or DataFrame
- """
- return obj._consolidate()
- def _get_binner_for_time(self):
- raise AbstractMethodError(self)
- @final
- def _get_binner(self):
- """
- Create the BinGrouper; assumes that self._timegrouper._set_grouper(obj)
- has already been called.
- """
- binner, bins, binlabels = self._get_binner_for_time()
- assert len(bins) == len(binlabels)
- bin_grouper = BinGrouper(bins, binlabels, indexer=self._indexer)
- return binner, bin_grouper
- @Substitution(
- klass="Resampler",
- examples="""
- >>> df = pd.DataFrame({'A': [1, 2, 3, 4]},
- ... index=pd.date_range('2012-08-02', periods=4))
- >>> df
- A
- 2012-08-02 1
- 2012-08-03 2
- 2012-08-04 3
- 2012-08-05 4
- To get the difference between each 2-day period's maximum and minimum
- value in one pass, you can do
- >>> df.resample('2D').pipe(lambda x: x.max() - x.min())
- A
- 2012-08-02 1
- 2012-08-04 1""",
- )
- @Appender(_pipe_template)
- def pipe(
- self,
- func: Callable[..., T] | tuple[Callable[..., T], str],
- *args,
- **kwargs,
- ) -> T:
- return super().pipe(func, *args, **kwargs)
- _agg_see_also_doc = dedent(
- """
- See Also
- --------
- DataFrame.groupby.aggregate : Aggregate using callable, string, dict,
- or list of string/callables.
- DataFrame.resample.transform : Transforms the Series on each group
- based on the given function.
- DataFrame.aggregate: Aggregate using one or more
- operations over the specified axis.
- """
- )
- _agg_examples_doc = dedent(
- """
- Examples
- --------
- >>> s = pd.Series([1, 2, 3, 4, 5],
- ... index=pd.date_range('20130101', periods=5, freq='s'))
- >>> s
- 2013-01-01 00:00:00 1
- 2013-01-01 00:00:01 2
- 2013-01-01 00:00:02 3
- 2013-01-01 00:00:03 4
- 2013-01-01 00:00:04 5
- Freq: S, dtype: int64
- >>> r = s.resample('2s')
- >>> r.agg(np.sum)
- 2013-01-01 00:00:00 3
- 2013-01-01 00:00:02 7
- 2013-01-01 00:00:04 5
- Freq: 2S, dtype: int64
- >>> r.agg(['sum', 'mean', 'max'])
- sum mean max
- 2013-01-01 00:00:00 3 1.5 2
- 2013-01-01 00:00:02 7 3.5 4
- 2013-01-01 00:00:04 5 5.0 5
- >>> r.agg({'result': lambda x: x.mean() / x.std(),
- ... 'total': np.sum})
- result total
- 2013-01-01 00:00:00 2.121320 3
- 2013-01-01 00:00:02 4.949747 7
- 2013-01-01 00:00:04 NaN 5
- >>> r.agg(average="mean", total="sum")
- average total
- 2013-01-01 00:00:00 1.5 3
- 2013-01-01 00:00:02 3.5 7
- 2013-01-01 00:00:04 5.0 5
- """
- )
- @doc(
- _shared_docs["aggregate"],
- see_also=_agg_see_also_doc,
- examples=_agg_examples_doc,
- klass="DataFrame",
- axis="",
- )
- def aggregate(self, func=None, *args, **kwargs):
- result = ResamplerWindowApply(self, func, args=args, kwargs=kwargs).agg()
- if result is None:
- how = func
- result = self._groupby_and_aggregate(how, *args, **kwargs)
- return result
- agg = aggregate
- apply = aggregate
- def transform(self, arg, *args, **kwargs):
- """
- Call function producing a like-indexed Series on each group.
- Return a Series with the transformed values.
- Parameters
- ----------
- arg : function
- To apply to each group. Should return a Series with the same index.
- Returns
- -------
- Series
- Examples
- --------
- >>> s = pd.Series([1, 2],
- ... index=pd.date_range('20180101',
- ... periods=2,
- ... freq='1h'))
- >>> s
- 2018-01-01 00:00:00 1
- 2018-01-01 01:00:00 2
- Freq: H, dtype: int64
- >>> resampled = s.resample('15min')
- >>> resampled.transform(lambda x: (x - x.mean()) / x.std())
- 2018-01-01 00:00:00 NaN
- 2018-01-01 01:00:00 NaN
- Freq: H, dtype: float64
- """
- return self._selected_obj.groupby(self._timegrouper).transform(
- arg, *args, **kwargs
- )
- def _downsample(self, f, **kwargs):
- raise AbstractMethodError(self)
- def _upsample(self, f, limit=None, fill_value=None):
- raise AbstractMethodError(self)
- def _gotitem(self, key, ndim: int, subset=None):
- """
- Sub-classes to define. Return a sliced object.
- Parameters
- ----------
- key : string / list of selections
- ndim : {1, 2}
- requested ndim of result
- subset : object, default None
- subset to act on
- """
- grouper = self.grouper
- if subset is None:
- subset = self.obj
- if key is not None:
- subset = subset[key]
- else:
- # reached via Apply.agg_dict_like with selection=None and ndim=1
- assert subset.ndim == 1
- if ndim == 1:
- assert subset.ndim == 1
- grouped = get_groupby(
- subset, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys
- )
- return grouped
- def _groupby_and_aggregate(self, how, *args, **kwargs):
- """
- Re-evaluate the obj with a groupby aggregation.
- """
- grouper = self.grouper
- if self._selected_obj.ndim == 1:
- obj = self._selected_obj
- else:
- # Excludes `on` column when provided
- obj = self._obj_with_exclusions
- grouped = get_groupby(
- obj, by=None, grouper=grouper, axis=self.axis, group_keys=self.group_keys
- )
- try:
- if callable(how):
- # TODO: test_resample_apply_with_additional_args fails if we go
- # through the non-lambda path, not clear that it should.
- func = lambda x: how(x, *args, **kwargs)
- result = grouped.aggregate(func)
- else:
- result = grouped.aggregate(how, *args, **kwargs)
- except (AttributeError, KeyError):
- # we have a non-reducing function; try to evaluate
- # alternatively we want to evaluate only a column of the input
- # test_apply_to_one_column_of_df the function being applied references
- # a DataFrame column, but aggregate_item_by_item operates column-wise
- # on Series, raising AttributeError or KeyError
- # (depending on whether the column lookup uses getattr/__getitem__)
- result = grouped.apply(how, *args, **kwargs)
- except ValueError as err:
- if "Must produce aggregated value" in str(err):
- # raised in _aggregate_named
- # see test_apply_without_aggregation, test_apply_with_mutated_index
- pass
- else:
- raise
- # we have a non-reducing function
- # try to evaluate
- result = grouped.apply(how, *args, **kwargs)
- return self._wrap_result(result)
- def _get_resampler_for_grouping(self, groupby: GroupBy, key):
- """
- Return the correct class for resampling with groupby.
- """
- return self._resampler_for_grouping(groupby=groupby, key=key, parent=self)
- def _wrap_result(self, result):
- """
- Potentially wrap any results.
- """
- # GH 47705
- obj = self.obj
- if (
- isinstance(result, ABCDataFrame)
- and len(result) == 0
- and not isinstance(result.index, PeriodIndex)
- ):
- result = result.set_index(
- _asfreq_compat(obj.index[:0], freq=self.freq), append=True
- )
- if isinstance(result, ABCSeries) and self._selection is not None:
- result.name = self._selection
- if isinstance(result, ABCSeries) and result.empty:
- # When index is all NaT, result is empty but index is not
- result.index = _asfreq_compat(obj.index[:0], freq=self.freq)
- result.name = getattr(obj, "name", None)
- return result
- def ffill(self, limit=None):
- """
- Forward fill the values.
- Parameters
- ----------
- limit : int, optional
- Limit of how many values to fill.
- Returns
- -------
- Series or DataFrame
- An upsampled Series or DataFrame with forward-filled values.
- See Also
- --------
- Series.fillna: Fill NA/NaN values using the specified method.
- DataFrame.fillna: Fill NA/NaN values using the specified method.
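- Examples
- --------
- A minimal upsampling sketch (illustrative hourly data forward-filled
- onto a 30-minute grid):
- >>> ser = pd.Series([1, 2],
- ... index=pd.date_range('2023-01-01', periods=2, freq='h'))
- >>> ser.resample('30min').ffill()
- 2023-01-01 00:00:00 1
- 2023-01-01 00:30:00 1
- 2023-01-01 01:00:00 2
- Freq: 30T, dtype: int64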
- """
- return self._upsample("ffill", limit=limit)
- def nearest(self, limit=None):
- """
- Resample by using the nearest value.
- When resampling data, missing values may appear (e.g., when the
- resampling frequency is higher than the original frequency).
- The `nearest` method will replace ``NaN`` values that appeared in
- the resampled data with the value from the nearest member of the
- sequence, based on the index value.
- Missing values that existed in the original data will not be modified.
- If `limit` is given, fill only this many values in each direction for
- each of the original values.
- Parameters
- ----------
- limit : int, optional
- Limit of how many values to fill.
- Returns
- -------
- Series or DataFrame
- An upsampled Series or DataFrame with ``NaN`` values filled with
- their nearest value.
- See Also
- --------
- backfill : Backward fill the new missing values in the resampled data.
- pad : Forward fill ``NaN`` values.
- Examples
- --------
- >>> s = pd.Series([1, 2],
- ... index=pd.date_range('20180101',
- ... periods=2,
- ... freq='1h'))
- >>> s
- 2018-01-01 00:00:00 1
- 2018-01-01 01:00:00 2
- Freq: H, dtype: int64
- >>> s.resample('15min').nearest()
- 2018-01-01 00:00:00 1
- 2018-01-01 00:15:00 1
- 2018-01-01 00:30:00 2
- 2018-01-01 00:45:00 2
- 2018-01-01 01:00:00 2
- Freq: 15T, dtype: int64
- Limit the number of upsampled values imputed by the nearest:
- >>> s.resample('15min').nearest(limit=1)
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:15:00 1.0
- 2018-01-01 00:30:00 NaN
- 2018-01-01 00:45:00 2.0
- 2018-01-01 01:00:00 2.0
- Freq: 15T, dtype: float64
- """
- return self._upsample("nearest", limit=limit)
- def bfill(self, limit=None):
- """
- Backward fill the new missing values in the resampled data.
- In statistics, imputation is the process of replacing missing data with
- substituted values [1]_. When resampling data, missing values may
- appear (e.g., when the resampling frequency is higher than the original
- frequency). The backward fill will replace NaN values that appeared in
- the resampled data with the next value in the original sequence.
- Missing values that existed in the original data will not be modified.
- Parameters
- ----------
- limit : int, optional
- Limit of how many values to fill.
- Returns
- -------
- Series, DataFrame
- An upsampled Series or DataFrame with backward filled NaN values.
- See Also
- --------
- backfill : Alias of bfill.
- fillna : Fill NaN values using the specified method, which can be
- 'backfill'.
- nearest : Fill NaN values with nearest neighbor starting from center.
- ffill : Forward fill NaN values.
- Series.fillna : Fill NaN values in the Series using the
- specified method, which can be 'backfill'.
- DataFrame.fillna : Fill NaN values in the DataFrame using the
- specified method, which can be 'backfill'.
- References
- ----------
- .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)
- Examples
- --------
- Resampling a Series:
- >>> s = pd.Series([1, 2, 3],
- ... index=pd.date_range('20180101', periods=3, freq='h'))
- >>> s
- 2018-01-01 00:00:00 1
- 2018-01-01 01:00:00 2
- 2018-01-01 02:00:00 3
- Freq: H, dtype: int64
- >>> s.resample('30min').bfill()
- 2018-01-01 00:00:00 1
- 2018-01-01 00:30:00 2
- 2018-01-01 01:00:00 2
- 2018-01-01 01:30:00 3
- 2018-01-01 02:00:00 3
- Freq: 30T, dtype: int64
- >>> s.resample('15min').bfill(limit=2)
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:15:00 NaN
- 2018-01-01 00:30:00 2.0
- 2018-01-01 00:45:00 2.0
- 2018-01-01 01:00:00 2.0
- 2018-01-01 01:15:00 NaN
- 2018-01-01 01:30:00 3.0
- 2018-01-01 01:45:00 3.0
- 2018-01-01 02:00:00 3.0
- Freq: 15T, dtype: float64
- Resampling a DataFrame that has missing values:
- >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
- ... index=pd.date_range('20180101', periods=3,
- ... freq='h'))
- >>> df
- a b
- 2018-01-01 00:00:00 2.0 1
- 2018-01-01 01:00:00 NaN 3
- 2018-01-01 02:00:00 6.0 5
- >>> df.resample('30min').bfill()
- a b
- 2018-01-01 00:00:00 2.0 1
- 2018-01-01 00:30:00 NaN 3
- 2018-01-01 01:00:00 NaN 3
- 2018-01-01 01:30:00 6.0 5
- 2018-01-01 02:00:00 6.0 5
- >>> df.resample('15min').bfill(limit=2)
- a b
- 2018-01-01 00:00:00 2.0 1.0
- 2018-01-01 00:15:00 NaN NaN
- 2018-01-01 00:30:00 NaN 3.0
- 2018-01-01 00:45:00 NaN 3.0
- 2018-01-01 01:00:00 NaN 3.0
- 2018-01-01 01:15:00 NaN NaN
- 2018-01-01 01:30:00 6.0 5.0
- 2018-01-01 01:45:00 6.0 5.0
- 2018-01-01 02:00:00 6.0 5.0
- """
- return self._upsample("bfill", limit=limit)
- def fillna(self, method, limit=None):
- """
- Fill missing values introduced by upsampling.
- In statistics, imputation is the process of replacing missing data with
- substituted values [1]_. When resampling data, missing values may
- appear (e.g., when the resampling frequency is higher than the original
- frequency).
- Missing values that existed in the original data will
- not be modified.
- Parameters
- ----------
- method : {'pad', 'backfill', 'ffill', 'bfill', 'nearest'}
- Method to use for filling holes in resampled data
- * 'pad' or 'ffill': use previous valid observation to fill gap
- (forward fill).
- * 'backfill' or 'bfill': use next valid observation to fill gap.
- * 'nearest': use nearest valid observation to fill gap.
- limit : int, optional
- Limit of how many consecutive missing values to fill.
- Returns
- -------
- Series or DataFrame
- An upsampled Series or DataFrame with missing values filled.
- See Also
- --------
- bfill : Backward fill NaN values in the resampled data.
- ffill : Forward fill NaN values in the resampled data.
- nearest : Fill NaN values in the resampled data
- with nearest neighbor starting from center.
- interpolate : Fill NaN values using interpolation.
- Series.fillna : Fill NaN values in the Series using the
- specified method, which can be 'bfill' and 'ffill'.
- DataFrame.fillna : Fill NaN values in the DataFrame using the
- specified method, which can be 'bfill' and 'ffill'.
- References
- ----------
- .. [1] https://en.wikipedia.org/wiki/Imputation_(statistics)
- Examples
- --------
- Resampling a Series:
- >>> s = pd.Series([1, 2, 3],
- ... index=pd.date_range('20180101', periods=3, freq='h'))
- >>> s
- 2018-01-01 00:00:00 1
- 2018-01-01 01:00:00 2
- 2018-01-01 02:00:00 3
- Freq: H, dtype: int64
- Without filling the missing values you get:
- >>> s.resample("30min").asfreq()
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:30:00 NaN
- 2018-01-01 01:00:00 2.0
- 2018-01-01 01:30:00 NaN
- 2018-01-01 02:00:00 3.0
- Freq: 30T, dtype: float64
- >>> s.resample('30min').fillna("backfill")
- 2018-01-01 00:00:00 1
- 2018-01-01 00:30:00 2
- 2018-01-01 01:00:00 2
- 2018-01-01 01:30:00 3
- 2018-01-01 02:00:00 3
- Freq: 30T, dtype: int64
- >>> s.resample('15min').fillna("backfill", limit=2)
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:15:00 NaN
- 2018-01-01 00:30:00 2.0
- 2018-01-01 00:45:00 2.0
- 2018-01-01 01:00:00 2.0
- 2018-01-01 01:15:00 NaN
- 2018-01-01 01:30:00 3.0
- 2018-01-01 01:45:00 3.0
- 2018-01-01 02:00:00 3.0
- Freq: 15T, dtype: float64
- >>> s.resample('30min').fillna("pad")
- 2018-01-01 00:00:00 1
- 2018-01-01 00:30:00 1
- 2018-01-01 01:00:00 2
- 2018-01-01 01:30:00 2
- 2018-01-01 02:00:00 3
- Freq: 30T, dtype: int64
- >>> s.resample('30min').fillna("nearest")
- 2018-01-01 00:00:00 1
- 2018-01-01 00:30:00 2
- 2018-01-01 01:00:00 2
- 2018-01-01 01:30:00 3
- 2018-01-01 02:00:00 3
- Freq: 30T, dtype: int64
- Missing values present before the upsampling are not affected.
- >>> sm = pd.Series([1, None, 3],
- ... index=pd.date_range('20180101', periods=3, freq='h'))
- >>> sm
- 2018-01-01 00:00:00 1.0
- 2018-01-01 01:00:00 NaN
- 2018-01-01 02:00:00 3.0
- Freq: H, dtype: float64
- >>> sm.resample('30min').fillna('backfill')
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:30:00 NaN
- 2018-01-01 01:00:00 NaN
- 2018-01-01 01:30:00 3.0
- 2018-01-01 02:00:00 3.0
- Freq: 30T, dtype: float64
- >>> sm.resample('30min').fillna('pad')
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:30:00 1.0
- 2018-01-01 01:00:00 NaN
- 2018-01-01 01:30:00 NaN
- 2018-01-01 02:00:00 3.0
- Freq: 30T, dtype: float64
- >>> sm.resample('30min').fillna('nearest')
- 2018-01-01 00:00:00 1.0
- 2018-01-01 00:30:00 NaN
- 2018-01-01 01:00:00 NaN
- 2018-01-01 01:30:00 3.0
- 2018-01-01 02:00:00 3.0
- Freq: 30T, dtype: float64
- DataFrame resampling is done column-wise. All the same options are
- available.
- >>> df = pd.DataFrame({'a': [2, np.nan, 6], 'b': [1, 3, 5]},
- ... index=pd.date_range('20180101', periods=3,
- ... freq='h'))
- >>> df
- a b
- 2018-01-01 00:00:00 2.0 1
- 2018-01-01 01:00:00 NaN 3
- 2018-01-01 02:00:00 6.0 5
- >>> df.resample('30min').fillna("bfill")
- a b
- 2018-01-01 00:00:00 2.0 1
- 2018-01-01 00:30:00 NaN 3
- 2018-01-01 01:00:00 NaN 3
- 2018-01-01 01:30:00 6.0 5
- 2018-01-01 02:00:00 6.0 5
- """
- return self._upsample(method, limit=limit)
- @doc(NDFrame.interpolate, **_shared_docs_kwargs)
- def interpolate(
- self,
- method: QuantileInterpolation = "linear",
- *,
- axis: Axis = 0,
- limit=None,
- inplace: bool = False,
- limit_direction: Literal["forward", "backward", "both"] = "forward",
- limit_area=None,
- downcast=None,
- **kwargs,
- ):
- """
- Interpolate values according to different methods.
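- Examples
- --------
- A minimal sketch (illustrative data): upsampling introduces NaN, which
- linear interpolation then fills:
- >>> ser = pd.Series([1, 3],
- ... index=pd.date_range('2023-01-01', periods=2, freq='h'))
- >>> ser.resample('30min').interpolate()
- 2023-01-01 00:00:00 1.0
- 2023-01-01 00:30:00 2.0
- 2023-01-01 01:00:00 3.0
- Freq: 30T, dtype: float64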
- """
- result = self._upsample("asfreq")
- return result.interpolate(
- method=method,
- axis=axis,
- limit=limit,
- inplace=inplace,
- limit_direction=limit_direction,
- limit_area=limit_area,
- downcast=downcast,
- **kwargs,
- )
- def asfreq(self, fill_value=None):
- """
- Return the values at the new freq, essentially a reindex.
- Parameters
- ----------
- fill_value : scalar, optional
- Value to use for missing values, applied during upsampling (note
- this does not fill NaNs that already were present).
- Returns
- -------
- DataFrame or Series
- Values at the specified freq.
- See Also
- --------
- Series.asfreq: Convert TimeSeries to specified frequency.
- DataFrame.asfreq: Convert TimeSeries to specified frequency.
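- Examples
- --------
- A minimal sketch (illustrative data); new bins created by upsampling are
- filled with ``fill_value``:
- >>> ser = pd.Series([1, 2],
- ... index=pd.date_range('2023-01-01', periods=2, freq='h'))
- >>> ser.resample('30min').asfreq(fill_value=0)
- 2023-01-01 00:00:00 1
- 2023-01-01 00:30:00 0
- 2023-01-01 01:00:00 2
- Freq: 30T, dtype: int64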
- """
- return self._upsample("asfreq", fill_value=fill_value)
- @doc(GroupBy.sum)
- def sum(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "sum", args, kwargs)
- nv.validate_resampler_func("sum", args, kwargs)
- return self._downsample("sum", numeric_only=numeric_only, min_count=min_count)
- @doc(GroupBy.prod)
- def prod(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "prod", args, kwargs)
- nv.validate_resampler_func("prod", args, kwargs)
- return self._downsample("prod", numeric_only=numeric_only, min_count=min_count)
- @doc(GroupBy.min)
- def min(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "min", args, kwargs)
- nv.validate_resampler_func("min", args, kwargs)
- return self._downsample("min", numeric_only=numeric_only, min_count=min_count)
- @doc(GroupBy.max)
- def max(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "max", args, kwargs)
- nv.validate_resampler_func("max", args, kwargs)
- return self._downsample("max", numeric_only=numeric_only, min_count=min_count)
- @doc(GroupBy.first)
- def first(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "first", args, kwargs)
- nv.validate_resampler_func("first", args, kwargs)
- return self._downsample("first", numeric_only=numeric_only, min_count=min_count)
- @doc(GroupBy.last)
- def last(
- self,
- numeric_only: bool = False,
- min_count: int = 0,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "last", args, kwargs)
- nv.validate_resampler_func("last", args, kwargs)
- return self._downsample("last", numeric_only=numeric_only, min_count=min_count)
- @doc(GroupBy.median)
- def median(self, numeric_only: bool = False, *args, **kwargs):
- maybe_warn_args_and_kwargs(type(self), "median", args, kwargs)
- nv.validate_resampler_func("median", args, kwargs)
- return self._downsample("median", numeric_only=numeric_only)
- def mean(
- self,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ):
- """
- Compute mean of groups, excluding missing values.
- Parameters
- ----------
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionchanged:: 2.0.0
- numeric_only now defaults to ``False``.
- Returns
- -------
- DataFrame or Series
- Mean of values within each group.
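- Examples
- --------
- A minimal sketch with illustrative two-hour bins:
- >>> ser = pd.Series([1, 2, 3, 4],
- ... index=pd.date_range('2023-01-01', periods=4, freq='h'))
- >>> ser.resample('2h').mean()
- 2023-01-01 00:00:00 1.5
- 2023-01-01 02:00:00 3.5
- Freq: 2H, dtype: float64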
- """
- maybe_warn_args_and_kwargs(type(self), "mean", args, kwargs)
- nv.validate_resampler_func("mean", args, kwargs)
- return self._downsample("mean", numeric_only=numeric_only)
- def std(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ):
- """
- Compute standard deviation of groups, excluding missing values.
- Parameters
- ----------
- ddof : int, default 1
- Degrees of freedom.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionadded:: 1.5.0
- .. versionchanged:: 2.0.0
- numeric_only now defaults to ``False``.
- Returns
- -------
- DataFrame or Series
- Standard deviation of values within each group.
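- Examples
- --------
- A minimal sketch (illustrative data); each two-hour bin holds two values,
- so with the default ``ddof=1``:
- >>> ser = pd.Series([1, 2, 3, 4],
- ... index=pd.date_range('2023-01-01', periods=4, freq='h'))
- >>> ser.resample('2h').std()
- 2023-01-01 00:00:00 0.707107
- 2023-01-01 02:00:00 0.707107
- Freq: 2H, dtype: float64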
- """
- maybe_warn_args_and_kwargs(type(self), "std", args, kwargs)
- nv.validate_resampler_func("std", args, kwargs)
- return self._downsample("std", ddof=ddof, numeric_only=numeric_only)
- def var(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ):
- """
- Compute variance of groups, excluding missing values.
- Parameters
- ----------
- ddof : int, default 1
- Degrees of freedom.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionadded:: 1.5.0
- .. versionchanged:: 2.0.0
- numeric_only now defaults to ``False``.
- Returns
- -------
- DataFrame or Series
- Variance of values within each group.
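- Examples
- --------
- A minimal sketch (illustrative data) showing the effect of ``ddof``:
- >>> ser = pd.Series([1, 2, 3, 4],
- ... index=pd.date_range('2023-01-01', periods=4, freq='h'))
- >>> ser.resample('2h').var()
- 2023-01-01 00:00:00 0.5
- 2023-01-01 02:00:00 0.5
- Freq: 2H, dtype: float64
- >>> ser.resample('2h').var(ddof=0)
- 2023-01-01 00:00:00 0.25
- 2023-01-01 02:00:00 0.25
- Freq: 2H, dtype: float64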
- """
- maybe_warn_args_and_kwargs(type(self), "var", args, kwargs)
- nv.validate_resampler_func("var", args, kwargs)
- return self._downsample("var", ddof=ddof, numeric_only=numeric_only)
- @doc(GroupBy.sem)
- def sem(
- self,
- ddof: int = 1,
- numeric_only: bool = False,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "sem", args, kwargs)
- nv.validate_resampler_func("sem", args, kwargs)
- return self._downsample("sem", ddof=ddof, numeric_only=numeric_only)
- @doc(GroupBy.ohlc)
- def ohlc(
- self,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "ohlc", args, kwargs)
- nv.validate_resampler_func("ohlc", args, kwargs)
- return self._downsample("ohlc")
- @doc(SeriesGroupBy.nunique)
- def nunique(
- self,
- *args,
- **kwargs,
- ):
- maybe_warn_args_and_kwargs(type(self), "nunique", args, kwargs)
- nv.validate_resampler_func("nunique", args, kwargs)
- return self._downsample("nunique")
- @doc(GroupBy.size)
- def size(self):
- result = self._downsample("size")
- # If the result is a non-empty DataFrame we stack to get a Series
- # GH 46826
- if isinstance(result, ABCDataFrame) and not result.empty:
- result = result.stack()
- if not len(self.ax):
- from pandas import Series
- if self._selected_obj.ndim == 1:
- name = self._selected_obj.name
- else:
- name = None
- result = Series([], index=result.index, dtype="int64", name=name)
- return result
- @doc(GroupBy.count)
- def count(self):
- result = self._downsample("count")
- if not len(self.ax):
- if self._selected_obj.ndim == 1:
- result = type(self._selected_obj)(
- [], index=result.index, dtype="int64", name=self._selected_obj.name
- )
- else:
- from pandas import DataFrame
- result = DataFrame(
- [], index=result.index, columns=result.columns, dtype="int64"
- )
- return result
- def quantile(self, q: float | AnyArrayLike = 0.5, **kwargs):
- """
- Return value at the given quantile.
- Parameters
- ----------
- q : float or array-like, default 0.5 (50% quantile)
- Returns
- -------
- DataFrame or Series
- Quantile of values within each group.
- See Also
- --------
- Series.quantile
- Return a series, where the index is q and the values are the quantiles.
- DataFrame.quantile
- Return a DataFrame, where the columns are the columns of self,
- and the values are the quantiles.
- DataFrameGroupBy.quantile
- Return a DataFrame, where the columns are groupby columns,
- and the values are its quantiles.
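- Examples
- --------
- A minimal sketch (illustrative data); the default ``q=0.5`` gives the
- per-bin median:
- >>> ser = pd.Series([1, 2, 3, 4],
- ... index=pd.date_range('2023-01-01', periods=4, freq='h'))
- >>> ser.resample('2h').quantile()
- 2023-01-01 00:00:00 1.5
- 2023-01-01 02:00:00 3.5
- Freq: 2H, dtype: float64
- >>> ser.resample('2h').quantile(0.25)
- 2023-01-01 00:00:00 1.25
- 2023-01-01 02:00:00 3.25
- Freq: 2H, dtype: float64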
- """
- return self._downsample("quantile", q=q, **kwargs)
- class _GroupByMixin(PandasObject):
- """
- Provide the groupby facilities.
- """
- _attributes: list[str] # in practice the same as Resampler._attributes
- _selection: IndexLabel | None = None
- _groupby: GroupBy
- _timegrouper: TimeGrouper
- def __init__(
- self,
- *,
- parent: Resampler,
- groupby: GroupBy,
- key=None,
- selection: IndexLabel | None = None,
- ) -> None:
- # reached via ._gotitem and _get_resampler_for_grouping
- assert isinstance(groupby, GroupBy), type(groupby)
- # parent is always a Resampler, sometimes a _GroupByMixin
- assert isinstance(parent, Resampler), type(parent)
- # initialize our GroupByMixin object with
- # the resampler attributes
- for attr in self._attributes:
- setattr(self, attr, getattr(parent, attr))
- self._selection = selection
- self.binner = parent.binner
- self.key = key
- self._groupby = groupby
- self._timegrouper = copy.copy(parent._timegrouper)
- self.ax = parent.ax
- self.obj = parent.obj
- @no_type_check
- def _apply(self, f, *args, **kwargs):
- """
- Dispatch to _upsample; we are stripping all of the _upsample kwargs and
- performing the original function call on the grouped object.
- """
- def func(x):
- x = self._resampler_cls(x, timegrouper=self._timegrouper, gpr_index=self.ax)
- if isinstance(f, str):
- return getattr(x, f)(**kwargs)
- return x.apply(f, *args, **kwargs)
- result = self._groupby.apply(func)
- return self._wrap_result(result)
- _upsample = _apply
- _downsample = _apply
- _groupby_and_aggregate = _apply
- @final
- def _gotitem(self, key, ndim, subset=None):
- """
- Sub-classes to define. Return a sliced object.
- Parameters
- ----------
- key : string / list of selections
- ndim : {1, 2}
- requested ndim of result
- subset : object, default None
- subset to act on
- """
- # create a new object to prevent aliasing
- if subset is None:
- subset = self.obj
- if key is not None:
- subset = subset[key]
- else:
- # reached via Apply.agg_dict_like with selection=None, ndim=1
- assert subset.ndim == 1
- # Try to select from a DataFrame, falling back to a Series
- try:
- if isinstance(key, list) and self.key not in key and self.key is not None:
- key.append(self.key)
- groupby = self._groupby[key]
- except IndexError:
- groupby = self._groupby
- selection = None
- if subset.ndim == 2 and (
- (lib.is_scalar(key) and key in subset) or lib.is_list_like(key)
- ):
- selection = key
- elif subset.ndim == 1 and lib.is_scalar(key) and key == subset.name:
- selection = key
- new_rs = type(self)(
- groupby=groupby,
- parent=cast(Resampler, self),
- selection=selection,
- )
- return new_rs
- class DatetimeIndexResampler(Resampler):
- @property
- def _resampler_for_grouping(self):
- return DatetimeIndexResamplerGroupby
- def _get_binner_for_time(self):
- # this is how we are actually creating the bins
- if self.kind == "period":
- return self._timegrouper._get_time_period_bins(self.ax)
- return self._timegrouper._get_time_bins(self.ax)
- def _downsample(self, how, **kwargs):
- """
- Downsample the data using the given (possibly cython-mapped) function.
- Parameters
- ----------
- how : string / cython mapped function
- **kwargs : kw args passed to how function
- """
- how = com.get_cython_func(how) or how
- ax = self.ax
- if self._selected_obj.ndim == 1:
- obj = self._selected_obj
- else:
- # Excludes `on` column when provided
- obj = self._obj_with_exclusions
- if not len(ax):
- # reset to the new freq
- obj = obj.copy()
- obj.index = obj.index._with_freq(self.freq)
- assert obj.index.freq == self.freq, (obj.index.freq, self.freq)
- return obj
- # do we have a regular frequency
- # error: Item "None" of "Optional[Any]" has no attribute "binlabels"
- if (
- (ax.freq is not None or ax.inferred_freq is not None)
- and len(self.grouper.binlabels) > len(ax)
- and how is None
- ):
- # let's do an asfreq
- return self.asfreq()
- # we are downsampling
- # we want to call the actual grouper method here
- result = obj.groupby(self.grouper, axis=self.axis).aggregate(how, **kwargs)
- return self._wrap_result(result)
- def _adjust_binner_for_upsample(self, binner):
- """
- Adjust our binner when upsampling.
- The range of the new index should not extend outside the specified range.
- """
- if self.closed == "right":
- binner = binner[1:]
- else:
- binner = binner[:-1]
- return binner
- def _upsample(self, method, limit=None, fill_value=None):
- """
- Parameters
- ----------
- method : {'backfill', 'bfill', 'pad', 'ffill', 'asfreq'}
- Method for upsampling.
- limit : int, default None
- Maximum size gap to fill when reindexing
- fill_value : scalar, default None
- Value to use for missing values
- See Also
- --------
- .fillna: Fill NA/NaN values using the specified method.
- """
- if self.axis:
- raise AssertionError("axis must be 0")
- if self._from_selection:
- raise ValueError(
- "Upsampling from level= or on= selection "
- "is not supported, use .set_index(...) "
- "to explicitly set index to datetime-like"
- )
- ax = self.ax
- obj = self._selected_obj
- binner = self.binner
- res_index = self._adjust_binner_for_upsample(binner)
- # if we have the same frequency as our axis, then we are equal sampling
- if (
- limit is None
- and to_offset(ax.inferred_freq) == self.freq
- and len(obj) == len(res_index)
- ):
- result = obj.copy()
- result.index = res_index
- else:
- result = obj.reindex(
- res_index, method=method, limit=limit, fill_value=fill_value
- )
- return self._wrap_result(result)
- def _wrap_result(self, result):
- result = super()._wrap_result(result)
- # we may have a different kind than was originally asked for; convert if needed
- if self.kind == "period" and not isinstance(result.index, PeriodIndex):
- result.index = result.index.to_period(self.freq)
- return result
- class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler):
- """
- Provides a resample of a groupby implementation.
- """
- @property
- def _resampler_cls(self):
- return DatetimeIndexResampler
- class PeriodIndexResampler(DatetimeIndexResampler):
- @property
- def _resampler_for_grouping(self):
- return PeriodIndexResamplerGroupby
- def _get_binner_for_time(self):
- if self.kind == "timestamp":
- return super()._get_binner_for_time()
- return self._timegrouper._get_period_bins(self.ax)
- def _convert_obj(self, obj: NDFrameT) -> NDFrameT:
- obj = super()._convert_obj(obj)
- if self._from_selection:
- # see GH 14008, GH 12871
- msg = (
- "Resampling from level= or on= selection "
- "with a PeriodIndex is not currently supported, "
- "use .set_index(...) to explicitly set index"
- )
- raise NotImplementedError(msg)
- # convert to timestamp
- if self.kind == "timestamp":
- obj = obj.to_timestamp(how=self.convention)
- return obj
- def _downsample(self, how, **kwargs):
- """
- Downsample the data using the given (possibly cython-mapped) function.
- Parameters
- ----------
- how : string / cython mapped function
- **kwargs : kw args passed to how function
- """
- # we may need to actually resample as if we are timestamps
- if self.kind == "timestamp":
- return super()._downsample(how, **kwargs)
- how = com.get_cython_func(how) or how
- ax = self.ax
- if is_subperiod(ax.freq, self.freq):
- # Downsampling
- return self._groupby_and_aggregate(how, **kwargs)
- elif is_superperiod(ax.freq, self.freq):
- if how == "ohlc":
- # GH #13083
- # upsampling to subperiods is handled as an asfreq, which works
- # for pure aggregating/reducing methods
- # OHLC reduces along the time dimension, but creates multiple
- # values for each period -> handle by _groupby_and_aggregate()
- return self._groupby_and_aggregate(how)
- return self.asfreq()
- elif ax.freq == self.freq:
- return self.asfreq()
- raise IncompatibleFrequency(
- f"Frequency {ax.freq} cannot be resampled to {self.freq}, "
- "as they are not sub or super periods"
- )
- def _upsample(self, method, limit=None, fill_value=None):
- """
- Parameters
- ----------
- method : {'backfill', 'bfill', 'pad', 'ffill'}
- Method for upsampling.
- limit : int, default None
- Maximum size gap to fill when reindexing.
- fill_value : scalar, default None
- Value to use for missing values.
- See Also
- --------
- .fillna: Fill NA/NaN values using the specified method.
- """
- # we may need to actually resample as if we are timestamps
- if self.kind == "timestamp":
- return super()._upsample(method, limit=limit, fill_value=fill_value)
- ax = self.ax
- obj = self.obj
- new_index = self.binner
- # Start vs. end of period
- memb = ax.asfreq(self.freq, how=self.convention)
- # Get the fill indexer
- indexer = memb.get_indexer(new_index, method=method, limit=limit)
- new_obj = _take_new_index(
- obj,
- indexer,
- new_index,
- axis=self.axis,
- )
- return self._wrap_result(new_obj)
- class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler):
- """
- Provides a resample of a groupby implementation.
- """
- @property
- def _resampler_cls(self):
- return PeriodIndexResampler
- class TimedeltaIndexResampler(DatetimeIndexResampler):
- @property
- def _resampler_for_grouping(self):
- return TimedeltaIndexResamplerGroupby
- def _get_binner_for_time(self):
- return self._timegrouper._get_time_delta_bins(self.ax)
- def _adjust_binner_for_upsample(self, binner):
- """
- Adjust our binner when upsampling.
- The range of the new index is allowed to be greater than the original range,
- so we don't need to change the length of the binner (GH 13022).
- """
- return binner
- class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler):
- """
- Provides a resample of a groupby implementation.
- """
- @property
- def _resampler_cls(self):
- return TimedeltaIndexResampler
- def get_resampler(obj: Series | DataFrame, kind=None, **kwds) -> Resampler:
- """
- Create a TimeGrouper and return our resampler.
- """
- tg = TimeGrouper(**kwds)
- return tg._get_resampler(obj, kind=kind)
- get_resampler.__doc__ = Resampler.__doc__
- def get_resampler_for_grouping(
- groupby: GroupBy,
- rule,
- how=None,
- fill_method=None,
- limit=None,
- kind=None,
- on=None,
- **kwargs,
- ) -> Resampler:
- """
- Return our appropriate resampler when grouping as well.
- """
- # .resample uses 'on' similar to how .groupby uses 'key'
- tg = TimeGrouper(freq=rule, key=on, **kwargs)
- resampler = tg._get_resampler(groupby.obj, kind=kind)
- return resampler._get_resampler_for_grouping(groupby=groupby, key=tg.key)
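- # Illustrative note (assumed call path, mirroring the public API): a chained
- # call like df.groupby("key").resample("D") lands here with rule="D" and the
- # GroupBy object, so each group is then resampled against a shared binner.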
- class TimeGrouper(Grouper):
- """
- Custom groupby class for time-interval grouping.
- Parameters
- ----------
- freq : pandas date offset or offset alias for identifying bin edges
- closed : closed end of interval; 'left' or 'right'
- label : interval boundary to use for labeling; 'left' or 'right'
- convention : {'start', 'end', 'e', 's'}
- Only used if the axis is a PeriodIndex.
- """
- _attributes = Grouper._attributes + (
- "closed",
- "label",
- "how",
- "kind",
- "convention",
- "origin",
- "offset",
- )
- origin: TimeGrouperOrigin
- def __init__(
- self,
- freq: Frequency = "Min",
- closed: Literal["left", "right"] | None = None,
- label: Literal["left", "right"] | None = None,
- how: str = "mean",
- axis: Axis = 0,
- fill_method=None,
- limit=None,
- kind: str | None = None,
- convention: Literal["start", "end", "e", "s"] | None = None,
- origin: Literal["epoch", "start", "start_day", "end", "end_day"]
- | TimestampConvertibleTypes = "start_day",
- offset: TimedeltaConvertibleTypes | None = None,
- group_keys: bool = False,
- **kwargs,
- ) -> None:
- # Check for correctness of the keyword arguments which would
- # otherwise silently use the default if misspelled
- if label not in {None, "left", "right"}:
- raise ValueError(f"Unsupported value {label} for `label`")
- if closed not in {None, "left", "right"}:
- raise ValueError(f"Unsupported value {closed} for `closed`")
- if convention not in {None, "start", "end", "e", "s"}:
- raise ValueError(f"Unsupported value {convention} for `convention`")
- freq = to_offset(freq)
- end_types = {"M", "A", "Q", "BM", "BA", "BQ", "W"}
- rule = freq.rule_code
- if rule in end_types or ("-" in rule and rule[: rule.find("-")] in end_types):
- if closed is None:
- closed = "right"
- if label is None:
- label = "right"
- else:
- # The backward resample sets ``closed`` to ``'right'`` by default
- # since the last value should be considered as the edge point for
- # the last bin. When origin in "end" or "end_day", the value for a
- # specific ``Timestamp`` index stands for the resample result from
- # the current ``Timestamp`` minus ``freq`` to the current
- # ``Timestamp`` with a right close.
- if origin in ["end", "end_day"]:
- if closed is None:
- closed = "right"
- if label is None:
- label = "right"
- else:
- if closed is None:
- closed = "left"
- if label is None:
- label = "left"
- self.closed = closed
- self.label = label
- self.kind = kind
- self.convention = convention if convention is not None else "e"
- self.how = how
- self.fill_method = fill_method
- self.limit = limit
- self.group_keys = group_keys
- if origin in ("epoch", "start", "start_day", "end", "end_day"):
- # error: Incompatible types in assignment (expression has type "Union[Union[
- # Timestamp, datetime, datetime64, signedinteger[_64Bit], float, str],
- # Literal['epoch', 'start', 'start_day', 'end', 'end_day']]", variable has
- # type "Union[Timestamp, Literal['epoch', 'start', 'start_day', 'end',
- # 'end_day']]")
- self.origin = origin # type: ignore[assignment]
- else:
- try:
- self.origin = Timestamp(origin)
- except (ValueError, TypeError) as err:
- raise ValueError(
- "'origin' should be equal to 'epoch', 'start', 'start_day', "
- "'end', 'end_day' or "
- f"should be a Timestamp convertible type. Got '{origin}' instead."
- ) from err
- try:
- self.offset = Timedelta(offset) if offset is not None else None
- except (ValueError, TypeError) as err:
- raise ValueError(
- "'offset' should be a Timedelta convertible type. "
- f"Got '{offset}' instead."
- ) from err
- # always sort time groupers
- kwargs["sort"] = True
- super().__init__(freq=freq, axis=axis, **kwargs)
- def _get_resampler(self, obj: NDFrame, kind=None) -> Resampler:
- """
- Return my resampler or raise if we have an invalid axis.
- Parameters
- ----------
- obj : Series or DataFrame
- kind : string, optional
- 'period', 'timestamp', 'timedelta' are valid
- Returns
- -------
- Resampler
- Raises
- ------
- TypeError if incompatible axis
- """
- _, ax, indexer = self._set_grouper(obj, gpr_index=None)
- if isinstance(ax, DatetimeIndex):
- return DatetimeIndexResampler(
- obj,
- timegrouper=self,
- kind=kind,
- axis=self.axis,
- group_keys=self.group_keys,
- gpr_index=ax,
- )
- elif isinstance(ax, PeriodIndex) or kind == "period":
- return PeriodIndexResampler(
- obj,
- timegrouper=self,
- kind=kind,
- axis=self.axis,
- group_keys=self.group_keys,
- gpr_index=ax,
- )
- elif isinstance(ax, TimedeltaIndex):
- return TimedeltaIndexResampler(
- obj,
- timegrouper=self,
- axis=self.axis,
- group_keys=self.group_keys,
- gpr_index=ax,
- )
- raise TypeError(
- "Only valid with DatetimeIndex, "
- "TimedeltaIndex or PeriodIndex, "
- f"but got an instance of '{type(ax).__name__}'"
- )
- def _get_grouper(
- self, obj: NDFrameT, validate: bool = True
- ) -> tuple[BinGrouper, NDFrameT]:
- # create the resampler and return our binner
- r = self._get_resampler(obj)
- return r.grouper, cast(NDFrameT, r.obj)
- def _get_time_bins(self, ax: DatetimeIndex):
- if not isinstance(ax, DatetimeIndex):
- raise TypeError(
- "axis must be a DatetimeIndex, but got "
- f"an instance of {type(ax).__name__}"
- )
- if len(ax) == 0:
- binner = labels = DatetimeIndex(data=[], freq=self.freq, name=ax.name)
- return binner, [], labels
- first, last = _get_timestamp_range_edges(
- ax.min(),
- ax.max(),
- self.freq,
- unit=ax.unit,
- closed=self.closed,
- origin=self.origin,
- offset=self.offset,
- )
- # GH #12037
- # use first/last directly instead of calling replace() on them, because
- # replace() would swallow the nanosecond part; the last bin could then end
- # slightly before the true end and raise a `Values falls after last bin`
- # error
- # GH 25758: If DST lands at midnight (e.g. 'America/Havana'), user feedback
- # has noted that ambiguous=True provides the most sensible result
- binner = labels = date_range(
- freq=self.freq,
- start=first,
- end=last,
- tz=ax.tz,
- name=ax.name,
- ambiguous=True,
- nonexistent="shift_forward",
- unit=ax.unit,
- )
- ax_values = ax.asi8
- binner, bin_edges = self._adjust_bin_edges(binner, ax_values)
- # general version, knowing nothing about relative frequencies
- bins = lib.generate_bins_dt64(
- ax_values, bin_edges, self.closed, hasnans=ax.hasnans
- )
- if self.closed == "right":
- labels = binner
- if self.label == "right":
- labels = labels[1:]
- elif self.label == "right":
- labels = labels[1:]
- if ax.hasnans:
- binner = binner.insert(0, NaT)
- labels = labels.insert(0, NaT)
- # if we end up with more labels than bins
- # adjust the labels
- # GH4076
- if len(bins) < len(labels):
- labels = labels[: len(bins)]
- return binner, bins, labels
- def _adjust_bin_edges(
- self, binner: DatetimeIndex, ax_values: npt.NDArray[np.int64]
- ) -> tuple[DatetimeIndex, npt.NDArray[np.int64]]:
- # Some hacks for > daily data, see #1471, #1458, #1483
- if self.freq != "D" and is_superperiod(self.freq, "D"):
- if self.closed == "right":
- # GH 21459, GH 9119: Adjust the bins relative to the wall time
- edges_dti = binner.tz_localize(None)
- edges_dti = (
- edges_dti
- + Timedelta(days=1, unit=edges_dti.unit).as_unit(edges_dti.unit)
- - Timedelta(1, unit=edges_dti.unit).as_unit(edges_dti.unit)
- )
- bin_edges = edges_dti.tz_localize(binner.tz).asi8
- else:
- bin_edges = binner.asi8
- # intraday values on last day
- if bin_edges[-2] > ax_values.max():
- bin_edges = bin_edges[:-1]
- binner = binner[:-1]
- else:
- bin_edges = binner.asi8
- return binner, bin_edges
- def _get_time_delta_bins(self, ax: TimedeltaIndex):
- if not isinstance(ax, TimedeltaIndex):
- raise TypeError(
- "axis must be a TimedeltaIndex, but got "
- f"an instance of {type(ax).__name__}"
- )
- if not len(ax):
- binner = labels = TimedeltaIndex(data=[], freq=self.freq, name=ax.name)
- return binner, [], labels
- start, end = ax.min(), ax.max()
- if self.closed == "right":
- end += self.freq
- labels = binner = timedelta_range(
- start=start, end=end, freq=self.freq, name=ax.name
- )
- end_stamps = labels
- if self.closed == "left":
- end_stamps += self.freq
- bins = ax.searchsorted(end_stamps, side=self.closed)
- if self.offset:
- # GH 10530 & 31809
- labels += self.offset
- return binner, bins, labels
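- # A worked sketch (assumed input): for ax = TimedeltaIndex(["0s", "30s", "60s"])
- # and freq="30s" with closed="left", binner/labels run 0s, 30s, 60s; the
- # searchsorted call above then yields bins = [1, 2, 3], one edge per bin.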
- def _get_time_period_bins(self, ax: DatetimeIndex):
- if not isinstance(ax, DatetimeIndex):
- raise TypeError(
- "axis must be a DatetimeIndex, but got "
- f"an instance of {type(ax).__name__}"
- )
- freq = self.freq
- if not len(ax):
- binner = labels = PeriodIndex(data=[], freq=freq, name=ax.name)
- return binner, [], labels
- labels = binner = period_range(start=ax[0], end=ax[-1], freq=freq, name=ax.name)
- end_stamps = (labels + freq).asfreq(freq, "s").to_timestamp()
- if ax.tz:
- end_stamps = end_stamps.tz_localize(ax.tz)
- bins = ax.searchsorted(end_stamps, side="left")
- return binner, bins, labels
- def _get_period_bins(self, ax: PeriodIndex):
- if not isinstance(ax, PeriodIndex):
- raise TypeError(
- "axis must be a PeriodIndex, but got "
- f"an instance of {type(ax).__name__}"
- )
- memb = ax.asfreq(self.freq, how=self.convention)
- # NaT handling as in pandas._libs.lib.generate_bins_dt64()
- nat_count = 0
- if memb.hasnans:
- # error: Incompatible types in assignment (expression has type
- # "bool_", variable has type "int") [assignment]
- nat_count = np.sum(memb._isnan) # type: ignore[assignment]
- memb = memb[~memb._isnan]
- if not len(memb):
- # index contains no valid (non-NaT) values
- bins = np.array([], dtype=np.int64)
- binner = labels = PeriodIndex(data=[], freq=self.freq, name=ax.name)
- if len(ax) > 0:
- # index is all NaT
- binner, bins, labels = _insert_nat_bin(binner, bins, labels, len(ax))
- return binner, bins, labels
- freq_mult = self.freq.n
- start = ax.min().asfreq(self.freq, how=self.convention)
- end = ax.max().asfreq(self.freq, how="end")
- bin_shift = 0
- if isinstance(self.freq, Tick):
- # GH 23882 & 31809: get adjusted bin edge labels with 'origin'
- # and 'offset' support. This call only makes sense if the freq is a
- # Tick since offset and origin are only used in those cases.
- # Not doing this check could create an extra empty bin.
- p_start, end = _get_period_range_edges(
- start,
- end,
- self.freq,
- closed=self.closed,
- origin=self.origin,
- offset=self.offset,
- )
- # Get offset for bin edge (not label edge) adjustment
- start_offset = Period(start, self.freq) - Period(p_start, self.freq)
- # error: Item "Period" of "Union[Period, Any]" has no attribute "n"
- bin_shift = start_offset.n % freq_mult # type: ignore[union-attr]
- start = p_start
- labels = binner = period_range(
- start=start, end=end, freq=self.freq, name=ax.name
- )
- i8 = memb.asi8
- # when upsampling to subperiods, we need to generate enough bins
- expected_bins_count = len(binner) * freq_mult
- i8_extend = expected_bins_count - (i8[-1] - i8[0])
- rng = np.arange(i8[0], i8[-1] + i8_extend, freq_mult)
- rng += freq_mult
- # adjust bin edge indexes to account for base
- rng -= bin_shift
- # Wrap in PeriodArray for PeriodArray.searchsorted
- prng = type(memb._data)(rng, dtype=memb.dtype)
- bins = memb.searchsorted(prng, side="left")
- if nat_count > 0:
- binner, bins, labels = _insert_nat_bin(binner, bins, labels, nat_count)
- return binner, bins, labels
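
    # Editor's sketch: the freq_mult/bin_shift arithmetic above keeps bins
    # aligned when a PeriodIndex is resampled to a multiple of its base
    # frequency, e.g.
    #
    #   >>> import pandas as pd
    #   >>> pi = pd.period_range("2000-01-01 00:00", periods=6, freq="min")
    #   >>> out = pd.Series(range(6), index=pi).resample("2min").sum()
    #
    # Here freq_mult is 2, so candidate edges are generated every 2 ordinals
    # and shifted by bin_shift to honor 'origin' and 'offset'.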


def _take_new_index(
    obj: NDFrameT, indexer: npt.NDArray[np.intp], new_index: Index, axis: AxisInt = 0
) -> NDFrameT:
    if isinstance(obj, ABCSeries):
        new_values = algos.take_nd(obj._values, indexer)
        # error: Incompatible return value type (got "Series", expected "NDFrameT")
        return obj._constructor(  # type: ignore[return-value]
            new_values, index=new_index, name=obj.name
        )
    elif isinstance(obj, ABCDataFrame):
        if axis == 1:
            raise NotImplementedError("axis 1 is not supported")
        new_mgr = obj._mgr.reindex_indexer(new_axis=new_index, indexer=indexer, axis=1)
        # error: Incompatible return value type
        # (got "DataFrame", expected "NDFrameT")
        return obj._constructor(new_mgr)  # type: ignore[return-value]
    else:
        raise ValueError("'obj' should be either a Series or a DataFrame")


def _get_timestamp_range_edges(
    first: Timestamp,
    last: Timestamp,
    freq: BaseOffset,
    unit: str,
    closed: Literal["right", "left"] = "left",
    origin: TimeGrouperOrigin = "start_day",
    offset: Timedelta | None = None,
) -> tuple[Timestamp, Timestamp]:
    """
    Adjust the `first` Timestamp to the preceding Timestamp that resides on
    the provided offset. Adjust the `last` Timestamp to the following
    Timestamp that resides on the provided offset. Input Timestamps that
    already reside on the offset will be adjusted depending on the type of
    offset and the `closed` parameter.

    Parameters
    ----------
    first : pd.Timestamp
        The beginning Timestamp of the range to be adjusted.
    last : pd.Timestamp
        The ending Timestamp of the range to be adjusted.
    freq : pd.DateOffset
        The dateoffset to which the Timestamps will be adjusted.
    unit : str
        The resolution unit of the timestamps (e.g. "ns").
    closed : {'right', 'left'}, default "left"
        Which side of bin interval is closed.
    origin : {'epoch', 'start', 'start_day'} or Timestamp, default 'start_day'
        The timestamp on which to adjust the grouping. The timezone of origin must
        match the timezone of the index. If a timestamp is not used, these values
        are also supported:

        - 'epoch': `origin` is 1970-01-01
        - 'start': `origin` is the first value of the timeseries
        - 'start_day': `origin` is the first day at midnight of the timeseries
    offset : pd.Timedelta, default None
        An offset timedelta added to the origin.

    Returns
    -------
    A tuple of length 2, containing the adjusted pd.Timestamp objects.
    """
    if isinstance(freq, Tick):
        index_tz = first.tz
        if isinstance(origin, Timestamp) and (origin.tz is None) != (index_tz is None):
            raise ValueError("The origin must have the same timezone as the index.")
        if origin == "epoch":
            # set the epoch based on the timezone to get similar bin results when
            # resampling the same kind of index on different timezones
            origin = Timestamp("1970-01-01", tz=index_tz)

        if isinstance(freq, Day):
            # _adjust_dates_anchored assumes 'D' means 24H, but first/last
            # might contain a DST transition (23H, 24H, or 25H).
            # So "pretend" the dates are naive when adjusting the endpoints
            first = first.tz_localize(None)
            last = last.tz_localize(None)
            if isinstance(origin, Timestamp):
                origin = origin.tz_localize(None)

        first, last = _adjust_dates_anchored(
            first, last, freq, closed=closed, origin=origin, offset=offset, unit=unit
        )
        if isinstance(freq, Day):
            first = first.tz_localize(index_tz)
            last = last.tz_localize(index_tz)
    else:
        first = first.normalize()
        last = last.normalize()

        if closed == "left":
            first = Timestamp(freq.rollback(first))
        else:
            first = Timestamp(first - freq)

        last = Timestamp(last + freq)

    return first, last
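
# Editor's worked example (illustrative): for 17-minute bins on data starting
# at 2000-10-01 23:30, the left edge depends on 'origin'. With the default
# 'start_day' origin, (23h30m) % 17min == 16min, so the first edge rolls back
# to 23:14; relative to the 'epoch' origin the remainder is 12min, giving a
# first edge of 23:18:
#
#   >>> import pandas as pd
#   >>> idx = pd.date_range("2000-10-01 23:30:00", periods=10, freq="17min")
#   >>> ts = pd.Series(range(10), index=idx)
#   >>> start_day_bins = ts.resample("17min").sum()              # first bin 23:14
#   >>> epoch_bins = ts.resample("17min", origin="epoch").sum()  # first bin 23:18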


def _get_period_range_edges(
    first: Period,
    last: Period,
    freq: BaseOffset,
    closed: Literal["right", "left"] = "left",
    origin: TimeGrouperOrigin = "start_day",
    offset: Timedelta | None = None,
) -> tuple[Period, Period]:
    """
    Adjust the provided `first` and `last` Periods to the respective Period of
    the given offset that encompasses them.

    Parameters
    ----------
    first : pd.Period
        The beginning Period of the range to be adjusted.
    last : pd.Period
        The ending Period of the range to be adjusted.
    freq : pd.DateOffset
        The freq to which the Periods will be adjusted.
    closed : {'right', 'left'}, default "left"
        Which side of bin interval is closed.
    origin : {'epoch', 'start', 'start_day'} or Timestamp, default 'start_day'
        The timestamp on which to adjust the grouping. The timezone of origin must
        match the timezone of the index. If a timestamp is not used, these values
        are also supported:

        - 'epoch': `origin` is 1970-01-01
        - 'start': `origin` is the first value of the timeseries
        - 'start_day': `origin` is the first day at midnight of the timeseries
    offset : pd.Timedelta, default None
        An offset timedelta added to the origin.

    Returns
    -------
    A tuple of length 2, containing the adjusted pd.Period objects.
    """
    if not all(isinstance(obj, Period) for obj in [first, last]):
        raise TypeError("'first' and 'last' must be instances of type Period")

    # GH 23882
    first_ts = first.to_timestamp()
    last_ts = last.to_timestamp()
    adjust_first = not freq.is_on_offset(first_ts)
    adjust_last = freq.is_on_offset(last_ts)

    first_ts, last_ts = _get_timestamp_range_edges(
        first_ts, last_ts, freq, unit="ns", closed=closed, origin=origin, offset=offset
    )

    first = (first_ts + int(adjust_first) * freq).to_period(freq)
    last = (last_ts - int(adjust_last) * freq).to_period(freq)
    return first, last
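
# Editor's sketch: because the edges round-trip through timestamps, period
# resampling inherits the same 'origin'/'offset' behavior as timestamp bins.
# Assuming a pandas version where 'offset' is honored for period resampling:
#
#   >>> import pandas as pd
#   >>> pi = pd.period_range("2000-01-01 00:01", periods=8, freq="min")
#   >>> shifted = pd.Series(range(8), index=pi).resample("5min", offset="2min").sum()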


def _insert_nat_bin(
    binner: PeriodIndex, bins: np.ndarray, labels: PeriodIndex, nat_count: int
) -> tuple[PeriodIndex, np.ndarray, PeriodIndex]:
    # NaT handling as in pandas._lib.lib.generate_bins_dt64()
    # shift bins by the number of NaT
    assert nat_count > 0
    bins += nat_count
    bins = np.insert(bins, 0, nat_count)

    # Incompatible types in assignment (expression has type "Index", variable
    # has type "PeriodIndex")
    binner = binner.insert(0, NaT)  # type: ignore[assignment]
    # Incompatible types in assignment (expression has type "Index", variable
    # has type "PeriodIndex")
    labels = labels.insert(0, NaT)  # type: ignore[assignment]
    return binner, bins, labels
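
# Editor's worked numbers: with nat_count=3 and bins=[2, 5], the NaT bin is
# prepended and the remaining edges shift right by 3:
#
#   >>> import numpy as np
#   >>> np.insert(np.array([2, 5]) + 3, 0, 3)
#   array([3, 5, 8])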


def _adjust_dates_anchored(
    first: Timestamp,
    last: Timestamp,
    freq: Tick,
    closed: Literal["right", "left"] = "right",
    origin: TimeGrouperOrigin = "start_day",
    offset: Timedelta | None = None,
    unit: str = "ns",
) -> tuple[Timestamp, Timestamp]:
    # First and last offsets should be calculated from the start day to fix an
    # error caused by resampling across multiple days when a one-day period is
    # not a multiple of the frequency. See GH 8683
    # To handle frequencies that are not a multiple of a day, or that do not
    # evenly divide a day, we allow a fixed origin timestamp to be defined.
    # See GH 31809
    first = first.as_unit(unit)
    last = last.as_unit(unit)
    if offset is not None:
        offset = offset.as_unit(unit)

    freq_value = Timedelta(freq).as_unit(unit)._value

    origin_timestamp = 0  # origin == "epoch"
    if origin == "start_day":
        origin_timestamp = first.normalize()._value
    elif origin == "start":
        origin_timestamp = first._value
    elif isinstance(origin, Timestamp):
        origin_timestamp = origin.as_unit(unit)._value
    elif origin in ["end", "end_day"]:
        origin_last = last if origin == "end" else last.ceil("D")
        sub_freq_times = (origin_last._value - first._value) // freq_value
        if closed == "left":
            sub_freq_times += 1
        first = origin_last - sub_freq_times * freq
        origin_timestamp = first._value
    origin_timestamp += offset._value if offset else 0

    # GH 10117 & GH 19375. If first and last contain timezone information,
    # perform the calculation in UTC in order to avoid localizing on an
    # ambiguous or nonexistent time.
    first_tzinfo = first.tzinfo
    last_tzinfo = last.tzinfo
    if first_tzinfo is not None:
        first = first.tz_convert("UTC")
    if last_tzinfo is not None:
        last = last.tz_convert("UTC")

    foffset = (first._value - origin_timestamp) % freq_value
    loffset = (last._value - origin_timestamp) % freq_value

    if closed == "right":
        if foffset > 0:
            # roll back
            fresult_int = first._value - foffset
        else:
            fresult_int = first._value - freq_value

        if loffset > 0:
            # roll forward
            lresult_int = last._value + (freq_value - loffset)
        else:
            # already the end of the road
            lresult_int = last._value
    else:  # closed == 'left'
        if foffset > 0:
            fresult_int = first._value - foffset
        else:
            # start of the road
            fresult_int = first._value

        if loffset > 0:
            # roll forward
            lresult_int = last._value + (freq_value - loffset)
        else:
            lresult_int = last._value + freq_value

    fresult = Timestamp(fresult_int, unit=unit)
    lresult = Timestamp(lresult_int, unit=unit)
    if first_tzinfo is not None:
        fresult = fresult.tz_localize("UTC").tz_convert(first_tzinfo)
    if last_tzinfo is not None:
        lresult = lresult.tz_localize("UTC").tz_convert(last_tzinfo)
    return fresult, lresult
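
# Editor's worked numbers for the modular arithmetic above: with freq=17min,
# origin='start_day' (midnight) and first=23:30, the offset from the origin
# in integer minutes is
#
#   >>> (23 * 60 + 30) % 17
#   16
#
# so with closed='left' the first edge rolls back 16 minutes to 23:14, while
# a remainder of zero would roll back a full bin only in the closed='right'
# branch.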


def asfreq(
    obj: NDFrameT,
    freq,
    method=None,
    how=None,
    normalize: bool = False,
    fill_value=None,
) -> NDFrameT:
    """
    Utility frequency conversion method for Series/DataFrame.

    See :meth:`pandas.NDFrame.asfreq` for full documentation.
    """
    if isinstance(obj.index, PeriodIndex):
        if method is not None:
            raise NotImplementedError("'method' argument is not supported")

        if how is None:
            how = "E"

        new_obj = obj.copy()
        new_obj.index = obj.index.asfreq(freq, how=how)
    elif len(obj.index) == 0:
        new_obj = obj.copy()
        new_obj.index = _asfreq_compat(obj.index, freq)
    else:
        dti = date_range(obj.index.min(), obj.index.max(), freq=freq)
        dti.name = obj.index.name
        new_obj = obj.reindex(dti, method=method, fill_value=fill_value)
        if normalize:
            new_obj.index = new_obj.index.normalize()

    return new_obj
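
# Editor's usage sketch: the reindex branch is what inserts missing rows when
# converting frequency without aggregation:
#
#   >>> import pandas as pd
#   >>> s = pd.Series([1.0, 2.0], index=pd.to_datetime(["2000-01-01", "2000-01-03"]))
#   >>> daily = asfreq(s, "D")                 # 2000-01-02 appears as NaN
#   >>> padded = asfreq(s, "D", method="pad")  # 2000-01-02 forward-filled to 1.0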


def _asfreq_compat(index: DatetimeIndex | PeriodIndex | TimedeltaIndex, freq):
    """
    Helper to mimic asfreq on (empty) DatetimeIndex and TimedeltaIndex.

    Parameters
    ----------
    index : PeriodIndex, DatetimeIndex, or TimedeltaIndex
    freq : DateOffset

    Returns
    -------
    same type as index
    """
    if len(index) != 0:
        # This should never be reached, always checked by the caller
        raise ValueError(
            "Can only set arbitrary freq for empty DatetimeIndex or TimedeltaIndex"
        )
    new_index: Index
    if isinstance(index, PeriodIndex):
        new_index = index.asfreq(freq=freq)
    elif isinstance(index, DatetimeIndex):
        new_index = DatetimeIndex([], dtype=index.dtype, freq=freq, name=index.name)
    elif isinstance(index, TimedeltaIndex):
        new_index = TimedeltaIndex([], dtype=index.dtype, freq=freq, name=index.name)
    else:  # pragma: no cover
        raise TypeError(type(index))
    return new_index
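
# Editor's sketch: this helper only ever sees empty indexes, where it stamps
# the requested freq onto a fresh empty index of the same type:
#
#   >>> import pandas as pd
#   >>> empty = _asfreq_compat(pd.DatetimeIndex([]), "D")  # empty, freq set to Day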


def maybe_warn_args_and_kwargs(cls, kernel: str, args, kwargs) -> None:
    """
    Warn for deprecation of args and kwargs in resample functions.

    Parameters
    ----------
    cls : type
        Class to warn about.
    kernel : str
        Operation name.
    args : tuple or None
        args passed by user. Will be None if and only if kernel does not have args.
    kwargs : dict or None
        kwargs passed by user. Will be None if and only if kernel does not have kwargs.
    """
    warn_args = args is not None and len(args) > 0
    warn_kwargs = kwargs is not None and len(kwargs) > 0
    if warn_args and warn_kwargs:
        msg = "args and kwargs"
    elif warn_args:
        msg = "args"
    elif warn_kwargs:
        msg = "kwargs"
    else:
        return
    warnings.warn(
        f"Passing additional {msg} to {cls.__name__}.{kernel} has "
        "no impact on the result and is deprecated. This will "
        "raise a TypeError in a future version of pandas.",
        category=FutureWarning,
        stacklevel=find_stack_level(),
    )
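
# Editor's trigger sketch: the warning above fires whenever a resample kernel
# receives extra args/kwargs that it will ignore. Calling the helper directly:
#
#   >>> import pandas as pd
#   >>> maybe_warn_args_and_kwargs(pd.Series, "sum", (1,), None)      # warns about args
#   >>> maybe_warn_args_and_kwargs(pd.Series, "sum", None, {"x": 1})  # warns about kwargs
#   >>> maybe_warn_args_and_kwargs(pd.Series, "sum", (), {})          # no warning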