- """
- Quantilization functions and related stuff
- """
- from __future__ import annotations
- from typing import (
- Any,
- Callable,
- Literal,
- )
- import numpy as np
- from pandas._libs import (
- Timedelta,
- Timestamp,
- )
- from pandas._libs.lib import infer_dtype
- from pandas._typing import IntervalLeftRight
- from pandas.core.dtypes.common import (
- DT64NS_DTYPE,
- ensure_platform_int,
- is_bool_dtype,
- is_categorical_dtype,
- is_datetime64_dtype,
- is_datetime64tz_dtype,
- is_datetime_or_timedelta_dtype,
- is_extension_array_dtype,
- is_integer,
- is_list_like,
- is_numeric_dtype,
- is_scalar,
- is_timedelta64_dtype,
- )
- from pandas.core.dtypes.generic import ABCSeries
- from pandas.core.dtypes.missing import isna
- from pandas import (
- Categorical,
- Index,
- IntervalIndex,
- to_datetime,
- to_timedelta,
- )
- from pandas.core import nanops
- import pandas.core.algorithms as algos

def cut(
    x,
    bins,
    right: bool = True,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    include_lowest: bool = False,
    duplicates: str = "raise",
    ordered: bool = True,
):
- """
- Bin values into discrete intervals.
- Use `cut` when you need to segment and sort data values into bins. This
- function is also useful for going from a continuous variable to a
- categorical variable. For example, `cut` could convert ages to groups of
- age ranges. Supports binning into an equal number of bins, or a
- pre-specified array of bins.
- Parameters
- ----------
- x : array-like
- The input array to be binned. Must be 1-dimensional.
- bins : int, sequence of scalars, or IntervalIndex
- The criteria to bin by.
- * int : Defines the number of equal-width bins in the range of `x`. The
- range of `x` is extended by .1% on each side to include the minimum
- and maximum values of `x`.
- * sequence of scalars : Defines the bin edges allowing for non-uniform
- width. No extension of the range of `x` is done.
- * IntervalIndex : Defines the exact bins to be used. Note that
- IntervalIndex for `bins` must be non-overlapping.
- right : bool, default True
- Indicates whether `bins` includes the rightmost edge or not. If
- ``right == True`` (the default), then the `bins` ``[1, 2, 3, 4]``
- indicate (1,2], (2,3], (3,4]. This argument is ignored when
- `bins` is an IntervalIndex.
- labels : array or False, default None
- Specifies the labels for the returned bins. Must be the same length as
- the resulting bins. If False, returns only integer indicators of the
- bins. This affects the type of the output container (see below).
- This argument is ignored when `bins` is an IntervalIndex. If True,
- raises an error. When `ordered=False`, labels must be provided.
- retbins : bool, default False
- Whether to return the bins or not. Useful when bins is provided
- as a scalar.
- precision : int, default 3
- The precision at which to store and display the bins labels.
- include_lowest : bool, default False
- Whether the first interval should be left-inclusive or not.
- duplicates : {default 'raise', 'drop'}, optional
- If bin edges are not unique, raise ValueError or drop non-uniques.
- ordered : bool, default True
- Whether the labels are ordered or not. Applies to returned types
- Categorical and Series (with Categorical dtype). If True,
- the resulting categorical will be ordered. If False, the resulting
- categorical will be unordered (labels must be provided).
- .. versionadded:: 1.1.0

    Returns
    -------
    out : Categorical, Series, or ndarray
        An array-like object representing the respective bin for each value
        of `x`. The type depends on the value of `labels`.

        * None (default) : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are Interval dtype.

        * sequence of scalars : returns a Series for Series `x` or a
          Categorical for all other inputs. The values stored within
          are whatever the type in the sequence is.

        * False : returns an ndarray of integers.

    bins : numpy.ndarray or IntervalIndex
        The computed or specified bins. Only returned when `retbins=True`.
        For scalar or sequence `bins`, this is an ndarray with the computed
        bins. If ``duplicates='drop'`` is set, non-unique bin edges are
        dropped. For an IntervalIndex `bins`, this is equal to `bins`.

    See Also
    --------
    qcut : Discretize variable into equal-sized buckets based on rank
        or based on sample quantiles.
    Categorical : Array type for storing data that come from a
        fixed set of values.
    Series : One-dimensional array with axis labels (including time series).
    IntervalIndex : Immutable Index implementing an ordered, sliceable set.

    Notes
    -----
    Any NA values will be NA in the result. Out of bounds values will be NA in
    the resulting Series or Categorical object.

    Reference :ref:`the user guide <reshaping.tile.cut>` for more examples.

    Examples
    --------
    Discretize into three equal-sized bins.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3)
    ... # doctest: +ELLIPSIS
    [(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True)
    ... # doctest: +ELLIPSIS
    ([(0.994, 3.0], (5.0, 7.0], (3.0, 5.0], (3.0, 5.0], (5.0, 7.0], ...
    Categories (3, interval[float64, right]): [(0.994, 3.0] < (3.0, 5.0] ...
    array([0.994, 3.   , 5.   , 7.   ]))

    Discovers the same bins, but assigns them specific labels. Notice that
    the returned Categorical's categories are `labels` and are ordered.

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]),
    ...        3, labels=["bad", "medium", "good"])
    ['bad', 'good', 'medium', 'medium', 'good', 'bad']
    Categories (3, object): ['bad' < 'medium' < 'good']

    ``ordered=False`` will result in unordered categories when labels are passed.
    This parameter can be used to allow non-unique labels:

    >>> pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3,
    ...        labels=["B", "A", "B"], ordered=False)
    ['B', 'B', 'A', 'A', 'B', 'B']
    Categories (2, object): ['A', 'B']

    ``labels=False`` implies you just want the bins back.

    >>> pd.cut([0, 1, 1, 2], bins=4, labels=False)
    array([0, 1, 1, 3])

    Passing a Series as an input returns a Series with categorical dtype:

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, 3)
    ... # doctest: +ELLIPSIS
    a    (1.992, 4.667]
    b    (1.992, 4.667]
    c    (4.667, 7.333]
    d     (7.333, 10.0]
    e     (7.333, 10.0]
    dtype: category
    Categories (3, interval[float64, right]): [(1.992, 4.667] < (4.667, ...

    Passing a Series as an input returns a Series with the bin indicator for
    each value, mapping values numerically to intervals based on the bins.

    >>> s = pd.Series(np.array([2, 4, 6, 8, 10]),
    ...               index=['a', 'b', 'c', 'd', 'e'])
    >>> pd.cut(s, [0, 2, 4, 6, 8, 10], labels=False, retbins=True, right=False)
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    4.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6,  8, 10]))

    Use the `drop` option when bins are not unique:

    >>> pd.cut(s, [0, 2, 4, 6, 10, 10], labels=False, retbins=True,
    ...        right=False, duplicates='drop')
    ... # doctest: +ELLIPSIS
    (a    1.0
     b    2.0
     c    3.0
     d    3.0
     e    NaN
     dtype: float64,
     array([ 0,  2,  4,  6, 10]))

    Passing an IntervalIndex for `bins` results in those categories exactly.
    Notice that values not covered by the IntervalIndex are set to NaN. 0
    is to the left of the first bin (which is closed on the right), and 1.5
    falls between two bins.

    >>> bins = pd.IntervalIndex.from_tuples([(0, 1), (2, 3), (4, 5)])
    >>> pd.cut([0, 0.5, 1.5, 2.5, 4.5], bins)
    [NaN, (0.0, 1.0], NaN, (2.0, 3.0], (4.0, 5.0]]
    Categories (3, interval[int64, right]): [(0, 1] < (2, 3] < (4, 5]]
    """
    # NOTE: this binning code is changed a bit from histogram for var(x) == 0
    original = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)
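
    # `bins` may be an integer (equal-width edges over the range of the data,
    # with an outer edge nudged by 0.1% of the range so the boundary value
    # falls inside a bin), an IntervalIndex (used as-is), or an explicit
    # sequence of edges.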
    if not np.iterable(bins):
        if is_scalar(bins) and bins < 1:
            raise ValueError("`bins` should be a positive integer.")

        try:  # for array-like
            sz = x.size
        except AttributeError:
            x = np.asarray(x)
            sz = x.size

        if sz == 0:
            raise ValueError("Cannot cut empty array")

        rng = (nanops.nanmin(x), nanops.nanmax(x))
        mn, mx = (mi + 0.0 for mi in rng)

        if np.isinf(mn) or np.isinf(mx):
            # GH 24314
            raise ValueError(
                "cannot specify integer `bins` when input data contains infinity"
            )
        if mn == mx:  # adjust end points before binning
            mn -= 0.001 * abs(mn) if mn != 0 else 0.001
            mx += 0.001 * abs(mx) if mx != 0 else 0.001
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
        else:  # adjust end points after binning
            bins = np.linspace(mn, mx, bins + 1, endpoint=True)
            adj = (mx - mn) * 0.001  # 0.1% of the range
            if right:
                bins[0] -= adj
            else:
                bins[-1] += adj

    elif isinstance(bins, IntervalIndex):
        if bins.is_overlapping:
            raise ValueError("Overlapping IntervalIndex is not accepted.")

    else:
        if is_datetime64tz_dtype(bins):
            bins = np.asarray(bins, dtype=DT64NS_DTYPE)
        else:
            bins = np.asarray(bins)
        bins = _convert_bin_to_numeric_type(bins, dtype)

        # GH 26045: cast to float64 to avoid an overflow
        if (np.diff(bins.astype("float64")) < 0).any():
            raise ValueError("bins must increase monotonically.")

    fac, bins = _bins_to_cuts(
        x,
        bins,
        right=right,
        labels=labels,
        precision=precision,
        include_lowest=include_lowest,
        dtype=dtype,
        duplicates=duplicates,
        ordered=ordered,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, original)

def qcut(
    x,
    q,
    labels=None,
    retbins: bool = False,
    precision: int = 3,
    duplicates: str = "raise",
):
- """
- Quantile-based discretization function.
- Discretize variable into equal-sized buckets based on rank or based
- on sample quantiles. For example 1000 values for 10 quantiles would
- produce a Categorical object indicating quantile membership for each data point.
- Parameters
- ----------
- x : 1d ndarray or Series
- q : int or list-like of float
- Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
- array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.
- labels : array or False, default None
- Used as labels for the resulting bins. Must be of the same length as
- the resulting bins. If False, return only integer indicators of the
- bins. If True, raises an error.
- retbins : bool, optional
- Whether to return the (bins, labels) or not. Can be useful if bins
- is given as a scalar.
- precision : int, optional
- The precision at which to store and display the bins labels.
- duplicates : {default 'raise', 'drop'}, optional
- If bin edges are not unique, raise ValueError or drop non-uniques.
- Returns
- -------
- out : Categorical or Series or array of integers if labels is False
- The return type (Categorical or Series) depends on the input: a Series
- of type category if input is a Series else Categorical. Bins are
- represented as categories when categorical data is returned.
- bins : ndarray of floats
- Returned only if `retbins` is True.
- Notes
- -----
- Out of bounds values will be NA in the resulting Categorical object
- Examples
- --------
- >>> pd.qcut(range(5), 4)
- ... # doctest: +ELLIPSIS
- [(-0.001, 1.0], (-0.001, 1.0], (1.0, 2.0], (2.0, 3.0], (3.0, 4.0]]
- Categories (4, interval[float64, right]): [(-0.001, 1.0] < (1.0, 2.0] ...
- >>> pd.qcut(range(5), 3, labels=["good", "medium", "bad"])
- ... # doctest: +SKIP
- [good, good, medium, bad, bad]
- Categories (3, object): [good < medium < bad]
- >>> pd.qcut(range(5), 4, labels=False)
- array([0, 0, 1, 2, 3])
- """
    original = x
    x = _preprocess_for_cut(x)
    x, dtype = _coerce_to_type(x)
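
    # For an integer q, build q + 1 evenly spaced quantile edges in [0, 1]
    # (e.g. q=4 -> [0, 0.25, 0.5, 0.75, 1]); otherwise use the given quantiles.
    # NaNs are dropped before computing the empirical quantiles used as edges.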
    quantiles = np.linspace(0, 1, q + 1) if is_integer(q) else q

    x_np = np.asarray(x)
    x_np = x_np[~np.isnan(x_np)]
    bins = np.quantile(x_np, quantiles)

    fac, bins = _bins_to_cuts(
        x,
        bins,
        labels=labels,
        precision=precision,
        include_lowest=True,
        dtype=dtype,
        duplicates=duplicates,
    )

    return _postprocess_for_cut(fac, bins, retbins, dtype, original)

def _bins_to_cuts(
    x,
    bins: np.ndarray,
    right: bool = True,
    labels=None,
    precision: int = 3,
    include_lowest: bool = False,
    dtype=None,
    duplicates: str = "raise",
    ordered: bool = True,
):
    if not ordered and labels is None:
        raise ValueError("'labels' must be provided if 'ordered = False'")

    if duplicates not in ["raise", "drop"]:
        raise ValueError(
            "invalid value for 'duplicates' parameter, valid options are: raise, drop"
        )

    if isinstance(bins, IntervalIndex):
        # we have a fast-path here
        ids = bins.get_indexer(x)
        result = Categorical.from_codes(ids, categories=bins, ordered=True)
        return result, bins

    unique_bins = algos.unique(bins)
    if len(unique_bins) < len(bins) and len(bins) != 2:
        if duplicates == "raise":
            raise ValueError(
                f"Bin edges must be unique: {repr(bins)}.\n"
                f"You can drop duplicate edges by setting the 'duplicates' kwarg"
            )
        bins = unique_bins
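
    # With side="left", searchsorted returns for each value the index of the
    # first edge >= value, so values equal to an edge land in the interval
    # closed on the right (side="right" gives the left-closed behaviour).
    # ids of 0 or len(bins) mark values outside the outermost edges.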
    side: Literal["left", "right"] = "left" if right else "right"
    ids = ensure_platform_int(bins.searchsorted(x, side=side))

    if include_lowest:
        ids[np.asarray(x) == bins[0]] = 1

    na_mask = isna(x) | (ids == len(bins)) | (ids == 0)
    has_nas = na_mask.any()

    if labels is not False:
        if not (labels is None or is_list_like(labels)):
            raise ValueError(
                "Bin labels must either be False, None or passed in as a "
                "list-like argument"
            )

        if labels is None:
            labels = _format_labels(
                bins, precision, right=right, include_lowest=include_lowest, dtype=dtype
            )
        elif ordered and len(set(labels)) != len(labels):
            raise ValueError(
                "labels must be unique if ordered=True; pass ordered=False "
                "for duplicate labels"
            )
        else:
            if len(labels) != len(bins) - 1:
                raise ValueError(
                    "Bin labels must be one fewer than the number of bin edges"
                )

        if not is_categorical_dtype(labels):
            labels = Categorical(
                labels,
                categories=labels if len(set(labels)) == len(labels) else None,
                ordered=ordered,
            )
        # TODO: handle mismatch between categorical label order and pandas.cut order.
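        # Point NA/out-of-bounds positions at code 0 so that ids - 1 becomes -1,
        # which take_nd treats as missing and fills with NaN.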
        np.putmask(ids, na_mask, 0)
        result = algos.take_nd(labels, ids - 1)

    else:
        result = ids - 1
        if has_nas:
            result = result.astype(np.float64)
            np.putmask(result, na_mask, np.nan)

    return result, bins

def _coerce_to_type(x):
    """
    if the passed data is of datetime/timedelta, bool or nullable int type,
    this method converts it to numeric so that cut or qcut method can
    handle it
    """
    dtype = None

    if is_datetime64tz_dtype(x.dtype):
        dtype = x.dtype
    elif is_datetime64_dtype(x.dtype):
        x = to_datetime(x).astype("datetime64[ns]", copy=False)
        dtype = np.dtype("datetime64[ns]")
    elif is_timedelta64_dtype(x.dtype):
        x = to_timedelta(x)
        dtype = np.dtype("timedelta64[ns]")
    elif is_bool_dtype(x.dtype):
        # GH 20303
        x = x.astype(np.int64)
    # To support cut and qcut for IntegerArray we convert to float dtype.
    # Will properly support in the future.
    # https://github.com/pandas-dev/pandas/pull/31290
    # https://github.com/pandas-dev/pandas/issues/31389
    elif is_extension_array_dtype(x.dtype) and is_numeric_dtype(x.dtype):
        x = x.to_numpy(dtype=np.float64, na_value=np.nan)
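
    # Datetime-like data is binned on its int64 nanosecond representation;
    # _convert_bin_to_datelike_type restores datelike bins afterwards.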
    if dtype is not None:
        # GH 19768: force NaT to NaN during integer conversion
        x = np.where(x.notna(), x.view(np.int64), np.nan)

    return x, dtype

def _convert_bin_to_numeric_type(bins, dtype):
    """
    if the passed bin is of datetime/timedelta type,
    this method converts it to integer

    Parameters
    ----------
    bins : list-like of bins
    dtype : dtype of data

    Raises
    ------
    ValueError if bins are not of a compat dtype to dtype
    """
    bins_dtype = infer_dtype(bins, skipna=False)
    if is_timedelta64_dtype(dtype):
        if bins_dtype in ["timedelta", "timedelta64"]:
            bins = to_timedelta(bins).view(np.int64)
        else:
            raise ValueError("bins must be of timedelta64 dtype")
    elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
        if bins_dtype in ["datetime", "datetime64"]:
            bins = to_datetime(bins)
            if is_datetime64_dtype(bins):
                # As of 2.0, to_datetime may give non-nano, so we need to convert
                # here until the rest of this file recognizes non-nano
                bins = bins.astype("datetime64[ns]", copy=False)
            bins = bins.view(np.int64)
        else:
            raise ValueError("bins must be of datetime64 dtype")

    return bins

def _convert_bin_to_datelike_type(bins, dtype):
    """
    Convert bins to a DatetimeIndex or TimedeltaIndex if the original dtype is
    datelike

    Parameters
    ----------
    bins : list-like of bins
    dtype : dtype of data

    Returns
    -------
    bins : Array-like of bins, DatetimeIndex or TimedeltaIndex if dtype is
           datelike
    """
    if is_datetime64tz_dtype(dtype):
        bins = to_datetime(bins.astype(np.int64), utc=True).tz_convert(dtype.tz)
    elif is_datetime_or_timedelta_dtype(dtype):
        bins = Index(bins.astype(np.int64), dtype=dtype)
    return bins

def _format_labels(
    bins, precision: int, right: bool = True, include_lowest: bool = False, dtype=None
):
    """based on the dtype, return our labels"""
    closed: IntervalLeftRight = "right" if right else "left"

    formatter: Callable[[Any], Timestamp] | Callable[[Any], Timedelta]
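
    # Datetime-like bins are rendered as Timestamp/Timedelta labels; numeric
    # bins are rounded to an inferred precision that keeps the edges distinct.
    # `adjust` nudges the lowest left edge down when it must be inclusive.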
    if is_datetime64tz_dtype(dtype):
        formatter = lambda x: Timestamp(x, tz=dtype.tz)
        adjust = lambda x: x - Timedelta("1ns")
    elif is_datetime64_dtype(dtype):
        formatter = Timestamp
        adjust = lambda x: x - Timedelta("1ns")
    elif is_timedelta64_dtype(dtype):
        formatter = Timedelta
        adjust = lambda x: x - Timedelta("1ns")
    else:
        precision = _infer_precision(precision, bins)
        formatter = lambda x: _round_frac(x, precision)
        adjust = lambda x: x - 10 ** (-precision)

    breaks = [formatter(b) for b in bins]
    if right and include_lowest:
        # adjust lhs of first interval by precision to account for being right closed
        breaks[0] = adjust(breaks[0])

    return IntervalIndex.from_breaks(breaks, closed=closed)

def _preprocess_for_cut(x):
    """
    handles preprocessing for cut where we convert passed
    input to array, strip the index information and store it
    separately
    """
    # Check that the passed array is a Pandas or Numpy object
    # We don't want to strip away a Pandas data-type here (e.g. datetimetz)
    ndim = getattr(x, "ndim", None)
    if ndim is None:
        x = np.asarray(x)
    if x.ndim != 1:
        raise ValueError("Input array must be 1 dimensional")

    return x

def _postprocess_for_cut(fac, bins, retbins: bool, dtype, original):
    """
    handles post processing for the cut method where
    we combine the index information if the originally passed
    datatype was a series
    """
    if isinstance(original, ABCSeries):
        fac = original._constructor(fac, index=original.index, name=original.name)

    if not retbins:
        return fac

    bins = _convert_bin_to_datelike_type(bins, dtype)

    return fac, bins

def _round_frac(x, precision: int):
    """
    Round the fractional part of the given number
    """
    if not np.isfinite(x) or x == 0:
        return x
    else:
        frac, whole = np.modf(x)
        if whole == 0:
            digits = -int(np.floor(np.log10(abs(frac)))) - 1 + precision
        else:
            digits = precision
        return np.around(x, digits)

def _infer_precision(base_precision: int, bins) -> int:
    """
    Infer an appropriate precision for _round_frac
    """
    for precision in range(base_precision, 20):
        levels = [_round_frac(b, precision) for b in bins]
        if algos.unique(levels).size == bins.size:
            return precision
    return base_precision  # default