123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153 |
- """
- Module containing utilities for NDFrame.sample() and .GroupBy.sample()
- """
- from __future__ import annotations
- from typing import TYPE_CHECKING
- import numpy as np
- from pandas._libs import lib
- from pandas._typing import AxisInt
- from pandas.core.dtypes.generic import (
- ABCDataFrame,
- ABCSeries,
- )
- if TYPE_CHECKING:
- from pandas.core.generic import NDFrame
def preprocess_weights(obj: NDFrame, weights, axis: AxisInt) -> np.ndarray:
    """
    Process and validate the `weights` argument to `NDFrame.sample` and
    `.GroupBy.sample`.

    Returns `weights` as an ndarray[np.float64], validated except for normalizing
    weights (because that must be done groupwise in groupby sampling).

    Parameters
    ----------
    obj : Series or DataFrame
        The object being sampled.
    weights : str, Series or array-like
        A Series is reindexed to ``obj.axes[axis]``. A string is looked up
        as a column of `obj`, which is only allowed when `obj` is a
        DataFrame and ``axis == 0``. Anything else is passed to the Series
        constructor with ``dtype="float64"``.
    axis : int
        Axis of `obj` that is being sampled.

    Returns
    -------
    np.ndarray[np.float64]
        Non-normalized weights with missing values replaced by 0.

    Raises
    ------
    KeyError
        If `weights` is a string that is not a column of `obj`.
    ValueError
        If `weights` is a string and `obj` is not a DataFrame or
        ``axis != 0``; if the number of weights differs from the length of
        the sampled axis; or if any weight is infinite or negative.
    """
    # If a series, align with frame
    if isinstance(weights, ABCSeries):
        weights = weights.reindex(obj.axes[axis])

    # Strings acceptable if a dataframe and axis = 0
    if isinstance(weights, str):
        if not isinstance(obj, ABCDataFrame):
            raise ValueError(
                "Strings cannot be passed as weights when sampling from a Series."
            )
        if axis != 0:
            raise ValueError(
                "Strings can only be passed to "
                "weights when sampling from rows on "
                "a DataFrame"
            )
        try:
            weights = obj[weights]
        except KeyError as err:
            raise KeyError("String passed to weights not a valid column") from err

    # Build a 1-D container of the same kind that `obj` would slice into,
    # so the float64 conversion goes through the Series constructor.
    if isinstance(obj, ABCSeries):
        func = obj._constructor
    else:
        func = obj._constructor_sliced
    weights = func(weights, dtype="float64")._values

    if len(weights) != obj.shape[axis]:
        raise ValueError("Weights and axis to be sampled must be of same length")

    if lib.has_infs(weights):
        raise ValueError("weight vector may not include `inf` values")

    if (weights < 0).any():
        raise ValueError("weight vector may not include negative values")

    missing = np.isnan(weights)
    if missing.any():
        # Don't modify weights in place: the array may share memory with
        # the caller's data (e.g. a column of `obj`).
        weights = weights.copy()
        weights[missing] = 0
    return weights
def process_sampling_size(
    n: int | None, frac: float | None, replace: bool
) -> int | None:
    """
    Validate the `n` / `frac` pair given to `NDFrame.sample` and
    `.GroupBy.sample` and resolve it to a sampling size.

    Parameters
    ----------
    n : int or None
        Requested number of items.
    frac : float or None
        Requested fraction of items.
    replace : bool
        Whether sampling is done with replacement; needed to allow
        ``frac > 1``.

    Returns
    -------
    int or None
        The constant sampling size, or None when `frac` should be used
        instead (variable sampling sizes). Defaults to 1 when neither `n`
        nor `frac` is given.

    Raises
    ------
    ValueError
        If both `n` and `frac` are given, if either one is negative, if `n`
        is not a whole number, or if ``frac > 1`` without `replace`.
    """
    if n is None:
        if frac is None:
            # Nothing requested at all: fall back to a single item.
            return 1

        # Only `frac` was supplied.
        if frac > 1 and not replace:
            raise ValueError(
                "Replace has to be set to `True` when "
                "upsampling the population `frac` > 1."
            )
        if frac < 0:
            raise ValueError(
                "A negative number of rows requested. Please provide `frac` >= 0."
            )
        return None

    if frac is not None:
        raise ValueError("Please enter a value for `frac` OR `n`, not both")

    # Only `n` was supplied.
    if n < 0:
        raise ValueError(
            "A negative number of rows requested. Please provide `n` >= 0."
        )
    if n % 1 != 0:
        raise ValueError("Only integers accepted as `n` values")
    return n
def sample(
    obj_len: int,
    size: int,
    replace: bool,
    weights: np.ndarray | None,
    random_state: np.random.RandomState | np.random.Generator,
) -> np.ndarray:
    """
    Draw `size` random positions out of `np.arange(obj_len)`.

    Parameters
    ----------
    obj_len : int
        Number of positions that can be drawn from.
    size : int
        How many positions to draw.
    replace : bool
        Whether the same position may be drawn more than once.
    weights : np.ndarray[np.float64] or None
        Relative weight of each position; divided by its sum before use.
        None means every position is equally likely.
    random_state : np.random.RandomState or np.random.Generator
        Source of randomness used for the draw.

    Returns
    -------
    np.ndarray[np.intp]

    Raises
    ------
    ValueError
        If `weights` is given and sums to zero.
    """
    probabilities = weights
    if probabilities is not None:
        total = probabilities.sum()
        if total == 0:
            raise ValueError("Invalid weights: weights sum to zero")
        # Division builds a new array, so the caller's weights stay untouched.
        probabilities = probabilities / total

    chosen = random_state.choice(
        obj_len, size=size, replace=replace, p=probabilities
    )
    return chosen.astype(np.intp, copy=False)
|