123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408 |
- """
- Module responsible for execution of NDFrame.describe() method.
- Method NDFrame.describe() delegates actual execution to function describe_ndframe().
- """
- from __future__ import annotations
- from abc import (
- ABC,
- abstractmethod,
- )
- from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Hashable,
- Sequence,
- cast,
- )
- import numpy as np
- from pandas._libs.tslibs import Timestamp
- from pandas._typing import (
- DtypeObj,
- NDFrameT,
- npt,
- )
- from pandas.util._validators import validate_percentile
- from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_complex_dtype,
- is_extension_array_dtype,
- is_numeric_dtype,
- )
- from pandas.core.dtypes.dtypes import DatetimeTZDtype
- from pandas.core.arrays.arrow.dtype import ArrowDtype
- from pandas.core.arrays.floating import Float64Dtype
- from pandas.core.reshape.concat import concat
- from pandas.io.formats.format import format_percentiles
- if TYPE_CHECKING:
- from pandas import (
- DataFrame,
- Series,
- )
def describe_ndframe(
    *,
    obj: NDFrameT,
    include: str | Sequence[str] | None,
    exclude: str | Sequence[str] | None,
    percentiles: Sequence[float] | np.ndarray | None,
) -> NDFrameT:
    """Produce the description of a Series or DataFrame.

    Entry point invoked by ``pandas.core.generic.NDFrame.describe()``.

    Parameters
    ----------
    obj : DataFrame or Series
        Object to be described.
    include : 'all', list-like of dtypes or None (default), optional
        Data types to include in the result. Only forwarded to the
        DataFrame describer, so it has no effect for ``Series``.
    exclude : list-like of dtypes or None (default), optional
        Data types to omit from the result. Only forwarded to the
        DataFrame describer, so it has no effect for ``Series``.
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. All should fall between
        0 and 1. ``None`` selects ``[.25, .5, .75]``, i.e. the 25th, 50th
        and 75th percentiles.

    Returns
    -------
    DataFrame or Series
        Description of ``obj``.
    """
    # Normalise once up front; both describers receive the refined array.
    refined = refine_percentiles(percentiles)

    describer: NDFrameDescriberAbstract
    if obj.ndim != 1:
        describer = DataFrameDescriber(
            obj=cast("DataFrame", obj),
            include=include,
            exclude=exclude,
        )
    else:
        # One-dimensional input: include/exclude are deliberately not passed.
        describer = SeriesDescriber(obj=cast("Series", obj))

    return cast(NDFrameT, describer.describe(percentiles=refined))
class NDFrameDescriberAbstract(ABC):
    """Common base for objects that describe a Series or a DataFrame.

    Parameters
    ----------
    obj : Series or DataFrame
        Object to be described; stored unchanged on ``self.obj``.
    """

    def __init__(self, obj: DataFrame | Series) -> None:
        # Subclasses read the object to describe back from this attribute.
        self.obj = obj

    @abstractmethod
    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
        """Build the description; concrete subclasses must override this.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
class SeriesDescriber(NDFrameDescriberAbstract):
    """Describer that produces the description of a single Series."""

    obj: Series

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
        """Describe the wrapped series.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
        # The describing routine is picked by select_describe_func for this series.
        return select_describe_func(self.obj)(self.obj, percentiles)
class DataFrameDescriber(NDFrameDescriberAbstract):
    """Describer that produces the description of a DataFrame.

    Parameters
    ----------
    obj : DataFrame
        DataFrame to be described.
    include : 'all', list-like of dtypes or None
        Data types to include in the result.
    exclude : list-like of dtypes or None
        Data types to omit from the result.

    Raises
    ------
    ValueError
        If ``obj`` is two-dimensional and has no columns.
    """

    def __init__(
        self,
        obj: DataFrame,
        *,
        include: str | Sequence[str] | None,
        exclude: str | Sequence[str] | None,
    ) -> None:
        self.include = include
        self.exclude = exclude

        has_no_columns = obj.ndim == 2 and obj.columns.size == 0
        if has_no_columns:
            raise ValueError("Cannot describe a DataFrame without columns")

        super().__init__(obj)

    def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
        """Describe every selected column and assemble the results.

        Parameters
        ----------
        percentiles : list-like of numbers
            The percentiles to include in the output.
        """
        selected = self._select_data()

        # One description Series per selected column, in column order.
        per_column: list[Series] = [
            select_describe_func(column)(column, percentiles)
            for _, column in selected.items()
        ]

        # Give every description the same row labels before putting them
        # side by side.
        row_labels = reorder_columns(per_column)
        aligned = [desc.reindex(row_labels, copy=False) for desc in per_column]
        summary = concat(aligned, axis=1, sort=False)
        summary.columns = selected.columns.copy()
        return summary

    def _select_data(self) -> DataFrame:
        """Select columns to be described."""
        if self.include is None and self.exclude is None:
            # Default: keep numeric and datetime columns when there are any,
            # otherwise describe the whole frame.
            default_include: list[npt.DTypeLike] = [np.number, "datetime"]
            subset = self.obj.select_dtypes(include=default_include)
            return self.obj if len(subset.columns) == 0 else subset

        if self.include == "all":
            if self.exclude is not None:
                raise ValueError("exclude must be None when include is 'all'")
            return self.obj

        return self.obj.select_dtypes(
            include=self.include,
            exclude=self.exclude,
        )
def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
    """Set a convenient order for rows for display.

    Parameters
    ----------
    ldesc : sequence of Series
        Per-column descriptions whose index labels are to be merged.

    Returns
    -------
    list of Hashable
        Every distinct index label exactly once. Labels of shorter
        descriptions come first; ties keep the input order.
    """
    # sorted() is stable, so descriptions of equal length stay in input order.
    shortest_first = sorted((desc.index for desc in ldesc), key=len)
    # A dict keeps first-insertion order, which gives an ordered de-duplication.
    unique_labels = dict.fromkeys(
        label for labels in shortest_first for label in labels
    )
    return list(unique_labels)
def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing numerical data.

    Parameters
    ----------
    series : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        Values for ``count``, ``mean``, ``std``, ``min``, one entry per
        percentile and ``max``, named after ``series``.
    """
    from pandas import Series

    labels = ["count", "mean", "std", "min", *format_percentiles(percentiles), "max"]
    values = [
        series.count(),
        series.mean(),
        series.std(),
        series.min(),
        *series.quantile(percentiles).tolist(),
        series.max(),
    ]

    # GH#48340 - always return float on non-complex numeric data.
    # None leaves the result dtype to the Series constructor.
    result_dtype: DtypeObj | None = None
    if is_extension_array_dtype(series):
        if not isinstance(series.dtype, ArrowDtype):
            result_dtype = Float64Dtype()
        elif series.dtype.kind != "m":
            import pyarrow as pa

            result_dtype = ArrowDtype(pa.float64())
        # GH53001: Arrow-backed timedeltas (kind "m") keep result_dtype=None.
    elif is_numeric_dtype(series) and not is_complex_dtype(series):
        result_dtype = np.dtype("float")

    return Series(values, index=labels, name=series.name, dtype=result_dtype)
def describe_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing categorical data.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        Values for ``count``, ``unique``, ``top`` and ``freq``, named after
        ``data``.
    """
    from pandas import Series

    value_freqs = data.value_counts()
    # Only values that actually occur count as unique.
    n_unique = len(value_freqs[value_freqs != 0])

    if n_unique > 0:
        # 'top' and 'freq' are read from the first value_counts entry.
        top, freq, dtype = value_freqs.index[0], value_freqs.iloc[0], None
    else:
        # Nothing to report: fill 'top' and 'freq' with NaN so the output
        # keeps the same four rows, and force object dtype.
        top, freq, dtype = np.nan, np.nan, "object"

    return Series(
        [data.count(), n_unique, top, freq],
        index=["count", "unique", "top", "freq"],
        name=data.name,
        dtype=dtype,
    )
def describe_timestamp_as_categorical_1d(
    data: Series,
    percentiles_ignored: Sequence[float],
) -> Series:
    """Describe series containing timestamp data treated as categorical.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles_ignored : list-like of numbers
        Ignored, but in place to unify interface.

    Returns
    -------
    Series
        ``count``, ``unique``, ``top`` and ``freq``, followed by ``first``
        and ``last`` when at least one value occurs; named after ``data``.
    """
    from pandas import Series

    value_freqs = data.value_counts()
    n_unique = len(value_freqs[value_freqs != 0])
    labels = ["count", "unique"]
    values = [data.count(), n_unique]

    if n_unique == 0:
        # Nothing to report: fill 'top' and 'freq' with NaN to keep the
        # leading rows of the output, and force object dtype.
        labels += ["top", "freq"]
        values += [np.nan, np.nan]
        return Series(values, index=labels, name=data.name, dtype="object")

    most_common, freq = value_freqs.index[0], value_freqs.iloc[0]
    tz = data.dt.tz
    # Integer view of the non-missing values, used for first/last below.
    as_i8 = data.dropna().values.view("i8")

    top = Timestamp(most_common)
    if top.tzinfo is not None and tz is not None:
        # Don't tz_localize(None) if key is already tz-aware
        top = top.tz_convert(tz)
    else:
        top = top.tz_localize(tz)

    labels += ["top", "freq", "first", "last"]
    values += [
        top,
        freq,
        Timestamp(as_i8.min(), tz=tz),
        Timestamp(as_i8.max(), tz=tz),
    ]
    return Series(values, index=labels, name=data.name, dtype=None)
def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
    """Describe series containing datetime64 dtype.

    Parameters
    ----------
    data : Series
        Series to be described.
    percentiles : list-like of numbers
        The percentiles to include in the output.

    Returns
    -------
    Series
        Values for ``count``, ``mean``, ``min``, one entry per percentile
        and ``max``, named after ``data``.
    """
    # GH-30164
    from pandas import Series

    labels = ["count", "mean", "min", *format_percentiles(percentiles), "max"]
    values = [
        data.count(),
        data.mean(),
        data.min(),
        *data.quantile(percentiles).tolist(),
        data.max(),
    ]
    return Series(values, index=labels, name=data.name)
def select_describe_func(
    data: Series,
) -> Callable:
    """Select proper function for describing series based on data type.

    Parameters
    ----------
    data : Series
        Series to be described.

    Returns
    -------
    Callable
        One of the ``describe_*_1d`` functions, to be called with the series
        and the percentiles.
    """
    # Booleans are tested before the numeric check and are described as
    # categorical data.
    if is_bool_dtype(data.dtype):
        return describe_categorical_1d
    if is_numeric_dtype(data):
        return describe_numeric_1d

    dtype = data.dtype
    if dtype.kind == "M" or isinstance(dtype, DatetimeTZDtype):
        return describe_timestamp_1d
    if dtype.kind == "m":
        # Timedeltas go through the numeric describer.
        return describe_numeric_1d
    return describe_categorical_1d
def refine_percentiles(
    percentiles: Sequence[float] | np.ndarray | None,
) -> np.ndarray[Any, np.dtype[np.float64]]:
    """
    Ensure that percentiles are unique and sorted.

    Parameters
    ----------
    percentiles : list-like of numbers, optional
        The percentiles to include in the output. ``None`` selects the
        default ``[0.25, 0.5, 0.75]``.

    Returns
    -------
    np.ndarray
        Sorted percentiles; 0.5 is added when it was not requested.

    Raises
    ------
    ValueError
        If the percentiles contain duplicates.
    """
    if percentiles is None:
        return np.array([0.25, 0.5, 0.75])

    # Work on a private list so the caller's sequence is never modified.
    requested = list(percentiles)

    # get them all to be in [0, 1]
    validate_percentile(requested)

    # median should always be included
    if 0.5 not in requested:
        requested.append(0.5)

    as_array = np.asarray(requested)

    # np.unique sorts and drops repeats; a shorter result means duplicates.
    sorted_unique = np.unique(as_array)
    if len(sorted_unique) < len(as_array):
        raise ValueError("percentiles cannot contain duplicates")

    return sorted_unique
|