describe.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408
  1. """
  2. Module responsible for execution of NDFrame.describe() method.
  3. Method NDFrame.describe() delegates actual execution to function describe_ndframe().
  4. """
  5. from __future__ import annotations
  6. from abc import (
  7. ABC,
  8. abstractmethod,
  9. )
  10. from typing import (
  11. TYPE_CHECKING,
  12. Any,
  13. Callable,
  14. Hashable,
  15. Sequence,
  16. cast,
  17. )
  18. import numpy as np
  19. from pandas._libs.tslibs import Timestamp
  20. from pandas._typing import (
  21. DtypeObj,
  22. NDFrameT,
  23. npt,
  24. )
  25. from pandas.util._validators import validate_percentile
  26. from pandas.core.dtypes.common import (
  27. is_bool_dtype,
  28. is_complex_dtype,
  29. is_extension_array_dtype,
  30. is_numeric_dtype,
  31. )
  32. from pandas.core.dtypes.dtypes import DatetimeTZDtype
  33. from pandas.core.arrays.arrow.dtype import ArrowDtype
  34. from pandas.core.arrays.floating import Float64Dtype
  35. from pandas.core.reshape.concat import concat
  36. from pandas.io.formats.format import format_percentiles
  37. if TYPE_CHECKING:
  38. from pandas import (
  39. DataFrame,
  40. Series,
  41. )
  42. def describe_ndframe(
  43. *,
  44. obj: NDFrameT,
  45. include: str | Sequence[str] | None,
  46. exclude: str | Sequence[str] | None,
  47. percentiles: Sequence[float] | np.ndarray | None,
  48. ) -> NDFrameT:
  49. """Describe series or dataframe.
  50. Called from pandas.core.generic.NDFrame.describe()
  51. Parameters
  52. ----------
  53. obj: DataFrame or Series
  54. Either dataframe or series to be described.
  55. include : 'all', list-like of dtypes or None (default), optional
  56. A white list of data types to include in the result. Ignored for ``Series``.
  57. exclude : list-like of dtypes or None (default), optional,
  58. A black list of data types to omit from the result. Ignored for ``Series``.
  59. percentiles : list-like of numbers, optional
  60. The percentiles to include in the output. All should fall between 0 and 1.
  61. The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
  62. 75th percentiles.
  63. Returns
  64. -------
  65. Dataframe or series description.
  66. """
  67. percentiles = refine_percentiles(percentiles)
  68. describer: NDFrameDescriberAbstract
  69. if obj.ndim == 1:
  70. describer = SeriesDescriber(
  71. obj=cast("Series", obj),
  72. )
  73. else:
  74. describer = DataFrameDescriber(
  75. obj=cast("DataFrame", obj),
  76. include=include,
  77. exclude=exclude,
  78. )
  79. result = describer.describe(percentiles=percentiles)
  80. return cast(NDFrameT, result)
  81. class NDFrameDescriberAbstract(ABC):
  82. """Abstract class for describing dataframe or series.
  83. Parameters
  84. ----------
  85. obj : Series or DataFrame
  86. Object to be described.
  87. """
  88. def __init__(self, obj: DataFrame | Series) -> None:
  89. self.obj = obj
  90. @abstractmethod
  91. def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
  92. """Do describe either series or dataframe.
  93. Parameters
  94. ----------
  95. percentiles : list-like of numbers
  96. The percentiles to include in the output.
  97. """
  98. class SeriesDescriber(NDFrameDescriberAbstract):
  99. """Class responsible for creating series description."""
  100. obj: Series
  101. def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
  102. describe_func = select_describe_func(
  103. self.obj,
  104. )
  105. return describe_func(self.obj, percentiles)
  106. class DataFrameDescriber(NDFrameDescriberAbstract):
  107. """Class responsible for creating dataobj description.
  108. Parameters
  109. ----------
  110. obj : DataFrame
  111. DataFrame to be described.
  112. include : 'all', list-like of dtypes or None
  113. A white list of data types to include in the result.
  114. exclude : list-like of dtypes or None
  115. A black list of data types to omit from the result.
  116. """
  117. def __init__(
  118. self,
  119. obj: DataFrame,
  120. *,
  121. include: str | Sequence[str] | None,
  122. exclude: str | Sequence[str] | None,
  123. ) -> None:
  124. self.include = include
  125. self.exclude = exclude
  126. if obj.ndim == 2 and obj.columns.size == 0:
  127. raise ValueError("Cannot describe a DataFrame without columns")
  128. super().__init__(obj)
  129. def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
  130. data = self._select_data()
  131. ldesc: list[Series] = []
  132. for _, series in data.items():
  133. describe_func = select_describe_func(series)
  134. ldesc.append(describe_func(series, percentiles))
  135. col_names = reorder_columns(ldesc)
  136. d = concat(
  137. [x.reindex(col_names, copy=False) for x in ldesc],
  138. axis=1,
  139. sort=False,
  140. )
  141. d.columns = data.columns.copy()
  142. return d
  143. def _select_data(self):
  144. """Select columns to be described."""
  145. if (self.include is None) and (self.exclude is None):
  146. # when some numerics are found, keep only numerics
  147. default_include: list[npt.DTypeLike] = [np.number, "datetime"]
  148. data = self.obj.select_dtypes(include=default_include)
  149. if len(data.columns) == 0:
  150. data = self.obj
  151. elif self.include == "all":
  152. if self.exclude is not None:
  153. msg = "exclude must be None when include is 'all'"
  154. raise ValueError(msg)
  155. data = self.obj
  156. else:
  157. data = self.obj.select_dtypes(
  158. include=self.include,
  159. exclude=self.exclude,
  160. )
  161. return data
  162. def reorder_columns(ldesc: Sequence[Series]) -> list[Hashable]:
  163. """Set a convenient order for rows for display."""
  164. names: list[Hashable] = []
  165. ldesc_indexes = sorted((x.index for x in ldesc), key=len)
  166. for idxnames in ldesc_indexes:
  167. for name in idxnames:
  168. if name not in names:
  169. names.append(name)
  170. return names
  171. def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
  172. """Describe series containing numerical data.
  173. Parameters
  174. ----------
  175. series : Series
  176. Series to be described.
  177. percentiles : list-like of numbers
  178. The percentiles to include in the output.
  179. """
  180. from pandas import Series
  181. formatted_percentiles = format_percentiles(percentiles)
  182. stat_index = ["count", "mean", "std", "min"] + formatted_percentiles + ["max"]
  183. d = (
  184. [series.count(), series.mean(), series.std(), series.min()]
  185. + series.quantile(percentiles).tolist()
  186. + [series.max()]
  187. )
  188. # GH#48340 - always return float on non-complex numeric data
  189. dtype: DtypeObj | None
  190. if is_extension_array_dtype(series):
  191. if isinstance(series.dtype, ArrowDtype):
  192. if series.dtype.kind == "m":
  193. # GH53001: describe timedeltas with object dtype
  194. dtype = None
  195. else:
  196. import pyarrow as pa
  197. dtype = ArrowDtype(pa.float64())
  198. else:
  199. dtype = Float64Dtype()
  200. elif is_numeric_dtype(series) and not is_complex_dtype(series):
  201. dtype = np.dtype("float")
  202. else:
  203. dtype = None
  204. return Series(d, index=stat_index, name=series.name, dtype=dtype)
  205. def describe_categorical_1d(
  206. data: Series,
  207. percentiles_ignored: Sequence[float],
  208. ) -> Series:
  209. """Describe series containing categorical data.
  210. Parameters
  211. ----------
  212. data : Series
  213. Series to be described.
  214. percentiles_ignored : list-like of numbers
  215. Ignored, but in place to unify interface.
  216. """
  217. names = ["count", "unique", "top", "freq"]
  218. objcounts = data.value_counts()
  219. count_unique = len(objcounts[objcounts != 0])
  220. if count_unique > 0:
  221. top, freq = objcounts.index[0], objcounts.iloc[0]
  222. dtype = None
  223. else:
  224. # If the DataFrame is empty, set 'top' and 'freq' to None
  225. # to maintain output shape consistency
  226. top, freq = np.nan, np.nan
  227. dtype = "object"
  228. result = [data.count(), count_unique, top, freq]
  229. from pandas import Series
  230. return Series(result, index=names, name=data.name, dtype=dtype)
  231. def describe_timestamp_as_categorical_1d(
  232. data: Series,
  233. percentiles_ignored: Sequence[float],
  234. ) -> Series:
  235. """Describe series containing timestamp data treated as categorical.
  236. Parameters
  237. ----------
  238. data : Series
  239. Series to be described.
  240. percentiles_ignored : list-like of numbers
  241. Ignored, but in place to unify interface.
  242. """
  243. names = ["count", "unique"]
  244. objcounts = data.value_counts()
  245. count_unique = len(objcounts[objcounts != 0])
  246. result = [data.count(), count_unique]
  247. dtype = None
  248. if count_unique > 0:
  249. top, freq = objcounts.index[0], objcounts.iloc[0]
  250. tz = data.dt.tz
  251. asint = data.dropna().values.view("i8")
  252. top = Timestamp(top)
  253. if top.tzinfo is not None and tz is not None:
  254. # Don't tz_localize(None) if key is already tz-aware
  255. top = top.tz_convert(tz)
  256. else:
  257. top = top.tz_localize(tz)
  258. names += ["top", "freq", "first", "last"]
  259. result += [
  260. top,
  261. freq,
  262. Timestamp(asint.min(), tz=tz),
  263. Timestamp(asint.max(), tz=tz),
  264. ]
  265. # If the DataFrame is empty, set 'top' and 'freq' to None
  266. # to maintain output shape consistency
  267. else:
  268. names += ["top", "freq"]
  269. result += [np.nan, np.nan]
  270. dtype = "object"
  271. from pandas import Series
  272. return Series(result, index=names, name=data.name, dtype=dtype)
  273. def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
  274. """Describe series containing datetime64 dtype.
  275. Parameters
  276. ----------
  277. data : Series
  278. Series to be described.
  279. percentiles : list-like of numbers
  280. The percentiles to include in the output.
  281. """
  282. # GH-30164
  283. from pandas import Series
  284. formatted_percentiles = format_percentiles(percentiles)
  285. stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"]
  286. d = (
  287. [data.count(), data.mean(), data.min()]
  288. + data.quantile(percentiles).tolist()
  289. + [data.max()]
  290. )
  291. return Series(d, index=stat_index, name=data.name)
  292. def select_describe_func(
  293. data: Series,
  294. ) -> Callable:
  295. """Select proper function for describing series based on data type.
  296. Parameters
  297. ----------
  298. data : Series
  299. Series to be described.
  300. """
  301. if is_bool_dtype(data.dtype):
  302. return describe_categorical_1d
  303. elif is_numeric_dtype(data):
  304. return describe_numeric_1d
  305. elif data.dtype.kind == "M" or isinstance(data.dtype, DatetimeTZDtype):
  306. return describe_timestamp_1d
  307. elif data.dtype.kind == "m":
  308. return describe_numeric_1d
  309. else:
  310. return describe_categorical_1d
  311. def refine_percentiles(
  312. percentiles: Sequence[float] | np.ndarray | None,
  313. ) -> np.ndarray[Any, np.dtype[np.float64]]:
  314. """
  315. Ensure that percentiles are unique and sorted.
  316. Parameters
  317. ----------
  318. percentiles : list-like of numbers, optional
  319. The percentiles to include in the output.
  320. """
  321. if percentiles is None:
  322. return np.array([0.25, 0.5, 0.75])
  323. # explicit conversion of `percentiles` to list
  324. percentiles = list(percentiles)
  325. # get them all to be in [0, 1]
  326. validate_percentile(percentiles)
  327. # median should always be included
  328. if 0.5 not in percentiles:
  329. percentiles.append(0.5)
  330. percentiles = np.asarray(percentiles)
  331. # sort and check for duplicates
  332. unique_pcts = np.unique(percentiles)
  333. assert percentiles is not None
  334. if len(unique_pcts) < len(percentiles):
  335. raise ValueError("percentiles cannot contain duplicates")
  336. return unique_pcts