  1. """Statistical transformations for visualization.
  2. This module is currently private, but is being written to eventually form part
  3. of the public API.
  4. The classes should behave roughly in the style of scikit-learn.
  5. - All data-independent parameters should be passed to the class constructor.
  6. - Each class should implement a default transformation that is exposed through
  7. __call__. These are currently written for vector arguments, but I think
  8. consuming a whole `plot_data` DataFrame and return it with transformed
  9. variables would make more sense.
  10. - Some class have data-dependent preprocessing that should be cached and used
  11. multiple times (think defining histogram bins off all data and then counting
  12. observations within each bin multiple times per data subsets). These currently
  13. have unique names, but it would be good to have a common name. Not quite
  14. `fit`, but something similar.
  15. - Alternatively, the transform interface could take some information about grouping
  16. variables and do a groupby internally.
  17. - Some classes should define alternate transforms that might make the most sense
  18. with a different function. For example, KDE usually evaluates the distribution
  19. on a regular grid, but it would be useful for it to transform at the actual
  20. datapoints. Then again, this could be controlled by a parameter at the time of
  21. class instantiation.
  22. """

from numbers import Number

import numpy as np
import pandas as pd

try:
    from scipy.stats import gaussian_kde
    _no_scipy = False
except ImportError:
    from .external.kde import gaussian_kde
    _no_scipy = True

from .algorithms import bootstrap
from .utils import _check_argument, _normal_quantile_func


class KDE:
    """Univariate and bivariate kernel density estimator."""
    def __init__(
        self, *,
        bw_method=None,
        bw_adjust=1,
        gridsize=200,
        cut=3,
        clip=None,
        cumulative=False,
    ):
        """Initialize the estimator with its parameters.

        Parameters
        ----------
        bw_method : string, scalar, or callable, optional
            Method for determining the smoothing bandwidth to use; passed to
            :class:`scipy.stats.gaussian_kde`.
        bw_adjust : number, optional
            Factor that multiplicatively scales the value chosen using
            ``bw_method``. Increasing will make the curve smoother. See Notes.
        gridsize : int, optional
            Number of points on each dimension of the evaluation grid.
        cut : number, optional
            Factor, multiplied by the smoothing bandwidth, that determines how
            far the evaluation grid extends past the extreme datapoints. When
            set to 0, truncate the curve at the data limits.
        clip : pair of numbers or None, or a pair of such pairs
            Do not evaluate the density outside of these limits.
        cumulative : bool, optional
            If True, estimate a cumulative distribution function. Requires scipy.

        """
        if clip is None:
            clip = None, None

        self.bw_method = bw_method
        self.bw_adjust = bw_adjust
        self.gridsize = gridsize
        self.cut = cut
        self.clip = clip
        self.cumulative = cumulative

        if cumulative and _no_scipy:
            raise RuntimeError("Cumulative KDE evaluation requires scipy")

        self.support = None

    def _define_support_grid(self, x, bw, cut, clip, gridsize):
        """Create the grid of evaluation points for vector x."""
        clip_lo = -np.inf if clip[0] is None else clip[0]
        clip_hi = +np.inf if clip[1] is None else clip[1]
        gridmin = max(x.min() - bw * cut, clip_lo)
        gridmax = min(x.max() + bw * cut, clip_hi)
        return np.linspace(gridmin, gridmax, gridsize)

    def _define_support_univariate(self, x, weights):
        """Create a 1D grid of evaluation points."""
        kde = self._fit(x, weights)
        bw = np.sqrt(kde.covariance.squeeze())
        grid = self._define_support_grid(
            x, bw, self.cut, self.clip, self.gridsize
        )
        return grid

    def _define_support_bivariate(self, x1, x2, weights):
        """Create a 2D grid of evaluation points."""
        clip = self.clip
        if clip[0] is None or np.isscalar(clip[0]):
            clip = (clip, clip)

        kde = self._fit([x1, x2], weights)
        bw = np.sqrt(np.diag(kde.covariance).squeeze())

        grid1 = self._define_support_grid(
            x1, bw[0], self.cut, clip[0], self.gridsize
        )
        grid2 = self._define_support_grid(
            x2, bw[1], self.cut, clip[1], self.gridsize
        )

        return grid1, grid2

    def define_support(self, x1, x2=None, weights=None, cache=True):
        """Create the evaluation grid for a given data set."""
        if x2 is None:
            support = self._define_support_univariate(x1, weights)
        else:
            support = self._define_support_bivariate(x1, x2, weights)

        if cache:
            self.support = support

        return support

    def _fit(self, fit_data, weights=None):
        """Fit the scipy kde while applying the bw_adjust factor."""
        fit_kws = {"bw_method": self.bw_method}
        if weights is not None:
            fit_kws["weights"] = weights

        kde = gaussian_kde(fit_data, **fit_kws)
        kde.set_bandwidth(kde.factor * self.bw_adjust)

        return kde

    def _eval_univariate(self, x, weights=None):
        """Fit and evaluate a univariate KDE on univariate data."""
        support = self.support
        if support is None:
            support = self.define_support(x, cache=False)

        kde = self._fit(x, weights)

        if self.cumulative:
            s_0 = support[0]
            density = np.array([
                kde.integrate_box_1d(s_0, s_i) for s_i in support
            ])
        else:
            density = kde(support)

        return density, support

    def _eval_bivariate(self, x1, x2, weights=None):
        """Fit and evaluate a bivariate KDE on bivariate data."""
        support = self.support
        if support is None:
            support = self.define_support(x1, x2, cache=False)

        kde = self._fit([x1, x2], weights)

        if self.cumulative:
            grid1, grid2 = support
            density = np.zeros((grid1.size, grid2.size))
            p0 = grid1.min(), grid2.min()
            for i, xi in enumerate(grid1):
                for j, xj in enumerate(grid2):
                    density[i, j] = kde.integrate_box(p0, (xi, xj))
        else:
            xx1, xx2 = np.meshgrid(*support)
            density = kde([xx1.ravel(), xx2.ravel()]).reshape(xx1.shape)

        return density, support

    def __call__(self, x1, x2=None, weights=None):
        """Fit and evaluate on univariate or bivariate data."""
        if x2 is None:
            return self._eval_univariate(x1, weights)
        else:
            return self._eval_bivariate(x1, x2, weights)
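

# A minimal usage sketch for KDE (the random data here is illustrative):
#
#     >>> x = np.random.default_rng(0).normal(size=100)
#     >>> kde = KDE(bw_adjust=2, gridsize=50)
#     >>> density, support = kde(x)            # both arrays have length 50
#
# To evaluate several subsets on a shared grid, define the support from the
# full data first; it is cached on the instance and reused by __call__:
#
#     >>> grid = kde.define_support(x)
#     >>> density_a, _ = kde(x[:50])
#     >>> density_b, _ = kde(x[50:])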


# Note: we no longer use this for univariate histograms in histplot,
# preferring _stats.Hist. We'll deprecate this once we have a bivariate Stat class.
class Histogram:
    """Univariate and bivariate histogram estimator."""
    def __init__(
        self,
        stat="count",
        bins="auto",
        binwidth=None,
        binrange=None,
        discrete=False,
        cumulative=False,
    ):
        """Initialize the estimator with its parameters.

        Parameters
        ----------
        stat : str
            Aggregate statistic to compute in each bin.

            - `count`: show the number of observations in each bin
            - `frequency`: show the number of observations divided by the bin width
            - `probability` or `proportion`: normalize such that bar heights sum to 1
            - `percent`: normalize such that bar heights sum to 100
            - `density`: normalize such that the total area of the histogram equals 1

        bins : str, number, vector, or a pair of such values
            Generic bin parameter that can be the name of a reference rule,
            the number of bins, or the breaks of the bins.
            Passed to :func:`numpy.histogram_bin_edges`.
        binwidth : number or pair of numbers
            Width of each bin, overrides ``bins`` but can be used with
            ``binrange``.
        binrange : pair of numbers or a pair of pairs
            Lowest and highest value for bin edges; can be used either
            with ``bins`` or ``binwidth``. Defaults to data extremes.
        discrete : bool or pair of bools
            If True, set ``binwidth`` and ``binrange`` such that bin
            edges cover integer values in the dataset.
        cumulative : bool
            If True, return the cumulative statistic.

        """
        stat_choices = [
            "count", "frequency", "density", "probability", "proportion", "percent",
        ]
        _check_argument("stat", stat_choices, stat)

        self.stat = stat
        self.bins = bins
        self.binwidth = binwidth
        self.binrange = binrange
        self.discrete = discrete
        self.cumulative = cumulative

        self.bin_kws = None

    def _define_bin_edges(self, x, weights, bins, binwidth, binrange, discrete):
        """Inner function that takes bin parameters as arguments."""
        if binrange is None:
            start, stop = x.min(), x.max()
        else:
            start, stop = binrange

        if discrete:
            bin_edges = np.arange(start - .5, stop + 1.5)
        elif binwidth is not None:
            step = binwidth
            bin_edges = np.arange(start, stop + step, step)
            # Handle roundoff error (maybe there is a less clumsy way?)
            if bin_edges.max() < stop or len(bin_edges) < 2:
                bin_edges = np.append(bin_edges, bin_edges.max() + step)
        else:
            bin_edges = np.histogram_bin_edges(
                x, bins, binrange, weights,
            )
        return bin_edges

    def define_bin_params(self, x1, x2=None, weights=None, cache=True):
        """Given data, return numpy.histogram parameters to define bins."""
        if x2 is None:
            bin_edges = self._define_bin_edges(
                x1, weights, self.bins, self.binwidth, self.binrange, self.discrete,
            )

            if isinstance(self.bins, (str, Number)):
                n_bins = len(bin_edges) - 1
                bin_range = bin_edges.min(), bin_edges.max()
                bin_kws = dict(bins=n_bins, range=bin_range)
            else:
                bin_kws = dict(bins=bin_edges)

        else:
            bin_edges = []
            for i, x in enumerate([x1, x2]):

                # Resolve whether bin parameters are shared
                # or specific to each variable

                bins = self.bins
                if not bins or isinstance(bins, (str, Number)):
                    pass
                elif isinstance(bins[i], str):
                    bins = bins[i]
                elif len(bins) == 2:
                    bins = bins[i]

                binwidth = self.binwidth
                if binwidth is None:
                    pass
                elif not isinstance(binwidth, Number):
                    binwidth = binwidth[i]

                binrange = self.binrange
                if binrange is None:
                    pass
                elif not isinstance(binrange[0], Number):
                    binrange = binrange[i]

                discrete = self.discrete
                if not isinstance(discrete, bool):
                    discrete = discrete[i]

                # Define the bins for this variable
                bin_edges.append(self._define_bin_edges(
                    x, weights, bins, binwidth, binrange, discrete,
                ))

            bin_kws = dict(bins=tuple(bin_edges))

        if cache:
            self.bin_kws = bin_kws

        return bin_kws

    def _eval_bivariate(self, x1, x2, weights):
        """Inner function for histogram of two variables."""
        bin_kws = self.bin_kws
        if bin_kws is None:
            bin_kws = self.define_bin_params(x1, x2, cache=False)

        density = self.stat == "density"

        hist, *bin_edges = np.histogram2d(
            x1, x2, **bin_kws, weights=weights, density=density
        )

        area = np.outer(
            np.diff(bin_edges[0]),
            np.diff(bin_edges[1]),
        )

        if self.stat == "probability" or self.stat == "proportion":
            hist = hist.astype(float) / hist.sum()
        elif self.stat == "percent":
            hist = hist.astype(float) / hist.sum() * 100
        elif self.stat == "frequency":
            hist = hist.astype(float) / area

        if self.cumulative:
            if self.stat in ["density", "frequency"]:
                hist = (hist * area).cumsum(axis=0).cumsum(axis=1)
            else:
                hist = hist.cumsum(axis=0).cumsum(axis=1)

        return hist, bin_edges

    def _eval_univariate(self, x, weights):
        """Inner function for histogram of one variable."""
        bin_kws = self.bin_kws
        if bin_kws is None:
            bin_kws = self.define_bin_params(x, weights=weights, cache=False)

        density = self.stat == "density"
        hist, bin_edges = np.histogram(
            x, **bin_kws, weights=weights, density=density,
        )

        if self.stat == "probability" or self.stat == "proportion":
            hist = hist.astype(float) / hist.sum()
        elif self.stat == "percent":
            hist = hist.astype(float) / hist.sum() * 100
        elif self.stat == "frequency":
            hist = hist.astype(float) / np.diff(bin_edges)

        if self.cumulative:
            if self.stat in ["density", "frequency"]:
                hist = (hist * np.diff(bin_edges)).cumsum()
            else:
                hist = hist.cumsum()

        return hist, bin_edges

    def __call__(self, x1, x2=None, weights=None):
        """Count the occurrences in each bin, maybe normalize."""
        if x2 is None:
            return self._eval_univariate(x1, weights)
        else:
            return self._eval_bivariate(x1, x2, weights)
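

# A minimal usage sketch for Histogram (the data is illustrative):
#
#     >>> x = np.random.default_rng(0).normal(size=100)
#     >>> hist = Histogram(stat="probability", bins=10)
#     >>> heights, edges = hist(x)             # heights sum to (approximately) 1
#     >>> len(edges) == len(heights) + 1
#     True
#
# With two vectors, the counts come back as a 2D array along with a pair of
# bin edge vectors:
#
#     >>> counts, (x_edges, y_edges) = Histogram()(x, x + 1)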


class ECDF:
    """Univariate empirical cumulative distribution estimator."""
    def __init__(self, stat="proportion", complementary=False):
        """Initialize the class with its parameters.

        Parameters
        ----------
        stat : {"proportion", "percent", "count"}
            Distribution statistic to compute.
        complementary : bool
            If True, use the complementary CDF (1 - CDF).

        """
        _check_argument("stat", ["count", "percent", "proportion"], stat)
        self.stat = stat
        self.complementary = complementary

    def _eval_bivariate(self, x1, x2, weights):
        """Inner function for ECDF of two variables."""
        raise NotImplementedError("Bivariate ECDF is not implemented")

    def _eval_univariate(self, x, weights):
        """Inner function for ECDF of one variable."""
        sorter = x.argsort()
        x = x[sorter]
        weights = weights[sorter]
        y = weights.cumsum()

        if self.stat in ["percent", "proportion"]:
            y = y / y.max()
        if self.stat == "percent":
            y = y * 100

        x = np.r_[-np.inf, x]
        y = np.r_[0, y]

        if self.complementary:
            y = y.max() - y

        return y, x

    def __call__(self, x1, x2=None, weights=None):
        """Return proportion or count of observations below each sorted datapoint."""
        x1 = np.asarray(x1)
        if weights is None:
            weights = np.ones_like(x1)
        else:
            weights = np.asarray(weights)

        if x2 is None:
            return self._eval_univariate(x1, weights)
        else:
            return self._eval_bivariate(x1, x2, weights)
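

# A minimal usage sketch for ECDF. Note the (y, x) return order and the
# (-inf, 0) point prepended so the result can be drawn as a step function:
#
#     >>> y, x = ECDF()(np.array([3., 1., 2.]))
#     >>> x
#     array([-inf,   1.,   2.,   3.])
#     >>> y
#     array([0.        , 0.33333333, 0.66666667, 1.        ])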


class EstimateAggregator:

    def __init__(self, estimator, errorbar=None, **boot_kws):
        """
        Data aggregator that produces an estimate and error bar interval.

        Parameters
        ----------
        estimator : callable or string
            Function (or method name) that maps a vector to a scalar.
        errorbar : string, (string, number) tuple, or callable
            Name of errorbar method (either "ci", "pi", "se", or "sd"), or a tuple
            with a method name and a level parameter, or a function that maps from a
            vector to a (min, max) interval.
        boot_kws
            Additional keywords are passed to bootstrap when error_method is "ci".

        """
        self.estimator = estimator

        method, level = _validate_errorbar_arg(errorbar)
        self.error_method = method
        self.error_level = level

        self.boot_kws = boot_kws

    def __call__(self, data, var):
        """Aggregate over `var` column of `data` with estimate and error interval."""
        vals = data[var]
        if callable(self.estimator):
            # You would think we could pass to vals.agg, and yet:
            # https://github.com/mwaskom/seaborn/issues/2943
            estimate = self.estimator(vals)
        else:
            estimate = vals.agg(self.estimator)

        # Options that produce no error bars
        if self.error_method is None:
            err_min = err_max = np.nan
        elif len(data) <= 1:
            err_min = err_max = np.nan

        # Generic errorbars from user-supplied function
        elif callable(self.error_method):
            err_min, err_max = self.error_method(vals)

        # Parametric options
        elif self.error_method == "sd":
            half_interval = vals.std() * self.error_level
            err_min, err_max = estimate - half_interval, estimate + half_interval
        elif self.error_method == "se":
            half_interval = vals.sem() * self.error_level
            err_min, err_max = estimate - half_interval, estimate + half_interval

        # Nonparametric options
        elif self.error_method == "pi":
            err_min, err_max = _percentile_interval(vals, self.error_level)
        elif self.error_method == "ci":
            units = data.get("units", None)
            boots = bootstrap(vals, units=units, func=self.estimator, **self.boot_kws)
            err_min, err_max = _percentile_interval(boots, self.error_level)

        return pd.Series({var: estimate, f"{var}min": err_min, f"{var}max": err_max})
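

# A minimal usage sketch for EstimateAggregator (the DataFrame is illustrative):
#
#     >>> df = pd.DataFrame({"y": [1., 2., 3., 4.]})
#     >>> agg = EstimateAggregator("mean", ("sd", 2))
#     >>> agg(df, "y")                         # Series with "y", "ymin", "ymax"
#
# Passing errorbar="ci" instead would bootstrap the estimator, forwarding any
# extra keyword arguments (e.g. n_boot, seed) to bootstrap().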


class LetterValues:

    def __init__(self, k_depth, outlier_prop, trust_alpha):
        """
        Compute percentiles of a distribution using various tail stopping rules.

        Parameters
        ----------
        k_depth : "tukey", "proportion", "trustworthy", "full", or int
            Stopping rule for choosing tail percentiles to show:

            - tukey: Show a similar number of outliers as in a conventional boxplot.
            - proportion: Show approximately `outlier_prop` outliers.
            - trustworthy: Use `trust_alpha` level for most extreme tail percentile.
            - full: Extend the boxes to cover the full range of the data.

            An integer gives the depth (number of boxes) directly.
        outlier_prop : float
            Parameter for `k_depth="proportion"` setting the expected outlier rate.
        trust_alpha : float
            Parameter for `k_depth="trustworthy"` setting the confidence threshold.

        Notes
        -----
        Based on the proposal in this paper:
        https://vita.had.co.nz/papers/letter-value-plot.pdf

        """
        k_options = ["tukey", "proportion", "trustworthy", "full"]
        if isinstance(k_depth, str):
            _check_argument("k_depth", k_options, k_depth)
        elif not isinstance(k_depth, int):
            err = (
                "The `k_depth` parameter must be either an integer or string "
                f"(one of {k_options}), not {k_depth!r}."
            )
            raise TypeError(err)

        self.k_depth = k_depth
        self.outlier_prop = outlier_prop
        self.trust_alpha = trust_alpha

    def _compute_k(self, n):

        # Select the depth, i.e. number of boxes to draw, based on the method
        if self.k_depth == "full":
            # extend boxes to 100% of the data
            k = int(np.log2(n)) + 1
        elif self.k_depth == "tukey":
            # This results in 5-8 points in each tail
            k = int(np.log2(n)) - 3
        elif self.k_depth == "proportion":
            k = int(np.log2(n)) - int(np.log2(n * self.outlier_prop)) + 1
        elif self.k_depth == "trustworthy":
            point_conf = 2 * _normal_quantile_func(1 - self.trust_alpha / 2) ** 2
            k = int(np.log2(n / point_conf)) + 1
        else:
            # Allow having k directly specified as input
            k = int(self.k_depth)

        return max(k, 1)

    def __call__(self, x):
        """Evaluate the letter values."""
        k = self._compute_k(len(x))
        exp = np.arange(k + 1, 1, -1), np.arange(2, k + 2)
        levels = k + 1 - np.concatenate([exp[0], exp[1][1:]])
        percentiles = 100 * np.concatenate([0.5 ** exp[0], 1 - 0.5 ** exp[1]])
        if self.k_depth == "full":
            percentiles[0] = 0
            percentiles[-1] = 100
        values = np.percentile(x, percentiles)
        fliers = np.asarray(x[(x < values.min()) | (x > values.max())])
        median = np.percentile(x, 50)

        return {
            "k": k,
            "levels": levels,
            "percs": percentiles,
            "values": values,
            "fliers": fliers,
            "median": median,
        }
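

# A minimal usage sketch for LetterValues (the data is illustrative):
#
#     >>> x = pd.Series(np.random.default_rng(0).normal(size=200))
#     >>> lv = LetterValues(k_depth="tukey", outlier_prop=0.007, trust_alpha=0.05)
#     >>> res = lv(x)
#     >>> res["k"]                             # int(log2(200)) - 3
#     4
#     >>> sorted(res)
#     ['fliers', 'k', 'levels', 'median', 'percs', 'values']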


def _percentile_interval(data, width):
    """Return a percentile interval from data of a given width."""
    edge = (100 - width) / 2
    percentiles = edge, 100 - edge
    return np.nanpercentile(data, percentiles)
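

# For example, a width of 95 returns the 2.5th and 97.5th percentiles:
#
#     >>> _percentile_interval(np.arange(1001), 95)
#     array([ 25., 975.])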


def _validate_errorbar_arg(arg):
    """Check type and value of errorbar argument and assign default level."""
    DEFAULT_LEVELS = {
        "ci": 95,
        "pi": 95,
        "se": 1,
        "sd": 1,
    }

    usage = "`errorbar` must be a callable, string, or (string, number) tuple"

    if arg is None:
        return None, None
    elif callable(arg):
        return arg, None
    elif isinstance(arg, str):
        method = arg
        level = DEFAULT_LEVELS.get(method, None)
    else:
        try:
            method, level = arg
        except (ValueError, TypeError) as err:
            raise err.__class__(usage) from err

    _check_argument("errorbar", list(DEFAULT_LEVELS), method)
    if level is not None and not isinstance(level, Number):
        raise TypeError(usage)

    return method, level
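

# A few illustrative resolutions of the `errorbar` argument:
#
#     >>> _validate_errorbar_arg("ci")         # default level filled in
#     ('ci', 95)
#     >>> _validate_errorbar_arg(("sd", 2))
#     ('sd', 2)
#     >>> _validate_errorbar_arg(None)         # no error bars
#     (None, None)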