distributions.py 85 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531
  1. """Plotting functions for visualizing distributions."""
  2. from numbers import Number
  3. from functools import partial
  4. import math
  5. import textwrap
  6. import warnings
  7. import numpy as np
  8. import pandas as pd
  9. import matplotlib as mpl
  10. import matplotlib.pyplot as plt
  11. import matplotlib.transforms as tx
  12. from matplotlib.colors import to_rgba
  13. from matplotlib.collections import LineCollection
  14. from ._base import VectorPlotter
  15. # We have moved univariate histogram computation over to the new Hist class,
  16. # but still use the older Histogram for bivariate computation.
  17. from ._statistics import ECDF, Histogram, KDE
  18. from ._stats.counting import Hist
  19. from .axisgrid import (
  20. FacetGrid,
  21. _facet_docs,
  22. )
  23. from .utils import (
  24. remove_na,
  25. _get_transform_functions,
  26. _kde_support,
  27. _normalize_kwargs,
  28. _check_argument,
  29. _assign_default_kwargs,
  30. _default_color,
  31. )
  32. from .palettes import color_palette
  33. from .external import husl
  34. from .external.kde import gaussian_kde
  35. from ._docstrings import (
  36. DocstringComponents,
  37. _core_docs,
  38. )
# Public names exported by this module.
__all__ = ["displot", "histplot", "kdeplot", "ecdfplot", "rugplot", "distplot"]

# ==================================================================================== #
# Module documentation
# ==================================================================================== #

# Parameter-description fragments shared by the distribution plot docstrings.
# Each value is a "name : type" header followed by indented description lines.
# NOTE(review): the doubled braces in ``multiple`` suggest these fragments are
# later passed through ``str.format`` -- confirm against DocstringComponents.
_dist_params = dict(
    multiple="""
multiple : {{"layer", "stack", "fill"}}
    Method for drawing multiple elements when semantic mapping creates subsets.
    Only relevant with univariate data.
    """,
    log_scale="""
log_scale : bool or number, or pair of bools or numbers
    Set axis scale(s) to log. A single value sets the data axis for any numeric
    axes in the plot. A pair of values sets each axis independently.
    Numeric values are interpreted as the desired base (default 10).
    When `None` or `False`, seaborn defers to the existing Axes scale.
    """,
    legend="""
legend : bool
    If False, suppress the legend for semantic variables.
    """,
    cbar="""
cbar : bool
    If True, add a colorbar to annotate the color mapping in a bivariate plot.
    Note: Does not currently support plots with a ``hue`` variable well.
    """,
    cbar_ax="""
cbar_ax : :class:`matplotlib.axes.Axes`
    Pre-existing axes for the colorbar.
    """,
    cbar_kws="""
cbar_kws : dict
    Additional parameters passed to :meth:`matplotlib.figure.Figure.colorbar`.
    """,
)

# Group the documentation components under short names: the shared core
# parameter docs, the facet docs, the fragments defined above, and components
# built from the ``__init__`` signatures of the KDE, Histogram and ECDF
# estimators.
_param_docs = DocstringComponents.from_nested_components(
    core=_core_docs["params"],
    facets=DocstringComponents(_facet_docs),
    dist=DocstringComponents(_dist_params),
    kde=DocstringComponents.from_function_params(KDE.__init__),
    hist=DocstringComponents.from_function_params(Histogram.__init__),
    ecdf=DocstringComponents.from_function_params(ECDF.__init__),
)
  82. # ==================================================================================== #
  83. # Internal API
  84. # ==================================================================================== #
  85. class _DistributionPlotter(VectorPlotter):
  86. wide_structure = {"x": "@values", "hue": "@columns"}
  87. flat_structure = {"x": "@values"}
  88. def __init__(
  89. self,
  90. data=None,
  91. variables={},
  92. ):
  93. super().__init__(data=data, variables=variables)
  94. @property
  95. def univariate(self):
  96. """Return True if only x or y are used."""
  97. # TODO this could go down to core, but putting it here now.
  98. # We'd want to be conceptually clear that univariate only applies
  99. # to x/y and not to other semantics, which can exist.
  100. # We haven't settled on a good conceptual name for x/y.
  101. return bool({"x", "y"} - set(self.variables))
  102. @property
  103. def data_variable(self):
  104. """Return the variable with data for univariate plots."""
  105. # TODO This could also be in core, but it should have a better name.
  106. if not self.univariate:
  107. raise AttributeError("This is not a univariate plot")
  108. return {"x", "y"}.intersection(self.variables).pop()
  109. @property
  110. def has_xy_data(self):
  111. """Return True at least one of x or y is defined."""
  112. # TODO see above points about where this should go
  113. return bool({"x", "y"} & set(self.variables))
  114. def _add_legend(
  115. self,
  116. ax_obj, artist, fill, element, multiple, alpha, artist_kws, legend_kws,
  117. ):
  118. """Add artists that reflect semantic mappings and put then in a legend."""
  119. # TODO note that this doesn't handle numeric mappings like the relational plots
  120. handles = []
  121. labels = []
  122. for level in self._hue_map.levels:
  123. color = self._hue_map(level)
  124. kws = self._artist_kws(
  125. artist_kws, fill, element, multiple, color, alpha
  126. )
  127. # color gets added to the kws to workaround an issue with barplot's color
  128. # cycle integration but it causes problems in this context where we are
  129. # setting artist properties directly, so pop it off here
  130. if "facecolor" in kws:
  131. kws.pop("color", None)
  132. handles.append(artist(**kws))
  133. labels.append(level)
  134. if isinstance(ax_obj, mpl.axes.Axes):
  135. ax_obj.legend(handles, labels, title=self.variables["hue"], **legend_kws)
  136. else: # i.e. a FacetGrid. TODO make this better
  137. legend_data = dict(zip(labels, handles))
  138. ax_obj.add_legend(
  139. legend_data,
  140. title=self.variables["hue"],
  141. label_order=self.var_levels["hue"],
  142. **legend_kws
  143. )
  144. def _artist_kws(self, kws, fill, element, multiple, color, alpha):
  145. """Handle differences between artists in filled/unfilled plots."""
  146. kws = kws.copy()
  147. if fill:
  148. kws = _normalize_kwargs(kws, mpl.collections.PolyCollection)
  149. kws.setdefault("facecolor", to_rgba(color, alpha))
  150. if element == "bars":
  151. # Make bar() interface with property cycle correctly
  152. # https://github.com/matplotlib/matplotlib/issues/19385
  153. kws["color"] = "none"
  154. if multiple in ["stack", "fill"] or element == "bars":
  155. kws.setdefault("edgecolor", mpl.rcParams["patch.edgecolor"])
  156. else:
  157. kws.setdefault("edgecolor", to_rgba(color, 1))
  158. elif element == "bars":
  159. kws["facecolor"] = "none"
  160. kws["edgecolor"] = to_rgba(color, alpha)
  161. else:
  162. kws["color"] = to_rgba(color, alpha)
  163. return kws
  164. def _quantile_to_level(self, data, quantile):
  165. """Return data levels corresponding to quantile cuts of mass."""
  166. isoprop = np.asarray(quantile)
  167. values = np.ravel(data)
  168. sorted_values = np.sort(values)[::-1]
  169. normalized_values = np.cumsum(sorted_values) / values.sum()
  170. idx = np.searchsorted(normalized_values, 1 - isoprop)
  171. levels = np.take(sorted_values, idx, mode="clip")
  172. return levels
  173. def _cmap_from_color(self, color):
  174. """Return a sequential colormap given a color seed."""
  175. # Like so much else here, this is broadly useful, but keeping it
  176. # in this class to signify that I haven't thought overly hard about it...
  177. r, g, b, _ = to_rgba(color)
  178. h, s, _ = husl.rgb_to_husl(r, g, b)
  179. xx = np.linspace(-1, 1, int(1.15 * 256))[:256]
  180. ramp = np.zeros((256, 3))
  181. ramp[:, 0] = h
  182. ramp[:, 1] = s * np.cos(xx)
  183. ramp[:, 2] = np.linspace(35, 80, 256)
  184. colors = np.clip([husl.husl_to_rgb(*hsl) for hsl in ramp], 0, 1)
  185. return mpl.colors.ListedColormap(colors[::-1])
  186. def _default_discrete(self):
  187. """Find default values for discrete hist estimation based on variable type."""
  188. if self.univariate:
  189. discrete = self.var_types[self.data_variable] == "categorical"
  190. else:
  191. discrete_x = self.var_types["x"] == "categorical"
  192. discrete_y = self.var_types["y"] == "categorical"
  193. discrete = discrete_x, discrete_y
  194. return discrete
def _resolve_multiple(self, curves, multiple):
    """Modify the density data structure to handle multiple densities.

    Parameters
    ----------
    curves : dict
        Mapping from a subset key to the curve (histogram or density) for
        that subset. Keys must be accepted by ``dict(key)``; values are
        indexed pandas objects. NOTE(review): presumably Series keyed by
        ``tuple(sub_vars.items())`` -- confirm against callers.
    multiple : str or None
        ``"stack"`` and ``"fill"`` accumulate the curves, ``"dodge"`` narrows
        and offsets the bins of each hue level; any other value leaves the
        curves unchanged.

    Returns
    -------
    curves : dict or DataFrame
        The adjusted curves. A DataFrame when ``multiple`` is "stack" or
        "fill" and a hue variable is assigned, otherwise the input mapping.
    baselines : dict or DataFrame
        For each curve, the values at which it should start. All zeros
        unless the curves were stacked or filled.
    """
    # Default baselines have all densities starting at 0
    baselines = {k: np.zeros_like(v) for k, v in curves.items()}

    # TODO we should have some central clearinghouse for checking if any
    # "grouping" (terminology?) semantics have been assigned
    if "hue" not in self.variables:
        return curves, baselines

    if multiple in ("stack", "fill"):

        # Setting stack or fill means that the curves share a
        # support grid / set of bin edges, so we can make a dataframe
        # Reverse the column order to plot from top to bottom
        curves = pd.DataFrame(curves).iloc[:, ::-1]

        # Find column groups that are nested within col/row variables
        column_groups = {}
        for i, keyd in enumerate(map(dict, curves.columns)):
            # Columns with the same (col, row) levels are accumulated together
            facet_key = keyd.get("col", None), keyd.get("row", None)
            column_groups.setdefault(facet_key, [])
            column_groups[facet_key].append(i)

        baselines = curves.copy()

        for col_idxs in column_groups.values():
            cols = curves.columns[col_idxs]

            # Row-wise total, computed before the values are accumulated
            norm_constant = curves[cols].sum(axis="columns")

            # Take the cumulative sum to stack
            curves[cols] = curves[cols].cumsum(axis="columns")

            # Normalize by row sum to fill
            if multiple == "fill":
                curves[cols] = curves[cols].div(norm_constant, axis="index")

            # Define where each segment starts
            baselines[cols] = curves[cols].shift(1, axis=1).fillna(0)

    if multiple == "dodge":

        # Account for the unique semantic (non-faceting) levels
        # This will require rethinking if we add other semantics!
        hue_levels = self.var_levels["hue"]
        n = len(hue_levels)
        f_fwd, f_inv = self._get_scale_transforms(self.data_variable)
        for key in curves:

            level = dict(key)["hue"]
            hist = curves[key].reset_index(name="heights")
            level_idx = hue_levels.index(level)

            # Split each bin into ``n`` equal slots in transformed
            # coordinates and give this hue level the slot at ``level_idx``
            a = f_fwd(hist["edges"])
            b = f_fwd(hist["edges"] + hist["widths"])
            w = (b - a) / n
            new_min = f_inv(a + level_idx * w)
            new_max = f_inv(a + (level_idx + 1) * w)
            hist["widths"] = new_max - new_min
            hist["edges"] = new_min

            curves[key] = hist.set_index(["edges", "widths"])["heights"]

    return curves, baselines
  244. # -------------------------------------------------------------------------------- #
  245. # Computation
  246. # -------------------------------------------------------------------------------- #
def _compute_univariate_density(
    self,
    data_variable,
    common_norm,
    common_grid,
    estimate_kws,
    warn_singular=True,
):
    """Estimate a kernel density for each hue subset of the data.

    Parameters
    ----------
    data_variable : str
        Name of the column holding the observations.
    common_norm : bool
        If True, scale each density by its subset's share of the total
        weight (or of the total number of rows when there are no weights).
        Forced to False when no variables other than x/y are assigned.
    common_grid : bool
        If True, and variables other than x/y are assigned, define the
        estimator's support once from all observations.
    estimate_kws : dict
        Keyword arguments used to construct the ``KDE`` estimator.
    warn_singular : bool
        If True, emit a ``UserWarning`` for each subset that is skipped
        because its density cannot be estimated.

    Returns
    -------
    densities : dict
        Maps ``tuple(sub_vars.items())`` to a ``pandas.Series`` of density
        values indexed by the support points. Skipped subsets have no entry.
    """
    # Initialize the estimator object
    estimator = KDE(**estimate_kws)

    if set(self.variables) - {"x", "y"}:
        if common_grid:
            all_observations = self.comp_data.dropna()
            estimator.define_support(all_observations[data_variable])
    else:
        # Only x/y are assigned, so there is nothing to normalize across
        common_norm = False

    all_data = self.plot_data.dropna()
    if common_norm and "weights" in all_data:
        whole_weight = all_data["weights"].sum()
    else:
        whole_weight = len(all_data)

    densities = {}

    for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True):

        # Extract the data points from this sub set and remove nulls
        observations = sub_data[data_variable]

        # Extract the weights for this subset of observations
        if "weights" in self.variables:
            weights = sub_data["weights"]
            part_weight = weights.sum()
        else:
            weights = None
            part_weight = len(sub_data)

        # Estimate the density of observations at this level
        variance = np.nan_to_num(observations.var())
        singular = len(observations) < 2 or math.isclose(variance, 0)
        try:
            if not singular:
                # Convoluted approach needed because numerical failures
                # can manifest in a few different ways.
                density, support = estimator(observations, weights=weights)
        except np.linalg.LinAlgError:
            singular = True

        if singular:
            msg = (
                "Dataset has 0 variance; skipping density estimate. "
                "Pass `warn_singular=False` to disable this warning."
            )
            if warn_singular:
                # NOTE(review): stacklevel=4 presumably points the warning
                # at the user's call site -- confirm the call depth.
                warnings.warn(msg, UserWarning, stacklevel=4)
            continue

        # Invert the scaling of the support points
        # NOTE(review): this reads self.data_variable rather than the
        # data_variable argument used above -- confirm they always agree.
        _, f_inv = self._get_scale_transforms(self.data_variable)
        support = f_inv(support)

        # Apply a scaling factor so that the integral over all subsets is 1
        if common_norm:
            density *= part_weight / whole_weight

        # Store the density for this level
        key = tuple(sub_vars.items())
        densities[key] = pd.Series(density, index=support)

    return densities
  307. # -------------------------------------------------------------------------------- #
  308. # Plotting
  309. # -------------------------------------------------------------------------------- #
def plot_univariate_histogram(
    self,
    multiple,
    element,
    fill,
    common_norm,
    common_bins,
    shrink,
    kde,
    kde_kws,
    color,
    legend,
    line_kws,
    estimate_kws,
    **plot_kws,
):
    """Compute and draw a histogram of one variable for each hue subset.

    Parameters
    ----------
    multiple : {"layer", "stack", "fill", "dodge"}
        How to resolve multiple subsets; validated with ``_check_argument``.
    element : {"bars", "step", "poly"}
        Visual representation of the histogram; validated likewise.
    fill : bool
        Whether to draw filled artists (bars / ``fill_between``) rather than
        outlines or lines.
    common_norm : bool
        If True, scale each subset's histogram by its share of the total
        weight. Forced to False when the statistic is "count" or when no
        variables other than x/y are assigned.
    common_bins : bool
        If True, and variables other than x/y are assigned, compute the bin
        parameters once from all of the data.
    shrink : number
        Factor applied to each bin width; edges are shifted so the narrowed
        bin stays centered.
    kde : bool
        If True, also compute and draw a density curve for each subset.
    kde_kws : dict or None
        Keyword arguments for the density estimate. "cut" defaults to 0 and
        "cumulative" is overwritten from ``estimate_kws``.
    color : color
        Color used when no hue variable is assigned.
    legend : bool
        Whether to add a legend when a hue variable is assigned.
    line_kws : dict or None
        Keyword arguments for the density line; "color" is overwritten.
    estimate_kws : dict
        Keyword arguments used to construct the ``Hist`` estimator. The keys
        "stat" and "cumulative" (the latter only with ``kde``), and "bins",
        "binwidth" and "discrete" (when weights are assigned) are read here.
        NOTE(review): ``None`` is replaced by ``{}``, but the "stat" lookup
        below would then raise ``KeyError``.
    **plot_kws
        Additional properties for the histogram artists. "alpha" is popped
        and handled separately.
    """
    # -- Default keyword dicts
    # Copies are taken so that the caller's dicts are not modified below
    kde_kws = {} if kde_kws is None else kde_kws.copy()
    line_kws = {} if line_kws is None else line_kws.copy()
    estimate_kws = {} if estimate_kws is None else estimate_kws.copy()

    # -- Input checking
    _check_argument("multiple", ["layer", "stack", "fill", "dodge"], multiple)
    _check_argument("element", ["bars", "step", "poly"], element)

    auto_bins_with_weights = (
        "weights" in self.variables
        and estimate_kws["bins"] == "auto"
        and estimate_kws["binwidth"] is None
        and not estimate_kws["discrete"]
    )
    if auto_bins_with_weights:
        msg = (
            "`bins` cannot be 'auto' when using weights. "
            "Setting `bins=10`, but you will likely want to adjust."
        )
        warnings.warn(msg, UserWarning)
        estimate_kws["bins"] = 10

    # Simplify downstream code if we are not normalizing
    if estimate_kws["stat"] == "count":
        common_norm = False

    orient = self.data_variable

    # Now initialize the Histogram estimator
    estimator = Hist(**estimate_kws)
    histograms = {}

    # Do pre-compute housekeeping related to multiple groups
    all_data = self.comp_data.dropna()
    all_weights = all_data.get("weights", None)

    multiple_histograms = set(self.variables) - {"x", "y"}
    if multiple_histograms:
        if common_bins:
            bin_kws = estimator._define_bin_params(all_data, orient, None)
    else:
        # Only x/y are assigned, so there is nothing to normalize across
        common_norm = False

    if common_norm and all_weights is not None:
        whole_weight = all_weights.sum()
    else:
        whole_weight = len(all_data)

    # Estimate the smoothed kernel densities, for use later
    if kde:
        # TODO alternatively, clip at min/max bins?
        kde_kws.setdefault("cut", 0)
        kde_kws["cumulative"] = estimate_kws["cumulative"]
        densities = self._compute_univariate_density(
            self.data_variable,
            common_norm,
            common_bins,
            kde_kws,
            warn_singular=False,
        )

    # First pass through the data to compute the histograms
    for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True):

        # Prepare the relevant data
        key = tuple(sub_vars.items())
        orient = self.data_variable

        if "weights" in self.variables:
            # The column is moved from "weights" to "weight" before the
            # estimator is called
            sub_data["weight"] = sub_data.pop("weights")
            part_weight = sub_data["weight"].sum()
        else:
            part_weight = len(sub_data)

        # Do the histogram computation
        if not (multiple_histograms and common_bins):
            # No shared bins were defined above, so define them per subset
            bin_kws = estimator._define_bin_params(sub_data, orient, None)
        res = estimator._normalize(estimator._eval(sub_data, orient, bin_kws))
        heights = res[estimator.stat].to_numpy()
        widths = res["space"].to_numpy()
        # The estimator reports bin centers; shift to the leading edge
        edges = res[orient].to_numpy() - widths / 2

        # Rescale the smoothed curve to match the histogram
        if kde and key in densities:
            # NOTE(review): this local is not used before being reassigned
            # in the drawing loop below.
            density = densities[key]
            if estimator.cumulative:
                hist_norm = heights.max()
            else:
                hist_norm = (heights * widths).sum()
            densities[key] *= hist_norm

        # Convert edges back to original units for plotting
        ax = self._get_axes(sub_vars)
        _, inv = _get_transform_functions(ax, self.data_variable)
        widths = inv(edges + widths) - inv(edges)
        edges = inv(edges)

        # Pack the histogram data and metadata together
        edges = edges + (1 - shrink) / 2 * widths
        widths *= shrink
        index = pd.MultiIndex.from_arrays([
            pd.Index(edges, name="edges"),
            pd.Index(widths, name="widths"),
        ])
        hist = pd.Series(heights, index=index, name="heights")

        # Apply scaling to normalize across groups
        if common_norm:
            hist *= part_weight / whole_weight

        # Store the finalized histogram data for future plotting
        histograms[key] = hist

    # Modify the histogram and density data to resolve multiple groups
    histograms, baselines = self._resolve_multiple(histograms, multiple)
    if kde:
        # Density curves are never dodged
        densities, _ = self._resolve_multiple(
            densities, None if multiple == "dodge" else multiple
        )

    # Set autoscaling-related meta
    sticky_stat = (0, 1) if multiple == "fill" else (0, np.inf)
    if multiple == "fill":
        # Filled plots should not have any margins
        bin_vals = histograms.index.to_frame()
        edges = bin_vals["edges"]
        widths = bin_vals["widths"]
        sticky_data = (
            edges.min(),
            edges.max() + widths.loc[edges.idxmax()]
        )
    else:
        sticky_data = []

    # --- Handle default visual attributes

    # Note: default linewidth is determined after plotting

    # Default alpha should depend on other parameters
    if fill:
        # Note: will need to account for other grouping semantics if added
        if "hue" in self.variables and multiple == "layer":
            default_alpha = .5 if element == "bars" else .25
        elif kde:
            default_alpha = .5
        else:
            default_alpha = .75
    else:
        default_alpha = 1
    alpha = plot_kws.pop("alpha", default_alpha)  # TODO make parameter?

    hist_artists = []

    # Go back through the dataset and draw the plots
    for sub_vars, _ in self.iter_data("hue", reverse=True):

        key = tuple(sub_vars.items())
        hist = histograms[key].rename("heights").reset_index()
        bottom = np.asarray(baselines[key])

        ax = self._get_axes(sub_vars)

        # Define the matplotlib attributes that depend on semantic mapping
        if "hue" in self.variables:
            sub_color = self._hue_map(sub_vars["hue"])
        else:
            sub_color = color

        artist_kws = self._artist_kws(
            plot_kws, fill, element, multiple, sub_color, alpha
        )

        if element == "bars":

            # Use matplotlib bar plotting

            plot_func = ax.bar if self.data_variable == "x" else ax.barh
            # Heights are drawn relative to the baseline of each bar
            artists = plot_func(
                hist["edges"],
                hist["heights"] - bottom,
                hist["widths"],
                bottom,
                align="edge",
                **artist_kws,
            )

            for bar in artists:
                if self.data_variable == "x":
                    bar.sticky_edges.x[:] = sticky_data
                    bar.sticky_edges.y[:] = sticky_stat
                else:
                    bar.sticky_edges.x[:] = sticky_stat
                    bar.sticky_edges.y[:] = sticky_data

            hist_artists.extend(artists)

        else:

            # Use either fill_between or plot to draw hull of histogram
            if element == "step":

                # Repeat the final bin so that the step spans its full width
                final = hist.iloc[-1]
                x = np.append(hist["edges"], final["edges"] + final["widths"])
                y = np.append(hist["heights"], final["heights"])
                b = np.append(bottom, bottom[-1])

                if self.data_variable == "x":
                    step = "post"
                    drawstyle = "steps-post"
                else:
                    step = "post"  # fillbetweenx handles mapping internally
                    drawstyle = "steps-pre"

            elif element == "poly":

                # Connect the bin centers
                x = hist["edges"] + hist["widths"] / 2
                y = hist["heights"]
                b = bottom

                step = None
                drawstyle = None

            if self.data_variable == "x":
                if fill:
                    artist = ax.fill_between(x, b, y, step=step, **artist_kws)
                else:
                    artist, = ax.plot(x, y, drawstyle=drawstyle, **artist_kws)
                artist.sticky_edges.x[:] = sticky_data
                artist.sticky_edges.y[:] = sticky_stat
            else:
                if fill:
                    artist = ax.fill_betweenx(x, b, y, step=step, **artist_kws)
                else:
                    artist, = ax.plot(y, x, drawstyle=drawstyle, **artist_kws)
                artist.sticky_edges.x[:] = sticky_stat
                artist.sticky_edges.y[:] = sticky_data

            hist_artists.append(artist)

        if kde:

            # Add in the density curves

            try:
                density = densities[key]
            except KeyError:
                # No density was stored for this subset
                continue
            support = density.index

            if "x" in self.variables:
                line_args = support, density
                sticky_x, sticky_y = None, (0, np.inf)
            else:
                line_args = density, support
                sticky_x, sticky_y = (0, np.inf), None

            # The density line is always drawn fully opaque
            line_kws["color"] = to_rgba(sub_color, 1)
            line, = ax.plot(
                *line_args, **line_kws,
            )

            if sticky_x is not None:
                line.sticky_edges.x[:] = sticky_x
            if sticky_y is not None:
                line.sticky_edges.y[:] = sticky_y

    if element == "bars" and "linewidth" not in plot_kws:

        # Now we handle linewidth, which depends on the scaling of the plot

        # We will base everything on the minimum bin width
        hist_metadata = pd.concat([
            # Use .items for generality over dict or df
            h.index.to_frame() for _, h in histograms.items()
        ]).reset_index(drop=True)
        thin_bar_idx = hist_metadata["widths"].idxmin()
        binwidth = hist_metadata.loc[thin_bar_idx, "widths"]
        left_edge = hist_metadata.loc[thin_bar_idx, "edges"]

        # Set initial value
        default_linewidth = math.inf

        # Loop through subsets based only on facet variables
        for sub_vars, _ in self.iter_data():

            ax = self._get_axes(sub_vars)

            # Needed in some cases to get valid transforms.
            # Innocuous in other cases?
            ax.autoscale_view()

            # Convert binwidth from data coordinates to pixels
            # NOTE(review): the 72 / dpi factor looks like a conversion of
            # the pixel distance into points -- confirm.
            pts_x, pts_y = 72 / ax.figure.dpi * abs(
                ax.transData.transform([left_edge + binwidth] * 2)
                - ax.transData.transform([left_edge] * 2)
            )
            if self.data_variable == "x":
                binwidth_points = pts_x
            else:
                binwidth_points = pts_y

            # The relative size of the lines depends on the appearance
            # This is a provisional value and may need more tweaking
            default_linewidth = min(.1 * binwidth_points, default_linewidth)

        # Set the attributes
        for bar in hist_artists:

            # Don't let the lines get too thick
            max_linewidth = bar.get_linewidth()
            if not fill:
                max_linewidth *= 1.5

            linewidth = min(default_linewidth, max_linewidth)

            # If not filling, don't let lines disappear
            if not fill:
                min_linewidth = .5
                linewidth = max(linewidth, min_linewidth)

            bar.set_linewidth(linewidth)

    # --- Finalize the plot ----

    # Axis labels
    ax = self.ax if self.ax is not None else self.facets.axes.flat[0]
    default_x = default_y = ""
    # The statistic name labels the axis that does not hold the data
    if self.data_variable == "x":
        default_y = estimator.stat.capitalize()
    if self.data_variable == "y":
        default_x = estimator.stat.capitalize()
    self._add_axis_labels(ax, default_x, default_y)

    # Legend for semantic variables
    if "hue" in self.variables and legend:

        # Choose the proxy artist type to match what was drawn
        if fill or element == "bars":
            artist = partial(mpl.patches.Patch)
        else:
            artist = partial(mpl.lines.Line2D, [], [])

        ax_obj = self.ax if self.ax is not None else self.facets
        self._add_legend(
            ax_obj, artist, fill, element, multiple, alpha, plot_kws, {},
        )
  604. def plot_bivariate_histogram(
  605. self,
  606. common_bins, common_norm,
  607. thresh, pthresh, pmax,
  608. color, legend,
  609. cbar, cbar_ax, cbar_kws,
  610. estimate_kws,
  611. **plot_kws,
  612. ):
  613. # Default keyword dicts
  614. cbar_kws = {} if cbar_kws is None else cbar_kws.copy()
  615. # Now initialize the Histogram estimator
  616. estimator = Histogram(**estimate_kws)
  617. # Do pre-compute housekeeping related to multiple groups
  618. if set(self.variables) - {"x", "y"}:
  619. all_data = self.comp_data.dropna()
  620. if common_bins:
  621. estimator.define_bin_params(
  622. all_data["x"],
  623. all_data["y"],
  624. all_data.get("weights", None),
  625. )
  626. else:
  627. common_norm = False
  628. # -- Determine colormap threshold and norm based on the full data
  629. full_heights = []
  630. for _, sub_data in self.iter_data(from_comp_data=True):
  631. sub_heights, _ = estimator(
  632. sub_data["x"], sub_data["y"], sub_data.get("weights", None)
  633. )
  634. full_heights.append(sub_heights)
  635. common_color_norm = not set(self.variables) - {"x", "y"} or common_norm
  636. if pthresh is not None and common_color_norm:
  637. thresh = self._quantile_to_level(full_heights, pthresh)
  638. plot_kws.setdefault("vmin", 0)
  639. if common_color_norm:
  640. if pmax is not None:
  641. vmax = self._quantile_to_level(full_heights, pmax)
  642. else:
  643. vmax = plot_kws.pop("vmax", max(map(np.max, full_heights)))
  644. else:
  645. vmax = None
  646. # Get a default color
  647. # (We won't follow the color cycle here, as multiple plots are unlikely)
  648. if color is None:
  649. color = "C0"
  650. # --- Loop over data (subsets) and draw the histograms
  651. for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True):
  652. if sub_data.empty:
  653. continue
  654. # Do the histogram computation
  655. heights, (x_edges, y_edges) = estimator(
  656. sub_data["x"],
  657. sub_data["y"],
  658. weights=sub_data.get("weights", None),
  659. )
  660. # Get the axes for this plot
  661. ax = self._get_axes(sub_vars)
  662. # Invert the scale for the edges
  663. _, inv_x = _get_transform_functions(ax, "x")
  664. _, inv_y = _get_transform_functions(ax, "y")
  665. x_edges = inv_x(x_edges)
  666. y_edges = inv_y(y_edges)
  667. # Apply scaling to normalize across groups
  668. if estimator.stat != "count" and common_norm:
  669. heights *= len(sub_data) / len(all_data)
  670. # Define the specific kwargs for this artist
  671. artist_kws = plot_kws.copy()
  672. if "hue" in self.variables:
  673. color = self._hue_map(sub_vars["hue"])
  674. cmap = self._cmap_from_color(color)
  675. artist_kws["cmap"] = cmap
  676. else:
  677. cmap = artist_kws.pop("cmap", None)
  678. if isinstance(cmap, str):
  679. cmap = color_palette(cmap, as_cmap=True)
  680. elif cmap is None:
  681. cmap = self._cmap_from_color(color)
  682. artist_kws["cmap"] = cmap
  683. # Set the upper norm on the colormap
  684. if not common_color_norm and pmax is not None:
  685. vmax = self._quantile_to_level(heights, pmax)
  686. if vmax is not None:
  687. artist_kws["vmax"] = vmax
  688. # Make cells at or below the threshold transparent
  689. if not common_color_norm and pthresh:
  690. thresh = self._quantile_to_level(heights, pthresh)
  691. if thresh is not None:
  692. heights = np.ma.masked_less_equal(heights, thresh)
  693. # pcolormesh is going to turn the grid off, but we want to keep it
  694. # I'm not sure if there's a better way to get the grid state
  695. x_grid = any([l.get_visible() for l in ax.xaxis.get_gridlines()])
  696. y_grid = any([l.get_visible() for l in ax.yaxis.get_gridlines()])
  697. mesh = ax.pcolormesh(
  698. x_edges,
  699. y_edges,
  700. heights.T,
  701. **artist_kws,
  702. )
  703. # pcolormesh sets sticky edges, but we only want them if not thresholding
  704. if thresh is not None:
  705. mesh.sticky_edges.x[:] = []
  706. mesh.sticky_edges.y[:] = []
  707. # Add an optional colorbar
  708. # Note, we want to improve this. When hue is used, it will stack
  709. # multiple colorbars with redundant ticks in an ugly way.
  710. # But it's going to take some work to have multiple colorbars that
  711. # share ticks nicely.
  712. if cbar:
  713. ax.figure.colorbar(mesh, cbar_ax, ax, **cbar_kws)
  714. # Reset the grid state
  715. if x_grid:
  716. ax.grid(True, axis="x")
  717. if y_grid:
  718. ax.grid(True, axis="y")
  719. # --- Finalize the plot
  720. ax = self.ax if self.ax is not None else self.facets.axes.flat[0]
  721. self._add_axis_labels(ax)
  722. if "hue" in self.variables and legend:
  723. # TODO if possible, I would like to move the contour
  724. # intensity information into the legend too and label the
  725. # iso proportions rather than the raw density values
  726. artist_kws = {}
  727. artist = partial(mpl.patches.Patch)
  728. ax_obj = self.ax if self.ax is not None else self.facets
  729. self._add_legend(
  730. ax_obj, artist, True, False, "layer", 1, artist_kws, {},
  731. )
  732. def plot_univariate_density(
  733. self,
  734. multiple,
  735. common_norm,
  736. common_grid,
  737. warn_singular,
  738. fill,
  739. color,
  740. legend,
  741. estimate_kws,
  742. **plot_kws,
  743. ):
  744. # Handle conditional defaults
  745. if fill is None:
  746. fill = multiple in ("stack", "fill")
  747. # Preprocess the matplotlib keyword dictionaries
  748. if fill:
  749. artist = mpl.collections.PolyCollection
  750. else:
  751. artist = mpl.lines.Line2D
  752. plot_kws = _normalize_kwargs(plot_kws, artist)
  753. # Input checking
  754. _check_argument("multiple", ["layer", "stack", "fill"], multiple)
  755. # Always share the evaluation grid when stacking
  756. subsets = bool(set(self.variables) - {"x", "y"})
  757. if subsets and multiple in ("stack", "fill"):
  758. common_grid = True
  759. # Do the computation
  760. densities = self._compute_univariate_density(
  761. self.data_variable,
  762. common_norm,
  763. common_grid,
  764. estimate_kws,
  765. warn_singular,
  766. )
  767. # Adjust densities based on the `multiple` rule
  768. densities, baselines = self._resolve_multiple(densities, multiple)
  769. # Control the interaction with autoscaling by defining sticky_edges
  770. # i.e. we don't want autoscale margins below the density curve
  771. sticky_density = (0, 1) if multiple == "fill" else (0, np.inf)
  772. if multiple == "fill":
  773. # Filled plots should not have any margins
  774. sticky_support = densities.index.min(), densities.index.max()
  775. else:
  776. sticky_support = []
  777. if fill:
  778. if multiple == "layer":
  779. default_alpha = .25
  780. else:
  781. default_alpha = .75
  782. else:
  783. default_alpha = 1
  784. alpha = plot_kws.pop("alpha", default_alpha) # TODO make parameter?
  785. # Now iterate through the subsets and draw the densities
  786. # We go backwards so stacked densities read from top-to-bottom
  787. for sub_vars, _ in self.iter_data("hue", reverse=True):
  788. # Extract the support grid and density curve for this level
  789. key = tuple(sub_vars.items())
  790. try:
  791. density = densities[key]
  792. except KeyError:
  793. continue
  794. support = density.index
  795. fill_from = baselines[key]
  796. ax = self._get_axes(sub_vars)
  797. if "hue" in self.variables:
  798. sub_color = self._hue_map(sub_vars["hue"])
  799. else:
  800. sub_color = color
  801. artist_kws = self._artist_kws(
  802. plot_kws, fill, False, multiple, sub_color, alpha
  803. )
  804. # Either plot a curve with observation values on the x axis
  805. if "x" in self.variables:
  806. if fill:
  807. artist = ax.fill_between(support, fill_from, density, **artist_kws)
  808. else:
  809. artist, = ax.plot(support, density, **artist_kws)
  810. artist.sticky_edges.x[:] = sticky_support
  811. artist.sticky_edges.y[:] = sticky_density
  812. # Or plot a curve with observation values on the y axis
  813. else:
  814. if fill:
  815. artist = ax.fill_betweenx(support, fill_from, density, **artist_kws)
  816. else:
  817. artist, = ax.plot(density, support, **artist_kws)
  818. artist.sticky_edges.x[:] = sticky_density
  819. artist.sticky_edges.y[:] = sticky_support
  820. # --- Finalize the plot ----
  821. ax = self.ax if self.ax is not None else self.facets.axes.flat[0]
  822. default_x = default_y = ""
  823. if self.data_variable == "x":
  824. default_y = "Density"
  825. if self.data_variable == "y":
  826. default_x = "Density"
  827. self._add_axis_labels(ax, default_x, default_y)
  828. if "hue" in self.variables and legend:
  829. if fill:
  830. artist = partial(mpl.patches.Patch)
  831. else:
  832. artist = partial(mpl.lines.Line2D, [], [])
  833. ax_obj = self.ax if self.ax is not None else self.facets
  834. self._add_legend(
  835. ax_obj, artist, fill, False, multiple, alpha, plot_kws, {},
  836. )
  837. def plot_bivariate_density(
  838. self,
  839. common_norm,
  840. fill,
  841. levels,
  842. thresh,
  843. color,
  844. legend,
  845. cbar,
  846. warn_singular,
  847. cbar_ax,
  848. cbar_kws,
  849. estimate_kws,
  850. **contour_kws,
  851. ):
  852. contour_kws = contour_kws.copy()
  853. estimator = KDE(**estimate_kws)
  854. if not set(self.variables) - {"x", "y"}:
  855. common_norm = False
  856. all_data = self.plot_data.dropna()
  857. # Loop through the subsets and estimate the KDEs
  858. densities, supports = {}, {}
  859. for sub_vars, sub_data in self.iter_data("hue", from_comp_data=True):
  860. # Extract the data points from this sub set
  861. observations = sub_data[["x", "y"]]
  862. min_variance = observations.var().fillna(0).min()
  863. observations = observations["x"], observations["y"]
  864. # Extract the weights for this subset of observations
  865. if "weights" in self.variables:
  866. weights = sub_data["weights"]
  867. else:
  868. weights = None
  869. # Estimate the density of observations at this level
  870. singular = math.isclose(min_variance, 0)
  871. try:
  872. if not singular:
  873. density, support = estimator(*observations, weights=weights)
  874. except np.linalg.LinAlgError:
  875. # Testing for 0 variance doesn't catch all cases where scipy raises,
  876. # but we can also get a ValueError, so we need this convoluted approach
  877. singular = True
  878. if singular:
  879. msg = (
  880. "KDE cannot be estimated (0 variance or perfect covariance). "
  881. "Pass `warn_singular=False` to disable this warning."
  882. )
  883. if warn_singular:
  884. warnings.warn(msg, UserWarning, stacklevel=3)
  885. continue
  886. # Transform the support grid back to the original scale
  887. ax = self._get_axes(sub_vars)
  888. _, inv_x = _get_transform_functions(ax, "x")
  889. _, inv_y = _get_transform_functions(ax, "y")
  890. support = inv_x(support[0]), inv_y(support[1])
  891. # Apply a scaling factor so that the integral over all subsets is 1
  892. if common_norm:
  893. density *= len(sub_data) / len(all_data)
  894. key = tuple(sub_vars.items())
  895. densities[key] = density
  896. supports[key] = support
  897. # Define a grid of iso-proportion levels
  898. if thresh is None:
  899. thresh = 0
  900. if isinstance(levels, Number):
  901. levels = np.linspace(thresh, 1, levels)
  902. else:
  903. if min(levels) < 0 or max(levels) > 1:
  904. raise ValueError("levels must be in [0, 1]")
  905. # Transform from iso-proportions to iso-densities
  906. if common_norm:
  907. common_levels = self._quantile_to_level(
  908. list(densities.values()), levels,
  909. )
  910. draw_levels = {k: common_levels for k in densities}
  911. else:
  912. draw_levels = {
  913. k: self._quantile_to_level(d, levels)
  914. for k, d in densities.items()
  915. }
  916. # Define the coloring of the contours
  917. if "hue" in self.variables:
  918. for param in ["cmap", "colors"]:
  919. if param in contour_kws:
  920. msg = f"{param} parameter ignored when using hue mapping."
  921. warnings.warn(msg, UserWarning)
  922. contour_kws.pop(param)
  923. else:
  924. # Work out a default coloring of the contours
  925. coloring_given = set(contour_kws) & {"cmap", "colors"}
  926. if fill and not coloring_given:
  927. cmap = self._cmap_from_color(color)
  928. contour_kws["cmap"] = cmap
  929. if not fill and not coloring_given:
  930. contour_kws["colors"] = [color]
  931. # Use our internal colormap lookup
  932. cmap = contour_kws.pop("cmap", None)
  933. if isinstance(cmap, str):
  934. cmap = color_palette(cmap, as_cmap=True)
  935. if cmap is not None:
  936. contour_kws["cmap"] = cmap
  937. # Loop through the subsets again and plot the data
  938. for sub_vars, _ in self.iter_data("hue"):
  939. if "hue" in sub_vars:
  940. color = self._hue_map(sub_vars["hue"])
  941. if fill:
  942. contour_kws["cmap"] = self._cmap_from_color(color)
  943. else:
  944. contour_kws["colors"] = [color]
  945. ax = self._get_axes(sub_vars)
  946. # Choose the function to plot with
  947. # TODO could add a pcolormesh based option as well
  948. # Which would look something like element="raster"
  949. if fill:
  950. contour_func = ax.contourf
  951. else:
  952. contour_func = ax.contour
  953. key = tuple(sub_vars.items())
  954. if key not in densities:
  955. continue
  956. density = densities[key]
  957. xx, yy = supports[key]
  958. # Pop the label kwarg which is unused by contour_func (but warns)
  959. contour_kws.pop("label", None)
  960. cset = contour_func(
  961. xx, yy, density,
  962. levels=draw_levels[key],
  963. **contour_kws,
  964. )
  965. # Add a color bar representing the contour heights
  966. # Note: this shows iso densities, not iso proportions
  967. # See more notes in histplot about how this could be improved
  968. if cbar:
  969. cbar_kws = {} if cbar_kws is None else cbar_kws
  970. ax.figure.colorbar(cset, cbar_ax, ax, **cbar_kws)
  971. # --- Finalize the plot
  972. ax = self.ax if self.ax is not None else self.facets.axes.flat[0]
  973. self._add_axis_labels(ax)
  974. if "hue" in self.variables and legend:
  975. # TODO if possible, I would like to move the contour
  976. # intensity information into the legend too and label the
  977. # iso proportions rather than the raw density values
  978. artist_kws = {}
  979. if fill:
  980. artist = partial(mpl.patches.Patch)
  981. else:
  982. artist = partial(mpl.lines.Line2D, [], [])
  983. ax_obj = self.ax if self.ax is not None else self.facets
  984. self._add_legend(
  985. ax_obj, artist, fill, False, "layer", 1, artist_kws, {},
  986. )
  987. def plot_univariate_ecdf(self, estimate_kws, legend, **plot_kws):
  988. estimator = ECDF(**estimate_kws)
  989. # Set the draw style to step the right way for the data variable
  990. drawstyles = dict(x="steps-post", y="steps-pre")
  991. plot_kws["drawstyle"] = drawstyles[self.data_variable]
  992. # Loop through the subsets, transform and plot the data
  993. for sub_vars, sub_data in self.iter_data(
  994. "hue", reverse=True, from_comp_data=True,
  995. ):
  996. # Compute the ECDF
  997. if sub_data.empty:
  998. continue
  999. observations = sub_data[self.data_variable]
  1000. weights = sub_data.get("weights", None)
  1001. stat, vals = estimator(observations, weights=weights)
  1002. # Assign attributes based on semantic mapping
  1003. artist_kws = plot_kws.copy()
  1004. if "hue" in self.variables:
  1005. artist_kws["color"] = self._hue_map(sub_vars["hue"])
  1006. # Return the data variable to the linear domain
  1007. ax = self._get_axes(sub_vars)
  1008. _, inv = _get_transform_functions(ax, self.data_variable)
  1009. vals = inv(vals)
  1010. # Manually set the minimum value on a "log" scale
  1011. if isinstance(inv.__self__, mpl.scale.LogTransform):
  1012. vals[0] = -np.inf
  1013. # Work out the orientation of the plot
  1014. if self.data_variable == "x":
  1015. plot_args = vals, stat
  1016. stat_variable = "y"
  1017. else:
  1018. plot_args = stat, vals
  1019. stat_variable = "x"
  1020. if estimator.stat == "count":
  1021. top_edge = len(observations)
  1022. else:
  1023. top_edge = 1
  1024. # Draw the line for this subset
  1025. artist, = ax.plot(*plot_args, **artist_kws)
  1026. sticky_edges = getattr(artist.sticky_edges, stat_variable)
  1027. sticky_edges[:] = 0, top_edge
  1028. # --- Finalize the plot ----
  1029. ax = self.ax if self.ax is not None else self.facets.axes.flat[0]
  1030. stat = estimator.stat.capitalize()
  1031. default_x = default_y = ""
  1032. if self.data_variable == "x":
  1033. default_y = stat
  1034. if self.data_variable == "y":
  1035. default_x = stat
  1036. self._add_axis_labels(ax, default_x, default_y)
  1037. if "hue" in self.variables and legend:
  1038. artist = partial(mpl.lines.Line2D, [], [])
  1039. alpha = plot_kws.get("alpha", 1)
  1040. ax_obj = self.ax if self.ax is not None else self.facets
  1041. self._add_legend(
  1042. ax_obj, artist, False, False, None, alpha, plot_kws, {},
  1043. )
  1044. def plot_rug(self, height, expand_margins, legend, **kws):
  1045. for sub_vars, sub_data, in self.iter_data(from_comp_data=True):
  1046. ax = self._get_axes(sub_vars)
  1047. kws.setdefault("linewidth", 1)
  1048. if expand_margins:
  1049. xmarg, ymarg = ax.margins()
  1050. if "x" in self.variables:
  1051. ymarg += height * 2
  1052. if "y" in self.variables:
  1053. xmarg += height * 2
  1054. ax.margins(x=xmarg, y=ymarg)
  1055. if "hue" in self.variables:
  1056. kws.pop("c", None)
  1057. kws.pop("color", None)
  1058. if "x" in self.variables:
  1059. self._plot_single_rug(sub_data, "x", height, ax, kws)
  1060. if "y" in self.variables:
  1061. self._plot_single_rug(sub_data, "y", height, ax, kws)
  1062. # --- Finalize the plot
  1063. self._add_axis_labels(ax)
  1064. if "hue" in self.variables and legend:
  1065. # TODO ideally i'd like the legend artist to look like a rug
  1066. legend_artist = partial(mpl.lines.Line2D, [], [])
  1067. self._add_legend(
  1068. ax, legend_artist, False, False, None, 1, {}, {},
  1069. )
  1070. def _plot_single_rug(self, sub_data, var, height, ax, kws):
  1071. """Draw a rugplot along one axis of the plot."""
  1072. vector = sub_data[var]
  1073. n = len(vector)
  1074. # Return data to linear domain
  1075. _, inv = _get_transform_functions(ax, var)
  1076. vector = inv(vector)
  1077. # We'll always add a single collection with varying colors
  1078. if "hue" in self.variables:
  1079. colors = self._hue_map(sub_data["hue"])
  1080. else:
  1081. colors = None
  1082. # Build the array of values for the LineCollection
  1083. if var == "x":
  1084. trans = tx.blended_transform_factory(ax.transData, ax.transAxes)
  1085. xy_pairs = np.column_stack([
  1086. np.repeat(vector, 2), np.tile([0, height], n)
  1087. ])
  1088. if var == "y":
  1089. trans = tx.blended_transform_factory(ax.transAxes, ax.transData)
  1090. xy_pairs = np.column_stack([
  1091. np.tile([0, height], n), np.repeat(vector, 2)
  1092. ])
  1093. # Draw the lines on the plot
  1094. line_segs = xy_pairs.reshape([n, 2, 2])
  1095. ax.add_collection(LineCollection(
  1096. line_segs, transform=trans, colors=colors, **kws
  1097. ))
  1098. ax.autoscale_view(scalex=var == "x", scaley=var == "y")
  1099. # ==================================================================================== #
  1100. # External API
  1101. # ==================================================================================== #
  1102. def histplot(
  1103. data=None, *,
  1104. # Vector variables
  1105. x=None, y=None, hue=None, weights=None,
  1106. # Histogram computation parameters
  1107. stat="count", bins="auto", binwidth=None, binrange=None,
  1108. discrete=None, cumulative=False, common_bins=True, common_norm=True,
  1109. # Histogram appearance parameters
  1110. multiple="layer", element="bars", fill=True, shrink=1,
  1111. # Histogram smoothing with a kernel density estimate
  1112. kde=False, kde_kws=None, line_kws=None,
  1113. # Bivariate histogram parameters
  1114. thresh=0, pthresh=None, pmax=None, cbar=False, cbar_ax=None, cbar_kws=None,
  1115. # Hue mapping parameters
  1116. palette=None, hue_order=None, hue_norm=None, color=None,
  1117. # Axes information
  1118. log_scale=None, legend=True, ax=None,
  1119. # Other appearance keywords
  1120. **kwargs,
  1121. ):
  1122. p = _DistributionPlotter(
  1123. data=data,
  1124. variables=dict(x=x, y=y, hue=hue, weights=weights),
  1125. )
  1126. p.map_hue(palette=palette, order=hue_order, norm=hue_norm)
  1127. if ax is None:
  1128. ax = plt.gca()
  1129. p._attach(ax, log_scale=log_scale)
  1130. if p.univariate: # Note, bivariate plots won't cycle
  1131. if fill:
  1132. method = ax.bar if element == "bars" else ax.fill_between
  1133. else:
  1134. method = ax.plot
  1135. color = _default_color(method, hue, color, kwargs)
  1136. if not p.has_xy_data:
  1137. return ax
  1138. # Default to discrete bins for categorical variables
  1139. if discrete is None:
  1140. discrete = p._default_discrete()
  1141. estimate_kws = dict(
  1142. stat=stat,
  1143. bins=bins,
  1144. binwidth=binwidth,
  1145. binrange=binrange,
  1146. discrete=discrete,
  1147. cumulative=cumulative,
  1148. )
  1149. if p.univariate:
  1150. p.plot_univariate_histogram(
  1151. multiple=multiple,
  1152. element=element,
  1153. fill=fill,
  1154. shrink=shrink,
  1155. common_norm=common_norm,
  1156. common_bins=common_bins,
  1157. kde=kde,
  1158. kde_kws=kde_kws,
  1159. color=color,
  1160. legend=legend,
  1161. estimate_kws=estimate_kws,
  1162. line_kws=line_kws,
  1163. **kwargs,
  1164. )
  1165. else:
  1166. p.plot_bivariate_histogram(
  1167. common_bins=common_bins,
  1168. common_norm=common_norm,
  1169. thresh=thresh,
  1170. pthresh=pthresh,
  1171. pmax=pmax,
  1172. color=color,
  1173. legend=legend,
  1174. cbar=cbar,
  1175. cbar_ax=cbar_ax,
  1176. cbar_kws=cbar_kws,
  1177. estimate_kws=estimate_kws,
  1178. **kwargs,
  1179. )
  1180. return ax
  1181. histplot.__doc__ = """\
  1182. Plot univariate or bivariate histograms to show distributions of datasets.
  1183. A histogram is a classic visualization tool that represents the distribution
  1184. of one or more variables by counting the number of observations that fall within
  1185. discrete bins.
  1186. This function can normalize the statistic computed within each bin to estimate
  1187. frequency, density or probability mass, and it can add a smooth curve obtained
  1188. using a kernel density estimate, similar to :func:`kdeplot`.
  1189. More information is provided in the :ref:`user guide <tutorial_hist>`.
  1190. Parameters
  1191. ----------
  1192. {params.core.data}
  1193. {params.core.xy}
  1194. {params.core.hue}
  1195. weights : vector or key in ``data``
  1196. If provided, weight the contribution of the corresponding data points
  1197. towards the count in each bin by these factors.
  1198. {params.hist.stat}
  1199. {params.hist.bins}
  1200. {params.hist.binwidth}
  1201. {params.hist.binrange}
  1202. discrete : bool
  1203. If True, default to ``binwidth=1`` and draw the bars so that they are
  1204. centered on their corresponding data points. This avoids "gaps" that may
  1205. otherwise appear when using discrete (integer) data.
  1206. cumulative : bool
  1207. If True, plot the cumulative counts as bins increase.
  1208. common_bins : bool
  1209. If True, use the same bins when semantic variables produce multiple
  1210. plots. If using a reference rule to determine the bins, it will be computed
  1211. with the full dataset.
  1212. common_norm : bool
  1213. If True and using a normalized statistic, the normalization will apply over
  1214. the full dataset. Otherwise, normalize each histogram independently.
  1215. multiple : {{"layer", "dodge", "stack", "fill"}}
  1216. Approach to resolving multiple elements when semantic mapping creates subsets.
  1217. Only relevant with univariate data.
  1218. element : {{"bars", "step", "poly"}}
  1219. Visual representation of the histogram statistic.
  1220. Only relevant with univariate data.
  1221. fill : bool
  1222. If True, fill in the space under the histogram.
  1223. Only relevant with univariate data.
  1224. shrink : number
  1225. Scale the width of each bar relative to the binwidth by this factor.
  1226. Only relevant with univariate data.
  1227. kde : bool
  1228. If True, compute a kernel density estimate to smooth the distribution
  1229. and show on the plot as (one or more) line(s).
  1230. Only relevant with univariate data.
  1231. kde_kws : dict
  1232. Parameters that control the KDE computation, as in :func:`kdeplot`.
  1233. line_kws : dict
  1234. Parameters that control the KDE visualization, passed to
  1235. :meth:`matplotlib.axes.Axes.plot`.
  1236. thresh : number or None
  1237. Cells with a statistic less than or equal to this value will be transparent.
  1238. Only relevant with bivariate data.
  1239. pthresh : number or None
  1240. Like ``thresh``, but a value in [0, 1] such that cells with aggregate counts
  1241. (or other statistics, when used) up to this proportion of the total will be
  1242. transparent.
  1243. pmax : number or None
  1244. A value in [0, 1] that sets that saturation point for the colormap at a value
  1245. such that cells below constitute this proportion of the total count (or
  1246. other statistic, when used).
  1247. {params.dist.cbar}
  1248. {params.dist.cbar_ax}
  1249. {params.dist.cbar_kws}
  1250. {params.core.palette}
  1251. {params.core.hue_order}
  1252. {params.core.hue_norm}
  1253. {params.core.color}
  1254. {params.dist.log_scale}
  1255. {params.dist.legend}
  1256. {params.core.ax}
  1257. kwargs
  1258. Other keyword arguments are passed to one of the following matplotlib
  1259. functions:
  1260. - :meth:`matplotlib.axes.Axes.bar` (univariate, element="bars")
  1261. - :meth:`matplotlib.axes.Axes.fill_between` (univariate, other element, fill=True)
  1262. - :meth:`matplotlib.axes.Axes.plot` (univariate, other element, fill=False)
  1263. - :meth:`matplotlib.axes.Axes.pcolormesh` (bivariate)
  1264. Returns
  1265. -------
  1266. {returns.ax}
  1267. See Also
  1268. --------
  1269. {seealso.displot}
  1270. {seealso.kdeplot}
  1271. {seealso.rugplot}
  1272. {seealso.ecdfplot}
  1273. {seealso.jointplot}
  1274. Notes
  1275. -----
  1276. The choice of bins for computing and plotting a histogram can exert
  1277. substantial influence on the insights that one is able to draw from the
  1278. visualization. If the bins are too large, they may erase important features.
  1279. On the other hand, bins that are too small may be dominated by random
  1280. variability, obscuring the shape of the true underlying distribution. The
  1281. default bin size is determined using a reference rule that depends on the
  1282. sample size and variance. This works well in many cases, (i.e., with
  1283. "well-behaved" data) but it fails in others. It is always a good to try
  1284. different bin sizes to be sure that you are not missing something important.
  1285. This function allows you to specify bins in several different ways, such as
  1286. by setting the total number of bins to use, the width of each bin, or the
  1287. specific locations where the bins should break.
  1288. Examples
  1289. --------
  1290. .. include:: ../docstrings/histplot.rst
  1291. """.format(
  1292. params=_param_docs,
  1293. returns=_core_docs["returns"],
  1294. seealso=_core_docs["seealso"],
  1295. )
  1296. def kdeplot(
  1297. data=None, *, x=None, y=None, hue=None, weights=None,
  1298. palette=None, hue_order=None, hue_norm=None, color=None, fill=None,
  1299. multiple="layer", common_norm=True, common_grid=False, cumulative=False,
  1300. bw_method="scott", bw_adjust=1, warn_singular=True, log_scale=None,
  1301. levels=10, thresh=.05, gridsize=200, cut=3, clip=None,
  1302. legend=True, cbar=False, cbar_ax=None, cbar_kws=None, ax=None,
  1303. **kwargs,
  1304. ):
  1305. # --- Start with backwards compatability for versions < 0.11.0 ----------------
  1306. # Handle (past) deprecation of `data2`
  1307. if "data2" in kwargs:
  1308. msg = "`data2` has been removed (replaced by `y`); please update your code."
  1309. TypeError(msg)
  1310. # Handle deprecation of `vertical`
  1311. vertical = kwargs.pop("vertical", None)
  1312. if vertical is not None:
  1313. if vertical:
  1314. action_taken = "assigning data to `y`."
  1315. if x is None:
  1316. data, y = y, data
  1317. else:
  1318. x, y = y, x
  1319. else:
  1320. action_taken = "assigning data to `x`."
  1321. msg = textwrap.dedent(f"""\n
  1322. The `vertical` parameter is deprecated; {action_taken}
  1323. This will become an error in seaborn v0.14.0; please update your code.
  1324. """)
  1325. warnings.warn(msg, UserWarning, stacklevel=2)
  1326. # Handle deprecation of `bw`
  1327. bw = kwargs.pop("bw", None)
  1328. if bw is not None:
  1329. msg = textwrap.dedent(f"""\n
  1330. The `bw` parameter is deprecated in favor of `bw_method` and `bw_adjust`.
  1331. Setting `bw_method={bw}`, but please see the docs for the new parameters
  1332. and update your code. This will become an error in seaborn v0.14.0.
  1333. """)
  1334. warnings.warn(msg, UserWarning, stacklevel=2)
  1335. bw_method = bw
  1336. # Handle deprecation of `kernel`
  1337. if kwargs.pop("kernel", None) is not None:
  1338. msg = textwrap.dedent("""\n
  1339. Support for alternate kernels has been removed; using Gaussian kernel.
  1340. This will become an error in seaborn v0.14.0; please update your code.
  1341. """)
  1342. warnings.warn(msg, UserWarning, stacklevel=2)
  1343. # Handle deprecation of shade_lowest
  1344. shade_lowest = kwargs.pop("shade_lowest", None)
  1345. if shade_lowest is not None:
  1346. if shade_lowest:
  1347. thresh = 0
  1348. msg = textwrap.dedent(f"""\n
  1349. `shade_lowest` has been replaced by `thresh`; setting `thresh={thresh}.
  1350. This will become an error in seaborn v0.14.0; please update your code.
  1351. """)
  1352. warnings.warn(msg, UserWarning, stacklevel=2)
  1353. # Handle "soft" deprecation of shade `shade` is not really the right
  1354. # terminology here, but unlike some of the other deprecated parameters it
  1355. # is probably very commonly used and much hard to remove. This is therefore
  1356. # going to be a longer process where, first, `fill` will be introduced and
  1357. # be used throughout the documentation. In 0.12, when kwarg-only
  1358. # enforcement hits, we can remove the shade/shade_lowest out of the
  1359. # function signature all together and pull them out of the kwargs. Then we
  1360. # can actually fire a FutureWarning, and eventually remove.
  1361. shade = kwargs.pop("shade", None)
  1362. if shade is not None:
  1363. fill = shade
  1364. msg = textwrap.dedent(f"""\n
  1365. `shade` is now deprecated in favor of `fill`; setting `fill={shade}`.
  1366. This will become an error in seaborn v0.14.0; please update your code.
  1367. """)
  1368. warnings.warn(msg, FutureWarning, stacklevel=2)
  1369. # Handle `n_levels`
  1370. # This was never in the formal API but it was processed, and appeared in an
  1371. # example. We can treat as an alias for `levels` now and deprecate later.
  1372. levels = kwargs.pop("n_levels", levels)
  1373. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
  1374. p = _DistributionPlotter(
  1375. data=data,
  1376. variables=dict(x=x, y=y, hue=hue, weights=weights),
  1377. )
  1378. p.map_hue(palette=palette, order=hue_order, norm=hue_norm)
  1379. if ax is None:
  1380. ax = plt.gca()
  1381. p._attach(ax, allowed_types=["numeric", "datetime"], log_scale=log_scale)
  1382. method = ax.fill_between if fill else ax.plot
  1383. color = _default_color(method, hue, color, kwargs)
  1384. if not p.has_xy_data:
  1385. return ax
  1386. # Pack the kwargs for statistics.KDE
  1387. estimate_kws = dict(
  1388. bw_method=bw_method,
  1389. bw_adjust=bw_adjust,
  1390. gridsize=gridsize,
  1391. cut=cut,
  1392. clip=clip,
  1393. cumulative=cumulative,
  1394. )
  1395. if p.univariate:
  1396. plot_kws = kwargs.copy()
  1397. p.plot_univariate_density(
  1398. multiple=multiple,
  1399. common_norm=common_norm,
  1400. common_grid=common_grid,
  1401. fill=fill,
  1402. color=color,
  1403. legend=legend,
  1404. warn_singular=warn_singular,
  1405. estimate_kws=estimate_kws,
  1406. **plot_kws,
  1407. )
  1408. else:
  1409. p.plot_bivariate_density(
  1410. common_norm=common_norm,
  1411. fill=fill,
  1412. levels=levels,
  1413. thresh=thresh,
  1414. legend=legend,
  1415. color=color,
  1416. warn_singular=warn_singular,
  1417. cbar=cbar,
  1418. cbar_ax=cbar_ax,
  1419. cbar_kws=cbar_kws,
  1420. estimate_kws=estimate_kws,
  1421. **kwargs,
  1422. )
  1423. return ax
  1424. kdeplot.__doc__ = """\
  1425. Plot univariate or bivariate distributions using kernel density estimation.
  1426. A kernel density estimate (KDE) plot is a method for visualizing the
  1427. distribution of observations in a dataset, analogous to a histogram. KDE
  1428. represents the data using a continuous probability density curve in one or
  1429. more dimensions.
  1430. The approach is explained further in the :ref:`user guide <tutorial_kde>`.
  1431. Relative to a histogram, KDE can produce a plot that is less cluttered and
  1432. more interpretable, especially when drawing multiple distributions. But it
  1433. has the potential to introduce distortions if the underlying distribution is
  1434. bounded or not smooth. Like a histogram, the quality of the representation
  1435. also depends on the selection of good smoothing parameters.
  1436. Parameters
  1437. ----------
  1438. {params.core.data}
  1439. {params.core.xy}
  1440. {params.core.hue}
  1441. weights : vector or key in ``data``
  1442. If provided, weight the kernel density estimation using these values.
  1443. {params.core.palette}
  1444. {params.core.hue_order}
  1445. {params.core.hue_norm}
  1446. {params.core.color}
  1447. fill : bool or None
  1448. If True, fill in the area under univariate density curves or between
  1449. bivariate contours. If None, the default depends on ``multiple``.
  1450. {params.dist.multiple}
  1451. common_norm : bool
  1452. If True, scale each conditional density by the number of observations
  1453. such that the total area under all densities sums to 1. Otherwise,
  1454. normalize each density independently.
  1455. common_grid : bool
  1456. If True, use the same evaluation grid for each kernel density estimate.
  1457. Only relevant with univariate data.
  1458. {params.kde.cumulative}
  1459. {params.kde.bw_method}
  1460. {params.kde.bw_adjust}
  1461. warn_singular : bool
  1462. If True, issue a warning when trying to estimate the density of data
  1463. with zero variance.
  1464. {params.dist.log_scale}
  1465. levels : int or vector
  1466. Number of contour levels or values to draw contours at. A vector argument
  1467. must have increasing values in [0, 1]. Levels correspond to iso-proportions
  1468. of the density: e.g., 20% of the probability mass will lie below the
  1469. contour drawn for 0.2. Only relevant with bivariate data.
  1470. thresh : number in [0, 1]
  1471. Lowest iso-proportion level at which to draw a contour line. Ignored when
  1472. ``levels`` is a vector. Only relevant with bivariate data.
  1473. gridsize : int
  1474. Number of points on each dimension of the evaluation grid.
  1475. {params.kde.cut}
  1476. {params.kde.clip}
  1477. {params.dist.legend}
  1478. {params.dist.cbar}
  1479. {params.dist.cbar_ax}
  1480. {params.dist.cbar_kws}
  1481. {params.core.ax}
  1482. kwargs
  1483. Other keyword arguments are passed to one of the following matplotlib
  1484. functions:
  1485. - :meth:`matplotlib.axes.Axes.plot` (univariate, ``fill=False``),
  1486. - :meth:`matplotlib.axes.Axes.fill_between` (univariate, ``fill=True``),
  1487. - :meth:`matplotlib.axes.Axes.contour` (bivariate, ``fill=False``),
  1488. - :meth:`matplotlib.axes.Axes.contourf` (bivariate, ``fill=True``).
  1489. Returns
  1490. -------
  1491. {returns.ax}
  1492. See Also
  1493. --------
  1494. {seealso.displot}
  1495. {seealso.histplot}
  1496. {seealso.ecdfplot}
  1497. {seealso.jointplot}
  1498. {seealso.violinplot}
  1499. Notes
  1500. -----
  1501. The *bandwidth*, or standard deviation of the smoothing kernel, is an
  1502. important parameter. Misspecification of the bandwidth can produce a
  1503. distorted representation of the data. Much like the choice of bin width in a
  1504. histogram, an over-smoothed curve can erase true features of a
  1505. distribution, while an under-smoothed curve can create false features out of
  1506. random variability. The rule-of-thumb that sets the default bandwidth works
  1507. best when the true distribution is smooth, unimodal, and roughly bell-shaped.
  1508. It is always a good idea to check the default behavior by using ``bw_adjust``
  1509. to increase or decrease the amount of smoothing.
  1510. Because the smoothing algorithm uses a Gaussian kernel, the estimated density
  1511. curve can extend to values that do not make sense for a particular dataset.
  1512. For example, the curve may be drawn over negative values when smoothing data
  1513. that are naturally positive. The ``cut`` and ``clip`` parameters can be used
  1514. to control the extent of the curve, but datasets that have many observations
  1515. close to a natural boundary may be better served by a different visualization
  1516. method.
  1517. Similar considerations apply when a dataset is naturally discrete or "spiky"
  1518. (containing many repeated observations of the same value). Kernel density
  1519. estimation will always produce a smooth curve, which would be misleading
  1520. in these situations.
  1521. The units on the density axis are a common source of confusion. While kernel
  1522. density estimation produces a probability distribution, the height of the curve
  1523. at each point gives a density, not a probability. A probability can be obtained
  1524. only by integrating the density across a range. The curve is normalized so
  1525. that the integral over all possible values is 1, meaning that the scale of
  1526. the density axis depends on the data values.
  1527. Examples
  1528. --------
  1529. .. include:: ../docstrings/kdeplot.rst
  1530. """.format(
  1531. params=_param_docs,
  1532. returns=_core_docs["returns"],
  1533. seealso=_core_docs["seealso"],
  1534. )
  1535. def ecdfplot(
  1536. data=None, *,
  1537. # Vector variables
  1538. x=None, y=None, hue=None, weights=None,
  1539. # Computation parameters
  1540. stat="proportion", complementary=False,
  1541. # Hue mapping parameters
  1542. palette=None, hue_order=None, hue_norm=None,
  1543. # Axes information
  1544. log_scale=None, legend=True, ax=None,
  1545. # Other appearance keywords
  1546. **kwargs,
  1547. ):
  1548. p = _DistributionPlotter(
  1549. data=data,
  1550. variables=dict(x=x, y=y, hue=hue, weights=weights),
  1551. )
  1552. p.map_hue(palette=palette, order=hue_order, norm=hue_norm)
  1553. # We could support other semantics (size, style) here fairly easily
  1554. # But it would make distplot a bit more complicated.
  1555. # It's always possible to add features like that later, so I am going to defer.
  1556. # It will be even easier to wait until after there is a more general/abstract
  1557. # way to go from semantic specs to artist attributes.
  1558. if ax is None:
  1559. ax = plt.gca()
  1560. p._attach(ax, log_scale=log_scale)
  1561. color = kwargs.pop("color", kwargs.pop("c", None))
  1562. kwargs["color"] = _default_color(ax.plot, hue, color, kwargs)
  1563. if not p.has_xy_data:
  1564. return ax
  1565. # We could add this one day, but it's of dubious value
  1566. if not p.univariate:
  1567. raise NotImplementedError("Bivariate ECDF plots are not implemented")
  1568. estimate_kws = dict(
  1569. stat=stat,
  1570. complementary=complementary,
  1571. )
  1572. p.plot_univariate_ecdf(
  1573. estimate_kws=estimate_kws,
  1574. legend=legend,
  1575. **kwargs,
  1576. )
  1577. return ax
  1578. ecdfplot.__doc__ = """\
  1579. Plot empirical cumulative distribution functions.
  1580. An ECDF represents the proportion or count of observations falling below each
  1581. unique value in a dataset. Compared to a histogram or density plot, it has the
  1582. advantage that each observation is visualized directly, meaning that there are
  1583. no binning or smoothing parameters that need to be adjusted. It also aids direct
  1584. comparisons between multiple distributions. A downside is that the relationship
  1585. between the appearance of the plot and the basic properties of the distribution
  1586. (such as its central tendency, variance, and the presence of any bimodality)
  1587. may not be as intuitive.
  1588. More information is provided in the :ref:`user guide <tutorial_ecdf>`.
  1589. Parameters
  1590. ----------
  1591. {params.core.data}
  1592. {params.core.xy}
  1593. {params.core.hue}
  1594. weights : vector or key in ``data``
  1595. If provided, weight the contribution of the corresponding data points
  1596. towards the cumulative distribution using these values.
  1597. {params.ecdf.stat}
  1598. {params.ecdf.complementary}
  1599. {params.core.palette}
  1600. {params.core.hue_order}
  1601. {params.core.hue_norm}
  1602. {params.dist.log_scale}
  1603. {params.dist.legend}
  1604. {params.core.ax}
  1605. kwargs
  1606. Other keyword arguments are passed to :meth:`matplotlib.axes.Axes.plot`.
  1607. Returns
  1608. -------
  1609. {returns.ax}
  1610. See Also
  1611. --------
  1612. {seealso.displot}
  1613. {seealso.histplot}
  1614. {seealso.kdeplot}
  1615. {seealso.rugplot}
  1616. Examples
  1617. --------
  1618. .. include:: ../docstrings/ecdfplot.rst
  1619. """.format(
  1620. params=_param_docs,
  1621. returns=_core_docs["returns"],
  1622. seealso=_core_docs["seealso"],
  1623. )
  1624. def rugplot(
  1625. data=None, *, x=None, y=None, hue=None, height=.025, expand_margins=True,
  1626. palette=None, hue_order=None, hue_norm=None, legend=True, ax=None, **kwargs
  1627. ):
  1628. # A note: I think it would make sense to add multiple= to rugplot and allow
  1629. # rugs for different hue variables to be shifted orthogonal to the data axis
  1630. # But is this stacking, or dodging?
  1631. # A note: if we want to add a style semantic to rugplot,
  1632. # we could make an option that draws the rug using scatterplot
  1633. # A note, it would also be nice to offer some kind of histogram/density
  1634. # rugplot, since alpha blending doesn't work great in the large n regime
  1635. # --- Start with backwards compatability for versions < 0.11.0 ----------------
  1636. a = kwargs.pop("a", None)
  1637. axis = kwargs.pop("axis", None)
  1638. if a is not None:
  1639. data = a
  1640. msg = textwrap.dedent("""\n
  1641. The `a` parameter has been replaced; use `x`, `y`, and/or `data` instead.
  1642. Please update your code; This will become an error in seaborn v0.14.0.
  1643. """)
  1644. warnings.warn(msg, UserWarning, stacklevel=2)
  1645. if axis is not None:
  1646. if axis == "x":
  1647. x = data
  1648. elif axis == "y":
  1649. y = data
  1650. data = None
  1651. msg = textwrap.dedent(f"""\n
  1652. The `axis` parameter has been deprecated; use the `{axis}` parameter instead.
  1653. Please update your code; this will become an error in seaborn v0.14.0.
  1654. """)
  1655. warnings.warn(msg, UserWarning, stacklevel=2)
  1656. vertical = kwargs.pop("vertical", None)
  1657. if vertical is not None:
  1658. if vertical:
  1659. action_taken = "assigning data to `y`."
  1660. if x is None:
  1661. data, y = y, data
  1662. else:
  1663. x, y = y, x
  1664. else:
  1665. action_taken = "assigning data to `x`."
  1666. msg = textwrap.dedent(f"""\n
  1667. The `vertical` parameter is deprecated; {action_taken}
  1668. This will become an error in seaborn v0.14.0; please update your code.
  1669. """)
  1670. warnings.warn(msg, UserWarning, stacklevel=2)
  1671. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - #
  1672. p = _DistributionPlotter(
  1673. data=data,
  1674. variables=dict(x=x, y=y, hue=hue),
  1675. )
  1676. p.map_hue(palette=palette, order=hue_order, norm=hue_norm)
  1677. if ax is None:
  1678. ax = plt.gca()
  1679. p._attach(ax)
  1680. color = kwargs.pop("color", kwargs.pop("c", None))
  1681. kwargs["color"] = _default_color(ax.plot, hue, color, kwargs)
  1682. if not p.has_xy_data:
  1683. return ax
  1684. p.plot_rug(height, expand_margins, legend, **kwargs)
  1685. return ax
  1686. rugplot.__doc__ = """\
  1687. Plot marginal distributions by drawing ticks along the x and y axes.
  1688. This function is intended to complement other plots by showing the location
  1689. of individual observations in an unobtrusive way.
  1690. Parameters
  1691. ----------
  1692. {params.core.data}
  1693. {params.core.xy}
  1694. {params.core.hue}
  1695. height : float
  1696. Proportion of axes extent covered by each rug element. Can be negative.
  1697. expand_margins : bool
  1698. If True, increase the axes margins by the height of the rug to avoid
  1699. overlap with other elements.
  1700. {params.core.palette}
  1701. {params.core.hue_order}
  1702. {params.core.hue_norm}
  1703. legend : bool
  1704. If False, do not add a legend for semantic variables.
  1705. {params.core.ax}
  1706. kwargs
  1707. Other keyword arguments are passed to
  1708. :class:`matplotlib.collections.LineCollection`.
  1709. Returns
  1710. -------
  1711. {returns.ax}
  1712. Examples
  1713. --------
  1714. .. include:: ../docstrings/rugplot.rst
  1715. """.format(
  1716. params=_param_docs,
  1717. returns=_core_docs["returns"],
  1718. )
  1719. def displot(
  1720. data=None, *,
  1721. # Vector variables
  1722. x=None, y=None, hue=None, row=None, col=None, weights=None,
  1723. # Other plot parameters
  1724. kind="hist", rug=False, rug_kws=None, log_scale=None, legend=True,
  1725. # Hue-mapping parameters
  1726. palette=None, hue_order=None, hue_norm=None, color=None,
  1727. # Faceting parameters
  1728. col_wrap=None, row_order=None, col_order=None,
  1729. height=5, aspect=1, facet_kws=None,
  1730. **kwargs,
  1731. ):
  1732. p = _DistributionPlotter(
  1733. data=data,
  1734. variables=dict(x=x, y=y, hue=hue, weights=weights, row=row, col=col),
  1735. )
  1736. p.map_hue(palette=palette, order=hue_order, norm=hue_norm)
  1737. _check_argument("kind", ["hist", "kde", "ecdf"], kind)
  1738. # --- Initialize the FacetGrid object
  1739. # Check for attempt to plot onto specific axes and warn
  1740. if "ax" in kwargs:
  1741. msg = (
  1742. "`displot` is a figure-level function and does not accept "
  1743. "the ax= parameter. You may wish to try {}plot.".format(kind)
  1744. )
  1745. warnings.warn(msg, UserWarning)
  1746. kwargs.pop("ax")
  1747. for var in ["row", "col"]:
  1748. # Handle faceting variables that lack name information
  1749. if var in p.variables and p.variables[var] is None:
  1750. p.variables[var] = f"_{var}_"
  1751. # Adapt the plot_data dataframe for use with FacetGrid
  1752. grid_data = p.plot_data.rename(columns=p.variables)
  1753. grid_data = grid_data.loc[:, ~grid_data.columns.duplicated()]
  1754. col_name = p.variables.get("col")
  1755. row_name = p.variables.get("row")
  1756. if facet_kws is None:
  1757. facet_kws = {}
  1758. g = FacetGrid(
  1759. data=grid_data, row=row_name, col=col_name,
  1760. col_wrap=col_wrap, row_order=row_order,
  1761. col_order=col_order, height=height,
  1762. aspect=aspect,
  1763. **facet_kws,
  1764. )
  1765. # Now attach the axes object to the plotter object
  1766. if kind == "kde":
  1767. allowed_types = ["numeric", "datetime"]
  1768. else:
  1769. allowed_types = None
  1770. p._attach(g, allowed_types=allowed_types, log_scale=log_scale)
  1771. # Check for a specification that lacks x/y data and return early
  1772. if not p.has_xy_data:
  1773. return g
  1774. if color is None and hue is None:
  1775. color = "C0"
  1776. # XXX else warn if hue is not None?
  1777. kwargs["legend"] = legend
  1778. # --- Draw the plots
  1779. if kind == "hist":
  1780. hist_kws = kwargs.copy()
  1781. # Extract the parameters that will go directly to Histogram
  1782. estimate_defaults = {}
  1783. _assign_default_kwargs(estimate_defaults, Histogram.__init__, histplot)
  1784. estimate_kws = {}
  1785. for key, default_val in estimate_defaults.items():
  1786. estimate_kws[key] = hist_kws.pop(key, default_val)
  1787. # Handle derivative defaults
  1788. if estimate_kws["discrete"] is None:
  1789. estimate_kws["discrete"] = p._default_discrete()
  1790. hist_kws["estimate_kws"] = estimate_kws
  1791. hist_kws.setdefault("color", color)
  1792. if p.univariate:
  1793. _assign_default_kwargs(hist_kws, p.plot_univariate_histogram, histplot)
  1794. p.plot_univariate_histogram(**hist_kws)
  1795. else:
  1796. _assign_default_kwargs(hist_kws, p.plot_bivariate_histogram, histplot)
  1797. p.plot_bivariate_histogram(**hist_kws)
  1798. elif kind == "kde":
  1799. kde_kws = kwargs.copy()
  1800. # Extract the parameters that will go directly to KDE
  1801. estimate_defaults = {}
  1802. _assign_default_kwargs(estimate_defaults, KDE.__init__, kdeplot)
  1803. estimate_kws = {}
  1804. for key, default_val in estimate_defaults.items():
  1805. estimate_kws[key] = kde_kws.pop(key, default_val)
  1806. kde_kws["estimate_kws"] = estimate_kws
  1807. kde_kws["color"] = color
  1808. if p.univariate:
  1809. _assign_default_kwargs(kde_kws, p.plot_univariate_density, kdeplot)
  1810. p.plot_univariate_density(**kde_kws)
  1811. else:
  1812. _assign_default_kwargs(kde_kws, p.plot_bivariate_density, kdeplot)
  1813. p.plot_bivariate_density(**kde_kws)
  1814. elif kind == "ecdf":
  1815. ecdf_kws = kwargs.copy()
  1816. # Extract the parameters that will go directly to the estimator
  1817. estimate_kws = {}
  1818. estimate_defaults = {}
  1819. _assign_default_kwargs(estimate_defaults, ECDF.__init__, ecdfplot)
  1820. for key, default_val in estimate_defaults.items():
  1821. estimate_kws[key] = ecdf_kws.pop(key, default_val)
  1822. ecdf_kws["estimate_kws"] = estimate_kws
  1823. ecdf_kws["color"] = color
  1824. if p.univariate:
  1825. _assign_default_kwargs(ecdf_kws, p.plot_univariate_ecdf, ecdfplot)
  1826. p.plot_univariate_ecdf(**ecdf_kws)
  1827. else:
  1828. raise NotImplementedError("Bivariate ECDF plots are not implemented")
  1829. # All plot kinds can include a rug
  1830. if rug:
  1831. # TODO with expand_margins=True, each facet expands margins... annoying!
  1832. if rug_kws is None:
  1833. rug_kws = {}
  1834. _assign_default_kwargs(rug_kws, p.plot_rug, rugplot)
  1835. rug_kws["legend"] = False
  1836. if color is not None:
  1837. rug_kws["color"] = color
  1838. p.plot_rug(**rug_kws)
  1839. # Call FacetGrid annotation methods
  1840. # Note that the legend is currently set inside the plotting method
  1841. g.set_axis_labels(
  1842. x_var=p.variables.get("x", g.axes.flat[0].get_xlabel()),
  1843. y_var=p.variables.get("y", g.axes.flat[0].get_ylabel()),
  1844. )
  1845. g.set_titles()
  1846. g.tight_layout()
  1847. if data is not None and (x is not None or y is not None):
  1848. if not isinstance(data, pd.DataFrame):
  1849. data = pd.DataFrame(data)
  1850. g.data = pd.merge(
  1851. data,
  1852. g.data[g.data.columns.difference(data.columns)],
  1853. left_index=True,
  1854. right_index=True,
  1855. )
  1856. else:
  1857. wide_cols = {
  1858. k: f"_{k}_" if v is None else v for k, v in p.variables.items()
  1859. }
  1860. g.data = p.plot_data.rename(columns=wide_cols)
  1861. return g
  1862. displot.__doc__ = """\
  1863. Figure-level interface for drawing distribution plots onto a FacetGrid.
  1864. This function provides access to several approaches for visualizing the
  1865. univariate or bivariate distribution of data, including subsets of data
  1866. defined by semantic mapping and faceting across multiple subplots. The
  1867. ``kind`` parameter selects the approach to use:
  1868. - :func:`histplot` (with ``kind="hist"``; the default)
  1869. - :func:`kdeplot` (with ``kind="kde"``)
  1870. - :func:`ecdfplot` (with ``kind="ecdf"``; univariate-only)
  1871. Additionally, a :func:`rugplot` can be added to any kind of plot to show
  1872. individual observations.
  1873. Extra keyword arguments are passed to the underlying function, so you should
  1874. refer to the documentation for each to understand the complete set of options
  1875. for making plots with this interface.
  1876. See the :doc:`distribution plots tutorial <../tutorial/distributions>` for a more
  1877. in-depth discussion of the relative strengths and weaknesses of each approach.
  1878. The distinction between figure-level and axes-level functions is explained
  1879. further in the :doc:`user guide <../tutorial/function_overview>`.
  1880. Parameters
  1881. ----------
  1882. {params.core.data}
  1883. {params.core.xy}
  1884. {params.core.hue}
  1885. {params.facets.rowcol}
  1886. weights : vector or key in ``data``
  1887. Observation weights used for computing the distribution function.
  1888. kind : {{"hist", "kde", "ecdf"}}
  1889. Approach for visualizing the data. Selects the underlying plotting function
  1890. and determines the additional set of valid parameters.
  1891. rug : bool
  1892. If True, show each observation with marginal ticks (as in :func:`rugplot`).
  1893. rug_kws : dict
  1894. Parameters to control the appearance of the rug plot.
  1895. {params.dist.log_scale}
  1896. {params.dist.legend}
  1897. {params.core.palette}
  1898. {params.core.hue_order}
  1899. {params.core.hue_norm}
  1900. {params.core.color}
  1901. {params.facets.col_wrap}
  1902. {params.facets.rowcol_order}
  1903. {params.facets.height}
  1904. {params.facets.aspect}
  1905. {params.facets.facet_kws}
  1906. kwargs
  1907. Other keyword arguments are documented with the relevant axes-level function:
  1908. - :func:`histplot` (with ``kind="hist"``)
  1909. - :func:`kdeplot` (with ``kind="kde"``)
  1910. - :func:`ecdfplot` (with ``kind="ecdf"``)
  1911. Returns
  1912. -------
  1913. {returns.facetgrid}
  1914. See Also
  1915. --------
  1916. {seealso.histplot}
  1917. {seealso.kdeplot}
  1918. {seealso.rugplot}
  1919. {seealso.ecdfplot}
  1920. {seealso.jointplot}
  1921. Examples
  1922. --------
  1923. See the API documentation for the axes-level functions for more details
  1924. about the breadth of options available for each plot kind.
  1925. .. include:: ../docstrings/displot.rst
  1926. """.format(
  1927. params=_param_docs,
  1928. returns=_core_docs["returns"],
  1929. seealso=_core_docs["seealso"],
  1930. )
  1931. # =========================================================================== #
  1932. # DEPRECATED FUNCTIONS LIVE BELOW HERE
  1933. # =========================================================================== #
  1934. def _freedman_diaconis_bins(a):
  1935. """Calculate number of hist bins using Freedman-Diaconis rule."""
  1936. # From https://stats.stackexchange.com/questions/798/
  1937. a = np.asarray(a)
  1938. if len(a) < 2:
  1939. return 1
  1940. iqr = np.subtract.reduce(np.nanpercentile(a, [75, 25]))
  1941. h = 2 * iqr / (len(a) ** (1 / 3))
  1942. # fall back to sqrt(a) bins if iqr is 0
  1943. if h == 0:
  1944. return int(np.sqrt(a.size))
  1945. else:
  1946. return int(np.ceil((a.max() - a.min()) / h))
  1947. def distplot(a=None, bins=None, hist=True, kde=True, rug=False, fit=None,
  1948. hist_kws=None, kde_kws=None, rug_kws=None, fit_kws=None,
  1949. color=None, vertical=False, norm_hist=False, axlabel=None,
  1950. label=None, ax=None, x=None):
  1951. """
  1952. DEPRECATED
  1953. This function has been deprecated and will be removed in seaborn v0.14.0.
  1954. It has been replaced by :func:`histplot` and :func:`displot`, two functions
  1955. with a modern API and many more capabilities.
  1956. For a guide to updating, please see this notebook:
  1957. https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
  1958. """
  1959. if kde and not hist:
  1960. axes_level_suggestion = (
  1961. "`kdeplot` (an axes-level function for kernel density plots)"
  1962. )
  1963. else:
  1964. axes_level_suggestion = (
  1965. "`histplot` (an axes-level function for histograms)"
  1966. )
  1967. msg = textwrap.dedent(f"""
  1968. `distplot` is a deprecated function and will be removed in seaborn v0.14.0.
  1969. Please adapt your code to use either `displot` (a figure-level function with
  1970. similar flexibility) or {axes_level_suggestion}.
  1971. For a guide to updating your code to use the new functions, please see
  1972. https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
  1973. """)
  1974. warnings.warn(msg, UserWarning, stacklevel=2)
  1975. if ax is None:
  1976. ax = plt.gca()
  1977. # Intelligently label the support axis
  1978. label_ax = bool(axlabel)
  1979. if axlabel is None and hasattr(a, "name"):
  1980. axlabel = a.name
  1981. if axlabel is not None:
  1982. label_ax = True
  1983. # Support new-style API
  1984. if x is not None:
  1985. a = x
  1986. # Make a a 1-d float array
  1987. a = np.asarray(a, float)
  1988. if a.ndim > 1:
  1989. a = a.squeeze()
  1990. # Drop null values from array
  1991. a = remove_na(a)
  1992. # Decide if the hist is normed
  1993. norm_hist = norm_hist or kde or (fit is not None)
  1994. # Handle dictionary defaults
  1995. hist_kws = {} if hist_kws is None else hist_kws.copy()
  1996. kde_kws = {} if kde_kws is None else kde_kws.copy()
  1997. rug_kws = {} if rug_kws is None else rug_kws.copy()
  1998. fit_kws = {} if fit_kws is None else fit_kws.copy()
  1999. # Get the color from the current color cycle
  2000. if color is None:
  2001. if vertical:
  2002. line, = ax.plot(0, a.mean())
  2003. else:
  2004. line, = ax.plot(a.mean(), 0)
  2005. color = line.get_color()
  2006. line.remove()
  2007. # Plug the label into the right kwarg dictionary
  2008. if label is not None:
  2009. if hist:
  2010. hist_kws["label"] = label
  2011. elif kde:
  2012. kde_kws["label"] = label
  2013. elif rug:
  2014. rug_kws["label"] = label
  2015. elif fit:
  2016. fit_kws["label"] = label
  2017. if hist:
  2018. if bins is None:
  2019. bins = min(_freedman_diaconis_bins(a), 50)
  2020. hist_kws.setdefault("alpha", 0.4)
  2021. hist_kws.setdefault("density", norm_hist)
  2022. orientation = "horizontal" if vertical else "vertical"
  2023. hist_color = hist_kws.pop("color", color)
  2024. ax.hist(a, bins, orientation=orientation,
  2025. color=hist_color, **hist_kws)
  2026. if hist_color != color:
  2027. hist_kws["color"] = hist_color
  2028. axis = "y" if vertical else "x"
  2029. if kde:
  2030. kde_color = kde_kws.pop("color", color)
  2031. kdeplot(**{axis: a}, ax=ax, color=kde_color, **kde_kws)
  2032. if kde_color != color:
  2033. kde_kws["color"] = kde_color
  2034. if rug:
  2035. rug_color = rug_kws.pop("color", color)
  2036. rugplot(**{axis: a}, ax=ax, color=rug_color, **rug_kws)
  2037. if rug_color != color:
  2038. rug_kws["color"] = rug_color
  2039. if fit is not None:
  2040. def pdf(x):
  2041. return fit.pdf(x, *params)
  2042. fit_color = fit_kws.pop("color", "#282828")
  2043. gridsize = fit_kws.pop("gridsize", 200)
  2044. cut = fit_kws.pop("cut", 3)
  2045. clip = fit_kws.pop("clip", (-np.inf, np.inf))
  2046. bw = gaussian_kde(a).scotts_factor() * a.std(ddof=1)
  2047. x = _kde_support(a, bw, gridsize, cut, clip)
  2048. params = fit.fit(a)
  2049. y = pdf(x)
  2050. if vertical:
  2051. x, y = y, x
  2052. ax.plot(x, y, color=fit_color, **fit_kws)
  2053. if fit_color != "#282828":
  2054. fit_kws["color"] = fit_color
  2055. if label_ax:
  2056. if vertical:
  2057. ax.set_ylabel(axlabel)
  2058. else:
  2059. ax.set_xlabel(axlabel)
  2060. return ax