123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550 |
- from __future__ import annotations
- from typing import (
- TYPE_CHECKING,
- Collection,
- Literal,
- NamedTuple,
- )
- import warnings
- from matplotlib.artist import setp
- import numpy as np
- from pandas._typing import MatplotlibColor
- from pandas.util._exceptions import find_stack_level
- from pandas.core.dtypes.common import is_dict_like
- from pandas.core.dtypes.missing import remove_na_arraylike
- import pandas as pd
- import pandas.core.common as com
- from pandas.io.formats.printing import pprint_thing
- from pandas.plotting._matplotlib.core import (
- LinePlot,
- MPLPlot,
- )
- from pandas.plotting._matplotlib.groupby import create_iter_data_given_by
- from pandas.plotting._matplotlib.style import get_standard_colors
- from pandas.plotting._matplotlib.tools import (
- create_subplots,
- flatten_axes,
- maybe_adjust_figure,
- )
- if TYPE_CHECKING:
- from matplotlib.axes import Axes
- from matplotlib.lines import Line2D
- class BoxPlot(LinePlot):
- @property
- def _kind(self) -> Literal["box"]:
- return "box"
- _layout_type = "horizontal"
- _valid_return_types = (None, "axes", "dict", "both")
- class BP(NamedTuple):
- # namedtuple to hold results
- ax: Axes
- lines: dict[str, list[Line2D]]
- def __init__(self, data, return_type: str = "axes", **kwargs) -> None:
- if return_type not in self._valid_return_types:
- raise ValueError("return_type must be {None, 'axes', 'dict', 'both'}")
- self.return_type = return_type
- # Do not call LinePlot.__init__ which may fill nan
- MPLPlot.__init__(self, data, **kwargs) # pylint: disable=non-parent-init-called
- def _args_adjust(self) -> None:
- if self.subplots:
- # Disable label ax sharing. Otherwise, all subplots shows last
- # column label
- if self.orientation == "vertical":
- self.sharex = False
- else:
- self.sharey = False
- # error: Signature of "_plot" incompatible with supertype "MPLPlot"
- @classmethod
- def _plot( # type: ignore[override]
- cls, ax, y, column_num=None, return_type: str = "axes", **kwds
- ):
- if y.ndim == 2:
- y = [remove_na_arraylike(v) for v in y]
- # Boxplot fails with empty arrays, so need to add a NaN
- # if any cols are empty
- # GH 8181
- y = [v if v.size > 0 else np.array([np.nan]) for v in y]
- else:
- y = remove_na_arraylike(y)
- bp = ax.boxplot(y, **kwds)
- if return_type == "dict":
- return bp, bp
- elif return_type == "both":
- return cls.BP(ax=ax, lines=bp), bp
- else:
- return ax, bp
- def _validate_color_args(self):
- if "color" in self.kwds:
- if self.colormap is not None:
- warnings.warn(
- "'color' and 'colormap' cannot be used "
- "simultaneously. Using 'color'",
- stacklevel=find_stack_level(),
- )
- self.color = self.kwds.pop("color")
- if isinstance(self.color, dict):
- valid_keys = ["boxes", "whiskers", "medians", "caps"]
- for key in self.color:
- if key not in valid_keys:
- raise ValueError(
- f"color dict contains invalid key '{key}'. "
- f"The key must be either {valid_keys}"
- )
- else:
- self.color = None
- # get standard colors for default
- colors = get_standard_colors(num_colors=3, colormap=self.colormap, color=None)
- # use 2 colors by default, for box/whisker and median
- # flier colors isn't needed here
- # because it can be specified by ``sym`` kw
- self._boxes_c = colors[0]
- self._whiskers_c = colors[0]
- self._medians_c = colors[2]
- self._caps_c = colors[0]
- def _get_colors(
- self,
- num_colors=None,
- color_kwds: dict[str, MatplotlibColor]
- | MatplotlibColor
- | Collection[MatplotlibColor]
- | None = "color",
- ) -> None:
- pass
- def maybe_color_bp(self, bp) -> None:
- if isinstance(self.color, dict):
- boxes = self.color.get("boxes", self._boxes_c)
- whiskers = self.color.get("whiskers", self._whiskers_c)
- medians = self.color.get("medians", self._medians_c)
- caps = self.color.get("caps", self._caps_c)
- else:
- # Other types are forwarded to matplotlib
- # If None, use default colors
- boxes = self.color or self._boxes_c
- whiskers = self.color or self._whiskers_c
- medians = self.color or self._medians_c
- caps = self.color or self._caps_c
- # GH 30346, when users specifying those arguments explicitly, our defaults
- # for these four kwargs should be overridden; if not, use Pandas settings
- if not self.kwds.get("boxprops"):
- setp(bp["boxes"], color=boxes, alpha=1)
- if not self.kwds.get("whiskerprops"):
- setp(bp["whiskers"], color=whiskers, alpha=1)
- if not self.kwds.get("medianprops"):
- setp(bp["medians"], color=medians, alpha=1)
- if not self.kwds.get("capprops"):
- setp(bp["caps"], color=caps, alpha=1)
- def _make_plot(self) -> None:
- if self.subplots:
- self._return_obj = pd.Series(dtype=object)
- # Re-create iterated data if `by` is assigned by users
- data = (
- create_iter_data_given_by(self.data, self._kind)
- if self.by is not None
- else self.data
- )
- for i, (label, y) in enumerate(self._iter_data(data=data)):
- ax = self._get_ax(i)
- kwds = self.kwds.copy()
- # When by is applied, show title for subplots to know which group it is
- # just like df.boxplot, and need to apply T on y to provide right input
- if self.by is not None:
- y = y.T
- ax.set_title(pprint_thing(label))
- # When `by` is assigned, the ticklabels will become unique grouped
- # values, instead of label which is used as subtitle in this case.
- ticklabels = [
- pprint_thing(col) for col in self.data.columns.levels[0]
- ]
- else:
- ticklabels = [pprint_thing(label)]
- ret, bp = self._plot(
- ax, y, column_num=i, return_type=self.return_type, **kwds
- )
- self.maybe_color_bp(bp)
- self._return_obj[label] = ret
- self._set_ticklabels(ax, ticklabels)
- else:
- y = self.data.values.T
- ax = self._get_ax(0)
- kwds = self.kwds.copy()
- ret, bp = self._plot(
- ax, y, column_num=0, return_type=self.return_type, **kwds
- )
- self.maybe_color_bp(bp)
- self._return_obj = ret
- labels = [left for left, _ in self._iter_data()]
- labels = [pprint_thing(left) for left in labels]
- if not self.use_index:
- labels = [pprint_thing(key) for key in range(len(labels))]
- self._set_ticklabels(ax, labels)
- def _set_ticklabels(self, ax: Axes, labels) -> None:
- if self.orientation == "vertical":
- ax.set_xticklabels(labels)
- else:
- ax.set_yticklabels(labels)
- def _make_legend(self) -> None:
- pass
- def _post_plot_logic(self, ax, data) -> None:
- # GH 45465: make sure that the boxplot doesn't ignore xlabel/ylabel
- if self.xlabel:
- ax.set_xlabel(pprint_thing(self.xlabel))
- if self.ylabel:
- ax.set_ylabel(pprint_thing(self.ylabel))
- @property
- def orientation(self) -> Literal["horizontal", "vertical"]:
- if self.kwds.get("vert", True):
- return "vertical"
- else:
- return "horizontal"
- @property
- def result(self):
- if self.return_type is None:
- return super().result
- else:
- return self._return_obj
- def _grouped_plot_by_column(
- plotf,
- data,
- columns=None,
- by=None,
- numeric_only: bool = True,
- grid: bool = False,
- figsize=None,
- ax=None,
- layout=None,
- return_type=None,
- **kwargs,
- ):
- grouped = data.groupby(by)
- if columns is None:
- if not isinstance(by, (list, tuple)):
- by = [by]
- columns = data._get_numeric_data().columns.difference(by)
- naxes = len(columns)
- fig, axes = create_subplots(
- naxes=naxes,
- sharex=kwargs.pop("sharex", True),
- sharey=kwargs.pop("sharey", True),
- figsize=figsize,
- ax=ax,
- layout=layout,
- )
- _axes = flatten_axes(axes)
- # GH 45465: move the "by" label based on "vert"
- xlabel, ylabel = kwargs.pop("xlabel", None), kwargs.pop("ylabel", None)
- if kwargs.get("vert", True):
- xlabel = xlabel or by
- else:
- ylabel = ylabel or by
- ax_values = []
- for i, col in enumerate(columns):
- ax = _axes[i]
- gp_col = grouped[col]
- keys, values = zip(*gp_col)
- re_plotf = plotf(keys, values, ax, xlabel=xlabel, ylabel=ylabel, **kwargs)
- ax.set_title(col)
- ax_values.append(re_plotf)
- ax.grid(grid)
- result = pd.Series(ax_values, index=columns, copy=False)
- # Return axes in multiplot case, maybe revisit later # 985
- if return_type is None:
- result = axes
- byline = by[0] if len(by) == 1 else by
- fig.suptitle(f"Boxplot grouped by {byline}")
- maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)
- return result
- def boxplot(
- data,
- column=None,
- by=None,
- ax=None,
- fontsize=None,
- rot: int = 0,
- grid: bool = True,
- figsize=None,
- layout=None,
- return_type=None,
- **kwds,
- ):
- import matplotlib.pyplot as plt
- # validate return_type:
- if return_type not in BoxPlot._valid_return_types:
- raise ValueError("return_type must be {'axes', 'dict', 'both'}")
- if isinstance(data, pd.Series):
- data = data.to_frame("x")
- column = "x"
- def _get_colors():
- # num_colors=3 is required as method maybe_color_bp takes the colors
- # in positions 0 and 2.
- # if colors not provided, use same defaults as DataFrame.plot.box
- result = get_standard_colors(num_colors=3)
- result = np.take(result, [0, 0, 2])
- result = np.append(result, "k")
- colors = kwds.pop("color", None)
- if colors:
- if is_dict_like(colors):
- # replace colors in result array with user-specified colors
- # taken from the colors dict parameter
- # "boxes" value placed in position 0, "whiskers" in 1, etc.
- valid_keys = ["boxes", "whiskers", "medians", "caps"]
- key_to_index = dict(zip(valid_keys, range(4)))
- for key, value in colors.items():
- if key in valid_keys:
- result[key_to_index[key]] = value
- else:
- raise ValueError(
- f"color dict contains invalid key '{key}'. "
- f"The key must be either {valid_keys}"
- )
- else:
- result.fill(colors)
- return result
- def maybe_color_bp(bp, **kwds) -> None:
- # GH 30346, when users specifying those arguments explicitly, our defaults
- # for these four kwargs should be overridden; if not, use Pandas settings
- if not kwds.get("boxprops"):
- setp(bp["boxes"], color=colors[0], alpha=1)
- if not kwds.get("whiskerprops"):
- setp(bp["whiskers"], color=colors[1], alpha=1)
- if not kwds.get("medianprops"):
- setp(bp["medians"], color=colors[2], alpha=1)
- if not kwds.get("capprops"):
- setp(bp["caps"], color=colors[3], alpha=1)
- def plot_group(keys, values, ax: Axes, **kwds):
- # GH 45465: xlabel/ylabel need to be popped out before plotting happens
- xlabel, ylabel = kwds.pop("xlabel", None), kwds.pop("ylabel", None)
- if xlabel:
- ax.set_xlabel(pprint_thing(xlabel))
- if ylabel:
- ax.set_ylabel(pprint_thing(ylabel))
- keys = [pprint_thing(x) for x in keys]
- values = [np.asarray(remove_na_arraylike(v), dtype=object) for v in values]
- bp = ax.boxplot(values, **kwds)
- if fontsize is not None:
- ax.tick_params(axis="both", labelsize=fontsize)
- # GH 45465: x/y are flipped when "vert" changes
- is_vertical = kwds.get("vert", True)
- ticks = ax.get_xticks() if is_vertical else ax.get_yticks()
- if len(ticks) != len(keys):
- i, remainder = divmod(len(ticks), len(keys))
- assert remainder == 0, remainder
- keys *= i
- if is_vertical:
- ax.set_xticklabels(keys, rotation=rot)
- else:
- ax.set_yticklabels(keys, rotation=rot)
- maybe_color_bp(bp, **kwds)
- # Return axes in multiplot case, maybe revisit later # 985
- if return_type == "dict":
- return bp
- elif return_type == "both":
- return BoxPlot.BP(ax=ax, lines=bp)
- else:
- return ax
- colors = _get_colors()
- if column is None:
- columns = None
- else:
- if isinstance(column, (list, tuple)):
- columns = column
- else:
- columns = [column]
- if by is not None:
- # Prefer array return type for 2-D plots to match the subplot layout
- # https://github.com/pandas-dev/pandas/pull/12216#issuecomment-241175580
- result = _grouped_plot_by_column(
- plot_group,
- data,
- columns=columns,
- by=by,
- grid=grid,
- figsize=figsize,
- ax=ax,
- layout=layout,
- return_type=return_type,
- **kwds,
- )
- else:
- if return_type is None:
- return_type = "axes"
- if layout is not None:
- raise ValueError("The 'layout' keyword is not supported when 'by' is None")
- if ax is None:
- rc = {"figure.figsize": figsize} if figsize is not None else {}
- with plt.rc_context(rc):
- ax = plt.gca()
- data = data._get_numeric_data()
- naxes = len(data.columns)
- if naxes == 0:
- raise ValueError(
- "boxplot method requires numerical columns, nothing to plot."
- )
- if columns is None:
- columns = data.columns
- else:
- data = data[columns]
- result = plot_group(columns, data.values.T, ax, **kwds)
- ax.grid(grid)
- return result
- def boxplot_frame(
- self,
- column=None,
- by=None,
- ax=None,
- fontsize=None,
- rot: int = 0,
- grid: bool = True,
- figsize=None,
- layout=None,
- return_type=None,
- **kwds,
- ):
- import matplotlib.pyplot as plt
- ax = boxplot(
- self,
- column=column,
- by=by,
- ax=ax,
- fontsize=fontsize,
- grid=grid,
- rot=rot,
- figsize=figsize,
- layout=layout,
- return_type=return_type,
- **kwds,
- )
- plt.draw_if_interactive()
- return ax
- def boxplot_frame_groupby(
- grouped,
- subplots: bool = True,
- column=None,
- fontsize=None,
- rot: int = 0,
- grid: bool = True,
- ax=None,
- figsize=None,
- layout=None,
- sharex: bool = False,
- sharey: bool = True,
- **kwds,
- ):
- if subplots is True:
- naxes = len(grouped)
- fig, axes = create_subplots(
- naxes=naxes,
- squeeze=False,
- ax=ax,
- sharex=sharex,
- sharey=sharey,
- figsize=figsize,
- layout=layout,
- )
- axes = flatten_axes(axes)
- ret = pd.Series(dtype=object)
- for (key, group), ax in zip(grouped, axes):
- d = group.boxplot(
- ax=ax, column=column, fontsize=fontsize, rot=rot, grid=grid, **kwds
- )
- ax.set_title(pprint_thing(key))
- ret.loc[key] = d
- maybe_adjust_figure(fig, bottom=0.15, top=0.9, left=0.1, right=0.9, wspace=0.2)
- else:
- keys, frames = zip(*grouped)
- if grouped.axis == 0:
- df = pd.concat(frames, keys=keys, axis=1)
- else:
- if len(frames) > 1:
- df = frames[0].join(frames[1::])
- else:
- df = frames[0]
- # GH 16748, DataFrameGroupby fails when subplots=False and `column` argument
- # is assigned, and in this case, since `df` here becomes MI after groupby,
- # so we need to couple the keys (grouped values) and column (original df
- # column) together to search for subset to plot
- if column is not None:
- column = com.convert_to_list_like(column)
- multi_key = pd.MultiIndex.from_product([keys, column])
- column = list(multi_key.values)
- ret = df.boxplot(
- column=column,
- fontsize=fontsize,
- rot=rot,
- grid=grid,
- ax=ax,
- figsize=figsize,
- layout=layout,
- **kwds,
- )
- return ret
|