12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775 |
- from __future__ import annotations
- import warnings
- import itertools
- from copy import copy
- from collections import UserString
- from collections.abc import Iterable, Sequence, Mapping
- from numbers import Number
- from datetime import datetime
- import numpy as np
- import pandas as pd
- import matplotlib as mpl
- from seaborn._core.data import PlotData
- from seaborn.palettes import (
- QUAL_PALETTES,
- color_palette,
- )
- from seaborn.utils import (
- _check_argument,
- _version_predates,
- desaturate,
- locator_to_legend_entries,
- get_color_cycle,
- remove_na,
- )
- class SemanticMapping:
- """Base class for mapping data values to plot attributes."""
- # -- Default attributes that all SemanticMapping subclasses must set
- # Whether the mapping is numeric, categorical, or datetime
- map_type: str | None = None
- # Ordered list of unique values in the input data
- levels = None
- # A mapping from the data values to corresponding plot attributes
- lookup_table = None
- def __init__(self, plotter):
- # TODO Putting this here so we can continue to use a lot of the
- # logic that's built into the library, but the idea of this class
- # is to move towards semantic mappings that are agnostic about the
- # kind of plot they're going to be used to draw.
- # Fully achieving that is going to take some thinking.
- self.plotter = plotter
- def _check_list_length(self, levels, values, variable):
- """Input check when values are provided as a list."""
- # Copied from _core/properties; eventually will be replaced for that.
- message = ""
- if len(levels) > len(values):
- message = " ".join([
- f"\nThe {variable} list has fewer values ({len(values)})",
- f"than needed ({len(levels)}) and will cycle, which may",
- "produce an uninterpretable plot."
- ])
- values = [x for _, x in zip(levels, itertools.cycle(values))]
- elif len(values) > len(levels):
- message = " ".join([
- f"The {variable} list has more values ({len(values)})",
- f"than needed ({len(levels)}), which may not be intended.",
- ])
- values = values[:len(levels)]
- if message:
- warnings.warn(message, UserWarning, stacklevel=6)
- return values
- def _lookup_single(self, key):
- """Apply the mapping to a single data value."""
- return self.lookup_table[key]
- def __call__(self, key, *args, **kwargs):
- """Get the attribute(s) values for the data key."""
- if isinstance(key, (list, np.ndarray, pd.Series)):
- return [self._lookup_single(k, *args, **kwargs) for k in key]
- else:
- return self._lookup_single(key, *args, **kwargs)
- class HueMapping(SemanticMapping):
- """Mapping that sets artist colors according to data values."""
- # A specification of the colors that should appear in the plot
- palette = None
- # An object that normalizes data values to [0, 1] range for color mapping
- norm = None
- # A continuous colormap object for interpolating in a numeric context
- cmap = None
- def __init__(
- self, plotter, palette=None, order=None, norm=None, saturation=1,
- ):
- """Map the levels of the `hue` variable to distinct colors.
- Parameters
- ----------
- # TODO add generic parameters
- """
- super().__init__(plotter)
- data = plotter.plot_data.get("hue", pd.Series(dtype=float))
- if isinstance(palette, np.ndarray):
- msg = (
- "Numpy array is not a supported type for `palette`. "
- "Please convert your palette to a list. "
- "This will become an error in v0.14"
- )
- warnings.warn(msg, stacklevel=4)
- palette = palette.tolist()
- if data.isna().all():
- if palette is not None:
- msg = "Ignoring `palette` because no `hue` variable has been assigned."
- warnings.warn(msg, stacklevel=4)
- else:
- map_type = self.infer_map_type(
- palette, norm, plotter.input_format, plotter.var_types["hue"]
- )
- # Our goal is to end up with a dictionary mapping every unique
- # value in `data` to a color. We will also keep track of the
- # metadata about this mapping we will need for, e.g., a legend
- # --- Option 1: numeric mapping with a matplotlib colormap
- if map_type == "numeric":
- data = pd.to_numeric(data)
- levels, lookup_table, norm, cmap = self.numeric_mapping(
- data, palette, norm,
- )
- # --- Option 2: categorical mapping using seaborn palette
- elif map_type == "categorical":
- cmap = norm = None
- levels, lookup_table = self.categorical_mapping(
- data, palette, order,
- )
- # --- Option 3: datetime mapping
- else:
- # TODO this needs actual implementation
- cmap = norm = None
- levels, lookup_table = self.categorical_mapping(
- # Casting data to list to handle differences in the way
- # pandas and numpy represent datetime64 data
- list(data), palette, order,
- )
- self.saturation = saturation
- self.map_type = map_type
- self.lookup_table = lookup_table
- self.palette = palette
- self.levels = levels
- self.norm = norm
- self.cmap = cmap
- def _lookup_single(self, key):
- """Get the color for a single value, using colormap to interpolate."""
- try:
- # Use a value that's in the original data vector
- value = self.lookup_table[key]
- except KeyError:
- if self.norm is None:
- # Currently we only get here in scatterplot with hue_order,
- # because scatterplot does not consider hue a grouping variable
- # So unused hue levels are in the data, but not the lookup table
- return (0, 0, 0, 0)
- # Use the colormap to interpolate between existing datapoints
- # (e.g. in the context of making a continuous legend)
- try:
- normed = self.norm(key)
- except TypeError as err:
- if np.isnan(key):
- value = (0, 0, 0, 0)
- else:
- raise err
- else:
- if np.ma.is_masked(normed):
- normed = np.nan
- value = self.cmap(normed)
- if self.saturation < 1:
- value = desaturate(value, self.saturation)
- return value
- def infer_map_type(self, palette, norm, input_format, var_type):
- """Determine how to implement the mapping."""
- if palette in QUAL_PALETTES:
- map_type = "categorical"
- elif norm is not None:
- map_type = "numeric"
- elif isinstance(palette, (dict, list)):
- map_type = "categorical"
- elif input_format == "wide":
- map_type = "categorical"
- else:
- map_type = var_type
- return map_type
- def categorical_mapping(self, data, palette, order):
- """Determine colors when the hue mapping is categorical."""
- # -- Identify the order and name of the levels
- levels = categorical_order(data, order)
- n_colors = len(levels)
- # -- Identify the set of colors to use
- if isinstance(palette, dict):
- missing = set(levels) - set(palette)
- if any(missing):
- err = "The palette dictionary is missing keys: {}"
- raise ValueError(err.format(missing))
- lookup_table = palette
- else:
- if palette is None:
- if n_colors <= len(get_color_cycle()):
- colors = color_palette(None, n_colors)
- else:
- colors = color_palette("husl", n_colors)
- elif isinstance(palette, list):
- colors = self._check_list_length(levels, palette, "palette")
- else:
- colors = color_palette(palette, n_colors)
- lookup_table = dict(zip(levels, colors))
- return levels, lookup_table
- def numeric_mapping(self, data, palette, norm):
- """Determine colors when the hue variable is quantitative."""
- if isinstance(palette, dict):
- # The presence of a norm object overrides a dictionary of hues
- # in specifying a numeric mapping, so we need to process it here.
- levels = list(sorted(palette))
- colors = [palette[k] for k in sorted(palette)]
- cmap = mpl.colors.ListedColormap(colors)
- lookup_table = palette.copy()
- else:
- # The levels are the sorted unique values in the data
- levels = list(np.sort(remove_na(data.unique())))
- # --- Sort out the colormap to use from the palette argument
- # Default numeric palette is our default cubehelix palette
- # TODO do we want to do something complicated to ensure contrast?
- palette = "ch:" if palette is None else palette
- if isinstance(palette, mpl.colors.Colormap):
- cmap = palette
- else:
- cmap = color_palette(palette, as_cmap=True)
- # Now sort out the data normalization
- if norm is None:
- norm = mpl.colors.Normalize()
- elif isinstance(norm, tuple):
- norm = mpl.colors.Normalize(*norm)
- elif not isinstance(norm, mpl.colors.Normalize):
- err = "``hue_norm`` must be None, tuple, or Normalize object."
- raise ValueError(err)
- if not norm.scaled():
- norm(np.asarray(data.dropna()))
- lookup_table = dict(zip(levels, cmap(norm(levels))))
- return levels, lookup_table, norm, cmap
- class SizeMapping(SemanticMapping):
- """Mapping that sets artist sizes according to data values."""
- # An object that normalizes data values to [0, 1] range
- norm = None
- def __init__(
- self, plotter, sizes=None, order=None, norm=None,
- ):
- """Map the levels of the `size` variable to distinct values.
- Parameters
- ----------
- # TODO add generic parameters
- """
- super().__init__(plotter)
- data = plotter.plot_data.get("size", pd.Series(dtype=float))
- if data.notna().any():
- map_type = self.infer_map_type(
- norm, sizes, plotter.var_types["size"]
- )
- # --- Option 1: numeric mapping
- if map_type == "numeric":
- levels, lookup_table, norm, size_range = self.numeric_mapping(
- data, sizes, norm,
- )
- # --- Option 2: categorical mapping
- elif map_type == "categorical":
- levels, lookup_table = self.categorical_mapping(
- data, sizes, order,
- )
- size_range = None
- # --- Option 3: datetime mapping
- # TODO this needs an actual implementation
- else:
- levels, lookup_table = self.categorical_mapping(
- # Casting data to list to handle differences in the way
- # pandas and numpy represent datetime64 data
- list(data), sizes, order,
- )
- size_range = None
- self.map_type = map_type
- self.levels = levels
- self.norm = norm
- self.sizes = sizes
- self.size_range = size_range
- self.lookup_table = lookup_table
- def infer_map_type(self, norm, sizes, var_type):
- if norm is not None:
- map_type = "numeric"
- elif isinstance(sizes, (dict, list)):
- map_type = "categorical"
- else:
- map_type = var_type
- return map_type
- def _lookup_single(self, key):
- try:
- value = self.lookup_table[key]
- except KeyError:
- normed = self.norm(key)
- if np.ma.is_masked(normed):
- normed = np.nan
- value = self.size_range[0] + normed * np.ptp(self.size_range)
- return value
- def categorical_mapping(self, data, sizes, order):
- levels = categorical_order(data, order)
- if isinstance(sizes, dict):
- # Dict inputs map existing data values to the size attribute
- missing = set(levels) - set(sizes)
- if any(missing):
- err = f"Missing sizes for the following levels: {missing}"
- raise ValueError(err)
- lookup_table = sizes.copy()
- elif isinstance(sizes, list):
- # List inputs give size values in the same order as the levels
- sizes = self._check_list_length(levels, sizes, "sizes")
- lookup_table = dict(zip(levels, sizes))
- else:
- if isinstance(sizes, tuple):
- # Tuple input sets the min, max size values
- if len(sizes) != 2:
- err = "A `sizes` tuple must have only 2 values"
- raise ValueError(err)
- elif sizes is not None:
- err = f"Value for `sizes` not understood: {sizes}"
- raise ValueError(err)
- else:
- # Otherwise, we need to get the min, max size values from
- # the plotter object we are attached to.
- # TODO this is going to cause us trouble later, because we
- # want to restructure things so that the plotter is generic
- # across the visual representation of the data. But at this
- # point, we don't know the visual representation. Likely we
- # want to change the logic of this Mapping so that it gives
- # points on a normalized range that then gets un-normalized
- # when we know what we're drawing. But given the way the
- # package works now, this way is cleanest.
- sizes = self.plotter._default_size_range
- # For categorical sizes, use regularly-spaced linear steps
- # between the minimum and maximum sizes. Then reverse the
- # ramp so that the largest value is used for the first entry
- # in size_order, etc. This is because "ordered" categories
- # are often though to go in decreasing priority.
- sizes = np.linspace(*sizes, len(levels))[::-1]
- lookup_table = dict(zip(levels, sizes))
- return levels, lookup_table
- def numeric_mapping(self, data, sizes, norm):
- if isinstance(sizes, dict):
- # The presence of a norm object overrides a dictionary of sizes
- # in specifying a numeric mapping, so we need to process it
- # dictionary here
- levels = list(np.sort(list(sizes)))
- size_values = sizes.values()
- size_range = min(size_values), max(size_values)
- else:
- # The levels here will be the unique values in the data
- levels = list(np.sort(remove_na(data.unique())))
- if isinstance(sizes, tuple):
- # For numeric inputs, the size can be parametrized by
- # the minimum and maximum artist values to map to. The
- # norm object that gets set up next specifies how to
- # do the mapping.
- if len(sizes) != 2:
- err = "A `sizes` tuple must have only 2 values"
- raise ValueError(err)
- size_range = sizes
- elif sizes is not None:
- err = f"Value for `sizes` not understood: {sizes}"
- raise ValueError(err)
- else:
- # When not provided, we get the size range from the plotter
- # object we are attached to. See the note in the categorical
- # method about how this is suboptimal for future development.
- size_range = self.plotter._default_size_range
- # Now that we know the minimum and maximum sizes that will get drawn,
- # we need to map the data values that we have into that range. We will
- # use a matplotlib Normalize class, which is typically used for numeric
- # color mapping but works fine here too. It takes data values and maps
- # them into a [0, 1] interval, potentially nonlinear-ly.
- if norm is None:
- # Default is a linear function between the min and max data values
- norm = mpl.colors.Normalize()
- elif isinstance(norm, tuple):
- # It is also possible to give different limits in data space
- norm = mpl.colors.Normalize(*norm)
- elif not isinstance(norm, mpl.colors.Normalize):
- err = f"Value for size `norm` parameter not understood: {norm}"
- raise ValueError(err)
- else:
- # If provided with Normalize object, copy it so we can modify
- norm = copy(norm)
- # Set the mapping so all output values are in [0, 1]
- norm.clip = True
- # If the input range is not set, use the full range of the data
- if not norm.scaled():
- norm(levels)
- # Map from data values to [0, 1] range
- sizes_scaled = norm(levels)
- # Now map from the scaled range into the artist units
- if isinstance(sizes, dict):
- lookup_table = sizes
- else:
- lo, hi = size_range
- sizes = lo + sizes_scaled * (hi - lo)
- lookup_table = dict(zip(levels, sizes))
- return levels, lookup_table, norm, size_range
- class StyleMapping(SemanticMapping):
- """Mapping that sets artist style according to data values."""
- # Style mapping is always treated as categorical
- map_type = "categorical"
- def __init__(self, plotter, markers=None, dashes=None, order=None):
- """Map the levels of the `style` variable to distinct values.
- Parameters
- ----------
- # TODO add generic parameters
- """
- super().__init__(plotter)
- data = plotter.plot_data.get("style", pd.Series(dtype=float))
- if data.notna().any():
- # Cast to list to handle numpy/pandas datetime quirks
- if variable_type(data) == "datetime":
- data = list(data)
- # Find ordered unique values
- levels = categorical_order(data, order)
- markers = self._map_attributes(
- markers, levels, unique_markers(len(levels)), "markers",
- )
- dashes = self._map_attributes(
- dashes, levels, unique_dashes(len(levels)), "dashes",
- )
- # Build the paths matplotlib will use to draw the markers
- paths = {}
- filled_markers = []
- for k, m in markers.items():
- if not isinstance(m, mpl.markers.MarkerStyle):
- m = mpl.markers.MarkerStyle(m)
- paths[k] = m.get_path().transformed(m.get_transform())
- filled_markers.append(m.is_filled())
- # Mixture of filled and unfilled markers will show line art markers
- # in the edge color, which defaults to white. This can be handled,
- # but there would be additional complexity with specifying the
- # weight of the line art markers without overwhelming the filled
- # ones with the edges. So for now, we will disallow mixtures.
- if any(filled_markers) and not all(filled_markers):
- err = "Filled and line art markers cannot be mixed"
- raise ValueError(err)
- lookup_table = {}
- for key in levels:
- lookup_table[key] = {}
- if markers:
- lookup_table[key]["marker"] = markers[key]
- lookup_table[key]["path"] = paths[key]
- if dashes:
- lookup_table[key]["dashes"] = dashes[key]
- self.levels = levels
- self.lookup_table = lookup_table
- def _lookup_single(self, key, attr=None):
- """Get attribute(s) for a given data point."""
- if attr is None:
- value = self.lookup_table[key]
- else:
- value = self.lookup_table[key][attr]
- return value
- def _map_attributes(self, arg, levels, defaults, attr):
- """Handle the specification for a given style attribute."""
- if arg is True:
- lookup_table = dict(zip(levels, defaults))
- elif isinstance(arg, dict):
- missing = set(levels) - set(arg)
- if missing:
- err = f"These `{attr}` levels are missing values: {missing}"
- raise ValueError(err)
- lookup_table = arg
- elif isinstance(arg, Sequence):
- arg = self._check_list_length(levels, arg, attr)
- lookup_table = dict(zip(levels, arg))
- elif arg:
- err = f"This `{attr}` argument was not understood: {arg}"
- raise ValueError(err)
- else:
- lookup_table = {}
- return lookup_table
- # =========================================================================== #
- class VectorPlotter:
- """Base class for objects underlying *plot functions."""
- wide_structure = {
- "x": "@index", "y": "@values", "hue": "@columns", "style": "@columns",
- }
- flat_structure = {"x": "@index", "y": "@values"}
- _default_size_range = 1, 2 # Unused but needed in tests, ugh
- def __init__(self, data=None, variables={}):
- self._var_levels = {}
- # var_ordered is relevant only for categorical axis variables, and may
- # be better handled by an internal axis information object that tracks
- # such information and is set up by the scale_* methods. The analogous
- # information for numeric axes would be information about log scales.
- self._var_ordered = {"x": False, "y": False} # alt., used DefaultDict
- self.assign_variables(data, variables)
- # TODO Lots of tests assume that these are called to initialize the
- # mappings to default values on class initialization. I'd prefer to
- # move away from that and only have a mapping when explicitly called.
- for var in ["hue", "size", "style"]:
- if var in variables:
- getattr(self, f"map_{var}")()
- @property
- def has_xy_data(self):
- """Return True at least one of x or y is defined."""
- return bool({"x", "y"} & set(self.variables))
- @property
- def var_levels(self):
- """Property interface to ordered list of variables levels.
- Each time it's accessed, it updates the var_levels dictionary with the
- list of levels in the current semantic mappers. But it also allows the
- dictionary to persist, so it can be used to set levels by a key. This is
- used to track the list of col/row levels using an attached FacetGrid
- object, but it's kind of messy and ideally fixed by improving the
- faceting logic so it interfaces better with the modern approach to
- tracking plot variables.
- """
- for var in self.variables:
- if (map_obj := getattr(self, f"_{var}_map", None)) is not None:
- self._var_levels[var] = map_obj.levels
- return self._var_levels
- def assign_variables(self, data=None, variables={}):
- """Define plot variables, optionally using lookup from `data`."""
- x = variables.get("x", None)
- y = variables.get("y", None)
- if x is None and y is None:
- self.input_format = "wide"
- frame, names = self._assign_variables_wideform(data, **variables)
- else:
- # When dealing with long-form input, use the newer PlotData
- # object (internal but introduced for the objects interface)
- # to centralize / standardize data consumption logic.
- self.input_format = "long"
- plot_data = PlotData(data, variables)
- frame = plot_data.frame
- names = plot_data.names
- self.plot_data = frame
- self.variables = names
- self.var_types = {
- v: variable_type(
- frame[v],
- boolean_type="numeric" if v in "xy" else "categorical"
- )
- for v in names
- }
- return self
- def _assign_variables_wideform(self, data=None, **kwargs):
- """Define plot variables given wide-form data.
- Parameters
- ----------
- data : flat vector or collection of vectors
- Data can be a vector or mapping that is coerceable to a Series
- or a sequence- or mapping-based collection of such vectors, or a
- rectangular numpy array, or a Pandas DataFrame.
- kwargs : variable -> data mappings
- Behavior with keyword arguments is currently undefined.
- Returns
- -------
- plot_data : :class:`pandas.DataFrame`
- Long-form data object mapping seaborn variables (x, y, hue, ...)
- to data vectors.
- variables : dict
- Keys are defined seaborn variables; values are names inferred from
- the inputs (or None when no name can be determined).
- """
- # Raise if semantic or other variables are assigned in wide-form mode
- assigned = [k for k, v in kwargs.items() if v is not None]
- if any(assigned):
- s = "s" if len(assigned) > 1 else ""
- err = f"The following variable{s} cannot be assigned with wide-form data: "
- err += ", ".join(f"`{v}`" for v in assigned)
- raise ValueError(err)
- # Determine if the data object actually has any data in it
- empty = data is None or not len(data)
- # Then, determine if we have "flat" data (a single vector)
- if isinstance(data, dict):
- values = data.values()
- else:
- values = np.atleast_1d(np.asarray(data, dtype=object))
- flat = not any(
- isinstance(v, Iterable) and not isinstance(v, (str, bytes))
- for v in values
- )
- if empty:
- # Make an object with the structure of plot_data, but empty
- plot_data = pd.DataFrame()
- variables = {}
- elif flat:
- # Handle flat data by converting to pandas Series and using the
- # index and/or values to define x and/or y
- # (Could be accomplished with a more general to_series() interface)
- flat_data = pd.Series(data).copy()
- names = {
- "@values": flat_data.name,
- "@index": flat_data.index.name
- }
- plot_data = {}
- variables = {}
- for var in ["x", "y"]:
- if var in self.flat_structure:
- attr = self.flat_structure[var]
- plot_data[var] = getattr(flat_data, attr[1:])
- variables[var] = names[self.flat_structure[var]]
- plot_data = pd.DataFrame(plot_data)
- else:
- # Otherwise assume we have some collection of vectors.
- # Handle Python sequences such that entries end up in the columns,
- # not in the rows, of the intermediate wide DataFrame.
- # One way to accomplish this is to convert to a dict of Series.
- if isinstance(data, Sequence):
- data_dict = {}
- for i, var in enumerate(data):
- key = getattr(var, "name", i)
- # TODO is there a safer/more generic way to ensure Series?
- # sort of like np.asarray, but for pandas?
- data_dict[key] = pd.Series(var)
- data = data_dict
- # Pandas requires that dict values either be Series objects
- # or all have the same length, but we want to allow "ragged" inputs
- if isinstance(data, Mapping):
- data = {key: pd.Series(val) for key, val in data.items()}
- # Otherwise, delegate to the pandas DataFrame constructor
- # This is where we'd prefer to use a general interface that says
- # "give me this data as a pandas DataFrame", so we can accept
- # DataFrame objects from other libraries
- wide_data = pd.DataFrame(data, copy=True)
- # At this point we should reduce the dataframe to numeric cols
- numeric_cols = [
- k for k, v in wide_data.items() if variable_type(v) == "numeric"
- ]
- wide_data = wide_data[numeric_cols]
- # Now melt the data to long form
- melt_kws = {"var_name": "@columns", "value_name": "@values"}
- use_index = "@index" in self.wide_structure.values()
- if use_index:
- melt_kws["id_vars"] = "@index"
- try:
- orig_categories = wide_data.columns.categories
- orig_ordered = wide_data.columns.ordered
- wide_data.columns = wide_data.columns.add_categories("@index")
- except AttributeError:
- category_columns = False
- else:
- category_columns = True
- wide_data["@index"] = wide_data.index.to_series()
- plot_data = wide_data.melt(**melt_kws)
- if use_index and category_columns:
- plot_data["@columns"] = pd.Categorical(plot_data["@columns"],
- orig_categories,
- orig_ordered)
- # Assign names corresponding to plot semantics
- for var, attr in self.wide_structure.items():
- plot_data[var] = plot_data[attr]
- # Define the variable names
- variables = {}
- for var, attr in self.wide_structure.items():
- obj = getattr(wide_data, attr[1:])
- variables[var] = getattr(obj, "name", None)
- # Remove redundant columns from plot_data
- plot_data = plot_data[list(variables)]
- return plot_data, variables
- def map_hue(self, palette=None, order=None, norm=None, saturation=1):
- mapping = HueMapping(self, palette, order, norm, saturation)
- self._hue_map = mapping
- def map_size(self, sizes=None, order=None, norm=None):
- mapping = SizeMapping(self, sizes, order, norm)
- self._size_map = mapping
- def map_style(self, markers=None, dashes=None, order=None):
- mapping = StyleMapping(self, markers, dashes, order)
- self._style_map = mapping
- def iter_data(
- self, grouping_vars=None, *,
- reverse=False, from_comp_data=False,
- by_facet=True, allow_empty=False, dropna=True,
- ):
- """Generator for getting subsets of data defined by semantic variables.
- Also injects "col" and "row" into grouping semantics.
- Parameters
- ----------
- grouping_vars : string or list of strings
- Semantic variables that define the subsets of data.
- reverse : bool
- If True, reverse the order of iteration.
- from_comp_data : bool
- If True, use self.comp_data rather than self.plot_data
- by_facet : bool
- If True, add faceting variables to the set of grouping variables.
- allow_empty : bool
- If True, yield an empty dataframe when no observations exist for
- combinations of grouping variables.
- dropna : bool
- If True, remove rows with missing data.
- Yields
- ------
- sub_vars : dict
- Keys are semantic names, values are the level of that semantic.
- sub_data : :class:`pandas.DataFrame`
- Subset of ``plot_data`` for this combination of semantic values.
- """
- # TODO should this default to using all (non x/y?) semantics?
- # or define grouping vars somewhere?
- if grouping_vars is None:
- grouping_vars = []
- elif isinstance(grouping_vars, str):
- grouping_vars = [grouping_vars]
- elif isinstance(grouping_vars, tuple):
- grouping_vars = list(grouping_vars)
- # Always insert faceting variables
- if by_facet:
- facet_vars = {"col", "row"}
- grouping_vars.extend(
- facet_vars & set(self.variables) - set(grouping_vars)
- )
- # Reduce to the semantics used in this plot
- grouping_vars = [var for var in grouping_vars if var in self.variables]
- if from_comp_data:
- data = self.comp_data
- else:
- data = self.plot_data
- if dropna:
- data = data.dropna()
- levels = self.var_levels.copy()
- if from_comp_data:
- for axis in {"x", "y"} & set(grouping_vars):
- converter = self.converters[axis].iloc[0]
- if self.var_types[axis] == "categorical":
- if self._var_ordered[axis]:
- # If the axis is ordered, then the axes in a possible
- # facet grid are by definition "shared", or there is a
- # single axis with a unique cat -> idx mapping.
- # So we can just take the first converter object.
- levels[axis] = converter.convert_units(levels[axis])
- else:
- # Otherwise, the mappings may not be unique, but we can
- # use the unique set of index values in comp_data.
- levels[axis] = np.sort(data[axis].unique())
- else:
- transform = converter.get_transform().transform
- levels[axis] = transform(converter.convert_units(levels[axis]))
- if grouping_vars:
- grouped_data = data.groupby(
- grouping_vars, sort=False, as_index=False, observed=False,
- )
- grouping_keys = []
- for var in grouping_vars:
- grouping_keys.append(levels.get(var, []))
- iter_keys = itertools.product(*grouping_keys)
- if reverse:
- iter_keys = reversed(list(iter_keys))
- for key in iter_keys:
- # Pandas fails with singleton tuple inputs
- pd_key = key[0] if len(key) == 1 else key
- try:
- data_subset = grouped_data.get_group(pd_key)
- except KeyError:
- # XXX we are adding this to allow backwards compatibility
- # with the empty artists that old categorical plots would
- # add (before 0.12), which we may decide to break, in which
- # case this option could be removed
- data_subset = data.loc[[]]
- if data_subset.empty and not allow_empty:
- continue
- sub_vars = dict(zip(grouping_vars, key))
- yield sub_vars, data_subset.copy()
- else:
- yield {}, data.copy()
- @property
- def comp_data(self):
- """Dataframe with numeric x and y, after unit conversion and log scaling."""
- if not hasattr(self, "ax"):
- # Probably a good idea, but will need a bunch of tests updated
- # Most of these tests should just use the external interface
- # Then this can be re-enabled.
- # raise AttributeError("No Axes attached to plotter")
- return self.plot_data
- if not hasattr(self, "_comp_data"):
- comp_data = (
- self.plot_data
- .copy(deep=False)
- .drop(["x", "y"], axis=1, errors="ignore")
- )
- for var in "yx":
- if var not in self.variables:
- continue
- parts = []
- grouped = self.plot_data[var].groupby(self.converters[var], sort=False)
- for converter, orig in grouped:
- orig = orig.mask(orig.isin([np.inf, -np.inf]), np.nan)
- orig = orig.dropna()
- if var in self.var_levels:
- # TODO this should happen in some centralized location
- # it is similar to GH2419, but more complicated because
- # supporting `order` in categorical plots is tricky
- orig = orig[orig.isin(self.var_levels[var])]
- comp = pd.to_numeric(converter.convert_units(orig)).astype(float)
- transform = converter.get_transform().transform
- parts.append(pd.Series(transform(comp), orig.index, name=orig.name))
- if parts:
- comp_col = pd.concat(parts)
- else:
- comp_col = pd.Series(dtype=float, name=var)
- comp_data.insert(0, var, comp_col)
- self._comp_data = comp_data
- return self._comp_data
- def _get_axes(self, sub_vars):
- """Return an Axes object based on existence of row/col variables."""
- row = sub_vars.get("row", None)
- col = sub_vars.get("col", None)
- if row is not None and col is not None:
- return self.facets.axes_dict[(row, col)]
- elif row is not None:
- return self.facets.axes_dict[row]
- elif col is not None:
- return self.facets.axes_dict[col]
- elif self.ax is None:
- return self.facets.ax
- else:
- return self.ax
- def _attach(
- self,
- obj,
- allowed_types=None,
- log_scale=None,
- ):
- """Associate the plotter with an Axes manager and initialize its units.
- Parameters
- ----------
- obj : :class:`matplotlib.axes.Axes` or :class:'FacetGrid`
- Structural object that we will eventually plot onto.
- allowed_types : str or list of str
- If provided, raise when either the x or y variable does not have
- one of the declared seaborn types.
- log_scale : bool, number, or pair of bools or numbers
- If not False, set the axes to use log scaling, with the given
- base or defaulting to 10. If a tuple, interpreted as separate
- arguments for the x and y axes.
- """
- from .axisgrid import FacetGrid
- if isinstance(obj, FacetGrid):
- self.ax = None
- self.facets = obj
- ax_list = obj.axes.flatten()
- if obj.col_names is not None:
- self.var_levels["col"] = obj.col_names
- if obj.row_names is not None:
- self.var_levels["row"] = obj.row_names
- else:
- self.ax = obj
- self.facets = None
- ax_list = [obj]
- # Identify which "axis" variables we have defined
- axis_variables = set("xy").intersection(self.variables)
- # -- Verify the types of our x and y variables here.
- # This doesn't really make complete sense being here here, but it's a fine
- # place for it, given the current system.
- # (Note that for some plots, there might be more complicated restrictions)
- # e.g. the categorical plots have their own check that as specific to the
- # non-categorical axis.
- if allowed_types is None:
- allowed_types = ["numeric", "datetime", "categorical"]
- elif isinstance(allowed_types, str):
- allowed_types = [allowed_types]
- for var in axis_variables:
- var_type = self.var_types[var]
- if var_type not in allowed_types:
- err = (
- f"The {var} variable is {var_type}, but one of "
- f"{allowed_types} is required"
- )
- raise TypeError(err)
- # -- Get axis objects for each row in plot_data for type conversions and scaling
- facet_dim = {"x": "col", "y": "row"}
- self.converters = {}
- for var in axis_variables:
- other_var = {"x": "y", "y": "x"}[var]
- converter = pd.Series(index=self.plot_data.index, name=var, dtype=object)
- share_state = getattr(self.facets, f"_share{var}", True)
- # Simplest cases are that we have a single axes, all axes are shared,
- # or sharing is only on the orthogonal facet dimension. In these cases,
- # all datapoints get converted the same way, so use the first axis
- if share_state is True or share_state == facet_dim[other_var]:
- converter.loc[:] = getattr(ax_list[0], f"{var}axis")
- else:
- # Next simplest case is when no axes are shared, and we can
- # use the axis objects within each facet
- if share_state is False:
- for axes_vars, axes_data in self.iter_data():
- ax = self._get_axes(axes_vars)
- converter.loc[axes_data.index] = getattr(ax, f"{var}axis")
- # In the more complicated case, the axes are shared within each
- # "file" of the facetgrid. In that case, we need to subset the data
- # for that file and assign it the first axis in the slice of the grid
- else:
- names = getattr(self.facets, f"{share_state}_names")
- for i, level in enumerate(names):
- idx = (i, 0) if share_state == "row" else (0, i)
- axis = getattr(self.facets.axes[idx], f"{var}axis")
- converter.loc[self.plot_data[share_state] == level] = axis
- # Store the converter vector, which we use elsewhere (e.g comp_data)
- self.converters[var] = converter
- # Now actually update the matplotlib objects to do the conversion we want
- grouped = self.plot_data[var].groupby(self.converters[var], sort=False)
- for converter, seed_data in grouped:
- if self.var_types[var] == "categorical":
- if self._var_ordered[var]:
- order = self.var_levels[var]
- else:
- order = None
- seed_data = categorical_order(seed_data, order)
- converter.update_units(seed_data)
- # -- Set numerical axis scales
- # First unpack the log_scale argument
- if log_scale is None:
- scalex = scaley = False
- else:
- # Allow single value or x, y tuple
- try:
- scalex, scaley = log_scale
- except TypeError:
- scalex = log_scale if self.var_types.get("x") == "numeric" else False
- scaley = log_scale if self.var_types.get("y") == "numeric" else False
- # Now use it
- for axis, scale in zip("xy", (scalex, scaley)):
- if scale:
- for ax in ax_list:
- set_scale = getattr(ax, f"set_{axis}scale")
- if scale is True:
- set_scale("log", nonpositive="mask")
- else:
- set_scale("log", base=scale, nonpositive="mask")
- # For categorical y, we want the "first" level to be at the top of the axis
- if self.var_types.get("y", None) == "categorical":
- for ax in ax_list:
- try:
- ax.yaxis.set_inverted(True)
- except AttributeError: # mpl < 3.1
- if not ax.yaxis_inverted():
- ax.invert_yaxis()
- # TODO -- Add axes labels
- def _get_scale_transforms(self, axis):
- """Return a function implementing the scale transform (or its inverse)."""
- if self.ax is None:
- axis_list = [getattr(ax, f"{axis}axis") for ax in self.facets.axes.flat]
- scales = {axis.get_scale() for axis in axis_list}
- if len(scales) > 1:
- # It is a simplifying assumption that faceted axes will always have
- # the same scale (even if they are unshared and have distinct limits).
- # Nothing in the seaborn API allows you to create a FacetGrid with
- # a mixture of scales, although it's possible via matplotlib.
- # This is constraining, but no more so than previous behavior that
- # only (properly) handled log scales, and there are some places where
- # it would be much too complicated to use axes-specific transforms.
- err = "Cannot determine transform with mixed scales on faceted axes."
- raise RuntimeError(err)
- transform_obj = axis_list[0].get_transform()
- else:
- # This case is more straightforward
- transform_obj = getattr(self.ax, f"{axis}axis").get_transform()
- return transform_obj.transform, transform_obj.inverted().transform
- def _add_axis_labels(self, ax, default_x="", default_y=""):
- """Add axis labels if not present, set visibility to match ticklabels."""
- # TODO ax could default to None and use attached axes if present
- # but what to do about the case of facets? Currently using FacetGrid's
- # set_axis_labels method, which doesn't add labels to the interior even
- # when the axes are not shared. Maybe that makes sense?
- if not ax.get_xlabel():
- x_visible = any(t.get_visible() for t in ax.get_xticklabels())
- ax.set_xlabel(self.variables.get("x", default_x), visible=x_visible)
- if not ax.get_ylabel():
- y_visible = any(t.get_visible() for t in ax.get_yticklabels())
- ax.set_ylabel(self.variables.get("y", default_y), visible=y_visible)
- def add_legend_data(
- self, ax, func, common_kws=None, attrs=None, semantic_kws=None,
- ):
- """Add labeled artists to represent the different plot semantics."""
- verbosity = self.legend
- if isinstance(verbosity, str) and verbosity not in ["auto", "brief", "full"]:
- err = "`legend` must be 'auto', 'brief', 'full', or a boolean."
- raise ValueError(err)
- elif verbosity is True:
- verbosity = "auto"
- keys = []
- legend_kws = {}
- common_kws = {} if common_kws is None else common_kws.copy()
- semantic_kws = {} if semantic_kws is None else semantic_kws.copy()
- # Assign a legend title if there is only going to be one sub-legend,
- # otherwise, subtitles will be inserted into the texts list with an
- # invisible handle (which is a hack)
- titles = {
- title for title in
- (self.variables.get(v, None) for v in ["hue", "size", "style"])
- if title is not None
- }
- title = "" if len(titles) != 1 else titles.pop()
- title_kws = dict(
- visible=False, color="w", s=0, linewidth=0, marker="", dashes=""
- )
- def update(var_name, val_name, **kws):
- key = var_name, val_name
- if key in legend_kws:
- legend_kws[key].update(**kws)
- else:
- keys.append(key)
- legend_kws[key] = dict(**kws)
- if attrs is None:
- attrs = {"hue": "color", "size": ["linewidth", "s"], "style": None}
- for var, names in attrs.items():
- self._update_legend_data(
- update, var, verbosity, title, title_kws, names, semantic_kws.get(var),
- )
- legend_data = {}
- legend_order = []
- # Don't allow color=None so we can set a neutral color for size/style legends
- if common_kws.get("color", False) is None:
- common_kws.pop("color")
- for key in keys:
- _, label = key
- kws = legend_kws[key]
- level_kws = {}
- use_attrs = [
- *self._legend_attributes,
- *common_kws,
- *[attr for var_attrs in semantic_kws.values() for attr in var_attrs],
- ]
- for attr in use_attrs:
- if attr in kws:
- level_kws[attr] = kws[attr]
- artist = func(label=label, **{"color": ".2", **common_kws, **level_kws})
- if _version_predates(mpl, "3.5.0"):
- if isinstance(artist, mpl.lines.Line2D):
- ax.add_line(artist)
- elif isinstance(artist, mpl.patches.Patch):
- ax.add_patch(artist)
- elif isinstance(artist, mpl.collections.Collection):
- ax.add_collection(artist)
- else:
- ax.add_artist(artist)
- legend_data[key] = artist
- legend_order.append(key)
- self.legend_title = title
- self.legend_data = legend_data
- self.legend_order = legend_order
- def _update_legend_data(
- self,
- update,
- var,
- verbosity,
- title,
- title_kws,
- attr_names,
- other_props,
- ):
- """Generate legend tick values and formatted labels."""
- brief_ticks = 6
- mapper = getattr(self, f"_{var}_map", None)
- if mapper is None:
- return
- brief = mapper.map_type == "numeric" and (
- verbosity == "brief"
- or (verbosity == "auto" and len(mapper.levels) > brief_ticks)
- )
- if brief:
- if isinstance(mapper.norm, mpl.colors.LogNorm):
- locator = mpl.ticker.LogLocator(numticks=brief_ticks)
- else:
- locator = mpl.ticker.MaxNLocator(nbins=brief_ticks)
- limits = min(mapper.levels), max(mapper.levels)
- levels, formatted_levels = locator_to_legend_entries(
- locator, limits, self.plot_data[var].infer_objects().dtype
- )
- elif mapper.levels is None:
- levels = formatted_levels = []
- else:
- levels = formatted_levels = mapper.levels
- if not title and self.variables.get(var, None) is not None:
- update((self.variables[var], "title"), self.variables[var], **title_kws)
- other_props = {} if other_props is None else other_props
- for level, formatted_level in zip(levels, formatted_levels):
- if level is not None:
- attr = mapper(level)
- if isinstance(attr_names, list):
- attr = {name: attr for name in attr_names}
- elif attr_names is not None:
- attr = {attr_names: attr}
- attr.update({k: v[level] for k, v in other_props.items() if level in v})
- update(self.variables[var], formatted_level, **attr)
- # XXX If the scale_* methods are going to modify the plot_data structure, they
- # can't be called twice. That means that if they are called twice, they should
- # raise. Alternatively, we could store an original version of plot_data and each
- # time they are called they operate on the store, not the current state.
- def scale_native(self, axis, *args, **kwargs):
- # Default, defer to matplotlib
- raise NotImplementedError
- def scale_numeric(self, axis, *args, **kwargs):
- # Feels needed to completeness, what should it do?
- # Perhaps handle log scaling? Set the ticker/formatter/limits?
- raise NotImplementedError
- def scale_datetime(self, axis, *args, **kwargs):
- # Use pd.to_datetime to convert strings or numbers to datetime objects
- # Note, use day-resolution for numeric->datetime to match matplotlib
- raise NotImplementedError
- def scale_categorical(self, axis, order=None, formatter=None):
- """
- Enforce categorical (fixed-scale) rules for the data on given axis.
- Parameters
- ----------
- axis : "x" or "y"
- Axis of the plot to operate on.
- order : list
- Order that unique values should appear in.
- formatter : callable
- Function mapping values to a string representation.
- Returns
- -------
- self
- """
- # This method both modifies the internal representation of the data
- # (converting it to string) and sets some attributes on self. It might be
- # a good idea to have a separate object attached to self that contains the
- # information in those attributes (i.e. whether to enforce variable order
- # across facets, the order to use) similar to the SemanticMapping objects
- # we have for semantic variables. That object could also hold the converter
- # objects that get used, if we can decouple those from an existing axis
- # (cf. https://github.com/matplotlib/matplotlib/issues/19229).
- # There are some interactions with faceting information that would need
- # to be thought through, since the converts to use depend on facets.
- # If we go that route, these methods could become "borrowed" methods similar
- # to what happens with the alternate semantic mapper constructors, although
- # that approach is kind of fussy and confusing.
- # TODO this method could also set the grid state? Since we like to have no
- # grid on the categorical axis by default. Again, a case where we'll need to
- # store information until we use it, so best to have a way to collect the
- # attributes that this method sets.
- # TODO if we are going to set visual properties of the axes with these methods,
- # then we could do the steps currently in CategoricalPlotter._adjust_cat_axis
- # TODO another, and distinct idea, is to expose a cut= param here
- _check_argument("axis", ["x", "y"], axis)
- # Categorical plots can be "univariate" in which case they get an anonymous
- # category label on the opposite axis.
- if axis not in self.variables:
- self.variables[axis] = None
- self.var_types[axis] = "categorical"
- self.plot_data[axis] = ""
- # If the "categorical" variable has a numeric type, sort the rows so that
- # the default result from categorical_order has those values sorted after
- # they have been coerced to strings. The reason for this is so that later
- # we can get facet-wise orders that are correct.
- # XXX Should this also sort datetimes?
- # It feels more consistent, but technically will be a default change
- # If so, should also change categorical_order to behave that way
- if self.var_types[axis] == "numeric":
- self.plot_data = self.plot_data.sort_values(axis, kind="mergesort")
- # Now get a reference to the categorical data vector and remove na values
- cat_data = self.plot_data[axis].dropna()
- # Get the initial categorical order, which we do before string
- # conversion to respect the original types of the order list.
- # Track whether the order is given explicitly so that we can know
- # whether or not to use the order constructed here downstream
- self._var_ordered[axis] = order is not None or cat_data.dtype.name == "category"
- order = pd.Index(categorical_order(cat_data, order), name=axis)
- # Then convert data to strings. This is because in matplotlib,
- # "categorical" data really mean "string" data, so doing this artists
- # will be drawn on the categorical axis with a fixed scale.
- # TODO implement formatter here; check that it returns strings?
- if formatter is not None:
- cat_data = cat_data.map(formatter)
- order = order.map(formatter)
- else:
- cat_data = cat_data.astype(str)
- order = order.astype(str)
- # Update the levels list with the type-converted order variable
- self.var_levels[axis] = order
- # Now ensure that seaborn will use categorical rules internally
- self.var_types[axis] = "categorical"
- # Put the string-typed categorical vector back into the plot_data structure
- self.plot_data[axis] = cat_data
- return self
- class VariableType(UserString):
- """
- Prevent comparisons elsewhere in the library from using the wrong name.
- Errors are simple assertions because users should not be able to trigger
- them. If that changes, they should be more verbose.
- """
- # TODO we can replace this with typing.Literal on Python 3.8+
- allowed = "numeric", "datetime", "categorical"
- def __init__(self, data):
- assert data in self.allowed, data
- super().__init__(data)
- def __eq__(self, other):
- assert other in self.allowed, other
- return self.data == other
- def variable_type(vector, boolean_type="numeric"):
- """
- Determine whether a vector contains numeric, categorical, or datetime data.
- This function differs from the pandas typing API in two ways:
- - Python sequences or object-typed PyData objects are considered numeric if
- all of their entries are numeric.
- - String or mixed-type data are considered categorical even if not
- explicitly represented as a :class:`pandas.api.types.CategoricalDtype`.
- Parameters
- ----------
- vector : :func:`pandas.Series`, :func:`numpy.ndarray`, or Python sequence
- Input data to test.
- boolean_type : 'numeric' or 'categorical'
- Type to use for vectors containing only 0s and 1s (and NAs).
- Returns
- -------
- var_type : 'numeric', 'categorical', or 'datetime'
- Name identifying the type of data in the vector.
- """
- vector = pd.Series(vector)
- # If a categorical dtype is set, infer categorical
- if isinstance(vector.dtype, pd.CategoricalDtype):
- return VariableType("categorical")
- # Special-case all-na data, which is always "numeric"
- if pd.isna(vector).all():
- return VariableType("numeric")
- # At this point, drop nans to simplify further type inference
- vector = vector.dropna()
- # Special-case binary/boolean data, allow caller to determine
- # This triggers a numpy warning when vector has strings/objects
- # https://github.com/numpy/numpy/issues/6784
- # Because we reduce with .all(), we are agnostic about whether the
- # comparison returns a scalar or vector, so we will ignore the warning.
- # It triggers a separate DeprecationWarning when the vector has datetimes:
- # https://github.com/numpy/numpy/issues/13548
- # This is considered a bug by numpy and will likely go away.
- with warnings.catch_warnings():
- warnings.simplefilter(
- action='ignore', category=(FutureWarning, DeprecationWarning)
- )
- if np.isin(vector, [0, 1]).all():
- return VariableType(boolean_type)
- # Defer to positive pandas tests
- if pd.api.types.is_numeric_dtype(vector):
- return VariableType("numeric")
- if pd.api.types.is_datetime64_dtype(vector):
- return VariableType("datetime")
- # --- If we get to here, we need to check the entries
- # Check for a collection where everything is a number
- def all_numeric(x):
- for x_i in x:
- if not isinstance(x_i, Number):
- return False
- return True
- if all_numeric(vector):
- return VariableType("numeric")
- # Check for a collection where everything is a datetime
- def all_datetime(x):
- for x_i in x:
- if not isinstance(x_i, (datetime, np.datetime64)):
- return False
- return True
- if all_datetime(vector):
- return VariableType("datetime")
- # Otherwise, our final fallback is to consider things categorical
- return VariableType("categorical")
- def infer_orient(x=None, y=None, orient=None, require_numeric=True):
- """Determine how the plot should be oriented based on the data.
- For historical reasons, the convention is to call a plot "horizontally"
- or "vertically" oriented based on the axis representing its dependent
- variable. Practically, this is used when determining the axis for
- numerical aggregation.
- Parameters
- ----------
- x, y : Vector data or None
- Positional data vectors for the plot.
- orient : string or None
- Specified orientation. If not None, can be "x" or "y", or otherwise
- must start with "v" or "h".
- require_numeric : bool
- If set, raise when the implied dependent variable is not numeric.
- Returns
- -------
- orient : "x" or "y"
- Raises
- ------
- ValueError: When `orient` is an unknown string.
- TypeError: When dependent variable is not numeric, with `require_numeric`
- """
- x_type = None if x is None else variable_type(x)
- y_type = None if y is None else variable_type(y)
- nonnumeric_dv_error = "{} orientation requires numeric `{}` variable."
- single_var_warning = "{} orientation ignored with only `{}` specified."
- if x is None:
- if str(orient).startswith("h"):
- warnings.warn(single_var_warning.format("Horizontal", "y"))
- if require_numeric and y_type != "numeric":
- raise TypeError(nonnumeric_dv_error.format("Vertical", "y"))
- return "x"
- elif y is None:
- if str(orient).startswith("v"):
- warnings.warn(single_var_warning.format("Vertical", "x"))
- if require_numeric and x_type != "numeric":
- raise TypeError(nonnumeric_dv_error.format("Horizontal", "x"))
- return "y"
- elif str(orient).startswith("v") or orient == "x":
- if require_numeric and y_type != "numeric":
- raise TypeError(nonnumeric_dv_error.format("Vertical", "y"))
- return "x"
- elif str(orient).startswith("h") or orient == "y":
- if require_numeric and x_type != "numeric":
- raise TypeError(nonnumeric_dv_error.format("Horizontal", "x"))
- return "y"
- elif orient is not None:
- err = (
- "`orient` must start with 'v' or 'h' or be None, "
- f"but `{repr(orient)}` was passed."
- )
- raise ValueError(err)
- elif x_type != "categorical" and y_type == "categorical":
- return "y"
- elif x_type != "numeric" and y_type == "numeric":
- return "x"
- elif x_type == "numeric" and y_type != "numeric":
- return "y"
- elif require_numeric and "numeric" not in (x_type, y_type):
- err = "Neither the `x` nor `y` variable appears to be numeric."
- raise TypeError(err)
- else:
- return "x"
- def unique_dashes(n):
- """Build an arbitrarily long list of unique dash styles for lines.
- Parameters
- ----------
- n : int
- Number of unique dash specs to generate.
- Returns
- -------
- dashes : list of strings or tuples
- Valid arguments for the ``dashes`` parameter on
- :class:`matplotlib.lines.Line2D`. The first spec is a solid
- line (``""``), the remainder are sequences of long and short
- dashes.
- """
- # Start with dash specs that are well distinguishable
- dashes = [
- "",
- (4, 1.5),
- (1, 1),
- (3, 1.25, 1.5, 1.25),
- (5, 1, 1, 1),
- ]
- # Now programmatically build as many as we need
- p = 3
- while len(dashes) < n:
- # Take combinations of long and short dashes
- a = itertools.combinations_with_replacement([3, 1.25], p)
- b = itertools.combinations_with_replacement([4, 1], p)
- # Interleave the combinations, reversing one of the streams
- segment_list = itertools.chain(*zip(
- list(a)[1:-1][::-1],
- list(b)[1:-1]
- ))
- # Now insert the gaps
- for segments in segment_list:
- gap = min(segments)
- spec = tuple(itertools.chain(*((seg, gap) for seg in segments)))
- dashes.append(spec)
- p += 1
- return dashes[:n]
- def unique_markers(n):
- """Build an arbitrarily long list of unique marker styles for points.
- Parameters
- ----------
- n : int
- Number of unique marker specs to generate.
- Returns
- -------
- markers : list of string or tuples
- Values for defining :class:`matplotlib.markers.MarkerStyle` objects.
- All markers will be filled.
- """
- # Start with marker specs that are well distinguishable
- markers = [
- "o",
- "X",
- (4, 0, 45),
- "P",
- (4, 0, 0),
- (4, 1, 0),
- "^",
- (4, 1, 45),
- "v",
- ]
- # Now generate more from regular polygons of increasing order
- s = 5
- while len(markers) < n:
- a = 360 / (s + 1) / 2
- markers.extend([
- (s + 1, 1, a),
- (s + 1, 0, a),
- (s, 1, 0),
- (s, 0, 0),
- ])
- s += 1
- # Convert to MarkerStyle object, using only exactly what we need
- # markers = [mpl.markers.MarkerStyle(m) for m in markers[:n]]
- return markers[:n]
- def categorical_order(vector, order=None):
- """Return a list of unique data values.
- Determine an ordered list of levels in ``values``.
- Parameters
- ----------
- vector : list, array, Categorical, or Series
- Vector of "categorical" values
- order : list-like, optional
- Desired order of category levels to override the order determined
- from the ``values`` object.
- Returns
- -------
- order : list
- Ordered list of category levels not including null values.
- """
- if order is None:
- if hasattr(vector, "categories"):
- order = vector.categories
- else:
- try:
- order = vector.cat.categories
- except (TypeError, AttributeError):
- order = pd.Series(vector).unique()
- if variable_type(vector) == "numeric":
- order = np.sort(order)
- order = filter(pd.notnull, order)
- return list(order)
|