- """
- Histogram-related functions
- """
- import contextlib
- import functools
- import operator
- import warnings
- import numpy as np
- from numpy.core import overrides
- __all__ = ['histogram', 'histogramdd', 'histogram_bin_edges']
- array_function_dispatch = functools.partial(
- overrides.array_function_dispatch, module='numpy')
- # range is a keyword argument to many functions, so save the builtin so they can
- # use it.
- _range = range
- def _ptp(x):
- """Peak-to-peak value of x.
- This implementation avoids the problem of signed integer arrays having a
- peak-to-peak value that cannot be represented with the array's data type.
- This function returns an unsigned value for signed integer arrays.
- """
- return _unsigned_subtract(x.max(), x.min())
- def _hist_bin_sqrt(x, range):
- """
- Square root histogram bin estimator.
- Bin width is inversely proportional to the square root of the data size. Used
- by many programs for its simplicity.
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- del range # unused
- return _ptp(x) / np.sqrt(x.size)
- def _hist_bin_sturges(x, range):
- """
- Sturges histogram bin estimator.
- A very simplistic estimator based on the assumption of normality of
- the data. This estimator has poor performance for non-normal data,
- which becomes especially obvious for large data sets. The estimate
- depends only on the size of the data.
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- del range # unused
- return _ptp(x) / (np.log2(x.size) + 1.0)
- def _hist_bin_rice(x, range):
- """
- Rice histogram bin estimator.
- Another simple estimator with no normality assumption. It has better
- performance for large data than Sturges, but tends to overestimate
- the number of bins. The number of bins is proportional to the cube
- root of the data size (asymptotically optimal). The estimate depends
- only on the size of the data.
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- del range # unused
- return _ptp(x) / (2.0 * x.size ** (1.0 / 3))
- def _hist_bin_scott(x, range):
- """
- Scott histogram bin estimator.
- The binwidth is proportional to the standard deviation of the data
- and inversely proportional to the cube root of data size
- (asymptotically optimal).
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- del range # unused
- return (24.0 * np.pi**0.5 / x.size)**(1.0 / 3.0) * np.std(x)
- def _hist_bin_stone(x, range):
- """
- Histogram bin estimator based on minimizing the estimated integrated squared error (ISE).
- The number of bins is chosen by minimizing the estimated ISE against the unknown true distribution.
- The ISE is estimated using cross-validation and can be regarded as a generalization of Scott's rule.
- https://en.wikipedia.org/wiki/Histogram#Scott.27s_normal_reference_rule
- The rule appears to originate in this paper by Stone:
- http://digitalassets.lib.berkeley.edu/sdtr/ucb/text/34.pdf
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- range : (float, float)
- The lower and upper range of the bins.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- n = x.size
- ptp_x = _ptp(x)
- if n <= 1 or ptp_x == 0:
- return 0
- def jhat(nbins):
- hh = ptp_x / nbins
- p_k = np.histogram(x, bins=nbins, range=range)[0] / n
- return (2 - (n + 1) * p_k.dot(p_k)) / hh
- nbins_upper_bound = max(100, int(np.sqrt(n)))
- nbins = min(_range(1, nbins_upper_bound + 1), key=jhat)
- if nbins == nbins_upper_bound:
- warnings.warn("The number of bins estimated may be suboptimal.",
- RuntimeWarning, stacklevel=3)
- return ptp_x / nbins
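- # Illustrative sketch (added for exposition, not part of the original module):
- # the Stone rule above scores each candidate bin count with a cross-validation
- # estimate of the integrated squared error and keeps the minimiser. This
- # mirrors that search on a small normal sample; it assumes only NumPy and is
- # not used elsewhere in the module.
- def _demo_stone_objective(n_candidates=20, seed=0):
-     rng = np.random.default_rng(seed)
-     x = rng.normal(size=500)
-     n = x.size
-     lo, hi = x.min(), x.max()
-     scores = []
-     for nbins in _range(1, n_candidates + 1):
-         h = (hi - lo) / nbins
-         p_k = np.histogram(x, bins=nbins, range=(lo, hi))[0] / n
-         # Same objective as jhat() above, up to terms independent of nbins.
-         scores.append((2 - (n + 1) * p_k.dot(p_k)) / h)
-     best = int(np.argmin(scores)) + 1
-     return best, (hi - lo) / best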
- def _hist_bin_doane(x, range):
- """
- Doane's histogram bin estimator.
- Improved version of Sturges' formula which works better for
- non-normal data. See
- stats.stackexchange.com/questions/55134/doanes-formula-for-histogram-binning
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- del range # unused
- if x.size > 2:
- sg1 = np.sqrt(6.0 * (x.size - 2) / ((x.size + 1.0) * (x.size + 3)))
- sigma = np.std(x)
- if sigma > 0.0:
- # These three operations add up to
- # g1 = np.mean(((x - np.mean(x)) / sigma)**3)
- # but use only one temp array instead of three
- temp = x - np.mean(x)
- np.true_divide(temp, sigma, temp)
- np.power(temp, 3, temp)
- g1 = np.mean(temp)
- return _ptp(x) / (1.0 + np.log2(x.size) +
- np.log2(1.0 + np.absolute(g1) / sg1))
- return 0.0
- def _hist_bin_fd(x, range):
- """
- The Freedman-Diaconis histogram bin estimator.
- The Freedman-Diaconis rule uses interquartile range (IQR) to
- estimate binwidth. It is considered a variation of the Scott rule
- with more robustness as the IQR is less affected by outliers than
- the standard deviation. However, the IQR depends on fewer points
- than the standard deviation, so it is less accurate, especially for
- long tailed distributions.
- If the IQR is 0, this function returns 0 for the bin width.
- Binwidth is inversely proportional to the cube root of data size
- (asymptotically optimal).
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- """
- del range # unused
- iqr = np.subtract(*np.percentile(x, [75, 25]))
- return 2.0 * iqr * x.size ** (-1.0 / 3.0)
- def _hist_bin_auto(x, range):
- """
- Histogram bin estimator that uses the minimum width of the
- Freedman-Diaconis and Sturges estimators if the FD bin width is non-zero.
- If the bin width from the FD estimator is 0, the Sturges estimator is used.
- The FD estimator is usually the most robust method, but its width
- estimate tends to be too large for small `x` and bad for data with limited
- variance. The Sturges estimator is quite good for small (<1000) datasets
- and is the default in the R language. This method gives good off-the-shelf
- behaviour.
- .. versionchanged:: 1.15.0
- If there is limited variance the IQR can be 0, which results in the
- FD bin width being 0 too. This is not a valid bin width, so
- ``np.histogram_bin_edges`` chooses 1 bin instead, which may not be optimal.
- If the IQR is 0, it's unlikely any variance-based estimators will be of
- use, so we revert to the Sturges estimator, which only uses the size of the
- dataset in its calculation.
- Parameters
- ----------
- x : array_like
- Input data that is to be histogrammed, trimmed to range. May not
- be empty.
- Returns
- -------
- h : An estimate of the optimal bin width for the given data.
- See Also
- --------
- _hist_bin_fd, _hist_bin_sturges
- """
- fd_bw = _hist_bin_fd(x, range)
- sturges_bw = _hist_bin_sturges(x, range)
- del range # unused
- if fd_bw:
- return min(fd_bw, sturges_bw)
- else:
- # limited variance, so we fall back to the size-dependent Sturges estimate
- return sturges_bw
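- # Illustrative sketch (added for exposition, not part of the original module):
- # on data whose middle 50% is constant, the IQR and hence the FD width are
- # zero, so 'auto' falls back to Sturges as described above. Assumes only NumPy.
- def _demo_auto_fallback():
-     x = np.array([1.0] * 50 + [0.0, 10.0])  # zero IQR, non-zero spread
-     outer = (x.min(), x.max())
-     # _hist_bin_fd returns 0.0 here, so _hist_bin_auto returns the Sturges width.
-     return _hist_bin_fd(x, outer), _hist_bin_auto(x, outer)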
- # Private dict initialized at module load time
- _hist_bin_selectors = {'stone': _hist_bin_stone,
- 'auto': _hist_bin_auto,
- 'doane': _hist_bin_doane,
- 'fd': _hist_bin_fd,
- 'rice': _hist_bin_rice,
- 'scott': _hist_bin_scott,
- 'sqrt': _hist_bin_sqrt,
- 'sturges': _hist_bin_sturges}
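- # Illustrative sketch (added for exposition, not part of the original module):
- # each selector above returns a bin *width* for the trimmed data (only 'stone'
- # also uses the range argument); the caller then turns that width into a bin
- # count by dividing the outer range by it, as _get_bin_edges does below.
- def _demo_selector_to_bin_count(name='fd', seed=0):
-     rng = np.random.default_rng(seed)
-     x = rng.normal(size=1000)
-     first_edge, last_edge = x.min(), x.max()
-     width = _hist_bin_selectors[name](x, (first_edge, last_edge))
-     # A zero width (e.g. FD on zero-IQR data) falls back to a single bin.
-     n_bins = int(np.ceil((last_edge - first_edge) / width)) if width else 1
-     return width, n_bins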
- def _ravel_and_check_weights(a, weights):
- """ Check a and weights have matching shapes, and ravel both """
- a = np.asarray(a)
- # Ensure that the array is a "subtractable" dtype
- if a.dtype == np.bool_:
- warnings.warn("Converting input from {} to {} for compatibility."
- .format(a.dtype, np.uint8),
- RuntimeWarning, stacklevel=3)
- a = a.astype(np.uint8)
- if weights is not None:
- weights = np.asarray(weights)
- if weights.shape != a.shape:
- raise ValueError(
- 'weights should have the same shape as a.')
- weights = weights.ravel()
- a = a.ravel()
- return a, weights
- def _get_outer_edges(a, range):
- """
- Determine the outer bin edges to use, from either the data or the range
- argument
- """
- if range is not None:
- first_edge, last_edge = range
- if first_edge > last_edge:
- raise ValueError(
- 'max must be larger than or equal to min in range parameter.')
- if not (np.isfinite(first_edge) and np.isfinite(last_edge)):
- raise ValueError(
- "supplied range of [{}, {}] is not finite".format(first_edge, last_edge))
- elif a.size == 0:
- # handle empty arrays. Can't determine range, so use 0-1.
- first_edge, last_edge = 0, 1
- else:
- first_edge, last_edge = a.min(), a.max()
- if not (np.isfinite(first_edge) and np.isfinite(last_edge)):
- raise ValueError(
- "autodetected range of [{}, {}] is not finite".format(first_edge, last_edge))
- # expand empty range to avoid divide by zero
- if first_edge == last_edge:
- first_edge = first_edge - 0.5
- last_edge = last_edge + 0.5
- return first_edge, last_edge
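- # Illustrative sketch (added for exposition, not part of the original module):
- # the two non-obvious branches of _get_outer_edges, namely the 0..1 default
- # for empty input and the +/-0.5 padding of a degenerate range.
- def _demo_outer_edges():
-     empty = _get_outer_edges(np.array([]), None)                # (0, 1)
-     degenerate = _get_outer_edges(np.array([3.0, 3.0]), None)   # (2.5, 3.5)
-     return empty, degenerate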
- def _unsigned_subtract(a, b):
- """
- Subtract two values where a >= b, and produce an unsigned result
- This is needed when finding the difference between the upper and lower
- bound of an int16 histogram
- """
- # coerce to a single type
- signed_to_unsigned = {
- np.byte: np.ubyte,
- np.short: np.ushort,
- np.intc: np.uintc,
- np.int_: np.uint,
- np.longlong: np.ulonglong
- }
- dt = np.result_type(a, b)
- try:
- dt = signed_to_unsigned[dt.type]
- except KeyError:
- return np.subtract(a, b, dtype=dt)
- else:
- # we know the inputs are integers, and we are deliberately casting
- # signed to unsigned
- return np.subtract(a, b, casting='unsafe', dtype=dt)
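- # Illustrative sketch (added for exposition, not part of the original module):
- # why the unsigned subtraction matters for small signed integer dtypes.
- def _demo_unsigned_subtract():
-     x = np.array([-128, 127], dtype=np.int8)
-     # Plain int8 subtraction wraps around and reports a span of -1 ...
-     wrapped = (x[1:] - x[:1])[0]
-     # ... while _ptp widens through _unsigned_subtract and reports 255 (uint8).
-     widened = _ptp(x)
-     return wrapped, widened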
- def _get_bin_edges(a, bins, range, weights):
- """
- Computes the bins used internally by `histogram`.
- Parameters
- ----------
- a : ndarray
- Ravelled data array
- bins, range
- Forwarded arguments from `histogram`.
- weights : ndarray, optional
- Ravelled weights array, or None
- Returns
- -------
- bin_edges : ndarray
- Array of bin edges
- uniform_bins : (Number, Number, int):
- The lower bound, upper bound, and number of bins, used in the optimized
- implementation of `histogram` that works on uniform bins.
- """
- # parse the overloaded bins argument
- n_equal_bins = None
- bin_edges = None
- if isinstance(bins, str):
- bin_name = bins
- # if `bins` is a string for an automatic method,
- # this will replace it with the number of bins calculated
- if bin_name not in _hist_bin_selectors:
- raise ValueError(
- "{!r} is not a valid estimator for `bins`".format(bin_name))
- if weights is not None:
- raise TypeError("Automated estimation of the number of "
- "bins is not supported for weighted data")
- first_edge, last_edge = _get_outer_edges(a, range)
- # truncate the range if needed
- if range is not None:
- keep = (a >= first_edge)
- keep &= (a <= last_edge)
- if not np.logical_and.reduce(keep):
- a = a[keep]
- if a.size == 0:
- n_equal_bins = 1
- else:
- # Do not call selectors on empty arrays
- width = _hist_bin_selectors[bin_name](a, (first_edge, last_edge))
- if width:
- n_equal_bins = int(np.ceil(_unsigned_subtract(last_edge, first_edge) / width))
- else:
- # Width can be zero for some estimators, e.g. FD when
- # the IQR of the data is zero.
- n_equal_bins = 1
- elif np.ndim(bins) == 0:
- try:
- n_equal_bins = operator.index(bins)
- except TypeError as e:
- raise TypeError(
- '`bins` must be an integer, a string, or an array') from e
- if n_equal_bins < 1:
- raise ValueError('`bins` must be positive, when an integer')
- first_edge, last_edge = _get_outer_edges(a, range)
- elif np.ndim(bins) == 1:
- bin_edges = np.asarray(bins)
- if np.any(bin_edges[:-1] > bin_edges[1:]):
- raise ValueError(
- '`bins` must increase monotonically, when an array')
- else:
- raise ValueError('`bins` must be 1d, when an array')
- if n_equal_bins is not None:
- # gh-10322 means that type resolution rules are dependent on array
- # shapes. To avoid this causing problems, we pick a type now and stick
- # with it throughout.
- bin_type = np.result_type(first_edge, last_edge, a)
- if np.issubdtype(bin_type, np.integer):
- bin_type = np.result_type(bin_type, float)
- # bin edges must be computed
- bin_edges = np.linspace(
- first_edge, last_edge, n_equal_bins + 1,
- endpoint=True, dtype=bin_type)
- return bin_edges, (first_edge, last_edge, n_equal_bins)
- else:
- return bin_edges, None
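- # Illustrative sketch (added for exposition, not part of the original module):
- # the three accepted forms of `bins` and what _get_bin_edges returns for each.
- # Only the integer and string forms produce the uniform-bin triple used by the
- # fast path in `histogram`; explicit edges return None in its place.
- def _demo_get_bin_edges():
-     x = np.arange(6, dtype=float)
-     by_count = _get_bin_edges(x, 5, None, None)         # (6 edges, (0.0, 5.0, 5))
-     by_name = _get_bin_edges(x, 'sturges', None, None)  # bin count set by the rule
-     by_edges = _get_bin_edges(x, [0.0, 2.0, 5.0], None, None)  # (edges, None)
-     return by_count, by_name, by_edges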
- def _search_sorted_inclusive(a, v):
- """
- Like `searchsorted`, but where the last item in `v` is placed on the right.
- In the context of a histogram, this makes the last bin edge inclusive
- """
- return np.concatenate((
- a.searchsorted(v[:-1], 'left'),
- a.searchsorted(v[-1:], 'right')
- ))
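- # Illustrative sketch (added for exposition, not part of the original module):
- # the 'right' side used for the final edge is what makes a value sitting
- # exactly on the last edge land in the last bin rather than fall outside it.
- def _demo_inclusive_last_edge():
-     sa = np.array([0.0, 1.0, 2.0])      # sorted data
-     edges = np.array([0.0, 1.0, 2.0])   # last edge equals the largest value
-     plain = sa.searchsorted(edges, 'left')            # array([0, 1, 2])
-     inclusive = _search_sorted_inclusive(sa, edges)   # array([0, 1, 3])
-     return plain, inclusive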
- def _histogram_bin_edges_dispatcher(a, bins=None, range=None, weights=None):
- return (a, bins, weights)
- @array_function_dispatch(_histogram_bin_edges_dispatcher)
- def histogram_bin_edges(a, bins=10, range=None, weights=None):
- r"""
- Function to calculate only the edges of the bins used by the `histogram`
- function.
- Parameters
- ----------
- a : array_like
- Input data. The histogram is computed over the flattened array.
- bins : int or sequence of scalars or str, optional
- If `bins` is an int, it defines the number of equal-width
- bins in the given range (10, by default). If `bins` is a
- sequence, it defines the bin edges, including the rightmost
- edge, allowing for non-uniform bin widths.
- If `bins` is a string from the list below, `histogram_bin_edges` will use
- the method chosen to calculate the optimal bin width and
- consequently the number of bins (see `Notes` for more detail on
- the estimators) from the data that falls within the requested
- range. While the bin width will be optimal for the actual data
- in the range, the number of bins will be computed to fill the
- entire range, including the empty portions. For visualisation,
- using the 'auto' option is suggested. Weighted data is not
- supported for automated bin size selection.
- 'auto'
- Maximum of the 'sturges' and 'fd' estimators. Provides good
- all around performance.
- 'fd' (Freedman Diaconis Estimator)
- Robust (resilient to outliers) estimator that takes into
- account data variability and data size.
- 'doane'
- An improved version of Sturges' estimator that works better
- with non-normal datasets.
- 'scott'
- Less robust estimator that takes into account data variability
- and data size.
- 'stone'
- Estimator based on leave-one-out cross-validation estimate of
- the integrated squared error. Can be regarded as a generalization
- of Scott's rule.
- 'rice'
- Estimator does not take variability into account, only data
- size. Commonly overestimates number of bins required.
- 'sturges'
- R's default method, only accounts for data size. Only
- optimal for gaussian data and underestimates number of bins
- for large non-gaussian datasets.
- 'sqrt'
- Square root (of data size) estimator, used by Excel and
- other programs for its speed and simplicity.
- range : (float, float), optional
- The lower and upper range of the bins. If not provided, range
- is simply ``(a.min(), a.max())``. Values outside the range are
- ignored. The first element of the range must be less than or
- equal to the second. `range` affects the automatic bin
- computation as well. While bin width is computed to be optimal
- based on the actual data within `range`, the bin count will fill
- the entire range including portions containing no data.
- weights : array_like, optional
- An array of weights, of the same shape as `a`. Each value in
- `a` only contributes its associated weight towards the bin count
- (instead of 1). This is currently not used by any of the bin estimators,
- but may be in the future.
- Returns
- -------
- bin_edges : array of dtype float
- The edges to pass into `histogram`
- See Also
- --------
- histogram
- Notes
- -----
- The methods to estimate the optimal number of bins are well founded
- in literature, and are inspired by the choices R provides for
- histogram visualisation. Note that having the number of bins
- proportional to :math:`n^{1/3}` is asymptotically optimal, which is
- why it appears in most estimators. These are simply plug-in methods
- that give good starting points for number of bins. In the equations
- below, :math:`h` is the binwidth and :math:`n_h` is the number of
- bins. All estimators that compute bin counts are recast to bin width
- using the `ptp` of the data. The final bin count is obtained from
- ``np.round(np.ceil(range / h))``. The final bin width is often less
- than what is returned by the estimators below.
- 'auto' (maximum of the 'sturges' and 'fd' estimators)
- A compromise to get a good value. For small datasets the Sturges
- value will usually be chosen, while larger datasets will usually
- default to FD. Avoids the overly conservative behaviour of FD
- and Sturges for small and large datasets respectively.
- Switchover point is usually :math:`a.size \approx 1000`.
- 'fd' (Freedman Diaconis Estimator)
- .. math:: h = 2 \frac{IQR}{n^{1/3}}
- The binwidth is proportional to the interquartile range (IQR)
- and inversely proportional to cube root of a.size. Can be too
- conservative for small datasets, but is quite good for large
- datasets. The IQR is very robust to outliers.
- 'scott'
- .. math:: h = \sigma \sqrt[3]{\frac{24 \sqrt{\pi}}{n}}
- The binwidth is proportional to the standard deviation of the
- data and inversely proportional to cube root of ``x.size``. Can
- be too conservative for small datasets, but is quite good for
- large datasets. The standard deviation is not very robust to
- outliers. Values are very similar to the Freedman-Diaconis
- estimator in the absence of outliers.
- 'rice'
- .. math:: n_h = 2n^{1/3}
- The number of bins is only proportional to cube root of
- ``a.size``. It tends to overestimate the number of bins and it
- does not take into account data variability.
- 'sturges'
- .. math:: n_h = \log _{2}(n) + 1
- The number of bins is the base 2 log of ``a.size``. This
- estimator assumes normality of data and is too conservative for
- larger, non-normal datasets. This is the default method in R's
- ``hist`` method.
- 'doane'
- .. math:: n_h = 1 + \log_{2}(n) + \log_{2}\left(1 + \frac{|g_1|}{\sigma_{g_1}}\right)
- g_1 = \mathrm{mean}\left[\left(\frac{x - \mu}{\sigma}\right)^3\right]
- \sigma_{g_1} = \sqrt{\frac{6(n - 2)}{(n + 1)(n + 3)}}
- An improved version of Sturges' formula that produces better
- estimates for non-normal datasets. This estimator attempts to
- account for the skew of the data.
- 'sqrt'
- .. math:: n_h = \sqrt n
- The simplest and fastest estimator. Only takes into account the
- data size.
- Examples
- --------
- >>> arr = np.array([0, 0, 0, 1, 2, 3, 3, 4, 5])
- >>> np.histogram_bin_edges(arr, bins='auto', range=(0, 1))
- array([0. , 0.25, 0.5 , 0.75, 1. ])
- >>> np.histogram_bin_edges(arr, bins=2)
- array([0. , 2.5, 5. ])
- For consistency with histogram, an array of pre-computed bins is
- passed through unmodified:
- >>> np.histogram_bin_edges(arr, [1, 2])
- array([1, 2])
- This function allows one set of bins to be computed, and reused across
- multiple histograms:
- >>> shared_bins = np.histogram_bin_edges(arr, bins='auto')
- >>> shared_bins
- array([0., 1., 2., 3., 4., 5.])
- >>> group_id = np.array([0, 1, 1, 0, 1, 1, 0, 1, 1])
- >>> hist_0, _ = np.histogram(arr[group_id == 0], bins=shared_bins)
- >>> hist_1, _ = np.histogram(arr[group_id == 1], bins=shared_bins)
- >>> hist_0; hist_1
- array([1, 1, 0, 1, 0])
- array([2, 0, 1, 1, 2])
- Which gives more easily comparable results than using separate bins for
- each histogram:
- >>> hist_0, bins_0 = np.histogram(arr[group_id == 0], bins='auto')
- >>> hist_1, bins_1 = np.histogram(arr[group_id == 1], bins='auto')
- >>> hist_0; hist_1
- array([1, 1, 1])
- array([2, 1, 1, 2])
- >>> bins_0; bins_1
- array([0., 1., 2., 3.])
- array([0. , 1.25, 2.5 , 3.75, 5. ])
- """
- a, weights = _ravel_and_check_weights(a, weights)
- bin_edges, _ = _get_bin_edges(a, bins, range, weights)
- return bin_edges
- def _histogram_dispatcher(
- a, bins=None, range=None, density=None, weights=None):
- return (a, bins, weights)
- @array_function_dispatch(_histogram_dispatcher)
- def histogram(a, bins=10, range=None, density=None, weights=None):
- r"""
- Compute the histogram of a dataset.
- Parameters
- ----------
- a : array_like
- Input data. The histogram is computed over the flattened array.
- bins : int or sequence of scalars or str, optional
- If `bins` is an int, it defines the number of equal-width
- bins in the given range (10, by default). If `bins` is a
- sequence, it defines a monotonically increasing array of bin edges,
- including the rightmost edge, allowing for non-uniform bin widths.
- .. versionadded:: 1.11.0
- If `bins` is a string, it defines the method used to calculate the
- optimal bin width, as defined by `histogram_bin_edges`.
- range : (float, float), optional
- The lower and upper range of the bins. If not provided, range
- is simply ``(a.min(), a.max())``. Values outside the range are
- ignored. The first element of the range must be less than or
- equal to the second. `range` affects the automatic bin
- computation as well. While bin width is computed to be optimal
- based on the actual data within `range`, the bin count will fill
- the entire range including portions containing no data.
- weights : array_like, optional
- An array of weights, of the same shape as `a`. Each value in
- `a` only contributes its associated weight towards the bin count
- (instead of 1). If `density` is True, the weights are
- normalized, so that the integral of the density over the range
- remains 1.
- density : bool, optional
- If ``False``, the result will contain the number of samples in
- each bin. If ``True``, the result is the value of the
- probability *density* function at the bin, normalized such that
- the *integral* over the range is 1. Note that the sum of the
- histogram values will not be equal to 1 unless bins of unity
- width are chosen; it is not a probability *mass* function.
- Returns
- -------
- hist : array
- The values of the histogram. See `density` and `weights` for a
- description of the possible semantics.
- bin_edges : array of dtype float
- Return the bin edges ``(length(hist)+1)``.
- See Also
- --------
- histogramdd, bincount, searchsorted, digitize, histogram_bin_edges
- Notes
- -----
- All but the last (righthand-most) bin is half-open. In other words,
- if `bins` is::
- [1, 2, 3, 4]
- then the first bin is ``[1, 2)`` (including 1, but excluding 2) and
- the second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which
- *includes* 4.
- Examples
- --------
- >>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
- (array([0, 2, 1]), array([0, 1, 2, 3]))
- >>> np.histogram(np.arange(4), bins=np.arange(5), density=True)
- (array([0.25, 0.25, 0.25, 0.25]), array([0, 1, 2, 3, 4]))
- >>> np.histogram([[1, 2, 1], [1, 0, 1]], bins=[0,1,2,3])
- (array([1, 4, 1]), array([0, 1, 2, 3]))
- >>> a = np.arange(5)
- >>> hist, bin_edges = np.histogram(a, density=True)
- >>> hist
- array([0.5, 0. , 0.5, 0. , 0. , 0.5, 0. , 0.5, 0. , 0.5])
- >>> hist.sum()
- 2.4999999999999996
- >>> np.sum(hist * np.diff(bin_edges))
- 1.0
- .. versionadded:: 1.11.0
- Automated Bin Selection Methods example, using 2 peak random data
- with 2000 points:
- >>> import matplotlib.pyplot as plt
- >>> rng = np.random.RandomState(10) # deterministic random data
- >>> a = np.hstack((rng.normal(size=1000),
- ... rng.normal(loc=5, scale=2, size=1000)))
- >>> _ = plt.hist(a, bins='auto') # arguments are passed to np.histogram
- >>> plt.title("Histogram with 'auto' bins")
- Text(0.5, 1.0, "Histogram with 'auto' bins")
- >>> plt.show()
- """
- a, weights = _ravel_and_check_weights(a, weights)
- bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
- # Histogram is an integer or a float array depending on the weights.
- if weights is None:
- ntype = np.dtype(np.intp)
- else:
- ntype = weights.dtype
- # We set a block size, as this allows us to iterate over chunks when
- # computing histograms, to minimize memory usage.
- BLOCK = 65536
- # The fast path uses bincount, but that only works for certain types
- # of weight
- simple_weights = (
- weights is None or
- np.can_cast(weights.dtype, np.double) or
- np.can_cast(weights.dtype, complex)
- )
- if uniform_bins is not None and simple_weights:
- # Fast algorithm for equal bins
- # We now convert values of a to bin indices, under the assumption of
- # equal bin widths (which is valid here).
- first_edge, last_edge, n_equal_bins = uniform_bins
- # Initialize empty histogram
- n = np.zeros(n_equal_bins, ntype)
- # Pre-compute histogram scaling factor
- norm = n_equal_bins / _unsigned_subtract(last_edge, first_edge)
- # We iterate over blocks here for two reasons: the first is that for
- # large arrays, it is actually faster (for example for a 10^8 array it
- # is 2x as fast) and it results in a memory footprint 3x lower in the
- # limit of large arrays.
- for i in _range(0, len(a), BLOCK):
- tmp_a = a[i:i+BLOCK]
- if weights is None:
- tmp_w = None
- else:
- tmp_w = weights[i:i + BLOCK]
- # Only include values in the right range
- keep = (tmp_a >= first_edge)
- keep &= (tmp_a <= last_edge)
- if not np.logical_and.reduce(keep):
- tmp_a = tmp_a[keep]
- if tmp_w is not None:
- tmp_w = tmp_w[keep]
- # This cast ensures no type promotions occur below, which gh-10322
- # make unpredictable. Getting it wrong leads to precision errors
- # like gh-8123.
- tmp_a = tmp_a.astype(bin_edges.dtype, copy=False)
- # Compute the bin indices, and for values that lie exactly on
- # last_edge we need to subtract one
- f_indices = _unsigned_subtract(tmp_a, first_edge) * norm
- indices = f_indices.astype(np.intp)
- indices[indices == n_equal_bins] -= 1
- # The index computation is not guaranteed to give exactly
- # consistent results within ~1 ULP of the bin edges.
- decrement = tmp_a < bin_edges[indices]
- indices[decrement] -= 1
- # The last bin includes the right edge. The other bins do not.
- increment = ((tmp_a >= bin_edges[indices + 1])
- & (indices != n_equal_bins - 1))
- indices[increment] += 1
- # We now compute the histogram using bincount
- if ntype.kind == 'c':
- n.real += np.bincount(indices, weights=tmp_w.real,
- minlength=n_equal_bins)
- n.imag += np.bincount(indices, weights=tmp_w.imag,
- minlength=n_equal_bins)
- else:
- n += np.bincount(indices, weights=tmp_w,
- minlength=n_equal_bins).astype(ntype)
- else:
- # Compute via cumulative histogram
- cum_n = np.zeros(bin_edges.shape, ntype)
- if weights is None:
- for i in _range(0, len(a), BLOCK):
- sa = np.sort(a[i:i+BLOCK])
- cum_n += _search_sorted_inclusive(sa, bin_edges)
- else:
- zero = np.zeros(1, dtype=ntype)
- for i in _range(0, len(a), BLOCK):
- tmp_a = a[i:i+BLOCK]
- tmp_w = weights[i:i+BLOCK]
- sorting_index = np.argsort(tmp_a)
- sa = tmp_a[sorting_index]
- sw = tmp_w[sorting_index]
- cw = np.concatenate((zero, sw.cumsum()))
- bin_index = _search_sorted_inclusive(sa, bin_edges)
- cum_n += cw[bin_index]
- n = np.diff(cum_n)
- if density:
- db = np.array(np.diff(bin_edges), float)
- return n/db/n.sum(), bin_edges
- return n, bin_edges
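- # Illustrative sketch (added for exposition, not part of the original module):
- # the core of the uniform-bin fast path above, reduced to a few lines: map each
- # value to a bin index with one multiply, clamp the value sitting on the last
- # edge, and accumulate with bincount. The ULP corrections, the block loop and
- # the weighted/complex branches are deliberately omitted.
- def _demo_fast_path(x=None, n_bins=4):
-     if x is None:
-         x = np.array([0.0, 0.5, 1.0, 2.0, 4.0])
-     first_edge, last_edge = x.min(), x.max()
-     norm = n_bins / (last_edge - first_edge)
-     indices = ((x - first_edge) * norm).astype(np.intp)
-     indices[indices == n_bins] -= 1     # the rightmost edge is inclusive
-     counts = np.bincount(indices, minlength=n_bins)
-     return counts, np.linspace(first_edge, last_edge, n_bins + 1)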
- def _histogramdd_dispatcher(sample, bins=None, range=None, density=None,
- weights=None):
- if hasattr(sample, 'shape'): # same condition as used in histogramdd
- yield sample
- else:
- yield from sample
- with contextlib.suppress(TypeError):
- yield from bins
- yield weights
- @array_function_dispatch(_histogramdd_dispatcher)
- def histogramdd(sample, bins=10, range=None, density=None, weights=None):
- """
- Compute the multidimensional histogram of some data.
- Parameters
- ----------
- sample : (N, D) array, or (N, D) array_like
- The data to be histogrammed.
- Note the unusual interpretation of sample when an array_like:
- * When an array, each row is a coordinate in a D-dimensional space -
- such as ``histogramdd(np.array([p1, p2, p3]))``.
- * When an array_like, each element is the list of values for a single
- coordinate - such as ``histogramdd((X, Y, Z))``.
- The first form should be preferred.
- bins : sequence or int, optional
- The bin specification:
- * A sequence of arrays describing the monotonically increasing bin
- edges along each dimension.
- * The number of bins for each dimension (nx, ny, ... =bins)
- * The number of bins for all dimensions (nx=ny=...=bins).
- range : sequence, optional
- A sequence of length D, each an optional (lower, upper) tuple giving
- the outer bin edges to be used if the edges are not given explicitly in
- `bins`.
- An entry of None in the sequence results in the minimum and maximum
- values being used for the corresponding dimension.
- The default, None, is equivalent to passing a tuple of D None values.
- density : bool, optional
- If False, the default, returns the number of samples in each bin.
- If True, returns the probability *density* function at the bin,
- ``bin_count / sample_count / bin_volume``.
- weights : (N,) array_like, optional
- An array of values `w_i` weighing each sample `(x_i, y_i, z_i, ...)`.
- Weights are normalized to 1 if density is True. If density is False,
- the values of the returned histogram are equal to the sum of the
- weights belonging to the samples falling into each bin.
- Returns
- -------
- H : ndarray
- The multidimensional histogram of sample x. See density and weights
- for the different possible semantics.
- edges : list
- A list of D arrays describing the bin edges for each dimension.
- See Also
- --------
- histogram: 1-D histogram
- histogram2d: 2-D histogram
- Examples
- --------
- >>> r = np.random.randn(100,3)
- >>> H, edges = np.histogramdd(r, bins = (5, 8, 4))
- >>> H.shape, edges[0].size, edges[1].size, edges[2].size
- ((5, 8, 4), 6, 9, 5)
- """
- try:
- # Sample is an ND-array.
- N, D = sample.shape
- except (AttributeError, ValueError):
- # Sample is a sequence of 1D arrays.
- sample = np.atleast_2d(sample).T
- N, D = sample.shape
- nbin = np.empty(D, np.intp)
- edges = D*[None]
- dedges = D*[None]
- if weights is not None:
- weights = np.asarray(weights)
- try:
- M = len(bins)
- if M != D:
- raise ValueError(
- 'The dimension of bins must be equal to the dimension of the '
- 'sample x.')
- except TypeError:
- # bins is an integer
- bins = D*[bins]
- # normalize the range argument
- if range is None:
- range = (None,) * D
- elif len(range) != D:
- raise ValueError('range argument must have one entry per dimension')
- # Create edge arrays
- for i in _range(D):
- if np.ndim(bins[i]) == 0:
- if bins[i] < 1:
- raise ValueError(
- '`bins[{}]` must be positive, when an integer'.format(i))
- smin, smax = _get_outer_edges(sample[:,i], range[i])
- try:
- n = operator.index(bins[i])
- except TypeError as e:
- raise TypeError(
- "`bins[{}]` must be an integer, when a scalar".format(i)
- ) from e
- edges[i] = np.linspace(smin, smax, n + 1)
- elif np.ndim(bins[i]) == 1:
- edges[i] = np.asarray(bins[i])
- if np.any(edges[i][:-1] > edges[i][1:]):
- raise ValueError(
- '`bins[{}]` must be monotonically increasing, when an array'
- .format(i))
- else:
- raise ValueError(
- '`bins[{}]` must be a scalar or 1d array'.format(i))
- nbin[i] = len(edges[i]) + 1 # includes an outlier on each end
- dedges[i] = np.diff(edges[i])
- # Compute the bin number each sample falls into.
- Ncount = tuple(
- # avoid np.digitize to work around gh-11022
- np.searchsorted(edges[i], sample[:, i], side='right')
- for i in _range(D)
- )
- # Using digitize, values that fall on an edge are put in the right bin.
- # For the rightmost bin, we want values equal to the right edge to be
- # counted in the last bin, and not as an outlier.
- for i in _range(D):
- # Find which points are on the rightmost edge.
- on_edge = (sample[:, i] == edges[i][-1])
- # Shift these points one bin to the left.
- Ncount[i][on_edge] -= 1
- # Compute the sample indices in the flattened histogram matrix.
- # This raises an error if the array is too large.
- xy = np.ravel_multi_index(Ncount, nbin)
- # Compute the number of repetitions in xy and assign it to the
- # flattened histmat.
- hist = np.bincount(xy, weights, minlength=nbin.prod())
- # Shape into a proper matrix
- hist = hist.reshape(nbin)
- # This preserves the (bad) behavior observed in gh-7845, for now.
- hist = hist.astype(float, casting='safe')
- # Remove outliers (indices 0 and -1 for each dimension).
- core = D*(slice(1, -1),)
- hist = hist[core]
- if density:
- # calculate the probability density function
- s = hist.sum()
- for i in _range(D):
- shape = np.ones(D, int)
- shape[i] = nbin[i] - 2
- hist = hist / dedges[i].reshape(shape)
- hist /= s
- if (hist.shape != nbin - 2).any():
- raise RuntimeError(
- "Internal Shape Error")
- return hist, edges
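- # Illustrative sketch (added for exposition, not part of the original module):
- # the indexing scheme used above on a tiny 2-D sample: searchsorted with
- # side='right' against edges that gain an outlier bin on each end, then
- # ravel_multi_index + bincount to accumulate, then the outlier slices dropped.
- # (Values sitting exactly on the rightmost edge would need the shift applied
- # above; none occur in this sample.)
- def _demo_histogramdd_indexing():
-     sample = np.array([[0.1, 0.1], [0.9, 0.9], [0.9, 0.1]])
-     edges = [np.array([0.0, 0.5, 1.0]), np.array([0.0, 0.5, 1.0])]
-     nbin = np.array([len(e) + 1 for e in edges])   # outlier bin on each end
-     ncount = tuple(np.searchsorted(edges[i], sample[:, i], side='right')
-                    for i in _range(2))
-     xy = np.ravel_multi_index(ncount, nbin)
-     hist = np.bincount(xy, minlength=nbin.prod()).reshape(nbin)
-     core = 2 * (slice(1, -1),)
-     return hist[core]   # -> [[1, 0], [1, 1]]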
|