_binned_statistic.py 32 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795
  1. import builtins
  2. import numpy as np
  3. from numpy.testing import suppress_warnings
  4. from operator import index
  5. from collections import namedtuple
  6. __all__ = ['binned_statistic',
  7. 'binned_statistic_2d',
  8. 'binned_statistic_dd']
  9. BinnedStatisticResult = namedtuple('BinnedStatisticResult',
  10. ('statistic', 'bin_edges', 'binnumber'))
  11. def binned_statistic(x, values, statistic='mean',
  12. bins=10, range=None):
  13. """
  14. Compute a binned statistic for one or more sets of data.
  15. This is a generalization of a histogram function. A histogram divides
  16. the space into bins, and returns the count of the number of points in
  17. each bin. This function allows the computation of the sum, mean, median,
  18. or other statistic of the values (or set of values) within each bin.
  19. Parameters
  20. ----------
  21. x : (N,) array_like
  22. A sequence of values to be binned.
  23. values : (N,) array_like or list of (N,) array_like
  24. The data on which the statistic will be computed. This must be
  25. the same shape as `x`, or a set of sequences - each the same shape as
  26. `x`. If `values` is a set of sequences, the statistic will be computed
  27. on each independently.
  28. statistic : string or callable, optional
  29. The statistic to compute (default is 'mean').
  30. The following statistics are available:
  31. * 'mean' : compute the mean of values for points within each bin.
  32. Empty bins will be represented by NaN.
  33. * 'std' : compute the standard deviation within each bin. This
  34. is implicitly calculated with ddof=0.
  35. * 'median' : compute the median of values for points within each
  36. bin. Empty bins will be represented by NaN.
  37. * 'count' : compute the count of points within each bin. This is
  38. identical to an unweighted histogram. `values` array is not
  39. referenced.
  40. * 'sum' : compute the sum of values for points within each bin.
  41. This is identical to a weighted histogram.
  42. * 'min' : compute the minimum of values for points within each bin.
  43. Empty bins will be represented by NaN.
  44. * 'max' : compute the maximum of values for point within each bin.
  45. Empty bins will be represented by NaN.
  46. * function : a user-defined function which takes a 1D array of
  47. values, and outputs a single numerical statistic. This function
  48. will be called on the values in each bin. Empty bins will be
  49. represented by function([]), or NaN if this returns an error.
  50. bins : int or sequence of scalars, optional
  51. If `bins` is an int, it defines the number of equal-width bins in the
  52. given range (10 by default). If `bins` is a sequence, it defines the
  53. bin edges, including the rightmost edge, allowing for non-uniform bin
  54. widths. Values in `x` that are smaller than lowest bin edge are
  55. assigned to bin number 0, values beyond the highest bin are assigned to
  56. ``bins[-1]``. If the bin edges are specified, the number of bins will
  57. be, (nx = len(bins)-1).
  58. range : (float, float) or [(float, float)], optional
  59. The lower and upper range of the bins. If not provided, range
  60. is simply ``(x.min(), x.max())``. Values outside the range are
  61. ignored.
  62. Returns
  63. -------
  64. statistic : array
  65. The values of the selected statistic in each bin.
  66. bin_edges : array of dtype float
  67. Return the bin edges ``(length(statistic)+1)``.
  68. binnumber: 1-D ndarray of ints
  69. Indices of the bins (corresponding to `bin_edges`) in which each value
  70. of `x` belongs. Same length as `values`. A binnumber of `i` means the
  71. corresponding value is between (bin_edges[i-1], bin_edges[i]).
  72. See Also
  73. --------
  74. numpy.digitize, numpy.histogram, binned_statistic_2d, binned_statistic_dd
  75. Notes
  76. -----
  77. All but the last (righthand-most) bin is half-open. In other words, if
  78. `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
  79. but excluding 2) and the second ``[2, 3)``. The last bin, however, is
  80. ``[3, 4]``, which *includes* 4.
  81. .. versionadded:: 0.11.0
  82. Examples
  83. --------
  84. >>> import numpy as np
  85. >>> from scipy import stats
  86. >>> import matplotlib.pyplot as plt
  87. First some basic examples:
  88. Create two evenly spaced bins in the range of the given sample, and sum the
  89. corresponding values in each of those bins:
  90. >>> values = [1.0, 1.0, 2.0, 1.5, 3.0]
  91. >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
  92. BinnedStatisticResult(statistic=array([4. , 4.5]),
  93. bin_edges=array([1., 4., 7.]), binnumber=array([1, 1, 1, 2, 2]))
  94. Multiple arrays of values can also be passed. The statistic is calculated
  95. on each set independently:
  96. >>> values = [[1.0, 1.0, 2.0, 1.5, 3.0], [2.0, 2.0, 4.0, 3.0, 6.0]]
  97. >>> stats.binned_statistic([1, 1, 2, 5, 7], values, 'sum', bins=2)
  98. BinnedStatisticResult(statistic=array([[4. , 4.5],
  99. [8. , 9. ]]), bin_edges=array([1., 4., 7.]),
  100. binnumber=array([1, 1, 1, 2, 2]))
  101. >>> stats.binned_statistic([1, 2, 1, 2, 4], np.arange(5), statistic='mean',
  102. ... bins=3)
  103. BinnedStatisticResult(statistic=array([1., 2., 4.]),
  104. bin_edges=array([1., 2., 3., 4.]),
  105. binnumber=array([1, 2, 1, 2, 3]))
  106. As a second example, we now generate some random data of sailing boat speed
  107. as a function of wind speed, and then determine how fast our boat is for
  108. certain wind speeds:
  109. >>> rng = np.random.default_rng()
  110. >>> windspeed = 8 * rng.random(500)
  111. >>> boatspeed = .3 * windspeed**.5 + .2 * rng.random(500)
  112. >>> bin_means, bin_edges, binnumber = stats.binned_statistic(windspeed,
  113. ... boatspeed, statistic='median', bins=[1,2,3,4,5,6,7])
  114. >>> plt.figure()
  115. >>> plt.plot(windspeed, boatspeed, 'b.', label='raw data')
  116. >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=5,
  117. ... label='binned statistic of data')
  118. >>> plt.legend()
  119. Now we can use ``binnumber`` to select all datapoints with a windspeed
  120. below 1:
  121. >>> low_boatspeed = boatspeed[binnumber == 0]
  122. As a final example, we will use ``bin_edges`` and ``binnumber`` to make a
  123. plot of a distribution that shows the mean and distribution around that
  124. mean per bin, on top of a regular histogram and the probability
  125. distribution function:
  126. >>> x = np.linspace(0, 5, num=500)
  127. >>> x_pdf = stats.maxwell.pdf(x)
  128. >>> samples = stats.maxwell.rvs(size=10000)
  129. >>> bin_means, bin_edges, binnumber = stats.binned_statistic(x, x_pdf,
  130. ... statistic='mean', bins=25)
  131. >>> bin_width = (bin_edges[1] - bin_edges[0])
  132. >>> bin_centers = bin_edges[1:] - bin_width/2
  133. >>> plt.figure()
  134. >>> plt.hist(samples, bins=50, density=True, histtype='stepfilled',
  135. ... alpha=0.2, label='histogram of data')
  136. >>> plt.plot(x, x_pdf, 'r-', label='analytical pdf')
  137. >>> plt.hlines(bin_means, bin_edges[:-1], bin_edges[1:], colors='g', lw=2,
  138. ... label='binned statistic of data')
  139. >>> plt.plot((binnumber - 0.5) * bin_width, x_pdf, 'g.', alpha=0.5)
  140. >>> plt.legend(fontsize=10)
  141. >>> plt.show()
  142. """
  143. try:
  144. N = len(bins)
  145. except TypeError:
  146. N = 1
  147. if N != 1:
  148. bins = [np.asarray(bins, float)]
  149. if range is not None:
  150. if len(range) == 2:
  151. range = [range]
  152. medians, edges, binnumbers = binned_statistic_dd(
  153. [x], values, statistic, bins, range)
  154. return BinnedStatisticResult(medians, edges[0], binnumbers)
  155. BinnedStatistic2dResult = namedtuple('BinnedStatistic2dResult',
  156. ('statistic', 'x_edge', 'y_edge',
  157. 'binnumber'))
  158. def binned_statistic_2d(x, y, values, statistic='mean',
  159. bins=10, range=None, expand_binnumbers=False):
  160. """
  161. Compute a bidimensional binned statistic for one or more sets of data.
  162. This is a generalization of a histogram2d function. A histogram divides
  163. the space into bins, and returns the count of the number of points in
  164. each bin. This function allows the computation of the sum, mean, median,
  165. or other statistic of the values (or set of values) within each bin.
  166. Parameters
  167. ----------
  168. x : (N,) array_like
  169. A sequence of values to be binned along the first dimension.
  170. y : (N,) array_like
  171. A sequence of values to be binned along the second dimension.
  172. values : (N,) array_like or list of (N,) array_like
  173. The data on which the statistic will be computed. This must be
  174. the same shape as `x`, or a list of sequences - each with the same
  175. shape as `x`. If `values` is such a list, the statistic will be
  176. computed on each independently.
  177. statistic : string or callable, optional
  178. The statistic to compute (default is 'mean').
  179. The following statistics are available:
  180. * 'mean' : compute the mean of values for points within each bin.
  181. Empty bins will be represented by NaN.
  182. * 'std' : compute the standard deviation within each bin. This
  183. is implicitly calculated with ddof=0.
  184. * 'median' : compute the median of values for points within each
  185. bin. Empty bins will be represented by NaN.
  186. * 'count' : compute the count of points within each bin. This is
  187. identical to an unweighted histogram. `values` array is not
  188. referenced.
  189. * 'sum' : compute the sum of values for points within each bin.
  190. This is identical to a weighted histogram.
  191. * 'min' : compute the minimum of values for points within each bin.
  192. Empty bins will be represented by NaN.
  193. * 'max' : compute the maximum of values for point within each bin.
  194. Empty bins will be represented by NaN.
  195. * function : a user-defined function which takes a 1D array of
  196. values, and outputs a single numerical statistic. This function
  197. will be called on the values in each bin. Empty bins will be
  198. represented by function([]), or NaN if this returns an error.
  199. bins : int or [int, int] or array_like or [array, array], optional
  200. The bin specification:
  201. * the number of bins for the two dimensions (nx = ny = bins),
  202. * the number of bins in each dimension (nx, ny = bins),
  203. * the bin edges for the two dimensions (x_edge = y_edge = bins),
  204. * the bin edges in each dimension (x_edge, y_edge = bins).
  205. If the bin edges are specified, the number of bins will be,
  206. (nx = len(x_edge)-1, ny = len(y_edge)-1).
  207. range : (2,2) array_like, optional
  208. The leftmost and rightmost edges of the bins along each dimension
  209. (if not specified explicitly in the `bins` parameters):
  210. [[xmin, xmax], [ymin, ymax]]. All values outside of this range will be
  211. considered outliers and not tallied in the histogram.
  212. expand_binnumbers : bool, optional
  213. 'False' (default): the returned `binnumber` is a shape (N,) array of
  214. linearized bin indices.
  215. 'True': the returned `binnumber` is 'unraveled' into a shape (2,N)
  216. ndarray, where each row gives the bin numbers in the corresponding
  217. dimension.
  218. See the `binnumber` returned value, and the `Examples` section.
  219. .. versionadded:: 0.17.0
  220. Returns
  221. -------
  222. statistic : (nx, ny) ndarray
  223. The values of the selected statistic in each two-dimensional bin.
  224. x_edge : (nx + 1) ndarray
  225. The bin edges along the first dimension.
  226. y_edge : (ny + 1) ndarray
  227. The bin edges along the second dimension.
  228. binnumber : (N,) array of ints or (2,N) ndarray of ints
  229. This assigns to each element of `sample` an integer that represents the
  230. bin in which this observation falls. The representation depends on the
  231. `expand_binnumbers` argument. See `Notes` for details.
  232. See Also
  233. --------
  234. numpy.digitize, numpy.histogram2d, binned_statistic, binned_statistic_dd
  235. Notes
  236. -----
  237. Binedges:
  238. All but the last (righthand-most) bin is half-open. In other words, if
  239. `bins` is ``[1, 2, 3, 4]``, then the first bin is ``[1, 2)`` (including 1,
  240. but excluding 2) and the second ``[2, 3)``. The last bin, however, is
  241. ``[3, 4]``, which *includes* 4.
  242. `binnumber`:
  243. This returned argument assigns to each element of `sample` an integer that
  244. represents the bin in which it belongs. The representation depends on the
  245. `expand_binnumbers` argument. If 'False' (default): The returned
  246. `binnumber` is a shape (N,) array of linearized indices mapping each
  247. element of `sample` to its corresponding bin (using row-major ordering).
  248. Note that the returned linearized bin indices are used for an array with
  249. extra bins on the outer binedges to capture values outside of the defined
  250. bin bounds.
  251. If 'True': The returned `binnumber` is a shape (2,N) ndarray where
  252. each row indicates bin placements for each dimension respectively. In each
  253. dimension, a binnumber of `i` means the corresponding value is between
  254. (D_edge[i-1], D_edge[i]), where 'D' is either 'x' or 'y'.
  255. .. versionadded:: 0.11.0
  256. Examples
  257. --------
  258. >>> from scipy import stats
  259. Calculate the counts with explicit bin-edges:
  260. >>> x = [0.1, 0.1, 0.1, 0.6]
  261. >>> y = [2.1, 2.6, 2.1, 2.1]
  262. >>> binx = [0.0, 0.5, 1.0]
  263. >>> biny = [2.0, 2.5, 3.0]
  264. >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny])
  265. >>> ret.statistic
  266. array([[2., 1.],
  267. [1., 0.]])
  268. The bin in which each sample is placed is given by the `binnumber`
  269. returned parameter. By default, these are the linearized bin indices:
  270. >>> ret.binnumber
  271. array([5, 6, 5, 9])
  272. The bin indices can also be expanded into separate entries for each
  273. dimension using the `expand_binnumbers` parameter:
  274. >>> ret = stats.binned_statistic_2d(x, y, None, 'count', bins=[binx, biny],
  275. ... expand_binnumbers=True)
  276. >>> ret.binnumber
  277. array([[1, 1, 1, 2],
  278. [1, 2, 1, 1]])
  279. Which shows that the first three elements belong in the xbin 1, and the
  280. fourth into xbin 2; and so on for y.
  281. """
  282. # This code is based on np.histogram2d
  283. try:
  284. N = len(bins)
  285. except TypeError:
  286. N = 1
  287. if N != 1 and N != 2:
  288. xedges = yedges = np.asarray(bins, float)
  289. bins = [xedges, yedges]
  290. medians, edges, binnumbers = binned_statistic_dd(
  291. [x, y], values, statistic, bins, range,
  292. expand_binnumbers=expand_binnumbers)
  293. return BinnedStatistic2dResult(medians, edges[0], edges[1], binnumbers)
  294. BinnedStatisticddResult = namedtuple('BinnedStatisticddResult',
  295. ('statistic', 'bin_edges',
  296. 'binnumber'))
  297. def _bincount(x, weights):
  298. if np.iscomplexobj(weights):
  299. a = np.bincount(x, np.real(weights))
  300. b = np.bincount(x, np.imag(weights))
  301. z = a + b*1j
  302. else:
  303. z = np.bincount(x, weights)
  304. return z
  305. def binned_statistic_dd(sample, values, statistic='mean',
  306. bins=10, range=None, expand_binnumbers=False,
  307. binned_statistic_result=None):
  308. """
  309. Compute a multidimensional binned statistic for a set of data.
  310. This is a generalization of a histogramdd function. A histogram divides
  311. the space into bins, and returns the count of the number of points in
  312. each bin. This function allows the computation of the sum, mean, median,
  313. or other statistic of the values within each bin.
  314. Parameters
  315. ----------
  316. sample : array_like
  317. Data to histogram passed as a sequence of N arrays of length D, or
  318. as an (N,D) array.
  319. values : (N,) array_like or list of (N,) array_like
  320. The data on which the statistic will be computed. This must be
  321. the same shape as `sample`, or a list of sequences - each with the
  322. same shape as `sample`. If `values` is such a list, the statistic
  323. will be computed on each independently.
  324. statistic : string or callable, optional
  325. The statistic to compute (default is 'mean').
  326. The following statistics are available:
  327. * 'mean' : compute the mean of values for points within each bin.
  328. Empty bins will be represented by NaN.
  329. * 'median' : compute the median of values for points within each
  330. bin. Empty bins will be represented by NaN.
  331. * 'count' : compute the count of points within each bin. This is
  332. identical to an unweighted histogram. `values` array is not
  333. referenced.
  334. * 'sum' : compute the sum of values for points within each bin.
  335. This is identical to a weighted histogram.
  336. * 'std' : compute the standard deviation within each bin. This
  337. is implicitly calculated with ddof=0. If the number of values
  338. within a given bin is 0 or 1, the computed standard deviation value
  339. will be 0 for the bin.
  340. * 'min' : compute the minimum of values for points within each bin.
  341. Empty bins will be represented by NaN.
  342. * 'max' : compute the maximum of values for point within each bin.
  343. Empty bins will be represented by NaN.
  344. * function : a user-defined function which takes a 1D array of
  345. values, and outputs a single numerical statistic. This function
  346. will be called on the values in each bin. Empty bins will be
  347. represented by function([]), or NaN if this returns an error.
  348. bins : sequence or positive int, optional
  349. The bin specification must be in one of the following forms:
  350. * A sequence of arrays describing the bin edges along each dimension.
  351. * The number of bins for each dimension (nx, ny, ... = bins).
  352. * The number of bins for all dimensions (nx = ny = ... = bins).
  353. range : sequence, optional
  354. A sequence of lower and upper bin edges to be used if the edges are
  355. not given explicitly in `bins`. Defaults to the minimum and maximum
  356. values along each dimension.
  357. expand_binnumbers : bool, optional
  358. 'False' (default): the returned `binnumber` is a shape (N,) array of
  359. linearized bin indices.
  360. 'True': the returned `binnumber` is 'unraveled' into a shape (D,N)
  361. ndarray, where each row gives the bin numbers in the corresponding
  362. dimension.
  363. See the `binnumber` returned value, and the `Examples` section of
  364. `binned_statistic_2d`.
  365. binned_statistic_result : binnedStatisticddResult
  366. Result of a previous call to the function in order to reuse bin edges
  367. and bin numbers with new values and/or a different statistic.
  368. To reuse bin numbers, `expand_binnumbers` must have been set to False
  369. (the default)
  370. .. versionadded:: 0.17.0
  371. Returns
  372. -------
  373. statistic : ndarray, shape(nx1, nx2, nx3,...)
  374. The values of the selected statistic in each two-dimensional bin.
  375. bin_edges : list of ndarrays
  376. A list of D arrays describing the (nxi + 1) bin edges for each
  377. dimension.
  378. binnumber : (N,) array of ints or (D,N) ndarray of ints
  379. This assigns to each element of `sample` an integer that represents the
  380. bin in which this observation falls. The representation depends on the
  381. `expand_binnumbers` argument. See `Notes` for details.
  382. See Also
  383. --------
  384. numpy.digitize, numpy.histogramdd, binned_statistic, binned_statistic_2d
  385. Notes
  386. -----
  387. Binedges:
  388. All but the last (righthand-most) bin is half-open in each dimension. In
  389. other words, if `bins` is ``[1, 2, 3, 4]``, then the first bin is
  390. ``[1, 2)`` (including 1, but excluding 2) and the second ``[2, 3)``. The
  391. last bin, however, is ``[3, 4]``, which *includes* 4.
  392. `binnumber`:
  393. This returned argument assigns to each element of `sample` an integer that
  394. represents the bin in which it belongs. The representation depends on the
  395. `expand_binnumbers` argument. If 'False' (default): The returned
  396. `binnumber` is a shape (N,) array of linearized indices mapping each
  397. element of `sample` to its corresponding bin (using row-major ordering).
  398. If 'True': The returned `binnumber` is a shape (D,N) ndarray where
  399. each row indicates bin placements for each dimension respectively. In each
  400. dimension, a binnumber of `i` means the corresponding value is between
  401. (bin_edges[D][i-1], bin_edges[D][i]), for each dimension 'D'.
  402. .. versionadded:: 0.11.0
  403. Examples
  404. --------
  405. >>> import numpy as np
  406. >>> from scipy import stats
  407. >>> import matplotlib.pyplot as plt
  408. >>> from mpl_toolkits.mplot3d import Axes3D
  409. Take an array of 600 (x, y) coordinates as an example.
  410. `binned_statistic_dd` can handle arrays of higher dimension `D`. But a plot
  411. of dimension `D+1` is required.
  412. >>> mu = np.array([0., 1.])
  413. >>> sigma = np.array([[1., -0.5],[-0.5, 1.5]])
  414. >>> multinormal = stats.multivariate_normal(mu, sigma)
  415. >>> data = multinormal.rvs(size=600, random_state=235412)
  416. >>> data.shape
  417. (600, 2)
  418. Create bins and count how many arrays fall in each bin:
  419. >>> N = 60
  420. >>> x = np.linspace(-3, 3, N)
  421. >>> y = np.linspace(-3, 4, N)
  422. >>> ret = stats.binned_statistic_dd(data, np.arange(600), bins=[x, y],
  423. ... statistic='count')
  424. >>> bincounts = ret.statistic
  425. Set the volume and the location of bars:
  426. >>> dx = x[1] - x[0]
  427. >>> dy = y[1] - y[0]
  428. >>> x, y = np.meshgrid(x[:-1]+dx/2, y[:-1]+dy/2)
  429. >>> z = 0
  430. >>> bincounts = bincounts.ravel()
  431. >>> x = x.ravel()
  432. >>> y = y.ravel()
  433. >>> fig = plt.figure()
  434. >>> ax = fig.add_subplot(111, projection='3d')
  435. >>> with np.errstate(divide='ignore'): # silence random axes3d warning
  436. ... ax.bar3d(x, y, z, dx, dy, bincounts)
  437. Reuse bin numbers and bin edges with new values:
  438. >>> ret2 = stats.binned_statistic_dd(data, -np.arange(600),
  439. ... binned_statistic_result=ret,
  440. ... statistic='mean')
  441. """
  442. known_stats = ['mean', 'median', 'count', 'sum', 'std', 'min', 'max']
  443. if not callable(statistic) and statistic not in known_stats:
  444. raise ValueError('invalid statistic %r' % (statistic,))
  445. try:
  446. bins = index(bins)
  447. except TypeError:
  448. # bins is not an integer
  449. pass
  450. # If bins was an integer-like object, now it is an actual Python int.
  451. # NOTE: for _bin_edges(), see e.g. gh-11365
  452. if isinstance(bins, int) and not np.isfinite(sample).all():
  453. raise ValueError('%r contains non-finite values.' % (sample,))
  454. # `Ndim` is the number of dimensions (e.g. `2` for `binned_statistic_2d`)
  455. # `Dlen` is the length of elements along each dimension.
  456. # This code is based on np.histogramdd
  457. try:
  458. # `sample` is an ND-array.
  459. Dlen, Ndim = sample.shape
  460. except (AttributeError, ValueError):
  461. # `sample` is a sequence of 1D arrays.
  462. sample = np.atleast_2d(sample).T
  463. Dlen, Ndim = sample.shape
  464. # Store initial shape of `values` to preserve it in the output
  465. values = np.asarray(values)
  466. input_shape = list(values.shape)
  467. # Make sure that `values` is 2D to iterate over rows
  468. values = np.atleast_2d(values)
  469. Vdim, Vlen = values.shape
  470. # Make sure `values` match `sample`
  471. if statistic != 'count' and Vlen != Dlen:
  472. raise AttributeError('The number of `values` elements must match the '
  473. 'length of each `sample` dimension.')
  474. try:
  475. M = len(bins)
  476. if M != Ndim:
  477. raise AttributeError('The dimension of bins must be equal '
  478. 'to the dimension of the sample x.')
  479. except TypeError:
  480. bins = Ndim * [bins]
  481. if binned_statistic_result is None:
  482. nbin, edges, dedges = _bin_edges(sample, bins, range)
  483. binnumbers = _bin_numbers(sample, nbin, edges, dedges)
  484. else:
  485. edges = binned_statistic_result.bin_edges
  486. nbin = np.array([len(edges[i]) + 1 for i in builtins.range(Ndim)])
  487. # +1 for outlier bins
  488. dedges = [np.diff(edges[i]) for i in builtins.range(Ndim)]
  489. binnumbers = binned_statistic_result.binnumber
  490. # Avoid overflow with double precision. Complex `values` -> `complex128`.
  491. result_type = np.result_type(values, np.float64)
  492. result = np.empty([Vdim, nbin.prod()], dtype=result_type)
  493. if statistic in {'mean', np.mean}:
  494. result.fill(np.nan)
  495. flatcount = _bincount(binnumbers, None)
  496. a = flatcount.nonzero()
  497. for vv in builtins.range(Vdim):
  498. flatsum = _bincount(binnumbers, values[vv])
  499. result[vv, a] = flatsum[a] / flatcount[a]
  500. elif statistic in {'std', np.std}:
  501. result.fill(np.nan)
  502. flatcount = _bincount(binnumbers, None)
  503. a = flatcount.nonzero()
  504. for vv in builtins.range(Vdim):
  505. flatsum = _bincount(binnumbers, values[vv])
  506. delta = values[vv] - flatsum[binnumbers] / flatcount[binnumbers]
  507. std = np.sqrt(
  508. _bincount(binnumbers, delta*np.conj(delta))[a] / flatcount[a]
  509. )
  510. result[vv, a] = std
  511. result = np.real(result)
  512. elif statistic == 'count':
  513. result = np.empty([Vdim, nbin.prod()], dtype=np.float64)
  514. result.fill(0)
  515. flatcount = _bincount(binnumbers, None)
  516. a = np.arange(len(flatcount))
  517. result[:, a] = flatcount[np.newaxis, :]
  518. elif statistic in {'sum', np.sum}:
  519. result.fill(0)
  520. for vv in builtins.range(Vdim):
  521. flatsum = _bincount(binnumbers, values[vv])
  522. a = np.arange(len(flatsum))
  523. result[vv, a] = flatsum
  524. elif statistic in {'median', np.median}:
  525. result.fill(np.nan)
  526. for vv in builtins.range(Vdim):
  527. i = np.lexsort((values[vv], binnumbers))
  528. _, j, counts = np.unique(binnumbers[i],
  529. return_index=True, return_counts=True)
  530. mid = j + (counts - 1) / 2
  531. mid_a = values[vv, i][np.floor(mid).astype(int)]
  532. mid_b = values[vv, i][np.ceil(mid).astype(int)]
  533. medians = (mid_a + mid_b) / 2
  534. result[vv, binnumbers[i][j]] = medians
  535. elif statistic in {'min', np.min}:
  536. result.fill(np.nan)
  537. for vv in builtins.range(Vdim):
  538. i = np.argsort(values[vv])[::-1] # Reversed so the min is last
  539. result[vv, binnumbers[i]] = values[vv, i]
  540. elif statistic in {'max', np.max}:
  541. result.fill(np.nan)
  542. for vv in builtins.range(Vdim):
  543. i = np.argsort(values[vv])
  544. result[vv, binnumbers[i]] = values[vv, i]
  545. elif callable(statistic):
  546. with np.errstate(invalid='ignore'), suppress_warnings() as sup:
  547. sup.filter(RuntimeWarning)
  548. try:
  549. null = statistic([])
  550. except Exception:
  551. null = np.nan
  552. if np.iscomplexobj(null):
  553. result = result.astype(np.complex128)
  554. result.fill(null)
  555. try:
  556. _calc_binned_statistic(
  557. Vdim, binnumbers, result, values, statistic
  558. )
  559. except ValueError:
  560. result = result.astype(np.complex128)
  561. _calc_binned_statistic(
  562. Vdim, binnumbers, result, values, statistic
  563. )
  564. # Shape into a proper matrix
  565. result = result.reshape(np.append(Vdim, nbin))
  566. # Remove outliers (indices 0 and -1 for each bin-dimension).
  567. core = tuple([slice(None)] + Ndim * [slice(1, -1)])
  568. result = result[core]
  569. # Unravel binnumbers into an ndarray, each row the bins for each dimension
  570. if expand_binnumbers and Ndim > 1:
  571. binnumbers = np.asarray(np.unravel_index(binnumbers, nbin))
  572. if np.any(result.shape[1:] != nbin - 2):
  573. raise RuntimeError('Internal Shape Error')
  574. # Reshape to have output (`result`) match input (`values`) shape
  575. result = result.reshape(input_shape[:-1] + list(nbin-2))
  576. return BinnedStatisticddResult(result, edges, binnumbers)
  577. def _calc_binned_statistic(Vdim, bin_numbers, result, values, stat_func):
  578. unique_bin_numbers = np.unique(bin_numbers)
  579. for vv in builtins.range(Vdim):
  580. bin_map = _create_binned_data(bin_numbers, unique_bin_numbers,
  581. values, vv)
  582. for i in unique_bin_numbers:
  583. stat = stat_func(np.array(bin_map[i]))
  584. if np.iscomplexobj(stat) and not np.iscomplexobj(result):
  585. raise ValueError("The statistic function returns complex ")
  586. result[vv, i] = stat
  587. def _create_binned_data(bin_numbers, unique_bin_numbers, values, vv):
  588. """ Create hashmap of bin ids to values in bins
  589. key: bin number
  590. value: list of binned data
  591. """
  592. bin_map = dict()
  593. for i in unique_bin_numbers:
  594. bin_map[i] = []
  595. for i in builtins.range(len(bin_numbers)):
  596. bin_map[bin_numbers[i]].append(values[vv, i])
  597. return bin_map
  598. def _bin_edges(sample, bins=None, range=None):
  599. """ Create edge arrays
  600. """
  601. Dlen, Ndim = sample.shape
  602. nbin = np.empty(Ndim, int) # Number of bins in each dimension
  603. edges = Ndim * [None] # Bin edges for each dim (will be 2D array)
  604. dedges = Ndim * [None] # Spacing between edges (will be 2D array)
  605. # Select range for each dimension
  606. # Used only if number of bins is given.
  607. if range is None:
  608. smin = np.atleast_1d(np.array(sample.min(axis=0), float))
  609. smax = np.atleast_1d(np.array(sample.max(axis=0), float))
  610. else:
  611. if len(range) != Ndim:
  612. raise ValueError(
  613. f"range given for {len(range)} dimensions; {Ndim} required")
  614. smin = np.empty(Ndim)
  615. smax = np.empty(Ndim)
  616. for i in builtins.range(Ndim):
  617. if range[i][1] < range[i][0]:
  618. raise ValueError(
  619. "In {}range, start must be <= stop".format(
  620. f"dimension {i + 1} of " if Ndim > 1 else ""))
  621. smin[i], smax[i] = range[i]
  622. # Make sure the bins have a finite width.
  623. for i in builtins.range(len(smin)):
  624. if smin[i] == smax[i]:
  625. smin[i] = smin[i] - .5
  626. smax[i] = smax[i] + .5
  627. # Preserve sample floating point precision in bin edges
  628. edges_dtype = (sample.dtype if np.issubdtype(sample.dtype, np.floating)
  629. else float)
  630. # Create edge arrays
  631. for i in builtins.range(Ndim):
  632. if np.isscalar(bins[i]):
  633. nbin[i] = bins[i] + 2 # +2 for outlier bins
  634. edges[i] = np.linspace(smin[i], smax[i], nbin[i] - 1,
  635. dtype=edges_dtype)
  636. else:
  637. edges[i] = np.asarray(bins[i], edges_dtype)
  638. nbin[i] = len(edges[i]) + 1 # +1 for outlier bins
  639. dedges[i] = np.diff(edges[i])
  640. nbin = np.asarray(nbin)
  641. return nbin, edges, dedges
  642. def _bin_numbers(sample, nbin, edges, dedges):
  643. """Compute the bin number each sample falls into, in each dimension
  644. """
  645. Dlen, Ndim = sample.shape
  646. sampBin = [
  647. np.digitize(sample[:, i], edges[i])
  648. for i in range(Ndim)
  649. ]
  650. # Using `digitize`, values that fall on an edge are put in the right bin.
  651. # For the rightmost bin, we want values equal to the right
  652. # edge to be counted in the last bin, and not as an outlier.
  653. for i in range(Ndim):
  654. # Find the rounding precision
  655. dedges_min = dedges[i].min()
  656. if dedges_min == 0:
  657. raise ValueError('The smallest edge difference is numerically 0.')
  658. decimal = int(-np.log10(dedges_min)) + 6
  659. # Find which points are on the rightmost edge.
  660. on_edge = np.where((sample[:, i] >= edges[i][-1]) &
  661. (np.around(sample[:, i], decimal) ==
  662. np.around(edges[i][-1], decimal)))[0]
  663. # Shift these points one bin to the left.
  664. sampBin[i][on_edge] -= 1
  665. # Compute the sample indices in the flattened statistic matrix.
  666. binnumbers = np.ravel_multi_index(sampBin, nbin)
  667. return binnumbers