_kde.py 24 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725
  1. #-------------------------------------------------------------------------------
  2. #
  3. # Define classes for (uni/multi)-variate kernel density estimation.
  4. #
  5. # Currently, only Gaussian kernels are implemented.
  6. #
  7. # Written by: Robert Kern
  8. #
  9. # Date: 2004-08-09
  10. #
  11. # Modified: 2005-02-10 by Robert Kern.
  12. # Contributed to SciPy
  13. # 2005-10-07 by Robert Kern.
  14. # Some fixes to match the new scipy_core
  15. #
  16. # Copyright 2004-2005 by Enthought, Inc.
  17. #
  18. #-------------------------------------------------------------------------------
  19. # Standard library imports.
  20. import warnings
  21. # SciPy imports.
  22. from scipy import linalg, special
  23. from scipy._lib._util import check_random_state
  24. from numpy import (asarray, atleast_2d, reshape, zeros, newaxis, exp, pi,
  25. sqrt, ravel, power, atleast_1d, squeeze, sum, transpose,
  26. ones, cov)
  27. import numpy as np
  28. # Local imports.
  29. from . import _mvn
  30. from ._stats import gaussian_kernel_estimate, gaussian_kernel_estimate_log
  31. __all__ = ['gaussian_kde']
  32. class gaussian_kde:
  33. """Representation of a kernel-density estimate using Gaussian kernels.
  34. Kernel density estimation is a way to estimate the probability density
  35. function (PDF) of a random variable in a non-parametric way.
  36. `gaussian_kde` works for both uni-variate and multi-variate data. It
  37. includes automatic bandwidth determination. The estimation works best for
  38. a unimodal distribution; bimodal or multi-modal distributions tend to be
  39. oversmoothed.
  40. Parameters
  41. ----------
  42. dataset : array_like
  43. Datapoints to estimate from. In case of univariate data this is a 1-D
  44. array, otherwise a 2-D array with shape (# of dims, # of data).
  45. bw_method : str, scalar or callable, optional
  46. The method used to calculate the estimator bandwidth. This can be
  47. 'scott', 'silverman', a scalar constant or a callable. If a scalar,
  48. this will be used directly as `kde.factor`. If a callable, it should
  49. take a `gaussian_kde` instance as only parameter and return a scalar.
  50. If None (default), 'scott' is used. See Notes for more details.
  51. weights : array_like, optional
  52. weights of datapoints. This must be the same shape as dataset.
  53. If None (default), the samples are assumed to be equally weighted
  54. Attributes
  55. ----------
  56. dataset : ndarray
  57. The dataset with which `gaussian_kde` was initialized.
  58. d : int
  59. Number of dimensions.
  60. n : int
  61. Number of datapoints.
  62. neff : int
  63. Effective number of datapoints.
  64. .. versionadded:: 1.2.0
  65. factor : float
  66. The bandwidth factor, obtained from `kde.covariance_factor`. The square
  67. of `kde.factor` multiplies the covariance matrix of the data in the kde
  68. estimation.
  69. covariance : ndarray
  70. The covariance matrix of `dataset`, scaled by the calculated bandwidth
  71. (`kde.factor`).
  72. inv_cov : ndarray
  73. The inverse of `covariance`.
  74. Methods
  75. -------
  76. evaluate
  77. __call__
  78. integrate_gaussian
  79. integrate_box_1d
  80. integrate_box
  81. integrate_kde
  82. pdf
  83. logpdf
  84. resample
  85. set_bandwidth
  86. covariance_factor
  87. Notes
  88. -----
  89. Bandwidth selection strongly influences the estimate obtained from the KDE
  90. (much more so than the actual shape of the kernel). Bandwidth selection
  91. can be done by a "rule of thumb", by cross-validation, by "plug-in
  92. methods" or by other means; see [3]_, [4]_ for reviews. `gaussian_kde`
  93. uses a rule of thumb, the default is Scott's Rule.
  94. Scott's Rule [1]_, implemented as `scotts_factor`, is::
  95. n**(-1./(d+4)),
  96. with ``n`` the number of data points and ``d`` the number of dimensions.
  97. In the case of unequally weighted points, `scotts_factor` becomes::
  98. neff**(-1./(d+4)),
  99. with ``neff`` the effective number of datapoints.
  100. Silverman's Rule [2]_, implemented as `silverman_factor`, is::
  101. (n * (d + 2) / 4.)**(-1. / (d + 4)).
  102. or in the case of unequally weighted points::
  103. (neff * (d + 2) / 4.)**(-1. / (d + 4)).
  104. Good general descriptions of kernel density estimation can be found in [1]_
  105. and [2]_, the mathematics for this multi-dimensional implementation can be
  106. found in [1]_.
  107. With a set of weighted samples, the effective number of datapoints ``neff``
  108. is defined by::
  109. neff = sum(weights)^2 / sum(weights^2)
  110. as detailed in [5]_.
  111. `gaussian_kde` does not currently support data that lies in a
  112. lower-dimensional subspace of the space in which it is expressed. For such
  113. data, consider performing principle component analysis / dimensionality
  114. reduction and using `gaussian_kde` with the transformed data.
  115. References
  116. ----------
  117. .. [1] D.W. Scott, "Multivariate Density Estimation: Theory, Practice, and
  118. Visualization", John Wiley & Sons, New York, Chicester, 1992.
  119. .. [2] B.W. Silverman, "Density Estimation for Statistics and Data
  120. Analysis", Vol. 26, Monographs on Statistics and Applied Probability,
  121. Chapman and Hall, London, 1986.
  122. .. [3] B.A. Turlach, "Bandwidth Selection in Kernel Density Estimation: A
  123. Review", CORE and Institut de Statistique, Vol. 19, pp. 1-33, 1993.
  124. .. [4] D.M. Bashtannyk and R.J. Hyndman, "Bandwidth selection for kernel
  125. conditional density estimation", Computational Statistics & Data
  126. Analysis, Vol. 36, pp. 279-298, 2001.
  127. .. [5] Gray P. G., 1969, Journal of the Royal Statistical Society.
  128. Series A (General), 132, 272
  129. Examples
  130. --------
  131. Generate some random two-dimensional data:
  132. >>> import numpy as np
  133. >>> from scipy import stats
  134. >>> def measure(n):
  135. ... "Measurement model, return two coupled measurements."
  136. ... m1 = np.random.normal(size=n)
  137. ... m2 = np.random.normal(scale=0.5, size=n)
  138. ... return m1+m2, m1-m2
  139. >>> m1, m2 = measure(2000)
  140. >>> xmin = m1.min()
  141. >>> xmax = m1.max()
  142. >>> ymin = m2.min()
  143. >>> ymax = m2.max()
  144. Perform a kernel density estimate on the data:
  145. >>> X, Y = np.mgrid[xmin:xmax:100j, ymin:ymax:100j]
  146. >>> positions = np.vstack([X.ravel(), Y.ravel()])
  147. >>> values = np.vstack([m1, m2])
  148. >>> kernel = stats.gaussian_kde(values)
  149. >>> Z = np.reshape(kernel(positions).T, X.shape)
  150. Plot the results:
  151. >>> import matplotlib.pyplot as plt
  152. >>> fig, ax = plt.subplots()
  153. >>> ax.imshow(np.rot90(Z), cmap=plt.cm.gist_earth_r,
  154. ... extent=[xmin, xmax, ymin, ymax])
  155. >>> ax.plot(m1, m2, 'k.', markersize=2)
  156. >>> ax.set_xlim([xmin, xmax])
  157. >>> ax.set_ylim([ymin, ymax])
  158. >>> plt.show()
  159. """
  160. def __init__(self, dataset, bw_method=None, weights=None):
  161. self.dataset = atleast_2d(asarray(dataset))
  162. if not self.dataset.size > 1:
  163. raise ValueError("`dataset` input should have multiple elements.")
  164. self.d, self.n = self.dataset.shape
  165. if weights is not None:
  166. self._weights = atleast_1d(weights).astype(float)
  167. self._weights /= sum(self._weights)
  168. if self.weights.ndim != 1:
  169. raise ValueError("`weights` input should be one-dimensional.")
  170. if len(self._weights) != self.n:
  171. raise ValueError("`weights` input should be of length n")
  172. self._neff = 1/sum(self._weights**2)
  173. # This can be converted to a warning once gh-10205 is resolved
  174. if self.d > self.n:
  175. msg = ("Number of dimensions is greater than number of samples. "
  176. "This results in a singular data covariance matrix, which "
  177. "cannot be treated using the algorithms implemented in "
  178. "`gaussian_kde`. Note that `gaussian_kde` interprets each "
  179. "*column* of `dataset` to be a point; consider transposing "
  180. "the input to `dataset`.")
  181. raise ValueError(msg)
  182. try:
  183. self.set_bandwidth(bw_method=bw_method)
  184. except linalg.LinAlgError as e:
  185. msg = ("The data appears to lie in a lower-dimensional subspace "
  186. "of the space in which it is expressed. This has resulted "
  187. "in a singular data covariance matrix, which cannot be "
  188. "treated using the algorithms implemented in "
  189. "`gaussian_kde`. Consider performing principle component "
  190. "analysis / dimensionality reduction and using "
  191. "`gaussian_kde` with the transformed data.")
  192. raise linalg.LinAlgError(msg) from e
  193. def evaluate(self, points):
  194. """Evaluate the estimated pdf on a set of points.
  195. Parameters
  196. ----------
  197. points : (# of dimensions, # of points)-array
  198. Alternatively, a (# of dimensions,) vector can be passed in and
  199. treated as a single point.
  200. Returns
  201. -------
  202. values : (# of points,)-array
  203. The values at each point.
  204. Raises
  205. ------
  206. ValueError : if the dimensionality of the input points is different than
  207. the dimensionality of the KDE.
  208. """
  209. points = atleast_2d(asarray(points))
  210. d, m = points.shape
  211. if d != self.d:
  212. if d == 1 and m == self.d:
  213. # points was passed in as a row vector
  214. points = reshape(points, (self.d, 1))
  215. m = 1
  216. else:
  217. msg = "points have dimension %s, dataset has dimension %s" % (d,
  218. self.d)
  219. raise ValueError(msg)
  220. output_dtype, spec = _get_output_dtype(self.covariance, points)
  221. result = gaussian_kernel_estimate[spec](
  222. self.dataset.T, self.weights[:, None],
  223. points.T, self.cho_cov, output_dtype)
  224. return result[:, 0]
  225. __call__ = evaluate
  226. def integrate_gaussian(self, mean, cov):
  227. """
  228. Multiply estimated density by a multivariate Gaussian and integrate
  229. over the whole space.
  230. Parameters
  231. ----------
  232. mean : aray_like
  233. A 1-D array, specifying the mean of the Gaussian.
  234. cov : array_like
  235. A 2-D array, specifying the covariance matrix of the Gaussian.
  236. Returns
  237. -------
  238. result : scalar
  239. The value of the integral.
  240. Raises
  241. ------
  242. ValueError
  243. If the mean or covariance of the input Gaussian differs from
  244. the KDE's dimensionality.
  245. """
  246. mean = atleast_1d(squeeze(mean))
  247. cov = atleast_2d(cov)
  248. if mean.shape != (self.d,):
  249. raise ValueError("mean does not have dimension %s" % self.d)
  250. if cov.shape != (self.d, self.d):
  251. raise ValueError("covariance does not have dimension %s" % self.d)
  252. # make mean a column vector
  253. mean = mean[:, newaxis]
  254. sum_cov = self.covariance + cov
  255. # This will raise LinAlgError if the new cov matrix is not s.p.d
  256. # cho_factor returns (ndarray, bool) where bool is a flag for whether
  257. # or not ndarray is upper or lower triangular
  258. sum_cov_chol = linalg.cho_factor(sum_cov)
  259. diff = self.dataset - mean
  260. tdiff = linalg.cho_solve(sum_cov_chol, diff)
  261. sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
  262. norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
  263. energies = sum(diff * tdiff, axis=0) / 2.0
  264. result = sum(exp(-energies)*self.weights, axis=0) / norm_const
  265. return result
  266. def integrate_box_1d(self, low, high):
  267. """
  268. Computes the integral of a 1D pdf between two bounds.
  269. Parameters
  270. ----------
  271. low : scalar
  272. Lower bound of integration.
  273. high : scalar
  274. Upper bound of integration.
  275. Returns
  276. -------
  277. value : scalar
  278. The result of the integral.
  279. Raises
  280. ------
  281. ValueError
  282. If the KDE is over more than one dimension.
  283. """
  284. if self.d != 1:
  285. raise ValueError("integrate_box_1d() only handles 1D pdfs")
  286. stdev = ravel(sqrt(self.covariance))[0]
  287. normalized_low = ravel((low - self.dataset) / stdev)
  288. normalized_high = ravel((high - self.dataset) / stdev)
  289. value = np.sum(self.weights*(
  290. special.ndtr(normalized_high) -
  291. special.ndtr(normalized_low)))
  292. return value
  293. def integrate_box(self, low_bounds, high_bounds, maxpts=None):
  294. """Computes the integral of a pdf over a rectangular interval.
  295. Parameters
  296. ----------
  297. low_bounds : array_like
  298. A 1-D array containing the lower bounds of integration.
  299. high_bounds : array_like
  300. A 1-D array containing the upper bounds of integration.
  301. maxpts : int, optional
  302. The maximum number of points to use for integration.
  303. Returns
  304. -------
  305. value : scalar
  306. The result of the integral.
  307. """
  308. if maxpts is not None:
  309. extra_kwds = {'maxpts': maxpts}
  310. else:
  311. extra_kwds = {}
  312. value, inform = _mvn.mvnun_weighted(low_bounds, high_bounds,
  313. self.dataset, self.weights,
  314. self.covariance, **extra_kwds)
  315. if inform:
  316. msg = ('An integral in _mvn.mvnun requires more points than %s' %
  317. (self.d * 1000))
  318. warnings.warn(msg)
  319. return value
  320. def integrate_kde(self, other):
  321. """
  322. Computes the integral of the product of this kernel density estimate
  323. with another.
  324. Parameters
  325. ----------
  326. other : gaussian_kde instance
  327. The other kde.
  328. Returns
  329. -------
  330. value : scalar
  331. The result of the integral.
  332. Raises
  333. ------
  334. ValueError
  335. If the KDEs have different dimensionality.
  336. """
  337. if other.d != self.d:
  338. raise ValueError("KDEs are not the same dimensionality")
  339. # we want to iterate over the smallest number of points
  340. if other.n < self.n:
  341. small = other
  342. large = self
  343. else:
  344. small = self
  345. large = other
  346. sum_cov = small.covariance + large.covariance
  347. sum_cov_chol = linalg.cho_factor(sum_cov)
  348. result = 0.0
  349. for i in range(small.n):
  350. mean = small.dataset[:, i, newaxis]
  351. diff = large.dataset - mean
  352. tdiff = linalg.cho_solve(sum_cov_chol, diff)
  353. energies = sum(diff * tdiff, axis=0) / 2.0
  354. result += sum(exp(-energies)*large.weights, axis=0)*small.weights[i]
  355. sqrt_det = np.prod(np.diagonal(sum_cov_chol[0]))
  356. norm_const = power(2 * pi, sum_cov.shape[0] / 2.0) * sqrt_det
  357. result /= norm_const
  358. return result
  359. def resample(self, size=None, seed=None):
  360. """Randomly sample a dataset from the estimated pdf.
  361. Parameters
  362. ----------
  363. size : int, optional
  364. The number of samples to draw. If not provided, then the size is
  365. the same as the effective number of samples in the underlying
  366. dataset.
  367. seed : {None, int, `numpy.random.Generator`, `numpy.random.RandomState`}, optional
  368. If `seed` is None (or `np.random`), the `numpy.random.RandomState`
  369. singleton is used.
  370. If `seed` is an int, a new ``RandomState`` instance is used,
  371. seeded with `seed`.
  372. If `seed` is already a ``Generator`` or ``RandomState`` instance then
  373. that instance is used.
  374. Returns
  375. -------
  376. resample : (self.d, `size`) ndarray
  377. The sampled dataset.
  378. """
  379. if size is None:
  380. size = int(self.neff)
  381. random_state = check_random_state(seed)
  382. norm = transpose(random_state.multivariate_normal(
  383. zeros((self.d,), float), self.covariance, size=size
  384. ))
  385. indices = random_state.choice(self.n, size=size, p=self.weights)
  386. means = self.dataset[:, indices]
  387. return means + norm
  388. def scotts_factor(self):
  389. """Compute Scott's factor.
  390. Returns
  391. -------
  392. s : float
  393. Scott's factor.
  394. """
  395. return power(self.neff, -1./(self.d+4))
  396. def silverman_factor(self):
  397. """Compute the Silverman factor.
  398. Returns
  399. -------
  400. s : float
  401. The silverman factor.
  402. """
  403. return power(self.neff*(self.d+2.0)/4.0, -1./(self.d+4))
  404. # Default method to calculate bandwidth, can be overwritten by subclass
  405. covariance_factor = scotts_factor
  406. covariance_factor.__doc__ = """Computes the coefficient (`kde.factor`) that
  407. multiplies the data covariance matrix to obtain the kernel covariance
  408. matrix. The default is `scotts_factor`. A subclass can overwrite this
  409. method to provide a different method, or set it through a call to
  410. `kde.set_bandwidth`."""
  411. def set_bandwidth(self, bw_method=None):
  412. """Compute the estimator bandwidth with given method.
  413. The new bandwidth calculated after a call to `set_bandwidth` is used
  414. for subsequent evaluations of the estimated density.
  415. Parameters
  416. ----------
  417. bw_method : str, scalar or callable, optional
  418. The method used to calculate the estimator bandwidth. This can be
  419. 'scott', 'silverman', a scalar constant or a callable. If a
  420. scalar, this will be used directly as `kde.factor`. If a callable,
  421. it should take a `gaussian_kde` instance as only parameter and
  422. return a scalar. If None (default), nothing happens; the current
  423. `kde.covariance_factor` method is kept.
  424. Notes
  425. -----
  426. .. versionadded:: 0.11
  427. Examples
  428. --------
  429. >>> import numpy as np
  430. >>> import scipy.stats as stats
  431. >>> x1 = np.array([-7, -5, 1, 4, 5.])
  432. >>> kde = stats.gaussian_kde(x1)
  433. >>> xs = np.linspace(-10, 10, num=50)
  434. >>> y1 = kde(xs)
  435. >>> kde.set_bandwidth(bw_method='silverman')
  436. >>> y2 = kde(xs)
  437. >>> kde.set_bandwidth(bw_method=kde.factor / 3.)
  438. >>> y3 = kde(xs)
  439. >>> import matplotlib.pyplot as plt
  440. >>> fig, ax = plt.subplots()
  441. >>> ax.plot(x1, np.full(x1.shape, 1 / (4. * x1.size)), 'bo',
  442. ... label='Data points (rescaled)')
  443. >>> ax.plot(xs, y1, label='Scott (default)')
  444. >>> ax.plot(xs, y2, label='Silverman')
  445. >>> ax.plot(xs, y3, label='Const (1/3 * Silverman)')
  446. >>> ax.legend()
  447. >>> plt.show()
  448. """
  449. if bw_method is None:
  450. pass
  451. elif bw_method == 'scott':
  452. self.covariance_factor = self.scotts_factor
  453. elif bw_method == 'silverman':
  454. self.covariance_factor = self.silverman_factor
  455. elif np.isscalar(bw_method) and not isinstance(bw_method, str):
  456. self._bw_method = 'use constant'
  457. self.covariance_factor = lambda: bw_method
  458. elif callable(bw_method):
  459. self._bw_method = bw_method
  460. self.covariance_factor = lambda: self._bw_method(self)
  461. else:
  462. msg = "`bw_method` should be 'scott', 'silverman', a scalar " \
  463. "or a callable."
  464. raise ValueError(msg)
  465. self._compute_covariance()
  466. def _compute_covariance(self):
  467. """Computes the covariance matrix for each Gaussian kernel using
  468. covariance_factor().
  469. """
  470. self.factor = self.covariance_factor()
  471. # Cache covariance and Cholesky decomp of covariance
  472. if not hasattr(self, '_data_cho_cov'):
  473. self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
  474. bias=False,
  475. aweights=self.weights))
  476. self._data_cho_cov = linalg.cholesky(self._data_covariance,
  477. lower=True)
  478. self.covariance = self._data_covariance * self.factor**2
  479. self.cho_cov = (self._data_cho_cov * self.factor).astype(np.float64)
  480. self.log_det = 2*np.log(np.diag(self.cho_cov
  481. * np.sqrt(2*pi))).sum()
  482. @property
  483. def inv_cov(self):
  484. # Re-compute from scratch each time because I'm not sure how this is
  485. # used in the wild. (Perhaps users change the `dataset`, since it's
  486. # not a private attribute?) `_compute_covariance` used to recalculate
  487. # all these, so we'll recalculate everything now that this is a
  488. # a property.
  489. self.factor = self.covariance_factor()
  490. self._data_covariance = atleast_2d(cov(self.dataset, rowvar=1,
  491. bias=False, aweights=self.weights))
  492. return linalg.inv(self._data_covariance) / self.factor**2
  493. def pdf(self, x):
  494. """
  495. Evaluate the estimated pdf on a provided set of points.
  496. Notes
  497. -----
  498. This is an alias for `gaussian_kde.evaluate`. See the ``evaluate``
  499. docstring for more details.
  500. """
  501. return self.evaluate(x)
  502. def logpdf(self, x):
  503. """
  504. Evaluate the log of the estimated pdf on a provided set of points.
  505. """
  506. points = atleast_2d(x)
  507. d, m = points.shape
  508. if d != self.d:
  509. if d == 1 and m == self.d:
  510. # points was passed in as a row vector
  511. points = reshape(points, (self.d, 1))
  512. m = 1
  513. else:
  514. msg = (f"points have dimension {d}, "
  515. f"dataset has dimension {self.d}")
  516. raise ValueError(msg)
  517. output_dtype, spec = _get_output_dtype(self.covariance, points)
  518. result = gaussian_kernel_estimate_log[spec](
  519. self.dataset.T, self.weights[:, None],
  520. points.T, self.cho_cov, output_dtype)
  521. return result[:, 0]
  522. def marginal(self, dimensions):
  523. """Return a marginal KDE distribution
  524. Parameters
  525. ----------
  526. dimensions : int or 1-d array_like
  527. The dimensions of the multivariate distribution corresponding
  528. with the marginal variables, that is, the indices of the dimensions
  529. that are being retained. The other dimensions are marginalized out.
  530. Returns
  531. -------
  532. marginal_kde : gaussian_kde
  533. An object representing the marginal distribution.
  534. Notes
  535. -----
  536. .. versionadded:: 1.10.0
  537. """
  538. dims = np.atleast_1d(dimensions)
  539. if not np.issubdtype(dims.dtype, np.integer):
  540. msg = ("Elements of `dimensions` must be integers - the indices "
  541. "of the marginal variables being retained.")
  542. raise ValueError(msg)
  543. n = len(self.dataset) # number of dimensions
  544. original_dims = dims.copy()
  545. dims[dims < 0] = n + dims[dims < 0]
  546. if len(np.unique(dims)) != len(dims):
  547. msg = ("All elements of `dimensions` must be unique.")
  548. raise ValueError(msg)
  549. i_invalid = (dims < 0) | (dims >= n)
  550. if np.any(i_invalid):
  551. msg = (f"Dimensions {original_dims[i_invalid]} are invalid "
  552. f"for a distribution in {n} dimensions.")
  553. raise ValueError(msg)
  554. dataset = self.dataset[dims]
  555. weights = self.weights
  556. return gaussian_kde(dataset, bw_method=self.covariance_factor(),
  557. weights=weights)
  558. @property
  559. def weights(self):
  560. try:
  561. return self._weights
  562. except AttributeError:
  563. self._weights = ones(self.n)/self.n
  564. return self._weights
  565. @property
  566. def neff(self):
  567. try:
  568. return self._neff
  569. except AttributeError:
  570. self._neff = 1/sum(self.weights**2)
  571. return self._neff
  572. def _get_output_dtype(covariance, points):
  573. """
  574. Calculates the output dtype and the "spec" (=C type name).
  575. This was necessary in order to deal with the fused types in the Cython
  576. routine `gaussian_kernel_estimate`. See gh-10824 for details.
  577. """
  578. output_dtype = np.common_type(covariance, points)
  579. itemsize = np.dtype(output_dtype).itemsize
  580. if itemsize == 4:
  581. spec = 'float'
  582. elif itemsize == 8:
  583. spec = 'double'
  584. elif itemsize in (12, 16):
  585. spec = 'long double'
  586. else:
  587. raise ValueError(
  588. f"{output_dtype} has unexpected item size: {itemsize}"
  589. )
  590. return output_dtype, spec