_entropy.py

# -*- coding: utf-8 -*-
"""
Created on Fri Apr 2 09:06:05 2021

@author: matth
"""

from __future__ import annotations
import math
import numpy as np
from scipy import special
from typing import Optional, Union

__all__ = ['entropy', 'differential_entropy']


def entropy(pk: np.typing.ArrayLike,
            qk: Optional[np.typing.ArrayLike] = None,
            base: Optional[float] = None,
            axis: int = 0
            ) -> Union[np.number, np.ndarray]:
  17. """
  18. Calculate the Shannon entropy/relative entropy of given distribution(s).
  19. If only probabilities `pk` are given, the Shannon entropy is calculated as
  20. ``H = -sum(pk * log(pk))``.
  21. If `qk` is not None, then compute the relative entropy
  22. ``D = sum(pk * log(pk / qk))``. This quantity is also known
  23. as the Kullback-Leibler divergence.
  24. This routine will normalize `pk` and `qk` if they don't sum to 1.
  25. Parameters
  26. ----------
  27. pk : array_like
  28. Defines the (discrete) distribution. Along each axis-slice of ``pk``,
  29. element ``i`` is the (possibly unnormalized) probability of event
  30. ``i``.
  31. qk : array_like, optional
  32. Sequence against which the relative entropy is computed. Should be in
  33. the same format as `pk`.
  34. base : float, optional
  35. The logarithmic base to use, defaults to ``e`` (natural logarithm).
  36. axis : int, optional
  37. The axis along which the entropy is calculated. Default is 0.
  38. Returns
  39. -------
  40. S : {float, array_like}
  41. The calculated entropy.
  42. Notes
  43. -----
  44. Informally, the Shannon entropy quantifies the expected uncertainty
  45. inherent in the possible outcomes of a discrete random variable.
  46. For example,
  47. if messages consisting of sequences of symbols from a set are to be
  48. encoded and transmitted over a noiseless channel, then the Shannon entropy
  49. ``H(pk)`` gives a tight lower bound for the average number of units of
  50. information needed per symbol if the symbols occur with frequencies
  51. governed by the discrete distribution `pk` [1]_. The choice of base
  52. determines the choice of units; e.g., ``e`` for nats, ``2`` for bits, etc.
  53. The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
  54. number of units of information needed per symbol if the encoding is
  55. optimized for the probability distribution `qk` instead of the true
  56. distribution `pk`. Informally, the relative entropy quantifies the expected
  57. excess in surprise experienced if one believes the true distribution is
  58. `qk` when it is actually `pk`.
  59. A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
  60. equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
  61. the formula ``CE = -sum(pk * log(qk))``. It gives the average
  62. number of units of information needed per symbol if an encoding is
  63. optimized for the probability distribution `qk` when the true distribution
  64. is `pk`. It is not computed directly by `entropy`, but it can be computed
  65. using two calls to the function (see Examples).
  66. See [2]_ for more information.
  67. References
  68. ----------
  69. .. [1] Shannon, C.E. (1948), A Mathematical Theory of Communication.
  70. Bell System Technical Journal, 27: 379-423.
  71. https://doi.org/10.1002/j.1538-7305.1948.tb01338.x
  72. .. [2] Thomas M. Cover and Joy A. Thomas. 2006. Elements of Information
  73. Theory (Wiley Series in Telecommunications and Signal Processing).
  74. Wiley-Interscience, USA.
  75. Examples
  76. --------
    The outcome of a fair coin is the most uncertain:

    >>> import numpy as np
    >>> from scipy.stats import entropy
    >>> base = 2  # work in units of bits
    >>> pk = np.array([1/2, 1/2])  # fair coin
    >>> H = entropy(pk, base=base)
    >>> H
    1.0
    >>> H == -np.sum(pk * np.log(pk)) / np.log(base)
    True
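
    Because the routine normalizes `pk` if it does not sum to 1, unnormalized
    counts of each outcome give the same result as the corresponding
    probabilities; for example:

    >>> entropy([50, 50], base=base)  # counts are normalized to [1/2, 1/2]
    1.0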

    The outcome of a biased coin is less uncertain:

    >>> qk = np.array([9/10, 1/10])  # biased coin
    >>> entropy(qk, base=base)
    0.46899559358928117

    The relative entropy between the fair coin and biased coin is calculated
    as:

    >>> D = entropy(pk, qk, base=base)
    >>> D
    0.7369655941662062
    >>> D == np.sum(pk * np.log(pk/qk)) / np.log(base)
    True

    The cross entropy can be calculated as the sum of the entropy and
    relative entropy:

    >>> CE = entropy(pk, base=base) + entropy(pk, qk, base=base)
    >>> CE
    1.736965594166206
    >>> CE == -np.sum(pk * np.log(qk)) / np.log(base)
    True
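
    `pk` may also contain several distributions at once. By default each
    column (``axis=0``) is treated as a separate distribution, and an array
    of entropies is returned. Reusing the fair and biased coins from above:

    >>> entropy(np.stack([pk, qk], axis=-1), base=base)
    array([1.        , 0.46899559])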
  105. """
    if base is not None and base <= 0:
        raise ValueError("`base` must be a positive number or `None`.")

    pk = np.asarray(pk)
    # Normalize `pk` so that each axis-slice sums to 1.
    pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)
    if qk is None:
        # Shannon entropy: sum of -pk*log(pk) terms.
        vec = special.entr(pk)
    else:
        qk = np.asarray(qk)
        pk, qk = np.broadcast_arrays(pk, qk)
        qk = 1.0*qk / np.sum(qk, axis=axis, keepdims=True)
        # Relative entropy (KL divergence): sum of pk*log(pk/qk) terms.
        vec = special.rel_entr(pk, qk)
    S = np.sum(vec, axis=axis)
    if base is not None:
        S /= np.log(base)
    return S


def differential_entropy(
    values: np.typing.ArrayLike,
    *,
    window_length: Optional[int] = None,
    base: Optional[float] = None,
    axis: int = 0,
    method: str = "auto",
) -> Union[np.number, np.ndarray]:
  129. r"""Given a sample of a distribution, estimate the differential entropy.
  130. Several estimation methods are available using the `method` parameter. By
  131. default, a method is selected based the size of the sample.
  132. Parameters
  133. ----------
  134. values : sequence
  135. Sample from a continuous distribution.
  136. window_length : int, optional
  137. Window length for computing Vasicek estimate. Must be an integer
  138. between 1 and half of the sample size. If ``None`` (the default), it
  139. uses the heuristic value
  140. .. math::
  141. \left \lfloor \sqrt{n} + 0.5 \right \rfloor
  142. where :math:`n` is the sample size. This heuristic was originally
  143. proposed in [2]_ and has become common in the literature.
  144. base : float, optional
  145. The logarithmic base to use, defaults to ``e`` (natural logarithm).
  146. axis : int, optional
  147. The axis along which the differential entropy is calculated.
  148. Default is 0.
  149. method : {'vasicek', 'van es', 'ebrahimi', 'correa', 'auto'}, optional
  150. The method used to estimate the differential entropy from the sample.
  151. Default is ``'auto'``. See Notes for more information.
  152. Returns
  153. -------
  154. entropy : float
  155. The calculated differential entropy.
  156. Notes
  157. -----
  158. This function will converge to the true differential entropy in the limit
  159. .. math::
  160. n \to \infty, \quad m \to \infty, \quad \frac{m}{n} \to 0
  161. The optimal choice of ``window_length`` for a given sample size depends on
  162. the (unknown) distribution. Typically, the smoother the density of the
  163. distribution, the larger the optimal value of ``window_length`` [1]_.
  164. The following options are available for the `method` parameter.
  165. * ``'vasicek'`` uses the estimator presented in [1]_. This is
  166. one of the first and most influential estimators of differential entropy.
  167. * ``'van es'`` uses the bias-corrected estimator presented in [3]_, which
  168. is not only consistent but, under some conditions, asymptotically normal.
  169. * ``'ebrahimi'`` uses an estimator presented in [4]_, which was shown
  170. in simulation to have smaller bias and mean squared error than
  171. the Vasicek estimator.
  172. * ``'correa'`` uses the estimator presented in [5]_ based on local linear
  173. regression. In a simulation study, it had consistently smaller mean
  174. square error than the Vasiceck estimator, but it is more expensive to
  175. compute.
  176. * ``'auto'`` selects the method automatically (default). Currently,
  177. this selects ``'van es'`` for very small samples (<10), ``'ebrahimi'``
  178. for moderate sample sizes (11-1000), and ``'vasicek'`` for larger
  179. samples, but this behavior is subject to change in future versions.
  180. All estimators are implemented as described in [6]_.
  181. References
  182. ----------
  183. .. [1] Vasicek, O. (1976). A test for normality based on sample entropy.
  184. Journal of the Royal Statistical Society:
  185. Series B (Methodological), 38(1), 54-59.
  186. .. [2] Crzcgorzewski, P., & Wirczorkowski, R. (1999). Entropy-based
  187. goodness-of-fit test for exponentiality. Communications in
  188. Statistics-Theory and Methods, 28(5), 1183-1202.
  189. .. [3] Van Es, B. (1992). Estimating functionals related to a density by a
  190. class of statistics based on spacings. Scandinavian Journal of
  191. Statistics, 61-72.
  192. .. [4] Ebrahimi, N., Pflughoeft, K., & Soofi, E. S. (1994). Two measures
  193. of sample entropy. Statistics & Probability Letters, 20(3), 225-234.
  194. .. [5] Correa, J. C. (1995). A new estimator of entropy. Communications
  195. in Statistics-Theory and Methods, 24(10), 2439-2449.
  196. .. [6] Noughabi, H. A. (2015). Entropy Estimation Using Numerical Methods.
  197. Annals of Data Science, 2(2), 231-241.
  198. https://link.springer.com/article/10.1007/s40745-015-0045-9
  199. Examples
  200. --------
  201. >>> import numpy as np
  202. >>> from scipy.stats import differential_entropy, norm
  203. Entropy of a standard normal distribution:
  204. >>> rng = np.random.default_rng()
  205. >>> values = rng.standard_normal(100)
  206. >>> differential_entropy(values)
  207. 1.3407817436640392
  208. Compare with the true entropy:
  209. >>> float(norm.entropy())
  210. 1.4189385332046727
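
    The `method` and `window_length` parameters can also be set explicitly.
    For example, the default window length for this sample of 100
    observations is ``floor(sqrt(100) + 0.5) = 10``, so passing that value
    explicitly gives the same estimate:

    >>> res1 = differential_entropy(values, method='ebrahimi')
    >>> res2 = differential_entropy(values, method='ebrahimi',
    ...                             window_length=10)
    >>> bool(res1 == res2)
    True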

    For several sample sizes between 5 and 1000, compare the accuracy of
    the ``'vasicek'``, ``'van es'``, and ``'ebrahimi'`` methods. Specifically,
    compare the root mean squared error (over 1000 trials) between the
    estimate and the true differential entropy of the distribution.

    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt
    >>>
    >>>
    >>> def rmse(res, expected):
    ...     '''Root mean squared error'''
    ...     return np.sqrt(np.mean((res - expected)**2))
    >>>
    >>>
    >>> a, b = np.log10(5), np.log10(1000)
    >>> ns = np.round(np.logspace(a, b, 10)).astype(int)
    >>> reps = 1000  # number of repetitions for each sample size
    >>> expected = stats.expon.entropy()
    >>>
    >>> method_errors = {'vasicek': [], 'van es': [], 'ebrahimi': []}
    >>> for method in method_errors:
    ...     for n in ns:
    ...         rvs = stats.expon.rvs(size=(reps, n), random_state=rng)
    ...         res = stats.differential_entropy(rvs, method=method, axis=-1)
    ...         error = rmse(res, expected)
    ...         method_errors[method].append(error)
    >>>
    >>> for method, errors in method_errors.items():
    ...     plt.loglog(ns, errors, label=method)
    >>>
    >>> plt.legend()
    >>> plt.xlabel('sample size')
    >>> plt.ylabel('RMSE (1000 trials)')
    >>> plt.title('Entropy Estimator Error (Exponential Distribution)')

    """
    values = np.asarray(values)
    values = np.moveaxis(values, axis, -1)
    n = values.shape[-1]  # number of observations

    if window_length is None:
        window_length = math.floor(math.sqrt(n) + 0.5)

    if not 2 <= 2 * window_length < n:
        raise ValueError(
            f"Window length ({window_length}) must be positive and less "
            f"than half the sample size ({n}).",
        )

    if base is not None and base <= 0:
        raise ValueError("`base` must be a positive number or `None`.")

    sorted_data = np.sort(values, axis=-1)

    methods = {"vasicek": _vasicek_entropy,
               "van es": _van_es_entropy,
               "correa": _correa_entropy,
               "ebrahimi": _ebrahimi_entropy,
               "auto": _vasicek_entropy}
    method = method.lower()
    if method not in methods:
        message = f"`method` must be one of {set(methods)}"
        raise ValueError(message)

    if method == "auto":
        if n <= 10:
            method = 'van es'
        elif n <= 1000:
            method = 'ebrahimi'
        else:
            method = 'vasicek'

    res = methods[method](sorted_data, window_length)

    if base is not None:
        res /= np.log(base)

    return res


def _pad_along_last_axis(X, m):
    """Pad the data for computing the rolling window difference."""
    # scales a bit better than method in _vasicek_like_entropy
    shape = np.array(X.shape)
    shape[-1] = m
    Xl = np.broadcast_to(X[..., [0]], shape)  # [0] vs 0 to maintain shape
    Xr = np.broadcast_to(X[..., [-1]], shape)
    return np.concatenate((Xl, X, Xr), axis=-1)


def _vasicek_entropy(X, m):
    """Compute the Vasicek estimator as described in [6] Eq. 1.3."""
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m)
    differences = X[..., 2 * m:] - X[..., : -2 * m:]
    logs = np.log(n/(2*m) * differences)
    return np.mean(logs, axis=-1)


def _van_es_entropy(X, m):
    """Compute the van Es estimator as described in [6]."""
    # No equation number, but referred to as HVE_mn.
    # Typo: there should be a log within the summation.
    n = X.shape[-1]
    difference = X[..., m:] - X[..., :-m]
    term1 = 1/(n-m) * np.sum(np.log((n+1)/m * difference), axis=-1)
    k = np.arange(m, n+1)
    return term1 + np.sum(1/k) + np.log(m) - np.log(n+1)


def _ebrahimi_entropy(X, m):
    """Compute the Ebrahimi estimator as described in [6]."""
    # No equation number, but referred to as HE_mn
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m)

    differences = X[..., 2 * m:] - X[..., : -2 * m:]

    i = np.arange(1, n+1).astype(float)
    ci = np.ones_like(i)*2
    ci[i <= m] = 1 + (i[i <= m] - 1)/m
    ci[i >= n - m + 1] = 1 + (n - i[i >= n-m+1])/m

    logs = np.log(n * differences / (ci * m))
    return np.mean(logs, axis=-1)


def _correa_entropy(X, m):
    """Compute the Correa estimator as described in [6]."""
    # No equation number, but referred to as HC_mn
    n = X.shape[-1]
    X = _pad_along_last_axis(X, m)

    i = np.arange(1, n+1)
    dj = np.arange(-m, m+1)[:, None]
    j = i + dj
    j0 = j + m - 1  # 0-indexed version of j

    Xibar = np.mean(X[..., j0], axis=-2, keepdims=True)
    difference = X[..., j0] - Xibar
    num = np.sum(difference*dj, axis=-2)  # dj is d-i
    den = n*np.sum(difference**2, axis=-2)
    return -np.mean(np.log(num/den), axis=-1)
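

if __name__ == "__main__":
    # Minimal, illustrative smoke test (a sketch, not part of the module's
    # public behavior): each private estimator should roughly recover the
    # differential entropy of a standard normal distribution,
    # 0.5 * log(2 * pi * e) ~= 1.4189 nats, from a sorted sample.
    rng = np.random.default_rng(12345)
    sample = np.sort(rng.standard_normal(1000))
    m = math.floor(math.sqrt(sample.size) + 0.5)  # default window heuristic
    estimators = {"vasicek": _vasicek_entropy,
                  "van es": _van_es_entropy,
                  "ebrahimi": _ebrahimi_entropy,
                  "correa": _correa_entropy}
    for name, estimator in estimators.items():
        print(f"{name}: {estimator(sample, m):.4f}")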