_mstats_extras.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500
  1. """
  2. Additional statistics functions with support for masked arrays.
  3. """
  4. # Original author (2007): Pierre GF Gerard-Marchant
  5. __all__ = ['compare_medians_ms',
  6. 'hdquantiles', 'hdmedian', 'hdquantiles_sd',
  7. 'idealfourths',
  8. 'median_cihs','mjci','mquantiles_cimj',
  9. 'rsh',
  10. 'trimmed_mean_ci',]
  11. import numpy as np
  12. from numpy import float_, int_, ndarray
  13. import numpy.ma as ma
  14. from numpy.ma import MaskedArray
  15. from . import _mstats_basic as mstats
  16. from scipy.stats.distributions import norm, beta, t, binom
  17. def hdquantiles(data, prob=list([.25,.5,.75]), axis=None, var=False,):
  18. """
  19. Computes quantile estimates with the Harrell-Davis method.
  20. The quantile estimates are calculated as a weighted linear combination
  21. of order statistics.
  22. Parameters
  23. ----------
  24. data : array_like
  25. Data array.
  26. prob : sequence, optional
  27. Sequence of quantiles to compute.
  28. axis : int or None, optional
  29. Axis along which to compute the quantiles. If None, use a flattened
  30. array.
  31. var : bool, optional
  32. Whether to return the variance of the estimate.
  33. Returns
  34. -------
  35. hdquantiles : MaskedArray
  36. A (p,) array of quantiles (if `var` is False), or a (2,p) array of
  37. quantiles and variances (if `var` is True), where ``p`` is the
  38. number of quantiles.
  39. See Also
  40. --------
  41. hdquantiles_sd
  42. """
  43. def _hd_1D(data,prob,var):
  44. "Computes the HD quantiles for a 1D array. Returns nan for invalid data."
  45. xsorted = np.squeeze(np.sort(data.compressed().view(ndarray)))
  46. # Don't use length here, in case we have a numpy scalar
  47. n = xsorted.size
  48. hd = np.empty((2,len(prob)), float_)
  49. if n < 2:
  50. hd.flat = np.nan
  51. if var:
  52. return hd
  53. return hd[0]
  54. v = np.arange(n+1) / float(n)
  55. betacdf = beta.cdf
  56. for (i,p) in enumerate(prob):
  57. _w = betacdf(v, (n+1)*p, (n+1)*(1-p))
  58. w = _w[1:] - _w[:-1]
  59. hd_mean = np.dot(w, xsorted)
  60. hd[0,i] = hd_mean
  61. #
  62. hd[1,i] = np.dot(w, (xsorted-hd_mean)**2)
  63. #
  64. hd[0, prob == 0] = xsorted[0]
  65. hd[0, prob == 1] = xsorted[-1]
  66. if var:
  67. hd[1, prob == 0] = hd[1, prob == 1] = np.nan
  68. return hd
  69. return hd[0]
  70. # Initialization & checks
  71. data = ma.array(data, copy=False, dtype=float_)
  72. p = np.array(prob, copy=False, ndmin=1)
  73. # Computes quantiles along axis (or globally)
  74. if (axis is None) or (data.ndim == 1):
  75. result = _hd_1D(data, p, var)
  76. else:
  77. if data.ndim > 2:
  78. raise ValueError("Array 'data' must be at most two dimensional, "
  79. "but got data.ndim = %d" % data.ndim)
  80. result = ma.apply_along_axis(_hd_1D, axis, data, p, var)
  81. return ma.fix_invalid(result, copy=False)
  82. def hdmedian(data, axis=-1, var=False):
  83. """
  84. Returns the Harrell-Davis estimate of the median along the given axis.
  85. Parameters
  86. ----------
  87. data : ndarray
  88. Data array.
  89. axis : int, optional
  90. Axis along which to compute the quantiles. If None, use a flattened
  91. array.
  92. var : bool, optional
  93. Whether to return the variance of the estimate.
  94. Returns
  95. -------
  96. hdmedian : MaskedArray
  97. The median values. If ``var=True``, the variance is returned inside
  98. the masked array. E.g. for a 1-D array the shape change from (1,) to
  99. (2,).
  100. """
  101. result = hdquantiles(data,[0.5], axis=axis, var=var)
  102. return result.squeeze()
  103. def hdquantiles_sd(data, prob=list([.25,.5,.75]), axis=None):
  104. """
  105. The standard error of the Harrell-Davis quantile estimates by jackknife.
  106. Parameters
  107. ----------
  108. data : array_like
  109. Data array.
  110. prob : sequence, optional
  111. Sequence of quantiles to compute.
  112. axis : int, optional
  113. Axis along which to compute the quantiles. If None, use a flattened
  114. array.
  115. Returns
  116. -------
  117. hdquantiles_sd : MaskedArray
  118. Standard error of the Harrell-Davis quantile estimates.
  119. See Also
  120. --------
  121. hdquantiles
  122. """
  123. def _hdsd_1D(data, prob):
  124. "Computes the std error for 1D arrays."
  125. xsorted = np.sort(data.compressed())
  126. n = len(xsorted)
  127. hdsd = np.empty(len(prob), float_)
  128. if n < 2:
  129. hdsd.flat = np.nan
  130. vv = np.arange(n) / float(n-1)
  131. betacdf = beta.cdf
  132. for (i,p) in enumerate(prob):
  133. _w = betacdf(vv, n*p, n*(1-p))
  134. w = _w[1:] - _w[:-1]
  135. # cumulative sum of weights and data points if
  136. # ith point is left out for jackknife
  137. mx_ = np.zeros_like(xsorted)
  138. mx_[1:] = np.cumsum(w * xsorted[:-1])
  139. # similar but from the right
  140. mx_[:-1] += np.cumsum(w[::-1] * xsorted[:0:-1])[::-1]
  141. hdsd[i] = np.sqrt(mx_.var() * (n - 1))
  142. return hdsd
  143. # Initialization & checks
  144. data = ma.array(data, copy=False, dtype=float_)
  145. p = np.array(prob, copy=False, ndmin=1)
  146. # Computes quantiles along axis (or globally)
  147. if (axis is None):
  148. result = _hdsd_1D(data, p)
  149. else:
  150. if data.ndim > 2:
  151. raise ValueError("Array 'data' must be at most two dimensional, "
  152. "but got data.ndim = %d" % data.ndim)
  153. result = ma.apply_along_axis(_hdsd_1D, axis, data, p)
  154. return ma.fix_invalid(result, copy=False).ravel()
  155. def trimmed_mean_ci(data, limits=(0.2,0.2), inclusive=(True,True),
  156. alpha=0.05, axis=None):
  157. """
  158. Selected confidence interval of the trimmed mean along the given axis.
  159. Parameters
  160. ----------
  161. data : array_like
  162. Input data.
  163. limits : {None, tuple}, optional
  164. None or a two item tuple.
  165. Tuple of the percentages to cut on each side of the array, with respect
  166. to the number of unmasked data, as floats between 0. and 1. If ``n``
  167. is the number of unmasked data before trimming, then
  168. (``n * limits[0]``)th smallest data and (``n * limits[1]``)th
  169. largest data are masked. The total number of unmasked data after
  170. trimming is ``n * (1. - sum(limits))``.
  171. The value of one limit can be set to None to indicate an open interval.
  172. Defaults to (0.2, 0.2).
  173. inclusive : (2,) tuple of boolean, optional
  174. If relative==False, tuple indicating whether values exactly equal to
  175. the absolute limits are allowed.
  176. If relative==True, tuple indicating whether the number of data being
  177. masked on each side should be rounded (True) or truncated (False).
  178. Defaults to (True, True).
  179. alpha : float, optional
  180. Confidence level of the intervals.
  181. Defaults to 0.05.
  182. axis : int, optional
  183. Axis along which to cut. If None, uses a flattened version of `data`.
  184. Defaults to None.
  185. Returns
  186. -------
  187. trimmed_mean_ci : (2,) ndarray
  188. The lower and upper confidence intervals of the trimmed data.
  189. """
  190. data = ma.array(data, copy=False)
  191. trimmed = mstats.trimr(data, limits=limits, inclusive=inclusive, axis=axis)
  192. tmean = trimmed.mean(axis)
  193. tstde = mstats.trimmed_stde(data,limits=limits,inclusive=inclusive,axis=axis)
  194. df = trimmed.count(axis) - 1
  195. tppf = t.ppf(1-alpha/2.,df)
  196. return np.array((tmean - tppf*tstde, tmean+tppf*tstde))
  197. def mjci(data, prob=[0.25,0.5,0.75], axis=None):
  198. """
  199. Returns the Maritz-Jarrett estimators of the standard error of selected
  200. experimental quantiles of the data.
  201. Parameters
  202. ----------
  203. data : ndarray
  204. Data array.
  205. prob : sequence, optional
  206. Sequence of quantiles to compute.
  207. axis : int or None, optional
  208. Axis along which to compute the quantiles. If None, use a flattened
  209. array.
  210. """
  211. def _mjci_1D(data, p):
  212. data = np.sort(data.compressed())
  213. n = data.size
  214. prob = (np.array(p) * n + 0.5).astype(int_)
  215. betacdf = beta.cdf
  216. mj = np.empty(len(prob), float_)
  217. x = np.arange(1,n+1, dtype=float_) / n
  218. y = x - 1./n
  219. for (i,m) in enumerate(prob):
  220. W = betacdf(x,m-1,n-m) - betacdf(y,m-1,n-m)
  221. C1 = np.dot(W,data)
  222. C2 = np.dot(W,data**2)
  223. mj[i] = np.sqrt(C2 - C1**2)
  224. return mj
  225. data = ma.array(data, copy=False)
  226. if data.ndim > 2:
  227. raise ValueError("Array 'data' must be at most two dimensional, "
  228. "but got data.ndim = %d" % data.ndim)
  229. p = np.array(prob, copy=False, ndmin=1)
  230. # Computes quantiles along axis (or globally)
  231. if (axis is None):
  232. return _mjci_1D(data, p)
  233. else:
  234. return ma.apply_along_axis(_mjci_1D, axis, data, p)
  235. def mquantiles_cimj(data, prob=[0.25,0.50,0.75], alpha=0.05, axis=None):
  236. """
  237. Computes the alpha confidence interval for the selected quantiles of the
  238. data, with Maritz-Jarrett estimators.
  239. Parameters
  240. ----------
  241. data : ndarray
  242. Data array.
  243. prob : sequence, optional
  244. Sequence of quantiles to compute.
  245. alpha : float, optional
  246. Confidence level of the intervals.
  247. axis : int or None, optional
  248. Axis along which to compute the quantiles.
  249. If None, use a flattened array.
  250. Returns
  251. -------
  252. ci_lower : ndarray
  253. The lower boundaries of the confidence interval. Of the same length as
  254. `prob`.
  255. ci_upper : ndarray
  256. The upper boundaries of the confidence interval. Of the same length as
  257. `prob`.
  258. """
  259. alpha = min(alpha, 1 - alpha)
  260. z = norm.ppf(1 - alpha/2.)
  261. xq = mstats.mquantiles(data, prob, alphap=0, betap=0, axis=axis)
  262. smj = mjci(data, prob, axis=axis)
  263. return (xq - z * smj, xq + z * smj)
  264. def median_cihs(data, alpha=0.05, axis=None):
  265. """
  266. Computes the alpha-level confidence interval for the median of the data.
  267. Uses the Hettmasperger-Sheather method.
  268. Parameters
  269. ----------
  270. data : array_like
  271. Input data. Masked values are discarded. The input should be 1D only,
  272. or `axis` should be set to None.
  273. alpha : float, optional
  274. Confidence level of the intervals.
  275. axis : int or None, optional
  276. Axis along which to compute the quantiles. If None, use a flattened
  277. array.
  278. Returns
  279. -------
  280. median_cihs
  281. Alpha level confidence interval.
  282. """
  283. def _cihs_1D(data, alpha):
  284. data = np.sort(data.compressed())
  285. n = len(data)
  286. alpha = min(alpha, 1-alpha)
  287. k = int(binom._ppf(alpha/2., n, 0.5))
  288. gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
  289. if gk < 1-alpha:
  290. k -= 1
  291. gk = binom.cdf(n-k,n,0.5) - binom.cdf(k-1,n,0.5)
  292. gkk = binom.cdf(n-k-1,n,0.5) - binom.cdf(k,n,0.5)
  293. I = (gk - 1 + alpha)/(gk - gkk)
  294. lambd = (n-k) * I / float(k + (n-2*k)*I)
  295. lims = (lambd*data[k] + (1-lambd)*data[k-1],
  296. lambd*data[n-k-1] + (1-lambd)*data[n-k])
  297. return lims
  298. data = ma.array(data, copy=False)
  299. # Computes quantiles along axis (or globally)
  300. if (axis is None):
  301. result = _cihs_1D(data, alpha)
  302. else:
  303. if data.ndim > 2:
  304. raise ValueError("Array 'data' must be at most two dimensional, "
  305. "but got data.ndim = %d" % data.ndim)
  306. result = ma.apply_along_axis(_cihs_1D, axis, data, alpha)
  307. return result
  308. def compare_medians_ms(group_1, group_2, axis=None):
  309. """
  310. Compares the medians from two independent groups along the given axis.
  311. The comparison is performed using the McKean-Schrader estimate of the
  312. standard error of the medians.
  313. Parameters
  314. ----------
  315. group_1 : array_like
  316. First dataset. Has to be of size >=7.
  317. group_2 : array_like
  318. Second dataset. Has to be of size >=7.
  319. axis : int, optional
  320. Axis along which the medians are estimated. If None, the arrays are
  321. flattened. If `axis` is not None, then `group_1` and `group_2`
  322. should have the same shape.
  323. Returns
  324. -------
  325. compare_medians_ms : {float, ndarray}
  326. If `axis` is None, then returns a float, otherwise returns a 1-D
  327. ndarray of floats with a length equal to the length of `group_1`
  328. along `axis`.
  329. Examples
  330. --------
  331. >>> from scipy import stats
  332. >>> a = [1, 2, 3, 4, 5, 6, 7]
  333. >>> b = [8, 9, 10, 11, 12, 13, 14]
  334. >>> stats.mstats.compare_medians_ms(a, b, axis=None)
  335. 1.0693225866553746e-05
  336. The function is vectorized to compute along a given axis.
  337. >>> import numpy as np
  338. >>> rng = np.random.default_rng()
  339. >>> x = rng.random(size=(3, 7))
  340. >>> y = rng.random(size=(3, 8))
  341. >>> stats.mstats.compare_medians_ms(x, y, axis=1)
  342. array([0.36908985, 0.36092538, 0.2765313 ])
  343. References
  344. ----------
  345. .. [1] McKean, Joseph W., and Ronald M. Schrader. "A comparison of methods
  346. for studentizing the sample median." Communications in
  347. Statistics-Simulation and Computation 13.6 (1984): 751-773.
  348. """
  349. (med_1, med_2) = (ma.median(group_1,axis=axis), ma.median(group_2,axis=axis))
  350. (std_1, std_2) = (mstats.stde_median(group_1, axis=axis),
  351. mstats.stde_median(group_2, axis=axis))
  352. W = np.abs(med_1 - med_2) / ma.sqrt(std_1**2 + std_2**2)
  353. return 1 - norm.cdf(W)
  354. def idealfourths(data, axis=None):
  355. """
  356. Returns an estimate of the lower and upper quartiles.
  357. Uses the ideal fourths algorithm.
  358. Parameters
  359. ----------
  360. data : array_like
  361. Input array.
  362. axis : int, optional
  363. Axis along which the quartiles are estimated. If None, the arrays are
  364. flattened.
  365. Returns
  366. -------
  367. idealfourths : {list of floats, masked array}
  368. Returns the two internal values that divide `data` into four parts
  369. using the ideal fourths algorithm either along the flattened array
  370. (if `axis` is None) or along `axis` of `data`.
  371. """
  372. def _idf(data):
  373. x = data.compressed()
  374. n = len(x)
  375. if n < 3:
  376. return [np.nan,np.nan]
  377. (j,h) = divmod(n/4. + 5/12.,1)
  378. j = int(j)
  379. qlo = (1-h)*x[j-1] + h*x[j]
  380. k = n - j
  381. qup = (1-h)*x[k] + h*x[k-1]
  382. return [qlo, qup]
  383. data = ma.sort(data, axis=axis).view(MaskedArray)
  384. if (axis is None):
  385. return _idf(data)
  386. else:
  387. return ma.apply_along_axis(_idf, axis, data)
  388. def rsh(data, points=None):
  389. """
  390. Evaluates Rosenblatt's shifted histogram estimators for each data point.
  391. Rosenblatt's estimator is a centered finite-difference approximation to the
  392. derivative of the empirical cumulative distribution function.
  393. Parameters
  394. ----------
  395. data : sequence
  396. Input data, should be 1-D. Masked values are ignored.
  397. points : sequence or None, optional
  398. Sequence of points where to evaluate Rosenblatt shifted histogram.
  399. If None, use the data.
  400. """
  401. data = ma.array(data, copy=False)
  402. if points is None:
  403. points = data
  404. else:
  405. points = np.array(points, copy=False, ndmin=1)
  406. if data.ndim != 1:
  407. raise AttributeError("The input array should be 1D only !")
  408. n = data.count()
  409. r = idealfourths(data, axis=None)
  410. h = 1.2 * (r[-1]-r[0]) / n**(1./5)
  411. nhi = (data[:,None] <= points[None,:] + h).sum(0)
  412. nlo = (data[:,None] < points[None,:] - h).sum(0)
  413. return (nhi-nlo) / (2.*n*h)