import warnings

import numpy as np

import scipy.stats._stats_py
from . import distributions
from .._lib._bunch import _make_tuple_bunch
from ._stats_pythran import siegelslopes as siegelslopes_pythran


__all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes']

# This is not a namedtuple for backwards compatibility. See PR #12983
LinregressResult = _make_tuple_bunch('LinregressResult',
                                     ['slope', 'intercept', 'rvalue',
                                      'pvalue', 'stderr'],
                                     extra_field_names=['intercept_stderr'])

TheilslopesResult = _make_tuple_bunch('TheilslopesResult',
                                      ['slope', 'intercept',
                                       'low_slope', 'high_slope'])

SiegelslopesResult = _make_tuple_bunch('SiegelslopesResult',
                                       ['slope', 'intercept'])


def linregress(x, y=None, alternative='two-sided'):
    """
    Calculate a linear least-squares regression for two sets of measurements.

    Parameters
    ----------
    x, y : array_like
        Two sets of measurements.  Both arrays should have the same length.
        If only `x` is given (and ``y=None``), then it must be a
        two-dimensional array where one dimension has length 2.  The two
        sets of measurements are then found by splitting the array along
        the length-2 dimension.  In the case where ``y=None`` and `x` is a
        2x2 array, ``linregress(x)`` is equivalent to
        ``linregress(x[0], x[1])``.
    alternative : {'two-sided', 'less', 'greater'}, optional
        Defines the alternative hypothesis. Default is 'two-sided'.
        The following options are available:

        * 'two-sided': the slope of the regression line is nonzero
        * 'less': the slope of the regression line is less than zero
        * 'greater': the slope of the regression line is greater than zero

        .. versionadded:: 1.7.0

    Returns
    -------
    result : ``LinregressResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Slope of the regression line.
        intercept : float
            Intercept of the regression line.
        rvalue : float
            The Pearson correlation coefficient. The square of ``rvalue``
            is equal to the coefficient of determination.
        pvalue : float
            The p-value for a hypothesis test whose null hypothesis is
            that the slope is zero, using a Wald test with the
            t-distribution of the test statistic. See `alternative` above
            for alternative hypotheses.
        stderr : float
            Standard error of the estimated slope (gradient), under the
            assumption of residual normality.
        intercept_stderr : float
            Standard error of the estimated intercept, under the assumption
            of residual normality.

    See Also
    --------
    scipy.optimize.curve_fit :
        Use non-linear least squares to fit a function to data.
    scipy.optimize.leastsq :
        Minimize the sum of squares of a set of equations.

    Notes
    -----
    Missing values are considered pair-wise: if a value is missing in `x`,
    the corresponding value in `y` is masked.

    For compatibility with older versions of SciPy, the return value acts
    like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``,
    ``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write::

        slope, intercept, r, p, se = linregress(x, y)

    With that style, however, the standard error of the intercept is not
    available.  To have access to all the computed values, including the
    standard error of the intercept, use the return value as an object
    with attributes, e.g.::

        result = linregress(x, y)
        print(result.intercept, result.intercept_stderr)

    Examples
    --------
    >>> import numpy as np
    >>> import matplotlib.pyplot as plt
    >>> from scipy import stats
    >>> rng = np.random.default_rng()

    Generate some data:

    >>> x = rng.random(10)
    >>> y = 1.6*x + rng.random(10)

    Perform the linear regression:

    >>> res = stats.linregress(x, y)

    Coefficient of determination (R-squared):

    >>> print(f"R-squared: {res.rvalue**2:.6f}")
    R-squared: 0.717533

    Plot the data along with the fitted line:

    >>> plt.plot(x, y, 'o', label='original data')
    >>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
    >>> plt.legend()
    >>> plt.show()

    Calculate 95% confidence interval on slope and intercept:

    >>> # Two-sided inverse Student's t-distribution
    >>> # p - probability, df - degrees of freedom
    >>> from scipy.stats import t
    >>> tinv = lambda p, df: abs(t.ppf(p/2, df))
    >>> ts = tinv(0.05, len(x)-2)
    >>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")
    slope (95%): 1.453392 +/- 0.743465
    >>> print(f"intercept (95%): {res.intercept:.6f}"
    ...       f" +/- {ts*res.intercept_stderr:.6f}")
    intercept (95%): 0.616950 +/- 0.544475
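
    The two-dimensional input form described above gives the same fit from
    a single stacked ``(2, N)`` array:

    >>> res2 = stats.linregress(np.stack((x, y)))
    >>> res2.slope == res.slope
    True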
  109. """
    TINY = 1.0e-20
    if y is None:  # x is a (2, N) or (N, 2) shaped array_like
        x = np.asarray(x)
        if x.shape[0] == 2:
            x, y = x
        elif x.shape[1] == 2:
            x, y = x.T
        else:
            raise ValueError("If only `x` is given as input, it has to "
                             "be of shape (2, N) or (N, 2); provided shape "
                             f"was {x.shape}.")
    else:
        x = np.asarray(x)
        y = np.asarray(y)

    if x.size == 0 or y.size == 0:
        raise ValueError("Inputs must not be empty.")

    if np.amax(x) == np.amin(x) and len(x) > 1:
        raise ValueError("Cannot calculate a linear regression "
                         "if all x values are identical")

    n = len(x)
    xmean = np.mean(x, None)
    ymean = np.mean(y, None)

    # Average sums of square differences from the mean
    #   ssxm = mean( (x-mean(x))^2 )
    #   ssxym = mean( (x-mean(x)) * (y-mean(y)) )
    ssxm, ssxym, _, ssym = np.cov(x, y, bias=1).flat

    # R-value
    #   r = ssxym / sqrt( ssxm * ssym )
    if ssxm == 0.0 or ssym == 0.0:
        # If the denominator was going to be 0
        r = 0.0
    else:
        r = ssxym / np.sqrt(ssxm * ssym)
        # Test for numerical error propagation (make sure -1 < r < 1)
        if r > 1.0:
            r = 1.0
        elif r < -1.0:
            r = -1.0

    slope = ssxym / ssxm
    intercept = ymean - slope*xmean
    if n == 2:
        # handle case when only two points are passed in
        if y[0] == y[1]:
            prob = 1.0
        else:
            prob = 0.0
        slope_stderr = 0.0
        intercept_stderr = 0.0
    else:
        df = n - 2  # Number of degrees of freedom
        # n-2 degrees of freedom because 2 parameters (slope and intercept)
        # have been estimated from the data
        t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
        t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)

        slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)

        # Also calculate the standard error of the intercept
        # The following relationship is used:
        #   ssxm = mean( (x-mean(x))^2 )
        #        = ssx - sx*sx
        #        = mean( x^2 ) - mean(x)^2
        intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)

    return LinregressResult(slope=slope, intercept=intercept, rvalue=r,
                            pvalue=prob, stderr=slope_stderr,
                            intercept_stderr=intercept_stderr)


def theilslopes(y, x=None, alpha=0.95, method='separate'):
    r"""
    Computes the Theil-Sen estimator for a set of points (x, y).

    `theilslopes` implements a method for robust linear regression.  It
    computes the slope as the median of all slopes between paired values.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    alpha : float, optional
        Confidence degree between 0 and 1. Default is 95% confidence.
        Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
        interpreted as "find the 90% confidence interval".
    method : {'joint', 'separate'}, optional
        Method to be used for computing the estimate of the intercept.
        The following methods are supported:

            * 'joint': Uses np.median(y - slope * x) as intercept.
            * 'separate': Uses np.median(y) - slope * np.median(x)
              as intercept.

        The default is 'separate'.

        .. versionadded:: 1.8.0

    Returns
    -------
    result : ``TheilslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Theil slope.
        intercept : float
            Intercept of the Theil line.
        low_slope : float
            Lower bound of the confidence interval on `slope`.
        high_slope : float
            Upper bound of the confidence interval on `slope`.

    See Also
    --------
    siegelslopes : a similar technique using repeated medians

    Notes
    -----
    The implementation of `theilslopes` follows [1]_. The intercept is
    not defined in [1]_, and here it is defined as ``median(y) -
    slope*median(x)``, which is given in [3]_. Other definitions of
    the intercept exist in the literature, such as ``median(y - slope*x)``
    in [4]_. The approach used to compute the intercept is determined by
    the parameter `method`. A confidence interval for the intercept is not
    given, as this question is not addressed in [1]_.

    For compatibility with older versions of SciPy, the return value acts
    like a ``namedtuple`` of length 4, with fields ``slope``, ``intercept``,
    ``low_slope``, and ``high_slope``, so one can continue to write::

        slope, intercept, low_slope, high_slope = theilslopes(y, x)

    References
    ----------
    .. [1] P.K. Sen, "Estimates of the regression coefficient based on
           Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
    .. [2] H. Theil, "A rank-invariant method of linear and polynomial
           regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.
           53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
    .. [3] W.J. Conover, "Practical Nonparametric Statistics", 2nd ed.,
           John Wiley and Sons, New York, pp. 493.
    .. [4] https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    >>> x = np.linspace(-5, 5, num=150)
    >>> y = x + np.random.normal(size=x.size)
    >>> y[11:15] += 10  # add outliers
    >>> y[-5:] -= 7

    Compute the slope, intercept and 90% confidence interval.  For
    comparison, also compute the least-squares fit with `linregress`:

    >>> res = stats.theilslopes(y, x, 0.90, method='separate')
    >>> lsq_res = stats.linregress(x, y)

    Plot the results. The Theil-Sen regression line is shown in red, with the
    dashed red lines illustrating the confidence interval of the slope (note
    that the dashed red lines are not the confidence interval of the
    regression, as the confidence interval of the intercept is not included).
    The green line shows the least-squares fit for comparison.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(x, y, 'b.')
    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
    >>> ax.plot(x, res[1] + res[2] * x, 'r--')
    >>> ax.plot(x, res[1] + res[3] * x, 'r--')
    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
    >>> plt.show()
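
    As a quick check, the confidence bounds bracket the slope estimate, and
    the 'joint' intercept method described above can be selected the same
    way:

    >>> res.low_slope <= res.slope <= res.high_slope
    True
    >>> res_joint = stats.theilslopes(y, x, 0.90, method='joint')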
  261. """
    if method not in ['joint', 'separate']:
        raise ValueError("method must be either 'joint' or 'separate'. "
                         f"'{method}' is invalid.")
    # We copy both x and y so we can use _find_repeats.
    y = np.array(y).flatten()
    if x is None:
        x = np.arange(len(y), dtype=float)
    else:
        x = np.array(x, dtype=float).flatten()
        if len(x) != len(y):
            raise ValueError(f"Incompatible lengths! ({len(y)}<>{len(x)})")

    # Compute sorted slopes only when deltax > 0
    deltax = x[:, np.newaxis] - x
    deltay = y[:, np.newaxis] - y
    slopes = deltay[deltax > 0] / deltax[deltax > 0]
    if not slopes.size:
        msg = "All `x` coordinates are identical."
        warnings.warn(msg, RuntimeWarning, stacklevel=2)
    slopes.sort()
    medslope = np.median(slopes)
    if method == 'joint':
        medinter = np.median(y - medslope * x)
    else:
        medinter = np.median(y) - medslope * np.median(x)

    # Now compute confidence intervals
    if alpha > 0.5:
        alpha = 1. - alpha
    z = distributions.norm.ppf(alpha / 2.)

    # This implements (2.6) from Sen (1968)
    _, nxreps = _find_repeats(x)
    _, nyreps = _find_repeats(y)
    nt = len(slopes)  # N in Sen (1968)
    ny = len(y)       # n in Sen (1968)
    # Equation 2.6 in Sen (1968):
    sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
                     sum(k * (k-1) * (2*k + 5) for k in nxreps) -
                     sum(k * (k-1) * (2*k + 5) for k in nyreps))

    # Find the confidence interval indices in `slopes`
    try:
        sigma = np.sqrt(sigsq)
        Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
        Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
        delta = slopes[[Rl, Ru]]
    except (ValueError, IndexError):
        delta = (np.nan, np.nan)

    return TheilslopesResult(slope=medslope, intercept=medinter,
                             low_slope=delta[0], high_slope=delta[1])


def _find_repeats(arr):
    # This function assumes it may clobber its input.
    if len(arr) == 0:
        return np.array(0, np.float64), np.array(0, np.intp)

    # XXX This cast was previously needed for the Fortran implementation,
    # should we ditch it?
    arr = np.asarray(arr, np.float64).ravel()
    arr.sort()

    # Taken from NumPy 1.9's np.unique.
    change = np.concatenate(([True], arr[1:] != arr[:-1]))
    unique = arr[change]
    change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
    freq = np.diff(change_idx)
    atleast2 = freq > 1
    return unique[atleast2], freq[atleast2]
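
# For illustration, `_find_repeats` returns the values that occur at least
# twice in its input along with their counts; e.g. for [1., 2., 2., 3., 3., 3.]
# it returns (array([2., 3.]), array([2, 3])).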


def siegelslopes(y, x=None, method="hierarchical"):
    r"""
    Computes the Siegel estimator for a set of points (x, y).

    `siegelslopes` implements a method for robust linear regression
    using repeated medians (see [1]_) to fit a line to the points (x, y).
    The method is robust to outliers with an asymptotic breakdown point
    of 50%.

    Parameters
    ----------
    y : array_like
        Dependent variable.
    x : array_like or None, optional
        Independent variable. If None, use ``arange(len(y))`` instead.
    method : {'hierarchical', 'separate'}
        If 'hierarchical', estimate the intercept using the estimated
        slope ``slope`` (default option).
        If 'separate', estimate the intercept independent of the estimated
        slope. See Notes for details.

    Returns
    -------
    result : ``SiegelslopesResult`` instance
        The return value is an object with the following attributes:

        slope : float
            Estimate of the slope of the regression line.
        intercept : float
            Estimate of the intercept of the regression line.

    See Also
    --------
    theilslopes : a similar technique without repeated medians

    Notes
    -----
    With ``n = len(y)``, compute ``m_j`` as the median of
    the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
    ``slope`` is then the median of all slopes ``m_j``.
    Two ways are given to estimate the intercept in [1]_ which can be chosen
    via the parameter ``method``.
    The hierarchical approach uses the estimated slope ``slope``
    and computes ``intercept`` as the median of ``y - slope*x``.
    The other approach estimates the intercept separately as follows: for
    each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
    lines through the remaining points and take the median ``i_j``.
    ``intercept`` is the median of the ``i_j``.

    The implementation computes `n` times the median of a vector of size `n`,
    which can be slow for large vectors. There are more efficient algorithms
    (see [2]_) which are not implemented here.

    For compatibility with older versions of SciPy, the return value acts
    like a ``namedtuple`` of length 2, with fields ``slope`` and
    ``intercept``, so one can continue to write::

        slope, intercept = siegelslopes(y, x)

    References
    ----------
    .. [1] A. Siegel, "Robust Regression Using Repeated Medians",
           Biometrika, Vol. 69, pp. 242-244, 1982.
    .. [2] A. Stein and M. Werman, "Finding the repeated median regression
           line", Proceedings of the Third Annual ACM-SIAM Symposium on
           Discrete Algorithms, pp. 409-413, 1992.

    Examples
    --------
    >>> import numpy as np
    >>> from scipy import stats
    >>> import matplotlib.pyplot as plt

    >>> x = np.linspace(-5, 5, num=150)
    >>> y = x + np.random.normal(size=x.size)
    >>> y[11:15] += 10  # add outliers
    >>> y[-5:] -= 7

    Compute the slope and intercept.  For comparison, also compute the
    least-squares fit with `linregress`:

    >>> res = stats.siegelslopes(y, x)
    >>> lsq_res = stats.linregress(x, y)

    Plot the results. The Siegel regression line is shown in red. The green
    line shows the least-squares fit for comparison.

    >>> fig = plt.figure()
    >>> ax = fig.add_subplot(111)
    >>> ax.plot(x, y, 'b.')
    >>> ax.plot(x, res[1] + res[0] * x, 'r-')
    >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
    >>> plt.show()
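
    The 'separate' intercept estimate described in the Notes can be
    requested explicitly, and tuple unpacking still works:

    >>> res_sep = stats.siegelslopes(y, x, method='separate')
    >>> slope, intercept = res_sep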
  402. """
    if method not in ['hierarchical', 'separate']:
        raise ValueError("method can only be 'hierarchical' or 'separate'")
    y = np.asarray(y).ravel()
    if x is None:
        x = np.arange(len(y), dtype=float)
    else:
        x = np.asarray(x, dtype=float).ravel()
        if len(x) != len(y):
            raise ValueError(f"Incompatible lengths! ({len(y)}<>{len(x)})")

    dtype = np.result_type(x, y, np.float32)  # use at least float32
    y, x = y.astype(dtype), x.astype(dtype)

    medslope, medinter = siegelslopes_pythran(y, x, method)
    return SiegelslopesResult(slope=medslope, intercept=medinter)