123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501 |
- import warnings
- import numpy as np
- import scipy.stats._stats_py
- from . import distributions
- from .._lib._bunch import _make_tuple_bunch
- from ._stats_pythran import siegelslopes as siegelslopes_pythran
- __all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes']
- # This is not a namedtuple for backwards compatibility. See PR #12983
- LinregressResult = _make_tuple_bunch('LinregressResult',
- ['slope', 'intercept', 'rvalue',
- 'pvalue', 'stderr'],
- extra_field_names=['intercept_stderr'])
- TheilslopesResult = _make_tuple_bunch('TheilslopesResult',
- ['slope', 'intercept',
- 'low_slope', 'high_slope'])
- SiegelslopesResult = _make_tuple_bunch('SiegelslopesResult',
- ['slope', 'intercept'])
- def linregress(x, y=None, alternative='two-sided'):
- """
- Calculate a linear least-squares regression for two sets of measurements.
- Parameters
- ----------
- x, y : array_like
- Two sets of measurements. Both arrays should have the same length. If
- only `x` is given (and ``y=None``), then it must be a two-dimensional
- array where one dimension has length 2. The two sets of measurements
- are then found by splitting the array along the length-2 dimension. In
- the case where ``y=None`` and `x` is a 2x2 array, ``linregress(x)`` is
- equivalent to ``linregress(x[0], x[1])``.
- alternative : {'two-sided', 'less', 'greater'}, optional
- Defines the alternative hypothesis. Default is 'two-sided'.
- The following options are available:
- * 'two-sided': the slope of the regression line is nonzero
- * 'less': the slope of the regression line is less than zero
- * 'greater': the slope of the regression line is greater than zero
- .. versionadded:: 1.7.0
- Returns
- -------
- result : ``LinregressResult`` instance
- The return value is an object with the following attributes:
- slope : float
- Slope of the regression line.
- intercept : float
- Intercept of the regression line.
- rvalue : float
- The Pearson correlation coefficient. The square of ``rvalue``
- is equal to the coefficient of determination.
- pvalue : float
- The p-value for a hypothesis test whose null hypothesis is
- that the slope is zero, using Wald Test with t-distribution of
- the test statistic. See `alternative` above for alternative
- hypotheses.
- stderr : float
- Standard error of the estimated slope (gradient), under the
- assumption of residual normality.
- intercept_stderr : float
- Standard error of the estimated intercept, under the assumption
- of residual normality.
- See Also
- --------
- scipy.optimize.curve_fit :
- Use non-linear least squares to fit a function to data.
- scipy.optimize.leastsq :
- Minimize the sum of squares of a set of equations.
- Notes
- -----
- Missing values are considered pair-wise: if a value is missing in `x`,
- the corresponding value in `y` is masked.
- For compatibility with older versions of SciPy, the return value acts
- like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``,
- ``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write::
- slope, intercept, r, p, se = linregress(x, y)
- With that style, however, the standard error of the intercept is not
- available. To have access to all the computed values, including the
- standard error of the intercept, use the return value as an object
- with attributes, e.g.::
- result = linregress(x, y)
- print(result.intercept, result.intercept_stderr)
- Examples
- --------
- >>> import numpy as np
- >>> import matplotlib.pyplot as plt
- >>> from scipy import stats
- >>> rng = np.random.default_rng()
- Generate some data:
- >>> x = rng.random(10)
- >>> y = 1.6*x + rng.random(10)
- Perform the linear regression:
- >>> res = stats.linregress(x, y)
- Coefficient of determination (R-squared):
- >>> print(f"R-squared: {res.rvalue**2:.6f}")
- R-squared: 0.717533
- Plot the data along with the fitted line:
- >>> plt.plot(x, y, 'o', label='original data')
- >>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
- >>> plt.legend()
- >>> plt.show()
- Calculate 95% confidence interval on slope and intercept:
- >>> # Two-sided inverse Students t-distribution
- >>> # p - probability, df - degrees of freedom
- >>> from scipy.stats import t
- >>> tinv = lambda p, df: abs(t.ppf(p/2, df))
- >>> ts = tinv(0.05, len(x)-2)
- >>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")
- slope (95%): 1.453392 +/- 0.743465
- >>> print(f"intercept (95%): {res.intercept:.6f}"
- ... f" +/- {ts*res.intercept_stderr:.6f}")
- intercept (95%): 0.616950 +/- 0.544475
- """
- TINY = 1.0e-20
- if y is None: # x is a (2, N) or (N, 2) shaped array_like
- x = np.asarray(x)
- if x.shape[0] == 2:
- x, y = x
- elif x.shape[1] == 2:
- x, y = x.T
- else:
- raise ValueError("If only `x` is given as input, it has to "
- "be of shape (2, N) or (N, 2); provided shape "
- f"was {x.shape}.")
- else:
- x = np.asarray(x)
- y = np.asarray(y)
- if x.size == 0 or y.size == 0:
- raise ValueError("Inputs must not be empty.")
- if np.amax(x) == np.amin(x) and len(x) > 1:
- raise ValueError("Cannot calculate a linear regression "
- "if all x values are identical")
- n = len(x)
- xmean = np.mean(x, None)
- ymean = np.mean(y, None)
- # Average sums of square differences from the mean
- # ssxm = mean( (x-mean(x))^2 )
- # ssxym = mean( (x-mean(x)) * (y-mean(y)) )
- ssxm, ssxym, _, ssym = np.cov(x, y, bias=1).flat
- # R-value
- # r = ssxym / sqrt( ssxm * ssym )
- if ssxm == 0.0 or ssym == 0.0:
- # If the denominator was going to be 0
- r = 0.0
- else:
- r = ssxym / np.sqrt(ssxm * ssym)
- # Test for numerical error propagation (make sure -1 < r < 1)
- if r > 1.0:
- r = 1.0
- elif r < -1.0:
- r = -1.0
- slope = ssxym / ssxm
- intercept = ymean - slope*xmean
- if n == 2:
- # handle case when only two points are passed in
- if y[0] == y[1]:
- prob = 1.0
- else:
- prob = 0.0
- slope_stderr = 0.0
- intercept_stderr = 0.0
- else:
- df = n - 2 # Number of degrees of freedom
- # n-2 degrees of freedom because 2 has been used up
- # to estimate the mean and standard deviation
- t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
- t, prob = scipy.stats._stats_py._ttest_finish(df, t, alternative)
- slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
- # Also calculate the standard error of the intercept
- # The following relationship is used:
- # ssxm = mean( (x-mean(x))^2 )
- # = ssx - sx*sx
- # = mean( x^2 ) - mean(x)^2
- intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)
- return LinregressResult(slope=slope, intercept=intercept, rvalue=r,
- pvalue=prob, stderr=slope_stderr,
- intercept_stderr=intercept_stderr)
- def theilslopes(y, x=None, alpha=0.95, method='separate'):
- r"""
- Computes the Theil-Sen estimator for a set of points (x, y).
- `theilslopes` implements a method for robust linear regression. It
- computes the slope as the median of all slopes between paired values.
- Parameters
- ----------
- y : array_like
- Dependent variable.
- x : array_like or None, optional
- Independent variable. If None, use ``arange(len(y))`` instead.
- alpha : float, optional
- Confidence degree between 0 and 1. Default is 95% confidence.
- Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
- interpreted as "find the 90% confidence interval".
- method : {'joint', 'separate'}, optional
- Method to be used for computing estimate for intercept.
- Following methods are supported,
- * 'joint': Uses np.median(y - slope * x) as intercept.
- * 'separate': Uses np.median(y) - slope * np.median(x)
- as intercept.
- The default is 'separate'.
- .. versionadded:: 1.8.0
- Returns
- -------
- result : ``TheilslopesResult`` instance
- The return value is an object with the following attributes:
- slope : float
- Theil slope.
- intercept : float
- Intercept of the Theil line.
- low_slope : float
- Lower bound of the confidence interval on `slope`.
- high_slope : float
- Upper bound of the confidence interval on `slope`.
- See Also
- --------
- siegelslopes : a similar technique using repeated medians
- Notes
- -----
- The implementation of `theilslopes` follows [1]_. The intercept is
- not defined in [1]_, and here it is defined as ``median(y) -
- slope*median(x)``, which is given in [3]_. Other definitions of
- the intercept exist in the literature such as ``median(y - slope*x)``
- in [4]_. The approach to compute the intercept can be determined by the
- parameter ``method``. A confidence interval for the intercept is not
- given as this question is not addressed in [1]_.
- For compatibility with older versions of SciPy, the return value acts
- like a ``namedtuple`` of length 4, with fields ``slope``, ``intercept``,
- ``low_slope``, and ``high_slope``, so one can continue to write::
- slope, intercept, low_slope, high_slope = theilslopes(y, x)
- References
- ----------
- .. [1] P.K. Sen, "Estimates of the regression coefficient based on
- Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
- .. [2] H. Theil, "A rank-invariant method of linear and polynomial
- regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.
- 53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
- .. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
- John Wiley and Sons, New York, pp. 493.
- .. [4] https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- >>> x = np.linspace(-5, 5, num=150)
- >>> y = x + np.random.normal(size=x.size)
- >>> y[11:15] += 10 # add outliers
- >>> y[-5:] -= 7
- Compute the slope, intercept and 90% confidence interval. For comparison,
- also compute the least-squares fit with `linregress`:
- >>> res = stats.theilslopes(y, x, 0.90, method='separate')
- >>> lsq_res = stats.linregress(x, y)
- Plot the results. The Theil-Sen regression line is shown in red, with the
- dashed red lines illustrating the confidence interval of the slope (note
- that the dashed red lines are not the confidence interval of the regression
- as the confidence interval of the intercept is not included). The green
- line shows the least-squares fit for comparison.
- >>> fig = plt.figure()
- >>> ax = fig.add_subplot(111)
- >>> ax.plot(x, y, 'b.')
- >>> ax.plot(x, res[1] + res[0] * x, 'r-')
- >>> ax.plot(x, res[1] + res[2] * x, 'r--')
- >>> ax.plot(x, res[1] + res[3] * x, 'r--')
- >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
- >>> plt.show()
- """
- if method not in ['joint', 'separate']:
- raise ValueError(("method must be either 'joint' or 'separate'."
- "'{}' is invalid.".format(method)))
- # We copy both x and y so we can use _find_repeats.
- y = np.array(y).flatten()
- if x is None:
- x = np.arange(len(y), dtype=float)
- else:
- x = np.array(x, dtype=float).flatten()
- if len(x) != len(y):
- raise ValueError("Incompatible lengths ! (%s<>%s)" %
- (len(y), len(x)))
- # Compute sorted slopes only when deltax > 0
- deltax = x[:, np.newaxis] - x
- deltay = y[:, np.newaxis] - y
- slopes = deltay[deltax > 0] / deltax[deltax > 0]
- if not slopes.size:
- msg = "All `x` coordinates are identical."
- warnings.warn(msg, RuntimeWarning, stacklevel=2)
- slopes.sort()
- medslope = np.median(slopes)
- if method == 'joint':
- medinter = np.median(y - medslope * x)
- else:
- medinter = np.median(y) - medslope * np.median(x)
- # Now compute confidence intervals
- if alpha > 0.5:
- alpha = 1. - alpha
- z = distributions.norm.ppf(alpha / 2.)
- # This implements (2.6) from Sen (1968)
- _, nxreps = _find_repeats(x)
- _, nyreps = _find_repeats(y)
- nt = len(slopes) # N in Sen (1968)
- ny = len(y) # n in Sen (1968)
- # Equation 2.6 in Sen (1968):
- sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
- sum(k * (k-1) * (2*k + 5) for k in nxreps) -
- sum(k * (k-1) * (2*k + 5) for k in nyreps))
- # Find the confidence interval indices in `slopes`
- try:
- sigma = np.sqrt(sigsq)
- Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
- Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
- delta = slopes[[Rl, Ru]]
- except (ValueError, IndexError):
- delta = (np.nan, np.nan)
- return TheilslopesResult(slope=medslope, intercept=medinter,
- low_slope=delta[0], high_slope=delta[1])
- def _find_repeats(arr):
- # This function assumes it may clobber its input.
- if len(arr) == 0:
- return np.array(0, np.float64), np.array(0, np.intp)
- # XXX This cast was previously needed for the Fortran implementation,
- # should we ditch it?
- arr = np.asarray(arr, np.float64).ravel()
- arr.sort()
- # Taken from NumPy 1.9's np.unique.
- change = np.concatenate(([True], arr[1:] != arr[:-1]))
- unique = arr[change]
- change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
- freq = np.diff(change_idx)
- atleast2 = freq > 1
- return unique[atleast2], freq[atleast2]
- def siegelslopes(y, x=None, method="hierarchical"):
- r"""
- Computes the Siegel estimator for a set of points (x, y).
- `siegelslopes` implements a method for robust linear regression
- using repeated medians (see [1]_) to fit a line to the points (x, y).
- The method is robust to outliers with an asymptotic breakdown point
- of 50%.
- Parameters
- ----------
- y : array_like
- Dependent variable.
- x : array_like or None, optional
- Independent variable. If None, use ``arange(len(y))`` instead.
- method : {'hierarchical', 'separate'}
- If 'hierarchical', estimate the intercept using the estimated
- slope ``slope`` (default option).
- If 'separate', estimate the intercept independent of the estimated
- slope. See Notes for details.
- Returns
- -------
- result : ``SiegelslopesResult`` instance
- The return value is an object with the following attributes:
- slope : float
- Estimate of the slope of the regression line.
- intercept : float
- Estimate of the intercept of the regression line.
- See Also
- --------
- theilslopes : a similar technique without repeated medians
- Notes
- -----
- With ``n = len(y)``, compute ``m_j`` as the median of
- the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
- ``slope`` is then the median of all slopes ``m_j``.
- Two ways are given to estimate the intercept in [1]_ which can be chosen
- via the parameter ``method``.
- The hierarchical approach uses the estimated slope ``slope``
- and computes ``intercept`` as the median of ``y - slope*x``.
- The other approach estimates the intercept separately as follows: for
- each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
- lines through the remaining points and take the median ``i_j``.
- ``intercept`` is the median of the ``i_j``.
- The implementation computes `n` times the median of a vector of size `n`
- which can be slow for large vectors. There are more efficient algorithms
- (see [2]_) which are not implemented here.
- For compatibility with older versions of SciPy, the return value acts
- like a ``namedtuple`` of length 2, with fields ``slope`` and
- ``intercept``, so one can continue to write::
- slope, intercept = siegelslopes(y, x)
- References
- ----------
- .. [1] A. Siegel, "Robust Regression Using Repeated Medians",
- Biometrika, Vol. 69, pp. 242-244, 1982.
- .. [2] A. Stein and M. Werman, "Finding the repeated median regression
- line", Proceedings of the Third Annual ACM-SIAM Symposium on
- Discrete Algorithms, pp. 409-413, 1992.
- Examples
- --------
- >>> import numpy as np
- >>> from scipy import stats
- >>> import matplotlib.pyplot as plt
- >>> x = np.linspace(-5, 5, num=150)
- >>> y = x + np.random.normal(size=x.size)
- >>> y[11:15] += 10 # add outliers
- >>> y[-5:] -= 7
- Compute the slope and intercept. For comparison, also compute the
- least-squares fit with `linregress`:
- >>> res = stats.siegelslopes(y, x)
- >>> lsq_res = stats.linregress(x, y)
- Plot the results. The Siegel regression line is shown in red. The green
- line shows the least-squares fit for comparison.
- >>> fig = plt.figure()
- >>> ax = fig.add_subplot(111)
- >>> ax.plot(x, y, 'b.')
- >>> ax.plot(x, res[1] + res[0] * x, 'r-')
- >>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
- >>> plt.show()
- """
- if method not in ['hierarchical', 'separate']:
- raise ValueError("method can only be 'hierarchical' or 'separate'")
- y = np.asarray(y).ravel()
- if x is None:
- x = np.arange(len(y), dtype=float)
- else:
- x = np.asarray(x, dtype=float).ravel()
- if len(x) != len(y):
- raise ValueError("Incompatible lengths ! (%s<>%s)" %
- (len(y), len(x)))
- dtype = np.result_type(x, y, np.float32) # use at least float32
- y, x = y.astype(dtype), x.astype(dtype)
- medslope, medinter = siegelslopes_pythran(y, x, method)
- return SiegelslopesResult(slope=medslope, intercept=medinter)
|