- import warnings
- from collections import namedtuple
- import numpy as np
- from scipy import optimize, stats
- from scipy._lib._util import check_random_state
- def _combine_bounds(name, user_bounds, shape_domain, integral):
- """Intersection of user-defined bounds and distribution PDF/PMF domain"""
- user_bounds = np.atleast_1d(user_bounds)
- if user_bounds[0] > user_bounds[1]:
- message = (f"There are no values for `{name}` on the interval "
- f"{list(user_bounds)}.")
- raise ValueError(message)
- bounds = (max(user_bounds[0], shape_domain[0]),
- min(user_bounds[1], shape_domain[1]))
- if integral and (np.ceil(bounds[0]) > np.floor(bounds[1])):
- message = (f"There are no integer values for `{name}` on the interval "
- f"defined by the user-provided bounds and the domain "
- "of the distribution.")
- raise ValueError(message)
- elif not integral and (bounds[0] > bounds[1]):
- message = (f"There are no values for `{name}` on the interval "
- f"defined by the user-provided bounds and the domain "
- "of the distribution.")
- raise ValueError(message)
- if not np.all(np.isfinite(bounds)):
- message = (f"The intersection of user-provided bounds for `{name}` "
- f"and the domain of the distribution is not finite. Please "
- f"provide finite bounds for shape `{name}` in `bounds`.")
- raise ValueError(message)
- return bounds
- class FitResult:
- r"""Result of fitting a discrete or continuous distribution to data
- Attributes
- ----------
- params : namedtuple
- A namedtuple containing the maximum likelihood estimates of the
- shape parameters, location, and (if applicable) scale of the
- distribution.
- success : bool or None
- Whether the optimizer considered the optimization to terminate
- successfully or not.
- message : str or None
- Any status message provided by the optimizer.
- """
- def __init__(self, dist, data, discrete, res):
- self._dist = dist
- self._data = data
- self.discrete = discrete
- self.pxf = getattr(dist, "pmf", None) or getattr(dist, "pdf", None)
- shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
- if not discrete:
- FitParams = namedtuple('FitParams', shape_names + ['loc', 'scale'])
- else:
- FitParams = namedtuple('FitParams', shape_names + ['loc'])
- self.params = FitParams(*res.x)
- # Optimizer can report success even when nllf is infinite
- if res.success and not np.isfinite(self.nllf()):
- res.success = False
- res.message = ("Optimization converged to parameter values that "
- "are inconsistent with the data.")
- self.success = getattr(res, "success", None)
- self.message = getattr(res, "message", None)
- def __repr__(self):
- keys = ["params", "success", "message"]
- m = max(map(len, keys)) + 1
- return '\n'.join([key.rjust(m) + ': ' + repr(getattr(self, key))
- for key in keys if getattr(self, key) is not None])
- def nllf(self, params=None, data=None):
- """Negative log-likelihood function
- Evaluates the negative of the log-likelihood function of the provided
- data at the provided parameters.
- Parameters
- ----------
- params : tuple, optional
- The shape parameters, location, and (if applicable) scale of the
- distribution as a single tuple. Default is the maximum likelihood
- estimates (``self.params``).
- data : array_like, optional
- The data for which the log-likelihood function is to be evaluated.
- Default is the data to which the distribution was fit.
- Returns
- -------
- nllf : float
- The negative of the log-likelihood function.
- """
- params = params if params is not None else self.params
- data = data if data is not None else self._data
- return self._dist.nnlf(theta=params, x=data)
- def plot(self, ax=None, *, plot_type="hist"):
- """Visually compare the data against the fitted distribution.
- Available only if ``matplotlib`` is installed.
- Parameters
- ----------
- ax : matplotlib.axes.Axes
- Axes object to draw the plot onto, otherwise uses the current Axes.
- plot_type : {"hist", "qq", "pp", "cdf"}
- Type of plot to draw. Options include:
- - "hist": Superposes the PDF/PMF of the fitted distribution
- over a normalized histogram of the data.
- - "qq": Scatter plot of theoretical quantiles against the
- empirical quantiles. Specifically, the x-coordinates are the
- values of the fitted distribution PPF evaluated at the
- percentiles ``(np.arange(1, n + 1) - 0.5)/n``, where ``n`` is the
- number of data points, and the y-coordinates are the sorted
- data points.
- - "pp": Scatter plot of theoretical percentiles against the
- observed percentiles. Specifically, the x-coordinates are the
- percentiles ``(np.arange(1, n + 1) - 0.5)/n``, where ``n`` is
- the number of data points, and the y-coordinates are the values
- of the fitted distribution CDF evaluated at the sorted
- data points.
- - "cdf": Superposes the CDF of the fitted distribution over the
- empirical CDF. Specifically, the x-coordinates of the empirical
- CDF are the sorted data points, and the y-coordinates are the
- percentiles ``(np.arange(1, n + 1) - 0.5)/n``, where ``n`` is
- the number of data points.
- Returns
- -------
- ax : matplotlib.axes.Axes
- The matplotlib Axes object on which the plot was drawn.
- """
- try:
- import matplotlib # noqa
- except ModuleNotFoundError as exc:
- message = "matplotlib must be installed to use method `plot`."
- raise ModuleNotFoundError(message) from exc
- plots = {'histogram': self._hist_plot, 'qq': self._qq_plot,
- 'pp': self._pp_plot, 'cdf': self._cdf_plot,
- 'hist': self._hist_plot}
- if plot_type.lower() not in plots:
- message = f"`plot_type` must be one of {set(plots.keys())}"
- raise ValueError(message)
- plot = plots[plot_type.lower()]
- if ax is None:
- import matplotlib.pyplot as plt
- ax = plt.gca()
- fit_params = np.atleast_1d(self.params)
- return plot(ax=ax, fit_params=fit_params)
- def _hist_plot(self, ax, fit_params):
- from matplotlib.ticker import MaxNLocator
- support = self._dist.support(*fit_params)
- lb = support[0] if np.isfinite(support[0]) else min(self._data)
- ub = support[1] if np.isfinite(support[1]) else max(self._data)
- pxf = "PMF" if self.discrete else "PDF"
- if self.discrete:
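- # Evaluate the PMF at each integer in the support and draw it as vertical
- # lines; with ``bins=x`` and ``align='left'``, each integer gets its own
- # histogram bin.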
- x = np.arange(lb, ub + 2)
- y = self.pxf(x, *fit_params)
- ax.vlines(x[:-1], 0, y[:-1], label='Fitted Distribution PMF',
- color='C0')
- options = dict(density=True, bins=x, align='left', color='C1')
- ax.xaxis.set_major_locator(MaxNLocator(integer=True))
- ax.set_xlabel('k')
- ax.set_ylabel('PMF')
- else:
- x = np.linspace(lb, ub, 200)
- y = self.pxf(x, *fit_params)
- ax.plot(x, y, '--', label='Fitted Distribution PDF', color='C0')
- options = dict(density=True, bins=50, align='mid', color='C1')
- ax.set_xlabel('x')
- ax.set_ylabel('PDF')
- if len(self._data) > 50 or self.discrete:
- ax.hist(self._data, label="Histogram of Data", **options)
- else:
- ax.plot(self._data, np.zeros_like(self._data), "*",
- label='Data', color='C1')
- ax.set_title(rf"Fitted $\tt {self._dist.name}$ {pxf} and Histogram")
- ax.legend(*ax.get_legend_handles_labels())
- return ax
- def _qp_plot(self, ax, fit_params, qq):
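- # Shared helper for `_qq_plot` and `_pp_plot`: when `qq` is True, draws a Q-Q
- # plot of fitted-distribution quantiles against the sorted data; otherwise a
- # P-P plot of plotting positions against the fitted CDF at the sorted data.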
- data = np.sort(self._data)
- ps = self._plotting_positions(len(self._data))
- if qq:
- qp = "Quantiles"
- plot_type = 'Q-Q'
- x = self._dist.ppf(ps, *fit_params)
- y = data
- else:
- qp = "Percentiles"
- plot_type = 'P-P'
- x = ps
- y = self._dist.cdf(data, *fit_params)
- ax.plot(x, y, '.', label=f'Fitted Distribution {plot_type}',
- color='C0', zorder=1)
- xlim = ax.get_xlim()
- ylim = ax.get_ylim()
- lim = [min(xlim[0], ylim[0]), max(xlim[1], ylim[1])]
- if not qq:
- lim = max(lim[0], 0), min(lim[1], 1)
- if self.discrete and qq:
- q_min, q_max = int(lim[0]), int(lim[1]+1)
- q_ideal = np.arange(q_min, q_max)
- # q_ideal = np.unique(self._dist.ppf(ps, *fit_params))
- ax.plot(q_ideal, q_ideal, 'o', label='Reference', color='k',
- alpha=0.25, markerfacecolor='none', clip_on=True)
- elif self.discrete and not qq:
- # The intent of this is to match the plot that would be produced
- # if x were continuous on [0, 1] and y were cdf(ppf(x)).
- # It can be approximated by letting x = np.linspace(0, 1, 1000),
- # but this might not look great when zooming in. The vertical
- # portions are included to indicate where the transition occurs
- # where the data completely obscures the horizontal portions.
- p_min, p_max = lim
- a, b = self._dist.support(*fit_params)
- p_min = max(p_min, 0 if np.isfinite(a) else 1e-3)
- p_max = min(p_max, 1 if np.isfinite(b) else 1-1e-3)
- q_min, q_max = self._dist.ppf([p_min, p_max], *fit_params)
- qs = np.arange(q_min-1, q_max+1)
- ps = self._dist.cdf(qs, *fit_params)
- ax.step(ps, ps, '-', label='Reference', color='k', alpha=0.25,
- clip_on=True)
- else:
- ax.plot(lim, lim, '-', label='Reference', color='k', alpha=0.25,
- clip_on=True)
- ax.set_xlim(lim)
- ax.set_ylim(lim)
- ax.set_xlabel(rf"Fitted $\tt {self._dist.name}$ Theoretical {qp}")
- ax.set_ylabel(f"Data {qp}")
- ax.set_title(rf"Fitted $\tt {self._dist.name}$ {plot_type} Plot")
- ax.legend(*ax.get_legend_handles_labels())
- ax.set_aspect('equal')
- return ax
- def _qq_plot(self, **kwargs):
- return self._qp_plot(qq=True, **kwargs)
- def _pp_plot(self, **kwargs):
- return self._qp_plot(qq=False, **kwargs)
- def _plotting_positions(self, n, a=.5):
- # See https://en.wikipedia.org/wiki/Q%E2%80%93Q_plot#Plotting_positions
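- # With the default a=0.5 this reduces to (k - 0.5)/n, the midpoints of n
- # equal-probability intervals.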
- k = np.arange(1, n+1)
- return (k-a) / (n + 1 - 2*a)
- def _cdf_plot(self, ax, fit_params):
- data = np.sort(self._data)
- ecdf = self._plotting_positions(len(self._data))
- ls = '--' if len(np.unique(data)) < 30 else '.'
- xlabel = 'k' if self.discrete else 'x'
- ax.step(data, ecdf, ls, label='Empirical CDF', color='C1', zorder=0)
- xlim = ax.get_xlim()
- q = np.linspace(*xlim, 300)
- tcdf = self._dist.cdf(q, *fit_params)
- ax.plot(q, tcdf, label='Fitted Distribution CDF', color='C0', zorder=1)
- ax.set_xlim(xlim)
- ax.set_ylim(0, 1)
- ax.set_xlabel(xlabel)
- ax.set_ylabel("CDF")
- ax.set_title(rf"Fitted $\tt {self._dist.name}$ and Empirical CDF")
- handles, labels = ax.get_legend_handles_labels()
- ax.legend(handles[::-1], labels[::-1])
- return ax
- def fit(dist, data, bounds=None, *, guess=None, method='mle',
- optimizer=optimize.differential_evolution):
- r"""Fit a discrete or continuous distribution to data
- Given a distribution, data, and bounds on the parameters of the
- distribution, return maximum likelihood estimates of the parameters.
- Parameters
- ----------
- dist : `scipy.stats.rv_continuous` or `scipy.stats.rv_discrete`
- The object representing the distribution to be fit to the data.
- data : 1D array_like
- The data to which the distribution is to be fit. If the data contain
- any of ``np.nan``, ``np.inf``, or ``-np.inf``, the fit method will
- raise a ``ValueError``.
- bounds : dict or sequence of tuples, optional
- If a dictionary, each key is the name of a parameter of the
- distribution, and the corresponding value is a tuple containing the
- lower and upper bound on that parameter. If the distribution is
- defined only for a finite range of values of that parameter, no entry
- for that parameter is required; e.g., some distributions have
- parameters which must be on the interval [0, 1]. Bounds for parameters
- location (``loc``) and scale (``scale``) are optional; by default,
- they are fixed to 0 and 1, respectively.
- If a sequence, element *i* is a tuple containing the lower and upper
- bound on the *i*\ th parameter of the distribution. In this case,
- bounds for *all* distribution shape parameters must be provided.
- Optionally, bounds for location and scale may follow the
- distribution shape parameters.
- If a shape is to be held fixed (e.g. if it is known), the
- lower and upper bounds may be equal. If a user-provided lower or upper
- bound is beyond a bound of the domain for which the distribution is
- defined, the bound of the distribution's domain will replace the
- user-provided value. Similarly, parameters which must be integral
- will be constrained to integral values within the user-provided bounds.
- guess : dict or array_like, optional
- If a dictionary, each key is the name of a parameter of the
- distribution, and the corresponding value is a guess for the value
- of the parameter.
- If a sequence, element *i* is a guess for the *i*\ th parameter of the
- distribution. In this case, guesses for *all* distribution shape
- parameters must be provided.
- If `guess` is not provided, guesses for the decision variables will
- not be passed to the optimizer. If `guess` is provided, guesses for
- any missing parameters will be set at the mean of the lower and
- upper bounds. Guesses for parameters which must be integral will be
- rounded to integral values, and guesses that lie outside the
- intersection of the user-provided bounds and the domain of the
- distribution will be clipped.
- method : {'mle', 'mse'}
- With ``method="mle"`` (default), the fit is computed by minimizing
- the negative log-likelihood function. A large, finite penalty
- (rather than infinite negative log-likelihood) is applied for
- observations beyond the support of the distribution.
- With ``method="mse"``, the fit is computed by minimizing
- the negative log-product spacing function. The same penalty is applied
- for observations beyond the support. We follow the approach of [1]_,
- which is generalized for samples with repeated observations.
- optimizer : callable, optional
- `optimizer` is a callable that accepts the following positional
- argument.
- fun : callable
- The objective function to be optimized. `fun` accepts one argument
- ``x``, candidate shape parameters of the distribution, and returns
- the objective function value given ``x``, `dist`, and the provided
- `data`.
- The job of `optimizer` is to find values of the decision variables
- that minimize `fun`.
- `optimizer` must also accept the following keyword argument.
- bounds : sequence of tuples
- The bounds on values of the decision variables; each element will
- be a tuple containing the lower and upper bound on a decision
- variable.
- If `guess` is provided, `optimizer` must also accept the following
- keyword argument.
- x0 : array_like
- The guesses for each decision variable.
- If the distribution has any shape parameters that must be integral or
- if the distribution is discrete and the location parameter is not
- fixed, `optimizer` must also accept the following keyword argument.
- integrality : array_like of bools
- For each decision variable, True if the decision variable
- must be constrained to integer values and False if the decision
- variable is continuous.
- `optimizer` must return an object, such as an instance of
- `scipy.optimize.OptimizeResult`, which holds the optimal values of
- the decision variables in an attribute ``x``. If attributes
- ``fun``, ``status``, or ``message`` are provided, they will be
- included in the result object returned by `fit`.
- Returns
- -------
- result : `~scipy.stats._result_classes.FitResult`
- An object with the following fields.
- params : namedtuple
- A namedtuple containing the maximum likelihood estimates of the
- shape parameters, location, and (if applicable) scale of the
- distribution.
- success : bool or None
- Whether the optimizer considered the optimization to terminate
- successfully or not.
- message : str or None
- Any status message provided by the optimizer.
- The object has the following methods:
- nllf(params=None, data=None)
- By default, the negative log-likelihood function at the fitted
- `params` for the given `data`. Accepts a tuple containing
- alternative shapes, location, and scale of the distribution and
- an array of alternative data.
- plot(ax=None)
- Superposes the PDF/PMF of the fitted distribution over a normalized
- histogram of the data.
- See Also
- --------
- rv_continuous, rv_discrete
- Notes
- -----
- Optimization is more likely to converge to the maximum likelihood estimate
- when the user provides tight bounds containing the maximum likelihood
- estimate. For example, when fitting a binomial distribution to data, the
- number of experiments underlying each sample may be known, in which case
- the corresponding shape parameter ``n`` can be fixed.
- References
- ----------
- .. [1] Shao, Yongzhao, and Marjorie G. Hahn. "Maximum product of spacings
- method: a unified formulation with illustration of strong
- consistency." Illinois Journal of Mathematics 43.3 (1999): 489-499.
- Examples
- --------
- Suppose we wish to fit a distribution to the following data.
- >>> import numpy as np
- >>> from scipy import stats
- >>> rng = np.random.default_rng()
- >>> dist = stats.nbinom
- >>> shapes = (5, 0.5)
- >>> data = dist.rvs(*shapes, size=1000, random_state=rng)
- Suppose we do not know how the data were generated, but we suspect that
- it follows a negative binomial distribution with parameters *n* and *p*\.
- (See `scipy.stats.nbinom`.) We believe that the parameter *n* was fewer
- than 30, and we know that the parameter *p* must lie on the interval
- [0, 1]. We record this information in a variable `bounds` and pass
- this information to `fit`.
- >>> bounds = [(0, 30), (0, 1)]
- >>> res = stats.fit(dist, data, bounds)
- `fit` searches within the user-specified `bounds` for the
- values that best match the data (in the sense of maximum likelihood
- estimation). In this case, it found shape values similar to those
- from which the data were actually generated.
- >>> res.params
- FitParams(n=5.0, p=0.5028157644634368, loc=0.0) # may vary
- We can visualize the results by superposing the probability mass function
- of the distribution (with the shapes fit to the data) over a normalized
- histogram of the data.
- >>> import matplotlib.pyplot as plt # matplotlib must be installed to plot
- >>> res.plot()
- >>> plt.show()
- Note that the estimate for *n* was exactly integral; this is because
- the domain of the `nbinom` PMF includes only integral *n*, and the `nbinom`
- object "knows" that. `nbinom` also knows that the shape *p* must be a
- value between 0 and 1. In such a case - when the domain of the distribution
- with respect to a parameter is finite - we are not required to specify
- bounds for the parameter.
- >>> bounds = {'n': (0, 30)} # omit parameter p using a `dict`
- >>> res2 = stats.fit(dist, data, bounds)
- >>> res2.params
- FitParams(n=5.0, p=0.5016492009232932, loc=0.0) # may vary
- If we wish to force the distribution to be fit with *n* fixed at 6, we can
- set both the lower and upper bounds on *n* to 6. Note, however, that the
- value of the objective function being optimized is typically worse (higher)
- in this case.
- >>> bounds = {'n': (6, 6)} # fix parameter `n`
- >>> res3 = stats.fit(dist, data, bounds)
- >>> res3.params
- FitParams(n=6.0, p=0.5486556076755706, loc=0.0) # may vary
- >>> res3.nllf() > res.nllf()
- True # may vary
- Note that the numerical results of the previous examples are typical, but
- they may vary because the default optimizer used by `fit`,
- `scipy.optimize.differential_evolution`, is stochastic. However, we can
- customize the settings used by the optimizer to ensure reproducibility -
- or even use a different optimizer entirely - using the `optimizer`
- parameter.
- >>> from scipy.optimize import differential_evolution
- >>> rng = np.random.default_rng(767585560716548)
- >>> def optimizer(fun, bounds, *, integrality):
- ... return differential_evolution(fun, bounds, strategy='best2bin',
- ... seed=rng, integrality=integrality)
- >>> bounds = [(0, 30), (0, 1)]
- >>> res4 = stats.fit(dist, data, bounds, optimizer=optimizer)
- >>> res4.params
- FitParams(n=5.0, p=0.5015183149259951, loc=0.0)
- """
- # --- Input Validation / Standardization --- #
- user_bounds = bounds
- user_guess = guess
- # distribution input validation and information collection
- if hasattr(dist, "pdf"): # can't use isinstance for types
- default_bounds = {'loc': (0, 0), 'scale': (1, 1)}
- discrete = False
- elif hasattr(dist, "pmf"):
- default_bounds = {'loc': (0, 0)}
- discrete = True
- else:
- message = ("`dist` must be an instance of `rv_continuous` "
- "or `rv_discrete.`")
- raise ValueError(message)
- try:
- param_info = dist._param_info()
- except AttributeError as e:
- message = (f"Distribution `{dist.name}` is not yet supported by "
- "`scipy.stats.fit` because shape information has "
- "not been defined.")
- raise ValueError(message) from e
- # data input validation
- data = np.asarray(data)
- if data.ndim != 1:
- message = "`data` must be exactly one-dimensional."
- raise ValueError(message)
- if not (np.issubdtype(data.dtype, np.number)
- and np.all(np.isfinite(data))):
- message = "All elements of `data` must be finite numbers."
- raise ValueError(message)
- # bounds input validation and information collection
- n_params = len(param_info)
- n_shapes = n_params - (1 if discrete else 2)
- param_list = [param.name for param in param_info]
- param_names = ", ".join(param_list)
- shape_names = ", ".join(param_list[:n_shapes])
- if user_bounds is None:
- user_bounds = {}
- if isinstance(user_bounds, dict):
- default_bounds.update(user_bounds)
- user_bounds = default_bounds
- user_bounds_array = np.empty((n_params, 2))
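- # One (lower, upper) row per distribution parameter; any parameter the user
- # did not bound falls back to its full domain.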
- for i in range(n_params):
- param_name = param_info[i].name
- user_bound = user_bounds.pop(param_name, None)
- if user_bound is None:
- user_bound = param_info[i].domain
- user_bounds_array[i] = user_bound
- if user_bounds:
- message = ("Bounds provided for the following unrecognized "
- f"parameters will be ignored: {set(user_bounds)}")
- warnings.warn(message, RuntimeWarning, stacklevel=2)
- else:
- try:
- user_bounds = np.asarray(user_bounds, dtype=float)
- if user_bounds.size == 0:
- user_bounds = np.empty((0, 2))
- except ValueError as e:
- message = ("Each element of a `bounds` sequence must be a tuple "
- "containing two elements: the lower and upper bound of "
- "a distribution parameter.")
- raise ValueError(message) from e
- if (user_bounds.ndim != 2 or user_bounds.shape[1] != 2):
- message = ("Each element of `bounds` must be a tuple specifying "
- "the lower and upper bounds of a shape parameter")
- raise ValueError(message)
- if user_bounds.shape[0] < n_shapes:
- message = (f"A `bounds` sequence must contain at least {n_shapes} "
- "elements: tuples specifying the lower and upper "
- f"bounds of all shape parameters {shape_names}.")
- raise ValueError(message)
- if user_bounds.shape[0] > n_params:
- message = ("A `bounds` sequence may not contain more than "
- f"{n_params} elements: tuples specifying the lower and "
- "upper bounds of distribution parameters "
- f"{param_names}.")
- raise ValueError(message)
- user_bounds_array = np.empty((n_params, 2))
- user_bounds_array[n_shapes:] = list(default_bounds.values())
- user_bounds_array[:len(user_bounds)] = user_bounds
- user_bounds = user_bounds_array
- validated_bounds = []
- for i in range(n_params):
- name = param_info[i].name
- user_bound = user_bounds_array[i]
- param_domain = param_info[i].domain
- integral = param_info[i].integrality
- combined = _combine_bounds(name, user_bound, param_domain, integral)
- validated_bounds.append(combined)
- bounds = np.asarray(validated_bounds)
- integrality = [param.integrality for param in param_info]
- # guess input validation
- if user_guess is None:
- guess_array = None
- elif isinstance(user_guess, dict):
- default_guess = {param.name: np.mean(bound)
- for param, bound in zip(param_info, bounds)}
- unrecognized = set(user_guess) - set(default_guess)
- if unrecognized:
- message = ("Guesses provided for the following unrecognized "
- f"parameters will be ignored: {unrecognized}")
- warnings.warn(message, RuntimeWarning, stacklevel=2)
- default_guess.update(user_guess)
- message = ("Each element of `guess` must be a scalar "
- "guess for a distribution parameter.")
- try:
- guess_array = np.asarray([default_guess[param.name]
- for param in param_info], dtype=float)
- except ValueError as e:
- raise ValueError(message) from e
- else:
- message = ("Each element of `guess` must be a scalar "
- "guess for a distribution parameter.")
- try:
- user_guess = np.asarray(user_guess, dtype=float)
- except ValueError as e:
- raise ValueError(message) from e
- if user_guess.ndim != 1:
- raise ValueError(message)
- if user_guess.shape[0] < n_shapes:
- message = (f"A `guess` sequence must contain at least {n_shapes} "
- "elements: scalar guesses for the distribution shape "
- f"parameters {shape_names}.")
- raise ValueError(message)
- if user_guess.shape[0] > n_params:
- message = ("A `guess` sequence may not contain more than "
- f"{n_params} elements: scalar guesses for the "
- f"distribution parameters {param_names}.")
- raise ValueError(message)
- guess_array = np.mean(bounds, axis=1)
- guess_array[:len(user_guess)] = user_guess
- if guess_array is not None:
- guess_rounded = guess_array.copy()
- guess_rounded[integrality] = np.round(guess_rounded[integrality])
- rounded = np.where(guess_rounded != guess_array)[0]
- for i in rounded:
- message = (f"Guess for parameter `{param_info[i].name}` "
- f"rounded from {guess_array[i]} to {guess_rounded[i]}.")
- warnings.warn(message, RuntimeWarning, stacklevel=2)
- guess_clipped = np.clip(guess_rounded, bounds[:, 0], bounds[:, 1])
- clipped = np.where(guess_clipped != guess_rounded)[0]
- for i in clipped:
- message = (f"Guess for parameter `{param_info[i].name}` "
- f"clipped from {guess_rounded[i]} to "
- f"{guess_clipped[i]}.")
- warnings.warn(message, RuntimeWarning, stacklevel=2)
- guess = guess_clipped
- else:
- guess = None
- # --- Fitting --- #
- def nllf(free_params, data=data): # bind data NOW
- with np.errstate(invalid='ignore', divide='ignore'):
- return dist._penalized_nnlf(free_params, data)
- def nlpsf(free_params, data=data): # bind data NOW
- with np.errstate(invalid='ignore', divide='ignore'):
- return dist._penalized_nlpsf(free_params, data)
- methods = {'mle': nllf, 'mse': nlpsf}
- objective = methods[method.lower()]
- with np.errstate(invalid='ignore', divide='ignore'):
- kwds = {}
- if bounds is not None:
- kwds['bounds'] = bounds
- if np.any(integrality):
- kwds['integrality'] = integrality
- if guess is not None:
- kwds['x0'] = guess
- res = optimizer(objective, **kwds)
- return FitResult(dist, data, discrete, res)
- GoodnessOfFitResult = namedtuple('GoodnessOfFitResult',
- ('fit_result', 'statistic', 'pvalue',
- 'null_distribution'))
- def goodness_of_fit(dist, data, *, known_params=None, fit_params=None,
- guessed_params=None, statistic='ad', n_mc_samples=9999,
- random_state=None):
- r"""
- Perform a goodness of fit test comparing data to a distribution family.
- Given a distribution family and data, perform a test of the null hypothesis
- that the data were drawn from a distribution in that family. Any known
- parameters of the distribution may be specified. Remaining parameters of
- the distribution will be fit to the data, and the p-value of the test
- is computed accordingly. Several statistics for comparing the distribution
- to data are available.
- Parameters
- ----------
- dist : `scipy.stats.rv_continuous`
- The object representing the distribution family under the null
- hypothesis.
- data : 1D array_like
- Finite, uncensored data to be tested.
- known_params : dict, optional
- A dictionary containing name-value pairs of known distribution
- parameters. Monte Carlo samples are randomly drawn from the
- null-hypothesized distribution with these values of the parameters.
- Before the statistic is evaluated for each Monte Carlo sample, only
- remaining unknown parameters of the null-hypothesized distribution
- family are fit to the samples; the known parameters are held fixed.
- If all parameters of the distribution family are known, then the step
- of fitting the distribution family to each sample is omitted.
- fit_params : dict, optional
- A dictionary containing name-value pairs of distribution parameters
- that have already been fit to the data, e.g. using `scipy.stats.fit`
- or the ``fit`` method of `dist`. Monte Carlo samples are drawn from the
- null-hypothesized distribution with these specified values of the
- parameter. On those Monte Carlo samples, however, these and all other
- unknown parameters of the null-hypothesized distribution family are
- fit before the statistic is evaluated.
- guessed_params : dict, optional
- A dictionary containing name-value pairs of distribution parameters
- which have been guessed. These parameters are always considered as
- free parameters and are fit both to the provided `data` and
- to the Monte Carlo samples drawn from the null-hypothesized
- distribution. The purpose of these `guessed_params` is to be used as
- initial values for the numerical fitting procedure.
- statistic : {"ad", "ks", "cvm"}, optional
- The statistic used to compare data to a distribution after fitting
- unknown parameters of the distribution family to the data. The
- Anderson-Darling ("ad"), Kolmogorov-Smirnov ("ks"), and
- Cramer-von Mises ("cvm") statistics are available [1]_.
- n_mc_samples : int, default: 9999
- The number of Monte Carlo samples drawn from the null hypothesized
- distribution to form the null distribution of the statistic. The
- sample size of each is the same as the given `data`.
- random_state : {None, int, `numpy.random.Generator`,
- `numpy.random.RandomState`}, optional
- Pseudorandom number generator state used to generate the Monte Carlo
- samples.
- If `random_state` is ``None`` (default), the
- `numpy.random.RandomState` singleton is used.
- If `random_state` is an int, a new ``RandomState`` instance is used,
- seeded with `random_state`.
- If `random_state` is already a ``Generator`` or ``RandomState``
- instance, then the provided instance is used.
- Returns
- -------
- res : GoodnessOfFitResult
- An object with the following attributes.
- fit_result : `~scipy.stats._result_classes.FitResult`
- An object representing the fit of the provided `dist` to `data`.
- This object includes the values of distribution family parameters
- that fully define the null-hypothesized distribution, that is,
- the distribution from which Monte Carlo samples are drawn.
- statistic : float
- The value of the statistic comparing provided `data` to the
- null-hypothesized distribution.
- pvalue : float
- The proportion of elements in the null distribution with
- statistic values at least as extreme as the statistic value of the
- provided `data`.
- null_distribution : ndarray
- The value of the statistic for each Monte Carlo sample
- drawn from the null-hypothesized distribution.
- Notes
- -----
- This is a generalized Monte Carlo goodness-of-fit procedure, special cases
- of which correspond with various Anderson-Darling tests, Lilliefors' test,
- etc. The test is described in [2]_, [3]_, and [4]_ as a parametric
- bootstrap test. This is a Monte Carlo test in which parameters that
- specify the distribution from which samples are drawn have been estimated
- from the data. We describe the test using "Monte Carlo" rather than
- "parametric bootstrap" throughout to avoid confusion with the more familiar
- nonparametric bootstrap, and describe how the test is performed below.
- *Traditional goodness of fit tests*
- Traditionally, critical values corresponding with a fixed set of
- significance levels are pre-calculated using Monte Carlo methods. Users
- perform the test by calculating the value of the test statistic only for
- their observed `data` and comparing this value to tabulated critical
- values. This practice is not very flexible, as tables are not available for
- all distributions and combinations of known and unknown parameter values.
- Also, results can be inaccurate when critical values are interpolated from
- limited tabulated data to correspond with the user's sample size and
- fitted parameter values. To overcome these shortcomings, this function
- allows the user to perform the Monte Carlo trials adapted to their
- particular data.
- *Algorithmic overview*
- In brief, this routine executes the following steps:
- 1. Fit unknown parameters to the given `data`, thereby forming the
- "null-hypothesized" distribution, and compute the statistic of
- this pair of data and distribution.
- 2. Draw random samples from this null-hypothesized distribution.
- 3. Fit the unknown parameters to each random sample.
- 4. Calculate the statistic between each sample and the distribution that
- has been fit to the sample.
- 5. Compare the value of the statistic corresponding with `data` from (1)
- against the values of the statistic corresponding with the random
- samples from (4). The p-value is the proportion of samples with a
- statistic value greater than or equal to the statistic of the observed
- data.
- In more detail, the steps are as follows.
- First, any unknown parameters of the distribution family specified by
- `dist` are fit to the provided `data` using maximum likelihood estimation.
- (One exception is the normal distribution with unknown location and scale:
- we use the bias-corrected standard deviation ``np.std(data, ddof=1)`` for
- the scale as recommended in [1]_.)
- These values of the parameters specify a particular member of the
- distribution family referred to as the "null-hypothesized distribution",
- that is, the distribution from which the data were sampled under the null
- hypothesis. The `statistic`, which compares data to a distribution, is
- computed between `data` and the null-hypothesized distribution.
- Next, many (specifically `n_mc_samples`) new samples, each containing the
- same number of observations as `data`, are drawn from the
- null-hypothesized distribution. All unknown parameters of the distribution
- family `dist` are fit to *each resample*, and the `statistic` is computed
- between each sample and its corresponding fitted distribution. These
- values of the statistic form the Monte Carlo null distribution (not to be
- confused with the "null-hypothesized distribution" above).
- The p-value of the test is the proportion of statistic values in the Monte
- Carlo null distribution that are at least as extreme as the statistic value
- of the provided `data`. More precisely, the p-value is given by
- .. math::
- p = \frac{b + 1}
- {m + 1}
- where :math:`b` is the number of statistic values in the Monte Carlo null
- distribution that are greater than or equal to the statistic value
- calculated for `data`, and :math:`m` is the number of elements in the
- Monte Carlo null distribution (`n_mc_samples`). The addition of :math:`1`
- to the numerator and denominator can be thought of as including the
- value of the statistic corresponding with `data` in the null distribution,
- but a more formal explanation is given in [5]_.
- *Limitations*
- The test can be very slow for some distribution families because unknown
- parameters of the distribution family must be fit to each of the Monte
- Carlo samples, and for most distributions in SciPy, distribution fitting
- is performed via numerical optimization.
- *Anti-Pattern*
- For this reason, it may be tempting
- to treat parameters of the distribution pre-fit to `data` (by the user)
- as though they were `known_params`, as specification of all parameters of
- the distribution precludes the need to fit the distribution to each Monte
- Carlo sample. (This is essentially how the original Kolmogorov-Smirnov
- test is performed.) Although such a test can provide evidence against the
- null hypothesis, the test is conservative in the sense that small p-values
- will tend to (greatly) *overestimate* the probability of making a type I
- error (that is, rejecting the null hypothesis although it is true), and the
- power of the test is low (that is, it is less likely to reject the null
- hypothesis even when the null hypothesis is false).
- This is because the Monte Carlo samples are less likely to agree with the
- null-hypothesized distribution as well as `data`. This tends to increase
- the values of the statistic recorded in the null distribution, so that a
- larger number of them exceed the value of statistic for `data`, thereby
- inflating the p-value.
- References
- ----------
- .. [1] M. A. Stephens (1974). "EDF Statistics for Goodness of Fit and
- Some Comparisons." Journal of the American Statistical Association,
- Vol. 69, pp. 730-737.
- .. [2] W. Stute, W. G. Manteiga, and M. P. Quindimil (1993).
- "Bootstrap based goodness-of-fit-tests." Metrika 40.1: 243-256.
- .. [3] C. Genest, & B Rémillard. (2008). "Validity of the parametric
- bootstrap for goodness-of-fit testing in semiparametric models."
- Annales de l'IHP Probabilités et statistiques. Vol. 44. No. 6.
- .. [4] I. Kojadinovic and J. Yan (2012). "Goodness-of-fit testing based on
- a weighted bootstrap: A fast large-sample alternative to the
- parametric bootstrap." Canadian Journal of Statistics 40.3: 480-500.
- .. [5] B. Phipson and G. K. Smyth (2010). "Permutation P-values Should
- Never Be Zero: Calculating Exact P-values When Permutations Are
- Randomly Drawn." Statistical Applications in Genetics and Molecular
- Biology 9.1.
- .. [6] H. W. Lilliefors (1967). "On the Kolmogorov-Smirnov test for
- normality with mean and variance unknown." Journal of the American
- statistical Association 62.318: 399-402.
- Examples
- --------
- A well-known test of the null hypothesis that data were drawn from a
- given distribution is the Kolmogorov-Smirnov (KS) test, available in SciPy
- as `scipy.stats.ks_1samp`. Suppose we wish to test whether the following
- data:
- >>> import numpy as np
- >>> from scipy import stats
- >>> rng = np.random.default_rng()
- >>> x = stats.uniform.rvs(size=75, random_state=rng)
- were sampled from a normal distribution. To perform a KS test, the
- empirical distribution function of the observed data will be compared
- against the (theoretical) cumulative distribution function of a normal
- distribution. Of course, to do this, the normal distribution under the null
- hypothesis must be fully specified. This is commonly done by first fitting
- the ``loc`` and ``scale`` parameters of the distribution to the observed
- data, then performing the test.
- >>> loc, scale = np.mean(x), np.std(x, ddof=1)
- >>> cdf = stats.norm(loc, scale).cdf
- >>> stats.ks_1samp(x, cdf)
- KstestResult(statistic=0.1119257570456813, pvalue=0.2827756409939257)
- An advantage of the KS-test is that the p-value - the probability of
- obtaining a value of the test statistic under the null hypothesis as
- extreme as the value obtained from the observed data - can be calculated
- exactly and efficiently. `goodness_of_fit` can only approximate these
- results.
- >>> known_params = {'loc': loc, 'scale': scale}
- >>> res = stats.goodness_of_fit(stats.norm, x, known_params=known_params,
- ... statistic='ks', random_state=rng)
- >>> res.statistic, res.pvalue
- (0.1119257570456813, 0.2788)
- The statistic matches exactly, but the p-value is estimated by forming
- a "Monte Carlo null distribution", that is, by explicitly drawing random
- samples from `scipy.stats.norm` with the provided parameters and
- calculating the statistic for each. The fraction of these statistic values
- at least as extreme as ``res.statistic`` approximates the exact p-value
- calculated by `scipy.stats.ks_1samp`.
- However, in many cases, we would prefer to test only that the data were
- sampled from one of *any* member of the normal distribution family, not
- specifically from the normal distribution with the location and scale
- fitted to the observed sample. In this case, Lilliefors [6]_ argued that
- the KS test is far too conservative (that is, the p-value overstates
- the actual probability of rejecting a true null hypothesis) and thus lacks
- power - the ability to reject the null hypothesis when the null hypothesis
- is actually false.
- Indeed, our p-value above is approximately 0.28, which is far too large
- to reject the null hypothesis at any common significance level.
- Consider why this might be. Note that in the KS test above, the statistic
- always compares data against the CDF of a normal distribution fitted to the
- *observed data*. This tends to reduce the value of the statistic for the
- observed data, but it is "unfair" when computing the statistic for other
- samples, such as those we randomly draw to form the Monte Carlo null
- distribution. It is easy to correct for this: whenever we compute the KS
- statistic of a sample, we use the CDF of a normal distribution fitted
- to *that sample*. The null distribution in this case has not been
- calculated exactly and is typically approximated using Monte Carlo methods
- as described above. This is where `goodness_of_fit` excels.
- >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ks',
- ... random_state=rng)
- >>> res.statistic, res.pvalue
- (0.1119257570456813, 0.0196)
- Indeed, this p-value is much smaller, and small enough to (correctly)
- reject the null hypothesis at common significance levels, including 5% and
- 2.5%.
- However, the KS statistic is not very sensitive to all deviations from
- normality. The original advantage of the KS statistic was the ability
- to compute the null distribution theoretically, but a more sensitive
- statistic - resulting in a higher test power - can be used now that we can
- approximate the null distribution
- computationally. The Anderson-Darling statistic [1]_ tends to be more
- sensitive, and critical values of this statistic have been tabulated
- for various significance levels and sample sizes using Monte Carlo methods.
- >>> res = stats.anderson(x, 'norm')
- >>> print(res.statistic)
- 1.2139573337497467
- >>> print(res.critical_values)
- [0.549 0.625 0.75 0.875 1.041]
- >>> print(res.significance_level)
- [15. 10. 5. 2.5 1. ]
- Here, the observed value of the statistic exceeds the critical value
- corresponding with a 1% significance level. This tells us that the p-value
- of the observed data is less than 1%, but what is it? We could interpolate
- from these (already-interpolated) values, but `goodness_of_fit` can
- estimate it directly.
- >>> res = stats.goodness_of_fit(stats.norm, x, statistic='ad',
- ... random_state=rng)
- >>> res.statistic, res.pvalue
- (1.2139573337497467, 0.0034)
- A further advantage is that use of `goodness_of_fit` is not limited to
- a particular set of distributions or conditions on which parameters
- are known versus which must be estimated from data. Instead,
- `goodness_of_fit` can estimate p-values relatively quickly for any
- distribution with a sufficiently fast and reliable ``fit`` method. For
- instance, here we perform a goodness of fit test using the Cramer-von Mises
- statistic against the Rayleigh distribution with known location and unknown
- scale.
- >>> rng = np.random.default_rng()
- >>> x = stats.chi(df=2.2, loc=0, scale=2).rvs(size=1000, random_state=rng)
- >>> res = stats.goodness_of_fit(stats.rayleigh, x, statistic='cvm',
- ... known_params={'loc': 0}, random_state=rng)
- This executes fairly quickly, but to check the reliability of the ``fit``
- method, we should inspect the fit result.
- >>> res.fit_result # location is as specified, and scale is reasonable
- params: FitParams(loc=0.0, scale=2.1026719844231243)
- success: True
- message: 'The fit was performed successfully.'
- >>> import matplotlib.pyplot as plt # matplotlib must be installed to plot
- >>> res.fit_result.plot()
- >>> plt.show()
- If the distribution is not fit to the observed data as well as possible,
- the test may not control the type I error rate, that is, the chance of
- rejecting the null hypothesis even when it is true.
- We should also look for extreme outliers in the null distribution that
- may be caused by unreliable fitting. These do not necessarily invalidate
- the result, but they tend to reduce the test's power.
- >>> _, ax = plt.subplots()
- >>> ax.hist(np.log10(res.null_distribution))
- >>> ax.set_xlabel("log10 of CVM statistic under the null hypothesis")
- >>> ax.set_ylabel("Frequency")
- >>> ax.set_title("Histogram of the Monte Carlo null distribution")
- >>> plt.show()
- This plot seems reassuring.
- If the ``fit`` method is working reliably, and if the distribution of the test
- statistic is not particularly sensitive to the values of the fitted
- parameters, then the p-value provided by `goodness_of_fit` is expected to
- be a good approximation.
- >>> res.statistic, res.pvalue
- (0.2231991510248692, 0.0525)
- """
- args = _gof_iv(dist, data, known_params, fit_params, guessed_params,
- statistic, n_mc_samples, random_state)
- (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
- guessed_rfd_params, statistic, n_mc_samples_int, random_state) = args
- # Fit null hypothesis distribution to data
- nhd_fit_fun = _get_fit_fun(dist, data, guessed_nhd_params,
- fixed_nhd_params)
- nhd_vals = nhd_fit_fun(data)
- nhd_dist = dist(*nhd_vals)
- def rvs(size):
- return nhd_dist.rvs(size=size, random_state=random_state)
- # Define statistic
- fit_fun = _get_fit_fun(dist, data, guessed_rfd_params, fixed_rfd_params)
- compare_fun = _compare_dict[statistic]
- def statistic_fun(data, axis=-1):
- # Make things simple by always working along the last axis.
- data = np.moveaxis(data, axis, -1)
- rfd_vals = fit_fun(data)
- rfd_dist = dist(*rfd_vals)
- return compare_fun(rfd_dist, data)
- res = stats.monte_carlo_test(data, rvs, statistic_fun, vectorized=True,
- n_resamples=n_mc_samples, axis=-1,
- alternative='greater')
- opt_res = optimize.OptimizeResult()
- opt_res.success = True
- opt_res.message = "The fit was performed successfully."
- opt_res.x = nhd_vals
- # Only continuous distributions for now, hence discrete=False
- # There's no fundamental limitation; it's just that we're not using
- # stats.fit, discrete distributions don't have a `fit` method, and
- # we haven't written any vectorized fit functions for a discrete
- # distribution yet.
- return GoodnessOfFitResult(FitResult(dist, data, False, opt_res),
- res.statistic, res.pvalue,
- res.null_distribution)
- def _get_fit_fun(dist, data, guessed_params, fixed_params):
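- # Build a function that fits `dist` to samples arranged along the last axis,
- # holding `fixed_params` fixed and using `guessed_params` as starting values.
- # A vectorized routine from `_fit_funs` is used when one is registered for
- # `dist`; if every parameter is fixed, fitting is skipped entirely.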
- shape_names = [] if dist.shapes is None else dist.shapes.split(", ")
- param_names = shape_names + ['loc', 'scale']
- fparam_names = ['f'+name for name in param_names]
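- # `rv_continuous.fit` holds a parameter fixed when it is passed as the keyword
- # 'f' + parameter name (e.g. ``floc``), hence the prefixed names.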
- all_fixed = not set(fparam_names).difference(fixed_params)
- guessed_shapes = [guessed_params.pop(x, None)
- for x in shape_names if x in guessed_params]
- # Define statistic, including fitting distribution to data
- if dist in _fit_funs:
- def fit_fun(data):
- params = _fit_funs[dist](data, **fixed_params)
- params = np.asarray(np.broadcast_arrays(*params))
- if params.ndim > 1:
- params = params[..., np.newaxis]
- return params
- elif all_fixed:
- def fit_fun(data):
- return [fixed_params[name] for name in fparam_names]
- else:
- def fit_fun_1d(data):
- return dist.fit(data, *guessed_shapes, **guessed_params,
- **fixed_params)
- def fit_fun(data):
- params = np.apply_along_axis(fit_fun_1d, axis=-1, arr=data)
- if params.ndim > 1:
- params = params.T[..., np.newaxis]
- return params
- return fit_fun
- # Vectorized fitting functions. These accept ND `data`, in which each row
- # (slice along the last axis) is a sample to fit, plus scalar fixed parameters.
- # They return a tuple of shape parameter arrays, each of shape data.shape[:-1].
- def _fit_norm(data, floc=None, fscale=None):
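- # With both parameters free, use the sample mean and the bias-corrected
- # standard deviation (ddof=1), as noted in the `goodness_of_fit` docstring;
- # with `floc` fixed, the scale is the root-mean-square deviation about the
- # fixed location.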
- loc = floc
- scale = fscale
- if loc is None and scale is None:
- loc = np.mean(data, axis=-1)
- scale = np.std(data, ddof=1, axis=-1)
- elif loc is None:
- loc = np.mean(data, axis=-1)
- elif scale is None:
- scale = np.sqrt(((data - loc)**2).mean(axis=-1))
- return loc, scale
- _fit_funs = {stats.norm: _fit_norm} # type: ignore[attr-defined]
- # Vectorized goodness of fit statistic functions. These accept a frozen
- # distribution object and `data` in which each row (slice along last axis) is
- # a sample.
- def _anderson_darling(dist, data):
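- # A^2 = -n - (1/n) * sum_{i=1..n} (2i - 1) * [log F(x_(i)) + log(1 - F(x_(n+1-i)))],
- # computed from the fitted distribution's log CDF and log survival function.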
- x = np.sort(data, axis=-1)
- n = data.shape[-1]
- i = np.arange(1, n+1)
- Si = (2*i - 1)/n * (dist.logcdf(x) + dist.logsf(x[..., ::-1]))
- S = np.sum(Si, axis=-1)
- return -n - S
- def _compute_dplus(cdfvals): # adapted from _stats_py before gh-17062
- n = cdfvals.shape[-1]
- return (np.arange(1.0, n + 1) / n - cdfvals).max(axis=-1)
- def _compute_dminus(cdfvals):
- n = cdfvals.shape[-1]
- return (cdfvals - np.arange(0.0, n)/n).max(axis=-1)
- def _kolmogorov_smirnov(dist, data):
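- # D_n = max(D+, D-), where D+ = max_i(i/n - F(x_(i))) and
- # D- = max_i(F(x_(i)) - (i-1)/n).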
- x = np.sort(data, axis=-1)
- cdfvals = dist.cdf(x)
- Dplus = _compute_dplus(cdfvals) # always works along last axis
- Dminus = _compute_dminus(cdfvals)
- return np.maximum(Dplus, Dminus)
- def _cramer_von_mises(dist, data):
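- # W^2 = 1/(12n) + sum_{i=1..n} ((2i - 1)/(2n) - F(x_(i)))^2.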
- x = np.sort(data, axis=-1)
- n = data.shape[-1]
- cdfvals = dist.cdf(x)
- u = (2*np.arange(1, n+1) - 1)/(2*n)
- w = 1 / (12*n) + np.sum((u - cdfvals)**2, axis=-1)
- return w
- _compare_dict = {"ad": _anderson_darling, "ks": _kolmogorov_smirnov,
- "cvm": _cramer_von_mises}
- def _gof_iv(dist, data, known_params, fit_params, guessed_params, statistic,
- n_mc_samples, random_state):
- if not isinstance(dist, stats.rv_continuous):
- message = ("`dist` must be a (non-frozen) instance of "
- "`stats.rv_continuous`.")
- raise TypeError(message)
- data = np.asarray(data, dtype=float)
- if not data.ndim == 1:
- message = "`data` must be a one-dimensional array of numbers."
- raise ValueError(message)
- # Leave validation of these key/value pairs to the `fit` method,
- # but collect these into dictionaries that will be used
- known_params = known_params or dict()
- fit_params = fit_params or dict()
- guessed_params = guessed_params or dict()
- known_params_f = {("f"+key): val for key, val in known_params.items()}
- fit_params_f = {("f"+key): val for key, val in fit_params.items()}
- # These are the values of the parameters of the null distribution family
- # with which resamples are drawn
- fixed_nhd_params = known_params_f.copy()
- fixed_nhd_params.update(fit_params_f)
- # These are fixed when fitting the distribution family to resamples
- fixed_rfd_params = known_params_f.copy()
- # These are used as guesses when fitting the distribution family to
- # the original data
- guessed_nhd_params = guessed_params.copy()
- # These are used as guesses when fitting the distribution family to
- # resamples
- guessed_rfd_params = fit_params.copy()
- guessed_rfd_params.update(guessed_params)
- statistics = {'ad', 'ks', 'cvm'}
- if statistic.lower() not in statistics:
- message = f"`statistic` must be one of {statistics}."
- raise ValueError(message)
- n_mc_samples_int = int(n_mc_samples)
- if n_mc_samples_int != n_mc_samples:
- message = "`n_mc_samples` must be an integer."
- raise TypeError(message)
- random_state = check_random_state(random_state)
- return (dist, data, fixed_nhd_params, fixed_rfd_params, guessed_nhd_params,
- guessed_rfd_params, statistic, n_mc_samples_int, random_state)