- import numpy as np
- from scipy.special import ndtri
- from scipy.optimize import brentq
- from ._discrete_distns import nchypergeom_fisher
- from ._common import ConfidenceInterval
- def _sample_odds_ratio(table):
- """
- Given a table [[a, b], [c, d]], compute a*d/(b*c).
- Return nan if the numerator and denominator are 0.
- Return inf if just the denominator is 0.
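- Illustrative example (assumes a 2x2 NumPy integer array; this helper is
- private, so the snippet is for exposition only):
- >>> import numpy as np
- >>> round(float(_sample_odds_ratio(np.array([[2, 3], [4, 5]]))), 4)
- 0.8333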
- """
- # table must be a 2x2 numpy array.
- if table[1, 0] > 0 and table[0, 1] > 0:
- oddsratio = table[0, 0] * table[1, 1] / (table[1, 0] * table[0, 1])
- elif table[0, 0] == 0 or table[1, 1] == 0:
- oddsratio = np.nan
- else:
- oddsratio = np.inf
- return oddsratio
- def _solve(func):
- """
- Solve func(nc) = 0. func must be an increasing function.
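- For example (illustrative only), the root of the increasing function
- ``np.log(nc) - 3`` is ``exp(3)``:
- >>> import numpy as np
- >>> bool(np.isclose(_solve(lambda nc: np.log(nc) - 3), np.exp(3)))
- True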
- """
- # We could just as well call the variable `x` instead of `nc`, but we
- # always call this function with functions for which nc (the noncentrality
- # parameter) is the variable for which we are solving.
- nc = 1.0
- value = func(nc)
- if value == 0:
- return nc
- # Multiplicative factor by which to increase or decrease nc when
- # searching for a bracketing interval.
- factor = 2.0
- # Find a bracketing interval.
- if value > 0:
- nc /= factor
- while func(nc) > 0:
- nc /= factor
- lo = nc
- hi = factor*nc
- else:
- nc *= factor
- while func(nc) < 0:
- nc *= factor
- lo = nc/factor
- hi = nc
- # lo and hi bracket the solution for nc.
- nc = brentq(func, lo, hi, xtol=1e-13)
- return nc
- def _nc_hypergeom_mean_inverse(x, M, n, N):
- """
- For the given noncentral hypergeometric parameters x, M, n, and N
- (table[0, 0], total, row 0 sum and column 0 sum, respectively, of a 2x2
- contingency table), find the noncentrality parameter of Fisher's
- noncentral hypergeometric distribution whose mean is x.
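- Illustrative round-trip check (the returned nc reproduces the target mean):
- >>> import numpy as np
- >>> from scipy.stats import nchypergeom_fisher
- >>> nc = _nc_hypergeom_mean_inverse(5, 20, 10, 10)
- >>> bool(np.isclose(nchypergeom_fisher.mean(20, 10, 10, nc), 5))
- True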
- """
- nc = _solve(lambda nc: nchypergeom_fisher.mean(M, n, N, nc) - x)
- return nc
- def _hypergeom_params_from_table(table):
- # The notation M, n and N is consistent with stats.hypergeom and
- # stats.nchypergeom_fisher.
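- # For a table [[a, b], [c, d]]: x = a, M = a + b + c + d, n = a + b and
- # N = a + c.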
- x = table[0, 0]
- M = table.sum()
- n = table[0].sum()
- N = table[:, 0].sum()
- return x, M, n, N
- def _ci_upper(table, alpha):
- """
- Compute the upper end of the confidence interval.
- """
- if _sample_odds_ratio(table) == np.inf:
- return np.inf
- x, M, n, N = _hypergeom_params_from_table(table)
- # nchypergeom_fisher.cdf is a decreasing function of nc, so we negate
- # it in the lambda expression.
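- # The solution nc satisfies cdf(x, M, n, N, nc) = alpha, i.e.
- # P(X <= x) = alpha for X ~ nchypergeom_fisher(M, n, N, nc).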
- nc = _solve(lambda nc: -nchypergeom_fisher.cdf(x, M, n, N, nc) + alpha)
- return nc
- def _ci_lower(table, alpha):
- """
- Compute the lower end of the confidence interval.
- """
- if _sample_odds_ratio(table) == 0:
- return 0
- x, M, n, N = _hypergeom_params_from_table(table)
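- # The solution nc satisfies sf(x - 1, M, n, N, nc) = alpha, i.e.
- # P(X >= x) = alpha for X ~ nchypergeom_fisher(M, n, N, nc).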
- nc = _solve(lambda nc: nchypergeom_fisher.sf(x - 1, M, n, N, nc) - alpha)
- return nc
- def _conditional_oddsratio(table):
- """
- Conditional MLE of the odds ratio for the 2x2 contingency table.
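- Illustrative example, using the table from the `odds_ratio` docstring:
- >>> import numpy as np
- >>> round(float(_conditional_oddsratio(np.array([[7, 15], [58, 472]]))), 4)
- 3.7837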
- """
- x, M, n, N = _hypergeom_params_from_table(table)
- # Get the bounds of the support. The support of the noncentral
- # hypergeometric distribution with parameters M, n, and N is the same
- # for all values of the noncentrality parameter, so we can use 1 here.
- lo, hi = nchypergeom_fisher.support(M, n, N, 1)
- # Check if x is at one of the extremes of the support. If so, we know
- # the odds ratio is either 0 or inf.
- if x == lo:
- # x is at the low end of the support.
- return 0
- if x == hi:
- # x is at the high end of the support.
- return np.inf
- nc = _nc_hypergeom_mean_inverse(x, M, n, N)
- return nc
- def _conditional_oddsratio_ci(table, confidence_level=0.95,
- alternative='two-sided'):
- """
- Conditional exact confidence interval for the odds ratio.
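- Illustrative example, using the table from the `odds_ratio` docstring:
- >>> import numpy as np
- >>> lo, hi = _conditional_oddsratio_ci(np.array([[7, 15], [58, 472]]))
- >>> round(float(lo), 4), round(float(hi), 4)
- (1.2515, 10.3635)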
- """
- if alternative == 'two-sided':
- alpha = 0.5*(1 - confidence_level)
- lower = _ci_lower(table, alpha)
- upper = _ci_upper(table, alpha)
- elif alternative == 'less':
- lower = 0.0
- upper = _ci_upper(table, 1 - confidence_level)
- else:
- # alternative == 'greater'
- lower = _ci_lower(table, 1 - confidence_level)
- upper = np.inf
- return lower, upper
- def _sample_odds_ratio_ci(table, confidence_level=0.95,
- alternative='two-sided'):
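- # Normal-approximation ("Wald") confidence interval for the sample odds
- # ratio: log(OR) is treated as approximately normal with standard error
- # sqrt(1/a + 1/b + 1/c + 1/d); see the Notes in OddsRatioResult.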
- oddsratio = _sample_odds_ratio(table)
- log_or = np.log(oddsratio)
- se = np.sqrt((1/table).sum())
- if alternative == 'less':
- z = ndtri(confidence_level)
- loglow = -np.inf
- loghigh = log_or + z*se
- elif alternative == 'greater':
- z = ndtri(confidence_level)
- loglow = log_or - z*se
- loghigh = np.inf
- else:
- # alternative is 'two-sided'
- z = ndtri(0.5*confidence_level + 0.5)
- loglow = log_or - z*se
- loghigh = log_or + z*se
- return np.exp(loglow), np.exp(loghigh)
- class OddsRatioResult:
- """
- Result of `scipy.stats.contingency.odds_ratio`. See the
- docstring for `odds_ratio` for more details.
- Attributes
- ----------
- statistic : float
- The computed odds ratio.
- * If `kind` is ``'sample'``, this is the sample (or unconditional)
- estimate, given by
- ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
- * If `kind` is ``'conditional'``, this is the conditional
- maximum likelihood estimate for the odds ratio. It is
- the noncentrality parameter of Fisher's noncentral
- hypergeometric distribution with the same hypergeometric
- parameters as `table` and whose mean is ``table[0, 0]``.
- Methods
- -------
- confidence_interval :
- Confidence interval for the odds ratio.
- """
- def __init__(self, _table, _kind, statistic):
- # for now, no need to make _table and _kind public, since this sort of
- # information is returned in very few `scipy.stats` results
- self._table = _table
- self._kind = _kind
- self.statistic = statistic
- def __repr__(self):
- return f"OddsRatioResult(statistic={self.statistic})"
- def confidence_interval(self, confidence_level=0.95,
- alternative='two-sided'):
- """
- Confidence interval for the odds ratio.
- Parameters
- ----------
- confidence_level : float
- Desired confidence level for the confidence interval.
- The value must be given as a fraction between 0 and 1.
- Default is 0.95 (meaning 95%).
- alternative : {'two-sided', 'less', 'greater'}, optional
- The alternative hypothesis of the hypothesis test to which the
- confidence interval corresponds. That is, suppose the null
- hypothesis is that the true odds ratio equals ``OR`` and the
- confidence interval is ``(low, high)``. Then the following options
- for `alternative` are available (default is 'two-sided'):
- * 'two-sided': the true odds ratio is not equal to ``OR``. There
- is evidence against the null hypothesis at the chosen
- `confidence_level` if ``high < OR`` or ``low > OR``.
- * 'less': the true odds ratio is less than ``OR``. The ``low`` end
- of the confidence interval is 0, and there is evidence against
- the null hypothesis at the chosen `confidence_level` if
- ``high < OR``.
- * 'greater': the true odds ratio is greater than ``OR``. The
- ``high`` end of the confidence interval is ``np.inf``, and there
- is evidence against the null hypothesis at the chosen
- `confidence_level` if ``low > OR``.
- Returns
- -------
- ci : ``ConfidenceInterval`` instance
- The confidence interval, represented as an object with
- attributes ``low`` and ``high``.
- Notes
- -----
- When `kind` is ``'conditional'``, the limits of the confidence
- interval are the conditional "exact confidence limits" as described
- by Fisher [1]_. The conditional odds ratio and confidence interval are
- also discussed in Section 4.1.2 of the text by Sahai and Khurshid [2]_.
- When `kind` is ``'sample'``, the confidence interval is computed
- under the assumption that the logarithm of the odds ratio is normally
- distributed with standard error given by::
- se = sqrt(1/a + 1/b + 1/c + 1/d)
- where ``a``, ``b``, ``c`` and ``d`` are the elements of the
- contingency table. (See, for example, [2]_, section 3.1.3.2,
- or [3]_, section 2.3.3).
- References
- ----------
- .. [1] R. A. Fisher (1935), The logic of inductive inference,
- Journal of the Royal Statistical Society, Vol. 98, No. 1,
- pp. 39-82.
- .. [2] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
- Methods, Techniques, and Applications, CRC Press LLC, Boca
- Raton, Florida.
- .. [3] Alan Agresti, An Introduction to Categorical Data Analysis
- (second edition), Wiley, Hoboken, NJ, USA (2007).
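- Examples
- --------
- An illustrative check with the table from the `odds_ratio` docstring,
- using the sample (unconditional) odds ratio; by construction, the
- two-sided interval contains the point estimate:
- >>> from scipy.stats.contingency import odds_ratio
- >>> res = odds_ratio([[7, 15], [58, 472]], kind='sample')
- >>> ci = res.confidence_interval(confidence_level=0.95)
- >>> bool(ci.low < res.statistic < ci.high)
- True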
- """
- if alternative not in ['two-sided', 'less', 'greater']:
- raise ValueError("`alternative` must be 'two-sided', 'less' or "
- "'greater'.")
- if confidence_level < 0 or confidence_level > 1:
- raise ValueError('confidence_level must be between 0 and 1')
- if self._kind == 'conditional':
- ci = self._conditional_odds_ratio_ci(confidence_level, alternative)
- else:
- ci = self._sample_odds_ratio_ci(confidence_level, alternative)
- return ci
- def _conditional_odds_ratio_ci(self, confidence_level=0.95,
- alternative='two-sided'):
- """
- Confidence interval for the conditional odds ratio.
- """
- table = self._table
- if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
- # If both values in a row or column are zero, the p-value is 1,
- # the odds ratio is NaN and the confidence interval is (0, inf).
- ci = (0, np.inf)
- else:
- ci = _conditional_oddsratio_ci(table,
- confidence_level=confidence_level,
- alternative=alternative)
- return ConfidenceInterval(low=ci[0], high=ci[1])
- def _sample_odds_ratio_ci(self, confidence_level=0.95,
- alternative='two-sided'):
- """
- Confidence interval for the sample odds ratio.
- """
- if confidence_level < 0 or confidence_level > 1:
- raise ValueError('confidence_level must be between 0 and 1')
- table = self._table
- if 0 in table.sum(axis=0) or 0 in table.sum(axis=1):
- # If both values in a row or column are zero, the p-value is 1,
- # the odds ratio is NaN and the confidence interval is (0, inf).
- ci = (0, np.inf)
- else:
- ci = _sample_odds_ratio_ci(table,
- confidence_level=confidence_level,
- alternative=alternative)
- return ConfidenceInterval(low=ci[0], high=ci[1])
- def odds_ratio(table, *, kind='conditional'):
- r"""
- Compute the odds ratio for a 2x2 contingency table.
- Parameters
- ----------
- table : array_like of ints
- A 2x2 contingency table. Elements must be non-negative integers.
- kind : str, optional
- Which kind of odds ratio to compute, either the sample
- odds ratio (``kind='sample'``) or the conditional odds ratio
- (``kind='conditional'``). Default is ``'conditional'``.
- Returns
- -------
- result : `~scipy.stats._result_classes.OddsRatioResult` instance
- The returned object has one computed attribute:
- statistic : float
- * If `kind` is ``'sample'``, this is the sample (or unconditional)
- estimate, given by
- ``table[0, 0]*table[1, 1]/(table[0, 1]*table[1, 0])``.
- * If `kind` is ``'conditional'``, this is the conditional
- maximum likelihood estimate for the odds ratio. It is
- the noncentrality parameter of Fisher's noncentral
- hypergeometric distribution with the same hypergeometric
- parameters as `table` and whose mean is ``table[0, 0]``.
- The object has the method `confidence_interval` that computes
- the confidence interval of the odds ratio.
- See Also
- --------
- scipy.stats.fisher_exact
- relative_risk
- Notes
- -----
- The conditional odds ratio was discussed by Fisher (see "Example 1"
- of [1]_). Texts that cover the odds ratio include [2]_ and [3]_.
- .. versionadded:: 1.10.0
- References
- ----------
- .. [1] R. A. Fisher (1935), The logic of inductive inference,
- Journal of the Royal Statistical Society, Vol. 98, No. 1,
- pp. 39-82.
- .. [2] Breslow NE, Day NE (1980). Statistical methods in cancer research.
- Volume I - The analysis of case-control studies. IARC Sci Publ.
- (32):5-338. PMID: 7216345. (See section 4.2.)
- .. [3] H. Sahai and A. Khurshid (1996), Statistics in Epidemiology:
- Methods, Techniques, and Applications, CRC Press LLC, Boca
- Raton, Florida.
- Examples
- --------
- In epidemiology, individuals are classified as "exposed" or
- "unexposed" to some factor or treatment. If the occurrence of some
- illness is under study, those who have the illness are often
- classified as "cases", and those without it are "noncases". The
- counts of the occurrences of these classes give a contingency
- table::
- exposed unexposed
- cases a b
- noncases c d
- The sample odds ratio may be written ``(a/c) / (b/d)``. ``a/c`` can
- be interpreted as the odds of a case occurring in the exposed group,
- and ``b/d`` as the odds of a case occurring in the unexposed group.
- The sample odds ratio is the ratio of these odds. If the odds ratio
- is greater than 1, it suggests that there is a positive association
- between being exposed and being a case.
- Interchanging the rows or columns of the contingency table inverts
- the odds ratio, so it is important to understand the meaning of labels
- given to the rows and columns of the table when interpreting the
- odds ratio.
- Consider a hypothetical example where it is hypothesized that
- exposure to a certain chemical is associated with increased occurrence
- of a certain disease. Suppose we have the following table for a
- collection of 552 people::
- exposed unexposed
- cases 7 15
- noncases 58 472
- The question we ask is "Is exposure to the chemical associated with
- increased risk of the disease?"
- Compute the odds ratio:
- >>> from scipy.stats.contingency import odds_ratio
- >>> res = odds_ratio([[7, 15], [58, 472]])
- >>> res.statistic
- 3.7836687705553493
- For this sample, the odds of getting the disease for those who have
- been exposed to the chemical are almost 3.8 times that of those who
- have not been exposed.
- We can compute the 95% confidence interval for the odds ratio:
- >>> res.confidence_interval(confidence_level=0.95)
- ConfidenceInterval(low=1.2514829132266785, high=10.363493716701269)
- The 95% confidence interval for the conditional odds ratio is
- approximately (1.25, 10.4).
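- For comparison, the sample (unconditional) odds ratio for the same table,
- requested with ``kind='sample'``, is close to but not equal to the
- conditional estimate:
- >>> res_sample = odds_ratio([[7, 15], [58, 472]], kind='sample')
- >>> round(float(res_sample.statistic), 4)
- 3.7977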
- """
- if kind not in ['conditional', 'sample']:
- raise ValueError("`kind` must be 'conditional' or 'sample'.")
- c = np.asarray(table)
- if c.shape != (2, 2):
- raise ValueError(f"Invalid shape {c.shape}. The input `table` must be "
- "of shape (2, 2).")
- if not np.issubdtype(c.dtype, np.integer):
- raise ValueError("`table` must be an array of integers, but got "
- f"type {c.dtype}")
- c = c.astype(np.int64)
- if np.any(c < 0):
- raise ValueError("All values in `table` must be nonnegative.")
- if 0 in c.sum(axis=0) or 0 in c.sum(axis=1):
- # If both values in a row or column are zero, the odds ratio is NaN
- # (and the confidence interval will be (0, inf)).
- result = OddsRatioResult(_table=c, _kind=kind, statistic=np.nan)
- return result
- if kind == 'sample':
- oddsratio = _sample_odds_ratio(c)
- else: # kind is 'conditional'
- oddsratio = _conditional_oddsratio(c)
- result = OddsRatioResult(_table=c, _kind=kind, statistic=oddsratio)
- return result