123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264 |
- import operator
- from dataclasses import dataclass
- import numpy as np
- from scipy.special import ndtri
- from ._common import ConfidenceInterval
- def _validate_int(n, bound, name):
- msg = f'{name} must be an integer not less than {bound}, but got {n!r}'
- try:
- n = operator.index(n)
- except TypeError:
- raise TypeError(msg) from None
- if n < bound:
- raise ValueError(msg)
- return n
- @dataclass
- class RelativeRiskResult:
- """
- Result of `scipy.stats.contingency.relative_risk`.
- Attributes
- ----------
- relative_risk : float
- This is::
- (exposed_cases/exposed_total) / (control_cases/control_total)
- exposed_cases : int
- The number of "cases" (i.e. occurrence of disease or other event
- of interest) among the sample of "exposed" individuals.
- exposed_total : int
- The total number of "exposed" individuals in the sample.
- control_cases : int
- The number of "cases" among the sample of "control" or non-exposed
- individuals.
- control_total : int
- The total number of "control" individuals in the sample.
- Methods
- -------
- confidence_interval :
- Compute the confidence interval for the relative risk estimate.
- """
- relative_risk: float
- exposed_cases: int
- exposed_total: int
- control_cases: int
- control_total: int
- def confidence_interval(self, confidence_level=0.95):
- """
- Compute the confidence interval for the relative risk.
- The confidence interval is computed using the Katz method
- (i.e. "Method C" of [1]_; see also [2]_, section 3.1.2).
- Parameters
- ----------
- confidence_level : float, optional
- The confidence level to use for the confidence interval.
- Default is 0.95.
- Returns
- -------
- ci : ConfidenceInterval instance
- The return value is an object with attributes ``low`` and
- ``high`` that hold the confidence interval.
- References
- ----------
- .. [1] D. Katz, J. Baptista, S. P. Azen and M. C. Pike, "Obtaining
- confidence intervals for the risk ratio in cohort studies",
- Biometrics, 34, 469-474 (1978).
- .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
- CRC Press LLC, Boca Raton, FL, USA (1996).
- Examples
- --------
- >>> from scipy.stats.contingency import relative_risk
- >>> result = relative_risk(exposed_cases=10, exposed_total=75,
- ... control_cases=12, control_total=225)
- >>> result.relative_risk
- 2.5
- >>> result.confidence_interval()
- ConfidenceInterval(low=1.1261564003469628, high=5.549850800541033)
- """
- if not 0 <= confidence_level <= 1:
- raise ValueError('confidence_level must be in the interval '
- '[0, 1].')
- # Handle edge cases where either exposed_cases or control_cases
- # is zero. We follow the convention of the R function riskratio
- # from the epitools library.
- if self.exposed_cases == 0 and self.control_cases == 0:
- # relative risk is nan.
- return ConfidenceInterval(low=np.nan, high=np.nan)
- elif self.exposed_cases == 0:
- # relative risk is 0.
- return ConfidenceInterval(low=0.0, high=np.nan)
- elif self.control_cases == 0:
- # relative risk is inf
- return ConfidenceInterval(low=np.nan, high=np.inf)
- alpha = 1 - confidence_level
- z = ndtri(1 - alpha/2)
- rr = self.relative_risk
- # Estimate of the variance of log(rr) is
- # var(log(rr)) = 1/exposed_cases - 1/exposed_total +
- # 1/control_cases - 1/control_total
- # and the standard error is the square root of that.
- se = np.sqrt(1/self.exposed_cases - 1/self.exposed_total +
- 1/self.control_cases - 1/self.control_total)
- delta = z*se
- katz_lo = rr*np.exp(-delta)
- katz_hi = rr*np.exp(delta)
- return ConfidenceInterval(low=katz_lo, high=katz_hi)
- def relative_risk(exposed_cases, exposed_total, control_cases, control_total):
- """
- Compute the relative risk (also known as the risk ratio).
- This function computes the relative risk associated with a 2x2
- contingency table ([1]_, section 2.2.3; [2]_, section 3.1.2). Instead
- of accepting a table as an argument, the individual numbers that are
- used to compute the relative risk are given as separate parameters.
- This is to avoid the ambiguity of which row or column of the contingency
- table corresponds to the "exposed" cases and which corresponds to the
- "control" cases. Unlike, say, the odds ratio, the relative risk is not
- invariant under an interchange of the rows or columns.
- Parameters
- ----------
- exposed_cases : nonnegative int
- The number of "cases" (i.e. occurrence of disease or other event
- of interest) among the sample of "exposed" individuals.
- exposed_total : positive int
- The total number of "exposed" individuals in the sample.
- control_cases : nonnegative int
- The number of "cases" among the sample of "control" or non-exposed
- individuals.
- control_total : positive int
- The total number of "control" individuals in the sample.
- Returns
- -------
- result : instance of `~scipy.stats._result_classes.RelativeRiskResult`
- The object has the float attribute ``relative_risk``, which is::
- rr = (exposed_cases/exposed_total) / (control_cases/control_total)
- The object also has the method ``confidence_interval`` to compute
- the confidence interval of the relative risk for a given confidence
- level.
- See Also
- --------
- odds_ratio
- Notes
- -----
- The R package epitools has the function `riskratio`, which accepts
- a table with the following layout::
- disease=0 disease=1
- exposed=0 (ref) n00 n01
- exposed=1 n10 n11
- With a 2x2 table in the above format, the estimate of the CI is
- computed by `riskratio` when the argument method="wald" is given,
- or with the function `riskratio.wald`.
- For example, in a test of the incidence of lung cancer among a
- sample of smokers and nonsmokers, the "exposed" category would
- correspond to "is a smoker" and the "disease" category would
- correspond to "has or had lung cancer".
- To pass the same data to ``relative_risk``, use::
- relative_risk(n11, n10 + n11, n01, n00 + n01)
- .. versionadded:: 1.7.0
- References
- ----------
- .. [1] Alan Agresti, An Introduction to Categorical Data Analysis
- (second edition), Wiley, Hoboken, NJ, USA (2007).
- .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
- CRC Press LLC, Boca Raton, FL, USA (1996).
- Examples
- --------
- >>> from scipy.stats.contingency import relative_risk
- This example is from Example 3.1 of [2]_. The results of a heart
- disease study are summarized in the following table::
- High CAT Low CAT Total
- -------- ------- -----
- CHD 27 44 71
- No CHD 95 443 538
- Total 122 487 609
- CHD is coronary heart disease, and CAT refers to the level of
- circulating catecholamine. CAT is the "exposure" variable, and
- high CAT is the "exposed" category. So the data from the table
- to be passed to ``relative_risk`` is::
- exposed_cases = 27
- exposed_total = 122
- control_cases = 44
- control_total = 487
- >>> result = relative_risk(27, 122, 44, 487)
- >>> result.relative_risk
- 2.4495156482861398
- Find the confidence interval for the relative risk.
- >>> result.confidence_interval(confidence_level=0.95)
- ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)
- The interval does not contain 1, so the data supports the statement
- that high CAT is associated with greater risk of CHD.
- """
- # Relative risk is a trivial calculation. The nontrivial part is in the
- # `confidence_interval` method of the RelativeRiskResult class.
- exposed_cases = _validate_int(exposed_cases, 0, "exposed_cases")
- exposed_total = _validate_int(exposed_total, 1, "exposed_total")
- control_cases = _validate_int(control_cases, 0, "control_cases")
- control_total = _validate_int(control_total, 1, "control_total")
- if exposed_cases > exposed_total:
- raise ValueError('exposed_cases must not exceed exposed_total.')
- if control_cases > control_total:
- raise ValueError('control_cases must not exceed control_total.')
- if exposed_cases == 0 and control_cases == 0:
- # relative risk is 0/0.
- rr = np.nan
- elif exposed_cases == 0:
- # relative risk is 0/nonzero
- rr = 0.0
- elif control_cases == 0:
- # relative risk is nonzero/0.
- rr = np.inf
- else:
- p1 = exposed_cases / exposed_total
- p2 = control_cases / control_total
- rr = p1 / p2
- return RelativeRiskResult(relative_risk=rr,
- exposed_cases=exposed_cases,
- exposed_total=exposed_total,
- control_cases=control_cases,
- control_total=control_total)
|