_relative_risk.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. import operator
  2. from dataclasses import dataclass
  3. import numpy as np
  4. from scipy.special import ndtri
  5. from ._common import ConfidenceInterval
  6. def _validate_int(n, bound, name):
  7. msg = f'{name} must be an integer not less than {bound}, but got {n!r}'
  8. try:
  9. n = operator.index(n)
  10. except TypeError:
  11. raise TypeError(msg) from None
  12. if n < bound:
  13. raise ValueError(msg)
  14. return n
  15. @dataclass
  16. class RelativeRiskResult:
  17. """
  18. Result of `scipy.stats.contingency.relative_risk`.
  19. Attributes
  20. ----------
  21. relative_risk : float
  22. This is::
  23. (exposed_cases/exposed_total) / (control_cases/control_total)
  24. exposed_cases : int
  25. The number of "cases" (i.e. occurrence of disease or other event
  26. of interest) among the sample of "exposed" individuals.
  27. exposed_total : int
  28. The total number of "exposed" individuals in the sample.
  29. control_cases : int
  30. The number of "cases" among the sample of "control" or non-exposed
  31. individuals.
  32. control_total : int
  33. The total number of "control" individuals in the sample.
  34. Methods
  35. -------
  36. confidence_interval :
  37. Compute the confidence interval for the relative risk estimate.
  38. """
  39. relative_risk: float
  40. exposed_cases: int
  41. exposed_total: int
  42. control_cases: int
  43. control_total: int
  44. def confidence_interval(self, confidence_level=0.95):
  45. """
  46. Compute the confidence interval for the relative risk.
  47. The confidence interval is computed using the Katz method
  48. (i.e. "Method C" of [1]_; see also [2]_, section 3.1.2).
  49. Parameters
  50. ----------
  51. confidence_level : float, optional
  52. The confidence level to use for the confidence interval.
  53. Default is 0.95.
  54. Returns
  55. -------
  56. ci : ConfidenceInterval instance
  57. The return value is an object with attributes ``low`` and
  58. ``high`` that hold the confidence interval.
  59. References
  60. ----------
  61. .. [1] D. Katz, J. Baptista, S. P. Azen and M. C. Pike, "Obtaining
  62. confidence intervals for the risk ratio in cohort studies",
  63. Biometrics, 34, 469-474 (1978).
  64. .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
  65. CRC Press LLC, Boca Raton, FL, USA (1996).
  66. Examples
  67. --------
  68. >>> from scipy.stats.contingency import relative_risk
  69. >>> result = relative_risk(exposed_cases=10, exposed_total=75,
  70. ... control_cases=12, control_total=225)
  71. >>> result.relative_risk
  72. 2.5
  73. >>> result.confidence_interval()
  74. ConfidenceInterval(low=1.1261564003469628, high=5.549850800541033)
  75. """
  76. if not 0 <= confidence_level <= 1:
  77. raise ValueError('confidence_level must be in the interval '
  78. '[0, 1].')
  79. # Handle edge cases where either exposed_cases or control_cases
  80. # is zero. We follow the convention of the R function riskratio
  81. # from the epitools library.
  82. if self.exposed_cases == 0 and self.control_cases == 0:
  83. # relative risk is nan.
  84. return ConfidenceInterval(low=np.nan, high=np.nan)
  85. elif self.exposed_cases == 0:
  86. # relative risk is 0.
  87. return ConfidenceInterval(low=0.0, high=np.nan)
  88. elif self.control_cases == 0:
  89. # relative risk is inf
  90. return ConfidenceInterval(low=np.nan, high=np.inf)
  91. alpha = 1 - confidence_level
  92. z = ndtri(1 - alpha/2)
  93. rr = self.relative_risk
  94. # Estimate of the variance of log(rr) is
  95. # var(log(rr)) = 1/exposed_cases - 1/exposed_total +
  96. # 1/control_cases - 1/control_total
  97. # and the standard error is the square root of that.
  98. se = np.sqrt(1/self.exposed_cases - 1/self.exposed_total +
  99. 1/self.control_cases - 1/self.control_total)
  100. delta = z*se
  101. katz_lo = rr*np.exp(-delta)
  102. katz_hi = rr*np.exp(delta)
  103. return ConfidenceInterval(low=katz_lo, high=katz_hi)
  104. def relative_risk(exposed_cases, exposed_total, control_cases, control_total):
  105. """
  106. Compute the relative risk (also known as the risk ratio).
  107. This function computes the relative risk associated with a 2x2
  108. contingency table ([1]_, section 2.2.3; [2]_, section 3.1.2). Instead
  109. of accepting a table as an argument, the individual numbers that are
  110. used to compute the relative risk are given as separate parameters.
  111. This is to avoid the ambiguity of which row or column of the contingency
  112. table corresponds to the "exposed" cases and which corresponds to the
  113. "control" cases. Unlike, say, the odds ratio, the relative risk is not
  114. invariant under an interchange of the rows or columns.
  115. Parameters
  116. ----------
  117. exposed_cases : nonnegative int
  118. The number of "cases" (i.e. occurrence of disease or other event
  119. of interest) among the sample of "exposed" individuals.
  120. exposed_total : positive int
  121. The total number of "exposed" individuals in the sample.
  122. control_cases : nonnegative int
  123. The number of "cases" among the sample of "control" or non-exposed
  124. individuals.
  125. control_total : positive int
  126. The total number of "control" individuals in the sample.
  127. Returns
  128. -------
  129. result : instance of `~scipy.stats._result_classes.RelativeRiskResult`
  130. The object has the float attribute ``relative_risk``, which is::
  131. rr = (exposed_cases/exposed_total) / (control_cases/control_total)
  132. The object also has the method ``confidence_interval`` to compute
  133. the confidence interval of the relative risk for a given confidence
  134. level.
  135. See Also
  136. --------
  137. odds_ratio
  138. Notes
  139. -----
  140. The R package epitools has the function `riskratio`, which accepts
  141. a table with the following layout::
  142. disease=0 disease=1
  143. exposed=0 (ref) n00 n01
  144. exposed=1 n10 n11
  145. With a 2x2 table in the above format, the estimate of the CI is
  146. computed by `riskratio` when the argument method="wald" is given,
  147. or with the function `riskratio.wald`.
  148. For example, in a test of the incidence of lung cancer among a
  149. sample of smokers and nonsmokers, the "exposed" category would
  150. correspond to "is a smoker" and the "disease" category would
  151. correspond to "has or had lung cancer".
  152. To pass the same data to ``relative_risk``, use::
  153. relative_risk(n11, n10 + n11, n01, n00 + n01)
  154. .. versionadded:: 1.7.0
  155. References
  156. ----------
  157. .. [1] Alan Agresti, An Introduction to Categorical Data Analysis
  158. (second edition), Wiley, Hoboken, NJ, USA (2007).
  159. .. [2] Hardeo Sahai and Anwer Khurshid, Statistics in Epidemiology,
  160. CRC Press LLC, Boca Raton, FL, USA (1996).
  161. Examples
  162. --------
  163. >>> from scipy.stats.contingency import relative_risk
  164. This example is from Example 3.1 of [2]_. The results of a heart
  165. disease study are summarized in the following table::
  166. High CAT Low CAT Total
  167. -------- ------- -----
  168. CHD 27 44 71
  169. No CHD 95 443 538
  170. Total 122 487 609
  171. CHD is coronary heart disease, and CAT refers to the level of
  172. circulating catecholamine. CAT is the "exposure" variable, and
  173. high CAT is the "exposed" category. So the data from the table
  174. to be passed to ``relative_risk`` is::
  175. exposed_cases = 27
  176. exposed_total = 122
  177. control_cases = 44
  178. control_total = 487
  179. >>> result = relative_risk(27, 122, 44, 487)
  180. >>> result.relative_risk
  181. 2.4495156482861398
  182. Find the confidence interval for the relative risk.
  183. >>> result.confidence_interval(confidence_level=0.95)
  184. ConfidenceInterval(low=1.5836990926700116, high=3.7886786315466354)
  185. The interval does not contain 1, so the data supports the statement
  186. that high CAT is associated with greater risk of CHD.
  187. """
  188. # Relative risk is a trivial calculation. The nontrivial part is in the
  189. # `confidence_interval` method of the RelativeRiskResult class.
  190. exposed_cases = _validate_int(exposed_cases, 0, "exposed_cases")
  191. exposed_total = _validate_int(exposed_total, 1, "exposed_total")
  192. control_cases = _validate_int(control_cases, 0, "control_cases")
  193. control_total = _validate_int(control_total, 1, "control_total")
  194. if exposed_cases > exposed_total:
  195. raise ValueError('exposed_cases must not exceed exposed_total.')
  196. if control_cases > control_total:
  197. raise ValueError('control_cases must not exceed control_total.')
  198. if exposed_cases == 0 and control_cases == 0:
  199. # relative risk is 0/0.
  200. rr = np.nan
  201. elif exposed_cases == 0:
  202. # relative risk is 0/nonzero
  203. rr = 0.0
  204. elif control_cases == 0:
  205. # relative risk is nonzero/0.
  206. rr = np.inf
  207. else:
  208. p1 = exposed_cases / exposed_total
  209. p2 = control_cases / control_total
  210. rr = p1 / p2
  211. return RelativeRiskResult(relative_risk=rr,
  212. exposed_cases=exposed_cases,
  213. exposed_total=exposed_total,
  214. control_cases=control_cases,
  215. control_total=control_total)