contingency.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419
  1. """
  2. Contingency table functions (:mod:`scipy.stats.contingency`)
  3. ============================================================
  4. Functions for creating and analyzing contingency tables.
  5. .. currentmodule:: scipy.stats.contingency
  6. .. autosummary::
  7. :toctree: generated/
  8. chi2_contingency
  9. relative_risk
  10. odds_ratio
  11. crosstab
  12. association
  13. expected_freq
  14. margins
  15. """
  16. from functools import reduce
  17. import math
  18. import numpy as np
  19. from ._stats_py import power_divergence
  20. from ._relative_risk import relative_risk
  21. from ._crosstab import crosstab
  22. from ._odds_ratio import odds_ratio
  23. from scipy._lib._bunch import _make_tuple_bunch
  24. __all__ = ['margins', 'expected_freq', 'chi2_contingency', 'crosstab',
  25. 'association', 'relative_risk', 'odds_ratio']
  26. def margins(a):
  27. """Return a list of the marginal sums of the array `a`.
  28. Parameters
  29. ----------
  30. a : ndarray
  31. The array for which to compute the marginal sums.
  32. Returns
  33. -------
  34. margsums : list of ndarrays
  35. A list of length `a.ndim`. `margsums[k]` is the result
  36. of summing `a` over all axes except `k`; it has the same
  37. number of dimensions as `a`, but the length of each axis
  38. except axis `k` will be 1.
  39. Examples
  40. --------
  41. >>> import numpy as np
  42. >>> from scipy.stats.contingency import margins
  43. >>> a = np.arange(12).reshape(2, 6)
  44. >>> a
  45. array([[ 0, 1, 2, 3, 4, 5],
  46. [ 6, 7, 8, 9, 10, 11]])
  47. >>> m0, m1 = margins(a)
  48. >>> m0
  49. array([[15],
  50. [51]])
  51. >>> m1
  52. array([[ 6, 8, 10, 12, 14, 16]])
  53. >>> b = np.arange(24).reshape(2,3,4)
  54. >>> m0, m1, m2 = margins(b)
  55. >>> m0
  56. array([[[ 66]],
  57. [[210]]])
  58. >>> m1
  59. array([[[ 60],
  60. [ 92],
  61. [124]]])
  62. >>> m2
  63. array([[[60, 66, 72, 78]]])
  64. """
  65. margsums = []
  66. ranged = list(range(a.ndim))
  67. for k in ranged:
  68. marg = np.apply_over_axes(np.sum, a, [j for j in ranged if j != k])
  69. margsums.append(marg)
  70. return margsums
  71. def expected_freq(observed):
  72. """
  73. Compute the expected frequencies from a contingency table.
  74. Given an n-dimensional contingency table of observed frequencies,
  75. compute the expected frequencies for the table based on the marginal
  76. sums under the assumption that the groups associated with each
  77. dimension are independent.
  78. Parameters
  79. ----------
  80. observed : array_like
  81. The table of observed frequencies. (While this function can handle
  82. a 1-D array, that case is trivial. Generally `observed` is at
  83. least 2-D.)
  84. Returns
  85. -------
  86. expected : ndarray of float64
  87. The expected frequencies, based on the marginal sums of the table.
  88. Same shape as `observed`.
  89. Examples
  90. --------
  91. >>> import numpy as np
  92. >>> from scipy.stats.contingency import expected_freq
  93. >>> observed = np.array([[10, 10, 20],[20, 20, 20]])
  94. >>> expected_freq(observed)
  95. array([[ 12., 12., 16.],
  96. [ 18., 18., 24.]])
  97. """
  98. # Typically `observed` is an integer array. If `observed` has a large
  99. # number of dimensions or holds large values, some of the following
  100. # computations may overflow, so we first switch to floating point.
  101. observed = np.asarray(observed, dtype=np.float64)
  102. # Create a list of the marginal sums.
  103. margsums = margins(observed)
  104. # Create the array of expected frequencies. The shapes of the
  105. # marginal sums returned by apply_over_axes() are just what we
  106. # need for broadcasting in the following product.
  107. d = observed.ndim
  108. expected = reduce(np.multiply, margsums) / observed.sum() ** (d - 1)
  109. return expected
  110. Chi2ContingencyResult = _make_tuple_bunch(
  111. 'Chi2ContingencyResult',
  112. ['statistic', 'pvalue', 'dof', 'expected_freq'], []
  113. )
  114. def chi2_contingency(observed, correction=True, lambda_=None):
  115. """Chi-square test of independence of variables in a contingency table.
  116. This function computes the chi-square statistic and p-value for the
  117. hypothesis test of independence of the observed frequencies in the
  118. contingency table [1]_ `observed`. The expected frequencies are computed
  119. based on the marginal sums under the assumption of independence; see
  120. `scipy.stats.contingency.expected_freq`. The number of degrees of
  121. freedom is (expressed using numpy functions and attributes)::
  122. dof = observed.size - sum(observed.shape) + observed.ndim - 1
  123. Parameters
  124. ----------
  125. observed : array_like
  126. The contingency table. The table contains the observed frequencies
  127. (i.e. number of occurrences) in each category. In the two-dimensional
  128. case, the table is often described as an "R x C table".
  129. correction : bool, optional
  130. If True, *and* the degrees of freedom is 1, apply Yates' correction
  131. for continuity. The effect of the correction is to adjust each
  132. observed value by 0.5 towards the corresponding expected value.
  133. lambda_ : float or str, optional
  134. By default, the statistic computed in this test is Pearson's
  135. chi-squared statistic [2]_. `lambda_` allows a statistic from the
  136. Cressie-Read power divergence family [3]_ to be used instead. See
  137. `scipy.stats.power_divergence` for details.
  138. Returns
  139. -------
  140. res : Chi2ContingencyResult
  141. An object containing attributes:
  142. statistic : float
  143. The test statistic.
  144. pvalue : float
  145. The p-value of the test.
  146. dof : int
  147. The degrees of freedom.
  148. expected_freq : ndarray, same shape as `observed`
  149. The expected frequencies, based on the marginal sums of the table.
  150. See Also
  151. --------
  152. scipy.stats.contingency.expected_freq
  153. scipy.stats.fisher_exact
  154. scipy.stats.chisquare
  155. scipy.stats.power_divergence
  156. scipy.stats.barnard_exact
  157. scipy.stats.boschloo_exact
  158. Notes
  159. -----
  160. An often quoted guideline for the validity of this calculation is that
  161. the test should be used only if the observed and expected frequencies
  162. in each cell are at least 5.
  163. This is a test for the independence of different categories of a
  164. population. The test is only meaningful when the dimension of
  165. `observed` is two or more. Applying the test to a one-dimensional
  166. table will always result in `expected` equal to `observed` and a
  167. chi-square statistic equal to 0.
  168. This function does not handle masked arrays, because the calculation
  169. does not make sense with missing values.
  170. Like `scipy.stats.chisquare`, this function computes a chi-square
  171. statistic; the convenience this function provides is to figure out the
  172. expected frequencies and degrees of freedom from the given contingency
  173. table. If these were already known, and if the Yates' correction was not
  174. required, one could use `scipy.stats.chisquare`. That is, if one calls::
  175. res = chi2_contingency(obs, correction=False)
  176. then the following is true::
  177. (res.statistic, res.pvalue) == stats.chisquare(obs.ravel(),
  178. f_exp=ex.ravel(),
  179. ddof=obs.size - 1 - dof)
  180. The `lambda_` argument was added in version 0.13.0 of scipy.
  181. References
  182. ----------
  183. .. [1] "Contingency table",
  184. https://en.wikipedia.org/wiki/Contingency_table
  185. .. [2] "Pearson's chi-squared test",
  186. https://en.wikipedia.org/wiki/Pearson%27s_chi-squared_test
  187. .. [3] Cressie, N. and Read, T. R. C., "Multinomial Goodness-of-Fit
  188. Tests", J. Royal Stat. Soc. Series B, Vol. 46, No. 3 (1984),
  189. pp. 440-464.
  190. Examples
  191. --------
  192. A two-way example (2 x 3):
  193. >>> import numpy as np
  194. >>> from scipy.stats import chi2_contingency
  195. >>> obs = np.array([[10, 10, 20], [20, 20, 20]])
  196. >>> res = chi2_contingency(obs)
  197. >>> res.statistic
  198. 2.7777777777777777
  199. >>> res.pvalue
  200. 0.24935220877729619
  201. >>> res.dof
  202. 2
  203. >>> res.expected_freq
  204. array([[ 12., 12., 16.],
  205. [ 18., 18., 24.]])
  206. Perform the test using the log-likelihood ratio (i.e. the "G-test")
  207. instead of Pearson's chi-squared statistic.
  208. >>> res = chi2_contingency(obs, lambda_="log-likelihood")
  209. >>> res.statistic
  210. 2.7688587616781319
  211. >>> res.pvalue
  212. 0.25046668010954165
  213. A four-way example (2 x 2 x 2 x 2):
  214. >>> obs = np.array(
  215. ... [[[[12, 17],
  216. ... [11, 16]],
  217. ... [[11, 12],
  218. ... [15, 16]]],
  219. ... [[[23, 15],
  220. ... [30, 22]],
  221. ... [[14, 17],
  222. ... [15, 16]]]])
  223. >>> res = chi2_contingency(obs)
  224. >>> res.statistic
  225. 8.7584514426741897
  226. >>> res.pvalue
  227. 0.64417725029295503
  228. """
  229. observed = np.asarray(observed)
  230. if np.any(observed < 0):
  231. raise ValueError("All values in `observed` must be nonnegative.")
  232. if observed.size == 0:
  233. raise ValueError("No data; `observed` has size 0.")
  234. expected = expected_freq(observed)
  235. if np.any(expected == 0):
  236. # Include one of the positions where expected is zero in
  237. # the exception message.
  238. zeropos = list(zip(*np.nonzero(expected == 0)))[0]
  239. raise ValueError("The internally computed table of expected "
  240. "frequencies has a zero element at %s." % (zeropos,))
  241. # The degrees of freedom
  242. dof = expected.size - sum(expected.shape) + expected.ndim - 1
  243. if dof == 0:
  244. # Degenerate case; this occurs when `observed` is 1D (or, more
  245. # generally, when it has only one nontrivial dimension). In this
  246. # case, we also have observed == expected, so chi2 is 0.
  247. chi2 = 0.0
  248. p = 1.0
  249. else:
  250. if dof == 1 and correction:
  251. # Adjust `observed` according to Yates' correction for continuity.
  252. # Magnitude of correction no bigger than difference; see gh-13875
  253. diff = expected - observed
  254. direction = np.sign(diff)
  255. magnitude = np.minimum(0.5, np.abs(diff))
  256. observed = observed + magnitude * direction
  257. chi2, p = power_divergence(observed, expected,
  258. ddof=observed.size - 1 - dof, axis=None,
  259. lambda_=lambda_)
  260. return Chi2ContingencyResult(chi2, p, dof, expected)
  261. def association(observed, method="cramer", correction=False, lambda_=None):
  262. """Calculates degree of association between two nominal variables.
  263. The function provides the option for computing one of three measures of
  264. association between two nominal variables from the data given in a 2d
  265. contingency table: Tschuprow's T, Pearson's Contingency Coefficient
  266. and Cramer's V.
  267. Parameters
  268. ----------
  269. observed : array-like
  270. The array of observed values
  271. method : {"cramer", "tschuprow", "pearson"} (default = "cramer")
  272. The association test statistic.
  273. correction : bool, optional
  274. Inherited from `scipy.stats.contingency.chi2_contingency()`
  275. lambda_ : float or str, optional
  276. Inherited from `scipy.stats.contingency.chi2_contingency()`
  277. Returns
  278. -------
  279. statistic : float
  280. Value of the test statistic
  281. Notes
  282. -----
  283. Cramer's V, Tschuprow's T and Pearson's Contingency Coefficient, all
  284. measure the degree to which two nominal or ordinal variables are related,
  285. or the level of their association. This differs from correlation, although
  286. many often mistakenly consider them equivalent. Correlation measures in
  287. what way two variables are related, whereas, association measures how
  288. related the variables are. As such, association does not subsume
  289. independent variables, and is rather a test of independence. A value of
  290. 1.0 indicates perfect association, and 0.0 means the variables have no
  291. association.
  292. Both the Cramer's V and Tschuprow's T are extensions of the phi
  293. coefficient. Moreover, due to the close relationship between the
  294. Cramer's V and Tschuprow's T the returned values can often be similar
  295. or even equivalent. They are likely to diverge more as the array shape
  296. diverges from a 2x2.
  297. References
  298. ----------
  299. .. [1] "Tschuprow's T",
  300. https://en.wikipedia.org/wiki/Tschuprow's_T
  301. .. [2] Tschuprow, A. A. (1939)
  302. Principles of the Mathematical Theory of Correlation;
  303. translated by M. Kantorowitsch. W. Hodge & Co.
  304. .. [3] "Cramer's V", https://en.wikipedia.org/wiki/Cramer's_V
  305. .. [4] "Nominal Association: Phi and Cramer's V",
  306. http://www.people.vcu.edu/~pdattalo/702SuppRead/MeasAssoc/NominalAssoc.html
  307. .. [5] Gingrich, Paul, "Association Between Variables",
  308. http://uregina.ca/~gingrich/ch11a.pdf
  309. Examples
  310. --------
  311. An example with a 4x2 contingency table:
  312. >>> import numpy as np
  313. >>> from scipy.stats.contingency import association
  314. >>> obs4x2 = np.array([[100, 150], [203, 322], [420, 700], [320, 210]])
  315. Pearson's contingency coefficient
  316. >>> association(obs4x2, method="pearson")
  317. 0.18303298140595667
  318. Cramer's V
  319. >>> association(obs4x2, method="cramer")
  320. 0.18617813077483678
  321. Tschuprow's T
  322. >>> association(obs4x2, method="tschuprow")
  323. 0.14146478765062995
  324. """
  325. arr = np.asarray(observed)
  326. if not np.issubdtype(arr.dtype, np.integer):
  327. raise ValueError("`observed` must be an integer array.")
  328. if len(arr.shape) != 2:
  329. raise ValueError("method only accepts 2d arrays")
  330. chi2_stat = chi2_contingency(arr, correction=correction,
  331. lambda_=lambda_)
  332. phi2 = chi2_stat.statistic / arr.sum()
  333. n_rows, n_cols = arr.shape
  334. if method == "cramer":
  335. value = phi2 / min(n_cols - 1, n_rows - 1)
  336. elif method == "tschuprow":
  337. value = phi2 / math.sqrt((n_rows - 1) * (n_cols - 1))
  338. elif method == 'pearson':
  339. value = phi2 / (1 + phi2)
  340. else:
  341. raise ValueError("Invalid argument value: 'method' argument must "
  342. "be 'cramer', 'tschuprow', or 'pearson'")
  343. return math.sqrt(value)