test_contingency.py 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241
  1. import numpy as np
  2. from numpy.testing import (assert_equal, assert_array_equal,
  3. assert_array_almost_equal, assert_approx_equal,
  4. assert_allclose)
  5. import pytest
  6. from pytest import raises as assert_raises
  7. from scipy.special import xlogy
  8. from scipy.stats.contingency import (margins, expected_freq,
  9. chi2_contingency, association)
  10. def test_margins():
  11. a = np.array([1])
  12. m = margins(a)
  13. assert_equal(len(m), 1)
  14. m0 = m[0]
  15. assert_array_equal(m0, np.array([1]))
  16. a = np.array([[1]])
  17. m0, m1 = margins(a)
  18. expected0 = np.array([[1]])
  19. expected1 = np.array([[1]])
  20. assert_array_equal(m0, expected0)
  21. assert_array_equal(m1, expected1)
  22. a = np.arange(12).reshape(2, 6)
  23. m0, m1 = margins(a)
  24. expected0 = np.array([[15], [51]])
  25. expected1 = np.array([[6, 8, 10, 12, 14, 16]])
  26. assert_array_equal(m0, expected0)
  27. assert_array_equal(m1, expected1)
  28. a = np.arange(24).reshape(2, 3, 4)
  29. m0, m1, m2 = margins(a)
  30. expected0 = np.array([[[66]], [[210]]])
  31. expected1 = np.array([[[60], [92], [124]]])
  32. expected2 = np.array([[[60, 66, 72, 78]]])
  33. assert_array_equal(m0, expected0)
  34. assert_array_equal(m1, expected1)
  35. assert_array_equal(m2, expected2)
  36. def test_expected_freq():
  37. assert_array_equal(expected_freq([1]), np.array([1.0]))
  38. observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
  39. e = expected_freq(observed)
  40. assert_array_equal(e, np.ones_like(observed))
  41. observed = np.array([[10, 10, 20], [20, 20, 20]])
  42. e = expected_freq(observed)
  43. correct = np.array([[12., 12., 16.], [18., 18., 24.]])
  44. assert_array_almost_equal(e, correct)
  45. def test_chi2_contingency_trivial():
  46. # Some very simple tests for chi2_contingency.
  47. # A trivial case
  48. obs = np.array([[1, 2], [1, 2]])
  49. chi2, p, dof, expected = chi2_contingency(obs, correction=False)
  50. assert_equal(chi2, 0.0)
  51. assert_equal(p, 1.0)
  52. assert_equal(dof, 1)
  53. assert_array_equal(obs, expected)
  54. # A *really* trivial case: 1-D data.
  55. obs = np.array([1, 2, 3])
  56. chi2, p, dof, expected = chi2_contingency(obs, correction=False)
  57. assert_equal(chi2, 0.0)
  58. assert_equal(p, 1.0)
  59. assert_equal(dof, 0)
  60. assert_array_equal(obs, expected)
  61. def test_chi2_contingency_R():
  62. # Some test cases that were computed independently, using R.
  63. # Rcode = \
  64. # """
  65. # # Data vector.
  66. # data <- c(
  67. # 12, 34, 23, 4, 47, 11,
  68. # 35, 31, 11, 34, 10, 18,
  69. # 12, 32, 9, 18, 13, 19,
  70. # 12, 12, 14, 9, 33, 25
  71. # )
  72. #
  73. # # Create factor tags:r=rows, c=columns, t=tiers
  74. # r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
  75. # c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
  76. # t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))
  77. #
  78. # # 3-way Chi squared test of independence
  79. # s = summary(xtabs(data~r+c+t))
  80. # print(s)
  81. # """
  82. # Routput = \
  83. # """
  84. # Call: xtabs(formula = data ~ r + c + t)
  85. # Number of cases in table: 478
  86. # Number of factors: 3
  87. # Test for independence of all factors:
  88. # Chisq = 102.17, df = 17, p-value = 3.514e-14
  89. # """
  90. obs = np.array(
  91. [[[12, 34, 23],
  92. [35, 31, 11],
  93. [12, 32, 9],
  94. [12, 12, 14]],
  95. [[4, 47, 11],
  96. [34, 10, 18],
  97. [18, 13, 19],
  98. [9, 33, 25]]])
  99. chi2, p, dof, expected = chi2_contingency(obs)
  100. assert_approx_equal(chi2, 102.17, significant=5)
  101. assert_approx_equal(p, 3.514e-14, significant=4)
  102. assert_equal(dof, 17)
  103. # Rcode = \
  104. # """
  105. # # Data vector.
  106. # data <- c(
  107. # #
  108. # 12, 17,
  109. # 11, 16,
  110. # #
  111. # 11, 12,
  112. # 15, 16,
  113. # #
  114. # 23, 15,
  115. # 30, 22,
  116. # #
  117. # 14, 17,
  118. # 15, 16
  119. # )
  120. #
  121. # # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
  122. # r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
  123. # c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
  124. # d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
  125. # t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
  126. #
  127. # # 4-way Chi squared test of independence
  128. # s = summary(xtabs(data~r+c+d+t))
  129. # print(s)
  130. # """
  131. # Routput = \
  132. # """
  133. # Call: xtabs(formula = data ~ r + c + d + t)
  134. # Number of cases in table: 262
  135. # Number of factors: 4
  136. # Test for independence of all factors:
  137. # Chisq = 8.758, df = 11, p-value = 0.6442
  138. # """
  139. obs = np.array(
  140. [[[[12, 17],
  141. [11, 16]],
  142. [[11, 12],
  143. [15, 16]]],
  144. [[[23, 15],
  145. [30, 22]],
  146. [[14, 17],
  147. [15, 16]]]])
  148. chi2, p, dof, expected = chi2_contingency(obs)
  149. assert_approx_equal(chi2, 8.758, significant=4)
  150. assert_approx_equal(p, 0.6442, significant=4)
  151. assert_equal(dof, 11)
  152. def test_chi2_contingency_g():
  153. c = np.array([[15, 60], [15, 90]])
  154. g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
  155. correction=False)
  156. assert_allclose(g, 2*xlogy(c, c/e).sum())
  157. g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
  158. correction=True)
  159. c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
  160. assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())
  161. c = np.array([[10, 12, 10], [12, 10, 10]])
  162. g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
  163. assert_allclose(g, 2*xlogy(c, c/e).sum())
  164. def test_chi2_contingency_bad_args():
  165. # Test that "bad" inputs raise a ValueError.
  166. # Negative value in the array of observed frequencies.
  167. obs = np.array([[-1, 10], [1, 2]])
  168. assert_raises(ValueError, chi2_contingency, obs)
  169. # The zeros in this will result in zeros in the array
  170. # of expected frequencies.
  171. obs = np.array([[0, 1], [0, 1]])
  172. assert_raises(ValueError, chi2_contingency, obs)
  173. # A degenerate case: `observed` has size 0.
  174. obs = np.empty((0, 8))
  175. assert_raises(ValueError, chi2_contingency, obs)
  176. def test_chi2_contingency_yates_gh13875():
  177. # Magnitude of Yates' continuity correction should not exceed difference
  178. # between expected and observed value of the statistic; see gh-13875
  179. observed = np.array([[1573, 3], [4, 0]])
  180. p = chi2_contingency(observed)[1]
  181. assert_allclose(p, 1, rtol=1e-12)
  182. @pytest.mark.parametrize("correction", [False, True])
  183. def test_result(correction):
  184. obs = np.array([[1, 2], [1, 2]])
  185. res = chi2_contingency(obs, correction=correction)
  186. assert_equal((res.statistic, res.pvalue, res.dof, res.expected_freq), res)
  187. def test_bad_association_args():
  188. # Invalid Test Statistic
  189. assert_raises(ValueError, association, [[1, 2], [3, 4]], "X")
  190. # Invalid array shape
  191. assert_raises(ValueError, association, [[[1, 2]], [[3, 4]]], "cramer")
  192. # chi2_contingency exception
  193. assert_raises(ValueError, association, [[-1, 10], [1, 2]], 'cramer')
  194. # Invalid Array Item Data Type
  195. assert_raises(ValueError, association,
  196. np.array([[1, 2], ["dd", 4]], dtype=object), 'cramer')
  197. @pytest.mark.parametrize('stat, expected',
  198. [('cramer', 0.09222412010290792),
  199. ('tschuprow', 0.0775509319944633),
  200. ('pearson', 0.12932925727138758)])
  201. def test_assoc(stat, expected):
  202. # 2d Array
  203. obs1 = np.array([[12, 13, 14, 15, 16],
  204. [17, 16, 18, 19, 11],
  205. [9, 15, 14, 12, 11]])
  206. a = association(observed=obs1, method=stat)
  207. assert_allclose(a, expected)