123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241 |
- import numpy as np
- from numpy.testing import (assert_equal, assert_array_equal,
- assert_array_almost_equal, assert_approx_equal,
- assert_allclose)
- import pytest
- from pytest import raises as assert_raises
- from scipy.special import xlogy
- from scipy.stats.contingency import (margins, expected_freq,
- chi2_contingency, association)
- def test_margins():
- a = np.array([1])
- m = margins(a)
- assert_equal(len(m), 1)
- m0 = m[0]
- assert_array_equal(m0, np.array([1]))
- a = np.array([[1]])
- m0, m1 = margins(a)
- expected0 = np.array([[1]])
- expected1 = np.array([[1]])
- assert_array_equal(m0, expected0)
- assert_array_equal(m1, expected1)
- a = np.arange(12).reshape(2, 6)
- m0, m1 = margins(a)
- expected0 = np.array([[15], [51]])
- expected1 = np.array([[6, 8, 10, 12, 14, 16]])
- assert_array_equal(m0, expected0)
- assert_array_equal(m1, expected1)
- a = np.arange(24).reshape(2, 3, 4)
- m0, m1, m2 = margins(a)
- expected0 = np.array([[[66]], [[210]]])
- expected1 = np.array([[[60], [92], [124]]])
- expected2 = np.array([[[60, 66, 72, 78]]])
- assert_array_equal(m0, expected0)
- assert_array_equal(m1, expected1)
- assert_array_equal(m2, expected2)
- def test_expected_freq():
- assert_array_equal(expected_freq([1]), np.array([1.0]))
- observed = np.array([[[2, 0], [0, 2]], [[0, 2], [2, 0]], [[1, 1], [1, 1]]])
- e = expected_freq(observed)
- assert_array_equal(e, np.ones_like(observed))
- observed = np.array([[10, 10, 20], [20, 20, 20]])
- e = expected_freq(observed)
- correct = np.array([[12., 12., 16.], [18., 18., 24.]])
- assert_array_almost_equal(e, correct)
- def test_chi2_contingency_trivial():
- # Some very simple tests for chi2_contingency.
- # A trivial case
- obs = np.array([[1, 2], [1, 2]])
- chi2, p, dof, expected = chi2_contingency(obs, correction=False)
- assert_equal(chi2, 0.0)
- assert_equal(p, 1.0)
- assert_equal(dof, 1)
- assert_array_equal(obs, expected)
- # A *really* trivial case: 1-D data.
- obs = np.array([1, 2, 3])
- chi2, p, dof, expected = chi2_contingency(obs, correction=False)
- assert_equal(chi2, 0.0)
- assert_equal(p, 1.0)
- assert_equal(dof, 0)
- assert_array_equal(obs, expected)
- def test_chi2_contingency_R():
- # Some test cases that were computed independently, using R.
- # Rcode = \
- # """
- # # Data vector.
- # data <- c(
- # 12, 34, 23, 4, 47, 11,
- # 35, 31, 11, 34, 10, 18,
- # 12, 32, 9, 18, 13, 19,
- # 12, 12, 14, 9, 33, 25
- # )
- #
- # # Create factor tags:r=rows, c=columns, t=tiers
- # r <- factor(gl(4, 2*3, 2*3*4, labels=c("r1", "r2", "r3", "r4")))
- # c <- factor(gl(3, 1, 2*3*4, labels=c("c1", "c2", "c3")))
- # t <- factor(gl(2, 3, 2*3*4, labels=c("t1", "t2")))
- #
- # # 3-way Chi squared test of independence
- # s = summary(xtabs(data~r+c+t))
- # print(s)
- # """
- # Routput = \
- # """
- # Call: xtabs(formula = data ~ r + c + t)
- # Number of cases in table: 478
- # Number of factors: 3
- # Test for independence of all factors:
- # Chisq = 102.17, df = 17, p-value = 3.514e-14
- # """
- obs = np.array(
- [[[12, 34, 23],
- [35, 31, 11],
- [12, 32, 9],
- [12, 12, 14]],
- [[4, 47, 11],
- [34, 10, 18],
- [18, 13, 19],
- [9, 33, 25]]])
- chi2, p, dof, expected = chi2_contingency(obs)
- assert_approx_equal(chi2, 102.17, significant=5)
- assert_approx_equal(p, 3.514e-14, significant=4)
- assert_equal(dof, 17)
- # Rcode = \
- # """
- # # Data vector.
- # data <- c(
- # #
- # 12, 17,
- # 11, 16,
- # #
- # 11, 12,
- # 15, 16,
- # #
- # 23, 15,
- # 30, 22,
- # #
- # 14, 17,
- # 15, 16
- # )
- #
- # # Create factor tags:r=rows, c=columns, d=depths(?), t=tiers
- # r <- factor(gl(2, 2, 2*2*2*2, labels=c("r1", "r2")))
- # c <- factor(gl(2, 1, 2*2*2*2, labels=c("c1", "c2")))
- # d <- factor(gl(2, 4, 2*2*2*2, labels=c("d1", "d2")))
- # t <- factor(gl(2, 8, 2*2*2*2, labels=c("t1", "t2")))
- #
- # # 4-way Chi squared test of independence
- # s = summary(xtabs(data~r+c+d+t))
- # print(s)
- # """
- # Routput = \
- # """
- # Call: xtabs(formula = data ~ r + c + d + t)
- # Number of cases in table: 262
- # Number of factors: 4
- # Test for independence of all factors:
- # Chisq = 8.758, df = 11, p-value = 0.6442
- # """
- obs = np.array(
- [[[[12, 17],
- [11, 16]],
- [[11, 12],
- [15, 16]]],
- [[[23, 15],
- [30, 22]],
- [[14, 17],
- [15, 16]]]])
- chi2, p, dof, expected = chi2_contingency(obs)
- assert_approx_equal(chi2, 8.758, significant=4)
- assert_approx_equal(p, 0.6442, significant=4)
- assert_equal(dof, 11)
- def test_chi2_contingency_g():
- c = np.array([[15, 60], [15, 90]])
- g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
- correction=False)
- assert_allclose(g, 2*xlogy(c, c/e).sum())
- g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood',
- correction=True)
- c_corr = c + np.array([[-0.5, 0.5], [0.5, -0.5]])
- assert_allclose(g, 2*xlogy(c_corr, c_corr/e).sum())
- c = np.array([[10, 12, 10], [12, 10, 10]])
- g, p, dof, e = chi2_contingency(c, lambda_='log-likelihood')
- assert_allclose(g, 2*xlogy(c, c/e).sum())
- def test_chi2_contingency_bad_args():
- # Test that "bad" inputs raise a ValueError.
- # Negative value in the array of observed frequencies.
- obs = np.array([[-1, 10], [1, 2]])
- assert_raises(ValueError, chi2_contingency, obs)
- # The zeros in this will result in zeros in the array
- # of expected frequencies.
- obs = np.array([[0, 1], [0, 1]])
- assert_raises(ValueError, chi2_contingency, obs)
- # A degenerate case: `observed` has size 0.
- obs = np.empty((0, 8))
- assert_raises(ValueError, chi2_contingency, obs)
- def test_chi2_contingency_yates_gh13875():
- # Magnitude of Yates' continuity correction should not exceed difference
- # between expected and observed value of the statistic; see gh-13875
- observed = np.array([[1573, 3], [4, 0]])
- p = chi2_contingency(observed)[1]
- assert_allclose(p, 1, rtol=1e-12)
- @pytest.mark.parametrize("correction", [False, True])
- def test_result(correction):
- obs = np.array([[1, 2], [1, 2]])
- res = chi2_contingency(obs, correction=correction)
- assert_equal((res.statistic, res.pvalue, res.dof, res.expected_freq), res)
- def test_bad_association_args():
- # Invalid Test Statistic
- assert_raises(ValueError, association, [[1, 2], [3, 4]], "X")
- # Invalid array shape
- assert_raises(ValueError, association, [[[1, 2]], [[3, 4]]], "cramer")
- # chi2_contingency exception
- assert_raises(ValueError, association, [[-1, 10], [1, 2]], 'cramer')
- # Invalid Array Item Data Type
- assert_raises(ValueError, association,
- np.array([[1, 2], ["dd", 4]], dtype=object), 'cramer')
- @pytest.mark.parametrize('stat, expected',
- [('cramer', 0.09222412010290792),
- ('tschuprow', 0.0775509319944633),
- ('pearson', 0.12932925727138758)])
- def test_assoc(stat, expected):
- # 2d Array
- obs1 = np.array([[12, 13, 14, 15, 16],
- [17, 16, 18, 19, 11],
- [9, 15, 14, 12, 11]])
- a = association(observed=obs1, method=stat)
- assert_allclose(a, expected)
|