test_rank.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320
  1. import numpy as np
  2. from numpy.testing import assert_equal, assert_array_equal
  3. from scipy.stats import rankdata, tiecorrect
  4. import pytest
  5. class TestTieCorrect:
  6. def test_empty(self):
  7. """An empty array requires no correction, should return 1.0."""
  8. ranks = np.array([], dtype=np.float64)
  9. c = tiecorrect(ranks)
  10. assert_equal(c, 1.0)
  11. def test_one(self):
  12. """A single element requires no correction, should return 1.0."""
  13. ranks = np.array([1.0], dtype=np.float64)
  14. c = tiecorrect(ranks)
  15. assert_equal(c, 1.0)
  16. def test_no_correction(self):
  17. """Arrays with no ties require no correction."""
  18. ranks = np.arange(2.0)
  19. c = tiecorrect(ranks)
  20. assert_equal(c, 1.0)
  21. ranks = np.arange(3.0)
  22. c = tiecorrect(ranks)
  23. assert_equal(c, 1.0)
  24. def test_basic(self):
  25. """Check a few basic examples of the tie correction factor."""
  26. # One tie of two elements
  27. ranks = np.array([1.0, 2.5, 2.5])
  28. c = tiecorrect(ranks)
  29. T = 2.0
  30. N = ranks.size
  31. expected = 1.0 - (T**3 - T) / (N**3 - N)
  32. assert_equal(c, expected)
  33. # One tie of two elements (same as above, but tie is not at the end)
  34. ranks = np.array([1.5, 1.5, 3.0])
  35. c = tiecorrect(ranks)
  36. T = 2.0
  37. N = ranks.size
  38. expected = 1.0 - (T**3 - T) / (N**3 - N)
  39. assert_equal(c, expected)
  40. # One tie of three elements
  41. ranks = np.array([1.0, 3.0, 3.0, 3.0])
  42. c = tiecorrect(ranks)
  43. T = 3.0
  44. N = ranks.size
  45. expected = 1.0 - (T**3 - T) / (N**3 - N)
  46. assert_equal(c, expected)
  47. # Two ties, lengths 2 and 3.
  48. ranks = np.array([1.5, 1.5, 4.0, 4.0, 4.0])
  49. c = tiecorrect(ranks)
  50. T1 = 2.0
  51. T2 = 3.0
  52. N = ranks.size
  53. expected = 1.0 - ((T1**3 - T1) + (T2**3 - T2)) / (N**3 - N)
  54. assert_equal(c, expected)
  55. def test_overflow(self):
  56. ntie, k = 2000, 5
  57. a = np.repeat(np.arange(k), ntie)
  58. n = a.size # ntie * k
  59. out = tiecorrect(rankdata(a))
  60. assert_equal(out, 1.0 - k * (ntie**3 - ntie) / float(n**3 - n))
  61. class TestRankData:
  62. def test_empty(self):
  63. """stats.rankdata([]) should return an empty array."""
  64. a = np.array([], dtype=int)
  65. r = rankdata(a)
  66. assert_array_equal(r, np.array([], dtype=np.float64))
  67. r = rankdata([])
  68. assert_array_equal(r, np.array([], dtype=np.float64))
  69. def test_one(self):
  70. """Check stats.rankdata with an array of length 1."""
  71. data = [100]
  72. a = np.array(data, dtype=int)
  73. r = rankdata(a)
  74. assert_array_equal(r, np.array([1.0], dtype=np.float64))
  75. r = rankdata(data)
  76. assert_array_equal(r, np.array([1.0], dtype=np.float64))
  77. def test_basic(self):
  78. """Basic tests of stats.rankdata."""
  79. data = [100, 10, 50]
  80. expected = np.array([3.0, 1.0, 2.0], dtype=np.float64)
  81. a = np.array(data, dtype=int)
  82. r = rankdata(a)
  83. assert_array_equal(r, expected)
  84. r = rankdata(data)
  85. assert_array_equal(r, expected)
  86. data = [40, 10, 30, 10, 50]
  87. expected = np.array([4.0, 1.5, 3.0, 1.5, 5.0], dtype=np.float64)
  88. a = np.array(data, dtype=int)
  89. r = rankdata(a)
  90. assert_array_equal(r, expected)
  91. r = rankdata(data)
  92. assert_array_equal(r, expected)
  93. data = [20, 20, 20, 10, 10, 10]
  94. expected = np.array([5.0, 5.0, 5.0, 2.0, 2.0, 2.0], dtype=np.float64)
  95. a = np.array(data, dtype=int)
  96. r = rankdata(a)
  97. assert_array_equal(r, expected)
  98. r = rankdata(data)
  99. assert_array_equal(r, expected)
  100. # The docstring states explicitly that the argument is flattened.
  101. a2d = a.reshape(2, 3)
  102. r = rankdata(a2d)
  103. assert_array_equal(r, expected)
  104. def test_rankdata_object_string(self):
  105. min_rank = lambda a: [1 + sum(i < j for i in a) for j in a]
  106. max_rank = lambda a: [sum(i <= j for i in a) for j in a]
  107. ordinal_rank = lambda a: min_rank([(x, i) for i, x in enumerate(a)])
  108. def average_rank(a):
  109. return [(i + j) / 2.0 for i, j in zip(min_rank(a), max_rank(a))]
  110. def dense_rank(a):
  111. b = np.unique(a)
  112. return [1 + sum(i < j for i in b) for j in a]
  113. rankf = dict(min=min_rank, max=max_rank, ordinal=ordinal_rank,
  114. average=average_rank, dense=dense_rank)
  115. def check_ranks(a):
  116. for method in 'min', 'max', 'dense', 'ordinal', 'average':
  117. out = rankdata(a, method=method)
  118. assert_array_equal(out, rankf[method](a))
  119. val = ['foo', 'bar', 'qux', 'xyz', 'abc', 'efg', 'ace', 'qwe', 'qaz']
  120. check_ranks(np.random.choice(val, 200))
  121. check_ranks(np.random.choice(val, 200).astype('object'))
  122. val = np.array([0, 1, 2, 2.718, 3, 3.141], dtype='object')
  123. check_ranks(np.random.choice(val, 200).astype('object'))
  124. def test_large_int(self):
  125. data = np.array([2**60, 2**60+1], dtype=np.uint64)
  126. r = rankdata(data)
  127. assert_array_equal(r, [1.0, 2.0])
  128. data = np.array([2**60, 2**60+1], dtype=np.int64)
  129. r = rankdata(data)
  130. assert_array_equal(r, [1.0, 2.0])
  131. data = np.array([2**60, -2**60+1], dtype=np.int64)
  132. r = rankdata(data)
  133. assert_array_equal(r, [2.0, 1.0])
  134. def test_big_tie(self):
  135. for n in [10000, 100000, 1000000]:
  136. data = np.ones(n, dtype=int)
  137. r = rankdata(data)
  138. expected_rank = 0.5 * (n + 1)
  139. assert_array_equal(r, expected_rank * data,
  140. "test failed with n=%d" % n)
  141. def test_axis(self):
  142. data = [[0, 2, 1],
  143. [4, 2, 2]]
  144. expected0 = [[1., 1.5, 1.],
  145. [2., 1.5, 2.]]
  146. r0 = rankdata(data, axis=0)
  147. assert_array_equal(r0, expected0)
  148. expected1 = [[1., 3., 2.],
  149. [3., 1.5, 1.5]]
  150. r1 = rankdata(data, axis=1)
  151. assert_array_equal(r1, expected1)
  152. methods = ["average", "min", "max", "dense", "ordinal"]
  153. dtypes = [np.float64] + [np.int_]*4
  154. @pytest.mark.parametrize("axis", [0, 1])
  155. @pytest.mark.parametrize("method, dtype", zip(methods, dtypes))
  156. def test_size_0_axis(self, axis, method, dtype):
  157. shape = (3, 0)
  158. data = np.zeros(shape)
  159. r = rankdata(data, method=method, axis=axis)
  160. assert_equal(r.shape, shape)
  161. assert_equal(r.dtype, dtype)
  162. @pytest.mark.parametrize('axis', range(3))
  163. @pytest.mark.parametrize('method', methods)
  164. def test_nan_policy_omit_3d(self, axis, method):
  165. shape = (20, 21, 22)
  166. rng = np.random.default_rng(abs(hash('falafel')))
  167. a = rng.random(size=shape)
  168. i = rng.random(size=shape) < 0.4
  169. j = rng.random(size=shape) < 0.1
  170. k = rng.random(size=shape) < 0.1
  171. a[i] = np.nan
  172. a[j] = -np.inf
  173. a[k] - np.inf
  174. def rank_1d_omit(a, method):
  175. out = np.zeros_like(a)
  176. i = np.isnan(a)
  177. a_compressed = a[~i]
  178. res = rankdata(a_compressed, method)
  179. out[~i] = res
  180. out[i] = np.nan
  181. return out
  182. def rank_omit(a, method, axis):
  183. return np.apply_along_axis(lambda a: rank_1d_omit(a, method),
  184. axis, a)
  185. res = rankdata(a, method, axis=axis, nan_policy='omit')
  186. res0 = rank_omit(a, method, axis=axis)
  187. assert_array_equal(res, res0)
  188. def test_nan_policy_2d_axis_none(self):
  189. # 2 2d-array test with axis=None
  190. data = [[0, np.nan, 3],
  191. [4, 2, np.nan],
  192. [1, 2, 2]]
  193. assert_array_equal(rankdata(data, axis=None, nan_policy='omit'),
  194. [1., np.nan, 6., 7., 4., np.nan, 2., 4., 4.])
  195. assert_array_equal(rankdata(data, axis=None, nan_policy='propagate'),
  196. [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan,
  197. np.nan, np.nan, np.nan])
  198. def test_nan_policy_raise(self):
  199. # 1 1d-array test
  200. data = [0, 2, 3, -2, np.nan, np.nan]
  201. with pytest.raises(ValueError, match="The input contains nan"):
  202. rankdata(data, nan_policy='raise')
  203. # 2 2d-array test
  204. data = [[0, np.nan, 3],
  205. [4, 2, np.nan],
  206. [np.nan, 2, 2]]
  207. with pytest.raises(ValueError, match="The input contains nan"):
  208. rankdata(data, axis=0, nan_policy="raise")
  209. with pytest.raises(ValueError, match="The input contains nan"):
  210. rankdata(data, axis=1, nan_policy="raise")
  211. def test_nan_policy_propagate(self):
  212. # 1 1d-array test
  213. data = [0, 2, 3, -2, np.nan, np.nan]
  214. assert_array_equal(rankdata(data, nan_policy='propagate'),
  215. [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan])
  216. # 2 2d-array test
  217. data = [[0, np.nan, 3],
  218. [4, 2, np.nan],
  219. [1, 2, 2]]
  220. assert_array_equal(rankdata(data, axis=0, nan_policy='propagate'),
  221. [[1, np.nan, np.nan],
  222. [3, np.nan, np.nan],
  223. [2, np.nan, np.nan]])
  224. assert_array_equal(rankdata(data, axis=1, nan_policy='propagate'),
  225. [[np.nan, np.nan, np.nan],
  226. [np.nan, np.nan, np.nan],
  227. [1, 2.5, 2.5]])
  228. _cases = (
  229. # values, method, expected
  230. ([], 'average', []),
  231. ([], 'min', []),
  232. ([], 'max', []),
  233. ([], 'dense', []),
  234. ([], 'ordinal', []),
  235. #
  236. ([100], 'average', [1.0]),
  237. ([100], 'min', [1.0]),
  238. ([100], 'max', [1.0]),
  239. ([100], 'dense', [1.0]),
  240. ([100], 'ordinal', [1.0]),
  241. #
  242. ([100, 100, 100], 'average', [2.0, 2.0, 2.0]),
  243. ([100, 100, 100], 'min', [1.0, 1.0, 1.0]),
  244. ([100, 100, 100], 'max', [3.0, 3.0, 3.0]),
  245. ([100, 100, 100], 'dense', [1.0, 1.0, 1.0]),
  246. ([100, 100, 100], 'ordinal', [1.0, 2.0, 3.0]),
  247. #
  248. ([100, 300, 200], 'average', [1.0, 3.0, 2.0]),
  249. ([100, 300, 200], 'min', [1.0, 3.0, 2.0]),
  250. ([100, 300, 200], 'max', [1.0, 3.0, 2.0]),
  251. ([100, 300, 200], 'dense', [1.0, 3.0, 2.0]),
  252. ([100, 300, 200], 'ordinal', [1.0, 3.0, 2.0]),
  253. #
  254. ([100, 200, 300, 200], 'average', [1.0, 2.5, 4.0, 2.5]),
  255. ([100, 200, 300, 200], 'min', [1.0, 2.0, 4.0, 2.0]),
  256. ([100, 200, 300, 200], 'max', [1.0, 3.0, 4.0, 3.0]),
  257. ([100, 200, 300, 200], 'dense', [1.0, 2.0, 3.0, 2.0]),
  258. ([100, 200, 300, 200], 'ordinal', [1.0, 2.0, 4.0, 3.0]),
  259. #
  260. ([100, 200, 300, 200, 100], 'average', [1.5, 3.5, 5.0, 3.5, 1.5]),
  261. ([100, 200, 300, 200, 100], 'min', [1.0, 3.0, 5.0, 3.0, 1.0]),
  262. ([100, 200, 300, 200, 100], 'max', [2.0, 4.0, 5.0, 4.0, 2.0]),
  263. ([100, 200, 300, 200, 100], 'dense', [1.0, 2.0, 3.0, 2.0, 1.0]),
  264. ([100, 200, 300, 200, 100], 'ordinal', [1.0, 3.0, 5.0, 4.0, 2.0]),
  265. #
  266. ([10] * 30, 'ordinal', np.arange(1.0, 31.0)),
  267. )
  268. def test_cases():
  269. for values, method, expected in _cases:
  270. r = rankdata(values, method=method)
  271. assert_array_equal(r, expected)