test_rank.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511
  1. from itertools import chain
  2. import operator
  3. import numpy as np
  4. import pytest
  5. from pandas._libs.algos import (
  6. Infinity,
  7. NegInfinity,
  8. )
  9. import pandas.util._test_decorators as td
  10. from pandas import (
  11. NA,
  12. NaT,
  13. Series,
  14. Timestamp,
  15. date_range,
  16. )
  17. import pandas._testing as tm
  18. from pandas.api.types import CategoricalDtype
  19. @pytest.fixture
  20. def ser():
  21. return Series([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])
  22. @pytest.fixture(
  23. params=[
  24. ["average", np.array([1.5, 5.5, 7.0, 3.5, np.nan, 3.5, 1.5, 8.0, np.nan, 5.5])],
  25. ["min", np.array([1, 5, 7, 3, np.nan, 3, 1, 8, np.nan, 5])],
  26. ["max", np.array([2, 6, 7, 4, np.nan, 4, 2, 8, np.nan, 6])],
  27. ["first", np.array([1, 5, 7, 3, np.nan, 4, 2, 8, np.nan, 6])],
  28. ["dense", np.array([1, 3, 4, 2, np.nan, 2, 1, 5, np.nan, 3])],
  29. ]
  30. )
  31. def results(request):
  32. return request.param
  33. @pytest.fixture(
  34. params=[
  35. "object",
  36. "float64",
  37. "int64",
  38. "Float64",
  39. "Int64",
  40. pytest.param("float64[pyarrow]", marks=td.skip_if_no("pyarrow")),
  41. pytest.param("int64[pyarrow]", marks=td.skip_if_no("pyarrow")),
  42. ]
  43. )
  44. def dtype(request):
  45. return request.param
  46. class TestSeriesRank:
  47. @td.skip_if_no_scipy
  48. def test_rank(self, datetime_series):
  49. from scipy.stats import rankdata
  50. datetime_series[::2] = np.nan
  51. datetime_series[:10:3] = 4.0
  52. ranks = datetime_series.rank()
  53. oranks = datetime_series.astype("O").rank()
  54. tm.assert_series_equal(ranks, oranks)
  55. mask = np.isnan(datetime_series)
  56. filled = datetime_series.fillna(np.inf)
  57. # rankdata returns a ndarray
  58. exp = Series(rankdata(filled), index=filled.index, name="ts")
  59. exp[mask] = np.nan
  60. tm.assert_series_equal(ranks, exp)
  61. iseries = Series(np.arange(5).repeat(2))
  62. iranks = iseries.rank()
  63. exp = iseries.astype(float).rank()
  64. tm.assert_series_equal(iranks, exp)
  65. iseries = Series(np.arange(5)) + 1.0
  66. exp = iseries / 5.0
  67. iranks = iseries.rank(pct=True)
  68. tm.assert_series_equal(iranks, exp)
  69. iseries = Series(np.repeat(1, 100))
  70. exp = Series(np.repeat(0.505, 100))
  71. iranks = iseries.rank(pct=True)
  72. tm.assert_series_equal(iranks, exp)
  73. # Explicit cast to float to avoid implicit cast when setting nan
  74. iseries = iseries.astype("float")
  75. iseries[1] = np.nan
  76. exp = Series(np.repeat(50.0 / 99.0, 100))
  77. exp[1] = np.nan
  78. iranks = iseries.rank(pct=True)
  79. tm.assert_series_equal(iranks, exp)
  80. iseries = Series(np.arange(5)) + 1.0
  81. iseries[4] = np.nan
  82. exp = iseries / 4.0
  83. iranks = iseries.rank(pct=True)
  84. tm.assert_series_equal(iranks, exp)
  85. iseries = Series(np.repeat(np.nan, 100))
  86. exp = iseries.copy()
  87. iranks = iseries.rank(pct=True)
  88. tm.assert_series_equal(iranks, exp)
  89. # Explicit cast to float to avoid implicit cast when setting nan
  90. iseries = Series(np.arange(5), dtype="float") + 1
  91. iseries[4] = np.nan
  92. exp = iseries / 4.0
  93. iranks = iseries.rank(pct=True)
  94. tm.assert_series_equal(iranks, exp)
  95. rng = date_range("1/1/1990", periods=5)
  96. # Explicit cast to float to avoid implicit cast when setting nan
  97. iseries = Series(np.arange(5), rng, dtype="float") + 1
  98. iseries.iloc[4] = np.nan
  99. exp = iseries / 4.0
  100. iranks = iseries.rank(pct=True)
  101. tm.assert_series_equal(iranks, exp)
  102. iseries = Series([1e-50, 1e-100, 1e-20, 1e-2, 1e-20 + 1e-30, 1e-1])
  103. exp = Series([2, 1, 3, 5, 4, 6.0])
  104. iranks = iseries.rank()
  105. tm.assert_series_equal(iranks, exp)
  106. # GH 5968
  107. iseries = Series(["3 day", "1 day 10m", "-2 day", NaT], dtype="m8[ns]")
  108. exp = Series([3, 2, 1, np.nan])
  109. iranks = iseries.rank()
  110. tm.assert_series_equal(iranks, exp)
  111. values = np.array(
  112. [-50, -1, -1e-20, -1e-25, -1e-50, 0, 1e-40, 1e-20, 1e-10, 2, 40],
  113. dtype="float64",
  114. )
  115. random_order = np.random.permutation(len(values))
  116. iseries = Series(values[random_order])
  117. exp = Series(random_order + 1.0, dtype="float64")
  118. iranks = iseries.rank()
  119. tm.assert_series_equal(iranks, exp)
  120. def test_rank_categorical(self):
  121. # GH issue #15420 rank incorrectly orders ordered categories
  122. # Test ascending/descending ranking for ordered categoricals
  123. exp = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
  124. exp_desc = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
  125. ordered = Series(
  126. ["first", "second", "third", "fourth", "fifth", "sixth"]
  127. ).astype(
  128. CategoricalDtype(
  129. categories=["first", "second", "third", "fourth", "fifth", "sixth"],
  130. ordered=True,
  131. )
  132. )
  133. tm.assert_series_equal(ordered.rank(), exp)
  134. tm.assert_series_equal(ordered.rank(ascending=False), exp_desc)
  135. # Unordered categoricals should be ranked as objects
  136. unordered = Series(
  137. ["first", "second", "third", "fourth", "fifth", "sixth"]
  138. ).astype(
  139. CategoricalDtype(
  140. categories=["first", "second", "third", "fourth", "fifth", "sixth"],
  141. ordered=False,
  142. )
  143. )
  144. exp_unordered = Series([2.0, 4.0, 6.0, 3.0, 1.0, 5.0])
  145. res = unordered.rank()
  146. tm.assert_series_equal(res, exp_unordered)
  147. unordered1 = Series([1, 2, 3, 4, 5, 6]).astype(
  148. CategoricalDtype([1, 2, 3, 4, 5, 6], False)
  149. )
  150. exp_unordered1 = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0])
  151. res1 = unordered1.rank()
  152. tm.assert_series_equal(res1, exp_unordered1)
  153. # Test na_option for rank data
  154. na_ser = Series(
  155. ["first", "second", "third", "fourth", "fifth", "sixth", np.NaN]
  156. ).astype(
  157. CategoricalDtype(
  158. ["first", "second", "third", "fourth", "fifth", "sixth", "seventh"],
  159. True,
  160. )
  161. )
  162. exp_top = Series([2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 1.0])
  163. exp_bot = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0])
  164. exp_keep = Series([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, np.NaN])
  165. tm.assert_series_equal(na_ser.rank(na_option="top"), exp_top)
  166. tm.assert_series_equal(na_ser.rank(na_option="bottom"), exp_bot)
  167. tm.assert_series_equal(na_ser.rank(na_option="keep"), exp_keep)
  168. # Test na_option for rank data with ascending False
  169. exp_top = Series([7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0])
  170. exp_bot = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, 7.0])
  171. exp_keep = Series([6.0, 5.0, 4.0, 3.0, 2.0, 1.0, np.NaN])
  172. tm.assert_series_equal(na_ser.rank(na_option="top", ascending=False), exp_top)
  173. tm.assert_series_equal(
  174. na_ser.rank(na_option="bottom", ascending=False), exp_bot
  175. )
  176. tm.assert_series_equal(na_ser.rank(na_option="keep", ascending=False), exp_keep)
  177. # Test invalid values for na_option
  178. msg = "na_option must be one of 'keep', 'top', or 'bottom'"
  179. with pytest.raises(ValueError, match=msg):
  180. na_ser.rank(na_option="bad", ascending=False)
  181. # invalid type
  182. with pytest.raises(ValueError, match=msg):
  183. na_ser.rank(na_option=True, ascending=False)
  184. # Test with pct=True
  185. na_ser = Series(["first", "second", "third", "fourth", np.NaN]).astype(
  186. CategoricalDtype(["first", "second", "third", "fourth"], True)
  187. )
  188. exp_top = Series([0.4, 0.6, 0.8, 1.0, 0.2])
  189. exp_bot = Series([0.2, 0.4, 0.6, 0.8, 1.0])
  190. exp_keep = Series([0.25, 0.5, 0.75, 1.0, np.NaN])
  191. tm.assert_series_equal(na_ser.rank(na_option="top", pct=True), exp_top)
  192. tm.assert_series_equal(na_ser.rank(na_option="bottom", pct=True), exp_bot)
  193. tm.assert_series_equal(na_ser.rank(na_option="keep", pct=True), exp_keep)
  194. def test_rank_signature(self):
  195. s = Series([0, 1])
  196. s.rank(method="average")
  197. msg = "No axis named average for object type Series"
  198. with pytest.raises(ValueError, match=msg):
  199. s.rank("average")
  200. @pytest.mark.parametrize("dtype", [None, object])
  201. def test_rank_tie_methods(self, ser, results, dtype):
  202. method, exp = results
  203. ser = ser if dtype is None else ser.astype(dtype)
  204. result = ser.rank(method=method)
  205. tm.assert_series_equal(result, Series(exp))
  206. @td.skip_if_no_scipy
  207. @pytest.mark.parametrize("ascending", [True, False])
  208. @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
  209. @pytest.mark.parametrize("na_option", ["top", "bottom", "keep"])
  210. @pytest.mark.parametrize(
  211. "dtype, na_value, pos_inf, neg_inf",
  212. [
  213. ("object", None, Infinity(), NegInfinity()),
  214. ("float64", np.nan, np.inf, -np.inf),
  215. ("Float64", NA, np.inf, -np.inf),
  216. pytest.param(
  217. "float64[pyarrow]",
  218. NA,
  219. np.inf,
  220. -np.inf,
  221. marks=td.skip_if_no("pyarrow"),
  222. ),
  223. ],
  224. )
  225. def test_rank_tie_methods_on_infs_nans(
  226. self, method, na_option, ascending, dtype, na_value, pos_inf, neg_inf
  227. ):
  228. if dtype == "float64[pyarrow]":
  229. if method == "average":
  230. exp_dtype = "float64[pyarrow]"
  231. else:
  232. exp_dtype = "uint64[pyarrow]"
  233. else:
  234. exp_dtype = "float64"
  235. chunk = 3
  236. in_arr = [neg_inf] * chunk + [na_value] * chunk + [pos_inf] * chunk
  237. iseries = Series(in_arr, dtype=dtype)
  238. exp_ranks = {
  239. "average": ([2, 2, 2], [5, 5, 5], [8, 8, 8]),
  240. "min": ([1, 1, 1], [4, 4, 4], [7, 7, 7]),
  241. "max": ([3, 3, 3], [6, 6, 6], [9, 9, 9]),
  242. "first": ([1, 2, 3], [4, 5, 6], [7, 8, 9]),
  243. "dense": ([1, 1, 1], [2, 2, 2], [3, 3, 3]),
  244. }
  245. ranks = exp_ranks[method]
  246. if na_option == "top":
  247. order = [ranks[1], ranks[0], ranks[2]]
  248. elif na_option == "bottom":
  249. order = [ranks[0], ranks[2], ranks[1]]
  250. else:
  251. order = [ranks[0], [np.nan] * chunk, ranks[1]]
  252. expected = order if ascending else order[::-1]
  253. expected = list(chain.from_iterable(expected))
  254. result = iseries.rank(method=method, na_option=na_option, ascending=ascending)
  255. tm.assert_series_equal(result, Series(expected, dtype=exp_dtype))
  256. def test_rank_desc_mix_nans_infs(self):
  257. # GH 19538
  258. # check descending ranking when mix nans and infs
  259. iseries = Series([1, np.nan, np.inf, -np.inf, 25])
  260. result = iseries.rank(ascending=False)
  261. exp = Series([3, np.nan, 1, 4, 2], dtype="float64")
  262. tm.assert_series_equal(result, exp)
  263. @td.skip_if_no_scipy
  264. @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
  265. @pytest.mark.parametrize(
  266. "op, value",
  267. [
  268. [operator.add, 0],
  269. [operator.add, 1e6],
  270. [operator.mul, 1e-6],
  271. ],
  272. )
  273. def test_rank_methods_series(self, method, op, value):
  274. from scipy.stats import rankdata
  275. xs = np.random.randn(9)
  276. xs = np.concatenate([xs[i:] for i in range(0, 9, 2)]) # add duplicates
  277. np.random.shuffle(xs)
  278. index = [chr(ord("a") + i) for i in range(len(xs))]
  279. vals = op(xs, value)
  280. ts = Series(vals, index=index)
  281. result = ts.rank(method=method)
  282. sprank = rankdata(vals, method if method != "first" else "ordinal")
  283. expected = Series(sprank, index=index).astype("float64")
  284. tm.assert_series_equal(result, expected)
  285. @pytest.mark.parametrize(
  286. "ser, exp",
  287. [
  288. ([1], [1]),
  289. ([2], [1]),
  290. ([0], [1]),
  291. ([2, 2], [1, 1]),
  292. ([1, 2, 3], [1, 2, 3]),
  293. ([4, 2, 1], [3, 2, 1]),
  294. ([1, 1, 5, 5, 3], [1, 1, 3, 3, 2]),
  295. ([-5, -4, -3, -2, -1], [1, 2, 3, 4, 5]),
  296. ],
  297. )
  298. def test_rank_dense_method(self, dtype, ser, exp):
  299. s = Series(ser).astype(dtype)
  300. result = s.rank(method="dense")
  301. expected = Series(exp).astype(result.dtype)
  302. tm.assert_series_equal(result, expected)
  303. def test_rank_descending(self, ser, results, dtype):
  304. method, _ = results
  305. if "i" in dtype:
  306. s = ser.dropna()
  307. else:
  308. s = ser.astype(dtype)
  309. res = s.rank(ascending=False)
  310. expected = (s.max() - s).rank()
  311. tm.assert_series_equal(res, expected)
  312. expected = (s.max() - s).rank(method=method)
  313. res2 = s.rank(method=method, ascending=False)
  314. tm.assert_series_equal(res2, expected)
  315. def test_rank_int(self, ser, results):
  316. method, exp = results
  317. s = ser.dropna().astype("i8")
  318. result = s.rank(method=method)
  319. expected = Series(exp).dropna()
  320. expected.index = result.index
  321. tm.assert_series_equal(result, expected)
  322. def test_rank_object_bug(self):
  323. # GH 13445
  324. # smoke tests
  325. Series([np.nan] * 32).astype(object).rank(ascending=True)
  326. Series([np.nan] * 32).astype(object).rank(ascending=False)
  327. def test_rank_modify_inplace(self):
  328. # GH 18521
  329. # Check rank does not mutate series
  330. s = Series([Timestamp("2017-01-05 10:20:27.569000"), NaT])
  331. expected = s.copy()
  332. s.rank()
  333. result = s
  334. tm.assert_series_equal(result, expected)
  335. # GH15630, pct should be on 100% basis when method='dense'
  336. @pytest.mark.parametrize(
  337. "ser, exp",
  338. [
  339. ([1], [1.0]),
  340. ([1, 2], [1.0 / 2, 2.0 / 2]),
  341. ([2, 2], [1.0, 1.0]),
  342. ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]),
  343. ([1, 2, 2], [1.0 / 2, 2.0 / 2, 2.0 / 2]),
  344. ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]),
  345. ([1, 1, 5, 5, 3], [1.0 / 3, 1.0 / 3, 3.0 / 3, 3.0 / 3, 2.0 / 3]),
  346. ([1, 1, 3, 3, 5, 5], [1.0 / 3, 1.0 / 3, 2.0 / 3, 2.0 / 3, 3.0 / 3, 3.0 / 3]),
  347. ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]),
  348. ],
  349. )
  350. def test_rank_dense_pct(dtype, ser, exp):
  351. s = Series(ser).astype(dtype)
  352. result = s.rank(method="dense", pct=True)
  353. expected = Series(exp).astype(result.dtype)
  354. tm.assert_series_equal(result, expected)
  355. @pytest.mark.parametrize(
  356. "ser, exp",
  357. [
  358. ([1], [1.0]),
  359. ([1, 2], [1.0 / 2, 2.0 / 2]),
  360. ([2, 2], [1.0 / 2, 1.0 / 2]),
  361. ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]),
  362. ([1, 2, 2], [1.0 / 3, 2.0 / 3, 2.0 / 3]),
  363. ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]),
  364. ([1, 1, 5, 5, 3], [1.0 / 5, 1.0 / 5, 4.0 / 5, 4.0 / 5, 3.0 / 5]),
  365. ([1, 1, 3, 3, 5, 5], [1.0 / 6, 1.0 / 6, 3.0 / 6, 3.0 / 6, 5.0 / 6, 5.0 / 6]),
  366. ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]),
  367. ],
  368. )
  369. def test_rank_min_pct(dtype, ser, exp):
  370. s = Series(ser).astype(dtype)
  371. result = s.rank(method="min", pct=True)
  372. expected = Series(exp).astype(result.dtype)
  373. tm.assert_series_equal(result, expected)
  374. @pytest.mark.parametrize(
  375. "ser, exp",
  376. [
  377. ([1], [1.0]),
  378. ([1, 2], [1.0 / 2, 2.0 / 2]),
  379. ([2, 2], [1.0, 1.0]),
  380. ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]),
  381. ([1, 2, 2], [1.0 / 3, 3.0 / 3, 3.0 / 3]),
  382. ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]),
  383. ([1, 1, 5, 5, 3], [2.0 / 5, 2.0 / 5, 5.0 / 5, 5.0 / 5, 3.0 / 5]),
  384. ([1, 1, 3, 3, 5, 5], [2.0 / 6, 2.0 / 6, 4.0 / 6, 4.0 / 6, 6.0 / 6, 6.0 / 6]),
  385. ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]),
  386. ],
  387. )
  388. def test_rank_max_pct(dtype, ser, exp):
  389. s = Series(ser).astype(dtype)
  390. result = s.rank(method="max", pct=True)
  391. expected = Series(exp).astype(result.dtype)
  392. tm.assert_series_equal(result, expected)
  393. @pytest.mark.parametrize(
  394. "ser, exp",
  395. [
  396. ([1], [1.0]),
  397. ([1, 2], [1.0 / 2, 2.0 / 2]),
  398. ([2, 2], [1.5 / 2, 1.5 / 2]),
  399. ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]),
  400. ([1, 2, 2], [1.0 / 3, 2.5 / 3, 2.5 / 3]),
  401. ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]),
  402. ([1, 1, 5, 5, 3], [1.5 / 5, 1.5 / 5, 4.5 / 5, 4.5 / 5, 3.0 / 5]),
  403. ([1, 1, 3, 3, 5, 5], [1.5 / 6, 1.5 / 6, 3.5 / 6, 3.5 / 6, 5.5 / 6, 5.5 / 6]),
  404. ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]),
  405. ],
  406. )
  407. def test_rank_average_pct(dtype, ser, exp):
  408. s = Series(ser).astype(dtype)
  409. result = s.rank(method="average", pct=True)
  410. expected = Series(exp).astype(result.dtype)
  411. tm.assert_series_equal(result, expected)
  412. @pytest.mark.parametrize(
  413. "ser, exp",
  414. [
  415. ([1], [1.0]),
  416. ([1, 2], [1.0 / 2, 2.0 / 2]),
  417. ([2, 2], [1.0 / 2, 2.0 / 2.0]),
  418. ([1, 2, 3], [1.0 / 3, 2.0 / 3, 3.0 / 3]),
  419. ([1, 2, 2], [1.0 / 3, 2.0 / 3, 3.0 / 3]),
  420. ([4, 2, 1], [3.0 / 3, 2.0 / 3, 1.0 / 3]),
  421. ([1, 1, 5, 5, 3], [1.0 / 5, 2.0 / 5, 4.0 / 5, 5.0 / 5, 3.0 / 5]),
  422. ([1, 1, 3, 3, 5, 5], [1.0 / 6, 2.0 / 6, 3.0 / 6, 4.0 / 6, 5.0 / 6, 6.0 / 6]),
  423. ([-5, -4, -3, -2, -1], [1.0 / 5, 2.0 / 5, 3.0 / 5, 4.0 / 5, 5.0 / 5]),
  424. ],
  425. )
  426. def test_rank_first_pct(dtype, ser, exp):
  427. s = Series(ser).astype(dtype)
  428. result = s.rank(method="first", pct=True)
  429. expected = Series(exp).astype(result.dtype)
  430. tm.assert_series_equal(result, expected)
  431. @pytest.mark.single_cpu
  432. def test_pct_max_many_rows():
  433. # GH 18271
  434. s = Series(np.arange(2**24 + 1))
  435. result = s.rank(pct=True).max()
  436. assert result == 1