test_quantile.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. DataFrame,
  6. Index,
  7. )
  8. import pandas._testing as tm
  9. @pytest.mark.parametrize(
  10. "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
  11. )
  12. @pytest.mark.parametrize(
  13. "a_vals,b_vals",
  14. [
  15. # Ints
  16. ([1, 2, 3, 4, 5], [5, 4, 3, 2, 1]),
  17. ([1, 2, 3, 4], [4, 3, 2, 1]),
  18. ([1, 2, 3, 4, 5], [4, 3, 2, 1]),
  19. # Floats
  20. ([1.0, 2.0, 3.0, 4.0, 5.0], [5.0, 4.0, 3.0, 2.0, 1.0]),
  21. # Missing data
  22. ([1.0, np.nan, 3.0, np.nan, 5.0], [5.0, np.nan, 3.0, np.nan, 1.0]),
  23. ([np.nan, 4.0, np.nan, 2.0, np.nan], [np.nan, 4.0, np.nan, 2.0, np.nan]),
  24. # Timestamps
  25. (
  26. pd.date_range("1/1/18", freq="D", periods=5),
  27. pd.date_range("1/1/18", freq="D", periods=5)[::-1],
  28. ),
  29. (
  30. pd.date_range("1/1/18", freq="D", periods=5).as_unit("s"),
  31. pd.date_range("1/1/18", freq="D", periods=5)[::-1].as_unit("s"),
  32. ),
  33. # All NA
  34. ([np.nan] * 5, [np.nan] * 5),
  35. ],
  36. )
  37. @pytest.mark.parametrize("q", [0, 0.25, 0.5, 0.75, 1])
  38. def test_quantile(interpolation, a_vals, b_vals, q, request):
  39. if (
  40. interpolation == "nearest"
  41. and q == 0.5
  42. and isinstance(b_vals, list)
  43. and b_vals == [4, 3, 2, 1]
  44. ):
  45. request.node.add_marker(
  46. pytest.mark.xfail(
  47. reason="Unclear numpy expectation for nearest "
  48. "result with equidistant data"
  49. )
  50. )
  51. all_vals = pd.concat([pd.Series(a_vals), pd.Series(b_vals)])
  52. a_expected = pd.Series(a_vals).quantile(q, interpolation=interpolation)
  53. b_expected = pd.Series(b_vals).quantile(q, interpolation=interpolation)
  54. df = DataFrame({"key": ["a"] * len(a_vals) + ["b"] * len(b_vals), "val": all_vals})
  55. expected = DataFrame(
  56. [a_expected, b_expected], columns=["val"], index=Index(["a", "b"], name="key")
  57. )
  58. if all_vals.dtype.kind == "M" and expected.dtypes.values[0].kind == "M":
  59. # TODO(non-nano): this should be unnecessary once array_to_datetime
  60. # correctly infers non-nano from Timestamp.unit
  61. expected = expected.astype(all_vals.dtype)
  62. result = df.groupby("key").quantile(q, interpolation=interpolation)
  63. tm.assert_frame_equal(result, expected)
  64. def test_quantile_array():
  65. # https://github.com/pandas-dev/pandas/issues/27526
  66. df = DataFrame({"A": [0, 1, 2, 3, 4]})
  67. key = np.array([0, 0, 1, 1, 1], dtype=np.int64)
  68. result = df.groupby(key).quantile([0.25])
  69. index = pd.MultiIndex.from_product([[0, 1], [0.25]])
  70. expected = DataFrame({"A": [0.25, 2.50]}, index=index)
  71. tm.assert_frame_equal(result, expected)
  72. df = DataFrame({"A": [0, 1, 2, 3], "B": [4, 5, 6, 7]})
  73. index = pd.MultiIndex.from_product([[0, 1], [0.25, 0.75]])
  74. key = np.array([0, 0, 1, 1], dtype=np.int64)
  75. result = df.groupby(key).quantile([0.25, 0.75])
  76. expected = DataFrame(
  77. {"A": [0.25, 0.75, 2.25, 2.75], "B": [4.25, 4.75, 6.25, 6.75]}, index=index
  78. )
  79. tm.assert_frame_equal(result, expected)
  80. def test_quantile_array2():
  81. # https://github.com/pandas-dev/pandas/pull/28085#issuecomment-524066959
  82. arr = np.random.RandomState(0).randint(0, 5, size=(10, 3), dtype=np.int64)
  83. df = DataFrame(arr, columns=list("ABC"))
  84. result = df.groupby("A").quantile([0.3, 0.7])
  85. expected = DataFrame(
  86. {
  87. "B": [0.9, 2.1, 2.2, 3.4, 1.6, 2.4, 2.3, 2.7, 0.0, 0.0],
  88. "C": [1.2, 2.8, 1.8, 3.0, 0.0, 0.0, 1.9, 3.1, 3.0, 3.0],
  89. },
  90. index=pd.MultiIndex.from_product(
  91. [[0, 1, 2, 3, 4], [0.3, 0.7]], names=["A", None]
  92. ),
  93. )
  94. tm.assert_frame_equal(result, expected)
  95. def test_quantile_array_no_sort():
  96. df = DataFrame({"A": [0, 1, 2], "B": [3, 4, 5]})
  97. key = np.array([1, 0, 1], dtype=np.int64)
  98. result = df.groupby(key, sort=False).quantile([0.25, 0.5, 0.75])
  99. expected = DataFrame(
  100. {"A": [0.5, 1.0, 1.5, 1.0, 1.0, 1.0], "B": [3.5, 4.0, 4.5, 4.0, 4.0, 4.0]},
  101. index=pd.MultiIndex.from_product([[1, 0], [0.25, 0.5, 0.75]]),
  102. )
  103. tm.assert_frame_equal(result, expected)
  104. result = df.groupby(key, sort=False).quantile([0.75, 0.25])
  105. expected = DataFrame(
  106. {"A": [1.5, 0.5, 1.0, 1.0], "B": [4.5, 3.5, 4.0, 4.0]},
  107. index=pd.MultiIndex.from_product([[1, 0], [0.75, 0.25]]),
  108. )
  109. tm.assert_frame_equal(result, expected)
  110. def test_quantile_array_multiple_levels():
  111. df = DataFrame(
  112. {"A": [0, 1, 2], "B": [3, 4, 5], "c": ["a", "a", "a"], "d": ["a", "a", "b"]}
  113. )
  114. result = df.groupby(["c", "d"]).quantile([0.25, 0.75])
  115. index = pd.MultiIndex.from_tuples(
  116. [("a", "a", 0.25), ("a", "a", 0.75), ("a", "b", 0.25), ("a", "b", 0.75)],
  117. names=["c", "d", None],
  118. )
  119. expected = DataFrame(
  120. {"A": [0.25, 0.75, 2.0, 2.0], "B": [3.25, 3.75, 5.0, 5.0]}, index=index
  121. )
  122. tm.assert_frame_equal(result, expected)
  123. @pytest.mark.parametrize("frame_size", [(2, 3), (100, 10)])
  124. @pytest.mark.parametrize("groupby", [[0], [0, 1]])
  125. @pytest.mark.parametrize("q", [[0.5, 0.6]])
  126. def test_groupby_quantile_with_arraylike_q_and_int_columns(frame_size, groupby, q):
  127. # GH30289
  128. nrow, ncol = frame_size
  129. df = DataFrame(np.array([ncol * [_ % 4] for _ in range(nrow)]), columns=range(ncol))
  130. idx_levels = [np.arange(min(nrow, 4))] * len(groupby) + [q]
  131. idx_codes = [[x for x in range(min(nrow, 4)) for _ in q]] * len(groupby) + [
  132. list(range(len(q))) * min(nrow, 4)
  133. ]
  134. expected_index = pd.MultiIndex(
  135. levels=idx_levels, codes=idx_codes, names=groupby + [None]
  136. )
  137. expected_values = [
  138. [float(x)] * (ncol - len(groupby)) for x in range(min(nrow, 4)) for _ in q
  139. ]
  140. expected_columns = [x for x in range(ncol) if x not in groupby]
  141. expected = DataFrame(
  142. expected_values, index=expected_index, columns=expected_columns
  143. )
  144. result = df.groupby(groupby).quantile(q)
  145. tm.assert_frame_equal(result, expected)
  146. def test_quantile_raises():
  147. df = DataFrame([["foo", "a"], ["foo", "b"], ["foo", "c"]], columns=["key", "val"])
  148. with pytest.raises(TypeError, match="cannot be performed against 'object' dtypes"):
  149. df.groupby("key").quantile()
  150. def test_quantile_out_of_bounds_q_raises():
  151. # https://github.com/pandas-dev/pandas/issues/27470
  152. df = DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": range(6)})
  153. g = df.groupby([0, 0, 0, 1, 1, 1])
  154. with pytest.raises(ValueError, match="Got '50.0' instead"):
  155. g.quantile(50)
  156. with pytest.raises(ValueError, match="Got '-1.0' instead"):
  157. g.quantile(-1)
  158. def test_quantile_missing_group_values_no_segfaults():
  159. # GH 28662
  160. data = np.array([1.0, np.nan, 1.0])
  161. df = DataFrame({"key": data, "val": range(3)})
  162. # Random segfaults; would have been guaranteed in loop
  163. grp = df.groupby("key")
  164. for _ in range(100):
  165. grp.quantile()
  166. @pytest.mark.parametrize(
  167. "key, val, expected_key, expected_val",
  168. [
  169. ([1.0, np.nan, 3.0, np.nan], range(4), [1.0, 3.0], [0.0, 2.0]),
  170. ([1.0, np.nan, 2.0, 2.0], range(4), [1.0, 2.0], [0.0, 2.5]),
  171. (["a", "b", "b", np.nan], range(4), ["a", "b"], [0, 1.5]),
  172. ([0], [42], [0], [42.0]),
  173. ([], [], np.array([], dtype="float64"), np.array([], dtype="float64")),
  174. ],
  175. )
  176. def test_quantile_missing_group_values_correct_results(
  177. key, val, expected_key, expected_val
  178. ):
  179. # GH 28662, GH 33200, GH 33569
  180. df = DataFrame({"key": key, "val": val})
  181. expected = DataFrame(
  182. expected_val, index=Index(expected_key, name="key"), columns=["val"]
  183. )
  184. grp = df.groupby("key")
  185. result = grp.quantile(0.5)
  186. tm.assert_frame_equal(result, expected)
  187. result = grp.quantile()
  188. tm.assert_frame_equal(result, expected)
  189. @pytest.mark.parametrize(
  190. "values",
  191. [
  192. pd.array([1, 0, None] * 2, dtype="Int64"),
  193. pd.array([True, False, None] * 2, dtype="boolean"),
  194. ],
  195. )
  196. @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
  197. def test_groupby_quantile_nullable_array(values, q):
  198. # https://github.com/pandas-dev/pandas/issues/33136
  199. df = DataFrame({"a": ["x"] * 3 + ["y"] * 3, "b": values})
  200. result = df.groupby("a")["b"].quantile(q)
  201. if isinstance(q, list):
  202. idx = pd.MultiIndex.from_product((["x", "y"], q), names=["a", None])
  203. true_quantiles = [0.0, 0.5, 1.0]
  204. else:
  205. idx = Index(["x", "y"], name="a")
  206. true_quantiles = [0.5]
  207. expected = pd.Series(true_quantiles * 2, index=idx, name="b", dtype="Float64")
  208. tm.assert_series_equal(result, expected)
  209. @pytest.mark.parametrize("q", [0.5, [0.0, 0.5, 1.0]])
  210. @pytest.mark.parametrize("numeric_only", [True, False])
  211. def test_groupby_quantile_raises_on_invalid_dtype(q, numeric_only):
  212. df = DataFrame({"a": [1], "b": [2.0], "c": ["x"]})
  213. if numeric_only:
  214. result = df.groupby("a").quantile(q, numeric_only=numeric_only)
  215. expected = df.groupby("a")[["b"]].quantile(q)
  216. tm.assert_frame_equal(result, expected)
  217. else:
  218. with pytest.raises(
  219. TypeError, match="'quantile' cannot be performed against 'object' dtypes!"
  220. ):
  221. df.groupby("a").quantile(q, numeric_only=numeric_only)
  222. def test_groupby_quantile_NA_float(any_float_dtype):
  223. # GH#42849
  224. df = DataFrame({"x": [1, 1], "y": [0.2, np.nan]}, dtype=any_float_dtype)
  225. result = df.groupby("x")["y"].quantile(0.5)
  226. exp_index = Index([1.0], dtype=any_float_dtype, name="x")
  227. if any_float_dtype in ["Float32", "Float64"]:
  228. expected_dtype = any_float_dtype
  229. else:
  230. expected_dtype = None
  231. expected = pd.Series([0.2], dtype=expected_dtype, index=exp_index, name="y")
  232. tm.assert_series_equal(result, expected)
  233. result = df.groupby("x")["y"].quantile([0.5, 0.75])
  234. expected = pd.Series(
  235. [0.2] * 2,
  236. index=pd.MultiIndex.from_product((exp_index, [0.5, 0.75]), names=["x", None]),
  237. name="y",
  238. dtype=expected_dtype,
  239. )
  240. tm.assert_series_equal(result, expected)
  241. def test_groupby_quantile_NA_int(any_int_ea_dtype):
  242. # GH#42849
  243. df = DataFrame({"x": [1, 1], "y": [2, 5]}, dtype=any_int_ea_dtype)
  244. result = df.groupby("x")["y"].quantile(0.5)
  245. expected = pd.Series(
  246. [3.5],
  247. dtype="Float64",
  248. index=Index([1], name="x", dtype=any_int_ea_dtype),
  249. name="y",
  250. )
  251. tm.assert_series_equal(expected, result)
  252. result = df.groupby("x").quantile(0.5)
  253. expected = DataFrame(
  254. {"y": 3.5}, dtype="Float64", index=Index([1], name="x", dtype=any_int_ea_dtype)
  255. )
  256. tm.assert_frame_equal(result, expected)
  257. @pytest.mark.parametrize(
  258. "interpolation, val1, val2", [("lower", 2, 2), ("higher", 2, 3), ("nearest", 2, 2)]
  259. )
  260. def test_groupby_quantile_all_na_group_masked(
  261. interpolation, val1, val2, any_numeric_ea_dtype
  262. ):
  263. # GH#37493
  264. df = DataFrame(
  265. {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
  266. )
  267. result = df.groupby("a").quantile(q=[0.5, 0.7], interpolation=interpolation)
  268. expected = DataFrame(
  269. {"b": [val1, val2, pd.NA, pd.NA]},
  270. dtype=any_numeric_ea_dtype,
  271. index=pd.MultiIndex.from_arrays(
  272. [pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype), [0.5, 0.7, 0.5, 0.7]],
  273. names=["a", None],
  274. ),
  275. )
  276. tm.assert_frame_equal(result, expected)
  277. @pytest.mark.parametrize("interpolation", ["midpoint", "linear"])
  278. def test_groupby_quantile_all_na_group_masked_interp(
  279. interpolation, any_numeric_ea_dtype
  280. ):
  281. # GH#37493
  282. df = DataFrame(
  283. {"a": [1, 1, 1, 2], "b": [1, 2, 3, pd.NA]}, dtype=any_numeric_ea_dtype
  284. )
  285. result = df.groupby("a").quantile(q=[0.5, 0.75], interpolation=interpolation)
  286. if any_numeric_ea_dtype == "Float32":
  287. expected_dtype = any_numeric_ea_dtype
  288. else:
  289. expected_dtype = "Float64"
  290. expected = DataFrame(
  291. {"b": [2.0, 2.5, pd.NA, pd.NA]},
  292. dtype=expected_dtype,
  293. index=pd.MultiIndex.from_arrays(
  294. [
  295. pd.Series([1, 1, 2, 2], dtype=any_numeric_ea_dtype),
  296. [0.5, 0.75, 0.5, 0.75],
  297. ],
  298. names=["a", None],
  299. ),
  300. )
  301. tm.assert_frame_equal(result, expected)
  302. @pytest.mark.parametrize("dtype", ["Float64", "Float32"])
  303. def test_groupby_quantile_allNA_column(dtype):
  304. # GH#42849
  305. df = DataFrame({"x": [1, 1], "y": [pd.NA] * 2}, dtype=dtype)
  306. result = df.groupby("x")["y"].quantile(0.5)
  307. expected = pd.Series(
  308. [np.nan], dtype=dtype, index=Index([1.0], dtype=dtype), name="y"
  309. )
  310. expected.index.name = "x"
  311. tm.assert_series_equal(expected, result)
  312. def test_groupby_timedelta_quantile():
  313. # GH: 29485
  314. df = DataFrame(
  315. {"value": pd.to_timedelta(np.arange(4), unit="s"), "group": [1, 1, 2, 2]}
  316. )
  317. result = df.groupby("group").quantile(0.99)
  318. expected = DataFrame(
  319. {
  320. "value": [
  321. pd.Timedelta("0 days 00:00:00.990000"),
  322. pd.Timedelta("0 days 00:00:02.990000"),
  323. ]
  324. },
  325. index=Index([1, 2], name="group"),
  326. )
  327. tm.assert_frame_equal(result, expected)
  328. def test_columns_groupby_quantile():
  329. # GH 33795
  330. df = DataFrame(
  331. np.arange(12).reshape(3, -1),
  332. index=list("XYZ"),
  333. columns=pd.Series(list("ABAB"), name="col"),
  334. )
  335. result = df.groupby("col", axis=1).quantile(q=[0.8, 0.2])
  336. expected = DataFrame(
  337. [
  338. [1.6, 0.4, 2.6, 1.4],
  339. [5.6, 4.4, 6.6, 5.4],
  340. [9.6, 8.4, 10.6, 9.4],
  341. ],
  342. index=list("XYZ"),
  343. columns=pd.MultiIndex.from_tuples(
  344. [("A", 0.8), ("A", 0.2), ("B", 0.8), ("B", 0.2)], names=["col", None]
  345. ),
  346. )
  347. tm.assert_frame_equal(result, expected)
  348. def test_timestamp_groupby_quantile():
  349. # GH 33168
  350. df = DataFrame(
  351. {
  352. "timestamp": pd.date_range(
  353. start="2020-04-19 00:00:00", freq="1T", periods=100, tz="UTC"
  354. ).floor("1H"),
  355. "category": list(range(1, 101)),
  356. "value": list(range(101, 201)),
  357. }
  358. )
  359. result = df.groupby("timestamp").quantile([0.2, 0.8])
  360. expected = DataFrame(
  361. [
  362. {"category": 12.8, "value": 112.8},
  363. {"category": 48.2, "value": 148.2},
  364. {"category": 68.8, "value": 168.8},
  365. {"category": 92.2, "value": 192.2},
  366. ],
  367. index=pd.MultiIndex.from_tuples(
  368. [
  369. (pd.Timestamp("2020-04-19 00:00:00+00:00"), 0.2),
  370. (pd.Timestamp("2020-04-19 00:00:00+00:00"), 0.8),
  371. (pd.Timestamp("2020-04-19 01:00:00+00:00"), 0.2),
  372. (pd.Timestamp("2020-04-19 01:00:00+00:00"), 0.8),
  373. ],
  374. names=("timestamp", None),
  375. ),
  376. )
  377. tm.assert_frame_equal(result, expected)
  378. def test_groupby_quantile_dt64tz_period():
  379. # GH#51373
  380. dti = pd.date_range("2016-01-01", periods=1000)
  381. ser = pd.Series(dti)
  382. df = ser.to_frame()
  383. df[1] = dti.tz_localize("US/Pacific")
  384. df[2] = dti.to_period("D")
  385. df[3] = dti - dti[0]
  386. df.iloc[-1] = pd.NaT
  387. by = np.tile(np.arange(5), 200)
  388. gb = df.groupby(by)
  389. result = gb.quantile(0.5)
  390. # Check that we match the group-by-group result
  391. exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)}
  392. expected = DataFrame(exp).T
  393. expected.index = expected.index.astype(np.int_)
  394. tm.assert_frame_equal(result, expected)