test_indexing.py 9.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332
# Test GroupBy._positional_selector positional grouped indexing GH#42864
import random

import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm
  7. @pytest.mark.parametrize(
  8. "arg, expected_rows",
  9. [
  10. [0, [0, 1, 4]],
  11. [2, [5]],
  12. [5, []],
  13. [-1, [3, 4, 7]],
  14. [-2, [1, 6]],
  15. [-6, []],
  16. ],
  17. )
  18. def test_int(slice_test_df, slice_test_grouped, arg, expected_rows):
  19. # Test single integer
  20. result = slice_test_grouped._positional_selector[arg]
  21. expected = slice_test_df.iloc[expected_rows]
  22. tm.assert_frame_equal(result, expected)
  23. def test_slice(slice_test_df, slice_test_grouped):
  24. # Test single slice
  25. result = slice_test_grouped._positional_selector[0:3:2]
  26. expected = slice_test_df.iloc[[0, 1, 4, 5]]
  27. tm.assert_frame_equal(result, expected)
  28. @pytest.mark.parametrize(
  29. "arg, expected_rows",
  30. [
  31. [[0, 2], [0, 1, 4, 5]],
  32. [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
  33. [range(0, 3, 2), [0, 1, 4, 5]],
  34. [{0, 2}, [0, 1, 4, 5]],
  35. ],
  36. ids=[
  37. "list",
  38. "negative",
  39. "range",
  40. "set",
  41. ],
  42. )
  43. def test_list(slice_test_df, slice_test_grouped, arg, expected_rows):
  44. # Test lists of integers and integer valued iterables
  45. result = slice_test_grouped._positional_selector[arg]
  46. expected = slice_test_df.iloc[expected_rows]
  47. tm.assert_frame_equal(result, expected)
  48. def test_ints(slice_test_df, slice_test_grouped):
  49. # Test tuple of ints
  50. result = slice_test_grouped._positional_selector[0, 2, -1]
  51. expected = slice_test_df.iloc[[0, 1, 3, 4, 5, 7]]
  52. tm.assert_frame_equal(result, expected)
  53. def test_slices(slice_test_df, slice_test_grouped):
  54. # Test tuple of slices
  55. result = slice_test_grouped._positional_selector[:2, -2:]
  56. expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
  57. tm.assert_frame_equal(result, expected)
  58. def test_mix(slice_test_df, slice_test_grouped):
  59. # Test mixed tuple of ints and slices
  60. result = slice_test_grouped._positional_selector[0, 1, -2:]
  61. expected = slice_test_df.iloc[[0, 1, 2, 3, 4, 6, 7]]
  62. tm.assert_frame_equal(result, expected)
  63. @pytest.mark.parametrize(
  64. "arg, expected_rows",
  65. [
  66. [0, [0, 1, 4]],
  67. [[0, 2, -1], [0, 1, 3, 4, 5, 7]],
  68. [(slice(None, 2), slice(-2, None)), [0, 1, 2, 3, 4, 6, 7]],
  69. ],
  70. )
  71. def test_as_index(slice_test_df, arg, expected_rows):
  72. # Test the default as_index behaviour
  73. result = slice_test_df.groupby("Group", sort=False)._positional_selector[arg]
  74. expected = slice_test_df.iloc[expected_rows]
  75. tm.assert_frame_equal(result, expected)
  76. def test_doc_examples():
  77. # Test the examples in the documentation
  78. df = pd.DataFrame(
  79. [["a", 1], ["a", 2], ["a", 3], ["b", 4], ["b", 5]], columns=["A", "B"]
  80. )
  81. grouped = df.groupby("A", as_index=False)
  82. result = grouped._positional_selector[1:2]
  83. expected = pd.DataFrame([["a", 2], ["b", 5]], columns=["A", "B"], index=[1, 4])
  84. tm.assert_frame_equal(result, expected)
  85. result = grouped._positional_selector[1, -1]
  86. expected = pd.DataFrame(
  87. [["a", 2], ["a", 3], ["b", 5]], columns=["A", "B"], index=[1, 2, 4]
  88. )
  89. tm.assert_frame_equal(result, expected)
  90. @pytest.fixture()
  91. def multiindex_data():
  92. ndates = 100
  93. nitems = 20
  94. dates = pd.date_range("20130101", periods=ndates, freq="D")
  95. items = [f"item {i}" for i in range(nitems)]
  96. data = {}
  97. for date in dates:
  98. nitems_for_date = nitems - random.randint(0, 12)
  99. levels = [
  100. (item, random.randint(0, 10000) / 100, random.randint(0, 10000) / 100)
  101. for item in items[:nitems_for_date]
  102. ]
  103. levels.sort(key=lambda x: x[1])
  104. data[date] = levels
  105. return data
  106. def _make_df_from_data(data):
  107. rows = {}
  108. for date in data:
  109. for level in data[date]:
  110. rows[(date, level[0])] = {"A": level[1], "B": level[2]}
  111. df = pd.DataFrame.from_dict(rows, orient="index")
  112. df.index.names = ("Date", "Item")
  113. return df
  114. def test_multiindex(multiindex_data):
  115. # Test the multiindex mentioned as the use-case in the documentation
  116. df = _make_df_from_data(multiindex_data)
  117. result = df.groupby("Date", as_index=False).nth(slice(3, -3))
  118. sliced = {date: multiindex_data[date][3:-3] for date in multiindex_data}
  119. expected = _make_df_from_data(sliced)
  120. tm.assert_frame_equal(result, expected)
  121. @pytest.mark.parametrize("arg", [1, 5, 30, 1000, -1, -5, -30, -1000])
  122. @pytest.mark.parametrize("method", ["head", "tail"])
  123. @pytest.mark.parametrize("simulated", [True, False])
  124. def test_against_head_and_tail(arg, method, simulated):
  125. # Test gives the same results as grouped head and tail
  126. n_groups = 100
  127. n_rows_per_group = 30
  128. data = {
  129. "group": [
  130. f"group {g}" for j in range(n_rows_per_group) for g in range(n_groups)
  131. ],
  132. "value": [
  133. f"group {g} row {j}"
  134. for j in range(n_rows_per_group)
  135. for g in range(n_groups)
  136. ],
  137. }
  138. df = pd.DataFrame(data)
  139. grouped = df.groupby("group", as_index=False)
  140. size = arg if arg >= 0 else n_rows_per_group + arg
  141. if method == "head":
  142. result = grouped._positional_selector[:arg]
  143. if simulated:
  144. indices = []
  145. for j in range(size):
  146. for i in range(n_groups):
  147. if j * n_groups + i < n_groups * n_rows_per_group:
  148. indices.append(j * n_groups + i)
  149. expected = df.iloc[indices]
  150. else:
  151. expected = grouped.head(arg)
  152. else:
  153. result = grouped._positional_selector[-arg:]
  154. if simulated:
  155. indices = []
  156. for j in range(size):
  157. for i in range(n_groups):
  158. if (n_rows_per_group + j - size) * n_groups + i >= 0:
  159. indices.append((n_rows_per_group + j - size) * n_groups + i)
  160. expected = df.iloc[indices]
  161. else:
  162. expected = grouped.tail(arg)
  163. tm.assert_frame_equal(result, expected)
  164. @pytest.mark.parametrize("start", [None, 0, 1, 10, -1, -10])
  165. @pytest.mark.parametrize("stop", [None, 0, 1, 10, -1, -10])
  166. @pytest.mark.parametrize("step", [None, 1, 5])
  167. def test_against_df_iloc(start, stop, step):
  168. # Test that a single group gives the same results as DataFrame.iloc
  169. n_rows = 30
  170. data = {
  171. "group": ["group 0"] * n_rows,
  172. "value": list(range(n_rows)),
  173. }
  174. df = pd.DataFrame(data)
  175. grouped = df.groupby("group", as_index=False)
  176. result = grouped._positional_selector[start:stop:step]
  177. expected = df.iloc[start:stop:step]
  178. tm.assert_frame_equal(result, expected)
  179. def test_series():
  180. # Test grouped Series
  181. ser = pd.Series([1, 2, 3, 4, 5], index=["a", "a", "a", "b", "b"])
  182. grouped = ser.groupby(level=0)
  183. result = grouped._positional_selector[1:2]
  184. expected = pd.Series([2, 5], index=["a", "b"])
  185. tm.assert_series_equal(result, expected)
  186. @pytest.mark.parametrize("step", [1, 2, 3, 4, 5])
  187. def test_step(step):
  188. # Test slice with various step values
  189. data = [["x", f"x{i}"] for i in range(5)]
  190. data += [["y", f"y{i}"] for i in range(4)]
  191. data += [["z", f"z{i}"] for i in range(3)]
  192. df = pd.DataFrame(data, columns=["A", "B"])
  193. grouped = df.groupby("A", as_index=False)
  194. result = grouped._positional_selector[::step]
  195. data = [["x", f"x{i}"] for i in range(0, 5, step)]
  196. data += [["y", f"y{i}"] for i in range(0, 4, step)]
  197. data += [["z", f"z{i}"] for i in range(0, 3, step)]
  198. index = [0 + i for i in range(0, 5, step)]
  199. index += [5 + i for i in range(0, 4, step)]
  200. index += [9 + i for i in range(0, 3, step)]
  201. expected = pd.DataFrame(data, columns=["A", "B"], index=index)
  202. tm.assert_frame_equal(result, expected)
  203. @pytest.fixture()
  204. def column_group_df():
  205. return pd.DataFrame(
  206. [[0, 1, 2, 3, 4, 5, 6], [0, 0, 1, 0, 1, 0, 2]],
  207. columns=["A", "B", "C", "D", "E", "F", "G"],
  208. )
  209. def test_column_axis(column_group_df):
  210. g = column_group_df.groupby(column_group_df.iloc[1], axis=1)
  211. result = g._positional_selector[1:-1]
  212. expected = column_group_df.iloc[:, [1, 3]]
  213. tm.assert_frame_equal(result, expected)
  214. def test_columns_on_iter():
  215. # GitHub issue #44821
  216. df = pd.DataFrame({k: range(10) for k in "ABC"})
  217. # Group-by and select columns
  218. cols = ["A", "B"]
  219. for _, dg in df.groupby(df.A < 4)[cols]:
  220. tm.assert_index_equal(dg.columns, pd.Index(cols))
  221. assert "C" not in dg.columns
  222. @pytest.mark.parametrize("func", [list, pd.Index, pd.Series, np.array])
  223. def test_groupby_duplicated_columns(func):
  224. # GH#44924
  225. df = pd.DataFrame(
  226. {
  227. "A": [1, 2],
  228. "B": [3, 3],
  229. "C": ["G", "G"],
  230. }
  231. )
  232. result = df.groupby("C")[func(["A", "B", "A"])].mean()
  233. expected = pd.DataFrame(
  234. [[1.5, 3.0, 1.5]], columns=["A", "B", "A"], index=pd.Index(["G"], name="C")
  235. )
  236. tm.assert_frame_equal(result, expected)
  237. def test_groupby_get_nonexisting_groups():
  238. # GH#32492
  239. df = pd.DataFrame(
  240. data={
  241. "A": ["a1", "a2", None],
  242. "B": ["b1", "b2", "b1"],
  243. "val": [1, 2, 3],
  244. }
  245. )
  246. grps = df.groupby(by=["A", "B"])
  247. msg = "('a2', 'b1')"
  248. with pytest.raises(KeyError, match=msg):
  249. grps.get_group(("a2", "b1"))