test_explode.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303
  1. import re
  2. import numpy as np
  3. import pytest
  4. import pandas as pd
  5. import pandas._testing as tm
  6. def test_error():
  7. df = pd.DataFrame(
  8. {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
  9. )
  10. with pytest.raises(
  11. ValueError, match="column must be a scalar, tuple, or list thereof"
  12. ):
  13. df.explode([list("AA")])
  14. with pytest.raises(ValueError, match="column must be unique"):
  15. df.explode(list("AA"))
  16. df.columns = list("AA")
  17. with pytest.raises(
  18. ValueError,
  19. match=re.escape("DataFrame columns must be unique. Duplicate columns: ['A']"),
  20. ):
  21. df.explode("A")
  22. @pytest.mark.parametrize(
  23. "input_subset, error_message",
  24. [
  25. (
  26. list("AC"),
  27. "columns must have matching element counts",
  28. ),
  29. (
  30. [],
  31. "column must be nonempty",
  32. ),
  33. (
  34. list("AC"),
  35. "columns must have matching element counts",
  36. ),
  37. ],
  38. )
  39. def test_error_multi_columns(input_subset, error_message):
  40. # GH 39240
  41. df = pd.DataFrame(
  42. {
  43. "A": [[0, 1, 2], np.nan, [], (3, 4)],
  44. "B": 1,
  45. "C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]],
  46. },
  47. index=list("abcd"),
  48. )
  49. with pytest.raises(ValueError, match=error_message):
  50. df.explode(input_subset)
  51. @pytest.mark.parametrize(
  52. "scalar",
  53. ["a", 0, 1.5, pd.Timedelta("1 days"), pd.Timestamp("2019-12-31")],
  54. )
  55. def test_basic(scalar):
  56. df = pd.DataFrame(
  57. {scalar: pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
  58. )
  59. result = df.explode(scalar)
  60. expected = pd.DataFrame(
  61. {
  62. scalar: pd.Series(
  63. [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
  64. ),
  65. "B": 1,
  66. }
  67. )
  68. tm.assert_frame_equal(result, expected)
  69. def test_multi_index_rows():
  70. df = pd.DataFrame(
  71. {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1},
  72. index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]),
  73. )
  74. result = df.explode("A")
  75. expected = pd.DataFrame(
  76. {
  77. "A": pd.Series(
  78. [0, 1, 2, np.nan, np.nan, 3, 4],
  79. index=pd.MultiIndex.from_tuples(
  80. [
  81. ("a", 1),
  82. ("a", 1),
  83. ("a", 1),
  84. ("a", 2),
  85. ("b", 1),
  86. ("b", 2),
  87. ("b", 2),
  88. ]
  89. ),
  90. dtype=object,
  91. ),
  92. "B": 1,
  93. }
  94. )
  95. tm.assert_frame_equal(result, expected)
  96. def test_multi_index_columns():
  97. df = pd.DataFrame(
  98. {("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), ("A", 2): 1}
  99. )
  100. result = df.explode(("A", 1))
  101. expected = pd.DataFrame(
  102. {
  103. ("A", 1): pd.Series(
  104. [0, 1, 2, np.nan, np.nan, 3, 4],
  105. index=pd.Index([0, 0, 0, 1, 2, 3, 3]),
  106. dtype=object,
  107. ),
  108. ("A", 2): 1,
  109. }
  110. )
  111. tm.assert_frame_equal(result, expected)
  112. def test_usecase():
  113. # explode a single column
  114. # gh-10511
  115. df = pd.DataFrame(
  116. [[11, range(5), 10], [22, range(3), 20]], columns=list("ABC")
  117. ).set_index("C")
  118. result = df.explode("B")
  119. expected = pd.DataFrame(
  120. {
  121. "A": [11, 11, 11, 11, 11, 22, 22, 22],
  122. "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),
  123. "C": [10, 10, 10, 10, 10, 20, 20, 20],
  124. },
  125. columns=list("ABC"),
  126. ).set_index("C")
  127. tm.assert_frame_equal(result, expected)
  128. # gh-8517
  129. df = pd.DataFrame(
  130. [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]],
  131. columns=["dt", "name", "text"],
  132. )
  133. result = df.assign(text=df.text.str.split(" ")).explode("text")
  134. expected = pd.DataFrame(
  135. [
  136. ["2014-01-01", "Alice", "A"],
  137. ["2014-01-01", "Alice", "B"],
  138. ["2014-01-02", "Bob", "C"],
  139. ["2014-01-02", "Bob", "D"],
  140. ],
  141. columns=["dt", "name", "text"],
  142. index=[0, 0, 1, 1],
  143. )
  144. tm.assert_frame_equal(result, expected)
  145. @pytest.mark.parametrize(
  146. "input_dict, input_index, expected_dict, expected_index",
  147. [
  148. (
  149. {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
  150. [0, 0],
  151. {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
  152. [0, 0, 0, 0],
  153. ),
  154. (
  155. {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
  156. pd.Index([0, 0], name="my_index"),
  157. {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
  158. pd.Index([0, 0, 0, 0], name="my_index"),
  159. ),
  160. (
  161. {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
  162. pd.MultiIndex.from_arrays(
  163. [[0, 0], [1, 1]], names=["my_first_index", "my_second_index"]
  164. ),
  165. {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
  166. pd.MultiIndex.from_arrays(
  167. [[0, 0, 0, 0], [1, 1, 1, 1]],
  168. names=["my_first_index", "my_second_index"],
  169. ),
  170. ),
  171. (
  172. {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
  173. pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]),
  174. {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
  175. pd.MultiIndex.from_arrays(
  176. [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None]
  177. ),
  178. ),
  179. ],
  180. )
  181. def test_duplicate_index(input_dict, input_index, expected_dict, expected_index):
  182. # GH 28005
  183. df = pd.DataFrame(input_dict, index=input_index)
  184. result = df.explode("col1")
  185. expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object)
  186. tm.assert_frame_equal(result, expected)
  187. def test_ignore_index():
  188. # GH 34932
  189. df = pd.DataFrame({"id": range(0, 20, 10), "values": [list("ab"), list("cd")]})
  190. result = df.explode("values", ignore_index=True)
  191. expected = pd.DataFrame(
  192. {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3]
  193. )
  194. tm.assert_frame_equal(result, expected)
  195. def test_explode_sets():
  196. # https://github.com/pandas-dev/pandas/issues/35614
  197. df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1])
  198. result = df.explode(column="a").sort_values(by="a")
  199. expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])
  200. tm.assert_frame_equal(result, expected)
  201. @pytest.mark.parametrize(
  202. "input_subset, expected_dict, expected_index",
  203. [
  204. (
  205. list("AC"),
  206. {
  207. "A": pd.Series(
  208. [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
  209. index=list("aaabcdde"),
  210. dtype=object,
  211. ),
  212. "B": 1,
  213. "C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan],
  214. },
  215. list("aaabcdde"),
  216. ),
  217. (
  218. list("A"),
  219. {
  220. "A": pd.Series(
  221. [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
  222. index=list("aaabcdde"),
  223. dtype=object,
  224. ),
  225. "B": 1,
  226. "C": [
  227. ["a", "b", "c"],
  228. ["a", "b", "c"],
  229. ["a", "b", "c"],
  230. "foo",
  231. [],
  232. ["d", "e"],
  233. ["d", "e"],
  234. np.nan,
  235. ],
  236. },
  237. list("aaabcdde"),
  238. ),
  239. ],
  240. )
  241. def test_multi_columns(input_subset, expected_dict, expected_index):
  242. # GH 39240
  243. df = pd.DataFrame(
  244. {
  245. "A": [[0, 1, 2], np.nan, [], (3, 4), np.nan],
  246. "B": 1,
  247. "C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan],
  248. },
  249. index=list("abcde"),
  250. )
  251. result = df.explode(input_subset)
  252. expected = pd.DataFrame(expected_dict, expected_index)
  253. tm.assert_frame_equal(result, expected)
  254. def test_multi_columns_nan_empty():
  255. # GH 46084
  256. df = pd.DataFrame(
  257. {
  258. "A": [[0, 1], [5], [], [2, 3]],
  259. "B": [9, 8, 7, 6],
  260. "C": [[1, 2], np.nan, [], [3, 4]],
  261. }
  262. )
  263. result = df.explode(["A", "C"])
  264. expected = pd.DataFrame(
  265. {
  266. "A": np.array([0, 1, 5, np.nan, 2, 3], dtype=object),
  267. "B": [9, 9, 8, 7, 6, 6],
  268. "C": np.array([1, 2, np.nan, np.nan, 3, 4], dtype=object),
  269. },
  270. index=[0, 0, 1, 2, 3, 3],
  271. )
  272. tm.assert_frame_equal(result, expected)