123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303 |
- import re
- import numpy as np
- import pytest
- import pandas as pd
- import pandas._testing as tm
def test_error():
    """Check that ``DataFrame.explode`` rejects invalid ``column`` arguments.

    Three failure modes are exercised, each expected to raise ``ValueError``:
    a list nested inside the column list, a column list that repeats a label,
    and a frame whose own column labels are duplicated.
    """
    df = pd.DataFrame(
        {"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
    )

    # A list nested inside the column list is not an accepted column spec.
    with pytest.raises(
        ValueError, match="column must be a scalar, tuple, or list thereof"
    ):
        df.explode([list("AA")])

    # The same label is requested twice.
    with pytest.raises(ValueError, match="column must be unique"):
        df.explode(list("AA"))

    # Relabel both columns as "A" so the frame itself has duplicate labels.
    df.columns = list("AA")
    with pytest.raises(
        ValueError,
        # re.escape because the message contains regex metacharacters.
        match=re.escape("DataFrame columns must be unique. Duplicate columns: ['A']"),
    ):
        df.explode("A")
@pytest.mark.parametrize(
    "input_subset, error_message",
    [
        # Row "d" holds 2 elements in "A" but 3 in "C".
        (
            list("AC"),
            "columns must have matching element counts",
        ),
        # An empty list of columns.
        (
            [],
            "column must be nonempty",
        ),
    ],
)
def test_error_multi_columns(input_subset, error_message):
    """Check the ``ValueError`` raised for invalid multi-column explode requests.

    Parameters
    ----------
    input_subset : list
        Column labels passed to ``DataFrame.explode``.
    error_message : str
        Pattern the raised ``ValueError`` message must match.
    """
    # GH 39240
    df = pd.DataFrame(
        {
            "A": [[0, 1, 2], np.nan, [], (3, 4)],
            "B": 1,
            "C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]],
        },
        index=list("abcd"),
    )
    with pytest.raises(ValueError, match=error_message):
        df.explode(input_subset)
@pytest.mark.parametrize(
    "scalar",
    ["a", 0, 1.5, pd.Timedelta("1 days"), pd.Timestamp("2019-12-31")],
)
def test_basic(scalar):
    """Explode a single column whose label is ``scalar``.

    The column mixes a list, NaN, an empty list and a tuple. The expected
    frame has one row per list/tuple element with the original index label
    repeated, a single NaN row for both the NaN and the empty list, and the
    constant column "B" carried along unchanged.

    Parameters
    ----------
    scalar : hashable
        Column label to explode (string, int, float, Timedelta or Timestamp).
    """
    df = pd.DataFrame(
        {scalar: pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}
    )
    result = df.explode(scalar)

    exploded = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
    )
    expected = pd.DataFrame({scalar: exploded, "B": 1})
    tm.assert_frame_equal(result, expected)
def test_multi_index_rows():
    """Explode a column of a frame whose rows are labelled by a MultiIndex.

    Each produced row is expected to keep the full index tuple of the row it
    came from, so tuples repeat once per list/tuple element.
    """
    df = pd.DataFrame(
        {"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1},
        index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]),
    )
    result = df.explode("A")

    # ("a", 1) expands to three rows, ("b", 2) to two; the NaN and the empty
    # list each keep a single row.
    expected_index = pd.MultiIndex.from_tuples(
        [
            ("a", 1),
            ("a", 1),
            ("a", 1),
            ("a", 2),
            ("b", 1),
            ("b", 2),
            ("b", 2),
        ]
    )
    expected = pd.DataFrame(
        {
            "A": pd.Series(
                [0, 1, 2, np.nan, np.nan, 3, 4],
                index=expected_index,
                dtype=object,
            ),
            "B": 1,
        }
    )
    tm.assert_frame_equal(result, expected)
def test_multi_index_columns():
    """Explode a column addressed by a tuple label.

    The frame is built from a dict keyed by the tuples ``("A", 1)`` and
    ``("A", 2)``; the first is exploded and the second is carried along.
    """
    df = pd.DataFrame(
        {("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), ("A", 2): 1}
    )
    result = df.explode(("A", 1))

    # The default integer row labels repeat once per element of the source row.
    exploded = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4],
        index=pd.Index([0, 0, 0, 1, 2, 3, 3]),
        dtype=object,
    )
    expected = pd.DataFrame({("A", 1): exploded, ("A", 2): 1})
    tm.assert_frame_equal(result, expected)
def test_usecase():
    """Run two end-to-end explode scenarios taken from issue reports.

    The first explodes a column of ``range`` objects on a frame indexed by
    another column; the second splits a text column on spaces and explodes the
    resulting lists.
    """
    # explode a single column
    # gh-10511
    df = pd.DataFrame(
        [[11, range(5), 10], [22, range(3), 20]], columns=list("ABC")
    ).set_index("C")
    result = df.explode("B")
    expected = pd.DataFrame(
        {
            "A": [11, 11, 11, 11, 11, 22, 22, 22],
            "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),
            "C": [10, 10, 10, 10, 10, 20, 20, 20],
        },
        columns=list("ABC"),
    ).set_index("C")
    tm.assert_frame_equal(result, expected)

    # gh-8517
    df = pd.DataFrame(
        [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]],
        columns=["dt", "name", "text"],
    )
    # Turn each text cell into a list of words, then give every word a row.
    result = df.assign(text=df.text.str.split(" ")).explode("text")
    expected = pd.DataFrame(
        [
            ["2014-01-01", "Alice", "A"],
            ["2014-01-01", "Alice", "B"],
            ["2014-01-02", "Bob", "C"],
            ["2014-01-02", "Bob", "D"],
        ],
        columns=["dt", "name", "text"],
        index=[0, 0, 1, 1],
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "input_dict, input_index, expected_dict, expected_index",
    [
        # Plain list index with a repeated label.
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            [0, 0],
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            [0, 0, 0, 0],
        ),
        # Named Index with a repeated label.
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.Index([0, 0], name="my_index"),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.Index([0, 0, 0, 0], name="my_index"),
        ),
        # MultiIndex with repeated tuples, every level named.
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0], [1, 1]], names=["my_first_index", "my_second_index"]
            ),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0, 0, 0], [1, 1, 1, 1]],
                names=["my_first_index", "my_second_index"],
            ),
        ),
        # MultiIndex with repeated tuples, one level unnamed.
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None]
            ),
        ),
    ],
)
def test_duplicate_index(input_dict, input_index, expected_dict, expected_index):
    """Explode ``col1`` on a frame whose row index contains duplicate labels.

    Parameters
    ----------
    input_dict : dict
        Column data for the frame under test.
    input_index : list, Index or MultiIndex
        Row index with repeated entries.
    expected_dict : dict
        Column data of the expected exploded frame.
    expected_index : list, Index or MultiIndex
        Row index of the expected frame, including its names.
    """
    # GH 28005
    df = pd.DataFrame(input_dict, index=input_index)
    result = df.explode("col1")

    expected = pd.DataFrame(expected_dict, index=expected_index)
    # Only the exploded column is pinned to object dtype; "col2" keeps
    # whatever dtype the constructor infers, so the check does not depend on
    # how string columns are inferred.
    expected["col1"] = expected["col1"].astype(object)
    tm.assert_frame_equal(result, expected)
def test_ignore_index():
    """Explode with ``ignore_index=True`` and expect row labels 0..3.

    The source frame has two rows of two-element lists, so the expected
    frame has four rows labelled 0 through 3 rather than repeated source
    labels.
    """
    # GH 34932
    df = pd.DataFrame({"id": range(0, 20, 10), "values": [list("ab"), list("cd")]})
    result = df.explode("values", ignore_index=True)
    expected = pd.DataFrame(
        {"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3]
    )
    tm.assert_frame_equal(result, expected)
def test_explode_sets():
    """Explode a column whose single cell is a ``set``."""
    # https://github.com/pandas-dev/pandas/issues/35614
    df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1])
    # Sets have no defined iteration order, so sort before comparing.
    result = df.explode(column="a").sort_values(by="a")
    expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "input_subset, expected_dict, expected_index",
    [
        # "A" and "C" exploded together, element by element.
        (
            list("AC"),
            {
                "A": pd.Series(
                    [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
                    index=list("aaabcdde"),
                    dtype=object,
                ),
                "B": 1,
                "C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan],
            },
            list("aaabcdde"),
        ),
        # Only "A" exploded; each original "C" cell repeats per produced row.
        (
            list("A"),
            {
                "A": pd.Series(
                    [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
                    index=list("aaabcdde"),
                    dtype=object,
                ),
                "B": 1,
                "C": [
                    ["a", "b", "c"],
                    ["a", "b", "c"],
                    ["a", "b", "c"],
                    "foo",
                    [],
                    ["d", "e"],
                    ["d", "e"],
                    np.nan,
                ],
            },
            list("aaabcdde"),
        ),
    ],
)
def test_multi_columns(input_subset, expected_dict, expected_index):
    """Explode one or several columns passed as a list of labels.

    Parameters
    ----------
    input_subset : list
        Column labels passed to ``DataFrame.explode``.
    expected_dict : dict
        Column data of the expected exploded frame.
    expected_index : list
        Row labels of the expected exploded frame.
    """
    # GH 39240
    df = pd.DataFrame(
        {
            "A": [[0, 1, 2], np.nan, [], (3, 4), np.nan],
            "B": 1,
            "C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan],
        },
        index=list("abcde"),
    )
    result = df.explode(input_subset)
    expected = pd.DataFrame(expected_dict, index=expected_index)
    tm.assert_frame_equal(result, expected)
def test_multi_columns_nan_empty():
    """Explode two columns together when one holds NaN or both hold empties.

    Row 1 pairs the one-element list ``[5]`` in "A" with NaN in "C", and row 2
    has empty lists in both columns; each is expected to yield exactly one
    output row.
    """
    # GH 46084
    df = pd.DataFrame(
        {
            "A": [[0, 1], [5], [], [2, 3]],
            "B": [9, 8, 7, 6],
            "C": [[1, 2], np.nan, [], [3, 4]],
        }
    )
    result = df.explode(["A", "C"])
    expected = pd.DataFrame(
        {
            "A": np.array([0, 1, 5, np.nan, 2, 3], dtype=object),
            "B": [9, 9, 8, 7, 6, 6],
            "C": np.array([1, 2, np.nan, np.nan, 3, 4], dtype=object),
        },
        index=[0, 0, 1, 2, 3, 3],
    )
    tm.assert_frame_equal(result, expected)
|