123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125 |
- import re
- import numpy as np
- import pytest
- import pandas as pd
- from pandas import (
- DataFrame,
- lreshape,
- melt,
- wide_to_long,
- )
- import pandas._testing as tm
- @pytest.fixture
- def df():
- res = tm.makeTimeDataFrame()[:10]
- res["id1"] = (res["A"] > 0).astype(np.int64)
- res["id2"] = (res["B"] > 0).astype(np.int64)
- return res
- @pytest.fixture
- def df1():
- res = DataFrame(
- [
- [1.067683, -1.110463, 0.20867],
- [-1.321405, 0.368915, -1.055342],
- [-0.807333, 0.08298, -0.873361],
- ]
- )
- res.columns = [list("ABC"), list("abc")]
- res.columns.names = ["CAP", "low"]
- return res
- @pytest.fixture
- def var_name():
- return "var"
- @pytest.fixture
- def value_name():
- return "val"
- class TestMelt:
- def test_top_level_method(self, df):
- result = melt(df)
- assert result.columns.tolist() == ["variable", "value"]
- def test_method_signatures(self, df, df1, var_name, value_name):
- tm.assert_frame_equal(df.melt(), melt(df))
- tm.assert_frame_equal(
- df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]),
- melt(df, id_vars=["id1", "id2"], value_vars=["A", "B"]),
- )
- tm.assert_frame_equal(
- df.melt(var_name=var_name, value_name=value_name),
- melt(df, var_name=var_name, value_name=value_name),
- )
- tm.assert_frame_equal(df1.melt(col_level=0), melt(df1, col_level=0))
- def test_default_col_names(self, df):
- result = df.melt()
- assert result.columns.tolist() == ["variable", "value"]
- result1 = df.melt(id_vars=["id1"])
- assert result1.columns.tolist() == ["id1", "variable", "value"]
- result2 = df.melt(id_vars=["id1", "id2"])
- assert result2.columns.tolist() == ["id1", "id2", "variable", "value"]
- def test_value_vars(self, df):
- result3 = df.melt(id_vars=["id1", "id2"], value_vars="A")
- assert len(result3) == 10
- result4 = df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"])
- expected4 = DataFrame(
- {
- "id1": df["id1"].tolist() * 2,
- "id2": df["id2"].tolist() * 2,
- "variable": ["A"] * 10 + ["B"] * 10,
- "value": (df["A"].tolist() + df["B"].tolist()),
- },
- columns=["id1", "id2", "variable", "value"],
- )
- tm.assert_frame_equal(result4, expected4)
- @pytest.mark.parametrize("type_", (tuple, list, np.array))
- def test_value_vars_types(self, type_, df):
- # GH 15348
- expected = DataFrame(
- {
- "id1": df["id1"].tolist() * 2,
- "id2": df["id2"].tolist() * 2,
- "variable": ["A"] * 10 + ["B"] * 10,
- "value": (df["A"].tolist() + df["B"].tolist()),
- },
- columns=["id1", "id2", "variable", "value"],
- )
- result = df.melt(id_vars=["id1", "id2"], value_vars=type_(("A", "B")))
- tm.assert_frame_equal(result, expected)
- def test_vars_work_with_multiindex(self, df1):
- expected = DataFrame(
- {
- ("A", "a"): df1[("A", "a")],
- "CAP": ["B"] * len(df1),
- "low": ["b"] * len(df1),
- "value": df1[("B", "b")],
- },
- columns=[("A", "a"), "CAP", "low", "value"],
- )
- result = df1.melt(id_vars=[("A", "a")], value_vars=[("B", "b")])
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "id_vars, value_vars, col_level, expected",
- [
- (
- ["A"],
- ["B"],
- 0,
- DataFrame(
- {
- "A": {0: 1.067683, 1: -1.321405, 2: -0.807333},
- "CAP": {0: "B", 1: "B", 2: "B"},
- "value": {0: -1.110463, 1: 0.368915, 2: 0.08298},
- }
- ),
- ),
- (
- ["a"],
- ["b"],
- 1,
- DataFrame(
- {
- "a": {0: 1.067683, 1: -1.321405, 2: -0.807333},
- "low": {0: "b", 1: "b", 2: "b"},
- "value": {0: -1.110463, 1: 0.368915, 2: 0.08298},
- }
- ),
- ),
- ],
- )
- def test_single_vars_work_with_multiindex(
- self, id_vars, value_vars, col_level, expected, df1
- ):
- result = df1.melt(id_vars, value_vars, col_level=col_level)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "id_vars, value_vars",
- [
- [("A", "a"), [("B", "b")]],
- [[("A", "a")], ("B", "b")],
- [("A", "a"), ("B", "b")],
- ],
- )
- def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1):
- # melt should fail with an informative error message if
- # the columns have a MultiIndex and a tuple is passed
- # for id_vars or value_vars.
- msg = r"(id|value)_vars must be a list of tuples when columns are a MultiIndex"
- with pytest.raises(ValueError, match=msg):
- df1.melt(id_vars=id_vars, value_vars=value_vars)
- def test_custom_var_name(self, df, var_name):
- result5 = df.melt(var_name=var_name)
- assert result5.columns.tolist() == ["var", "value"]
- result6 = df.melt(id_vars=["id1"], var_name=var_name)
- assert result6.columns.tolist() == ["id1", "var", "value"]
- result7 = df.melt(id_vars=["id1", "id2"], var_name=var_name)
- assert result7.columns.tolist() == ["id1", "id2", "var", "value"]
- result8 = df.melt(id_vars=["id1", "id2"], value_vars="A", var_name=var_name)
- assert result8.columns.tolist() == ["id1", "id2", "var", "value"]
- result9 = df.melt(
- id_vars=["id1", "id2"], value_vars=["A", "B"], var_name=var_name
- )
- expected9 = DataFrame(
- {
- "id1": df["id1"].tolist() * 2,
- "id2": df["id2"].tolist() * 2,
- var_name: ["A"] * 10 + ["B"] * 10,
- "value": (df["A"].tolist() + df["B"].tolist()),
- },
- columns=["id1", "id2", var_name, "value"],
- )
- tm.assert_frame_equal(result9, expected9)
- def test_custom_value_name(self, df, value_name):
- result10 = df.melt(value_name=value_name)
- assert result10.columns.tolist() == ["variable", "val"]
- result11 = df.melt(id_vars=["id1"], value_name=value_name)
- assert result11.columns.tolist() == ["id1", "variable", "val"]
- result12 = df.melt(id_vars=["id1", "id2"], value_name=value_name)
- assert result12.columns.tolist() == ["id1", "id2", "variable", "val"]
- result13 = df.melt(
- id_vars=["id1", "id2"], value_vars="A", value_name=value_name
- )
- assert result13.columns.tolist() == ["id1", "id2", "variable", "val"]
- result14 = df.melt(
- id_vars=["id1", "id2"], value_vars=["A", "B"], value_name=value_name
- )
- expected14 = DataFrame(
- {
- "id1": df["id1"].tolist() * 2,
- "id2": df["id2"].tolist() * 2,
- "variable": ["A"] * 10 + ["B"] * 10,
- value_name: (df["A"].tolist() + df["B"].tolist()),
- },
- columns=["id1", "id2", "variable", value_name],
- )
- tm.assert_frame_equal(result14, expected14)
- def test_custom_var_and_value_name(self, df, value_name, var_name):
- result15 = df.melt(var_name=var_name, value_name=value_name)
- assert result15.columns.tolist() == ["var", "val"]
- result16 = df.melt(id_vars=["id1"], var_name=var_name, value_name=value_name)
- assert result16.columns.tolist() == ["id1", "var", "val"]
- result17 = df.melt(
- id_vars=["id1", "id2"], var_name=var_name, value_name=value_name
- )
- assert result17.columns.tolist() == ["id1", "id2", "var", "val"]
- result18 = df.melt(
- id_vars=["id1", "id2"],
- value_vars="A",
- var_name=var_name,
- value_name=value_name,
- )
- assert result18.columns.tolist() == ["id1", "id2", "var", "val"]
- result19 = df.melt(
- id_vars=["id1", "id2"],
- value_vars=["A", "B"],
- var_name=var_name,
- value_name=value_name,
- )
- expected19 = DataFrame(
- {
- "id1": df["id1"].tolist() * 2,
- "id2": df["id2"].tolist() * 2,
- var_name: ["A"] * 10 + ["B"] * 10,
- value_name: (df["A"].tolist() + df["B"].tolist()),
- },
- columns=["id1", "id2", var_name, value_name],
- )
- tm.assert_frame_equal(result19, expected19)
- df20 = df.copy()
- df20.columns.name = "foo"
- result20 = df20.melt()
- assert result20.columns.tolist() == ["foo", "value"]
- @pytest.mark.parametrize("col_level", [0, "CAP"])
- def test_col_level(self, col_level, df1):
- res = df1.melt(col_level=col_level)
- assert res.columns.tolist() == ["CAP", "value"]
- def test_multiindex(self, df1):
- res = df1.melt()
- assert res.columns.tolist() == ["CAP", "low", "value"]
- @pytest.mark.parametrize(
- "col",
- [
- pd.Series(pd.date_range("2010", periods=5, tz="US/Pacific")),
- pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
- pd.Series([0, 1, 0, 0, 0]),
- ],
- )
- def test_pandas_dtypes(self, col):
- # GH 15785
- df = DataFrame(
- {"klass": range(5), "col": col, "attr1": [1, 0, 0, 0, 0], "attr2": col}
- )
- expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], ignore_index=True)
- result = melt(
- df, id_vars=["klass", "col"], var_name="attribute", value_name="value"
- )
- expected = DataFrame(
- {
- 0: list(range(5)) * 2,
- 1: pd.concat([col] * 2, ignore_index=True),
- 2: ["attr1"] * 5 + ["attr2"] * 5,
- 3: expected_value,
- }
- )
- expected.columns = ["klass", "col", "attribute", "value"]
- tm.assert_frame_equal(result, expected)
- def test_preserve_category(self):
- # GH 15853
- data = DataFrame({"A": [1, 2], "B": pd.Categorical(["X", "Y"])})
- result = melt(data, ["B"], ["A"])
- expected = DataFrame(
- {"B": pd.Categorical(["X", "Y"]), "variable": ["A", "A"], "value": [1, 2]}
- )
- tm.assert_frame_equal(result, expected)
- def test_melt_missing_columns_raises(self):
- # GH-23575
- # This test is to ensure that pandas raises an error if melting is
- # attempted with column names absent from the dataframe
- # Generate data
- df = DataFrame(np.random.randn(5, 4), columns=list("abcd"))
- # Try to melt with missing `value_vars` column name
- msg = "The following '{Var}' are not present in the DataFrame: {Col}"
- with pytest.raises(
- KeyError, match=msg.format(Var="value_vars", Col="\\['C'\\]")
- ):
- df.melt(["a", "b"], ["C", "d"])
- # Try to melt with missing `id_vars` column name
- with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['A'\\]")):
- df.melt(["A", "b"], ["c", "d"])
- # Multiple missing
- with pytest.raises(
- KeyError,
- match=msg.format(Var="id_vars", Col="\\['not_here', 'or_there'\\]"),
- ):
- df.melt(["a", "b", "not_here", "or_there"], ["c", "d"])
- # Multiindex melt fails if column is missing from multilevel melt
- multi = df.copy()
- multi.columns = [list("ABCD"), list("abcd")]
- with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['E'\\]")):
- multi.melt([("E", "a")], [("B", "b")])
- # Multiindex fails if column is missing from single level melt
- with pytest.raises(
- KeyError, match=msg.format(Var="value_vars", Col="\\['F'\\]")
- ):
- multi.melt(["A"], ["F"], col_level=0)
- def test_melt_mixed_int_str_id_vars(self):
- # GH 29718
- df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]})
- result = melt(df, id_vars=[0, "a"], value_vars=["b", "d"])
- expected = DataFrame(
- {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]}
- )
- tm.assert_frame_equal(result, expected)
- def test_melt_mixed_int_str_value_vars(self):
- # GH 29718
- df = DataFrame({0: ["foo"], "a": ["bar"]})
- result = melt(df, value_vars=[0, "a"])
- expected = DataFrame({"variable": [0, "a"], "value": ["foo", "bar"]})
- tm.assert_frame_equal(result, expected)
- def test_ignore_index(self):
- # GH 17440
- df = DataFrame({"foo": [0], "bar": [1]}, index=["first"])
- result = melt(df, ignore_index=False)
- expected = DataFrame(
- {"variable": ["foo", "bar"], "value": [0, 1]}, index=["first", "first"]
- )
- tm.assert_frame_equal(result, expected)
- def test_ignore_multiindex(self):
- # GH 17440
- index = pd.MultiIndex.from_tuples(
- [("first", "second"), ("first", "third")], names=["baz", "foobar"]
- )
- df = DataFrame({"foo": [0, 1], "bar": [2, 3]}, index=index)
- result = melt(df, ignore_index=False)
- expected_index = pd.MultiIndex.from_tuples(
- [("first", "second"), ("first", "third")] * 2, names=["baz", "foobar"]
- )
- expected = DataFrame(
- {"variable": ["foo"] * 2 + ["bar"] * 2, "value": [0, 1, 2, 3]},
- index=expected_index,
- )
- tm.assert_frame_equal(result, expected)
- def test_ignore_index_name_and_type(self):
- # GH 17440
- index = pd.Index(["foo", "bar"], dtype="category", name="baz")
- df = DataFrame({"x": [0, 1], "y": [2, 3]}, index=index)
- result = melt(df, ignore_index=False)
- expected_index = pd.Index(["foo", "bar"] * 2, dtype="category", name="baz")
- expected = DataFrame(
- {"variable": ["x", "x", "y", "y"], "value": [0, 1, 2, 3]},
- index=expected_index,
- )
- tm.assert_frame_equal(result, expected)
- def test_melt_with_duplicate_columns(self):
- # GH#41951
- df = DataFrame([["id", 2, 3]], columns=["a", "b", "b"])
- result = df.melt(id_vars=["a"], value_vars=["b"])
- expected = DataFrame(
- [["id", "b", 2], ["id", "b", 3]], columns=["a", "variable", "value"]
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("dtype", ["Int8", "Int64"])
- def test_melt_ea_dtype(self, dtype):
- # GH#41570
- df = DataFrame(
- {
- "a": pd.Series([1, 2], dtype="Int8"),
- "b": pd.Series([3, 4], dtype=dtype),
- }
- )
- result = df.melt()
- expected = DataFrame(
- {
- "variable": ["a", "a", "b", "b"],
- "value": pd.Series([1, 2, 3, 4], dtype=dtype),
- }
- )
- tm.assert_frame_equal(result, expected)
- class TestLreshape:
- def test_pairs(self):
- data = {
- "birthdt": [
- "08jan2009",
- "20dec2008",
- "30dec2008",
- "21dec2008",
- "11jan2009",
- ],
- "birthwt": [1766, 3301, 1454, 3139, 4133],
- "id": [101, 102, 103, 104, 105],
- "sex": ["Male", "Female", "Female", "Female", "Female"],
- "visitdt1": [
- "11jan2009",
- "22dec2008",
- "04jan2009",
- "29dec2008",
- "20jan2009",
- ],
- "visitdt2": ["21jan2009", np.nan, "22jan2009", "31dec2008", "03feb2009"],
- "visitdt3": ["05feb2009", np.nan, np.nan, "02jan2009", "15feb2009"],
- "wt1": [1823, 3338, 1549, 3298, 4306],
- "wt2": [2011.0, np.nan, 1892.0, 3338.0, 4575.0],
- "wt3": [2293.0, np.nan, np.nan, 3377.0, 4805.0],
- }
- df = DataFrame(data)
- spec = {
- "visitdt": [f"visitdt{i:d}" for i in range(1, 4)],
- "wt": [f"wt{i:d}" for i in range(1, 4)],
- }
- result = lreshape(df, spec)
- exp_data = {
- "birthdt": [
- "08jan2009",
- "20dec2008",
- "30dec2008",
- "21dec2008",
- "11jan2009",
- "08jan2009",
- "30dec2008",
- "21dec2008",
- "11jan2009",
- "08jan2009",
- "21dec2008",
- "11jan2009",
- ],
- "birthwt": [
- 1766,
- 3301,
- 1454,
- 3139,
- 4133,
- 1766,
- 1454,
- 3139,
- 4133,
- 1766,
- 3139,
- 4133,
- ],
- "id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105],
- "sex": [
- "Male",
- "Female",
- "Female",
- "Female",
- "Female",
- "Male",
- "Female",
- "Female",
- "Female",
- "Male",
- "Female",
- "Female",
- ],
- "visitdt": [
- "11jan2009",
- "22dec2008",
- "04jan2009",
- "29dec2008",
- "20jan2009",
- "21jan2009",
- "22jan2009",
- "31dec2008",
- "03feb2009",
- "05feb2009",
- "02jan2009",
- "15feb2009",
- ],
- "wt": [
- 1823.0,
- 3338.0,
- 1549.0,
- 3298.0,
- 4306.0,
- 2011.0,
- 1892.0,
- 3338.0,
- 4575.0,
- 2293.0,
- 3377.0,
- 4805.0,
- ],
- }
- exp = DataFrame(exp_data, columns=result.columns)
- tm.assert_frame_equal(result, exp)
- result = lreshape(df, spec, dropna=False)
- exp_data = {
- "birthdt": [
- "08jan2009",
- "20dec2008",
- "30dec2008",
- "21dec2008",
- "11jan2009",
- "08jan2009",
- "20dec2008",
- "30dec2008",
- "21dec2008",
- "11jan2009",
- "08jan2009",
- "20dec2008",
- "30dec2008",
- "21dec2008",
- "11jan2009",
- ],
- "birthwt": [
- 1766,
- 3301,
- 1454,
- 3139,
- 4133,
- 1766,
- 3301,
- 1454,
- 3139,
- 4133,
- 1766,
- 3301,
- 1454,
- 3139,
- 4133,
- ],
- "id": [
- 101,
- 102,
- 103,
- 104,
- 105,
- 101,
- 102,
- 103,
- 104,
- 105,
- 101,
- 102,
- 103,
- 104,
- 105,
- ],
- "sex": [
- "Male",
- "Female",
- "Female",
- "Female",
- "Female",
- "Male",
- "Female",
- "Female",
- "Female",
- "Female",
- "Male",
- "Female",
- "Female",
- "Female",
- "Female",
- ],
- "visitdt": [
- "11jan2009",
- "22dec2008",
- "04jan2009",
- "29dec2008",
- "20jan2009",
- "21jan2009",
- np.nan,
- "22jan2009",
- "31dec2008",
- "03feb2009",
- "05feb2009",
- np.nan,
- np.nan,
- "02jan2009",
- "15feb2009",
- ],
- "wt": [
- 1823.0,
- 3338.0,
- 1549.0,
- 3298.0,
- 4306.0,
- 2011.0,
- np.nan,
- 1892.0,
- 3338.0,
- 4575.0,
- 2293.0,
- np.nan,
- np.nan,
- 3377.0,
- 4805.0,
- ],
- }
- exp = DataFrame(exp_data, columns=result.columns)
- tm.assert_frame_equal(result, exp)
- spec = {
- "visitdt": [f"visitdt{i:d}" for i in range(1, 3)],
- "wt": [f"wt{i:d}" for i in range(1, 4)],
- }
- msg = "All column lists must be same length"
- with pytest.raises(ValueError, match=msg):
- lreshape(df, spec)
- class TestWideToLong:
- def test_simple(self):
- np.random.seed(123)
- x = np.random.randn(3)
- df = DataFrame(
- {
- "A1970": {0: "a", 1: "b", 2: "c"},
- "A1980": {0: "d", 1: "e", 2: "f"},
- "B1970": {0: 2.5, 1: 1.2, 2: 0.7},
- "B1980": {0: 3.2, 1: 1.3, 2: 0.1},
- "X": dict(zip(range(3), x)),
- }
- )
- df["id"] = df.index
- exp_data = {
- "X": x.tolist() + x.tolist(),
- "A": ["a", "b", "c", "d", "e", "f"],
- "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
- "year": [1970, 1970, 1970, 1980, 1980, 1980],
- "id": [0, 1, 2, 0, 1, 2],
- }
- expected = DataFrame(exp_data)
- expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
- result = wide_to_long(df, ["A", "B"], i="id", j="year")
- tm.assert_frame_equal(result, expected)
- def test_stubs(self):
- # GH9204 wide_to_long call should not modify 'stubs' list
- df = DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
- df.columns = ["id", "inc1", "inc2", "edu1", "edu2"]
- stubs = ["inc", "edu"]
- wide_to_long(df, stubs, i="id", j="age")
- assert stubs == ["inc", "edu"]
- def test_separating_character(self):
- # GH14779
- np.random.seed(123)
- x = np.random.randn(3)
- df = DataFrame(
- {
- "A.1970": {0: "a", 1: "b", 2: "c"},
- "A.1980": {0: "d", 1: "e", 2: "f"},
- "B.1970": {0: 2.5, 1: 1.2, 2: 0.7},
- "B.1980": {0: 3.2, 1: 1.3, 2: 0.1},
- "X": dict(zip(range(3), x)),
- }
- )
- df["id"] = df.index
- exp_data = {
- "X": x.tolist() + x.tolist(),
- "A": ["a", "b", "c", "d", "e", "f"],
- "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
- "year": [1970, 1970, 1970, 1980, 1980, 1980],
- "id": [0, 1, 2, 0, 1, 2],
- }
- expected = DataFrame(exp_data)
- expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
- result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
- tm.assert_frame_equal(result, expected)
- def test_escapable_characters(self):
- np.random.seed(123)
- x = np.random.randn(3)
- df = DataFrame(
- {
- "A(quarterly)1970": {0: "a", 1: "b", 2: "c"},
- "A(quarterly)1980": {0: "d", 1: "e", 2: "f"},
- "B(quarterly)1970": {0: 2.5, 1: 1.2, 2: 0.7},
- "B(quarterly)1980": {0: 3.2, 1: 1.3, 2: 0.1},
- "X": dict(zip(range(3), x)),
- }
- )
- df["id"] = df.index
- exp_data = {
- "X": x.tolist() + x.tolist(),
- "A(quarterly)": ["a", "b", "c", "d", "e", "f"],
- "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
- "year": [1970, 1970, 1970, 1980, 1980, 1980],
- "id": [0, 1, 2, 0, 1, 2],
- }
- expected = DataFrame(exp_data)
- expected = expected.set_index(["id", "year"])[
- ["X", "A(quarterly)", "B(quarterly)"]
- ]
- result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], i="id", j="year")
- tm.assert_frame_equal(result, expected)
- def test_unbalanced(self):
- # test that we can have a varying amount of time variables
- df = DataFrame(
- {
- "A2010": [1.0, 2.0],
- "A2011": [3.0, 4.0],
- "B2010": [5.0, 6.0],
- "X": ["X1", "X2"],
- }
- )
- df["id"] = df.index
- exp_data = {
- "X": ["X1", "X2", "X1", "X2"],
- "A": [1.0, 2.0, 3.0, 4.0],
- "B": [5.0, 6.0, np.nan, np.nan],
- "id": [0, 1, 0, 1],
- "year": [2010, 2010, 2011, 2011],
- }
- expected = DataFrame(exp_data)
- expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
- result = wide_to_long(df, ["A", "B"], i="id", j="year")
- tm.assert_frame_equal(result, expected)
- def test_character_overlap(self):
- # Test we handle overlapping characters in both id_vars and value_vars
- df = DataFrame(
- {
- "A11": ["a11", "a22", "a33"],
- "A12": ["a21", "a22", "a23"],
- "B11": ["b11", "b12", "b13"],
- "B12": ["b21", "b22", "b23"],
- "BB11": [1, 2, 3],
- "BB12": [4, 5, 6],
- "BBBX": [91, 92, 93],
- "BBBZ": [91, 92, 93],
- }
- )
- df["id"] = df.index
- expected = DataFrame(
- {
- "BBBX": [91, 92, 93, 91, 92, 93],
- "BBBZ": [91, 92, 93, 91, 92, 93],
- "A": ["a11", "a22", "a33", "a21", "a22", "a23"],
- "B": ["b11", "b12", "b13", "b21", "b22", "b23"],
- "BB": [1, 2, 3, 4, 5, 6],
- "id": [0, 1, 2, 0, 1, 2],
- "year": [11, 11, 11, 12, 12, 12],
- }
- )
- expected = expected.set_index(["id", "year"])[["BBBX", "BBBZ", "A", "B", "BB"]]
- result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
- tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
- def test_invalid_separator(self):
- # if an invalid separator is supplied a empty data frame is returned
- sep = "nope!"
- df = DataFrame(
- {
- "A2010": [1.0, 2.0],
- "A2011": [3.0, 4.0],
- "B2010": [5.0, 6.0],
- "X": ["X1", "X2"],
- }
- )
- df["id"] = df.index
- exp_data = {
- "X": "",
- "A2010": [],
- "A2011": [],
- "B2010": [],
- "id": [],
- "year": [],
- "A": [],
- "B": [],
- }
- expected = DataFrame(exp_data).astype({"year": np.int64})
- expected = expected.set_index(["id", "year"])[
- ["X", "A2010", "A2011", "B2010", "A", "B"]
- ]
- expected.index = expected.index.set_levels([0, 1], level=0)
- result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep)
- tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
- def test_num_string_disambiguation(self):
- # Test that we can disambiguate number value_vars from
- # string value_vars
- df = DataFrame(
- {
- "A11": ["a11", "a22", "a33"],
- "A12": ["a21", "a22", "a23"],
- "B11": ["b11", "b12", "b13"],
- "B12": ["b21", "b22", "b23"],
- "BB11": [1, 2, 3],
- "BB12": [4, 5, 6],
- "Arating": [91, 92, 93],
- "Arating_old": [91, 92, 93],
- }
- )
- df["id"] = df.index
- expected = DataFrame(
- {
- "Arating": [91, 92, 93, 91, 92, 93],
- "Arating_old": [91, 92, 93, 91, 92, 93],
- "A": ["a11", "a22", "a33", "a21", "a22", "a23"],
- "B": ["b11", "b12", "b13", "b21", "b22", "b23"],
- "BB": [1, 2, 3, 4, 5, 6],
- "id": [0, 1, 2, 0, 1, 2],
- "year": [11, 11, 11, 12, 12, 12],
- }
- )
- expected = expected.set_index(["id", "year"])[
- ["Arating", "Arating_old", "A", "B", "BB"]
- ]
- result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
- tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
- def test_invalid_suffixtype(self):
- # If all stubs names end with a string, but a numeric suffix is
- # assumed, an empty data frame is returned
- df = DataFrame(
- {
- "Aone": [1.0, 2.0],
- "Atwo": [3.0, 4.0],
- "Bone": [5.0, 6.0],
- "X": ["X1", "X2"],
- }
- )
- df["id"] = df.index
- exp_data = {
- "X": "",
- "Aone": [],
- "Atwo": [],
- "Bone": [],
- "id": [],
- "year": [],
- "A": [],
- "B": [],
- }
- expected = DataFrame(exp_data).astype({"year": np.int64})
- expected = expected.set_index(["id", "year"])
- expected.index = expected.index.set_levels([0, 1], level=0)
- result = wide_to_long(df, ["A", "B"], i="id", j="year")
- tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
- def test_multiple_id_columns(self):
- # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
- df = DataFrame(
- {
- "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3],
- "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3],
- "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
- "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9],
- }
- )
- expected = DataFrame(
- {
- "ht": [
- 2.8,
- 3.4,
- 2.9,
- 3.8,
- 2.2,
- 2.9,
- 2.0,
- 3.2,
- 1.8,
- 2.8,
- 1.9,
- 2.4,
- 2.2,
- 3.3,
- 2.3,
- 3.4,
- 2.1,
- 2.9,
- ],
- "famid": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
- "birth": [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
- "age": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
- }
- )
- expected = expected.set_index(["famid", "birth", "age"])[["ht"]]
- result = wide_to_long(df, "ht", i=["famid", "birth"], j="age")
- tm.assert_frame_equal(result, expected)
- def test_non_unique_idvars(self):
- # GH16382
- # Raise an error message if non unique id vars (i) are passed
- df = DataFrame(
- {"A_A1": [1, 2, 3, 4, 5], "B_B1": [1, 2, 3, 4, 5], "x": [1, 1, 1, 1, 1]}
- )
- msg = "the id variables need to uniquely identify each row"
- with pytest.raises(ValueError, match=msg):
- wide_to_long(df, ["A_A", "B_B"], i="x", j="colname")
- def test_cast_j_int(self):
- df = DataFrame(
- {
- "actor_1": ["CCH Pounder", "Johnny Depp", "Christoph Waltz"],
- "actor_2": ["Joel David Moore", "Orlando Bloom", "Rory Kinnear"],
- "actor_fb_likes_1": [1000.0, 40000.0, 11000.0],
- "actor_fb_likes_2": [936.0, 5000.0, 393.0],
- "title": ["Avatar", "Pirates of the Caribbean", "Spectre"],
- }
- )
- expected = DataFrame(
- {
- "actor": [
- "CCH Pounder",
- "Johnny Depp",
- "Christoph Waltz",
- "Joel David Moore",
- "Orlando Bloom",
- "Rory Kinnear",
- ],
- "actor_fb_likes": [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
- "num": [1, 1, 1, 2, 2, 2],
- "title": [
- "Avatar",
- "Pirates of the Caribbean",
- "Spectre",
- "Avatar",
- "Pirates of the Caribbean",
- "Spectre",
- ],
- }
- ).set_index(["title", "num"])
- result = wide_to_long(
- df, ["actor", "actor_fb_likes"], i="title", j="num", sep="_"
- )
- tm.assert_frame_equal(result, expected)
- def test_identical_stubnames(self):
- df = DataFrame(
- {
- "A2010": [1.0, 2.0],
- "A2011": [3.0, 4.0],
- "B2010": [5.0, 6.0],
- "A": ["X1", "X2"],
- }
- )
- msg = "stubname can't be identical to a column name"
- with pytest.raises(ValueError, match=msg):
- wide_to_long(df, ["A", "B"], i="A", j="colname")
- def test_nonnumeric_suffix(self):
- df = DataFrame(
- {
- "treatment_placebo": [1.0, 2.0],
- "treatment_test": [3.0, 4.0],
- "result_placebo": [5.0, 6.0],
- "A": ["X1", "X2"],
- }
- )
- expected = DataFrame(
- {
- "A": ["X1", "X2", "X1", "X2"],
- "colname": ["placebo", "placebo", "test", "test"],
- "result": [5.0, 6.0, np.nan, np.nan],
- "treatment": [1.0, 2.0, 3.0, 4.0],
- }
- )
- expected = expected.set_index(["A", "colname"])
- result = wide_to_long(
- df, ["result", "treatment"], i="A", j="colname", suffix="[a-z]+", sep="_"
- )
- tm.assert_frame_equal(result, expected)
- def test_mixed_type_suffix(self):
- df = DataFrame(
- {
- "A": ["X1", "X2"],
- "result_1": [0, 9],
- "result_foo": [5.0, 6.0],
- "treatment_1": [1.0, 2.0],
- "treatment_foo": [3.0, 4.0],
- }
- )
- expected = DataFrame(
- {
- "A": ["X1", "X2", "X1", "X2"],
- "colname": ["1", "1", "foo", "foo"],
- "result": [0.0, 9.0, 5.0, 6.0],
- "treatment": [1.0, 2.0, 3.0, 4.0],
- }
- ).set_index(["A", "colname"])
- result = wide_to_long(
- df, ["result", "treatment"], i="A", j="colname", suffix=".+", sep="_"
- )
- tm.assert_frame_equal(result, expected)
- def test_float_suffix(self):
- df = DataFrame(
- {
- "treatment_1.1": [1.0, 2.0],
- "treatment_2.1": [3.0, 4.0],
- "result_1.2": [5.0, 6.0],
- "result_1": [0, 9],
- "A": ["X1", "X2"],
- }
- )
- expected = DataFrame(
- {
- "A": ["X1", "X2", "X1", "X2", "X1", "X2", "X1", "X2"],
- "colname": [1.2, 1.2, 1.0, 1.0, 1.1, 1.1, 2.1, 2.1],
- "result": [5.0, 6.0, 0.0, 9.0, np.nan, np.nan, np.nan, np.nan],
- "treatment": [np.nan, np.nan, np.nan, np.nan, 1.0, 2.0, 3.0, 4.0],
- }
- )
- expected = expected.set_index(["A", "colname"])
- result = wide_to_long(
- df, ["result", "treatment"], i="A", j="colname", suffix="[0-9.]+", sep="_"
- )
- tm.assert_frame_equal(result, expected)
- def test_col_substring_of_stubname(self):
- # GH22468
- # Don't raise ValueError when a column name is a substring
- # of a stubname that's been passed as a string
- wide_data = {
- "node_id": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
- "A": {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81},
- "PA0": {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6},
- "PA1": {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67},
- "PA3": {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67},
- }
- wide_df = DataFrame.from_dict(wide_data)
- expected = wide_to_long(wide_df, stubnames=["PA"], i=["node_id", "A"], j="time")
- result = wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time")
- tm.assert_frame_equal(result, expected)
- def test_raise_of_column_name_value(self):
- # GH34731, enforced in 2.0
- # raise a ValueError if the resultant value column name matches
- # a name in the dataframe already (default name is "value")
- df = DataFrame({"col": list("ABC"), "value": range(10, 16, 2)})
- with pytest.raises(
- ValueError, match=re.escape("value_name (value) cannot match")
- ):
- df.melt(id_vars="value", value_name="value")
- @pytest.mark.parametrize("dtype", ["O", "string"])
- def test_missing_stubname(self, dtype):
- # GH46044
- df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]})
- df = df.astype({"id": dtype})
- result = wide_to_long(
- df,
- stubnames=["a", "b"],
- i="id",
- j="num",
- sep="-",
- )
- index = pd.Index(
- [("1", 1), ("2", 1), ("1", 2), ("2", 2)],
- name=("id", "num"),
- )
- expected = DataFrame(
- {"a": [100, 200, 300, 400], "b": [np.nan] * 4},
- index=index,
- )
- new_level = expected.index.levels[0].astype(dtype)
- expected.index = expected.index.set_levels(new_level, level=0)
- tm.assert_frame_equal(result, expected)
|