test_melt.py 37 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125
  1. import re
  2. import numpy as np
  3. import pytest
  4. import pandas as pd
  5. from pandas import (
  6. DataFrame,
  7. lreshape,
  8. melt,
  9. wide_to_long,
  10. )
  11. import pandas._testing as tm
  12. @pytest.fixture
  13. def df():
  14. res = tm.makeTimeDataFrame()[:10]
  15. res["id1"] = (res["A"] > 0).astype(np.int64)
  16. res["id2"] = (res["B"] > 0).astype(np.int64)
  17. return res
  18. @pytest.fixture
  19. def df1():
  20. res = DataFrame(
  21. [
  22. [1.067683, -1.110463, 0.20867],
  23. [-1.321405, 0.368915, -1.055342],
  24. [-0.807333, 0.08298, -0.873361],
  25. ]
  26. )
  27. res.columns = [list("ABC"), list("abc")]
  28. res.columns.names = ["CAP", "low"]
  29. return res
  30. @pytest.fixture
  31. def var_name():
  32. return "var"
  33. @pytest.fixture
  34. def value_name():
  35. return "val"
  36. class TestMelt:
  37. def test_top_level_method(self, df):
  38. result = melt(df)
  39. assert result.columns.tolist() == ["variable", "value"]
  40. def test_method_signatures(self, df, df1, var_name, value_name):
  41. tm.assert_frame_equal(df.melt(), melt(df))
  42. tm.assert_frame_equal(
  43. df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"]),
  44. melt(df, id_vars=["id1", "id2"], value_vars=["A", "B"]),
  45. )
  46. tm.assert_frame_equal(
  47. df.melt(var_name=var_name, value_name=value_name),
  48. melt(df, var_name=var_name, value_name=value_name),
  49. )
  50. tm.assert_frame_equal(df1.melt(col_level=0), melt(df1, col_level=0))
  51. def test_default_col_names(self, df):
  52. result = df.melt()
  53. assert result.columns.tolist() == ["variable", "value"]
  54. result1 = df.melt(id_vars=["id1"])
  55. assert result1.columns.tolist() == ["id1", "variable", "value"]
  56. result2 = df.melt(id_vars=["id1", "id2"])
  57. assert result2.columns.tolist() == ["id1", "id2", "variable", "value"]
  58. def test_value_vars(self, df):
  59. result3 = df.melt(id_vars=["id1", "id2"], value_vars="A")
  60. assert len(result3) == 10
  61. result4 = df.melt(id_vars=["id1", "id2"], value_vars=["A", "B"])
  62. expected4 = DataFrame(
  63. {
  64. "id1": df["id1"].tolist() * 2,
  65. "id2": df["id2"].tolist() * 2,
  66. "variable": ["A"] * 10 + ["B"] * 10,
  67. "value": (df["A"].tolist() + df["B"].tolist()),
  68. },
  69. columns=["id1", "id2", "variable", "value"],
  70. )
  71. tm.assert_frame_equal(result4, expected4)
  72. @pytest.mark.parametrize("type_", (tuple, list, np.array))
  73. def test_value_vars_types(self, type_, df):
  74. # GH 15348
  75. expected = DataFrame(
  76. {
  77. "id1": df["id1"].tolist() * 2,
  78. "id2": df["id2"].tolist() * 2,
  79. "variable": ["A"] * 10 + ["B"] * 10,
  80. "value": (df["A"].tolist() + df["B"].tolist()),
  81. },
  82. columns=["id1", "id2", "variable", "value"],
  83. )
  84. result = df.melt(id_vars=["id1", "id2"], value_vars=type_(("A", "B")))
  85. tm.assert_frame_equal(result, expected)
  86. def test_vars_work_with_multiindex(self, df1):
  87. expected = DataFrame(
  88. {
  89. ("A", "a"): df1[("A", "a")],
  90. "CAP": ["B"] * len(df1),
  91. "low": ["b"] * len(df1),
  92. "value": df1[("B", "b")],
  93. },
  94. columns=[("A", "a"), "CAP", "low", "value"],
  95. )
  96. result = df1.melt(id_vars=[("A", "a")], value_vars=[("B", "b")])
  97. tm.assert_frame_equal(result, expected)
  98. @pytest.mark.parametrize(
  99. "id_vars, value_vars, col_level, expected",
  100. [
  101. (
  102. ["A"],
  103. ["B"],
  104. 0,
  105. DataFrame(
  106. {
  107. "A": {0: 1.067683, 1: -1.321405, 2: -0.807333},
  108. "CAP": {0: "B", 1: "B", 2: "B"},
  109. "value": {0: -1.110463, 1: 0.368915, 2: 0.08298},
  110. }
  111. ),
  112. ),
  113. (
  114. ["a"],
  115. ["b"],
  116. 1,
  117. DataFrame(
  118. {
  119. "a": {0: 1.067683, 1: -1.321405, 2: -0.807333},
  120. "low": {0: "b", 1: "b", 2: "b"},
  121. "value": {0: -1.110463, 1: 0.368915, 2: 0.08298},
  122. }
  123. ),
  124. ),
  125. ],
  126. )
  127. def test_single_vars_work_with_multiindex(
  128. self, id_vars, value_vars, col_level, expected, df1
  129. ):
  130. result = df1.melt(id_vars, value_vars, col_level=col_level)
  131. tm.assert_frame_equal(result, expected)
  132. @pytest.mark.parametrize(
  133. "id_vars, value_vars",
  134. [
  135. [("A", "a"), [("B", "b")]],
  136. [[("A", "a")], ("B", "b")],
  137. [("A", "a"), ("B", "b")],
  138. ],
  139. )
  140. def test_tuple_vars_fail_with_multiindex(self, id_vars, value_vars, df1):
  141. # melt should fail with an informative error message if
  142. # the columns have a MultiIndex and a tuple is passed
  143. # for id_vars or value_vars.
  144. msg = r"(id|value)_vars must be a list of tuples when columns are a MultiIndex"
  145. with pytest.raises(ValueError, match=msg):
  146. df1.melt(id_vars=id_vars, value_vars=value_vars)
  147. def test_custom_var_name(self, df, var_name):
  148. result5 = df.melt(var_name=var_name)
  149. assert result5.columns.tolist() == ["var", "value"]
  150. result6 = df.melt(id_vars=["id1"], var_name=var_name)
  151. assert result6.columns.tolist() == ["id1", "var", "value"]
  152. result7 = df.melt(id_vars=["id1", "id2"], var_name=var_name)
  153. assert result7.columns.tolist() == ["id1", "id2", "var", "value"]
  154. result8 = df.melt(id_vars=["id1", "id2"], value_vars="A", var_name=var_name)
  155. assert result8.columns.tolist() == ["id1", "id2", "var", "value"]
  156. result9 = df.melt(
  157. id_vars=["id1", "id2"], value_vars=["A", "B"], var_name=var_name
  158. )
  159. expected9 = DataFrame(
  160. {
  161. "id1": df["id1"].tolist() * 2,
  162. "id2": df["id2"].tolist() * 2,
  163. var_name: ["A"] * 10 + ["B"] * 10,
  164. "value": (df["A"].tolist() + df["B"].tolist()),
  165. },
  166. columns=["id1", "id2", var_name, "value"],
  167. )
  168. tm.assert_frame_equal(result9, expected9)
  169. def test_custom_value_name(self, df, value_name):
  170. result10 = df.melt(value_name=value_name)
  171. assert result10.columns.tolist() == ["variable", "val"]
  172. result11 = df.melt(id_vars=["id1"], value_name=value_name)
  173. assert result11.columns.tolist() == ["id1", "variable", "val"]
  174. result12 = df.melt(id_vars=["id1", "id2"], value_name=value_name)
  175. assert result12.columns.tolist() == ["id1", "id2", "variable", "val"]
  176. result13 = df.melt(
  177. id_vars=["id1", "id2"], value_vars="A", value_name=value_name
  178. )
  179. assert result13.columns.tolist() == ["id1", "id2", "variable", "val"]
  180. result14 = df.melt(
  181. id_vars=["id1", "id2"], value_vars=["A", "B"], value_name=value_name
  182. )
  183. expected14 = DataFrame(
  184. {
  185. "id1": df["id1"].tolist() * 2,
  186. "id2": df["id2"].tolist() * 2,
  187. "variable": ["A"] * 10 + ["B"] * 10,
  188. value_name: (df["A"].tolist() + df["B"].tolist()),
  189. },
  190. columns=["id1", "id2", "variable", value_name],
  191. )
  192. tm.assert_frame_equal(result14, expected14)
  193. def test_custom_var_and_value_name(self, df, value_name, var_name):
  194. result15 = df.melt(var_name=var_name, value_name=value_name)
  195. assert result15.columns.tolist() == ["var", "val"]
  196. result16 = df.melt(id_vars=["id1"], var_name=var_name, value_name=value_name)
  197. assert result16.columns.tolist() == ["id1", "var", "val"]
  198. result17 = df.melt(
  199. id_vars=["id1", "id2"], var_name=var_name, value_name=value_name
  200. )
  201. assert result17.columns.tolist() == ["id1", "id2", "var", "val"]
  202. result18 = df.melt(
  203. id_vars=["id1", "id2"],
  204. value_vars="A",
  205. var_name=var_name,
  206. value_name=value_name,
  207. )
  208. assert result18.columns.tolist() == ["id1", "id2", "var", "val"]
  209. result19 = df.melt(
  210. id_vars=["id1", "id2"],
  211. value_vars=["A", "B"],
  212. var_name=var_name,
  213. value_name=value_name,
  214. )
  215. expected19 = DataFrame(
  216. {
  217. "id1": df["id1"].tolist() * 2,
  218. "id2": df["id2"].tolist() * 2,
  219. var_name: ["A"] * 10 + ["B"] * 10,
  220. value_name: (df["A"].tolist() + df["B"].tolist()),
  221. },
  222. columns=["id1", "id2", var_name, value_name],
  223. )
  224. tm.assert_frame_equal(result19, expected19)
  225. df20 = df.copy()
  226. df20.columns.name = "foo"
  227. result20 = df20.melt()
  228. assert result20.columns.tolist() == ["foo", "value"]
  229. @pytest.mark.parametrize("col_level", [0, "CAP"])
  230. def test_col_level(self, col_level, df1):
  231. res = df1.melt(col_level=col_level)
  232. assert res.columns.tolist() == ["CAP", "value"]
  233. def test_multiindex(self, df1):
  234. res = df1.melt()
  235. assert res.columns.tolist() == ["CAP", "low", "value"]
  236. @pytest.mark.parametrize(
  237. "col",
  238. [
  239. pd.Series(pd.date_range("2010", periods=5, tz="US/Pacific")),
  240. pd.Series(["a", "b", "c", "a", "d"], dtype="category"),
  241. pd.Series([0, 1, 0, 0, 0]),
  242. ],
  243. )
  244. def test_pandas_dtypes(self, col):
  245. # GH 15785
  246. df = DataFrame(
  247. {"klass": range(5), "col": col, "attr1": [1, 0, 0, 0, 0], "attr2": col}
  248. )
  249. expected_value = pd.concat([pd.Series([1, 0, 0, 0, 0]), col], ignore_index=True)
  250. result = melt(
  251. df, id_vars=["klass", "col"], var_name="attribute", value_name="value"
  252. )
  253. expected = DataFrame(
  254. {
  255. 0: list(range(5)) * 2,
  256. 1: pd.concat([col] * 2, ignore_index=True),
  257. 2: ["attr1"] * 5 + ["attr2"] * 5,
  258. 3: expected_value,
  259. }
  260. )
  261. expected.columns = ["klass", "col", "attribute", "value"]
  262. tm.assert_frame_equal(result, expected)
  263. def test_preserve_category(self):
  264. # GH 15853
  265. data = DataFrame({"A": [1, 2], "B": pd.Categorical(["X", "Y"])})
  266. result = melt(data, ["B"], ["A"])
  267. expected = DataFrame(
  268. {"B": pd.Categorical(["X", "Y"]), "variable": ["A", "A"], "value": [1, 2]}
  269. )
  270. tm.assert_frame_equal(result, expected)
  271. def test_melt_missing_columns_raises(self):
  272. # GH-23575
  273. # This test is to ensure that pandas raises an error if melting is
  274. # attempted with column names absent from the dataframe
  275. # Generate data
  276. df = DataFrame(np.random.randn(5, 4), columns=list("abcd"))
  277. # Try to melt with missing `value_vars` column name
  278. msg = "The following '{Var}' are not present in the DataFrame: {Col}"
  279. with pytest.raises(
  280. KeyError, match=msg.format(Var="value_vars", Col="\\['C'\\]")
  281. ):
  282. df.melt(["a", "b"], ["C", "d"])
  283. # Try to melt with missing `id_vars` column name
  284. with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['A'\\]")):
  285. df.melt(["A", "b"], ["c", "d"])
  286. # Multiple missing
  287. with pytest.raises(
  288. KeyError,
  289. match=msg.format(Var="id_vars", Col="\\['not_here', 'or_there'\\]"),
  290. ):
  291. df.melt(["a", "b", "not_here", "or_there"], ["c", "d"])
  292. # Multiindex melt fails if column is missing from multilevel melt
  293. multi = df.copy()
  294. multi.columns = [list("ABCD"), list("abcd")]
  295. with pytest.raises(KeyError, match=msg.format(Var="id_vars", Col="\\['E'\\]")):
  296. multi.melt([("E", "a")], [("B", "b")])
  297. # Multiindex fails if column is missing from single level melt
  298. with pytest.raises(
  299. KeyError, match=msg.format(Var="value_vars", Col="\\['F'\\]")
  300. ):
  301. multi.melt(["A"], ["F"], col_level=0)
  302. def test_melt_mixed_int_str_id_vars(self):
  303. # GH 29718
  304. df = DataFrame({0: ["foo"], "a": ["bar"], "b": [1], "d": [2]})
  305. result = melt(df, id_vars=[0, "a"], value_vars=["b", "d"])
  306. expected = DataFrame(
  307. {0: ["foo"] * 2, "a": ["bar"] * 2, "variable": list("bd"), "value": [1, 2]}
  308. )
  309. tm.assert_frame_equal(result, expected)
  310. def test_melt_mixed_int_str_value_vars(self):
  311. # GH 29718
  312. df = DataFrame({0: ["foo"], "a": ["bar"]})
  313. result = melt(df, value_vars=[0, "a"])
  314. expected = DataFrame({"variable": [0, "a"], "value": ["foo", "bar"]})
  315. tm.assert_frame_equal(result, expected)
  316. def test_ignore_index(self):
  317. # GH 17440
  318. df = DataFrame({"foo": [0], "bar": [1]}, index=["first"])
  319. result = melt(df, ignore_index=False)
  320. expected = DataFrame(
  321. {"variable": ["foo", "bar"], "value": [0, 1]}, index=["first", "first"]
  322. )
  323. tm.assert_frame_equal(result, expected)
  324. def test_ignore_multiindex(self):
  325. # GH 17440
  326. index = pd.MultiIndex.from_tuples(
  327. [("first", "second"), ("first", "third")], names=["baz", "foobar"]
  328. )
  329. df = DataFrame({"foo": [0, 1], "bar": [2, 3]}, index=index)
  330. result = melt(df, ignore_index=False)
  331. expected_index = pd.MultiIndex.from_tuples(
  332. [("first", "second"), ("first", "third")] * 2, names=["baz", "foobar"]
  333. )
  334. expected = DataFrame(
  335. {"variable": ["foo"] * 2 + ["bar"] * 2, "value": [0, 1, 2, 3]},
  336. index=expected_index,
  337. )
  338. tm.assert_frame_equal(result, expected)
  339. def test_ignore_index_name_and_type(self):
  340. # GH 17440
  341. index = pd.Index(["foo", "bar"], dtype="category", name="baz")
  342. df = DataFrame({"x": [0, 1], "y": [2, 3]}, index=index)
  343. result = melt(df, ignore_index=False)
  344. expected_index = pd.Index(["foo", "bar"] * 2, dtype="category", name="baz")
  345. expected = DataFrame(
  346. {"variable": ["x", "x", "y", "y"], "value": [0, 1, 2, 3]},
  347. index=expected_index,
  348. )
  349. tm.assert_frame_equal(result, expected)
  350. def test_melt_with_duplicate_columns(self):
  351. # GH#41951
  352. df = DataFrame([["id", 2, 3]], columns=["a", "b", "b"])
  353. result = df.melt(id_vars=["a"], value_vars=["b"])
  354. expected = DataFrame(
  355. [["id", "b", 2], ["id", "b", 3]], columns=["a", "variable", "value"]
  356. )
  357. tm.assert_frame_equal(result, expected)
  358. @pytest.mark.parametrize("dtype", ["Int8", "Int64"])
  359. def test_melt_ea_dtype(self, dtype):
  360. # GH#41570
  361. df = DataFrame(
  362. {
  363. "a": pd.Series([1, 2], dtype="Int8"),
  364. "b": pd.Series([3, 4], dtype=dtype),
  365. }
  366. )
  367. result = df.melt()
  368. expected = DataFrame(
  369. {
  370. "variable": ["a", "a", "b", "b"],
  371. "value": pd.Series([1, 2, 3, 4], dtype=dtype),
  372. }
  373. )
  374. tm.assert_frame_equal(result, expected)
  375. class TestLreshape:
  376. def test_pairs(self):
  377. data = {
  378. "birthdt": [
  379. "08jan2009",
  380. "20dec2008",
  381. "30dec2008",
  382. "21dec2008",
  383. "11jan2009",
  384. ],
  385. "birthwt": [1766, 3301, 1454, 3139, 4133],
  386. "id": [101, 102, 103, 104, 105],
  387. "sex": ["Male", "Female", "Female", "Female", "Female"],
  388. "visitdt1": [
  389. "11jan2009",
  390. "22dec2008",
  391. "04jan2009",
  392. "29dec2008",
  393. "20jan2009",
  394. ],
  395. "visitdt2": ["21jan2009", np.nan, "22jan2009", "31dec2008", "03feb2009"],
  396. "visitdt3": ["05feb2009", np.nan, np.nan, "02jan2009", "15feb2009"],
  397. "wt1": [1823, 3338, 1549, 3298, 4306],
  398. "wt2": [2011.0, np.nan, 1892.0, 3338.0, 4575.0],
  399. "wt3": [2293.0, np.nan, np.nan, 3377.0, 4805.0],
  400. }
  401. df = DataFrame(data)
  402. spec = {
  403. "visitdt": [f"visitdt{i:d}" for i in range(1, 4)],
  404. "wt": [f"wt{i:d}" for i in range(1, 4)],
  405. }
  406. result = lreshape(df, spec)
  407. exp_data = {
  408. "birthdt": [
  409. "08jan2009",
  410. "20dec2008",
  411. "30dec2008",
  412. "21dec2008",
  413. "11jan2009",
  414. "08jan2009",
  415. "30dec2008",
  416. "21dec2008",
  417. "11jan2009",
  418. "08jan2009",
  419. "21dec2008",
  420. "11jan2009",
  421. ],
  422. "birthwt": [
  423. 1766,
  424. 3301,
  425. 1454,
  426. 3139,
  427. 4133,
  428. 1766,
  429. 1454,
  430. 3139,
  431. 4133,
  432. 1766,
  433. 3139,
  434. 4133,
  435. ],
  436. "id": [101, 102, 103, 104, 105, 101, 103, 104, 105, 101, 104, 105],
  437. "sex": [
  438. "Male",
  439. "Female",
  440. "Female",
  441. "Female",
  442. "Female",
  443. "Male",
  444. "Female",
  445. "Female",
  446. "Female",
  447. "Male",
  448. "Female",
  449. "Female",
  450. ],
  451. "visitdt": [
  452. "11jan2009",
  453. "22dec2008",
  454. "04jan2009",
  455. "29dec2008",
  456. "20jan2009",
  457. "21jan2009",
  458. "22jan2009",
  459. "31dec2008",
  460. "03feb2009",
  461. "05feb2009",
  462. "02jan2009",
  463. "15feb2009",
  464. ],
  465. "wt": [
  466. 1823.0,
  467. 3338.0,
  468. 1549.0,
  469. 3298.0,
  470. 4306.0,
  471. 2011.0,
  472. 1892.0,
  473. 3338.0,
  474. 4575.0,
  475. 2293.0,
  476. 3377.0,
  477. 4805.0,
  478. ],
  479. }
  480. exp = DataFrame(exp_data, columns=result.columns)
  481. tm.assert_frame_equal(result, exp)
  482. result = lreshape(df, spec, dropna=False)
  483. exp_data = {
  484. "birthdt": [
  485. "08jan2009",
  486. "20dec2008",
  487. "30dec2008",
  488. "21dec2008",
  489. "11jan2009",
  490. "08jan2009",
  491. "20dec2008",
  492. "30dec2008",
  493. "21dec2008",
  494. "11jan2009",
  495. "08jan2009",
  496. "20dec2008",
  497. "30dec2008",
  498. "21dec2008",
  499. "11jan2009",
  500. ],
  501. "birthwt": [
  502. 1766,
  503. 3301,
  504. 1454,
  505. 3139,
  506. 4133,
  507. 1766,
  508. 3301,
  509. 1454,
  510. 3139,
  511. 4133,
  512. 1766,
  513. 3301,
  514. 1454,
  515. 3139,
  516. 4133,
  517. ],
  518. "id": [
  519. 101,
  520. 102,
  521. 103,
  522. 104,
  523. 105,
  524. 101,
  525. 102,
  526. 103,
  527. 104,
  528. 105,
  529. 101,
  530. 102,
  531. 103,
  532. 104,
  533. 105,
  534. ],
  535. "sex": [
  536. "Male",
  537. "Female",
  538. "Female",
  539. "Female",
  540. "Female",
  541. "Male",
  542. "Female",
  543. "Female",
  544. "Female",
  545. "Female",
  546. "Male",
  547. "Female",
  548. "Female",
  549. "Female",
  550. "Female",
  551. ],
  552. "visitdt": [
  553. "11jan2009",
  554. "22dec2008",
  555. "04jan2009",
  556. "29dec2008",
  557. "20jan2009",
  558. "21jan2009",
  559. np.nan,
  560. "22jan2009",
  561. "31dec2008",
  562. "03feb2009",
  563. "05feb2009",
  564. np.nan,
  565. np.nan,
  566. "02jan2009",
  567. "15feb2009",
  568. ],
  569. "wt": [
  570. 1823.0,
  571. 3338.0,
  572. 1549.0,
  573. 3298.0,
  574. 4306.0,
  575. 2011.0,
  576. np.nan,
  577. 1892.0,
  578. 3338.0,
  579. 4575.0,
  580. 2293.0,
  581. np.nan,
  582. np.nan,
  583. 3377.0,
  584. 4805.0,
  585. ],
  586. }
  587. exp = DataFrame(exp_data, columns=result.columns)
  588. tm.assert_frame_equal(result, exp)
  589. spec = {
  590. "visitdt": [f"visitdt{i:d}" for i in range(1, 3)],
  591. "wt": [f"wt{i:d}" for i in range(1, 4)],
  592. }
  593. msg = "All column lists must be same length"
  594. with pytest.raises(ValueError, match=msg):
  595. lreshape(df, spec)
  596. class TestWideToLong:
  597. def test_simple(self):
  598. np.random.seed(123)
  599. x = np.random.randn(3)
  600. df = DataFrame(
  601. {
  602. "A1970": {0: "a", 1: "b", 2: "c"},
  603. "A1980": {0: "d", 1: "e", 2: "f"},
  604. "B1970": {0: 2.5, 1: 1.2, 2: 0.7},
  605. "B1980": {0: 3.2, 1: 1.3, 2: 0.1},
  606. "X": dict(zip(range(3), x)),
  607. }
  608. )
  609. df["id"] = df.index
  610. exp_data = {
  611. "X": x.tolist() + x.tolist(),
  612. "A": ["a", "b", "c", "d", "e", "f"],
  613. "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
  614. "year": [1970, 1970, 1970, 1980, 1980, 1980],
  615. "id": [0, 1, 2, 0, 1, 2],
  616. }
  617. expected = DataFrame(exp_data)
  618. expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
  619. result = wide_to_long(df, ["A", "B"], i="id", j="year")
  620. tm.assert_frame_equal(result, expected)
  621. def test_stubs(self):
  622. # GH9204 wide_to_long call should not modify 'stubs' list
  623. df = DataFrame([[0, 1, 2, 3, 8], [4, 5, 6, 7, 9]])
  624. df.columns = ["id", "inc1", "inc2", "edu1", "edu2"]
  625. stubs = ["inc", "edu"]
  626. wide_to_long(df, stubs, i="id", j="age")
  627. assert stubs == ["inc", "edu"]
  628. def test_separating_character(self):
  629. # GH14779
  630. np.random.seed(123)
  631. x = np.random.randn(3)
  632. df = DataFrame(
  633. {
  634. "A.1970": {0: "a", 1: "b", 2: "c"},
  635. "A.1980": {0: "d", 1: "e", 2: "f"},
  636. "B.1970": {0: 2.5, 1: 1.2, 2: 0.7},
  637. "B.1980": {0: 3.2, 1: 1.3, 2: 0.1},
  638. "X": dict(zip(range(3), x)),
  639. }
  640. )
  641. df["id"] = df.index
  642. exp_data = {
  643. "X": x.tolist() + x.tolist(),
  644. "A": ["a", "b", "c", "d", "e", "f"],
  645. "B": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
  646. "year": [1970, 1970, 1970, 1980, 1980, 1980],
  647. "id": [0, 1, 2, 0, 1, 2],
  648. }
  649. expected = DataFrame(exp_data)
  650. expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
  651. result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=".")
  652. tm.assert_frame_equal(result, expected)
  653. def test_escapable_characters(self):
  654. np.random.seed(123)
  655. x = np.random.randn(3)
  656. df = DataFrame(
  657. {
  658. "A(quarterly)1970": {0: "a", 1: "b", 2: "c"},
  659. "A(quarterly)1980": {0: "d", 1: "e", 2: "f"},
  660. "B(quarterly)1970": {0: 2.5, 1: 1.2, 2: 0.7},
  661. "B(quarterly)1980": {0: 3.2, 1: 1.3, 2: 0.1},
  662. "X": dict(zip(range(3), x)),
  663. }
  664. )
  665. df["id"] = df.index
  666. exp_data = {
  667. "X": x.tolist() + x.tolist(),
  668. "A(quarterly)": ["a", "b", "c", "d", "e", "f"],
  669. "B(quarterly)": [2.5, 1.2, 0.7, 3.2, 1.3, 0.1],
  670. "year": [1970, 1970, 1970, 1980, 1980, 1980],
  671. "id": [0, 1, 2, 0, 1, 2],
  672. }
  673. expected = DataFrame(exp_data)
  674. expected = expected.set_index(["id", "year"])[
  675. ["X", "A(quarterly)", "B(quarterly)"]
  676. ]
  677. result = wide_to_long(df, ["A(quarterly)", "B(quarterly)"], i="id", j="year")
  678. tm.assert_frame_equal(result, expected)
  679. def test_unbalanced(self):
  680. # test that we can have a varying amount of time variables
  681. df = DataFrame(
  682. {
  683. "A2010": [1.0, 2.0],
  684. "A2011": [3.0, 4.0],
  685. "B2010": [5.0, 6.0],
  686. "X": ["X1", "X2"],
  687. }
  688. )
  689. df["id"] = df.index
  690. exp_data = {
  691. "X": ["X1", "X2", "X1", "X2"],
  692. "A": [1.0, 2.0, 3.0, 4.0],
  693. "B": [5.0, 6.0, np.nan, np.nan],
  694. "id": [0, 1, 0, 1],
  695. "year": [2010, 2010, 2011, 2011],
  696. }
  697. expected = DataFrame(exp_data)
  698. expected = expected.set_index(["id", "year"])[["X", "A", "B"]]
  699. result = wide_to_long(df, ["A", "B"], i="id", j="year")
  700. tm.assert_frame_equal(result, expected)
  701. def test_character_overlap(self):
  702. # Test we handle overlapping characters in both id_vars and value_vars
  703. df = DataFrame(
  704. {
  705. "A11": ["a11", "a22", "a33"],
  706. "A12": ["a21", "a22", "a23"],
  707. "B11": ["b11", "b12", "b13"],
  708. "B12": ["b21", "b22", "b23"],
  709. "BB11": [1, 2, 3],
  710. "BB12": [4, 5, 6],
  711. "BBBX": [91, 92, 93],
  712. "BBBZ": [91, 92, 93],
  713. }
  714. )
  715. df["id"] = df.index
  716. expected = DataFrame(
  717. {
  718. "BBBX": [91, 92, 93, 91, 92, 93],
  719. "BBBZ": [91, 92, 93, 91, 92, 93],
  720. "A": ["a11", "a22", "a33", "a21", "a22", "a23"],
  721. "B": ["b11", "b12", "b13", "b21", "b22", "b23"],
  722. "BB": [1, 2, 3, 4, 5, 6],
  723. "id": [0, 1, 2, 0, 1, 2],
  724. "year": [11, 11, 11, 12, 12, 12],
  725. }
  726. )
  727. expected = expected.set_index(["id", "year"])[["BBBX", "BBBZ", "A", "B", "BB"]]
  728. result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
  729. tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
  730. def test_invalid_separator(self):
  731. # if an invalid separator is supplied a empty data frame is returned
  732. sep = "nope!"
  733. df = DataFrame(
  734. {
  735. "A2010": [1.0, 2.0],
  736. "A2011": [3.0, 4.0],
  737. "B2010": [5.0, 6.0],
  738. "X": ["X1", "X2"],
  739. }
  740. )
  741. df["id"] = df.index
  742. exp_data = {
  743. "X": "",
  744. "A2010": [],
  745. "A2011": [],
  746. "B2010": [],
  747. "id": [],
  748. "year": [],
  749. "A": [],
  750. "B": [],
  751. }
  752. expected = DataFrame(exp_data).astype({"year": np.int64})
  753. expected = expected.set_index(["id", "year"])[
  754. ["X", "A2010", "A2011", "B2010", "A", "B"]
  755. ]
  756. expected.index = expected.index.set_levels([0, 1], level=0)
  757. result = wide_to_long(df, ["A", "B"], i="id", j="year", sep=sep)
  758. tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
  759. def test_num_string_disambiguation(self):
  760. # Test that we can disambiguate number value_vars from
  761. # string value_vars
  762. df = DataFrame(
  763. {
  764. "A11": ["a11", "a22", "a33"],
  765. "A12": ["a21", "a22", "a23"],
  766. "B11": ["b11", "b12", "b13"],
  767. "B12": ["b21", "b22", "b23"],
  768. "BB11": [1, 2, 3],
  769. "BB12": [4, 5, 6],
  770. "Arating": [91, 92, 93],
  771. "Arating_old": [91, 92, 93],
  772. }
  773. )
  774. df["id"] = df.index
  775. expected = DataFrame(
  776. {
  777. "Arating": [91, 92, 93, 91, 92, 93],
  778. "Arating_old": [91, 92, 93, 91, 92, 93],
  779. "A": ["a11", "a22", "a33", "a21", "a22", "a23"],
  780. "B": ["b11", "b12", "b13", "b21", "b22", "b23"],
  781. "BB": [1, 2, 3, 4, 5, 6],
  782. "id": [0, 1, 2, 0, 1, 2],
  783. "year": [11, 11, 11, 12, 12, 12],
  784. }
  785. )
  786. expected = expected.set_index(["id", "year"])[
  787. ["Arating", "Arating_old", "A", "B", "BB"]
  788. ]
  789. result = wide_to_long(df, ["A", "B", "BB"], i="id", j="year")
  790. tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
  791. def test_invalid_suffixtype(self):
  792. # If all stubs names end with a string, but a numeric suffix is
  793. # assumed, an empty data frame is returned
  794. df = DataFrame(
  795. {
  796. "Aone": [1.0, 2.0],
  797. "Atwo": [3.0, 4.0],
  798. "Bone": [5.0, 6.0],
  799. "X": ["X1", "X2"],
  800. }
  801. )
  802. df["id"] = df.index
  803. exp_data = {
  804. "X": "",
  805. "Aone": [],
  806. "Atwo": [],
  807. "Bone": [],
  808. "id": [],
  809. "year": [],
  810. "A": [],
  811. "B": [],
  812. }
  813. expected = DataFrame(exp_data).astype({"year": np.int64})
  814. expected = expected.set_index(["id", "year"])
  815. expected.index = expected.index.set_levels([0, 1], level=0)
  816. result = wide_to_long(df, ["A", "B"], i="id", j="year")
  817. tm.assert_frame_equal(result.sort_index(axis=1), expected.sort_index(axis=1))
  818. def test_multiple_id_columns(self):
  819. # Taken from http://www.ats.ucla.edu/stat/stata/modules/reshapel.htm
  820. df = DataFrame(
  821. {
  822. "famid": [1, 1, 1, 2, 2, 2, 3, 3, 3],
  823. "birth": [1, 2, 3, 1, 2, 3, 1, 2, 3],
  824. "ht1": [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
  825. "ht2": [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9],
  826. }
  827. )
  828. expected = DataFrame(
  829. {
  830. "ht": [
  831. 2.8,
  832. 3.4,
  833. 2.9,
  834. 3.8,
  835. 2.2,
  836. 2.9,
  837. 2.0,
  838. 3.2,
  839. 1.8,
  840. 2.8,
  841. 1.9,
  842. 2.4,
  843. 2.2,
  844. 3.3,
  845. 2.3,
  846. 3.4,
  847. 2.1,
  848. 2.9,
  849. ],
  850. "famid": [1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3],
  851. "birth": [1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3],
  852. "age": [1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2],
  853. }
  854. )
  855. expected = expected.set_index(["famid", "birth", "age"])[["ht"]]
  856. result = wide_to_long(df, "ht", i=["famid", "birth"], j="age")
  857. tm.assert_frame_equal(result, expected)
  858. def test_non_unique_idvars(self):
  859. # GH16382
  860. # Raise an error message if non unique id vars (i) are passed
  861. df = DataFrame(
  862. {"A_A1": [1, 2, 3, 4, 5], "B_B1": [1, 2, 3, 4, 5], "x": [1, 1, 1, 1, 1]}
  863. )
  864. msg = "the id variables need to uniquely identify each row"
  865. with pytest.raises(ValueError, match=msg):
  866. wide_to_long(df, ["A_A", "B_B"], i="x", j="colname")
  867. def test_cast_j_int(self):
  868. df = DataFrame(
  869. {
  870. "actor_1": ["CCH Pounder", "Johnny Depp", "Christoph Waltz"],
  871. "actor_2": ["Joel David Moore", "Orlando Bloom", "Rory Kinnear"],
  872. "actor_fb_likes_1": [1000.0, 40000.0, 11000.0],
  873. "actor_fb_likes_2": [936.0, 5000.0, 393.0],
  874. "title": ["Avatar", "Pirates of the Caribbean", "Spectre"],
  875. }
  876. )
  877. expected = DataFrame(
  878. {
  879. "actor": [
  880. "CCH Pounder",
  881. "Johnny Depp",
  882. "Christoph Waltz",
  883. "Joel David Moore",
  884. "Orlando Bloom",
  885. "Rory Kinnear",
  886. ],
  887. "actor_fb_likes": [1000.0, 40000.0, 11000.0, 936.0, 5000.0, 393.0],
  888. "num": [1, 1, 1, 2, 2, 2],
  889. "title": [
  890. "Avatar",
  891. "Pirates of the Caribbean",
  892. "Spectre",
  893. "Avatar",
  894. "Pirates of the Caribbean",
  895. "Spectre",
  896. ],
  897. }
  898. ).set_index(["title", "num"])
  899. result = wide_to_long(
  900. df, ["actor", "actor_fb_likes"], i="title", j="num", sep="_"
  901. )
  902. tm.assert_frame_equal(result, expected)
  903. def test_identical_stubnames(self):
  904. df = DataFrame(
  905. {
  906. "A2010": [1.0, 2.0],
  907. "A2011": [3.0, 4.0],
  908. "B2010": [5.0, 6.0],
  909. "A": ["X1", "X2"],
  910. }
  911. )
  912. msg = "stubname can't be identical to a column name"
  913. with pytest.raises(ValueError, match=msg):
  914. wide_to_long(df, ["A", "B"], i="A", j="colname")
  915. def test_nonnumeric_suffix(self):
  916. df = DataFrame(
  917. {
  918. "treatment_placebo": [1.0, 2.0],
  919. "treatment_test": [3.0, 4.0],
  920. "result_placebo": [5.0, 6.0],
  921. "A": ["X1", "X2"],
  922. }
  923. )
  924. expected = DataFrame(
  925. {
  926. "A": ["X1", "X2", "X1", "X2"],
  927. "colname": ["placebo", "placebo", "test", "test"],
  928. "result": [5.0, 6.0, np.nan, np.nan],
  929. "treatment": [1.0, 2.0, 3.0, 4.0],
  930. }
  931. )
  932. expected = expected.set_index(["A", "colname"])
  933. result = wide_to_long(
  934. df, ["result", "treatment"], i="A", j="colname", suffix="[a-z]+", sep="_"
  935. )
  936. tm.assert_frame_equal(result, expected)
  937. def test_mixed_type_suffix(self):
  938. df = DataFrame(
  939. {
  940. "A": ["X1", "X2"],
  941. "result_1": [0, 9],
  942. "result_foo": [5.0, 6.0],
  943. "treatment_1": [1.0, 2.0],
  944. "treatment_foo": [3.0, 4.0],
  945. }
  946. )
  947. expected = DataFrame(
  948. {
  949. "A": ["X1", "X2", "X1", "X2"],
  950. "colname": ["1", "1", "foo", "foo"],
  951. "result": [0.0, 9.0, 5.0, 6.0],
  952. "treatment": [1.0, 2.0, 3.0, 4.0],
  953. }
  954. ).set_index(["A", "colname"])
  955. result = wide_to_long(
  956. df, ["result", "treatment"], i="A", j="colname", suffix=".+", sep="_"
  957. )
  958. tm.assert_frame_equal(result, expected)
  959. def test_float_suffix(self):
  960. df = DataFrame(
  961. {
  962. "treatment_1.1": [1.0, 2.0],
  963. "treatment_2.1": [3.0, 4.0],
  964. "result_1.2": [5.0, 6.0],
  965. "result_1": [0, 9],
  966. "A": ["X1", "X2"],
  967. }
  968. )
  969. expected = DataFrame(
  970. {
  971. "A": ["X1", "X2", "X1", "X2", "X1", "X2", "X1", "X2"],
  972. "colname": [1.2, 1.2, 1.0, 1.0, 1.1, 1.1, 2.1, 2.1],
  973. "result": [5.0, 6.0, 0.0, 9.0, np.nan, np.nan, np.nan, np.nan],
  974. "treatment": [np.nan, np.nan, np.nan, np.nan, 1.0, 2.0, 3.0, 4.0],
  975. }
  976. )
  977. expected = expected.set_index(["A", "colname"])
  978. result = wide_to_long(
  979. df, ["result", "treatment"], i="A", j="colname", suffix="[0-9.]+", sep="_"
  980. )
  981. tm.assert_frame_equal(result, expected)
  982. def test_col_substring_of_stubname(self):
  983. # GH22468
  984. # Don't raise ValueError when a column name is a substring
  985. # of a stubname that's been passed as a string
  986. wide_data = {
  987. "node_id": {0: 0, 1: 1, 2: 2, 3: 3, 4: 4},
  988. "A": {0: 0.80, 1: 0.0, 2: 0.25, 3: 1.0, 4: 0.81},
  989. "PA0": {0: 0.74, 1: 0.56, 2: 0.56, 3: 0.98, 4: 0.6},
  990. "PA1": {0: 0.77, 1: 0.64, 2: 0.52, 3: 0.98, 4: 0.67},
  991. "PA3": {0: 0.34, 1: 0.70, 2: 0.52, 3: 0.98, 4: 0.67},
  992. }
  993. wide_df = DataFrame.from_dict(wide_data)
  994. expected = wide_to_long(wide_df, stubnames=["PA"], i=["node_id", "A"], j="time")
  995. result = wide_to_long(wide_df, stubnames="PA", i=["node_id", "A"], j="time")
  996. tm.assert_frame_equal(result, expected)
  997. def test_raise_of_column_name_value(self):
  998. # GH34731, enforced in 2.0
  999. # raise a ValueError if the resultant value column name matches
  1000. # a name in the dataframe already (default name is "value")
  1001. df = DataFrame({"col": list("ABC"), "value": range(10, 16, 2)})
  1002. with pytest.raises(
  1003. ValueError, match=re.escape("value_name (value) cannot match")
  1004. ):
  1005. df.melt(id_vars="value", value_name="value")
  1006. @pytest.mark.parametrize("dtype", ["O", "string"])
  1007. def test_missing_stubname(self, dtype):
  1008. # GH46044
  1009. df = DataFrame({"id": ["1", "2"], "a-1": [100, 200], "a-2": [300, 400]})
  1010. df = df.astype({"id": dtype})
  1011. result = wide_to_long(
  1012. df,
  1013. stubnames=["a", "b"],
  1014. i="id",
  1015. j="num",
  1016. sep="-",
  1017. )
  1018. index = pd.Index(
  1019. [("1", 1), ("2", 1), ("1", 2), ("2", 2)],
  1020. name=("id", "num"),
  1021. )
  1022. expected = DataFrame(
  1023. {"a": [100, 200, 300, 400], "b": [np.nan] * 4},
  1024. index=index,
  1025. )
  1026. new_level = expected.index.levels[0].astype(dtype)
  1027. expected.index = expected.index.set_levels(new_level, level=0)
  1028. tm.assert_frame_equal(result, expected)