# test_stack_unstack.py (76 KB)
from datetime import datetime
from io import StringIO
import itertools

import numpy as np
import pytest

from pandas.errors import PerformanceWarning

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Period,
    Series,
    Timedelta,
    date_range,
)
import pandas._testing as tm
from pandas.core.reshape import reshape as reshape_lib


  19. class TestDataFrameReshape:
  20. def test_stack_unstack(self, float_frame):
  21. df = float_frame.copy()
  22. df[:] = np.arange(np.prod(df.shape)).reshape(df.shape)
  23. stacked = df.stack()
  24. stacked_df = DataFrame({"foo": stacked, "bar": stacked})
  25. unstacked = stacked.unstack()
  26. unstacked_df = stacked_df.unstack()
  27. tm.assert_frame_equal(unstacked, df)
  28. tm.assert_frame_equal(unstacked_df["bar"], df)
  29. unstacked_cols = stacked.unstack(0)
  30. unstacked_cols_df = stacked_df.unstack(0)
  31. tm.assert_frame_equal(unstacked_cols.T, df)
  32. tm.assert_frame_equal(unstacked_cols_df["bar"].T, df)
  33. def test_stack_mixed_level(self):
  34. # GH 18310
  35. levels = [range(3), [3, "a", "b"], [1, 2]]
  36. # flat columns:
  37. df = DataFrame(1, index=levels[0], columns=levels[1])
  38. result = df.stack()
  39. expected = Series(1, index=MultiIndex.from_product(levels[:2]))
  40. tm.assert_series_equal(result, expected)
  41. # MultiIndex columns:
  42. df = DataFrame(1, index=levels[0], columns=MultiIndex.from_product(levels[1:]))
  43. result = df.stack(1)
  44. expected = DataFrame(
  45. 1, index=MultiIndex.from_product([levels[0], levels[2]]), columns=levels[1]
  46. )
  47. tm.assert_frame_equal(result, expected)
  48. # as above, but used labels in level are actually of homogeneous type
  49. result = df[["a", "b"]].stack(1)
  50. expected = expected[["a", "b"]]
  51. tm.assert_frame_equal(result, expected)
  52. def test_unstack_not_consolidated(self, using_array_manager):
  53. # Gh#34708
  54. df = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]})
  55. df2 = df[["x"]]
  56. df2["y"] = df["y"]
  57. if not using_array_manager:
  58. assert len(df2._mgr.blocks) == 2
  59. res = df2.unstack()
  60. expected = df.unstack()
  61. tm.assert_series_equal(res, expected)
  62. def test_unstack_fill(self):
  63. # GH #9746: fill_value keyword argument for Series
  64. # and DataFrame unstack
  65. # From a series
  66. data = Series([1, 2, 4, 5], dtype=np.int16)
  67. data.index = MultiIndex.from_tuples(
  68. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  69. )
  70. result = data.unstack(fill_value=-1)
  71. expected = DataFrame(
  72. {"a": [1, -1, 5], "b": [2, 4, -1]}, index=["x", "y", "z"], dtype=np.int16
  73. )
  74. tm.assert_frame_equal(result, expected)
  75. # From a series with incorrect data type for fill_value
  76. result = data.unstack(fill_value=0.5)
  77. expected = DataFrame(
  78. {"a": [1, 0.5, 5], "b": [2, 4, 0.5]}, index=["x", "y", "z"], dtype=float
  79. )
  80. tm.assert_frame_equal(result, expected)
  81. # GH #13971: fill_value when unstacking multiple levels:
  82. df = DataFrame(
  83. {"x": ["a", "a", "b"], "y": ["j", "k", "j"], "z": [0, 1, 2], "w": [0, 1, 2]}
  84. ).set_index(["x", "y", "z"])
  85. unstacked = df.unstack(["x", "y"], fill_value=0)
  86. key = ("w", "b", "j")
  87. expected = unstacked[key]
  88. result = Series([0, 0, 2], index=unstacked.index, name=key)
  89. tm.assert_series_equal(result, expected)
  90. stacked = unstacked.stack(["x", "y"])
  91. stacked.index = stacked.index.reorder_levels(df.index.names)
  92. # Workaround for GH #17886 (unnecessarily casts to float):
  93. stacked = stacked.astype(np.int64)
  94. result = stacked.loc[df.index]
  95. tm.assert_frame_equal(result, df)
  96. # From a series
  97. s = df["w"]
  98. result = s.unstack(["x", "y"], fill_value=0)
  99. expected = unstacked["w"]
  100. tm.assert_frame_equal(result, expected)
  101. def test_unstack_fill_frame(self):
  102. # From a dataframe
  103. rows = [[1, 2], [3, 4], [5, 6], [7, 8]]
  104. df = DataFrame(rows, columns=list("AB"), dtype=np.int32)
  105. df.index = MultiIndex.from_tuples(
  106. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  107. )
  108. result = df.unstack(fill_value=-1)
  109. rows = [[1, 3, 2, 4], [-1, 5, -1, 6], [7, -1, 8, -1]]
  110. expected = DataFrame(rows, index=list("xyz"), dtype=np.int32)
  111. expected.columns = MultiIndex.from_tuples(
  112. [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
  113. )
  114. tm.assert_frame_equal(result, expected)
  115. # From a mixed type dataframe
  116. df["A"] = df["A"].astype(np.int16)
  117. df["B"] = df["B"].astype(np.float64)
  118. result = df.unstack(fill_value=-1)
  119. expected["A"] = expected["A"].astype(np.int16)
  120. expected["B"] = expected["B"].astype(np.float64)
  121. tm.assert_frame_equal(result, expected)
  122. # From a dataframe with incorrect data type for fill_value
  123. result = df.unstack(fill_value=0.5)
  124. rows = [[1, 3, 2, 4], [0.5, 5, 0.5, 6], [7, 0.5, 8, 0.5]]
  125. expected = DataFrame(rows, index=list("xyz"), dtype=float)
  126. expected.columns = MultiIndex.from_tuples(
  127. [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")]
  128. )
  129. tm.assert_frame_equal(result, expected)
  130. def test_unstack_fill_frame_datetime(self):
  131. # Test unstacking with date times
  132. dv = date_range("2012-01-01", periods=4).values
  133. data = Series(dv)
  134. data.index = MultiIndex.from_tuples(
  135. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  136. )
  137. result = data.unstack()
  138. expected = DataFrame(
  139. {"a": [dv[0], pd.NaT, dv[3]], "b": [dv[1], dv[2], pd.NaT]},
  140. index=["x", "y", "z"],
  141. )
  142. tm.assert_frame_equal(result, expected)
  143. result = data.unstack(fill_value=dv[0])
  144. expected = DataFrame(
  145. {"a": [dv[0], dv[0], dv[3]], "b": [dv[1], dv[2], dv[0]]},
  146. index=["x", "y", "z"],
  147. )
  148. tm.assert_frame_equal(result, expected)
  149. def test_unstack_fill_frame_timedelta(self):
  150. # Test unstacking with time deltas
  151. td = [Timedelta(days=i) for i in range(4)]
  152. data = Series(td)
  153. data.index = MultiIndex.from_tuples(
  154. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  155. )
  156. result = data.unstack()
  157. expected = DataFrame(
  158. {"a": [td[0], pd.NaT, td[3]], "b": [td[1], td[2], pd.NaT]},
  159. index=["x", "y", "z"],
  160. )
  161. tm.assert_frame_equal(result, expected)
  162. result = data.unstack(fill_value=td[1])
  163. expected = DataFrame(
  164. {"a": [td[0], td[1], td[3]], "b": [td[1], td[2], td[1]]},
  165. index=["x", "y", "z"],
  166. )
  167. tm.assert_frame_equal(result, expected)
  168. def test_unstack_fill_frame_period(self):
  169. # Test unstacking with period
  170. periods = [
  171. Period("2012-01"),
  172. Period("2012-02"),
  173. Period("2012-03"),
  174. Period("2012-04"),
  175. ]
  176. data = Series(periods)
  177. data.index = MultiIndex.from_tuples(
  178. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  179. )
  180. result = data.unstack()
  181. expected = DataFrame(
  182. {"a": [periods[0], None, periods[3]], "b": [periods[1], periods[2], None]},
  183. index=["x", "y", "z"],
  184. )
  185. tm.assert_frame_equal(result, expected)
  186. result = data.unstack(fill_value=periods[1])
  187. expected = DataFrame(
  188. {
  189. "a": [periods[0], periods[1], periods[3]],
  190. "b": [periods[1], periods[2], periods[1]],
  191. },
  192. index=["x", "y", "z"],
  193. )
  194. tm.assert_frame_equal(result, expected)
  195. def test_unstack_fill_frame_categorical(self):
  196. # Test unstacking with categorical
  197. data = Series(["a", "b", "c", "a"], dtype="category")
  198. data.index = MultiIndex.from_tuples(
  199. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  200. )
  201. # By default missing values will be NaN
  202. result = data.unstack()
  203. expected = DataFrame(
  204. {
  205. "a": pd.Categorical(list("axa"), categories=list("abc")),
  206. "b": pd.Categorical(list("bcx"), categories=list("abc")),
  207. },
  208. index=list("xyz"),
  209. )
  210. tm.assert_frame_equal(result, expected)
  211. # Fill with non-category results in a ValueError
  212. msg = r"Cannot setitem on a Categorical with a new category \(d\)"
  213. with pytest.raises(TypeError, match=msg):
  214. data.unstack(fill_value="d")
  215. # Fill with category value replaces missing values as expected
  216. result = data.unstack(fill_value="c")
  217. expected = DataFrame(
  218. {
  219. "a": pd.Categorical(list("aca"), categories=list("abc")),
  220. "b": pd.Categorical(list("bcc"), categories=list("abc")),
  221. },
  222. index=list("xyz"),
  223. )
  224. tm.assert_frame_equal(result, expected)
  225. def test_unstack_tuplename_in_multiindex(self):
  226. # GH 19966
  227. idx = MultiIndex.from_product(
  228. [["a", "b", "c"], [1, 2, 3]], names=[("A", "a"), ("B", "b")]
  229. )
  230. df = DataFrame({"d": [1] * 9, "e": [2] * 9}, index=idx)
  231. result = df.unstack(("A", "a"))
  232. expected = DataFrame(
  233. [[1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2], [1, 1, 1, 2, 2, 2]],
  234. columns=MultiIndex.from_tuples(
  235. [
  236. ("d", "a"),
  237. ("d", "b"),
  238. ("d", "c"),
  239. ("e", "a"),
  240. ("e", "b"),
  241. ("e", "c"),
  242. ],
  243. names=[None, ("A", "a")],
  244. ),
  245. index=Index([1, 2, 3], name=("B", "b")),
  246. )
  247. tm.assert_frame_equal(result, expected)
  248. @pytest.mark.parametrize(
  249. "unstack_idx, expected_values, expected_index, expected_columns",
  250. [
  251. (
  252. ("A", "a"),
  253. [[1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2], [1, 1, 2, 2]],
  254. MultiIndex.from_tuples(
  255. [(1, 3), (1, 4), (2, 3), (2, 4)], names=["B", "C"]
  256. ),
  257. MultiIndex.from_tuples(
  258. [("d", "a"), ("d", "b"), ("e", "a"), ("e", "b")],
  259. names=[None, ("A", "a")],
  260. ),
  261. ),
  262. (
  263. (("A", "a"), "B"),
  264. [[1, 1, 1, 1, 2, 2, 2, 2], [1, 1, 1, 1, 2, 2, 2, 2]],
  265. Index([3, 4], name="C"),
  266. MultiIndex.from_tuples(
  267. [
  268. ("d", "a", 1),
  269. ("d", "a", 2),
  270. ("d", "b", 1),
  271. ("d", "b", 2),
  272. ("e", "a", 1),
  273. ("e", "a", 2),
  274. ("e", "b", 1),
  275. ("e", "b", 2),
  276. ],
  277. names=[None, ("A", "a"), "B"],
  278. ),
  279. ),
  280. ],
  281. )
  282. def test_unstack_mixed_type_name_in_multiindex(
  283. self, unstack_idx, expected_values, expected_index, expected_columns
  284. ):
  285. # GH 19966
  286. idx = MultiIndex.from_product(
  287. [["a", "b"], [1, 2], [3, 4]], names=[("A", "a"), "B", "C"]
  288. )
  289. df = DataFrame({"d": [1] * 8, "e": [2] * 8}, index=idx)
  290. result = df.unstack(unstack_idx)
  291. expected = DataFrame(
  292. expected_values, columns=expected_columns, index=expected_index
  293. )
  294. tm.assert_frame_equal(result, expected)
  295. def test_unstack_preserve_dtypes(self):
  296. # Checks fix for #11847
  297. df = DataFrame(
  298. {
  299. "state": ["IL", "MI", "NC"],
  300. "index": ["a", "b", "c"],
  301. "some_categories": Series(["a", "b", "c"]).astype("category"),
  302. "A": np.random.rand(3),
  303. "B": 1,
  304. "C": "foo",
  305. "D": pd.Timestamp("20010102"),
  306. "E": Series([1.0, 50.0, 100.0]).astype("float32"),
  307. "F": Series([3.0, 4.0, 5.0]).astype("float64"),
  308. "G": False,
  309. "H": Series([1, 200, 923442]).astype("int8"),
  310. }
  311. )
  312. def unstack_and_compare(df, column_name):
  313. unstacked1 = df.unstack([column_name])
  314. unstacked2 = df.unstack(column_name)
  315. tm.assert_frame_equal(unstacked1, unstacked2)
  316. df1 = df.set_index(["state", "index"])
  317. unstack_and_compare(df1, "index")
  318. df1 = df.set_index(["state", "some_categories"])
  319. unstack_and_compare(df1, "some_categories")
  320. df1 = df.set_index(["F", "C"])
  321. unstack_and_compare(df1, "F")
  322. df1 = df.set_index(["G", "B", "state"])
  323. unstack_and_compare(df1, "B")
  324. df1 = df.set_index(["E", "A"])
  325. unstack_and_compare(df1, "E")
  326. df1 = df.set_index(["state", "index"])
  327. s = df1["A"]
  328. unstack_and_compare(s, "index")
  329. def test_stack_ints(self):
  330. columns = MultiIndex.from_tuples(list(itertools.product(range(3), repeat=3)))
  331. df = DataFrame(np.random.randn(30, 27), columns=columns)
  332. tm.assert_frame_equal(df.stack(level=[1, 2]), df.stack(level=1).stack(level=1))
  333. tm.assert_frame_equal(
  334. df.stack(level=[-2, -1]), df.stack(level=1).stack(level=1)
  335. )
  336. df_named = df.copy()
  337. return_value = df_named.columns.set_names(range(3), inplace=True)
  338. assert return_value is None
  339. tm.assert_frame_equal(
  340. df_named.stack(level=[1, 2]), df_named.stack(level=1).stack(level=1)
  341. )
  342. def test_stack_mixed_levels(self):
  343. columns = MultiIndex.from_tuples(
  344. [
  345. ("A", "cat", "long"),
  346. ("B", "cat", "long"),
  347. ("A", "dog", "short"),
  348. ("B", "dog", "short"),
  349. ],
  350. names=["exp", "animal", "hair_length"],
  351. )
  352. df = DataFrame(np.random.randn(4, 4), columns=columns)
  353. animal_hair_stacked = df.stack(level=["animal", "hair_length"])
  354. exp_hair_stacked = df.stack(level=["exp", "hair_length"])
  355. # GH #8584: Need to check that stacking works when a number
  356. # is passed that is both a level name and in the range of
  357. # the level numbers
  358. df2 = df.copy()
  359. df2.columns.names = ["exp", "animal", 1]
  360. tm.assert_frame_equal(
  361. df2.stack(level=["animal", 1]), animal_hair_stacked, check_names=False
  362. )
  363. tm.assert_frame_equal(
  364. df2.stack(level=["exp", 1]), exp_hair_stacked, check_names=False
  365. )
  366. # When mixed types are passed and the ints are not level
  367. # names, raise
  368. msg = (
  369. "level should contain all level names or all level numbers, not "
  370. "a mixture of the two"
  371. )
  372. with pytest.raises(ValueError, match=msg):
  373. df2.stack(level=["animal", 0])
  374. # GH #8584: Having 0 in the level names could raise a
  375. # strange error about lexsort depth
  376. df3 = df.copy()
  377. df3.columns.names = ["exp", "animal", 0]
  378. tm.assert_frame_equal(
  379. df3.stack(level=["animal", 0]), animal_hair_stacked, check_names=False
  380. )
  381. def test_stack_int_level_names(self):
  382. columns = MultiIndex.from_tuples(
  383. [
  384. ("A", "cat", "long"),
  385. ("B", "cat", "long"),
  386. ("A", "dog", "short"),
  387. ("B", "dog", "short"),
  388. ],
  389. names=["exp", "animal", "hair_length"],
  390. )
  391. df = DataFrame(np.random.randn(4, 4), columns=columns)
  392. exp_animal_stacked = df.stack(level=["exp", "animal"])
  393. animal_hair_stacked = df.stack(level=["animal", "hair_length"])
  394. exp_hair_stacked = df.stack(level=["exp", "hair_length"])
  395. df2 = df.copy()
  396. df2.columns.names = [0, 1, 2]
  397. tm.assert_frame_equal(
  398. df2.stack(level=[1, 2]), animal_hair_stacked, check_names=False
  399. )
  400. tm.assert_frame_equal(
  401. df2.stack(level=[0, 1]), exp_animal_stacked, check_names=False
  402. )
  403. tm.assert_frame_equal(
  404. df2.stack(level=[0, 2]), exp_hair_stacked, check_names=False
  405. )
  406. # Out-of-order int column names
  407. df3 = df.copy()
  408. df3.columns.names = [2, 0, 1]
  409. tm.assert_frame_equal(
  410. df3.stack(level=[0, 1]), animal_hair_stacked, check_names=False
  411. )
  412. tm.assert_frame_equal(
  413. df3.stack(level=[2, 0]), exp_animal_stacked, check_names=False
  414. )
  415. tm.assert_frame_equal(
  416. df3.stack(level=[2, 1]), exp_hair_stacked, check_names=False
  417. )
  418. def test_unstack_bool(self):
  419. df = DataFrame(
  420. [False, False],
  421. index=MultiIndex.from_arrays([["a", "b"], ["c", "l"]]),
  422. columns=["col"],
  423. )
  424. rs = df.unstack()
  425. xp = DataFrame(
  426. np.array([[False, np.nan], [np.nan, False]], dtype=object),
  427. index=["a", "b"],
  428. columns=MultiIndex.from_arrays([["col", "col"], ["c", "l"]]),
  429. )
  430. tm.assert_frame_equal(rs, xp)
  431. def test_unstack_level_binding(self):
  432. # GH9856
  433. mi = MultiIndex(
  434. levels=[["foo", "bar"], ["one", "two"], ["a", "b"]],
  435. codes=[[0, 0, 1, 1], [0, 1, 0, 1], [1, 0, 1, 0]],
  436. names=["first", "second", "third"],
  437. )
  438. s = Series(0, index=mi)
  439. result = s.unstack([1, 2]).stack(0)
  440. expected_mi = MultiIndex(
  441. levels=[["foo", "bar"], ["one", "two"]],
  442. codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
  443. names=["first", "second"],
  444. )
  445. expected = DataFrame(
  446. np.array(
  447. [[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64
  448. ),
  449. index=expected_mi,
  450. columns=Index(["a", "b"], name="third"),
  451. )
  452. tm.assert_frame_equal(result, expected)
  453. def test_unstack_to_series(self, float_frame):
  454. # check reversibility
  455. data = float_frame.unstack()
  456. assert isinstance(data, Series)
  457. undo = data.unstack().T
  458. tm.assert_frame_equal(undo, float_frame)
  459. # check NA handling
  460. data = DataFrame({"x": [1, 2, np.NaN], "y": [3.0, 4, np.NaN]})
  461. data.index = Index(["a", "b", "c"])
  462. result = data.unstack()
  463. midx = MultiIndex(
  464. levels=[["x", "y"], ["a", "b", "c"]],
  465. codes=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]],
  466. )
  467. expected = Series([1, 2, np.NaN, 3, 4, np.NaN], index=midx)
  468. tm.assert_series_equal(result, expected)
  469. # check composability of unstack
  470. old_data = data.copy()
  471. for _ in range(4):
  472. data = data.unstack()
  473. tm.assert_frame_equal(old_data, data)
  474. def test_unstack_dtypes(self):
  475. # GH 2929
  476. rows = [[1, 1, 3, 4], [1, 2, 3, 4], [2, 1, 3, 4], [2, 2, 3, 4]]
  477. df = DataFrame(rows, columns=list("ABCD"))
  478. result = df.dtypes
  479. expected = Series([np.dtype("int64")] * 4, index=list("ABCD"))
  480. tm.assert_series_equal(result, expected)
  481. # single dtype
  482. df2 = df.set_index(["A", "B"])
  483. df3 = df2.unstack("B")
  484. result = df3.dtypes
  485. expected = Series(
  486. [np.dtype("int64")] * 4,
  487. index=MultiIndex.from_arrays(
  488. [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
  489. ),
  490. )
  491. tm.assert_series_equal(result, expected)
  492. # mixed
  493. df2 = df.set_index(["A", "B"])
  494. df2["C"] = 3.0
  495. df3 = df2.unstack("B")
  496. result = df3.dtypes
  497. expected = Series(
  498. [np.dtype("float64")] * 2 + [np.dtype("int64")] * 2,
  499. index=MultiIndex.from_arrays(
  500. [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
  501. ),
  502. )
  503. tm.assert_series_equal(result, expected)
  504. df2["D"] = "foo"
  505. df3 = df2.unstack("B")
  506. result = df3.dtypes
  507. expected = Series(
  508. [np.dtype("float64")] * 2 + [np.dtype("object")] * 2,
  509. index=MultiIndex.from_arrays(
  510. [["C", "C", "D", "D"], [1, 2, 1, 2]], names=(None, "B")
  511. ),
  512. )
  513. tm.assert_series_equal(result, expected)
  514. @pytest.mark.parametrize(
  515. "c, d",
  516. (
  517. (np.zeros(5), np.zeros(5)),
  518. (np.arange(5, dtype="f8"), np.arange(5, 10, dtype="f8")),
  519. ),
  520. )
  521. def test_unstack_dtypes_mixed_date(self, c, d):
  522. # GH7405
  523. df = DataFrame(
  524. {
  525. "A": ["a"] * 5,
  526. "C": c,
  527. "D": d,
  528. "B": date_range("2012-01-01", periods=5),
  529. }
  530. )
  531. right = df.iloc[:3].copy(deep=True)
  532. df = df.set_index(["A", "B"])
  533. df["D"] = df["D"].astype("int64")
  534. left = df.iloc[:3].unstack(0)
  535. right = right.set_index(["A", "B"]).unstack(0)
  536. right[("D", "a")] = right[("D", "a")].astype("int64")
  537. assert left.shape == (3, 2)
  538. tm.assert_frame_equal(left, right)
  539. def test_unstack_non_unique_index_names(self):
  540. idx = MultiIndex.from_tuples([("a", "b"), ("c", "d")], names=["c1", "c1"])
  541. df = DataFrame([1, 2], index=idx)
  542. msg = "The name c1 occurs multiple times, use a level number"
  543. with pytest.raises(ValueError, match=msg):
  544. df.unstack("c1")
  545. with pytest.raises(ValueError, match=msg):
  546. df.T.stack("c1")
  547. def test_unstack_unused_levels(self):
  548. # GH 17845: unused codes in index make unstack() cast int to float
  549. idx = MultiIndex.from_product([["a"], ["A", "B", "C", "D"]])[:-1]
  550. df = DataFrame([[1, 0]] * 3, index=idx)
  551. result = df.unstack()
  552. exp_col = MultiIndex.from_product([[0, 1], ["A", "B", "C"]])
  553. expected = DataFrame([[1, 1, 1, 0, 0, 0]], index=["a"], columns=exp_col)
  554. tm.assert_frame_equal(result, expected)
  555. assert (result.columns.levels[1] == idx.levels[1]).all()
  556. # Unused items on both levels
  557. levels = [[0, 1, 7], [0, 1, 2, 3]]
  558. codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
  559. idx = MultiIndex(levels, codes)
  560. block = np.arange(4).reshape(2, 2)
  561. df = DataFrame(np.concatenate([block, block + 4]), index=idx)
  562. result = df.unstack()
  563. expected = DataFrame(
  564. np.concatenate([block * 2, block * 2 + 1], axis=1), columns=idx
  565. )
  566. tm.assert_frame_equal(result, expected)
  567. assert (result.columns.levels[1] == idx.levels[1]).all()
  568. @pytest.mark.parametrize(
  569. "level, idces, col_level, idx_level",
  570. (
  571. (0, [13, 16, 6, 9, 2, 5, 8, 11], [np.nan, "a", 2], [np.nan, 5, 1]),
  572. (1, [8, 11, 1, 4, 12, 15, 13, 16], [np.nan, 5, 1], [np.nan, "a", 2]),
  573. ),
  574. )
  575. def test_unstack_unused_levels_mixed_with_nan(
  576. self, level, idces, col_level, idx_level
  577. ):
  578. # With mixed dtype and NaN
  579. levels = [["a", 2, "c"], [1, 3, 5, 7]]
  580. codes = [[0, -1, 1, 1], [0, 2, -1, 2]]
  581. idx = MultiIndex(levels, codes)
  582. data = np.arange(8)
  583. df = DataFrame(data.reshape(4, 2), index=idx)
  584. result = df.unstack(level=level)
  585. exp_data = np.zeros(18) * np.nan
  586. exp_data[idces] = data
  587. cols = MultiIndex.from_product([[0, 1], col_level])
  588. expected = DataFrame(exp_data.reshape(3, 6), index=idx_level, columns=cols)
  589. tm.assert_frame_equal(result, expected)
  590. @pytest.mark.parametrize("cols", [["A", "C"], slice(None)])
  591. def test_unstack_unused_level(self, cols):
  592. # GH 18562 : unused codes on the unstacked level
  593. df = DataFrame([[2010, "a", "I"], [2011, "b", "II"]], columns=["A", "B", "C"])
  594. ind = df.set_index(["A", "B", "C"], drop=False)
  595. selection = ind.loc[(slice(None), slice(None), "I"), cols]
  596. result = selection.unstack()
  597. expected = ind.iloc[[0]][cols]
  598. expected.columns = MultiIndex.from_product(
  599. [expected.columns, ["I"]], names=[None, "C"]
  600. )
  601. expected.index = expected.index.droplevel("C")
  602. tm.assert_frame_equal(result, expected)
  603. def test_unstack_long_index(self):
  604. # PH 32624: Error when using a lot of indices to unstack.
  605. # The error occurred only, if a lot of indices are used.
  606. df = DataFrame(
  607. [[1]],
  608. columns=MultiIndex.from_tuples([[0]], names=["c1"]),
  609. index=MultiIndex.from_tuples(
  610. [[0, 0, 1, 0, 0, 0, 1]],
  611. names=["i1", "i2", "i3", "i4", "i5", "i6", "i7"],
  612. ),
  613. )
  614. result = df.unstack(["i2", "i3", "i4", "i5", "i6", "i7"])
  615. expected = DataFrame(
  616. [[1]],
  617. columns=MultiIndex.from_tuples(
  618. [[0, 0, 1, 0, 0, 0, 1]],
  619. names=["c1", "i2", "i3", "i4", "i5", "i6", "i7"],
  620. ),
  621. index=Index([0], name="i1"),
  622. )
  623. tm.assert_frame_equal(result, expected)
  624. def test_unstack_multi_level_cols(self):
  625. # PH 24729: Unstack a df with multi level columns
  626. df = DataFrame(
  627. [[0.0, 0.0], [0.0, 0.0]],
  628. columns=MultiIndex.from_tuples(
  629. [["B", "C"], ["B", "D"]], names=["c1", "c2"]
  630. ),
  631. index=MultiIndex.from_tuples(
  632. [[10, 20, 30], [10, 20, 40]], names=["i1", "i2", "i3"]
  633. ),
  634. )
  635. assert df.unstack(["i2", "i1"]).columns.names[-2:] == ["i2", "i1"]
  636. def test_unstack_multi_level_rows_and_cols(self):
  637. # PH 28306: Unstack df with multi level cols and rows
  638. df = DataFrame(
  639. [[1, 2], [3, 4], [-1, -2], [-3, -4]],
  640. columns=MultiIndex.from_tuples([["a", "b", "c"], ["d", "e", "f"]]),
  641. index=MultiIndex.from_tuples(
  642. [
  643. ["m1", "P3", 222],
  644. ["m1", "A5", 111],
  645. ["m2", "P3", 222],
  646. ["m2", "A5", 111],
  647. ],
  648. names=["i1", "i2", "i3"],
  649. ),
  650. )
  651. result = df.unstack(["i3", "i2"])
  652. expected = df.unstack(["i3"]).unstack(["i2"])
  653. tm.assert_frame_equal(result, expected)
  654. @pytest.mark.parametrize("idx", [("jim", "joe"), ("joe", "jim")])
  655. @pytest.mark.parametrize("lev", list(range(2)))
  656. def test_unstack_nan_index1(self, idx, lev):
  657. # GH7466
  658. def cast(val):
  659. val_str = "" if val != val else val
  660. return f"{val_str:1}"
  661. df = DataFrame(
  662. {
  663. "jim": ["a", "b", np.nan, "d"],
  664. "joe": ["w", "x", "y", "z"],
  665. "jolie": ["a.w", "b.x", " .y", "d.z"],
  666. }
  667. )
  668. left = df.set_index(["jim", "joe"]).unstack()["jolie"]
  669. right = df.set_index(["joe", "jim"]).unstack()["jolie"].T
  670. tm.assert_frame_equal(left, right)
  671. mi = df.set_index(list(idx))
  672. udf = mi.unstack(level=lev)
  673. assert udf.notna().values.sum() == len(df)
  674. mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
  675. rows, cols = udf["jolie"].notna().values.nonzero()
  676. for i, j in zip(rows, cols):
  677. left = sorted(udf["jolie"].iloc[i, j].split("."))
  678. right = mk_list(udf["jolie"].index[i]) + mk_list(udf["jolie"].columns[j])
  679. right = sorted(map(cast, right))
  680. assert left == right
  681. @pytest.mark.parametrize("idx", itertools.permutations(["1st", "2nd", "3rd"]))
  682. @pytest.mark.parametrize("lev", list(range(3)))
  683. @pytest.mark.parametrize("col", ["4th", "5th"])
  684. def test_unstack_nan_index_repeats(self, idx, lev, col):
  685. def cast(val):
  686. val_str = "" if val != val else val
  687. return f"{val_str:1}"
  688. df = DataFrame(
  689. {
  690. "1st": ["d"] * 3
  691. + [np.nan] * 5
  692. + ["a"] * 2
  693. + ["c"] * 3
  694. + ["e"] * 2
  695. + ["b"] * 5,
  696. "2nd": ["y"] * 2
  697. + ["w"] * 3
  698. + [np.nan] * 3
  699. + ["z"] * 4
  700. + [np.nan] * 3
  701. + ["x"] * 3
  702. + [np.nan] * 2,
  703. "3rd": [
  704. 67,
  705. 39,
  706. 53,
  707. 72,
  708. 57,
  709. 80,
  710. 31,
  711. 18,
  712. 11,
  713. 30,
  714. 59,
  715. 50,
  716. 62,
  717. 59,
  718. 76,
  719. 52,
  720. 14,
  721. 53,
  722. 60,
  723. 51,
  724. ],
  725. }
  726. )
  727. df["4th"], df["5th"] = (
  728. df.apply(lambda r: ".".join(map(cast, r)), axis=1),
  729. df.apply(lambda r: ".".join(map(cast, r.iloc[::-1])), axis=1),
  730. )
  731. mi = df.set_index(list(idx))
  732. udf = mi.unstack(level=lev)
  733. assert udf.notna().values.sum() == 2 * len(df)
  734. mk_list = lambda a: list(a) if isinstance(a, tuple) else [a]
  735. rows, cols = udf[col].notna().values.nonzero()
  736. for i, j in zip(rows, cols):
  737. left = sorted(udf[col].iloc[i, j].split("."))
  738. right = mk_list(udf[col].index[i]) + mk_list(udf[col].columns[j])
  739. right = sorted(map(cast, right))
  740. assert left == right
  741. def test_unstack_nan_index2(self):
  742. # GH7403
  743. df = DataFrame({"A": list("aaaabbbb"), "B": range(8), "C": range(8)})
  744. # Explicit cast to avoid implicit cast when setting to np.NaN
  745. df = df.astype({"B": "float"})
  746. df.iloc[3, 1] = np.NaN
  747. left = df.set_index(["A", "B"]).unstack(0)
  748. vals = [
  749. [3, 0, 1, 2, np.nan, np.nan, np.nan, np.nan],
  750. [np.nan, np.nan, np.nan, np.nan, 4, 5, 6, 7],
  751. ]
  752. vals = list(map(list, zip(*vals)))
  753. idx = Index([np.nan, 0, 1, 2, 4, 5, 6, 7], name="B")
  754. cols = MultiIndex(
  755. levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
  756. )
  757. right = DataFrame(vals, columns=cols, index=idx)
  758. tm.assert_frame_equal(left, right)
  759. df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)})
  760. # Explicit cast to avoid implicit cast when setting to np.NaN
  761. df = df.astype({"B": "float"})
  762. df.iloc[2, 1] = np.NaN
  763. left = df.set_index(["A", "B"]).unstack(0)
  764. vals = [[2, np.nan], [0, 4], [1, 5], [np.nan, 6], [3, 7]]
  765. cols = MultiIndex(
  766. levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
  767. )
  768. idx = Index([np.nan, 0, 1, 2, 3], name="B")
  769. right = DataFrame(vals, columns=cols, index=idx)
  770. tm.assert_frame_equal(left, right)
  771. df = DataFrame({"A": list("aaaabbbb"), "B": list(range(4)) * 2, "C": range(8)})
  772. # Explicit cast to avoid implicit cast when setting to np.NaN
  773. df = df.astype({"B": "float"})
  774. df.iloc[3, 1] = np.NaN
  775. left = df.set_index(["A", "B"]).unstack(0)
  776. vals = [[3, np.nan], [0, 4], [1, 5], [2, 6], [np.nan, 7]]
  777. cols = MultiIndex(
  778. levels=[["C"], ["a", "b"]], codes=[[0, 0], [0, 1]], names=[None, "A"]
  779. )
  780. idx = Index([np.nan, 0, 1, 2, 3], name="B")
  781. right = DataFrame(vals, columns=cols, index=idx)
  782. tm.assert_frame_equal(left, right)
  783. def test_unstack_nan_index3(self, using_array_manager):
  784. # GH7401
  785. df = DataFrame(
  786. {
  787. "A": list("aaaaabbbbb"),
  788. "B": (date_range("2012-01-01", periods=5).tolist() * 2),
  789. "C": np.arange(10),
  790. }
  791. )
  792. df.iloc[3, 1] = np.NaN
  793. left = df.set_index(["A", "B"]).unstack()
  794. vals = np.array([[3, 0, 1, 2, np.nan, 4], [np.nan, 5, 6, 7, 8, 9]])
  795. idx = Index(["a", "b"], name="A")
  796. cols = MultiIndex(
  797. levels=[["C"], date_range("2012-01-01", periods=5)],
  798. codes=[[0, 0, 0, 0, 0, 0], [-1, 0, 1, 2, 3, 4]],
  799. names=[None, "B"],
  800. )
  801. right = DataFrame(vals, columns=cols, index=idx)
  802. if using_array_manager:
  803. # INFO(ArrayManager) with ArrayManager preserve dtype where possible
  804. cols = right.columns[[1, 2, 3, 5]]
  805. right[cols] = right[cols].astype(df["C"].dtype)
  806. tm.assert_frame_equal(left, right)
  807. def test_unstack_nan_index4(self):
  808. # GH4862
  809. vals = [
  810. ["Hg", np.nan, np.nan, 680585148],
  811. ["U", 0.0, np.nan, 680585148],
  812. ["Pb", 7.07e-06, np.nan, 680585148],
  813. ["Sn", 2.3614e-05, 0.0133, 680607017],
  814. ["Ag", 0.0, 0.0133, 680607017],
  815. ["Hg", -0.00015, 0.0133, 680607017],
  816. ]
  817. df = DataFrame(
  818. vals,
  819. columns=["agent", "change", "dosage", "s_id"],
  820. index=[17263, 17264, 17265, 17266, 17267, 17268],
  821. )
  822. left = df.copy().set_index(["s_id", "dosage", "agent"]).unstack()
  823. vals = [
  824. [np.nan, np.nan, 7.07e-06, np.nan, 0.0],
  825. [0.0, -0.00015, np.nan, 2.3614e-05, np.nan],
  826. ]
  827. idx = MultiIndex(
  828. levels=[[680585148, 680607017], [0.0133]],
  829. codes=[[0, 1], [-1, 0]],
  830. names=["s_id", "dosage"],
  831. )
  832. cols = MultiIndex(
  833. levels=[["change"], ["Ag", "Hg", "Pb", "Sn", "U"]],
  834. codes=[[0, 0, 0, 0, 0], [0, 1, 2, 3, 4]],
  835. names=[None, "agent"],
  836. )
  837. right = DataFrame(vals, columns=cols, index=idx)
  838. tm.assert_frame_equal(left, right)
  839. left = df.loc[17264:].copy().set_index(["s_id", "dosage", "agent"])
  840. tm.assert_frame_equal(left.unstack(), right)
  841. def test_unstack_nan_index5(self):
  842. # GH9497 - multiple unstack with nulls
  843. df = DataFrame(
  844. {
  845. "1st": [1, 2, 1, 2, 1, 2],
  846. "2nd": date_range("2014-02-01", periods=6, freq="D"),
  847. "jim": 100 + np.arange(6),
  848. "joe": (np.random.randn(6) * 10).round(2),
  849. }
  850. )
  851. df["3rd"] = df["2nd"] - pd.Timestamp("2014-02-02")
  852. df.loc[1, "2nd"] = df.loc[3, "2nd"] = np.nan
  853. df.loc[1, "3rd"] = df.loc[4, "3rd"] = np.nan
  854. left = df.set_index(["1st", "2nd", "3rd"]).unstack(["2nd", "3rd"])
  855. assert left.notna().values.sum() == 2 * len(df)
  856. for col in ["jim", "joe"]:
  857. for _, r in df.iterrows():
  858. key = r["1st"], (col, r["2nd"], r["3rd"])
  859. assert r[col] == left.loc[key]
  860. def test_stack_datetime_column_multiIndex(self):
  861. # GH 8039
  862. t = datetime(2014, 1, 1)
  863. df = DataFrame([1, 2, 3, 4], columns=MultiIndex.from_tuples([(t, "A", "B")]))
  864. result = df.stack()
  865. eidx = MultiIndex.from_product([(0, 1, 2, 3), ("B",)])
  866. ecols = MultiIndex.from_tuples([(t, "A")])
  867. expected = DataFrame([1, 2, 3, 4], index=eidx, columns=ecols)
  868. tm.assert_frame_equal(result, expected)
  869. @pytest.mark.parametrize(
  870. "multiindex_columns",
  871. [
  872. [0, 1, 2, 3, 4],
  873. [0, 1, 2, 3],
  874. [0, 1, 2, 4],
  875. [0, 1, 2],
  876. [1, 2, 3],
  877. [2, 3, 4],
  878. [0, 1],
  879. [0, 2],
  880. [0, 3],
  881. [0],
  882. [2],
  883. [4],
  884. [4, 3, 2, 1, 0],
  885. [3, 2, 1, 0],
  886. [4, 2, 1, 0],
  887. [2, 1, 0],
  888. [3, 2, 1],
  889. [4, 3, 2],
  890. [1, 0],
  891. [2, 0],
  892. [3, 0],
  893. ],
  894. )
  895. @pytest.mark.parametrize("level", (-1, 0, 1, [0, 1], [1, 0]))
  896. def test_stack_partial_multiIndex(self, multiindex_columns, level):
  897. # GH 8844
  898. full_multiindex = MultiIndex.from_tuples(
  899. [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")],
  900. names=["Upper", "Lower"],
  901. )
  902. multiindex = full_multiindex[multiindex_columns]
  903. df = DataFrame(
  904. np.arange(3 * len(multiindex)).reshape(3, len(multiindex)),
  905. columns=multiindex,
  906. )
  907. result = df.stack(level=level, dropna=False)
  908. if isinstance(level, int):
  909. # Stacking a single level should not make any all-NaN rows,
  910. # so df.stack(level=level, dropna=False) should be the same
  911. # as df.stack(level=level, dropna=True).
  912. expected = df.stack(level=level, dropna=True)
  913. if isinstance(expected, Series):
  914. tm.assert_series_equal(result, expected)
  915. else:
  916. tm.assert_frame_equal(result, expected)
  917. df.columns = MultiIndex.from_tuples(
  918. df.columns.to_numpy(), names=df.columns.names
  919. )
  920. expected = df.stack(level=level, dropna=False)
  921. if isinstance(expected, Series):
  922. tm.assert_series_equal(result, expected)
  923. else:
  924. tm.assert_frame_equal(result, expected)
  925. def test_stack_full_multiIndex(self):
  926. # GH 8844
  927. full_multiindex = MultiIndex.from_tuples(
  928. [("B", "x"), ("B", "z"), ("A", "y"), ("C", "x"), ("C", "u")],
  929. names=["Upper", "Lower"],
  930. )
  931. df = DataFrame(np.arange(6).reshape(2, 3), columns=full_multiindex[[0, 1, 3]])
  932. result = df.stack(dropna=False)
  933. expected = DataFrame(
  934. [[0, 2], [1, np.nan], [3, 5], [4, np.nan]],
  935. index=MultiIndex(
  936. levels=[[0, 1], ["u", "x", "y", "z"]],
  937. codes=[[0, 0, 1, 1], [1, 3, 1, 3]],
  938. names=[None, "Lower"],
  939. ),
  940. columns=Index(["B", "C"], name="Upper"),
  941. )
  942. expected["B"] = expected["B"].astype(df.dtypes[0])
  943. tm.assert_frame_equal(result, expected)
  944. @pytest.mark.parametrize("ordered", [False, True])
  945. @pytest.mark.parametrize("labels", [list("yxz"), list("yxy")])
  946. def test_stack_preserve_categorical_dtype(self, ordered, labels):
  947. # GH13854
  948. cidx = pd.CategoricalIndex(labels, categories=list("xyz"), ordered=ordered)
  949. df = DataFrame([[10, 11, 12]], columns=cidx)
  950. result = df.stack()
  951. # `MultiIndex.from_product` preserves categorical dtype -
  952. # it's tested elsewhere.
  953. midx = MultiIndex.from_product([df.index, cidx])
  954. expected = Series([10, 11, 12], index=midx)
  955. tm.assert_series_equal(result, expected)
  956. @pytest.mark.parametrize("ordered", [False, True])
  957. @pytest.mark.parametrize(
  958. "labels,data",
  959. [
  960. (list("xyz"), [10, 11, 12, 13, 14, 15]),
  961. (list("zyx"), [14, 15, 12, 13, 10, 11]),
  962. ],
  963. )
  964. def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data):
  965. # GH-36991
  966. cidx = pd.CategoricalIndex(labels, categories=sorted(labels), ordered=ordered)
  967. cidx2 = pd.CategoricalIndex(["u", "v"], ordered=ordered)
  968. midx = MultiIndex.from_product([cidx, cidx2])
  969. df = DataFrame([sorted(data)], columns=midx)
  970. result = df.stack([0, 1])
  971. s_cidx = pd.CategoricalIndex(sorted(labels), ordered=ordered)
  972. expected = Series(data, index=MultiIndex.from_product([[0], s_cidx, cidx2]))
  973. tm.assert_series_equal(result, expected)
  974. def test_stack_preserve_categorical_dtype_values(self):
  975. # GH-23077
  976. cat = pd.Categorical(["a", "a", "b", "c"])
  977. df = DataFrame({"A": cat, "B": cat})
  978. result = df.stack()
  979. index = MultiIndex.from_product([[0, 1, 2, 3], ["A", "B"]])
  980. expected = Series(
  981. pd.Categorical(["a", "a", "a", "a", "b", "b", "c", "c"]), index=index
  982. )
  983. tm.assert_series_equal(result, expected)
  984. @pytest.mark.parametrize(
  985. "index, columns",
  986. [
  987. ([0, 0, 1, 1], MultiIndex.from_product([[1, 2], ["a", "b"]])),
  988. ([0, 0, 2, 3], MultiIndex.from_product([[1, 2], ["a", "b"]])),
  989. ([0, 1, 2, 3], MultiIndex.from_product([[1, 2], ["a", "b"]])),
  990. ],
  991. )
  992. def test_stack_multi_columns_non_unique_index(self, index, columns):
  993. # GH-28301
  994. df = DataFrame(index=index, columns=columns).fillna(1)
  995. stacked = df.stack()
  996. new_index = MultiIndex.from_tuples(stacked.index.to_numpy())
  997. expected = DataFrame(
  998. stacked.to_numpy(), index=new_index, columns=stacked.columns
  999. )
  1000. tm.assert_frame_equal(stacked, expected)
  1001. stacked_codes = np.asarray(stacked.index.codes)
  1002. expected_codes = np.asarray(new_index.codes)
  1003. tm.assert_numpy_array_equal(stacked_codes, expected_codes)
  1004. @pytest.mark.parametrize("level", [0, 1])
  1005. def test_unstack_mixed_extension_types(self, level):
  1006. index = MultiIndex.from_tuples([("A", 0), ("A", 1), ("B", 1)], names=["a", "b"])
  1007. df = DataFrame(
  1008. {
  1009. "A": pd.array([0, 1, None], dtype="Int64"),
  1010. "B": pd.Categorical(["a", "a", "b"]),
  1011. },
  1012. index=index,
  1013. )
  1014. result = df.unstack(level=level)
  1015. expected = df.astype(object).unstack(level=level)
  1016. expected_dtypes = Series(
  1017. [df.A.dtype] * 2 + [df.B.dtype] * 2, index=result.columns
  1018. )
  1019. tm.assert_series_equal(result.dtypes, expected_dtypes)
  1020. tm.assert_frame_equal(result.astype(object), expected)
  1021. @pytest.mark.parametrize("level", [0, "baz"])
  1022. def test_unstack_swaplevel_sortlevel(self, level):
  1023. # GH 20994
  1024. mi = MultiIndex.from_product([[0], ["d", "c"]], names=["bar", "baz"])
  1025. df = DataFrame([[0, 2], [1, 3]], index=mi, columns=["B", "A"])
  1026. df.columns.name = "foo"
  1027. expected = DataFrame(
  1028. [[3, 1, 2, 0]],
  1029. columns=MultiIndex.from_tuples(
  1030. [("c", "A"), ("c", "B"), ("d", "A"), ("d", "B")], names=["baz", "foo"]
  1031. ),
  1032. )
  1033. expected.index.name = "bar"
  1034. result = df.unstack().swaplevel(axis=1).sort_index(axis=1, level=level)
  1035. tm.assert_frame_equal(result, expected)
  1036. def test_unstack_fill_frame_object():
  1037. # GH12815 Test unstacking with object.
  1038. data = Series(["a", "b", "c", "a"], dtype="object")
  1039. data.index = MultiIndex.from_tuples(
  1040. [("x", "a"), ("x", "b"), ("y", "b"), ("z", "a")]
  1041. )
  1042. # By default missing values will be NaN
  1043. result = data.unstack()
  1044. expected = DataFrame(
  1045. {"a": ["a", np.nan, "a"], "b": ["b", "c", np.nan]}, index=list("xyz")
  1046. )
  1047. tm.assert_frame_equal(result, expected)
  1048. # Fill with any value replaces missing values as expected
  1049. result = data.unstack(fill_value="d")
  1050. expected = DataFrame(
  1051. {"a": ["a", "d", "a"], "b": ["b", "c", "d"]}, index=list("xyz")
  1052. )
  1053. tm.assert_frame_equal(result, expected)
  1054. def test_unstack_timezone_aware_values():
  1055. # GH 18338
  1056. df = DataFrame(
  1057. {
  1058. "timestamp": [pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC")],
  1059. "a": ["a"],
  1060. "b": ["b"],
  1061. "c": ["c"],
  1062. },
  1063. columns=["timestamp", "a", "b", "c"],
  1064. )
  1065. result = df.set_index(["a", "b"]).unstack()
  1066. expected = DataFrame(
  1067. [[pd.Timestamp("2017-08-27 01:00:00.709949+0000", tz="UTC"), "c"]],
  1068. index=Index(["a"], name="a"),
  1069. columns=MultiIndex(
  1070. levels=[["timestamp", "c"], ["b"]],
  1071. codes=[[0, 1], [0, 0]],
  1072. names=[None, "b"],
  1073. ),
  1074. )
  1075. tm.assert_frame_equal(result, expected)
  1076. def test_stack_timezone_aware_values():
  1077. # GH 19420
  1078. ts = date_range(freq="D", start="20180101", end="20180103", tz="America/New_York")
  1079. df = DataFrame({"A": ts}, index=["a", "b", "c"])
  1080. result = df.stack()
  1081. expected = Series(
  1082. ts,
  1083. index=MultiIndex(levels=[["a", "b", "c"], ["A"]], codes=[[0, 1, 2], [0, 0, 0]]),
  1084. )
  1085. tm.assert_series_equal(result, expected)
  1086. @pytest.mark.parametrize("dropna", [True, False])
  1087. def test_stack_empty_frame(dropna):
  1088. # GH 36113
  1089. levels = [np.array([], dtype=np.int64), np.array([], dtype=np.int64)]
  1090. expected = Series(dtype=np.float64, index=MultiIndex(levels=levels, codes=[[], []]))
  1091. result = DataFrame(dtype=np.float64).stack(dropna=dropna)
  1092. tm.assert_series_equal(result, expected)
  1093. @pytest.mark.parametrize("dropna", [True, False])
  1094. @pytest.mark.parametrize("fill_value", [None, 0])
  1095. def test_stack_unstack_empty_frame(dropna, fill_value):
  1096. # GH 36113
  1097. result = (
  1098. DataFrame(dtype=np.int64).stack(dropna=dropna).unstack(fill_value=fill_value)
  1099. )
  1100. expected = DataFrame(dtype=np.int64)
  1101. tm.assert_frame_equal(result, expected)
  1102. def test_unstack_single_index_series():
  1103. # GH 36113
  1104. msg = r"index must be a MultiIndex to unstack.*"
  1105. with pytest.raises(ValueError, match=msg):
  1106. Series(dtype=np.int64).unstack()
  1107. def test_unstacking_multi_index_df():
  1108. # see gh-30740
  1109. df = DataFrame(
  1110. {
  1111. "name": ["Alice", "Bob"],
  1112. "score": [9.5, 8],
  1113. "employed": [False, True],
  1114. "kids": [0, 0],
  1115. "gender": ["female", "male"],
  1116. }
  1117. )
  1118. df = df.set_index(["name", "employed", "kids", "gender"])
  1119. df = df.unstack(["gender"], fill_value=0)
  1120. expected = df.unstack("employed", fill_value=0).unstack("kids", fill_value=0)
  1121. result = df.unstack(["employed", "kids"], fill_value=0)
  1122. expected = DataFrame(
  1123. [[9.5, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 8.0]],
  1124. index=Index(["Alice", "Bob"], name="name"),
  1125. columns=MultiIndex.from_tuples(
  1126. [
  1127. ("score", "female", False, 0),
  1128. ("score", "female", True, 0),
  1129. ("score", "male", False, 0),
  1130. ("score", "male", True, 0),
  1131. ],
  1132. names=[None, "gender", "employed", "kids"],
  1133. ),
  1134. )
  1135. tm.assert_frame_equal(result, expected)
  1136. def test_stack_positional_level_duplicate_column_names():
  1137. # https://github.com/pandas-dev/pandas/issues/36353
  1138. columns = MultiIndex.from_product([("x", "y"), ("y", "z")], names=["a", "a"])
  1139. df = DataFrame([[1, 1, 1, 1]], columns=columns)
  1140. result = df.stack(0)
  1141. new_columns = Index(["y", "z"], name="a")
  1142. new_index = MultiIndex.from_tuples([(0, "x"), (0, "y")], names=[None, "a"])
  1143. expected = DataFrame([[1, 1], [1, 1]], index=new_index, columns=new_columns)
  1144. tm.assert_frame_equal(result, expected)
  1145. def test_unstack_non_slice_like_blocks(using_array_manager):
  1146. # Case where the mgr_locs of a DataFrame's underlying blocks are not slice-like
  1147. mi = MultiIndex.from_product([range(5), ["A", "B", "C"]])
  1148. df = DataFrame(
  1149. {
  1150. 0: np.random.randn(15),
  1151. 1: np.random.randn(15).astype(np.int64),
  1152. 2: np.random.randn(15),
  1153. 3: np.random.randn(15),
  1154. },
  1155. index=mi,
  1156. )
  1157. if not using_array_manager:
  1158. assert any(not x.mgr_locs.is_slice_like for x in df._mgr.blocks)
  1159. res = df.unstack()
  1160. expected = pd.concat([df[n].unstack() for n in range(4)], keys=range(4), axis=1)
  1161. tm.assert_frame_equal(res, expected)
  1162. class TestStackUnstackMultiLevel:
  1163. def test_unstack(self, multiindex_year_month_day_dataframe_random_data):
  1164. # just check that it works for now
  1165. ymd = multiindex_year_month_day_dataframe_random_data
  1166. unstacked = ymd.unstack()
  1167. unstacked.unstack()
  1168. # test that ints work
  1169. ymd.astype(int).unstack()
  1170. # test that int32 work
  1171. ymd.astype(np.int32).unstack()
  1172. @pytest.mark.parametrize(
  1173. "result_rows,result_columns,index_product,expected_row",
  1174. [
  1175. (
  1176. [[1, 1, None, None, 30.0, None], [2, 2, None, None, 30.0, None]],
  1177. ["ix1", "ix2", "col1", "col2", "col3", "col4"],
  1178. 2,
  1179. [None, None, 30.0, None],
  1180. ),
  1181. (
  1182. [[1, 1, None, None, 30.0], [2, 2, None, None, 30.0]],
  1183. ["ix1", "ix2", "col1", "col2", "col3"],
  1184. 2,
  1185. [None, None, 30.0],
  1186. ),
  1187. (
  1188. [[1, 1, None, None, 30.0], [2, None, None, None, 30.0]],
  1189. ["ix1", "ix2", "col1", "col2", "col3"],
  1190. None,
  1191. [None, None, 30.0],
  1192. ),
  1193. ],
  1194. )
  1195. def test_unstack_partial(
  1196. self, result_rows, result_columns, index_product, expected_row
  1197. ):
  1198. # check for regressions on this issue:
  1199. # https://github.com/pandas-dev/pandas/issues/19351
  1200. # make sure DataFrame.unstack() works when its run on a subset of the DataFrame
  1201. # and the Index levels contain values that are not present in the subset
  1202. result = DataFrame(result_rows, columns=result_columns).set_index(
  1203. ["ix1", "ix2"]
  1204. )
  1205. result = result.iloc[1:2].unstack("ix2")
  1206. expected = DataFrame(
  1207. [expected_row],
  1208. columns=MultiIndex.from_product(
  1209. [result_columns[2:], [index_product]], names=[None, "ix2"]
  1210. ),
  1211. index=Index([2], name="ix1"),
  1212. )
  1213. tm.assert_frame_equal(result, expected)
  1214. def test_unstack_multiple_no_empty_columns(self):
  1215. index = MultiIndex.from_tuples(
  1216. [(0, "foo", 0), (0, "bar", 0), (1, "baz", 1), (1, "qux", 1)]
  1217. )
  1218. s = Series(np.random.randn(4), index=index)
  1219. unstacked = s.unstack([1, 2])
  1220. expected = unstacked.dropna(axis=1, how="all")
  1221. tm.assert_frame_equal(unstacked, expected)
  1222. def test_stack(self, multiindex_year_month_day_dataframe_random_data):
  1223. ymd = multiindex_year_month_day_dataframe_random_data
  1224. # regular roundtrip
  1225. unstacked = ymd.unstack()
  1226. restacked = unstacked.stack()
  1227. tm.assert_frame_equal(restacked, ymd)
  1228. unlexsorted = ymd.sort_index(level=2)
  1229. unstacked = unlexsorted.unstack(2)
  1230. restacked = unstacked.stack()
  1231. tm.assert_frame_equal(restacked.sort_index(level=0), ymd)
  1232. unlexsorted = unlexsorted[::-1]
  1233. unstacked = unlexsorted.unstack(1)
  1234. restacked = unstacked.stack().swaplevel(1, 2)
  1235. tm.assert_frame_equal(restacked.sort_index(level=0), ymd)
  1236. unlexsorted = unlexsorted.swaplevel(0, 1)
  1237. unstacked = unlexsorted.unstack(0).swaplevel(0, 1, axis=1)
  1238. restacked = unstacked.stack(0).swaplevel(1, 2)
  1239. tm.assert_frame_equal(restacked.sort_index(level=0), ymd)
  1240. # columns unsorted
  1241. unstacked = ymd.unstack()
  1242. unstacked = unstacked.sort_index(axis=1, ascending=False)
  1243. restacked = unstacked.stack()
  1244. tm.assert_frame_equal(restacked, ymd)
  1245. # more than 2 levels in the columns
  1246. unstacked = ymd.unstack(1).unstack(1)
  1247. result = unstacked.stack(1)
  1248. expected = ymd.unstack()
  1249. tm.assert_frame_equal(result, expected)
  1250. result = unstacked.stack(2)
  1251. expected = ymd.unstack(1)
  1252. tm.assert_frame_equal(result, expected)
  1253. result = unstacked.stack(0)
  1254. expected = ymd.stack().unstack(1).unstack(1)
  1255. tm.assert_frame_equal(result, expected)
  1256. # not all levels present in each echelon
  1257. unstacked = ymd.unstack(2).loc[:, ::3]
  1258. stacked = unstacked.stack().stack()
  1259. ymd_stacked = ymd.stack()
  1260. tm.assert_series_equal(stacked, ymd_stacked.reindex(stacked.index))
  1261. # stack with negative number
  1262. result = ymd.unstack(0).stack(-2)
  1263. expected = ymd.unstack(0).stack(0)
  1264. tm.assert_equal(result, expected)
  1265. @pytest.mark.parametrize(
  1266. "idx, columns, exp_idx",
  1267. [
  1268. [
  1269. list("abab"),
  1270. ["1st", "2nd", "3rd"],
  1271. MultiIndex(
  1272. levels=[["a", "b"], ["1st", "2nd", "3rd"]],
  1273. codes=[
  1274. np.tile(np.arange(2).repeat(3), 2),
  1275. np.tile(np.arange(3), 4),
  1276. ],
  1277. ),
  1278. ],
  1279. [
  1280. list("abab"),
  1281. ["1st", "2nd", "1st"],
  1282. MultiIndex(
  1283. levels=[["a", "b"], ["1st", "2nd"]],
  1284. codes=[np.tile(np.arange(2).repeat(3), 2), np.tile([0, 1, 0], 4)],
  1285. ),
  1286. ],
  1287. [
  1288. MultiIndex.from_tuples((("a", 2), ("b", 1), ("a", 1), ("b", 2))),
  1289. ["1st", "2nd", "1st"],
  1290. MultiIndex(
  1291. levels=[["a", "b"], [1, 2], ["1st", "2nd"]],
  1292. codes=[
  1293. np.tile(np.arange(2).repeat(3), 2),
  1294. np.repeat([1, 0, 1], [3, 6, 3]),
  1295. np.tile([0, 1, 0], 4),
  1296. ],
  1297. ),
  1298. ],
  1299. ],
  1300. )
  1301. def test_stack_duplicate_index(self, idx, columns, exp_idx):
  1302. # GH10417
  1303. df = DataFrame(
  1304. np.arange(12).reshape(4, 3),
  1305. index=idx,
  1306. columns=columns,
  1307. )
  1308. result = df.stack()
  1309. expected = Series(np.arange(12), index=exp_idx)
  1310. tm.assert_series_equal(result, expected)
  1311. assert result.index.is_unique is False
  1312. li, ri = result.index, expected.index
  1313. tm.assert_index_equal(li, ri)
  1314. def test_unstack_odd_failure(self):
  1315. data = """day,time,smoker,sum,len
  1316. Fri,Dinner,No,8.25,3.
  1317. Fri,Dinner,Yes,27.03,9
  1318. Fri,Lunch,No,3.0,1
  1319. Fri,Lunch,Yes,13.68,6
  1320. Sat,Dinner,No,139.63,45
  1321. Sat,Dinner,Yes,120.77,42
  1322. Sun,Dinner,No,180.57,57
  1323. Sun,Dinner,Yes,66.82,19
  1324. Thu,Dinner,No,3.0,1
  1325. Thu,Lunch,No,117.32,44
  1326. Thu,Lunch,Yes,51.51,17"""
  1327. df = pd.read_csv(StringIO(data)).set_index(["day", "time", "smoker"])
  1328. # it works, #2100
  1329. result = df.unstack(2)
  1330. recons = result.stack()
  1331. tm.assert_frame_equal(recons, df)
  1332. def test_stack_mixed_dtype(self, multiindex_dataframe_random_data):
  1333. frame = multiindex_dataframe_random_data
  1334. df = frame.T
  1335. df["foo", "four"] = "foo"
  1336. df = df.sort_index(level=1, axis=1)
  1337. stacked = df.stack()
  1338. result = df["foo"].stack().sort_index()
  1339. tm.assert_series_equal(stacked["foo"], result, check_names=False)
  1340. assert result.name is None
  1341. assert stacked["bar"].dtype == np.float_
  1342. def test_unstack_bug(self):
  1343. df = DataFrame(
  1344. {
  1345. "state": ["naive", "naive", "naive", "active", "active", "active"],
  1346. "exp": ["a", "b", "b", "b", "a", "a"],
  1347. "barcode": [1, 2, 3, 4, 1, 3],
  1348. "v": ["hi", "hi", "bye", "bye", "bye", "peace"],
  1349. "extra": np.arange(6.0),
  1350. }
  1351. )
  1352. result = df.groupby(["state", "exp", "barcode", "v"]).apply(len)
  1353. unstacked = result.unstack()
  1354. restacked = unstacked.stack()
  1355. tm.assert_series_equal(restacked, result.reindex(restacked.index).astype(float))
  1356. def test_stack_unstack_preserve_names(self, multiindex_dataframe_random_data):
  1357. frame = multiindex_dataframe_random_data
  1358. unstacked = frame.unstack()
  1359. assert unstacked.index.name == "first"
  1360. assert unstacked.columns.names == ["exp", "second"]
  1361. restacked = unstacked.stack()
  1362. assert restacked.index.names == frame.index.names
  1363. @pytest.mark.parametrize("method", ["stack", "unstack"])
  1364. def test_stack_unstack_wrong_level_name(
  1365. self, method, multiindex_dataframe_random_data
  1366. ):
  1367. # GH 18303 - wrong level name should raise
  1368. frame = multiindex_dataframe_random_data
  1369. # A DataFrame with flat axes:
  1370. df = frame.loc["foo"]
  1371. with pytest.raises(KeyError, match="does not match index name"):
  1372. getattr(df, method)("mistake")
  1373. if method == "unstack":
  1374. # Same on a Series:
  1375. s = df.iloc[:, 0]
  1376. with pytest.raises(KeyError, match="does not match index name"):
  1377. getattr(s, method)("mistake")
  1378. def test_unstack_level_name(self, multiindex_dataframe_random_data):
  1379. frame = multiindex_dataframe_random_data
  1380. result = frame.unstack("second")
  1381. expected = frame.unstack(level=1)
  1382. tm.assert_frame_equal(result, expected)
  1383. def test_stack_level_name(self, multiindex_dataframe_random_data):
  1384. frame = multiindex_dataframe_random_data
  1385. unstacked = frame.unstack("second")
  1386. result = unstacked.stack("exp")
  1387. expected = frame.unstack().stack(0)
  1388. tm.assert_frame_equal(result, expected)
  1389. result = frame.stack("exp")
  1390. expected = frame.stack()
  1391. tm.assert_series_equal(result, expected)
  1392. def test_stack_unstack_multiple(
  1393. self, multiindex_year_month_day_dataframe_random_data
  1394. ):
  1395. ymd = multiindex_year_month_day_dataframe_random_data
  1396. unstacked = ymd.unstack(["year", "month"])
  1397. expected = ymd.unstack("year").unstack("month")
  1398. tm.assert_frame_equal(unstacked, expected)
  1399. assert unstacked.columns.names == expected.columns.names
  1400. # series
  1401. s = ymd["A"]
  1402. s_unstacked = s.unstack(["year", "month"])
  1403. tm.assert_frame_equal(s_unstacked, expected["A"])
  1404. restacked = unstacked.stack(["year", "month"])
  1405. restacked = restacked.swaplevel(0, 1).swaplevel(1, 2)
  1406. restacked = restacked.sort_index(level=0)
  1407. tm.assert_frame_equal(restacked, ymd)
  1408. assert restacked.index.names == ymd.index.names
  1409. # GH #451
  1410. unstacked = ymd.unstack([1, 2])
  1411. expected = ymd.unstack(1).unstack(1).dropna(axis=1, how="all")
  1412. tm.assert_frame_equal(unstacked, expected)
  1413. unstacked = ymd.unstack([2, 1])
  1414. expected = ymd.unstack(2).unstack(1).dropna(axis=1, how="all")
  1415. tm.assert_frame_equal(unstacked, expected.loc[:, unstacked.columns])
  1416. def test_stack_names_and_numbers(
  1417. self, multiindex_year_month_day_dataframe_random_data
  1418. ):
  1419. ymd = multiindex_year_month_day_dataframe_random_data
  1420. unstacked = ymd.unstack(["year", "month"])
  1421. # Can't use mixture of names and numbers to stack
  1422. with pytest.raises(ValueError, match="level should contain"):
  1423. unstacked.stack([0, "month"])
  1424. def test_stack_multiple_out_of_bounds(
  1425. self, multiindex_year_month_day_dataframe_random_data
  1426. ):
  1427. # nlevels == 3
  1428. ymd = multiindex_year_month_day_dataframe_random_data
  1429. unstacked = ymd.unstack(["year", "month"])
  1430. with pytest.raises(IndexError, match="Too many levels"):
  1431. unstacked.stack([2, 3])
  1432. with pytest.raises(IndexError, match="not a valid level number"):
  1433. unstacked.stack([-4, -3])
  1434. def test_unstack_period_series(self):
  1435. # GH4342
  1436. idx1 = pd.PeriodIndex(
  1437. ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
  1438. freq="M",
  1439. name="period",
  1440. )
  1441. idx2 = Index(["A", "B"] * 3, name="str")
  1442. value = [1, 2, 3, 4, 5, 6]
  1443. idx = MultiIndex.from_arrays([idx1, idx2])
  1444. s = Series(value, index=idx)
  1445. result1 = s.unstack()
  1446. result2 = s.unstack(level=1)
  1447. result3 = s.unstack(level=0)
  1448. e_idx = pd.PeriodIndex(
  1449. ["2013-01", "2013-02", "2013-03"], freq="M", name="period"
  1450. )
  1451. expected = DataFrame(
  1452. {"A": [1, 3, 5], "B": [2, 4, 6]}, index=e_idx, columns=["A", "B"]
  1453. )
  1454. expected.columns.name = "str"
  1455. tm.assert_frame_equal(result1, expected)
  1456. tm.assert_frame_equal(result2, expected)
  1457. tm.assert_frame_equal(result3, expected.T)
  1458. idx1 = pd.PeriodIndex(
  1459. ["2013-01", "2013-01", "2013-02", "2013-02", "2013-03", "2013-03"],
  1460. freq="M",
  1461. name="period1",
  1462. )
  1463. idx2 = pd.PeriodIndex(
  1464. ["2013-12", "2013-11", "2013-10", "2013-09", "2013-08", "2013-07"],
  1465. freq="M",
  1466. name="period2",
  1467. )
  1468. idx = MultiIndex.from_arrays([idx1, idx2])
  1469. s = Series(value, index=idx)
  1470. result1 = s.unstack()
  1471. result2 = s.unstack(level=1)
  1472. result3 = s.unstack(level=0)
  1473. e_idx = pd.PeriodIndex(
  1474. ["2013-01", "2013-02", "2013-03"], freq="M", name="period1"
  1475. )
  1476. e_cols = pd.PeriodIndex(
  1477. ["2013-07", "2013-08", "2013-09", "2013-10", "2013-11", "2013-12"],
  1478. freq="M",
  1479. name="period2",
  1480. )
  1481. expected = DataFrame(
  1482. [
  1483. [np.nan, np.nan, np.nan, np.nan, 2, 1],
  1484. [np.nan, np.nan, 4, 3, np.nan, np.nan],
  1485. [6, 5, np.nan, np.nan, np.nan, np.nan],
  1486. ],
  1487. index=e_idx,
  1488. columns=e_cols,
  1489. )
  1490. tm.assert_frame_equal(result1, expected)
  1491. tm.assert_frame_equal(result2, expected)
  1492. tm.assert_frame_equal(result3, expected.T)
  1493. def test_unstack_period_frame(self):
  1494. # GH4342
  1495. idx1 = pd.PeriodIndex(
  1496. ["2014-01", "2014-02", "2014-02", "2014-02", "2014-01", "2014-01"],
  1497. freq="M",
  1498. name="period1",
  1499. )
  1500. idx2 = pd.PeriodIndex(
  1501. ["2013-12", "2013-12", "2014-02", "2013-10", "2013-10", "2014-02"],
  1502. freq="M",
  1503. name="period2",
  1504. )
  1505. value = {"A": [1, 2, 3, 4, 5, 6], "B": [6, 5, 4, 3, 2, 1]}
  1506. idx = MultiIndex.from_arrays([idx1, idx2])
  1507. df = DataFrame(value, index=idx)
  1508. result1 = df.unstack()
  1509. result2 = df.unstack(level=1)
  1510. result3 = df.unstack(level=0)
  1511. e_1 = pd.PeriodIndex(["2014-01", "2014-02"], freq="M", name="period1")
  1512. e_2 = pd.PeriodIndex(
  1513. ["2013-10", "2013-12", "2014-02", "2013-10", "2013-12", "2014-02"],
  1514. freq="M",
  1515. name="period2",
  1516. )
  1517. e_cols = MultiIndex.from_arrays(["A A A B B B".split(), e_2])
  1518. expected = DataFrame(
  1519. [[5, 1, 6, 2, 6, 1], [4, 2, 3, 3, 5, 4]], index=e_1, columns=e_cols
  1520. )
  1521. tm.assert_frame_equal(result1, expected)
  1522. tm.assert_frame_equal(result2, expected)
  1523. e_1 = pd.PeriodIndex(
  1524. ["2014-01", "2014-02", "2014-01", "2014-02"], freq="M", name="period1"
  1525. )
  1526. e_2 = pd.PeriodIndex(
  1527. ["2013-10", "2013-12", "2014-02"], freq="M", name="period2"
  1528. )
  1529. e_cols = MultiIndex.from_arrays(["A A B B".split(), e_1])
  1530. expected = DataFrame(
  1531. [[5, 4, 2, 3], [1, 2, 6, 5], [6, 3, 1, 4]], index=e_2, columns=e_cols
  1532. )
  1533. tm.assert_frame_equal(result3, expected)
  def test_stack_multiple_bug(self):
      # bug when some uniques are not present in the data GH#3170
      dates = pd.to_datetime(["2013-01-03", "2013-01-04", "2013-01-05"] * 2)
      measurements = np.random.randint(0, 100, 6)
      frame = DataFrame(
          {
              "ID": [1, 1, 1, 2, 2, 2],
              "NAME": ["a", "a", "a", "b", "b", "b"],
              "DATE": dates,
              "VAR1": measurements,
          }
      )

      indexed = frame.set_index(["DATE", "ID"])
      indexed.columns.name = "Params"
      wide = indexed.unstack("ID")

      # Averaging everything (including the string column) must fail ...
      with pytest.raises(TypeError, match="Could not convert"):
          wide.resample("W-THU").mean()

      # ... while the numeric-only mean, stacked back, has to agree with the
      # same pipeline run on just the "VAR1" columns.
      result = wide.resample("W-THU").mean(numeric_only=True).stack("ID")

      numeric_only = wide.loc[:, ["VAR1"]]
      expected = numeric_only.resample("W-THU").mean().stack("ID")
      expected.columns.name = "Params"

      tm.assert_frame_equal(result, expected)
  1551. def test_stack_dropna(self):
  1552. # GH#3997
  1553. df = DataFrame({"A": ["a1", "a2"], "B": ["b1", "b2"], "C": [1, 1]})
  1554. df = df.set_index(["A", "B"])
  1555. stacked = df.unstack().stack(dropna=False)
  1556. assert len(stacked) > len(stacked.dropna())
  1557. stacked = df.unstack().stack(dropna=True)
  1558. tm.assert_frame_equal(stacked, stacked.dropna())
  1559. def test_unstack_multiple_hierarchical(self):
  1560. df = DataFrame(
  1561. index=[
  1562. [0, 0, 0, 0, 1, 1, 1, 1],
  1563. [0, 0, 1, 1, 0, 0, 1, 1],
  1564. [0, 1, 0, 1, 0, 1, 0, 1],
  1565. ],
  1566. columns=[[0, 0, 1, 1], [0, 1, 0, 1]],
  1567. )
  1568. df.index.names = ["a", "b", "c"]
  1569. df.columns.names = ["d", "e"]
  1570. # it works!
  1571. df.unstack(["b", "c"])
  1572. def test_unstack_sparse_keyspace(self):
  1573. # memory problems with naive impl GH#2278
  1574. # Generate Long File & Test Pivot
  1575. NUM_ROWS = 1000
  1576. df = DataFrame(
  1577. {
  1578. "A": np.random.randint(100, size=NUM_ROWS),
  1579. "B": np.random.randint(300, size=NUM_ROWS),
  1580. "C": np.random.randint(-7, 7, size=NUM_ROWS),
  1581. "D": np.random.randint(-19, 19, size=NUM_ROWS),
  1582. "E": np.random.randint(3000, size=NUM_ROWS),
  1583. "F": np.random.randn(NUM_ROWS),
  1584. }
  1585. )
  1586. idf = df.set_index(["A", "B", "C", "D", "E"])
  1587. # it works! is sufficient
  1588. idf.unstack("E")
  1589. def test_unstack_unobserved_keys(self):
  1590. # related to GH#2278 refactoring
  1591. levels = [[0, 1], [0, 1, 2, 3]]
  1592. codes = [[0, 0, 1, 1], [0, 2, 0, 2]]
  1593. index = MultiIndex(levels, codes)
  1594. df = DataFrame(np.random.randn(4, 2), index=index)
  1595. result = df.unstack()
  1596. assert len(result.columns) == 4
  1597. recons = result.stack()
  1598. tm.assert_frame_equal(recons, df)
  @pytest.mark.slow
  def test_unstack_number_of_levels_larger_than_int32(self, monkeypatch):
      # GH#20601
      # GH 26314: Change ValueError to PerformanceWarning

      class _AbortingUnstacker(reshape_lib._Unstacker):
          # Lets the parent __init__ run (that is where the warning is
          # raised) and then aborts so the final result is never computed.
          def __init__(self, *args, **kwargs) -> None:
              super().__init__(*args, **kwargs)
              raise Exception("Don't compute final result.")

      n_rows = 2**16
      warning_msg = "The following operation may generate"
      abort_msg = "Don't compute final result."

      with monkeypatch.context() as patcher:
          patcher.setattr(reshape_lib, "_Unstacker", _AbortingUnstacker)
          frame = DataFrame(
              np.random.randn(n_rows, 2),
              index=[np.arange(n_rows), np.arange(n_rows)],
          )
          # Both expectations wrap the same call: the warning is emitted
          # first, then the stand-in unstacker raises.
          with tm.assert_produces_warning(
              PerformanceWarning, match=warning_msg
          ), pytest.raises(Exception, match=abort_msg):
              frame.unstack()
  @pytest.mark.parametrize(
      "levels",
      [
          level_pair
          for width in [2, 3]
          for level_pair in itertools.product(
              itertools.permutations([0, 1, 2], width), repeat=2
          )
      ],
  )
  @pytest.mark.parametrize("stack_lev", range(2))
  def test_stack_order_with_unsorted_levels(self, levels, stack_lev):
      # GH#16323
      # deep check for 1-row case
      mi = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
      frame = DataFrame(columns=mi, data=[range(4)])
      stacked = frame.stack(stack_lev)

      # Every cell of the wide frame must be found in the stacked frame
      # under (row, stacked-level value) / remaining-level value.
      kept_lev = 1 - stack_lev
      for row, col in itertools.product(frame.index, frame.columns):
          moved_key = (row, col[stack_lev])
          assert frame.loc[row, col] == stacked.loc[moved_key, col[kept_lev]]
  1638. def test_stack_order_with_unsorted_levels_multi_row(self):
  1639. # GH#16323
  1640. # check multi-row case
  1641. mi = MultiIndex(
  1642. levels=[["A", "C", "B"], ["B", "A", "C"]],
  1643. codes=[np.repeat(range(3), 3), np.tile(range(3), 3)],
  1644. )
  1645. df = DataFrame(
  1646. columns=mi, index=range(5), data=np.arange(5 * len(mi)).reshape(5, -1)
  1647. )
  1648. assert all(
  1649. df.loc[row, col] == df.stack(0).loc[(row, col[0]), col[1]]
  1650. for row in df.index
  1651. for col in df.columns
  1652. )
  1653. def test_stack_unstack_unordered_multiindex(self):
  1654. # GH# 18265
  1655. values = np.arange(5)
  1656. data = np.vstack(
  1657. [
  1658. [f"b{x}" for x in values], # b0, b1, ..
  1659. [f"a{x}" for x in values], # a0, a1, ..
  1660. ]
  1661. )
  1662. df = DataFrame(data.T, columns=["b", "a"])
  1663. df.columns.name = "first"
  1664. second_level_dict = {"x": df}
  1665. multi_level_df = pd.concat(second_level_dict, axis=1)
  1666. multi_level_df.columns.names = ["second", "first"]
  1667. df = multi_level_df.reindex(sorted(multi_level_df.columns), axis=1)
  1668. result = df.stack(["first", "second"]).unstack(["first", "second"])
  1669. expected = DataFrame(
  1670. [["a0", "b0"], ["a1", "b1"], ["a2", "b2"], ["a3", "b3"], ["a4", "b4"]],
  1671. index=[0, 1, 2, 3, 4],
  1672. columns=MultiIndex.from_tuples(
  1673. [("a", "x"), ("b", "x")], names=["first", "second"]
  1674. ),
  1675. )
  1676. tm.assert_frame_equal(result, expected)
  1677. def test_unstack_preserve_types(
  1678. self, multiindex_year_month_day_dataframe_random_data
  1679. ):
  1680. # GH#403
  1681. ymd = multiindex_year_month_day_dataframe_random_data
  1682. ymd["E"] = "foo"
  1683. ymd["F"] = 2
  1684. unstacked = ymd.unstack("month")
  1685. assert unstacked["A", 1].dtype == np.float64
  1686. assert unstacked["E", 1].dtype == np.object_
  1687. assert unstacked["F", 1].dtype == np.float64
  1688. def test_unstack_group_index_overflow(self):
  1689. codes = np.tile(np.arange(500), 2)
  1690. level = np.arange(500)
  1691. index = MultiIndex(
  1692. levels=[level] * 8 + [[0, 1]],
  1693. codes=[codes] * 8 + [np.arange(2).repeat(500)],
  1694. )
  1695. s = Series(np.arange(1000), index=index)
  1696. result = s.unstack()
  1697. assert result.shape == (500, 2)
  1698. # test roundtrip
  1699. stacked = result.stack()
  1700. tm.assert_series_equal(s, stacked.reindex(s.index))
  1701. # put it at beginning
  1702. index = MultiIndex(
  1703. levels=[[0, 1]] + [level] * 8,
  1704. codes=[np.arange(2).repeat(500)] + [codes] * 8,
  1705. )
  1706. s = Series(np.arange(1000), index=index)
  1707. result = s.unstack(0)
  1708. assert result.shape == (500, 2)
  1709. # put it in middle
  1710. index = MultiIndex(
  1711. levels=[level] * 4 + [[0, 1]] + [level] * 4,
  1712. codes=([codes] * 4 + [np.arange(2).repeat(500)] + [codes] * 4),
  1713. )
  1714. s = Series(np.arange(1000), index=index)
  1715. result = s.unstack(4)
  1716. assert result.shape == (500, 2)
  1717. def test_unstack_with_missing_int_cast_to_float(self, using_array_manager):
  1718. # https://github.com/pandas-dev/pandas/issues/37115
  1719. df = DataFrame(
  1720. {
  1721. "a": ["A", "A", "B"],
  1722. "b": ["ca", "cb", "cb"],
  1723. "v": [10] * 3,
  1724. }
  1725. ).set_index(["a", "b"])
  1726. # add another int column to get 2 blocks
  1727. df["is_"] = 1
  1728. if not using_array_manager:
  1729. assert len(df._mgr.blocks) == 2
  1730. result = df.unstack("b")
  1731. result[("is_", "ca")] = result[("is_", "ca")].fillna(0)
  1732. expected = DataFrame(
  1733. [[10.0, 10.0, 1.0, 1.0], [np.nan, 10.0, 0.0, 1.0]],
  1734. index=Index(["A", "B"], dtype="object", name="a"),
  1735. columns=MultiIndex.from_tuples(
  1736. [("v", "ca"), ("v", "cb"), ("is_", "ca"), ("is_", "cb")],
  1737. names=[None, "b"],
  1738. ),
  1739. )
  1740. if using_array_manager:
  1741. # INFO(ArrayManager) with ArrayManager preserve dtype where possible
  1742. expected[("v", "cb")] = expected[("v", "cb")].astype("int64")
  1743. expected[("is_", "cb")] = expected[("is_", "cb")].astype("int64")
  1744. tm.assert_frame_equal(result, expected)
  1745. def test_unstack_with_level_has_nan(self):
  1746. # GH 37510
  1747. df1 = DataFrame(
  1748. {
  1749. "L1": [1, 2, 3, 4],
  1750. "L2": [3, 4, 1, 2],
  1751. "L3": [1, 1, 1, 1],
  1752. "x": [1, 2, 3, 4],
  1753. }
  1754. )
  1755. df1 = df1.set_index(["L1", "L2", "L3"])
  1756. new_levels = ["n1", "n2", "n3", None]
  1757. df1.index = df1.index.set_levels(levels=new_levels, level="L1")
  1758. df1.index = df1.index.set_levels(levels=new_levels, level="L2")
  1759. result = df1.unstack("L3")[("x", 1)].sort_index().index
  1760. expected = MultiIndex(
  1761. levels=[["n1", "n2", "n3", None], ["n1", "n2", "n3", None]],
  1762. codes=[[0, 1, 2, 3], [2, 3, 0, 1]],
  1763. names=["L1", "L2"],
  1764. )
  1765. tm.assert_index_equal(result, expected)
  1766. def test_stack_nan_in_multiindex_columns(self):
  1767. # GH#39481
  1768. df = DataFrame(
  1769. np.zeros([1, 5]),
  1770. columns=MultiIndex.from_tuples(
  1771. [
  1772. (0, None, None),
  1773. (0, 2, 0),
  1774. (0, 2, 1),
  1775. (0, 3, 0),
  1776. (0, 3, 1),
  1777. ],
  1778. ),
  1779. )
  1780. result = df.stack(2)
  1781. expected = DataFrame(
  1782. [[0.0, np.nan, np.nan], [np.nan, 0.0, 0.0], [np.nan, 0.0, 0.0]],
  1783. index=Index([(0, None), (0, 0), (0, 1)]),
  1784. columns=Index([(0, None), (0, 2), (0, 3)]),
  1785. )
  1786. tm.assert_frame_equal(result, expected)
  1787. def test_multi_level_stack_categorical(self):
  1788. # GH 15239
  1789. midx = MultiIndex.from_arrays(
  1790. [
  1791. ["A"] * 2 + ["B"] * 2,
  1792. pd.Categorical(list("abab")),
  1793. pd.Categorical(list("ccdd")),
  1794. ]
  1795. )
  1796. df = DataFrame(np.arange(8).reshape(2, 4), columns=midx)
  1797. result = df.stack([1, 2])
  1798. expected = DataFrame(
  1799. [
  1800. [0, np.nan],
  1801. [np.nan, 2],
  1802. [1, np.nan],
  1803. [np.nan, 3],
  1804. [4, np.nan],
  1805. [np.nan, 6],
  1806. [5, np.nan],
  1807. [np.nan, 7],
  1808. ],
  1809. columns=["A", "B"],
  1810. index=MultiIndex.from_arrays(
  1811. [
  1812. [0] * 4 + [1] * 4,
  1813. pd.Categorical(list("aabbaabb")),
  1814. pd.Categorical(list("cdcdcdcd")),
  1815. ]
  1816. ),
  1817. )
  1818. tm.assert_frame_equal(result, expected)
  1819. def test_stack_nan_level(self):
  1820. # GH 9406
  1821. df_nan = DataFrame(
  1822. np.arange(4).reshape(2, 2),
  1823. columns=MultiIndex.from_tuples(
  1824. [("A", np.nan), ("B", "b")], names=["Upper", "Lower"]
  1825. ),
  1826. index=Index([0, 1], name="Num"),
  1827. dtype=np.float64,
  1828. )
  1829. result = df_nan.stack()
  1830. expected = DataFrame(
  1831. [[0.0, np.nan], [np.nan, 1], [2.0, np.nan], [np.nan, 3.0]],
  1832. columns=Index(["A", "B"], name="Upper"),
  1833. index=MultiIndex.from_tuples(
  1834. [(0, np.nan), (0, "b"), (1, np.nan), (1, "b")], names=["Num", "Lower"]
  1835. ),
  1836. )
  1837. tm.assert_frame_equal(result, expected)
  1838. def test_unstack_categorical_columns(self):
  1839. # GH 14018
  1840. idx = MultiIndex.from_product([["A"], [0, 1]])
  1841. df = DataFrame({"cat": pd.Categorical(["a", "b"])}, index=idx)
  1842. result = df.unstack()
  1843. expected = DataFrame(
  1844. {
  1845. 0: pd.Categorical(["a"], categories=["a", "b"]),
  1846. 1: pd.Categorical(["b"], categories=["a", "b"]),
  1847. },
  1848. index=["A"],
  1849. )
  1850. expected.columns = MultiIndex.from_tuples([("cat", 0), ("cat", 1)])
  1851. tm.assert_frame_equal(result, expected)
  1852. def test_stack_unsorted(self):
  1853. # GH 16925
  1854. PAE = ["ITA", "FRA"]
  1855. VAR = ["A1", "A2"]
  1856. TYP = ["CRT", "DBT", "NET"]
  1857. MI = MultiIndex.from_product([PAE, VAR, TYP], names=["PAE", "VAR", "TYP"])
  1858. V = list(range(len(MI)))
  1859. DF = DataFrame(data=V, index=MI, columns=["VALUE"])
  1860. DF = DF.unstack(["VAR", "TYP"])
  1861. DF.columns = DF.columns.droplevel(0)
  1862. DF.loc[:, ("A0", "NET")] = 9999
  1863. result = DF.stack(["VAR", "TYP"]).sort_index()
  1864. expected = DF.sort_index(axis=1).stack(["VAR", "TYP"]).sort_index()
  1865. tm.assert_series_equal(result, expected)
  1866. def test_stack_nullable_dtype(self):
  1867. # GH#43561
  1868. columns = MultiIndex.from_product(
  1869. [["54511", "54515"], ["r", "t_mean"]], names=["station", "element"]
  1870. )
  1871. index = Index([1, 2, 3], name="time")
  1872. arr = np.array([[50, 226, 10, 215], [10, 215, 9, 220], [305, 232, 111, 220]])
  1873. df = DataFrame(arr, columns=columns, index=index, dtype=pd.Int64Dtype())
  1874. result = df.stack("station")
  1875. expected = df.astype(np.int64).stack("station").astype(pd.Int64Dtype())
  1876. tm.assert_frame_equal(result, expected)
  1877. # non-homogeneous case
  1878. df[df.columns[0]] = df[df.columns[0]].astype(pd.Float64Dtype())
  1879. result = df.stack("station")
  1880. # TODO(EA2D): we get object dtype because DataFrame.values can't
  1881. # be an EA
  1882. expected = df.astype(object).stack("station")
  1883. tm.assert_frame_equal(result, expected)
  1884. def test_unstack_mixed_level_names(self):
  1885. # GH#48763
  1886. arrays = [["a", "a"], [1, 2], ["red", "blue"]]
  1887. idx = MultiIndex.from_arrays(arrays, names=("x", 0, "y"))
  1888. df = DataFrame({"m": [1, 2]}, index=idx)
  1889. result = df.unstack("x")
  1890. expected = DataFrame(
  1891. [[1], [2]],
  1892. columns=MultiIndex.from_tuples([("m", "a")], names=[None, "x"]),
  1893. index=MultiIndex.from_tuples([(1, "red"), (2, "blue")], names=[0, "y"]),
  1894. )
  1895. tm.assert_frame_equal(result, expected)