melt.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540
  1. from __future__ import annotations
  2. import re
  3. from typing import (
  4. TYPE_CHECKING,
  5. Hashable,
  6. )
  7. import numpy as np
  8. from pandas.util._decorators import Appender
  9. from pandas.core.dtypes.common import (
  10. is_extension_array_dtype,
  11. is_list_like,
  12. )
  13. from pandas.core.dtypes.concat import concat_compat
  14. from pandas.core.dtypes.missing import notna
  15. import pandas.core.algorithms as algos
  16. from pandas.core.arrays import Categorical
  17. import pandas.core.common as com
  18. from pandas.core.indexes.api import (
  19. Index,
  20. MultiIndex,
  21. )
  22. from pandas.core.reshape.concat import concat
  23. from pandas.core.reshape.util import tile_compat
  24. from pandas.core.shared_docs import _shared_docs
  25. from pandas.core.tools.numeric import to_numeric
  26. if TYPE_CHECKING:
  27. from pandas._typing import AnyArrayLike
  28. from pandas import DataFrame
# Docstring is supplied via @Appender from the shared docs template, so none
# is written inline here.
@Appender(_shared_docs["melt"] % {"caller": "pd.melt(df, ", "other": "DataFrame.melt"})
def melt(
    frame: DataFrame,
    id_vars=None,
    value_vars=None,
    var_name=None,
    value_name: Hashable = "value",
    col_level=None,
    ignore_index: bool = True,
) -> DataFrame:
    # If multiindex, gather names of columns on all levels for checking
    # presence of `id_vars` and `value_vars`.
    if isinstance(frame.columns, MultiIndex):
        cols = [x for c in frame.columns for x in c]
    else:
        cols = list(frame.columns)

    # The melted value column must not collide with an existing column.
    # NOTE(review): with MultiIndex columns, `value_name in frame.columns`
    # tests membership in the first level only — confirm that is intended.
    if value_name in frame.columns:
        raise ValueError(
            f"value_name ({value_name}) cannot match an element in "
            "the DataFrame columns."
        )

    # Normalize `id_vars` to a list and validate that every entry exists.
    if id_vars is not None:
        if not is_list_like(id_vars):
            id_vars = [id_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(id_vars, list):
            raise ValueError(
                "id_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            # Check that `id_vars` are in frame
            id_vars = list(id_vars)
            missing = Index(com.flatten(id_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'id_vars' are not present "
                    f"in the DataFrame: {list(missing)}"
                )
    else:
        id_vars = []

    # Normalize `value_vars` the same way, then restrict `frame` to only the
    # columns that participate in the melt (id_vars + value_vars).
    if value_vars is not None:
        if not is_list_like(value_vars):
            value_vars = [value_vars]
        elif isinstance(frame.columns, MultiIndex) and not isinstance(value_vars, list):
            raise ValueError(
                "value_vars must be a list of tuples when columns are a MultiIndex"
            )
        else:
            value_vars = list(value_vars)
            # Check that `value_vars` are in frame
            missing = Index(com.flatten(value_vars)).difference(cols)
            if not missing.empty:
                raise KeyError(
                    "The following 'value_vars' are not present in "
                    f"the DataFrame: {list(missing)}"
                )
        if col_level is not None:
            idx = frame.columns.get_level_values(col_level).get_indexer(
                id_vars + value_vars
            )
        else:
            # unique() keeps first occurrence order while dropping duplicate
            # positions that can arise from duplicate column labels.
            idx = algos.unique(frame.columns.get_indexer_for(id_vars + value_vars))
        frame = frame.iloc[:, idx]
    else:
        # No column subsetting requested; copy so the later frame.pop calls
        # do not mutate the caller's DataFrame.
        frame = frame.copy()

    if col_level is not None:  # allow list or other?
        # frame is a copy
        frame.columns = frame.columns.get_level_values(col_level)

    # Derive default variable-column name(s) from the columns' name(s).
    if var_name is None:
        if isinstance(frame.columns, MultiIndex):
            if len(frame.columns.names) == len(set(frame.columns.names)):
                var_name = frame.columns.names
            else:
                # Duplicate level names would collide; synthesize unique ones.
                var_name = [f"variable_{i}" for i in range(len(frame.columns.names))]
        else:
            var_name = [
                frame.columns.name if frame.columns.name is not None else "variable"
            ]
    if isinstance(var_name, str):
        var_name = [var_name]

    # N = number of rows, K = number of value columns being melted.
    N, K = frame.shape
    K -= len(id_vars)

    mdata: dict[Hashable, AnyArrayLike] = {}
    for col in id_vars:
        # pop() removes the id column from `frame` (a copy/subset by now),
        # leaving only value columns behind.
        id_data = frame.pop(col)
        if is_extension_array_dtype(id_data):
            if K > 0:
                id_data = concat([id_data] * K, ignore_index=True)
            else:
                # We can't concat empty list. (GH 46044)
                id_data = type(id_data)([], name=id_data.name, dtype=id_data.dtype)
        else:
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[Any]]", variable has type "Series")
            id_data = np.tile(id_data._values, K)  # type: ignore[assignment]
        mdata[col] = id_data

    mcolumns = id_vars + var_name + [value_name]

    # Stack all value columns end-to-end into the single value column.
    if frame.shape[1] > 0:
        mdata[value_name] = concat(
            [frame.iloc[:, i] for i in range(frame.shape[1])]
        ).values
    else:
        mdata[value_name] = frame._values.ravel("F")
    for i, col in enumerate(var_name):
        # asanyarray will keep the columns as an Index
        mdata[col] = np.asanyarray(frame.columns._get_level_values(i)).repeat(N)

    result = frame._constructor(mdata, columns=mcolumns)

    if not ignore_index:
        # Repeat the original index once per melted value column.
        result.index = tile_compat(frame.index, K)

    return result
  138. def lreshape(data: DataFrame, groups, dropna: bool = True) -> DataFrame:
  139. """
  140. Reshape wide-format data to long. Generalized inverse of DataFrame.pivot.
  141. Accepts a dictionary, ``groups``, in which each key is a new column name
  142. and each value is a list of old column names that will be "melted" under
  143. the new column name as part of the reshape.
  144. Parameters
  145. ----------
  146. data : DataFrame
  147. The wide-format DataFrame.
  148. groups : dict
  149. {new_name : list_of_columns}.
  150. dropna : bool, default True
  151. Do not include columns whose entries are all NaN.
  152. Returns
  153. -------
  154. DataFrame
  155. Reshaped DataFrame.
  156. See Also
  157. --------
  158. melt : Unpivot a DataFrame from wide to long format, optionally leaving
  159. identifiers set.
  160. pivot : Create a spreadsheet-style pivot table as a DataFrame.
  161. DataFrame.pivot : Pivot without aggregation that can handle
  162. non-numeric data.
  163. DataFrame.pivot_table : Generalization of pivot that can handle
  164. duplicate values for one index/column pair.
  165. DataFrame.unstack : Pivot based on the index values instead of a
  166. column.
  167. wide_to_long : Wide panel to long format. Less flexible but more
  168. user-friendly than melt.
  169. Examples
  170. --------
  171. >>> data = pd.DataFrame({'hr1': [514, 573], 'hr2': [545, 526],
  172. ... 'team': ['Red Sox', 'Yankees'],
  173. ... 'year1': [2007, 2007], 'year2': [2008, 2008]})
  174. >>> data
  175. hr1 hr2 team year1 year2
  176. 0 514 545 Red Sox 2007 2008
  177. 1 573 526 Yankees 2007 2008
  178. >>> pd.lreshape(data, {'year': ['year1', 'year2'], 'hr': ['hr1', 'hr2']})
  179. team year hr
  180. 0 Red Sox 2007 514
  181. 1 Yankees 2007 573
  182. 2 Red Sox 2008 545
  183. 3 Yankees 2008 526
  184. """
  185. if isinstance(groups, dict):
  186. keys = list(groups.keys())
  187. values = list(groups.values())
  188. else:
  189. keys, values = zip(*groups)
  190. all_cols = list(set.union(*(set(x) for x in values)))
  191. id_cols = list(data.columns.difference(all_cols))
  192. K = len(values[0])
  193. for seq in values:
  194. if len(seq) != K:
  195. raise ValueError("All column lists must be same length")
  196. mdata = {}
  197. pivot_cols = []
  198. for target, names in zip(keys, values):
  199. to_concat = [data[col]._values for col in names]
  200. mdata[target] = concat_compat(to_concat)
  201. pivot_cols.append(target)
  202. for col in id_cols:
  203. mdata[col] = np.tile(data[col]._values, K)
  204. if dropna:
  205. mask = np.ones(len(mdata[pivot_cols[0]]), dtype=bool)
  206. for c in pivot_cols:
  207. mask &= notna(mdata[c])
  208. if not mask.all():
  209. mdata = {k: v[mask] for k, v in mdata.items()}
  210. return data._constructor(mdata, columns=id_cols + pivot_cols)
  211. def wide_to_long(
  212. df: DataFrame, stubnames, i, j, sep: str = "", suffix: str = r"\d+"
  213. ) -> DataFrame:
  214. r"""
  215. Unpivot a DataFrame from wide to long format.
  216. Less flexible but more user-friendly than melt.
  217. With stubnames ['A', 'B'], this function expects to find one or more
  218. group of columns with format
  219. A-suffix1, A-suffix2,..., B-suffix1, B-suffix2,...
  220. You specify what you want to call this suffix in the resulting long format
  221. with `j` (for example `j='year'`)
  222. Each row of these wide variables are assumed to be uniquely identified by
  223. `i` (can be a single column name or a list of column names)
  224. All remaining variables in the data frame are left intact.
  225. Parameters
  226. ----------
  227. df : DataFrame
  228. The wide-format DataFrame.
  229. stubnames : str or list-like
  230. The stub name(s). The wide format variables are assumed to
  231. start with the stub names.
  232. i : str or list-like
  233. Column(s) to use as id variable(s).
  234. j : str
  235. The name of the sub-observation variable. What you wish to name your
  236. suffix in the long format.
  237. sep : str, default ""
  238. A character indicating the separation of the variable names
  239. in the wide format, to be stripped from the names in the long format.
  240. For example, if your column names are A-suffix1, A-suffix2, you
  241. can strip the hyphen by specifying `sep='-'`.
  242. suffix : str, default '\\d+'
  243. A regular expression capturing the wanted suffixes. '\\d+' captures
  244. numeric suffixes. Suffixes with no numbers could be specified with the
  245. negated character class '\\D+'. You can also further disambiguate
  246. suffixes, for example, if your wide variables are of the form A-one,
  247. B-two,.., and you have an unrelated column A-rating, you can ignore the
  248. last one by specifying `suffix='(!?one|two)'`. When all suffixes are
  249. numeric, they are cast to int64/float64.
  250. Returns
  251. -------
  252. DataFrame
  253. A DataFrame that contains each stub name as a variable, with new index
  254. (i, j).
  255. See Also
  256. --------
  257. melt : Unpivot a DataFrame from wide to long format, optionally leaving
  258. identifiers set.
  259. pivot : Create a spreadsheet-style pivot table as a DataFrame.
  260. DataFrame.pivot : Pivot without aggregation that can handle
  261. non-numeric data.
  262. DataFrame.pivot_table : Generalization of pivot that can handle
  263. duplicate values for one index/column pair.
  264. DataFrame.unstack : Pivot based on the index values instead of a
  265. column.
  266. Notes
  267. -----
  268. All extra variables are left untouched. This simply uses
  269. `pandas.melt` under the hood, but is hard-coded to "do the right thing"
  270. in a typical case.
  271. Examples
  272. --------
  273. >>> np.random.seed(123)
  274. >>> df = pd.DataFrame({"A1970" : {0 : "a", 1 : "b", 2 : "c"},
  275. ... "A1980" : {0 : "d", 1 : "e", 2 : "f"},
  276. ... "B1970" : {0 : 2.5, 1 : 1.2, 2 : .7},
  277. ... "B1980" : {0 : 3.2, 1 : 1.3, 2 : .1},
  278. ... "X" : dict(zip(range(3), np.random.randn(3)))
  279. ... })
  280. >>> df["id"] = df.index
  281. >>> df
  282. A1970 A1980 B1970 B1980 X id
  283. 0 a d 2.5 3.2 -1.085631 0
  284. 1 b e 1.2 1.3 0.997345 1
  285. 2 c f 0.7 0.1 0.282978 2
  286. >>> pd.wide_to_long(df, ["A", "B"], i="id", j="year")
  287. ... # doctest: +NORMALIZE_WHITESPACE
  288. X A B
  289. id year
  290. 0 1970 -1.085631 a 2.5
  291. 1 1970 0.997345 b 1.2
  292. 2 1970 0.282978 c 0.7
  293. 0 1980 -1.085631 d 3.2
  294. 1 1980 0.997345 e 1.3
  295. 2 1980 0.282978 f 0.1
  296. With multiple id columns
  297. >>> df = pd.DataFrame({
  298. ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
  299. ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
  300. ... 'ht1': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
  301. ... 'ht2': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
  302. ... })
  303. >>> df
  304. famid birth ht1 ht2
  305. 0 1 1 2.8 3.4
  306. 1 1 2 2.9 3.8
  307. 2 1 3 2.2 2.9
  308. 3 2 1 2.0 3.2
  309. 4 2 2 1.8 2.8
  310. 5 2 3 1.9 2.4
  311. 6 3 1 2.2 3.3
  312. 7 3 2 2.3 3.4
  313. 8 3 3 2.1 2.9
  314. >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age')
  315. >>> l
  316. ... # doctest: +NORMALIZE_WHITESPACE
  317. ht
  318. famid birth age
  319. 1 1 1 2.8
  320. 2 3.4
  321. 2 1 2.9
  322. 2 3.8
  323. 3 1 2.2
  324. 2 2.9
  325. 2 1 1 2.0
  326. 2 3.2
  327. 2 1 1.8
  328. 2 2.8
  329. 3 1 1.9
  330. 2 2.4
  331. 3 1 1 2.2
  332. 2 3.3
  333. 2 1 2.3
  334. 2 3.4
  335. 3 1 2.1
  336. 2 2.9
  337. Going from long back to wide just takes some creative use of `unstack`
  338. >>> w = l.unstack()
  339. >>> w.columns = w.columns.map('{0[0]}{0[1]}'.format)
  340. >>> w.reset_index()
  341. famid birth ht1 ht2
  342. 0 1 1 2.8 3.4
  343. 1 1 2 2.9 3.8
  344. 2 1 3 2.2 2.9
  345. 3 2 1 2.0 3.2
  346. 4 2 2 1.8 2.8
  347. 5 2 3 1.9 2.4
  348. 6 3 1 2.2 3.3
  349. 7 3 2 2.3 3.4
  350. 8 3 3 2.1 2.9
  351. Less wieldy column names are also handled
  352. >>> np.random.seed(0)
  353. >>> df = pd.DataFrame({'A(weekly)-2010': np.random.rand(3),
  354. ... 'A(weekly)-2011': np.random.rand(3),
  355. ... 'B(weekly)-2010': np.random.rand(3),
  356. ... 'B(weekly)-2011': np.random.rand(3),
  357. ... 'X' : np.random.randint(3, size=3)})
  358. >>> df['id'] = df.index
  359. >>> df # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
  360. A(weekly)-2010 A(weekly)-2011 B(weekly)-2010 B(weekly)-2011 X id
  361. 0 0.548814 0.544883 0.437587 0.383442 0 0
  362. 1 0.715189 0.423655 0.891773 0.791725 1 1
  363. 2 0.602763 0.645894 0.963663 0.528895 1 2
  364. >>> pd.wide_to_long(df, ['A(weekly)', 'B(weekly)'], i='id',
  365. ... j='year', sep='-')
  366. ... # doctest: +NORMALIZE_WHITESPACE
  367. X A(weekly) B(weekly)
  368. id year
  369. 0 2010 0 0.548814 0.437587
  370. 1 2010 1 0.715189 0.891773
  371. 2 2010 1 0.602763 0.963663
  372. 0 2011 0 0.544883 0.383442
  373. 1 2011 1 0.423655 0.791725
  374. 2 2011 1 0.645894 0.528895
  375. If we have many columns, we could also use a regex to find our
  376. stubnames and pass that list on to wide_to_long
  377. >>> stubnames = sorted(
  378. ... set([match[0] for match in df.columns.str.findall(
  379. ... r'[A-B]\(.*\)').values if match != []])
  380. ... )
  381. >>> list(stubnames)
  382. ['A(weekly)', 'B(weekly)']
  383. All of the above examples have integers as suffixes. It is possible to
  384. have non-integers as suffixes.
  385. >>> df = pd.DataFrame({
  386. ... 'famid': [1, 1, 1, 2, 2, 2, 3, 3, 3],
  387. ... 'birth': [1, 2, 3, 1, 2, 3, 1, 2, 3],
  388. ... 'ht_one': [2.8, 2.9, 2.2, 2, 1.8, 1.9, 2.2, 2.3, 2.1],
  389. ... 'ht_two': [3.4, 3.8, 2.9, 3.2, 2.8, 2.4, 3.3, 3.4, 2.9]
  390. ... })
  391. >>> df
  392. famid birth ht_one ht_two
  393. 0 1 1 2.8 3.4
  394. 1 1 2 2.9 3.8
  395. 2 1 3 2.2 2.9
  396. 3 2 1 2.0 3.2
  397. 4 2 2 1.8 2.8
  398. 5 2 3 1.9 2.4
  399. 6 3 1 2.2 3.3
  400. 7 3 2 2.3 3.4
  401. 8 3 3 2.1 2.9
  402. >>> l = pd.wide_to_long(df, stubnames='ht', i=['famid', 'birth'], j='age',
  403. ... sep='_', suffix=r'\w+')
  404. >>> l
  405. ... # doctest: +NORMALIZE_WHITESPACE
  406. ht
  407. famid birth age
  408. 1 1 one 2.8
  409. two 3.4
  410. 2 one 2.9
  411. two 3.8
  412. 3 one 2.2
  413. two 2.9
  414. 2 1 one 2.0
  415. two 3.2
  416. 2 one 1.8
  417. two 2.8
  418. 3 one 1.9
  419. two 2.4
  420. 3 1 one 2.2
  421. two 3.3
  422. 2 one 2.3
  423. two 3.4
  424. 3 one 2.1
  425. two 2.9
  426. """
  427. def get_var_names(df, stub: str, sep: str, suffix: str) -> list[str]:
  428. regex = rf"^{re.escape(stub)}{re.escape(sep)}{suffix}$"
  429. pattern = re.compile(regex)
  430. return [col for col in df.columns if pattern.match(col)]
  431. def melt_stub(df, stub: str, i, j, value_vars, sep: str):
  432. newdf = melt(
  433. df,
  434. id_vars=i,
  435. value_vars=value_vars,
  436. value_name=stub.rstrip(sep),
  437. var_name=j,
  438. )
  439. newdf[j] = Categorical(newdf[j])
  440. newdf[j] = newdf[j].str.replace(re.escape(stub + sep), "", regex=True)
  441. # GH17627 Cast numerics suffixes to int/float
  442. newdf[j] = to_numeric(newdf[j], errors="ignore")
  443. return newdf.set_index(i + [j])
  444. if not is_list_like(stubnames):
  445. stubnames = [stubnames]
  446. else:
  447. stubnames = list(stubnames)
  448. if any(col in stubnames for col in df.columns):
  449. raise ValueError("stubname can't be identical to a column name")
  450. if not is_list_like(i):
  451. i = [i]
  452. else:
  453. i = list(i)
  454. if df[i].duplicated().any():
  455. raise ValueError("the id variables need to uniquely identify each row")
  456. value_vars = [get_var_names(df, stub, sep, suffix) for stub in stubnames]
  457. value_vars_flattened = [e for sublist in value_vars for e in sublist]
  458. id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened))
  459. _melted = [melt_stub(df, s, i, j, v, sep) for s, v in zip(stubnames, value_vars)]
  460. melted = _melted[0].join(_melted[1:], how="outer")
  461. if len(i) == 1:
  462. new = df[id_vars].set_index(i).join(melted)
  463. return new
  464. new = df[id_vars].merge(melted.reset_index(), on=i).set_index(i + [j])
  465. return new