reshape.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841
  1. from __future__ import annotations
  2. import itertools
  3. from typing import (
  4. TYPE_CHECKING,
  5. cast,
  6. )
  7. import warnings
  8. import numpy as np
  9. import pandas._libs.reshape as libreshape
  10. from pandas._typing import npt
  11. from pandas.errors import PerformanceWarning
  12. from pandas.util._decorators import cache_readonly
  13. from pandas.util._exceptions import find_stack_level
  14. from pandas.core.dtypes.cast import maybe_promote
  15. from pandas.core.dtypes.common import (
  16. ensure_platform_int,
  17. is_1d_only_ea_dtype,
  18. is_extension_array_dtype,
  19. is_integer,
  20. needs_i8_conversion,
  21. )
  22. from pandas.core.dtypes.dtypes import ExtensionDtype
  23. from pandas.core.dtypes.missing import notna
  24. import pandas.core.algorithms as algos
  25. from pandas.core.arrays.categorical import factorize_from_iterable
  26. from pandas.core.construction import ensure_wrapped_if_datetimelike
  27. from pandas.core.frame import DataFrame
  28. from pandas.core.indexes.api import (
  29. Index,
  30. MultiIndex,
  31. )
  32. from pandas.core.series import Series
  33. from pandas.core.sorting import (
  34. compress_group_index,
  35. decons_obs_group_ids,
  36. get_compressed_ids,
  37. get_group_index,
  38. get_group_index_sorter,
  39. )
  40. if TYPE_CHECKING:
  41. from pandas.core.arrays import ExtensionArray
  42. from pandas.core.indexes.frozen import FrozenList
  43. class _Unstacker:
  44. """
  45. Helper class to unstack data / pivot with multi-level index
  46. Parameters
  47. ----------
  48. index : MultiIndex
  49. level : int or str, default last level
  50. Level to "unstack". Accepts a name for the level.
  51. fill_value : scalar, optional
  52. Default value to fill in missing values if subgroups do not have the
  53. same set of labels. By default, missing values will be replaced with
  54. the default fill value for that data type, NaN for float, NaT for
  55. datetimelike, etc. For integer types, by default data will converted to
  56. float and missing values will be set to NaN.
  57. constructor : object
  58. Pandas ``DataFrame`` or subclass used to create unstacked
  59. response. If None, DataFrame will be used.
  60. Examples
  61. --------
  62. >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
  63. ... ('two', 'a'), ('two', 'b')])
  64. >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
  65. >>> s
  66. one a 1
  67. b 2
  68. two a 3
  69. b 4
  70. dtype: int64
  71. >>> s.unstack(level=-1)
  72. a b
  73. one 1 2
  74. two 3 4
  75. >>> s.unstack(level=0)
  76. one two
  77. a 1 3
  78. b 2 4
  79. Returns
  80. -------
  81. unstacked : DataFrame
  82. """
  83. def __init__(self, index: MultiIndex, level=-1, constructor=None) -> None:
  84. if constructor is None:
  85. constructor = DataFrame
  86. self.constructor = constructor
  87. self.index = index.remove_unused_levels()
  88. self.level = self.index._get_level_number(level)
  89. # when index includes `nan`, need to lift levels/strides by 1
  90. self.lift = 1 if -1 in self.index.codes[self.level] else 0
  91. # Note: the "pop" below alters these in-place.
  92. self.new_index_levels = list(self.index.levels)
  93. self.new_index_names = list(self.index.names)
  94. self.removed_name = self.new_index_names.pop(self.level)
  95. self.removed_level = self.new_index_levels.pop(self.level)
  96. self.removed_level_full = index.levels[self.level]
  97. # Bug fix GH 20601
  98. # If the data frame is too big, the number of unique index combination
  99. # will cause int32 overflow on windows environments.
  100. # We want to check and raise an error before this happens
  101. num_rows = np.max([index_level.size for index_level in self.new_index_levels])
  102. num_columns = self.removed_level.size
  103. # GH20601: This forces an overflow if the number of cells is too high.
  104. num_cells = num_rows * num_columns
  105. # GH 26314: Previous ValueError raised was too restrictive for many users.
  106. if num_cells > np.iinfo(np.int32).max:
  107. warnings.warn(
  108. f"The following operation may generate {num_cells} cells "
  109. f"in the resulting pandas object.",
  110. PerformanceWarning,
  111. stacklevel=find_stack_level(),
  112. )
  113. self._make_selectors()
  114. @cache_readonly
  115. def _indexer_and_to_sort(
  116. self,
  117. ) -> tuple[
  118. npt.NDArray[np.intp],
  119. list[np.ndarray], # each has _some_ signed integer dtype
  120. ]:
  121. v = self.level
  122. codes = list(self.index.codes)
  123. levs = list(self.index.levels)
  124. to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
  125. sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])
  126. comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
  127. ngroups = len(obs_ids)
  128. indexer = get_group_index_sorter(comp_index, ngroups)
  129. return indexer, to_sort
  130. @cache_readonly
  131. def sorted_labels(self) -> list[np.ndarray]:
  132. indexer, to_sort = self._indexer_and_to_sort
  133. return [line.take(indexer) for line in to_sort]
  134. def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
  135. indexer, _ = self._indexer_and_to_sort
  136. sorted_values = algos.take_nd(values, indexer, axis=0)
  137. return sorted_values
  138. def _make_selectors(self):
  139. new_levels = self.new_index_levels
  140. # make the mask
  141. remaining_labels = self.sorted_labels[:-1]
  142. level_sizes = tuple(len(x) for x in new_levels)
  143. comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
  144. ngroups = len(obs_ids)
  145. comp_index = ensure_platform_int(comp_index)
  146. stride = self.index.levshape[self.level] + self.lift
  147. self.full_shape = ngroups, stride
  148. selector = self.sorted_labels[-1] + stride * comp_index + self.lift
  149. mask = np.zeros(np.prod(self.full_shape), dtype=bool)
  150. mask.put(selector, True)
  151. if mask.sum() < len(self.index):
  152. raise ValueError("Index contains duplicate entries, cannot reshape")
  153. self.group_index = comp_index
  154. self.mask = mask
  155. self.compressor = comp_index.searchsorted(np.arange(ngroups))
  156. @cache_readonly
  157. def mask_all(self) -> bool:
  158. return bool(self.mask.all())
  159. @cache_readonly
  160. def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
  161. # We cache this for re-use in ExtensionBlock._unstack
  162. dummy_arr = np.arange(len(self.index), dtype=np.intp)
  163. new_values, mask = self.get_new_values(dummy_arr, fill_value=-1)
  164. return new_values, mask.any(0)
  165. # TODO: in all tests we have mask.any(0).all(); can we rely on that?
  166. def get_result(self, values, value_columns, fill_value) -> DataFrame:
  167. if values.ndim == 1:
  168. values = values[:, np.newaxis]
  169. if value_columns is None and values.shape[1] != 1: # pragma: no cover
  170. raise ValueError("must pass column labels for multi-column data")
  171. values, _ = self.get_new_values(values, fill_value)
  172. columns = self.get_new_columns(value_columns)
  173. index = self.new_index
  174. return self.constructor(
  175. values, index=index, columns=columns, dtype=values.dtype
  176. )
  177. def get_new_values(self, values, fill_value=None):
  178. if values.ndim == 1:
  179. values = values[:, np.newaxis]
  180. sorted_values = self._make_sorted_values(values)
  181. # place the values
  182. length, width = self.full_shape
  183. stride = values.shape[1]
  184. result_width = width * stride
  185. result_shape = (length, result_width)
  186. mask = self.mask
  187. mask_all = self.mask_all
  188. # we can simply reshape if we don't have a mask
  189. if mask_all and len(values):
  190. # TODO: Under what circumstances can we rely on sorted_values
  191. # matching values? When that holds, we can slice instead
  192. # of take (in particular for EAs)
  193. new_values = (
  194. sorted_values.reshape(length, width, stride)
  195. .swapaxes(1, 2)
  196. .reshape(result_shape)
  197. )
  198. new_mask = np.ones(result_shape, dtype=bool)
  199. return new_values, new_mask
  200. dtype = values.dtype
  201. # if our mask is all True, then we can use our existing dtype
  202. if mask_all:
  203. dtype = values.dtype
  204. new_values = np.empty(result_shape, dtype=dtype)
  205. else:
  206. if isinstance(dtype, ExtensionDtype):
  207. # GH#41875
  208. # We are assuming that fill_value can be held by this dtype,
  209. # unlike the non-EA case that promotes.
  210. cls = dtype.construct_array_type()
  211. new_values = cls._empty(result_shape, dtype=dtype)
  212. new_values[:] = fill_value
  213. else:
  214. dtype, fill_value = maybe_promote(dtype, fill_value)
  215. new_values = np.empty(result_shape, dtype=dtype)
  216. new_values.fill(fill_value)
  217. name = dtype.name
  218. new_mask = np.zeros(result_shape, dtype=bool)
  219. # we need to convert to a basic dtype
  220. # and possibly coerce an input to our output dtype
  221. # e.g. ints -> floats
  222. if needs_i8_conversion(values.dtype):
  223. sorted_values = sorted_values.view("i8")
  224. new_values = new_values.view("i8")
  225. else:
  226. sorted_values = sorted_values.astype(name, copy=False)
  227. # fill in our values & mask
  228. libreshape.unstack(
  229. sorted_values,
  230. mask.view("u1"),
  231. stride,
  232. length,
  233. width,
  234. new_values,
  235. new_mask.view("u1"),
  236. )
  237. # reconstruct dtype if needed
  238. if needs_i8_conversion(values.dtype):
  239. # view as datetime64 so we can wrap in DatetimeArray and use
  240. # DTA's view method
  241. new_values = new_values.view("M8[ns]")
  242. new_values = ensure_wrapped_if_datetimelike(new_values)
  243. new_values = new_values.view(values.dtype)
  244. return new_values, new_mask
  245. def get_new_columns(self, value_columns: Index | None):
  246. if value_columns is None:
  247. if self.lift == 0:
  248. return self.removed_level._rename(name=self.removed_name)
  249. lev = self.removed_level.insert(0, item=self.removed_level._na_value)
  250. return lev.rename(self.removed_name)
  251. stride = len(self.removed_level) + self.lift
  252. width = len(value_columns)
  253. propagator = np.repeat(np.arange(width), stride)
  254. new_levels: FrozenList | list[Index]
  255. if isinstance(value_columns, MultiIndex):
  256. # error: Cannot determine type of "__add__" [has-type]
  257. new_levels = value_columns.levels + ( # type: ignore[has-type]
  258. self.removed_level_full,
  259. )
  260. new_names = value_columns.names + (self.removed_name,)
  261. new_codes = [lab.take(propagator) for lab in value_columns.codes]
  262. else:
  263. new_levels = [
  264. value_columns,
  265. self.removed_level_full,
  266. ]
  267. new_names = [value_columns.name, self.removed_name]
  268. new_codes = [propagator]
  269. repeater = self._repeater
  270. # The entire level is then just a repetition of the single chunk:
  271. new_codes.append(np.tile(repeater, width))
  272. return MultiIndex(
  273. levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
  274. )
  275. @cache_readonly
  276. def _repeater(self) -> np.ndarray:
  277. # The two indices differ only if the unstacked level had unused items:
  278. if len(self.removed_level_full) != len(self.removed_level):
  279. # In this case, we remap the new codes to the original level:
  280. repeater = self.removed_level_full.get_indexer(self.removed_level)
  281. if self.lift:
  282. repeater = np.insert(repeater, 0, -1)
  283. else:
  284. # Otherwise, we just use each level item exactly once:
  285. stride = len(self.removed_level) + self.lift
  286. repeater = np.arange(stride) - self.lift
  287. return repeater
  288. @cache_readonly
  289. def new_index(self) -> MultiIndex:
  290. # Does not depend on values or value_columns
  291. result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]
  292. # construct the new index
  293. if len(self.new_index_levels) == 1:
  294. level, level_codes = self.new_index_levels[0], result_codes[0]
  295. if (level_codes == -1).any():
  296. level = level.insert(len(level), level._na_value)
  297. return level.take(level_codes).rename(self.new_index_names[0])
  298. return MultiIndex(
  299. levels=self.new_index_levels,
  300. codes=result_codes,
  301. names=self.new_index_names,
  302. verify_integrity=False,
  303. )
  304. def _unstack_multiple(data, clocs, fill_value=None):
  305. if len(clocs) == 0:
  306. return data
  307. # NOTE: This doesn't deal with hierarchical columns yet
  308. index = data.index
  309. # GH 19966 Make sure if MultiIndexed index has tuple name, they will be
  310. # recognised as a whole
  311. if clocs in index.names:
  312. clocs = [clocs]
  313. clocs = [index._get_level_number(i) for i in clocs]
  314. rlocs = [i for i in range(index.nlevels) if i not in clocs]
  315. clevels = [index.levels[i] for i in clocs]
  316. ccodes = [index.codes[i] for i in clocs]
  317. cnames = [index.names[i] for i in clocs]
  318. rlevels = [index.levels[i] for i in rlocs]
  319. rcodes = [index.codes[i] for i in rlocs]
  320. rnames = [index.names[i] for i in rlocs]
  321. shape = tuple(len(x) for x in clevels)
  322. group_index = get_group_index(ccodes, shape, sort=False, xnull=False)
  323. comp_ids, obs_ids = compress_group_index(group_index, sort=False)
  324. recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)
  325. if not rlocs:
  326. # Everything is in clocs, so the dummy df has a regular index
  327. dummy_index = Index(obs_ids, name="__placeholder__")
  328. else:
  329. dummy_index = MultiIndex(
  330. levels=rlevels + [obs_ids],
  331. codes=rcodes + [comp_ids],
  332. names=rnames + ["__placeholder__"],
  333. verify_integrity=False,
  334. )
  335. if isinstance(data, Series):
  336. dummy = data.copy()
  337. dummy.index = dummy_index
  338. unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
  339. new_levels = clevels
  340. new_names = cnames
  341. new_codes = recons_codes
  342. else:
  343. if isinstance(data.columns, MultiIndex):
  344. result = data
  345. while clocs:
  346. val = clocs.pop(0)
  347. result = result.unstack(val, fill_value=fill_value)
  348. clocs = [v if v < val else v - 1 for v in clocs]
  349. return result
  350. # GH#42579 deep=False to avoid consolidating
  351. dummy = data.copy(deep=False)
  352. dummy.index = dummy_index
  353. unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
  354. if isinstance(unstacked, Series):
  355. unstcols = unstacked.index
  356. else:
  357. unstcols = unstacked.columns
  358. assert isinstance(unstcols, MultiIndex) # for mypy
  359. new_levels = [unstcols.levels[0]] + clevels
  360. new_names = [data.columns.name] + cnames
  361. new_codes = [unstcols.codes[0]]
  362. for rec in recons_codes:
  363. new_codes.append(rec.take(unstcols.codes[-1]))
  364. new_columns = MultiIndex(
  365. levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
  366. )
  367. if isinstance(unstacked, Series):
  368. unstacked.index = new_columns
  369. else:
  370. unstacked.columns = new_columns
  371. return unstacked
  372. def unstack(obj: Series | DataFrame, level, fill_value=None):
  373. if isinstance(level, (tuple, list)):
  374. if len(level) != 1:
  375. # _unstack_multiple only handles MultiIndexes,
  376. # and isn't needed for a single level
  377. return _unstack_multiple(obj, level, fill_value=fill_value)
  378. else:
  379. level = level[0]
  380. if not is_integer(level) and not level == "__placeholder__":
  381. # check if level is valid in case of regular index
  382. obj.index._get_level_number(level)
  383. if isinstance(obj, DataFrame):
  384. if isinstance(obj.index, MultiIndex):
  385. return _unstack_frame(obj, level, fill_value=fill_value)
  386. else:
  387. return obj.T.stack(dropna=False)
  388. elif not isinstance(obj.index, MultiIndex):
  389. # GH 36113
  390. # Give nicer error messages when unstack a Series whose
  391. # Index is not a MultiIndex.
  392. raise ValueError(
  393. f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
  394. )
  395. else:
  396. if is_1d_only_ea_dtype(obj.dtype):
  397. return _unstack_extension_series(obj, level, fill_value)
  398. unstacker = _Unstacker(
  399. obj.index, level=level, constructor=obj._constructor_expanddim
  400. )
  401. return unstacker.get_result(
  402. obj._values, value_columns=None, fill_value=fill_value
  403. )
  404. def _unstack_frame(obj: DataFrame, level, fill_value=None):
  405. assert isinstance(obj.index, MultiIndex) # checked by caller
  406. unstacker = _Unstacker(obj.index, level=level, constructor=obj._constructor)
  407. if not obj._can_fast_transpose:
  408. mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
  409. return obj._constructor(mgr)
  410. else:
  411. return unstacker.get_result(
  412. obj._values, value_columns=obj.columns, fill_value=fill_value
  413. )
  414. def _unstack_extension_series(series: Series, level, fill_value) -> DataFrame:
  415. """
  416. Unstack an ExtensionArray-backed Series.
  417. The ExtensionDtype is preserved.
  418. Parameters
  419. ----------
  420. series : Series
  421. A Series with an ExtensionArray for values
  422. level : Any
  423. The level name or number.
  424. fill_value : Any
  425. The user-level (not physical storage) fill value to use for
  426. missing values introduced by the reshape. Passed to
  427. ``series.values.take``.
  428. Returns
  429. -------
  430. DataFrame
  431. Each column of the DataFrame will have the same dtype as
  432. the input Series.
  433. """
  434. # Defer to the logic in ExtensionBlock._unstack
  435. df = series.to_frame()
  436. result = df.unstack(level=level, fill_value=fill_value)
  437. # equiv: result.droplevel(level=0, axis=1)
  438. # but this avoids an extra copy
  439. result.columns = result.columns.droplevel(0)
  440. return result
  441. def stack(frame: DataFrame, level=-1, dropna: bool = True):
  442. """
  443. Convert DataFrame to Series with multi-level Index. Columns become the
  444. second level of the resulting hierarchical index
  445. Returns
  446. -------
  447. stacked : Series or DataFrame
  448. """
  449. def factorize(index):
  450. if index.is_unique:
  451. return index, np.arange(len(index))
  452. codes, categories = factorize_from_iterable(index)
  453. return categories, codes
  454. N, K = frame.shape
  455. # Will also convert negative level numbers and check if out of bounds.
  456. level_num = frame.columns._get_level_number(level)
  457. if isinstance(frame.columns, MultiIndex):
  458. return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
  459. elif isinstance(frame.index, MultiIndex):
  460. new_levels = list(frame.index.levels)
  461. new_codes = [lab.repeat(K) for lab in frame.index.codes]
  462. clev, clab = factorize(frame.columns)
  463. new_levels.append(clev)
  464. new_codes.append(np.tile(clab, N).ravel())
  465. new_names = list(frame.index.names)
  466. new_names.append(frame.columns.name)
  467. new_index = MultiIndex(
  468. levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
  469. )
  470. else:
  471. levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns)))
  472. codes = ilab.repeat(K), np.tile(clab, N).ravel()
  473. new_index = MultiIndex(
  474. levels=levels,
  475. codes=codes,
  476. names=[frame.index.name, frame.columns.name],
  477. verify_integrity=False,
  478. )
  479. if not frame.empty and frame._is_homogeneous_type:
  480. # For homogeneous EAs, frame._values will coerce to object. So
  481. # we concatenate instead.
  482. dtypes = list(frame.dtypes._values)
  483. dtype = dtypes[0]
  484. if is_extension_array_dtype(dtype):
  485. arr = dtype.construct_array_type()
  486. new_values = arr._concat_same_type(
  487. [col._values for _, col in frame.items()]
  488. )
  489. new_values = _reorder_for_extension_array_stack(new_values, N, K)
  490. else:
  491. # homogeneous, non-EA
  492. new_values = frame._values.ravel()
  493. else:
  494. # non-homogeneous
  495. new_values = frame._values.ravel()
  496. if dropna:
  497. mask = notna(new_values)
  498. new_values = new_values[mask]
  499. new_index = new_index[mask]
  500. return frame._constructor_sliced(new_values, index=new_index)
  501. def stack_multiple(frame, level, dropna: bool = True):
  502. # If all passed levels match up to column names, no
  503. # ambiguity about what to do
  504. if all(lev in frame.columns.names for lev in level):
  505. result = frame
  506. for lev in level:
  507. result = stack(result, lev, dropna=dropna)
  508. # Otherwise, level numbers may change as each successive level is stacked
  509. elif all(isinstance(lev, int) for lev in level):
  510. # As each stack is done, the level numbers decrease, so we need
  511. # to account for that when level is a sequence of ints
  512. result = frame
  513. # _get_level_number() checks level numbers are in range and converts
  514. # negative numbers to positive
  515. level = [frame.columns._get_level_number(lev) for lev in level]
  516. while level:
  517. lev = level.pop(0)
  518. result = stack(result, lev, dropna=dropna)
  519. # Decrement all level numbers greater than current, as these
  520. # have now shifted down by one
  521. level = [v if v <= lev else v - 1 for v in level]
  522. else:
  523. raise ValueError(
  524. "level should contain all level names or all level "
  525. "numbers, not a mixture of the two."
  526. )
  527. return result
  528. def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
  529. """Creates a MultiIndex from the first N-1 levels of this MultiIndex."""
  530. if len(columns.levels) <= 2:
  531. return columns.levels[0]._rename(name=columns.names[0])
  532. levs = [
  533. [lev[c] if c >= 0 else None for c in codes]
  534. for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
  535. ]
  536. # Remove duplicate tuples in the MultiIndex.
  537. tuples = zip(*levs)
  538. unique_tuples = (key for key, _ in itertools.groupby(tuples))
  539. new_levs = zip(*unique_tuples)
  540. # The dtype of each level must be explicitly set to avoid inferring the wrong type.
  541. # See GH-36991.
  542. return MultiIndex.from_arrays(
  543. [
  544. # Not all indices can accept None values.
  545. Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev
  546. for new_lev, lev in zip(new_levs, columns.levels)
  547. ],
  548. names=columns.names[:-1],
  549. )
  550. def _stack_multi_columns(
  551. frame: DataFrame, level_num: int = -1, dropna: bool = True
  552. ) -> DataFrame:
  553. def _convert_level_number(level_num: int, columns: Index):
  554. """
  555. Logic for converting the level number to something we can safely pass
  556. to swaplevel.
  557. If `level_num` matches a column name return the name from
  558. position `level_num`, otherwise return `level_num`.
  559. """
  560. if level_num in columns.names:
  561. return columns.names[level_num]
  562. return level_num
  563. this = frame.copy(deep=False)
  564. mi_cols = this.columns # cast(MultiIndex, this.columns)
  565. assert isinstance(mi_cols, MultiIndex) # caller is responsible
  566. # this makes life much simpler
  567. if level_num != mi_cols.nlevels - 1:
  568. # roll levels to put selected level at end
  569. roll_columns = mi_cols
  570. for i in range(level_num, mi_cols.nlevels - 1):
  571. # Need to check if the ints conflict with level names
  572. lev1 = _convert_level_number(i, roll_columns)
  573. lev2 = _convert_level_number(i + 1, roll_columns)
  574. roll_columns = roll_columns.swaplevel(lev1, lev2)
  575. this.columns = mi_cols = roll_columns
  576. if not mi_cols._is_lexsorted():
  577. # Workaround the edge case where 0 is one of the column names,
  578. # which interferes with trying to sort based on the first
  579. # level
  580. level_to_sort = _convert_level_number(0, mi_cols)
  581. this = this.sort_index(level=level_to_sort, axis=1)
  582. mi_cols = this.columns
  583. mi_cols = cast(MultiIndex, mi_cols)
  584. new_columns = _stack_multi_column_index(mi_cols)
  585. # time to ravel the values
  586. new_data = {}
  587. level_vals = mi_cols.levels[-1]
  588. level_codes = sorted(set(mi_cols.codes[-1]))
  589. level_vals_nan = level_vals.insert(len(level_vals), None)
  590. level_vals_used = np.take(level_vals_nan, level_codes)
  591. levsize = len(level_codes)
  592. drop_cols = []
  593. for key in new_columns:
  594. try:
  595. loc = this.columns.get_loc(key)
  596. except KeyError:
  597. drop_cols.append(key)
  598. continue
  599. # can make more efficient?
  600. # we almost always return a slice
  601. # but if unsorted can get a boolean
  602. # indexer
  603. if not isinstance(loc, slice):
  604. slice_len = len(loc)
  605. else:
  606. slice_len = loc.stop - loc.start
  607. if slice_len != levsize:
  608. chunk = this.loc[:, this.columns[loc]]
  609. chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
  610. value_slice = chunk.reindex(columns=level_vals_used).values
  611. else:
  612. if frame._is_homogeneous_type and is_extension_array_dtype(
  613. frame.dtypes.iloc[0]
  614. ):
  615. # TODO(EA2D): won't need special case, can go through .values
  616. # paths below (might change to ._values)
  617. dtype = this[this.columns[loc]].dtypes.iloc[0]
  618. subset = this[this.columns[loc]]
  619. value_slice = dtype.construct_array_type()._concat_same_type(
  620. [x._values for _, x in subset.items()]
  621. )
  622. N, K = subset.shape
  623. idx = np.arange(N * K).reshape(K, N).T.ravel()
  624. value_slice = value_slice.take(idx)
  625. elif frame._is_mixed_type:
  626. value_slice = this[this.columns[loc]].values
  627. else:
  628. value_slice = this.values[:, loc]
  629. if value_slice.ndim > 1:
  630. # i.e. not extension
  631. value_slice = value_slice.ravel()
  632. new_data[key] = value_slice
  633. if len(drop_cols) > 0:
  634. new_columns = new_columns.difference(drop_cols)
  635. N = len(this)
  636. if isinstance(this.index, MultiIndex):
  637. new_levels = list(this.index.levels)
  638. new_names = list(this.index.names)
  639. new_codes = [lab.repeat(levsize) for lab in this.index.codes]
  640. else:
  641. old_codes, old_levels = factorize_from_iterable(this.index)
  642. new_levels = [old_levels]
  643. new_codes = [old_codes.repeat(levsize)]
  644. new_names = [this.index.name] # something better?
  645. new_levels.append(level_vals)
  646. new_codes.append(np.tile(level_codes, N))
  647. new_names.append(frame.columns.names[level_num])
  648. new_index = MultiIndex(
  649. levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
  650. )
  651. result = frame._constructor(new_data, index=new_index, columns=new_columns)
  652. # more efficient way to go about this? can do the whole masking biz but
  653. # will only save a small amount of time...
  654. if dropna:
  655. result = result.dropna(axis=0, how="all")
  656. return result
  657. def _reorder_for_extension_array_stack(
  658. arr: ExtensionArray, n_rows: int, n_columns: int
  659. ) -> ExtensionArray:
  660. """
  661. Re-orders the values when stacking multiple extension-arrays.
  662. The indirect stacking method used for EAs requires a followup
  663. take to get the order correct.
  664. Parameters
  665. ----------
  666. arr : ExtensionArray
  667. n_rows, n_columns : int
  668. The number of rows and columns in the original DataFrame.
  669. Returns
  670. -------
  671. taken : ExtensionArray
  672. The original `arr` with elements re-ordered appropriately
  673. Examples
  674. --------
  675. >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
  676. >>> _reorder_for_extension_array_stack(arr, 2, 3)
  677. array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')
  678. >>> _reorder_for_extension_array_stack(arr, 3, 2)
  679. array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
  680. """
  681. # final take to get the order correct.
  682. # idx is an indexer like
  683. # [c0r0, c1r0, c2r0, ...,
  684. # c0r1, c1r1, c2r1, ...]
  685. idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
  686. return arr.take(idx)