from __future__ import annotations

import itertools
from typing import (
    TYPE_CHECKING,
    cast,
)
import warnings

import numpy as np

import pandas._libs.reshape as libreshape
from pandas._typing import npt
from pandas.errors import PerformanceWarning
from pandas.util._decorators import cache_readonly
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import maybe_promote
from pandas.core.dtypes.common import (
    ensure_platform_int,
    is_1d_only_ea_dtype,
    is_extension_array_dtype,
    is_integer,
    needs_i8_conversion,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import notna

import pandas.core.algorithms as algos
from pandas.core.arrays.categorical import factorize_from_iterable
from pandas.core.construction import ensure_wrapped_if_datetimelike
from pandas.core.frame import DataFrame
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
)
from pandas.core.series import Series
from pandas.core.sorting import (
    compress_group_index,
    decons_obs_group_ids,
    get_compressed_ids,
    get_group_index,
    get_group_index_sorter,
)

if TYPE_CHECKING:
    from pandas.core.arrays import ExtensionArray
    from pandas.core.indexes.frozen import FrozenList


class _Unstacker:
    """
    Helper class to unstack data / pivot with multi-level index.

    Parameters
    ----------
    index : MultiIndex
    level : int or str, default last level
        Level to "unstack". Accepts a name for the level.
    fill_value : scalar, optional
        Default value to fill in missing values if subgroups do not have the
        same set of labels. By default, missing values will be replaced with
        the default fill value for that data type, NaN for float, NaT for
        datetimelike, etc. For integer types, by default data will be
        converted to float and missing values will be set to NaN.
    constructor : object
        Pandas ``DataFrame`` or subclass used to create unstacked
        response. If None, DataFrame will be used.

    Examples
    --------
    >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
    ...                                    ('two', 'a'), ('two', 'b')])
    >>> s = pd.Series(np.arange(1, 5, dtype=np.int64), index=index)
    >>> s
    one  a    1
         b    2
    two  a    3
         b    4
    dtype: int64

    >>> s.unstack(level=-1)
         a  b
    one  1  2
    two  3  4

    >>> s.unstack(level=0)
       one  two
    a    1    3
    b    2    4

    Returns
    -------
    unstacked : DataFrame
    """

    def __init__(self, index: MultiIndex, level=-1, constructor=None) -> None:
        if constructor is None:
            constructor = DataFrame
        self.constructor = constructor

        self.index = index.remove_unused_levels()

        self.level = self.index._get_level_number(level)

        # when index includes `nan`, need to lift levels/strides by 1
        self.lift = 1 if -1 in self.index.codes[self.level] else 0

        # Note: the "pop" below alters these in-place.
        self.new_index_levels = list(self.index.levels)
        self.new_index_names = list(self.index.names)

        self.removed_name = self.new_index_names.pop(self.level)
        self.removed_level = self.new_index_levels.pop(self.level)
        self.removed_level_full = index.levels[self.level]

        # Bug fix GH 20601
        # If the data frame is too big, the number of unique index combinations
        # will cause int32 overflow on Windows environments.
        # We want to check and raise an error before this happens
        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
        num_columns = self.removed_level.size

        # GH20601: This forces an overflow if the number of cells is too high.
        num_cells = num_rows * num_columns

        # GH 26314: Previous ValueError raised was too restrictive for many users.
        if num_cells > np.iinfo(np.int32).max:
            warnings.warn(
                f"The following operation may generate {num_cells} cells "
                f"in the resulting pandas object.",
                PerformanceWarning,
                stacklevel=find_stack_level(),
            )
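
        # For scale: np.iinfo(np.int32).max == 2_147_483_647, so an unstack
        # producing e.g. 50_000 rows x 50_000 columns (2.5e9 cells) crosses
        # this threshold and emits the PerformanceWarning.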

        self._make_selectors()

    @cache_readonly
    def _indexer_and_to_sort(
        self,
    ) -> tuple[
        npt.NDArray[np.intp],
        list[np.ndarray],  # each has _some_ signed integer dtype
    ]:
        v = self.level

        codes = list(self.index.codes)
        levs = list(self.index.levels)
        to_sort = codes[:v] + codes[v + 1 :] + [codes[v]]
        sizes = tuple(len(x) for x in levs[:v] + levs[v + 1 :] + [levs[v]])

        comp_index, obs_ids = get_compressed_ids(to_sort, sizes)
        ngroups = len(obs_ids)

        indexer = get_group_index_sorter(comp_index, ngroups)
        return indexer, to_sort

    @cache_readonly
    def sorted_labels(self) -> list[np.ndarray]:
        indexer, to_sort = self._indexer_and_to_sort
        return [line.take(indexer) for line in to_sort]

    def _make_sorted_values(self, values: np.ndarray) -> np.ndarray:
        indexer, _ = self._indexer_and_to_sort

        sorted_values = algos.take_nd(values, indexer, axis=0)
        return sorted_values

    def _make_selectors(self):
        new_levels = self.new_index_levels

        # make the mask
        remaining_labels = self.sorted_labels[:-1]
        level_sizes = tuple(len(x) for x in new_levels)

        comp_index, obs_ids = get_compressed_ids(remaining_labels, level_sizes)
        ngroups = len(obs_ids)

        comp_index = ensure_platform_int(comp_index)
        stride = self.index.levshape[self.level] + self.lift
        self.full_shape = ngroups, stride

        selector = self.sorted_labels[-1] + stride * comp_index + self.lift
        mask = np.zeros(np.prod(self.full_shape), dtype=bool)
        mask.put(selector, True)

        if mask.sum() < len(self.index):
            raise ValueError("Index contains duplicate entries, cannot reshape")

        self.group_index = comp_index
        self.mask = mask
        self.compressor = comp_index.searchsorted(np.arange(ngroups))

    @cache_readonly
    def mask_all(self) -> bool:
        return bool(self.mask.all())

    @cache_readonly
    def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]:
        # We cache this for re-use in ExtensionBlock._unstack
        dummy_arr = np.arange(len(self.index), dtype=np.intp)
        new_values, mask = self.get_new_values(dummy_arr, fill_value=-1)
        return new_values, mask.any(0)
        # TODO: in all tests we have mask.any(0).all(); can we rely on that?

    def get_result(self, values, value_columns, fill_value) -> DataFrame:
        if values.ndim == 1:
            values = values[:, np.newaxis]

        if value_columns is None and values.shape[1] != 1:  # pragma: no cover
            raise ValueError("must pass column labels for multi-column data")

        values, _ = self.get_new_values(values, fill_value)
        columns = self.get_new_columns(value_columns)
        index = self.new_index

        return self.constructor(
            values, index=index, columns=columns, dtype=values.dtype
        )

    def get_new_values(self, values, fill_value=None):
        if values.ndim == 1:
            values = values[:, np.newaxis]

        sorted_values = self._make_sorted_values(values)

        # place the values
        length, width = self.full_shape
        stride = values.shape[1]
        result_width = width * stride
        result_shape = (length, result_width)
        mask = self.mask
        mask_all = self.mask_all

        # we can simply reshape if we don't have a mask
        if mask_all and len(values):
            # TODO: Under what circumstances can we rely on sorted_values
            #  matching values? When that holds, we can slice instead
            #  of take (in particular for EAs)
            new_values = (
                sorted_values.reshape(length, width, stride)
                .swapaxes(1, 2)
                .reshape(result_shape)
            )
            new_mask = np.ones(result_shape, dtype=bool)
            return new_values, new_mask

        dtype = values.dtype

        # if our mask is all True, then we can use our existing dtype
        if mask_all:
            dtype = values.dtype
            new_values = np.empty(result_shape, dtype=dtype)
        else:
            if isinstance(dtype, ExtensionDtype):
                # GH#41875
                # We are assuming that fill_value can be held by this dtype,
                # unlike the non-EA case that promotes.
                cls = dtype.construct_array_type()
                new_values = cls._empty(result_shape, dtype=dtype)
                new_values[:] = fill_value
            else:
                dtype, fill_value = maybe_promote(dtype, fill_value)
                new_values = np.empty(result_shape, dtype=dtype)
                new_values.fill(fill_value)

        name = dtype.name
        new_mask = np.zeros(result_shape, dtype=bool)

        # we need to convert to a basic dtype
        # and possibly coerce an input to our output dtype
        # e.g. ints -> floats
        if needs_i8_conversion(values.dtype):
            sorted_values = sorted_values.view("i8")
            new_values = new_values.view("i8")
        else:
            sorted_values = sorted_values.astype(name, copy=False)

        # fill in our values & mask
        libreshape.unstack(
            sorted_values,
            mask.view("u1"),
            stride,
            length,
            width,
            new_values,
            new_mask.view("u1"),
        )

        # reconstruct dtype if needed
        if needs_i8_conversion(values.dtype):
            # view as datetime64 so we can wrap in DatetimeArray and use
            #  DTA's view method
            new_values = new_values.view("M8[ns]")
            new_values = ensure_wrapped_if_datetimelike(new_values)
            new_values = new_values.view(values.dtype)

        return new_values, new_mask

    def get_new_columns(self, value_columns: Index | None):
        if value_columns is None:
            if self.lift == 0:
                return self.removed_level._rename(name=self.removed_name)

            lev = self.removed_level.insert(0, item=self.removed_level._na_value)
            return lev.rename(self.removed_name)

        stride = len(self.removed_level) + self.lift
        width = len(value_columns)
        propagator = np.repeat(np.arange(width), stride)

        new_levels: FrozenList | list[Index]

        if isinstance(value_columns, MultiIndex):
            # error: Cannot determine type of "__add__"  [has-type]
            new_levels = value_columns.levels + (  # type: ignore[has-type]
                self.removed_level_full,
            )
            new_names = value_columns.names + (self.removed_name,)

            new_codes = [lab.take(propagator) for lab in value_columns.codes]
        else:
            new_levels = [
                value_columns,
                self.removed_level_full,
            ]
            new_names = [value_columns.name, self.removed_name]
            new_codes = [propagator]

        repeater = self._repeater

        # The entire level is then just a repetition of the single chunk:
        new_codes.append(np.tile(repeater, width))
        return MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )

    @cache_readonly
    def _repeater(self) -> np.ndarray:
        # The two indices differ only if the unstacked level had unused items:
        if len(self.removed_level_full) != len(self.removed_level):
            # In this case, we remap the new codes to the original level:
            repeater = self.removed_level_full.get_indexer(self.removed_level)
            if self.lift:
                repeater = np.insert(repeater, 0, -1)
        else:
            # Otherwise, we just use each level item exactly once:
            stride = len(self.removed_level) + self.lift
            repeater = np.arange(stride) - self.lift

        return repeater

    @cache_readonly
    def new_index(self) -> MultiIndex:
        # Does not depend on values or value_columns
        result_codes = [lab.take(self.compressor) for lab in self.sorted_labels[:-1]]

        # construct the new index
        if len(self.new_index_levels) == 1:
            level, level_codes = self.new_index_levels[0], result_codes[0]
            if (level_codes == -1).any():
                level = level.insert(len(level), level._na_value)
            return level.take(level_codes).rename(self.new_index_names[0])

        return MultiIndex(
            levels=self.new_index_levels,
            codes=result_codes,
            names=self.new_index_names,
            verify_integrity=False,
        )
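

# A minimal sketch of how this class is driven (the index below is made up for
# illustration; user code should go through Series.unstack / DataFrame.unstack
# rather than using _Unstacker directly):
#
# >>> idx = pd.MultiIndex.from_product([["one", "two"], ["a", "b"]])
# >>> unstacker = _Unstacker(idx, level=-1)
# >>> unstacker.full_shape  # (n_groups, stride) of the unstacked block
# (2, 2)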


def _unstack_multiple(data, clocs, fill_value=None):
    if len(clocs) == 0:
        return data

    # NOTE: This doesn't deal with hierarchical columns yet

    index = data.index

    # GH 19966 Make sure that if a MultiIndexed index has a tuple name, it is
    #  recognised as a whole
    if clocs in index.names:
        clocs = [clocs]
    clocs = [index._get_level_number(i) for i in clocs]

    rlocs = [i for i in range(index.nlevels) if i not in clocs]

    clevels = [index.levels[i] for i in clocs]
    ccodes = [index.codes[i] for i in clocs]
    cnames = [index.names[i] for i in clocs]
    rlevels = [index.levels[i] for i in rlocs]
    rcodes = [index.codes[i] for i in rlocs]
    rnames = [index.names[i] for i in rlocs]

    shape = tuple(len(x) for x in clevels)
    group_index = get_group_index(ccodes, shape, sort=False, xnull=False)

    comp_ids, obs_ids = compress_group_index(group_index, sort=False)
    recons_codes = decons_obs_group_ids(comp_ids, obs_ids, shape, ccodes, xnull=False)

    if not rlocs:
        # Everything is in clocs, so the dummy df has a regular index
        dummy_index = Index(obs_ids, name="__placeholder__")
    else:
        dummy_index = MultiIndex(
            levels=rlevels + [obs_ids],
            codes=rcodes + [comp_ids],
            names=rnames + ["__placeholder__"],
            verify_integrity=False,
        )

    if isinstance(data, Series):
        dummy = data.copy()
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        new_levels = clevels
        new_names = cnames
        new_codes = recons_codes
    else:
        if isinstance(data.columns, MultiIndex):
            result = data
            while clocs:
                val = clocs.pop(0)
                result = result.unstack(val, fill_value=fill_value)
                clocs = [v if v < val else v - 1 for v in clocs]

            return result

        # GH#42579 deep=False to avoid consolidating
        dummy = data.copy(deep=False)
        dummy.index = dummy_index

        unstacked = dummy.unstack("__placeholder__", fill_value=fill_value)
        if isinstance(unstacked, Series):
            unstcols = unstacked.index
        else:
            unstcols = unstacked.columns
        assert isinstance(unstcols, MultiIndex)  # for mypy
        new_levels = [unstcols.levels[0]] + clevels
        new_names = [data.columns.name] + cnames

        new_codes = [unstcols.codes[0]]
        for rec in recons_codes:
            new_codes.append(rec.take(unstcols.codes[-1]))

    new_columns = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    if isinstance(unstacked, Series):
        unstacked.index = new_columns
    else:
        unstacked.columns = new_columns

    return unstacked
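

# Sketch of the placeholder trick above, on made-up data: the levels to be
# unstacked are first compressed into a single "__placeholder__" level, a
# plain single-level unstack is performed, and the placeholder column level is
# then expanded back into the original levels.
#
# >>> idx = pd.MultiIndex.from_product(
# ...     [["a", "b"], [1, 2], ["x", "y"]], names=["l1", "l2", "l3"]
# ... )
# >>> s = pd.Series(range(8), index=idx)
# >>> _unstack_multiple(s, ["l2", "l3"]).shape  # rows: l1, columns: (l2, l3)
# (2, 4)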


def unstack(obj: Series | DataFrame, level, fill_value=None):
    if isinstance(level, (tuple, list)):
        if len(level) != 1:
            # _unstack_multiple only handles MultiIndexes,
            # and isn't needed for a single level
            return _unstack_multiple(obj, level, fill_value=fill_value)
        else:
            level = level[0]

    if not is_integer(level) and not level == "__placeholder__":
        # check if level is valid in case of regular index
        obj.index._get_level_number(level)

    if isinstance(obj, DataFrame):
        if isinstance(obj.index, MultiIndex):
            return _unstack_frame(obj, level, fill_value=fill_value)
        else:
            return obj.T.stack(dropna=False)
    elif not isinstance(obj.index, MultiIndex):
        # GH 36113
        # Give nicer error messages when unstacking a Series whose
        # Index is not a MultiIndex.
        raise ValueError(
            f"index must be a MultiIndex to unstack, {type(obj.index)} was passed"
        )
    else:
        if is_1d_only_ea_dtype(obj.dtype):
            return _unstack_extension_series(obj, level, fill_value)
        unstacker = _Unstacker(
            obj.index, level=level, constructor=obj._constructor_expanddim
        )
        return unstacker.get_result(
            obj._values, value_columns=None, fill_value=fill_value
        )
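

# How the dispatcher above routes by input type, illustrated on made-up data:
#
# >>> idx = pd.MultiIndex.from_product([["one", "two"], ["a", "b"]])
# >>> s = pd.Series([1, 2, 3, 4], index=idx)
# >>> unstack(s, -1).columns.tolist()  # Series + MultiIndex
# ['a', 'b']
# >>> df = pd.DataFrame({"x": [1, 2]}, index=["r1", "r2"])
# >>> unstack(df, -1)  # flat index: falls back to obj.T.stack
# x  r1    1
#    r2    2
# dtype: int64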


def _unstack_frame(obj: DataFrame, level, fill_value=None):
    assert isinstance(obj.index, MultiIndex)  # checked by caller
    unstacker = _Unstacker(obj.index, level=level, constructor=obj._constructor)

    if not obj._can_fast_transpose:
        mgr = obj._mgr.unstack(unstacker, fill_value=fill_value)
        return obj._constructor(mgr)
    else:
        return unstacker.get_result(
            obj._values, value_columns=obj.columns, fill_value=fill_value
        )


def _unstack_extension_series(series: Series, level, fill_value) -> DataFrame:
    """
    Unstack an ExtensionArray-backed Series.

    The ExtensionDtype is preserved.

    Parameters
    ----------
    series : Series
        A Series with an ExtensionArray for values
    level : Any
        The level name or number.
    fill_value : Any
        The user-level (not physical storage) fill value to use for
        missing values introduced by the reshape. Passed to
        ``series.values.take``.

    Returns
    -------
    DataFrame
        Each column of the DataFrame will have the same dtype as
        the input Series.
    """
    # Defer to the logic in ExtensionBlock._unstack
    df = series.to_frame()
    result = df.unstack(level=level, fill_value=fill_value)

    # equiv: result.droplevel(level=0, axis=1)
    #  but this avoids an extra copy
    result.columns = result.columns.droplevel(0)
    return result
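

# Dtype preservation in practice, on made-up data: unstacking a
# nullable-integer Series keeps Int64 columns and fills holes with <NA> rather
# than promoting to float64.
#
# >>> idx = pd.MultiIndex.from_tuples([("one", "a"), ("one", "b"), ("two", "a")])
# >>> s = pd.Series([1, 2, 3], index=idx, dtype="Int64")
# >>> _unstack_extension_series(s, -1, fill_value=None).dtypes.tolist()
# [Int64Dtype(), Int64Dtype()]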


def stack(frame: DataFrame, level=-1, dropna: bool = True):
    """
    Convert DataFrame to Series with multi-level Index. Columns become the
    second level of the resulting hierarchical index.

    Returns
    -------
    stacked : Series or DataFrame
    """

    def factorize(index):
        if index.is_unique:
            return index, np.arange(len(index))
        codes, categories = factorize_from_iterable(index)
        return categories, codes

    N, K = frame.shape

    # Will also convert negative level numbers and check if out of bounds.
    level_num = frame.columns._get_level_number(level)

    if isinstance(frame.columns, MultiIndex):
        return _stack_multi_columns(frame, level_num=level_num, dropna=dropna)
    elif isinstance(frame.index, MultiIndex):
        new_levels = list(frame.index.levels)
        new_codes = [lab.repeat(K) for lab in frame.index.codes]

        clev, clab = factorize(frame.columns)
        new_levels.append(clev)
        new_codes.append(np.tile(clab, N).ravel())

        new_names = list(frame.index.names)
        new_names.append(frame.columns.name)
        new_index = MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )
    else:
        levels, (ilab, clab) = zip(*map(factorize, (frame.index, frame.columns)))
        codes = ilab.repeat(K), np.tile(clab, N).ravel()
        new_index = MultiIndex(
            levels=levels,
            codes=codes,
            names=[frame.index.name, frame.columns.name],
            verify_integrity=False,
        )

    if not frame.empty and frame._is_homogeneous_type:
        # For homogeneous EAs, frame._values will coerce to object. So
        # we concatenate instead.
        dtypes = list(frame.dtypes._values)
        dtype = dtypes[0]

        if is_extension_array_dtype(dtype):
            arr = dtype.construct_array_type()
            new_values = arr._concat_same_type(
                [col._values for _, col in frame.items()]
            )
            new_values = _reorder_for_extension_array_stack(new_values, N, K)
        else:
            # homogeneous, non-EA
            new_values = frame._values.ravel()
    else:
        # non-homogeneous
        new_values = frame._values.ravel()

    if dropna:
        mask = notna(new_values)
        new_values = new_values[mask]
        new_index = new_index[mask]

    return frame._constructor_sliced(new_values, index=new_index)
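

# Shape of the transformation, on made-up data: an N x K frame becomes a
# Series of length N * K, with the old columns forming the innermost index
# level.
#
# >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]}, index=["x", "y"])
# >>> stack(df)
# x  a    1
#    b    3
# y  a    2
#    b    4
# dtype: int64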


def stack_multiple(frame, level, dropna: bool = True):
    # If all passed levels match up to column names, no
    # ambiguity about what to do
    if all(lev in frame.columns.names for lev in level):
        result = frame
        for lev in level:
            result = stack(result, lev, dropna=dropna)

    # Otherwise, level numbers may change as each successive level is stacked
    elif all(isinstance(lev, int) for lev in level):
        # As each stack is done, the level numbers decrease, so we need
        #  to account for that when level is a sequence of ints
        result = frame
        # _get_level_number() checks level numbers are in range and converts
        #  negative numbers to positive
        level = [frame.columns._get_level_number(lev) for lev in level]

        while level:
            lev = level.pop(0)
            result = stack(result, lev, dropna=dropna)
            # Decrement all level numbers greater than current, as these
            #  have now shifted down by one
            level = [v if v <= lev else v - 1 for v in level]

    else:
        raise ValueError(
            "level should contain all level names or all level "
            "numbers, not a mixture of the two."
        )

    return result
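

# The renumbering step above, traced on made-up input: stacking levels [0, 2]
# of a 3-level column index first stacks level 0; the remaining request [2] is
# then decremented to [1], because the old level 2 has shifted down one slot.
#
# >>> cols = pd.MultiIndex.from_product([["A"], ["B"], ["c", "d"]])
# >>> df = pd.DataFrame([[1, 2]], columns=cols)
# >>> stack_multiple(df, [0, 2]).shape
# (2, 1)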


def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:
    """Creates a MultiIndex from the first N-1 levels of this MultiIndex."""
    if len(columns.levels) <= 2:
        return columns.levels[0]._rename(name=columns.names[0])

    levs = [
        [lev[c] if c >= 0 else None for c in codes]
        for lev, codes in zip(columns.levels[:-1], columns.codes[:-1])
    ]

    # Remove duplicate tuples in the MultiIndex.
    tuples = zip(*levs)
    unique_tuples = (key for key, _ in itertools.groupby(tuples))
    new_levs = zip(*unique_tuples)

    # The dtype of each level must be explicitly set to avoid inferring the wrong type.
    #  See GH-36991.
    return MultiIndex.from_arrays(
        [
            # Not all indices can accept None values.
            Index(new_lev, dtype=lev.dtype) if None not in new_lev else new_lev
            for new_lev, lev in zip(new_levs, columns.levels)
        ],
        names=columns.names[:-1],
    )
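

# Effect on a made-up 3-level index: the last level is dropped and adjacent
# duplicate (level-0, level-1) tuples are collapsed. Note itertools.groupby
# only merges *adjacent* duplicates, which is why callers sort the columns
# first.
#
# >>> cols = pd.MultiIndex.from_tuples(
# ...     [("a", 1, "x"), ("a", 1, "y"), ("b", 2, "x")]
# ... )
# >>> _stack_multi_column_index(cols).tolist()
# [('a', 1), ('b', 2)]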


def _stack_multi_columns(
    frame: DataFrame, level_num: int = -1, dropna: bool = True
) -> DataFrame:
    def _convert_level_number(level_num: int, columns: Index):
        """
        Logic for converting the level number to something we can safely pass
        to swaplevel.

        If `level_num` matches a column name return the name from
        position `level_num`, otherwise return `level_num`.
        """
        if level_num in columns.names:
            return columns.names[level_num]

        return level_num

    this = frame.copy(deep=False)
    mi_cols = this.columns  # cast(MultiIndex, this.columns)
    assert isinstance(mi_cols, MultiIndex)  # caller is responsible

    # this makes life much simpler
    if level_num != mi_cols.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = mi_cols
        for i in range(level_num, mi_cols.nlevels - 1):
            # Need to check if the ints conflict with level names
            lev1 = _convert_level_number(i, roll_columns)
            lev2 = _convert_level_number(i + 1, roll_columns)
            roll_columns = roll_columns.swaplevel(lev1, lev2)
        this.columns = mi_cols = roll_columns

    if not mi_cols._is_lexsorted():
        # Workaround the edge case where 0 is one of the column names,
        # which interferes with trying to sort based on the first
        # level
        level_to_sort = _convert_level_number(0, mi_cols)
        this = this.sort_index(level=level_to_sort, axis=1)
        mi_cols = this.columns

    mi_cols = cast(MultiIndex, mi_cols)
    new_columns = _stack_multi_column_index(mi_cols)

    # time to ravel the values
    new_data = {}
    level_vals = mi_cols.levels[-1]
    level_codes = sorted(set(mi_cols.codes[-1]))
    level_vals_nan = level_vals.insert(len(level_vals), None)

    level_vals_used = np.take(level_vals_nan, level_codes)
    levsize = len(level_codes)
    drop_cols = []
    for key in new_columns:
        try:
            loc = this.columns.get_loc(key)
        except KeyError:
            drop_cols.append(key)
            continue

        # can make more efficient?
        # we almost always return a slice
        # but if unsorted can get a boolean indexer
        if not isinstance(loc, slice):
            slice_len = len(loc)
        else:
            slice_len = loc.stop - loc.start

        if slice_len != levsize:
            chunk = this.loc[:, this.columns[loc]]
            chunk.columns = level_vals_nan.take(chunk.columns.codes[-1])
            value_slice = chunk.reindex(columns=level_vals_used).values
        else:
            if frame._is_homogeneous_type and is_extension_array_dtype(
                frame.dtypes.iloc[0]
            ):
                # TODO(EA2D): won't need special case, can go through .values
                #  paths below (might change to ._values)
                dtype = this[this.columns[loc]].dtypes.iloc[0]
                subset = this[this.columns[loc]]

                value_slice = dtype.construct_array_type()._concat_same_type(
                    [x._values for _, x in subset.items()]
                )
                N, K = subset.shape
                idx = np.arange(N * K).reshape(K, N).T.ravel()
                value_slice = value_slice.take(idx)

            elif frame._is_mixed_type:
                value_slice = this[this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        if value_slice.ndim > 1:
            # i.e. not extension
            value_slice = value_slice.ravel()

        new_data[key] = value_slice

    if len(drop_cols) > 0:
        new_columns = new_columns.difference(drop_cols)

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_codes = [lab.repeat(levsize) for lab in this.index.codes]
    else:
        old_codes, old_levels = factorize_from_iterable(this.index)
        new_levels = [old_levels]
        new_codes = [old_codes.repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(level_vals)
    new_codes.append(np.tile(level_codes, N))
    new_names.append(frame.columns.names[level_num])

    new_index = MultiIndex(
        levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
    )

    result = frame._constructor(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking
    #  biz but will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how="all")

    return result


def _reorder_for_extension_array_stack(
    arr: ExtensionArray, n_rows: int, n_columns: int
) -> ExtensionArray:
    """
    Re-orders the values when stacking multiple extension-arrays.

    The indirect stacking method used for EAs requires a followup
    take to get the order correct.

    Parameters
    ----------
    arr : ExtensionArray
    n_rows, n_columns : int
        The number of rows and columns in the original DataFrame.

    Returns
    -------
    taken : ExtensionArray
        The original `arr` with elements re-ordered appropriately

    Examples
    --------
    >>> arr = np.array(['a', 'b', 'c', 'd', 'e', 'f'])
    >>> _reorder_for_extension_array_stack(arr, 2, 3)
    array(['a', 'c', 'e', 'b', 'd', 'f'], dtype='<U1')

    >>> _reorder_for_extension_array_stack(arr, 3, 2)
    array(['a', 'd', 'b', 'e', 'c', 'f'], dtype='<U1')
    """
    # final take to get the order correct.
    # idx is an indexer like
    # [c0r0, c1r0, c2r0, ...,
    #  c0r1, c1r1, c2r1, ...]
    idx = np.arange(n_rows * n_columns).reshape(n_columns, n_rows).T.ravel()
    return arr.take(idx)