123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533 |
- from __future__ import annotations
- from collections import defaultdict
- import itertools
- from typing import (
- Hashable,
- Iterable,
- )
- import numpy as np
- from pandas._libs.sparse import IntIndex
- from pandas._typing import NpDtype
- from pandas.core.dtypes.common import (
- is_integer_dtype,
- is_list_like,
- is_object_dtype,
- pandas_dtype,
- )
- from pandas.core.arrays import SparseArray
- from pandas.core.arrays.categorical import factorize_from_iterable
- from pandas.core.frame import DataFrame
- from pandas.core.indexes.api import (
- Index,
- default_index,
- )
- from pandas.core.series import Series
def get_dummies(
    data,
    prefix=None,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    columns=None,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Convert categorical variable into dummy/indicator variables.

    Every encoded variable is expanded into one indicator column per distinct
    value. Output columns are named after the value; for DataFrame input the
    name of the originating column (or the supplied `prefix`) is put in front
    of the value, joined by `prefix_sep`.

    Parameters
    ----------
    data : array-like, Series, or DataFrame
        Data of which to get dummy indicators.
    prefix : str, list of str, or dict of str, default None
        String placed in front of the generated column names.
        For a DataFrame, pass a list whose length equals the number of
        encoded columns, or a dictionary mapping column names to prefixes.
        When None and `data` is a DataFrame, the encoded column names are
        used as prefixes.
    prefix_sep : str, default '_'
        Separator/delimiter placed between prefix and value. A list or
        dictionary may be passed, as with `prefix`.
    dummy_na : bool, default False
        Add a column to indicate NaNs, if False NaNs are ignored.
    columns : list-like, default None
        Column names in the DataFrame to be encoded.
        If `columns` is None then all the columns with
        `object`, `string`, or `category` dtype will be converted.
    sparse : bool, default False
        Whether the dummy-encoded columns should be backed by
        a :class:`SparseArray` (True) or a regular NumPy array (False).
    drop_first : bool, default False
        Whether to get k-1 dummies out of k categorical levels by removing the
        first level.
    dtype : dtype, default bool
        Data type for new columns. Only a single dtype is allowed.

    Returns
    -------
    DataFrame
        Dummy-coded data. If `data` contains other columns than the
        dummy-coded one(s), these will be prepended, unaltered, to the result.

    Raises
    ------
    TypeError
        If `data` is a DataFrame and `columns` is given but not list-like.
    ValueError
        If `data` is a DataFrame and a list-like `prefix` or `prefix_sep`
        does not have one entry per encoded column.

    See Also
    --------
    Series.str.get_dummies : Convert Series of strings to dummy codes.
    :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.

    Notes
    -----
    Reference :ref:`the user guide <reshaping.dummies>` for more examples.

    Examples
    --------
    >>> s = pd.Series(list('abca'))

    >>> pd.get_dummies(s)
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False

    >>> s1 = ['a', 'b', np.nan]

    >>> pd.get_dummies(s1)
           a      b
    0   True  False
    1  False   True
    2  False  False

    >>> pd.get_dummies(s1, dummy_na=True)
           a      b    NaN
    0   True  False  False
    1  False   True  False
    2  False  False   True

    >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
    ...                    'C': [1, 2, 3]})

    >>> pd.get_dummies(df, prefix=['col1', 'col2'])
       C  col1_a  col1_b  col2_a  col2_b  col2_c
    0  1    True   False   False    True   False
    1  2   False    True    True   False   False
    2  3    True   False   False   False    True

    >>> pd.get_dummies(pd.Series(list('abcaa')))
           a      b      c
    0   True  False  False
    1  False   True  False
    2  False  False   True
    3   True  False  False
    4   True  False  False

    >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
           b      c
    0  False  False
    1   True  False
    2  False   True
    3  False  False
    4  False  False

    >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
         a    b    c
    0  1.0  0.0  0.0
    1  0.0  1.0  0.0
    2  0.0  0.0  1.0
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        # Anything that is not a DataFrame is encoded as a single variable.
        return _get_dummies_1d(
            data,
            prefix,
            prefix_sep,
            dummy_na,
            sparse=sparse,
            drop_first=drop_first,
            dtype=dtype,
        )

    encodable_dtypes = ["object", "string", "category"]

    # Work out which columns are going to be encoded.
    if columns is None:
        data_to_encode = data.select_dtypes(include=encodable_dtypes)
    elif is_list_like(columns):
        data_to_encode = data[columns]
    else:
        raise TypeError("Input must be a list-like for parameter `columns`")

    # A list-like prefix/separator must supply exactly one entry per encoded
    # column; otherwise the zip() below would silently drop columns.
    n_encoded = data_to_encode.shape[1]
    for arg_name, arg_value in (("prefix", prefix), ("prefix_sep", prefix_sep)):
        if is_list_like(arg_value) and len(arg_value) != n_encoded:
            raise ValueError(
                f"Length of '{arg_name}' ({len(arg_value)}) did not match the "
                "length of the columns being encoded "
                f"({n_encoded})."
            )

    # Normalise `prefix` into an iterable yielding one prefix per column.
    if prefix is None:
        prefix = data_to_encode.columns
    elif isinstance(prefix, str):
        prefix = itertools.cycle([prefix])
    elif isinstance(prefix, dict):
        prefix = [prefix[name] for name in data_to_encode.columns]

    # Same normalisation for the separators.
    if isinstance(prefix_sep, str):
        prefix_sep = itertools.cycle([prefix_sep])
    elif isinstance(prefix_sep, dict):
        prefix_sep = [prefix_sep[name] for name in data_to_encode.columns]

    pieces: list[DataFrame]
    if data_to_encode.shape == data.shape:
        # The whole frame is encoded, nothing is carried over untouched.
        pieces = []
    elif columns is None:
        # Only the object/string/category columns are encoded; the remaining
        # columns lead the result.
        pieces = [data.select_dtypes(exclude=encodable_dtypes)]
    else:
        # Only the requested columns are encoded; every other column leads
        # the result.
        pieces = [data.drop(columns, axis=1)]

    # items() yields (column_name, column); only the column data is encoded.
    for (_, column), pre, sep in zip(data_to_encode.items(), prefix, prefix_sep):
        pieces.append(
            _get_dummies_1d(
                column,
                prefix=pre,
                prefix_sep=sep,
                dummy_na=dummy_na,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            )
        )
    return concat(pieces, axis=1)
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Dummy-encode a single variable.

    Parameters
    ----------
    data : array-like or Series
        Values to encode. When a Series is passed its index is reused for
        the result.
    prefix : str or None
        When not None, every output column is named
        ``f"{prefix}{prefix_sep}{level}"``; otherwise the levels themselves
        are used as column labels.
    prefix_sep : str, default '_'
        Text interpolated between `prefix` and the level.
    dummy_na : bool, default False
        Append a NaN level so that missing values get an indicator column.
    sparse : bool, default False
        Build each column as a :class:`SparseArray` instead of a dense block.
    drop_first : bool, default False
        Drop the indicator column of the first level.
    dtype : dtype, default bool
        Data type of the indicator columns.

    Returns
    -------
    DataFrame
        One indicator column per retained level. A frame with no columns is
        returned when there are no levels and `dummy_na` is False, or when
        `drop_first` would remove the only level.

    Raises
    ------
    ValueError
        If `dtype` resolves to object dtype.
    """
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data, copy=False))

    if dtype is None:
        dtype = np.dtype(bool)
    _dtype = pandas_dtype(dtype)

    if is_object_dtype(_dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    # Row labels of the result: the Series index if there is one, else None
    # (which lets the DataFrame constructor pick a default).
    row_labels: Index | None = data.index if isinstance(data, Series) else None

    def _no_columns() -> DataFrame:
        # Frame with the right rows but no indicator columns.
        if row_labels is None:
            return DataFrame(index=default_index(len(data)))
        return DataFrame(index=row_labels)

    # if all NaN
    if len(levels) == 0 and not dummy_na:
        return _no_columns()

    codes = codes.copy()
    if dummy_na:
        # Missing values (code -1) are redirected to an extra trailing level.
        codes[codes == -1] = len(levels)
        levels = levels.insert(len(levels), np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return _no_columns()

    number_of_cols = len(levels)

    dummy_cols = (
        levels
        if prefix is None
        else Index([f"{prefix}{prefix_sep}{level}" for level in levels])
    )

    if not sparse:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        eye_dtype: NpDtype = _dtype if isinstance(_dtype, np.dtype) else np.bool_
        dummy_mat = np.eye(number_of_cols, dtype=eye_dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]
        return DataFrame(dummy_mat, index=row_labels, columns=dummy_cols, dtype=_dtype)

    # ---- sparse output ----
    fill_value: bool | float
    if is_integer_dtype(dtype):
        fill_value = 0
    elif dtype == np.dtype(bool):
        fill_value = False
    else:
        fill_value = 0.0

    n_rows = len(data)

    # For every level, collect the row positions at which it occurs.
    positions_per_level: list[list] = [[] for _ in range(len(dummy_cols))]
    is_valid = codes != -1
    codes = codes[is_valid]
    valid_rows = np.arange(n_rows)[is_valid]
    for row, code in zip(valid_rows, codes):
        positions_per_level[code].append(row)

    if drop_first:
        # remove first categorical level to avoid perfect collinearity
        # GH12042
        positions_per_level = positions_per_level[1:]
        dummy_cols = dummy_cols[1:]

    sparse_columns = [
        Series(
            data=SparseArray(
                np.ones(len(positions), dtype=dtype),
                sparse_index=IntIndex(n_rows, positions),
                fill_value=fill_value,
                dtype=dtype,
            ),
            index=row_labels,
            name=label,
            copy=False,
        )
        for label, positions in zip(dummy_cols, positions_per_level)
    ]
    return concat(sparse_columns, axis=1, copy=False)
def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's.
    sep : str, default None
        Separator used in the column names of the dummy categories: the
        character (or string) that separates the categorical names from the
        prefixes. For example, if your column names are 'prefix_A' and
        'prefix_B', you can strip the underscore by specifying sep='_'.
        When None, all columns are treated as the categories of one single
        variable, which is returned under the column name ``""``.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of the
        listed categories specified with a one, i.e. if all dummies in a row are
        zero. Can be a single value for all variables or a dict directly mapping
        the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data. The result has one
        column per prefix and carries the row index of `data`.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with separators
          that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an implied
          category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no category
          assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable in classic
        R / S-plus fashion.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's,
    or boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0  a
    1  b
    2  c
    3  a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
        col1    col2
    0    a       b
    1    b       a
    2    a       c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
          col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
        col1    col2
    0    a       b
    1    b       a
    2    d       e
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    na_per_column = data.isna().any()
    if na_per_column.any():
        # idxmax on the boolean Series yields the first column holding an NA.
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{na_per_column.idxmax()}'"
        )

    # Casting to the nullable boolean dtype doubles as the validation that
    # every column really holds dummy (0/1 or bool) data.
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError as err:
        raise TypeError("Passed DataFrame contains non-dummy data") from err

    # collect prefixes and get lists to slice data for each prefix
    variables_slice = defaultdict(list)
    if sep is None:
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            # split() returning the whole name means `sep` never occurred.
            if len(prefix) == len(col):
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            if len(default_category) != len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            # Broadcast the single default to every prefix.
            default_category = dict.fromkeys(variables_slice, default_category)
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        if sep is None:
            cats = prefix_slice.copy()
        else:
            cats = [col[len(prefix + sep) :] for col in prefix_slice]

        dummies = data_to_decode.loc[:, prefix_slice]
        # Number of categories flagged per row; exactly one is required
        # unless a default category fills in for "none".
        assigned = dummies.sum(axis=1)

        multi_assigned = (assigned > 1).to_numpy(dtype=bool)
        if multi_assigned.any():
            # argmax on the mask gives the FIRST offending row; idxmax on the
            # counts would instead report the row with the largest count.
            first_row = assigned.index[int(multi_assigned.argmax())]
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {first_row}"
            )

        unassigned = assigned == 0
        unassigned_mask = unassigned.to_numpy(dtype=bool)
        if unassigned_mask.any():
            if isinstance(default_category, dict):
                cats.append(default_category[prefix])
            else:
                first_row = assigned.index[int(unassigned_mask.argmax())]
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {first_row}"
                )
            # Extra indicator column that selects the default category.
            data_slice = concat((dummies, unassigned), axis=1)
        else:
            data_slice = dummies

        cats_array = np.array(cats, dtype="object")
        # get indices of True entries along axis=1; every row has exactly one,
        # and nonzero() reports them in row order.
        cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]

    # Keep the caller's row labels instead of resetting to a default index.
    return DataFrame(cat_data, index=data.index)
|