# encoding.py
  1. from __future__ import annotations
  2. from collections import defaultdict
  3. import itertools
  4. from typing import (
  5. Hashable,
  6. Iterable,
  7. )
  8. import numpy as np
  9. from pandas._libs.sparse import IntIndex
  10. from pandas._typing import NpDtype
  11. from pandas.core.dtypes.common import (
  12. is_integer_dtype,
  13. is_list_like,
  14. is_object_dtype,
  15. pandas_dtype,
  16. )
  17. from pandas.core.arrays import SparseArray
  18. from pandas.core.arrays.categorical import factorize_from_iterable
  19. from pandas.core.frame import DataFrame
  20. from pandas.core.indexes.api import (
  21. Index,
  22. default_index,
  23. )
  24. from pandas.core.series import Series
  25. def get_dummies(
  26. data,
  27. prefix=None,
  28. prefix_sep: str | Iterable[str] | dict[str, str] = "_",
  29. dummy_na: bool = False,
  30. columns=None,
  31. sparse: bool = False,
  32. drop_first: bool = False,
  33. dtype: NpDtype | None = None,
  34. ) -> DataFrame:
  35. """
  36. Convert categorical variable into dummy/indicator variables.
  37. Each variable is converted in as many 0/1 variables as there are different
  38. values. Columns in the output are each named after a value; if the input is
  39. a DataFrame, the name of the original variable is prepended to the value.
  40. Parameters
  41. ----------
  42. data : array-like, Series, or DataFrame
  43. Data of which to get dummy indicators.
  44. prefix : str, list of str, or dict of str, default None
  45. String to append DataFrame column names.
  46. Pass a list with length equal to the number of columns
  47. when calling get_dummies on a DataFrame. Alternatively, `prefix`
  48. can be a dictionary mapping column names to prefixes.
  49. prefix_sep : str, default '_'
  50. If appending prefix, separator/delimiter to use. Or pass a
  51. list or dictionary as with `prefix`.
  52. dummy_na : bool, default False
  53. Add a column to indicate NaNs, if False NaNs are ignored.
  54. columns : list-like, default None
  55. Column names in the DataFrame to be encoded.
  56. If `columns` is None then all the columns with
  57. `object`, `string`, or `category` dtype will be converted.
  58. sparse : bool, default False
  59. Whether the dummy-encoded columns should be backed by
  60. a :class:`SparseArray` (True) or a regular NumPy array (False).
  61. drop_first : bool, default False
  62. Whether to get k-1 dummies out of k categorical levels by removing the
  63. first level.
  64. dtype : dtype, default bool
  65. Data type for new columns. Only a single dtype is allowed.
  66. Returns
  67. -------
  68. DataFrame
  69. Dummy-coded data. If `data` contains other columns than the
  70. dummy-coded one(s), these will be prepended, unaltered, to the result.
  71. See Also
  72. --------
  73. Series.str.get_dummies : Convert Series of strings to dummy codes.
  74. :func:`~pandas.from_dummies` : Convert dummy codes to categorical ``DataFrame``.
  75. Notes
  76. -----
  77. Reference :ref:`the user guide <reshaping.dummies>` for more examples.
  78. Examples
  79. --------
  80. >>> s = pd.Series(list('abca'))
  81. >>> pd.get_dummies(s)
  82. a b c
  83. 0 True False False
  84. 1 False True False
  85. 2 False False True
  86. 3 True False False
  87. >>> s1 = ['a', 'b', np.nan]
  88. >>> pd.get_dummies(s1)
  89. a b
  90. 0 True False
  91. 1 False True
  92. 2 False False
  93. >>> pd.get_dummies(s1, dummy_na=True)
  94. a b NaN
  95. 0 True False False
  96. 1 False True False
  97. 2 False False True
  98. >>> df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
  99. ... 'C': [1, 2, 3]})
  100. >>> pd.get_dummies(df, prefix=['col1', 'col2'])
  101. C col1_a col1_b col2_a col2_b col2_c
  102. 0 1 True False False True False
  103. 1 2 False True True False False
  104. 2 3 True False False False True
  105. >>> pd.get_dummies(pd.Series(list('abcaa')))
  106. a b c
  107. 0 True False False
  108. 1 False True False
  109. 2 False False True
  110. 3 True False False
  111. 4 True False False
  112. >>> pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)
  113. b c
  114. 0 False False
  115. 1 True False
  116. 2 False True
  117. 3 False False
  118. 4 False False
  119. >>> pd.get_dummies(pd.Series(list('abc')), dtype=float)
  120. a b c
  121. 0 1.0 0.0 0.0
  122. 1 0.0 1.0 0.0
  123. 2 0.0 0.0 1.0
  124. """
  125. from pandas.core.reshape.concat import concat
  126. dtypes_to_encode = ["object", "string", "category"]
  127. if isinstance(data, DataFrame):
  128. # determine columns being encoded
  129. if columns is None:
  130. data_to_encode = data.select_dtypes(include=dtypes_to_encode)
  131. elif not is_list_like(columns):
  132. raise TypeError("Input must be a list-like for parameter `columns`")
  133. else:
  134. data_to_encode = data[columns]
  135. # validate prefixes and separator to avoid silently dropping cols
  136. def check_len(item, name):
  137. if is_list_like(item):
  138. if not len(item) == data_to_encode.shape[1]:
  139. len_msg = (
  140. f"Length of '{name}' ({len(item)}) did not match the "
  141. "length of the columns being encoded "
  142. f"({data_to_encode.shape[1]})."
  143. )
  144. raise ValueError(len_msg)
  145. check_len(prefix, "prefix")
  146. check_len(prefix_sep, "prefix_sep")
  147. if isinstance(prefix, str):
  148. prefix = itertools.cycle([prefix])
  149. if isinstance(prefix, dict):
  150. prefix = [prefix[col] for col in data_to_encode.columns]
  151. if prefix is None:
  152. prefix = data_to_encode.columns
  153. # validate separators
  154. if isinstance(prefix_sep, str):
  155. prefix_sep = itertools.cycle([prefix_sep])
  156. elif isinstance(prefix_sep, dict):
  157. prefix_sep = [prefix_sep[col] for col in data_to_encode.columns]
  158. with_dummies: list[DataFrame]
  159. if data_to_encode.shape == data.shape:
  160. # Encoding the entire df, do not prepend any dropped columns
  161. with_dummies = []
  162. elif columns is not None:
  163. # Encoding only cols specified in columns. Get all cols not in
  164. # columns to prepend to result.
  165. with_dummies = [data.drop(columns, axis=1)]
  166. else:
  167. # Encoding only object and category dtype columns. Get remaining
  168. # columns to prepend to result.
  169. with_dummies = [data.select_dtypes(exclude=dtypes_to_encode)]
  170. for col, pre, sep in zip(data_to_encode.items(), prefix, prefix_sep):
  171. # col is (column_name, column), use just column data here
  172. dummy = _get_dummies_1d(
  173. col[1],
  174. prefix=pre,
  175. prefix_sep=sep,
  176. dummy_na=dummy_na,
  177. sparse=sparse,
  178. drop_first=drop_first,
  179. dtype=dtype,
  180. )
  181. with_dummies.append(dummy)
  182. result = concat(with_dummies, axis=1)
  183. else:
  184. result = _get_dummies_1d(
  185. data,
  186. prefix,
  187. prefix_sep,
  188. dummy_na,
  189. sparse=sparse,
  190. drop_first=drop_first,
  191. dtype=dtype,
  192. )
  193. return result
def _get_dummies_1d(
    data,
    prefix,
    prefix_sep: str | Iterable[str] | dict[str, str] = "_",
    dummy_na: bool = False,
    sparse: bool = False,
    drop_first: bool = False,
    dtype: NpDtype | None = None,
) -> DataFrame:
    """
    Dummy-encode a single 1-D column.

    Workhorse for :func:`get_dummies`; parameters have the same meaning but
    apply to one column only. `prefix` is the single prefix for this column
    (or None for bare level names). Returns a DataFrame with one 0/1 column
    per level of `data` (dtype defaults to bool), indexed like `data` when it
    is a Series. Raises ValueError for ``dtype=object``.
    """
    from pandas.core.reshape.concat import concat

    # Series avoids inconsistent NaN handling
    codes, levels = factorize_from_iterable(Series(data, copy=False))

    if dtype is None:
        dtype = np.dtype(bool)
    _dtype = pandas_dtype(dtype)

    if is_object_dtype(_dtype):
        raise ValueError("dtype=object is not a valid dtype for get_dummies")

    def get_empty_frame(data) -> DataFrame:
        # Zero-column result that still carries the right index/length.
        index: Index | np.ndarray
        if isinstance(data, Series):
            index = data.index
        else:
            index = default_index(len(data))
        return DataFrame(index=index)

    # if all NaN
    if not dummy_na and len(levels) == 0:
        return get_empty_frame(data)

    # copy before the in-place remap below so the factorize result is untouched
    codes = codes.copy()
    if dummy_na:
        # missing values (-1 codes) become an extra trailing NaN level
        codes[codes == -1] = len(levels)
        levels = levels.insert(len(levels), np.nan)

    # if dummy_na, we just fake a nan level. drop_first will drop it again
    if drop_first and len(levels) == 1:
        return get_empty_frame(data)

    number_of_cols = len(levels)

    if prefix is None:
        dummy_cols = levels
    else:
        dummy_cols = Index([f"{prefix}{prefix_sep}{level}" for level in levels])

    index: Index | None
    if isinstance(data, Series):
        index = data.index
    else:
        index = None

    if sparse:
        # fill_value must match the requested dtype so the sparse columns
        # round-trip correctly
        fill_value: bool | float
        if is_integer_dtype(dtype):
            fill_value = 0
        elif dtype == np.dtype(bool):
            fill_value = False
        else:
            fill_value = 0.0

        sparse_series = []
        N = len(data)
        sp_indices: list[list] = [[] for _ in range(len(dummy_cols))]
        # drop unmapped (-1) entries; their rows stay all-fill_value
        mask = codes != -1
        codes = codes[mask]
        n_idx = np.arange(N)[mask]

        # bucket row positions by level: one index list per dummy column
        for ndx, code in zip(n_idx, codes):
            sp_indices[code].append(ndx)

        if drop_first:
            # remove first categorical level to avoid perfect collinearity
            # GH12042
            sp_indices = sp_indices[1:]
            dummy_cols = dummy_cols[1:]
        for col, ixs in zip(dummy_cols, sp_indices):
            sarr = SparseArray(
                np.ones(len(ixs), dtype=dtype),
                sparse_index=IntIndex(N, ixs),
                fill_value=fill_value,
                dtype=dtype,
            )
            sparse_series.append(Series(data=sarr, index=index, name=col, copy=False))

        return concat(sparse_series, axis=1, copy=False)

    else:
        # take on axis=1 + transpose to ensure ndarray layout is column-major
        eye_dtype: NpDtype
        if isinstance(_dtype, np.dtype):
            eye_dtype = _dtype
        else:
            # extension dtypes: build as bool, cast in the DataFrame ctor below
            eye_dtype = np.bool_
        dummy_mat = np.eye(number_of_cols, dtype=eye_dtype).take(codes, axis=1).T

        if not dummy_na:
            # reset NaN GH4446
            dummy_mat[codes == -1] = 0

        if drop_first:
            # remove first GH12042
            dummy_mat = dummy_mat[:, 1:]
            dummy_cols = dummy_cols[1:]

        return DataFrame(dummy_mat, index=index, columns=dummy_cols, dtype=_dtype)
  284. def from_dummies(
  285. data: DataFrame,
  286. sep: None | str = None,
  287. default_category: None | Hashable | dict[str, Hashable] = None,
  288. ) -> DataFrame:
  289. """
  290. Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.
  291. Inverts the operation performed by :func:`~pandas.get_dummies`.
  292. .. versionadded:: 1.5.0
  293. Parameters
  294. ----------
  295. data : DataFrame
  296. Data which contains dummy-coded variables in form of integer columns of
  297. 1's and 0's.
  298. sep : str, default None
  299. Separator used in the column names of the dummy categories they are
  300. character indicating the separation of the categorical names from the prefixes.
  301. For example, if your column names are 'prefix_A' and 'prefix_B',
  302. you can strip the underscore by specifying sep='_'.
  303. default_category : None, Hashable or dict of Hashables, default None
  304. The default category is the implied category when a value has none of the
  305. listed categories specified with a one, i.e. if all dummies in a row are
  306. zero. Can be a single value for all variables or a dict directly mapping
  307. the default categories to a prefix of a variable.
  308. Returns
  309. -------
  310. DataFrame
  311. Categorical data decoded from the dummy input-data.
  312. Raises
  313. ------
  314. ValueError
  315. * When the input ``DataFrame`` ``data`` contains NA values.
  316. * When the input ``DataFrame`` ``data`` contains column names with separators
  317. that do not match the separator specified with ``sep``.
  318. * When a ``dict`` passed to ``default_category`` does not include an implied
  319. category for each prefix.
  320. * When a value in ``data`` has more than one category assigned to it.
  321. * When ``default_category=None`` and a value in ``data`` has no category
  322. assigned to it.
  323. TypeError
  324. * When the input ``data`` is not of type ``DataFrame``.
  325. * When the input ``DataFrame`` ``data`` contains non-dummy data.
  326. * When the passed ``sep`` is of a wrong data type.
  327. * When the passed ``default_category`` is of a wrong data type.
  328. See Also
  329. --------
  330. :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
  331. :class:`~pandas.Categorical` : Represent a categorical variable in classic.
  332. Notes
  333. -----
  334. The columns of the passed dummy data should only include 1's and 0's,
  335. or boolean values.
  336. Examples
  337. --------
  338. >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
  339. ... "c": [0, 0, 1, 0]})
  340. >>> df
  341. a b c
  342. 0 1 0 0
  343. 1 0 1 0
  344. 2 0 0 1
  345. 3 1 0 0
  346. >>> pd.from_dummies(df)
  347. 0 a
  348. 1 b
  349. 2 c
  350. 3 a
  351. >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
  352. ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
  353. ... "col2_c": [0, 0, 1]})
  354. >>> df
  355. col1_a col1_b col2_a col2_b col2_c
  356. 0 1 0 0 1 0
  357. 1 0 1 1 0 0
  358. 2 1 0 0 0 1
  359. >>> pd.from_dummies(df, sep="_")
  360. col1 col2
  361. 0 a b
  362. 1 b a
  363. 2 a c
  364. >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
  365. ... "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
  366. ... "col2_c": [0, 0, 0]})
  367. >>> df
  368. col1_a col1_b col2_a col2_b col2_c
  369. 0 1 0 0 1 0
  370. 1 0 1 1 0 0
  371. 2 0 0 0 0 0
  372. >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
  373. col1 col2
  374. 0 a b
  375. 1 b a
  376. 2 d e
  377. """
  378. from pandas.core.reshape.concat import concat
  379. if not isinstance(data, DataFrame):
  380. raise TypeError(
  381. "Expected 'data' to be a 'DataFrame'; "
  382. f"Received 'data' of type: {type(data).__name__}"
  383. )
  384. if data.isna().any().any():
  385. raise ValueError(
  386. "Dummy DataFrame contains NA value in column: "
  387. f"'{data.isna().any().idxmax()}'"
  388. )
  389. # index data with a list of all columns that are dummies
  390. try:
  391. data_to_decode = data.astype("boolean", copy=False)
  392. except TypeError:
  393. raise TypeError("Passed DataFrame contains non-dummy data")
  394. # collect prefixes and get lists to slice data for each prefix
  395. variables_slice = defaultdict(list)
  396. if sep is None:
  397. variables_slice[""] = list(data.columns)
  398. elif isinstance(sep, str):
  399. for col in data_to_decode.columns:
  400. prefix = col.split(sep)[0]
  401. if len(prefix) == len(col):
  402. raise ValueError(f"Separator not specified for column: {col}")
  403. variables_slice[prefix].append(col)
  404. else:
  405. raise TypeError(
  406. "Expected 'sep' to be of type 'str' or 'None'; "
  407. f"Received 'sep' of type: {type(sep).__name__}"
  408. )
  409. if default_category is not None:
  410. if isinstance(default_category, dict):
  411. if not len(default_category) == len(variables_slice):
  412. len_msg = (
  413. f"Length of 'default_category' ({len(default_category)}) "
  414. f"did not match the length of the columns being encoded "
  415. f"({len(variables_slice)})"
  416. )
  417. raise ValueError(len_msg)
  418. elif isinstance(default_category, Hashable):
  419. default_category = dict(
  420. zip(variables_slice, [default_category] * len(variables_slice))
  421. )
  422. else:
  423. raise TypeError(
  424. "Expected 'default_category' to be of type "
  425. "'None', 'Hashable', or 'dict'; "
  426. "Received 'default_category' of type: "
  427. f"{type(default_category).__name__}"
  428. )
  429. cat_data = {}
  430. for prefix, prefix_slice in variables_slice.items():
  431. if sep is None:
  432. cats = prefix_slice.copy()
  433. else:
  434. cats = [col[len(prefix + sep) :] for col in prefix_slice]
  435. assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
  436. if any(assigned > 1):
  437. raise ValueError(
  438. "Dummy DataFrame contains multi-assignment(s); "
  439. f"First instance in row: {assigned.idxmax()}"
  440. )
  441. if any(assigned == 0):
  442. if isinstance(default_category, dict):
  443. cats.append(default_category[prefix])
  444. else:
  445. raise ValueError(
  446. "Dummy DataFrame contains unassigned value(s); "
  447. f"First instance in row: {assigned.idxmin()}"
  448. )
  449. data_slice = concat(
  450. (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
  451. )
  452. else:
  453. data_slice = data_to_decode.loc[:, prefix_slice]
  454. cats_array = np.array(cats, dtype="object")
  455. # get indices of True entries along axis=1
  456. cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]
  457. return DataFrame(cat_data)