test_get_dummies.py 25 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679
  1. import re
  2. import unicodedata
  3. import numpy as np
  4. import pytest
  5. from pandas.core.dtypes.common import is_integer_dtype
  6. import pandas as pd
  7. from pandas import (
  8. Categorical,
  9. CategoricalIndex,
  10. DataFrame,
  11. RangeIndex,
  12. Series,
  13. get_dummies,
  14. )
  15. import pandas._testing as tm
  16. from pandas.core.arrays.sparse import (
  17. SparseArray,
  18. SparseDtype,
  19. )
  20. class TestGetDummies:
  21. @pytest.fixture
  22. def df(self):
  23. return DataFrame({"A": ["a", "b", "a"], "B": ["b", "b", "c"], "C": [1, 2, 3]})
  24. @pytest.fixture(params=["uint8", "i8", np.float64, bool, None])
  25. def dtype(self, request):
  26. return np.dtype(request.param)
  27. @pytest.fixture(params=["dense", "sparse"])
  28. def sparse(self, request):
  29. # params are strings to simplify reading test results,
  30. # e.g. TestGetDummies::test_basic[uint8-sparse] instead of [uint8-True]
  31. return request.param == "sparse"
  32. def effective_dtype(self, dtype):
  33. if dtype is None:
  34. return np.uint8
  35. return dtype
  36. def test_get_dummies_raises_on_dtype_object(self, df):
  37. msg = "dtype=object is not a valid dtype for get_dummies"
  38. with pytest.raises(ValueError, match=msg):
  39. get_dummies(df, dtype="object")
  40. def test_get_dummies_basic(self, sparse, dtype):
  41. s_list = list("abc")
  42. s_series = Series(s_list)
  43. s_series_index = Series(s_list, list("ABC"))
  44. expected = DataFrame(
  45. {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
  46. dtype=self.effective_dtype(dtype),
  47. )
  48. if sparse:
  49. expected = expected.apply(SparseArray, fill_value=0.0)
  50. result = get_dummies(s_list, sparse=sparse, dtype=dtype)
  51. tm.assert_frame_equal(result, expected)
  52. result = get_dummies(s_series, sparse=sparse, dtype=dtype)
  53. tm.assert_frame_equal(result, expected)
  54. expected.index = list("ABC")
  55. result = get_dummies(s_series_index, sparse=sparse, dtype=dtype)
  56. tm.assert_frame_equal(result, expected)
  57. def test_get_dummies_basic_types(self, sparse, dtype):
  58. # GH 10531
  59. s_list = list("abc")
  60. s_series = Series(s_list)
  61. s_df = DataFrame(
  62. {"a": [0, 1, 0, 1, 2], "b": ["A", "A", "B", "C", "C"], "c": [2, 3, 3, 3, 2]}
  63. )
  64. expected = DataFrame(
  65. {"a": [1, 0, 0], "b": [0, 1, 0], "c": [0, 0, 1]},
  66. dtype=self.effective_dtype(dtype),
  67. columns=list("abc"),
  68. )
  69. if sparse:
  70. if is_integer_dtype(dtype):
  71. fill_value = 0
  72. elif dtype == bool:
  73. fill_value = False
  74. else:
  75. fill_value = 0.0
  76. expected = expected.apply(SparseArray, fill_value=fill_value)
  77. result = get_dummies(s_list, sparse=sparse, dtype=dtype)
  78. tm.assert_frame_equal(result, expected)
  79. result = get_dummies(s_series, sparse=sparse, dtype=dtype)
  80. tm.assert_frame_equal(result, expected)
  81. result = get_dummies(s_df, columns=s_df.columns, sparse=sparse, dtype=dtype)
  82. if sparse:
  83. dtype_name = f"Sparse[{self.effective_dtype(dtype).name}, {fill_value}]"
  84. else:
  85. dtype_name = self.effective_dtype(dtype).name
  86. expected = Series({dtype_name: 8}, name="count")
  87. result = result.dtypes.value_counts()
  88. result.index = [str(i) for i in result.index]
  89. tm.assert_series_equal(result, expected)
  90. result = get_dummies(s_df, columns=["a"], sparse=sparse, dtype=dtype)
  91. expected_counts = {"int64": 1, "object": 1}
  92. expected_counts[dtype_name] = 3 + expected_counts.get(dtype_name, 0)
  93. expected = Series(expected_counts, name="count").sort_index()
  94. result = result.dtypes.value_counts()
  95. result.index = [str(i) for i in result.index]
  96. result = result.sort_index()
  97. tm.assert_series_equal(result, expected)
  98. def test_get_dummies_just_na(self, sparse):
  99. just_na_list = [np.nan]
  100. just_na_series = Series(just_na_list)
  101. just_na_series_index = Series(just_na_list, index=["A"])
  102. res_list = get_dummies(just_na_list, sparse=sparse)
  103. res_series = get_dummies(just_na_series, sparse=sparse)
  104. res_series_index = get_dummies(just_na_series_index, sparse=sparse)
  105. assert res_list.empty
  106. assert res_series.empty
  107. assert res_series_index.empty
  108. assert res_list.index.tolist() == [0]
  109. assert res_series.index.tolist() == [0]
  110. assert res_series_index.index.tolist() == ["A"]
  111. def test_get_dummies_include_na(self, sparse, dtype):
  112. s = ["a", "b", np.nan]
  113. res = get_dummies(s, sparse=sparse, dtype=dtype)
  114. exp = DataFrame(
  115. {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
  116. )
  117. if sparse:
  118. exp = exp.apply(SparseArray, fill_value=0.0)
  119. tm.assert_frame_equal(res, exp)
  120. # Sparse dataframes do not allow nan labelled columns, see #GH8822
  121. res_na = get_dummies(s, dummy_na=True, sparse=sparse, dtype=dtype)
  122. exp_na = DataFrame(
  123. {np.nan: [0, 0, 1], "a": [1, 0, 0], "b": [0, 1, 0]},
  124. dtype=self.effective_dtype(dtype),
  125. )
  126. exp_na = exp_na.reindex(["a", "b", np.nan], axis=1)
  127. # hack (NaN handling in assert_index_equal)
  128. exp_na.columns = res_na.columns
  129. if sparse:
  130. exp_na = exp_na.apply(SparseArray, fill_value=0.0)
  131. tm.assert_frame_equal(res_na, exp_na)
  132. res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype)
  133. exp_just_na = DataFrame(
  134. Series(1, index=[0]), columns=[np.nan], dtype=self.effective_dtype(dtype)
  135. )
  136. tm.assert_numpy_array_equal(res_just_na.values, exp_just_na.values)
  137. def test_get_dummies_unicode(self, sparse):
  138. # See GH 6885 - get_dummies chokes on unicode values
  139. e = "e"
  140. eacute = unicodedata.lookup("LATIN SMALL LETTER E WITH ACUTE")
  141. s = [e, eacute, eacute]
  142. res = get_dummies(s, prefix="letter", sparse=sparse)
  143. exp = DataFrame(
  144. {"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
  145. )
  146. if sparse:
  147. exp = exp.apply(SparseArray, fill_value=0)
  148. tm.assert_frame_equal(res, exp)
  149. def test_dataframe_dummies_all_obj(self, df, sparse):
  150. df = df[["A", "B"]]
  151. result = get_dummies(df, sparse=sparse)
  152. expected = DataFrame(
  153. {"A_a": [1, 0, 1], "A_b": [0, 1, 0], "B_b": [1, 1, 0], "B_c": [0, 0, 1]},
  154. dtype=bool,
  155. )
  156. if sparse:
  157. expected = DataFrame(
  158. {
  159. "A_a": SparseArray([1, 0, 1], dtype="bool"),
  160. "A_b": SparseArray([0, 1, 0], dtype="bool"),
  161. "B_b": SparseArray([1, 1, 0], dtype="bool"),
  162. "B_c": SparseArray([0, 0, 1], dtype="bool"),
  163. }
  164. )
  165. tm.assert_frame_equal(result, expected)
  166. def test_dataframe_dummies_string_dtype(self, df):
  167. # GH44965
  168. df = df[["A", "B"]]
  169. df = df.astype({"A": "object", "B": "string"})
  170. result = get_dummies(df)
  171. expected = DataFrame(
  172. {
  173. "A_a": [1, 0, 1],
  174. "A_b": [0, 1, 0],
  175. "B_b": [1, 1, 0],
  176. "B_c": [0, 0, 1],
  177. },
  178. dtype=bool,
  179. )
  180. tm.assert_frame_equal(result, expected)
  181. def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
  182. result = get_dummies(df, sparse=sparse, dtype=dtype)
  183. if sparse:
  184. arr = SparseArray
  185. typ = SparseDtype(dtype, 0)
  186. else:
  187. arr = np.array
  188. typ = dtype
  189. expected = DataFrame(
  190. {
  191. "C": [1, 2, 3],
  192. "A_a": arr([1, 0, 1], dtype=typ),
  193. "A_b": arr([0, 1, 0], dtype=typ),
  194. "B_b": arr([1, 1, 0], dtype=typ),
  195. "B_c": arr([0, 0, 1], dtype=typ),
  196. }
  197. )
  198. expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
  199. tm.assert_frame_equal(result, expected)
  200. def test_dataframe_dummies_prefix_list(self, df, sparse):
  201. prefixes = ["from_A", "from_B"]
  202. result = get_dummies(df, prefix=prefixes, sparse=sparse)
  203. expected = DataFrame(
  204. {
  205. "C": [1, 2, 3],
  206. "from_A_a": [True, False, True],
  207. "from_A_b": [False, True, False],
  208. "from_B_b": [True, True, False],
  209. "from_B_c": [False, False, True],
  210. },
  211. )
  212. expected[["C"]] = df[["C"]]
  213. cols = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
  214. expected = expected[["C"] + cols]
  215. typ = SparseArray if sparse else Series
  216. expected[cols] = expected[cols].apply(lambda x: typ(x))
  217. tm.assert_frame_equal(result, expected)
  218. def test_dataframe_dummies_prefix_str(self, df, sparse):
  219. # not that you should do this...
  220. result = get_dummies(df, prefix="bad", sparse=sparse)
  221. bad_columns = ["bad_a", "bad_b", "bad_b", "bad_c"]
  222. expected = DataFrame(
  223. [
  224. [1, True, False, True, False],
  225. [2, False, True, True, False],
  226. [3, True, False, False, True],
  227. ],
  228. columns=["C"] + bad_columns,
  229. )
  230. expected = expected.astype({"C": np.int64})
  231. if sparse:
  232. # work around astyping & assigning with duplicate columns
  233. # https://github.com/pandas-dev/pandas/issues/14427
  234. expected = pd.concat(
  235. [
  236. Series([1, 2, 3], name="C"),
  237. Series([True, False, True], name="bad_a", dtype="Sparse[bool]"),
  238. Series([False, True, False], name="bad_b", dtype="Sparse[bool]"),
  239. Series([True, True, False], name="bad_b", dtype="Sparse[bool]"),
  240. Series([False, False, True], name="bad_c", dtype="Sparse[bool]"),
  241. ],
  242. axis=1,
  243. )
  244. tm.assert_frame_equal(result, expected)
  245. def test_dataframe_dummies_subset(self, df, sparse):
  246. result = get_dummies(df, prefix=["from_A"], columns=["A"], sparse=sparse)
  247. expected = DataFrame(
  248. {
  249. "B": ["b", "b", "c"],
  250. "C": [1, 2, 3],
  251. "from_A_a": [1, 0, 1],
  252. "from_A_b": [0, 1, 0],
  253. },
  254. )
  255. cols = expected.columns
  256. expected[cols[1:]] = expected[cols[1:]].astype(bool)
  257. expected[["C"]] = df[["C"]]
  258. if sparse:
  259. cols = ["from_A_a", "from_A_b"]
  260. expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
  261. tm.assert_frame_equal(result, expected)
  262. def test_dataframe_dummies_prefix_sep(self, df, sparse):
  263. result = get_dummies(df, prefix_sep="..", sparse=sparse)
  264. expected = DataFrame(
  265. {
  266. "C": [1, 2, 3],
  267. "A..a": [True, False, True],
  268. "A..b": [False, True, False],
  269. "B..b": [True, True, False],
  270. "B..c": [False, False, True],
  271. },
  272. )
  273. expected[["C"]] = df[["C"]]
  274. expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
  275. if sparse:
  276. cols = ["A..a", "A..b", "B..b", "B..c"]
  277. expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
  278. tm.assert_frame_equal(result, expected)
  279. result = get_dummies(df, prefix_sep=["..", "__"], sparse=sparse)
  280. expected = expected.rename(columns={"B..b": "B__b", "B..c": "B__c"})
  281. tm.assert_frame_equal(result, expected)
  282. result = get_dummies(df, prefix_sep={"A": "..", "B": "__"}, sparse=sparse)
  283. tm.assert_frame_equal(result, expected)
  284. def test_dataframe_dummies_prefix_bad_length(self, df, sparse):
  285. msg = re.escape(
  286. "Length of 'prefix' (1) did not match the length of the columns being "
  287. "encoded (2)"
  288. )
  289. with pytest.raises(ValueError, match=msg):
  290. get_dummies(df, prefix=["too few"], sparse=sparse)
  291. def test_dataframe_dummies_prefix_sep_bad_length(self, df, sparse):
  292. msg = re.escape(
  293. "Length of 'prefix_sep' (1) did not match the length of the columns being "
  294. "encoded (2)"
  295. )
  296. with pytest.raises(ValueError, match=msg):
  297. get_dummies(df, prefix_sep=["bad"], sparse=sparse)
  298. def test_dataframe_dummies_prefix_dict(self, sparse):
  299. prefixes = {"A": "from_A", "B": "from_B"}
  300. df = DataFrame({"C": [1, 2, 3], "A": ["a", "b", "a"], "B": ["b", "b", "c"]})
  301. result = get_dummies(df, prefix=prefixes, sparse=sparse)
  302. expected = DataFrame(
  303. {
  304. "C": [1, 2, 3],
  305. "from_A_a": [1, 0, 1],
  306. "from_A_b": [0, 1, 0],
  307. "from_B_b": [1, 1, 0],
  308. "from_B_c": [0, 0, 1],
  309. }
  310. )
  311. columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
  312. expected[columns] = expected[columns].astype(bool)
  313. if sparse:
  314. expected[columns] = expected[columns].astype(SparseDtype("bool", 0))
  315. tm.assert_frame_equal(result, expected)
  316. def test_dataframe_dummies_with_na(self, df, sparse, dtype):
  317. df.loc[3, :] = [np.nan, np.nan, np.nan]
  318. result = get_dummies(df, dummy_na=True, sparse=sparse, dtype=dtype).sort_index(
  319. axis=1
  320. )
  321. if sparse:
  322. arr = SparseArray
  323. typ = SparseDtype(dtype, 0)
  324. else:
  325. arr = np.array
  326. typ = dtype
  327. expected = DataFrame(
  328. {
  329. "C": [1, 2, 3, np.nan],
  330. "A_a": arr([1, 0, 1, 0], dtype=typ),
  331. "A_b": arr([0, 1, 0, 0], dtype=typ),
  332. "A_nan": arr([0, 0, 0, 1], dtype=typ),
  333. "B_b": arr([1, 1, 0, 0], dtype=typ),
  334. "B_c": arr([0, 0, 1, 0], dtype=typ),
  335. "B_nan": arr([0, 0, 0, 1], dtype=typ),
  336. }
  337. ).sort_index(axis=1)
  338. tm.assert_frame_equal(result, expected)
  339. result = get_dummies(df, dummy_na=False, sparse=sparse, dtype=dtype)
  340. expected = expected[["C", "A_a", "A_b", "B_b", "B_c"]]
  341. tm.assert_frame_equal(result, expected)
  342. def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
  343. df["cat"] = Categorical(["x", "y", "y"])
  344. result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
  345. if sparse:
  346. arr = SparseArray
  347. typ = SparseDtype(dtype, 0)
  348. else:
  349. arr = np.array
  350. typ = dtype
  351. expected = DataFrame(
  352. {
  353. "C": [1, 2, 3],
  354. "A_a": arr([1, 0, 1], dtype=typ),
  355. "A_b": arr([0, 1, 0], dtype=typ),
  356. "B_b": arr([1, 1, 0], dtype=typ),
  357. "B_c": arr([0, 0, 1], dtype=typ),
  358. "cat_x": arr([1, 0, 0], dtype=typ),
  359. "cat_y": arr([0, 1, 1], dtype=typ),
  360. }
  361. ).sort_index(axis=1)
  362. tm.assert_frame_equal(result, expected)
  363. @pytest.mark.parametrize(
  364. "get_dummies_kwargs,expected",
  365. [
  366. (
  367. {"data": DataFrame({"ä": ["a"]})},
  368. DataFrame({"ä_a": [True]}),
  369. ),
  370. (
  371. {"data": DataFrame({"x": ["ä"]})},
  372. DataFrame({"x_ä": [True]}),
  373. ),
  374. (
  375. {"data": DataFrame({"x": ["a"]}), "prefix": "ä"},
  376. DataFrame({"ä_a": [True]}),
  377. ),
  378. (
  379. {"data": DataFrame({"x": ["a"]}), "prefix_sep": "ä"},
  380. DataFrame({"xäa": [True]}),
  381. ),
  382. ],
  383. )
  384. def test_dataframe_dummies_unicode(self, get_dummies_kwargs, expected):
  385. # GH22084 get_dummies incorrectly encodes unicode characters
  386. # in dataframe column names
  387. result = get_dummies(**get_dummies_kwargs)
  388. tm.assert_frame_equal(result, expected)
  389. def test_get_dummies_basic_drop_first(self, sparse):
  390. # GH12402 Add a new parameter `drop_first` to avoid collinearity
  391. # Basic case
  392. s_list = list("abc")
  393. s_series = Series(s_list)
  394. s_series_index = Series(s_list, list("ABC"))
  395. expected = DataFrame({"b": [0, 1, 0], "c": [0, 0, 1]}, dtype=bool)
  396. result = get_dummies(s_list, drop_first=True, sparse=sparse)
  397. if sparse:
  398. expected = expected.apply(SparseArray, fill_value=0)
  399. tm.assert_frame_equal(result, expected)
  400. result = get_dummies(s_series, drop_first=True, sparse=sparse)
  401. tm.assert_frame_equal(result, expected)
  402. expected.index = list("ABC")
  403. result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
  404. tm.assert_frame_equal(result, expected)
  405. def test_get_dummies_basic_drop_first_one_level(self, sparse):
  406. # Test the case that categorical variable only has one level.
  407. s_list = list("aaa")
  408. s_series = Series(s_list)
  409. s_series_index = Series(s_list, list("ABC"))
  410. expected = DataFrame(index=RangeIndex(3))
  411. result = get_dummies(s_list, drop_first=True, sparse=sparse)
  412. tm.assert_frame_equal(result, expected)
  413. result = get_dummies(s_series, drop_first=True, sparse=sparse)
  414. tm.assert_frame_equal(result, expected)
  415. expected = DataFrame(index=list("ABC"))
  416. result = get_dummies(s_series_index, drop_first=True, sparse=sparse)
  417. tm.assert_frame_equal(result, expected)
  418. def test_get_dummies_basic_drop_first_NA(self, sparse):
  419. # Test NA handling together with drop_first
  420. s_NA = ["a", "b", np.nan]
  421. res = get_dummies(s_NA, drop_first=True, sparse=sparse)
  422. exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
  423. if sparse:
  424. exp = exp.apply(SparseArray, fill_value=0)
  425. tm.assert_frame_equal(res, exp)
  426. res_na = get_dummies(s_NA, dummy_na=True, drop_first=True, sparse=sparse)
  427. exp_na = DataFrame({"b": [0, 1, 0], np.nan: [0, 0, 1]}, dtype=bool).reindex(
  428. ["b", np.nan], axis=1
  429. )
  430. if sparse:
  431. exp_na = exp_na.apply(SparseArray, fill_value=0)
  432. tm.assert_frame_equal(res_na, exp_na)
  433. res_just_na = get_dummies(
  434. [np.nan], dummy_na=True, drop_first=True, sparse=sparse
  435. )
  436. exp_just_na = DataFrame(index=RangeIndex(1))
  437. tm.assert_frame_equal(res_just_na, exp_just_na)
  438. def test_dataframe_dummies_drop_first(self, df, sparse):
  439. df = df[["A", "B"]]
  440. result = get_dummies(df, drop_first=True, sparse=sparse)
  441. expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
  442. if sparse:
  443. expected = expected.apply(SparseArray, fill_value=0)
  444. tm.assert_frame_equal(result, expected)
  445. def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
  446. df["cat"] = Categorical(["x", "y", "y"])
  447. result = get_dummies(df, drop_first=True, sparse=sparse)
  448. expected = DataFrame(
  449. {"C": [1, 2, 3], "A_b": [0, 1, 0], "B_c": [0, 0, 1], "cat_y": [0, 1, 1]}
  450. )
  451. cols = ["A_b", "B_c", "cat_y"]
  452. expected[cols] = expected[cols].astype(bool)
  453. expected = expected[["C", "A_b", "B_c", "cat_y"]]
  454. if sparse:
  455. for col in cols:
  456. expected[col] = SparseArray(expected[col])
  457. tm.assert_frame_equal(result, expected)
  458. def test_dataframe_dummies_drop_first_with_na(self, df, sparse):
  459. df.loc[3, :] = [np.nan, np.nan, np.nan]
  460. result = get_dummies(
  461. df, dummy_na=True, drop_first=True, sparse=sparse
  462. ).sort_index(axis=1)
  463. expected = DataFrame(
  464. {
  465. "C": [1, 2, 3, np.nan],
  466. "A_b": [0, 1, 0, 0],
  467. "A_nan": [0, 0, 0, 1],
  468. "B_c": [0, 0, 1, 0],
  469. "B_nan": [0, 0, 0, 1],
  470. }
  471. )
  472. cols = ["A_b", "A_nan", "B_c", "B_nan"]
  473. expected[cols] = expected[cols].astype(bool)
  474. expected = expected.sort_index(axis=1)
  475. if sparse:
  476. for col in cols:
  477. expected[col] = SparseArray(expected[col])
  478. tm.assert_frame_equal(result, expected)
  479. result = get_dummies(df, dummy_na=False, drop_first=True, sparse=sparse)
  480. expected = expected[["C", "A_b", "B_c"]]
  481. tm.assert_frame_equal(result, expected)
  482. def test_get_dummies_int_int(self):
  483. data = Series([1, 2, 1])
  484. result = get_dummies(data)
  485. expected = DataFrame([[1, 0], [0, 1], [1, 0]], columns=[1, 2], dtype=bool)
  486. tm.assert_frame_equal(result, expected)
  487. data = Series(Categorical(["a", "b", "a"]))
  488. result = get_dummies(data)
  489. expected = DataFrame(
  490. [[1, 0], [0, 1], [1, 0]], columns=Categorical(["a", "b"]), dtype=bool
  491. )
  492. tm.assert_frame_equal(result, expected)
  493. def test_get_dummies_int_df(self, dtype):
  494. data = DataFrame(
  495. {
  496. "A": [1, 2, 1],
  497. "B": Categorical(["a", "b", "a"]),
  498. "C": [1, 2, 1],
  499. "D": [1.0, 2.0, 1.0],
  500. }
  501. )
  502. columns = ["C", "D", "A_1", "A_2", "B_a", "B_b"]
  503. expected = DataFrame(
  504. [[1, 1.0, 1, 0, 1, 0], [2, 2.0, 0, 1, 0, 1], [1, 1.0, 1, 0, 1, 0]],
  505. columns=columns,
  506. )
  507. expected[columns[2:]] = expected[columns[2:]].astype(dtype)
  508. result = get_dummies(data, columns=["A", "B"], dtype=dtype)
  509. tm.assert_frame_equal(result, expected)
  510. @pytest.mark.parametrize("ordered", [True, False])
  511. def test_dataframe_dummies_preserve_categorical_dtype(self, dtype, ordered):
  512. # GH13854
  513. cat = Categorical(list("xy"), categories=list("xyz"), ordered=ordered)
  514. result = get_dummies(cat, dtype=dtype)
  515. data = np.array([[1, 0, 0], [0, 1, 0]], dtype=self.effective_dtype(dtype))
  516. cols = CategoricalIndex(
  517. cat.categories, categories=cat.categories, ordered=ordered
  518. )
  519. expected = DataFrame(data, columns=cols, dtype=self.effective_dtype(dtype))
  520. tm.assert_frame_equal(result, expected)
  521. @pytest.mark.parametrize("sparse", [True, False])
  522. def test_get_dummies_dont_sparsify_all_columns(self, sparse):
  523. # GH18914
  524. df = DataFrame.from_dict({"GDP": [1, 2], "Nation": ["AB", "CD"]})
  525. df = get_dummies(df, columns=["Nation"], sparse=sparse)
  526. df2 = df.reindex(columns=["GDP"])
  527. tm.assert_frame_equal(df[["GDP"]], df2)
  528. def test_get_dummies_duplicate_columns(self, df):
  529. # GH20839
  530. df.columns = ["A", "A", "A"]
  531. result = get_dummies(df).sort_index(axis=1)
  532. expected = DataFrame(
  533. [
  534. [1, True, False, True, False],
  535. [2, False, True, True, False],
  536. [3, True, False, False, True],
  537. ],
  538. columns=["A", "A_a", "A_b", "A_b", "A_c"],
  539. ).sort_index(axis=1)
  540. expected = expected.astype({"A": np.int64})
  541. tm.assert_frame_equal(result, expected)
  542. def test_get_dummies_all_sparse(self):
  543. df = DataFrame({"A": [1, 2]})
  544. result = get_dummies(df, columns=["A"], sparse=True)
  545. dtype = SparseDtype("bool", 0)
  546. expected = DataFrame(
  547. {
  548. "A_1": SparseArray([1, 0], dtype=dtype),
  549. "A_2": SparseArray([0, 1], dtype=dtype),
  550. }
  551. )
  552. tm.assert_frame_equal(result, expected)
  553. @pytest.mark.parametrize("values", ["baz"])
  554. def test_get_dummies_with_string_values(self, values):
  555. # issue #28383
  556. df = DataFrame(
  557. {
  558. "bar": [1, 2, 3, 4, 5, 6],
  559. "foo": ["one", "one", "one", "two", "two", "two"],
  560. "baz": ["A", "B", "C", "A", "B", "C"],
  561. "zoo": ["x", "y", "z", "q", "w", "t"],
  562. }
  563. )
  564. msg = "Input must be a list-like for parameter `columns`"
  565. with pytest.raises(TypeError, match=msg):
  566. get_dummies(df, columns=values)
  567. def test_get_dummies_ea_dtype_series(self, any_numeric_ea_and_arrow_dtype):
  568. # GH#32430
  569. ser = Series(list("abca"))
  570. result = get_dummies(ser, dtype=any_numeric_ea_and_arrow_dtype)
  571. expected = DataFrame(
  572. {"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]},
  573. dtype=any_numeric_ea_and_arrow_dtype,
  574. )
  575. tm.assert_frame_equal(result, expected)
  576. def test_get_dummies_ea_dtype_dataframe(self, any_numeric_ea_and_arrow_dtype):
  577. # GH#32430
  578. df = DataFrame({"x": list("abca")})
  579. result = get_dummies(df, dtype=any_numeric_ea_and_arrow_dtype)
  580. expected = DataFrame(
  581. {"x_a": [1, 0, 0, 1], "x_b": [0, 1, 0, 0], "x_c": [0, 0, 1, 0]},
  582. dtype=any_numeric_ea_and_arrow_dtype,
  583. )
  584. tm.assert_frame_equal(result, expected)