test_from_dummies.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397
  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. DataFrame,
  5. Series,
  6. from_dummies,
  7. get_dummies,
  8. )
  9. import pandas._testing as tm
  10. @pytest.fixture
  11. def dummies_basic():
  12. return DataFrame(
  13. {
  14. "col1_a": [1, 0, 1],
  15. "col1_b": [0, 1, 0],
  16. "col2_a": [0, 1, 0],
  17. "col2_b": [1, 0, 0],
  18. "col2_c": [0, 0, 1],
  19. },
  20. )
  21. @pytest.fixture
  22. def dummies_with_unassigned():
  23. return DataFrame(
  24. {
  25. "col1_a": [1, 0, 0],
  26. "col1_b": [0, 1, 0],
  27. "col2_a": [0, 1, 0],
  28. "col2_b": [0, 0, 0],
  29. "col2_c": [0, 0, 1],
  30. },
  31. )
  32. def test_error_wrong_data_type():
  33. dummies = [0, 1, 0]
  34. with pytest.raises(
  35. TypeError,
  36. match=r"Expected 'data' to be a 'DataFrame'; Received 'data' of type: list",
  37. ):
  38. from_dummies(dummies)
  39. def test_error_no_prefix_contains_unassigned():
  40. dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
  41. with pytest.raises(
  42. ValueError,
  43. match=(
  44. r"Dummy DataFrame contains unassigned value\(s\); "
  45. r"First instance in row: 2"
  46. ),
  47. ):
  48. from_dummies(dummies)
  49. def test_error_no_prefix_wrong_default_category_type():
  50. dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
  51. with pytest.raises(
  52. TypeError,
  53. match=(
  54. r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
  55. r"Received 'default_category' of type: list"
  56. ),
  57. ):
  58. from_dummies(dummies, default_category=["c", "d"])
  59. def test_error_no_prefix_multi_assignment():
  60. dummies = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
  61. with pytest.raises(
  62. ValueError,
  63. match=(
  64. r"Dummy DataFrame contains multi-assignment\(s\); "
  65. r"First instance in row: 2"
  66. ),
  67. ):
  68. from_dummies(dummies)
  69. def test_error_no_prefix_contains_nan():
  70. dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]})
  71. with pytest.raises(
  72. ValueError, match=r"Dummy DataFrame contains NA value in column: 'b'"
  73. ):
  74. from_dummies(dummies)
  75. def test_error_contains_non_dummies():
  76. dummies = DataFrame(
  77. {"a": [1, 6, 3, 1], "b": [0, 1, 0, 2], "c": ["c1", "c2", "c3", "c4"]}
  78. )
  79. with pytest.raises(
  80. TypeError,
  81. match=r"Passed DataFrame contains non-dummy data",
  82. ):
  83. from_dummies(dummies)
  84. def test_error_with_prefix_multiple_seperators():
  85. dummies = DataFrame(
  86. {
  87. "col1_a": [1, 0, 1],
  88. "col1_b": [0, 1, 0],
  89. "col2-a": [0, 1, 0],
  90. "col2-b": [1, 0, 1],
  91. },
  92. )
  93. with pytest.raises(
  94. ValueError,
  95. match=(r"Separator not specified for column: col2-a"),
  96. ):
  97. from_dummies(dummies, sep="_")
  98. def test_error_with_prefix_sep_wrong_type(dummies_basic):
  99. with pytest.raises(
  100. TypeError,
  101. match=(
  102. r"Expected 'sep' to be of type 'str' or 'None'; "
  103. r"Received 'sep' of type: list"
  104. ),
  105. ):
  106. from_dummies(dummies_basic, sep=["_"])
  107. def test_error_with_prefix_contains_unassigned(dummies_with_unassigned):
  108. with pytest.raises(
  109. ValueError,
  110. match=(
  111. r"Dummy DataFrame contains unassigned value\(s\); "
  112. r"First instance in row: 2"
  113. ),
  114. ):
  115. from_dummies(dummies_with_unassigned, sep="_")
  116. def test_error_with_prefix_default_category_wrong_type(dummies_with_unassigned):
  117. with pytest.raises(
  118. TypeError,
  119. match=(
  120. r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
  121. r"Received 'default_category' of type: list"
  122. ),
  123. ):
  124. from_dummies(dummies_with_unassigned, sep="_", default_category=["x", "y"])
  125. def test_error_with_prefix_default_category_dict_not_complete(
  126. dummies_with_unassigned,
  127. ):
  128. with pytest.raises(
  129. ValueError,
  130. match=(
  131. r"Length of 'default_category' \(1\) did not match "
  132. r"the length of the columns being encoded \(2\)"
  133. ),
  134. ):
  135. from_dummies(dummies_with_unassigned, sep="_", default_category={"col1": "x"})
  136. def test_error_with_prefix_contains_nan(dummies_basic):
  137. dummies_basic.loc[2, "col2_c"] = np.nan
  138. with pytest.raises(
  139. ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'"
  140. ):
  141. from_dummies(dummies_basic, sep="_")
  142. def test_error_with_prefix_contains_non_dummies(dummies_basic):
  143. dummies_basic.loc[2, "col2_c"] = "str"
  144. with pytest.raises(TypeError, match=r"Passed DataFrame contains non-dummy data"):
  145. from_dummies(dummies_basic, sep="_")
  146. def test_error_with_prefix_double_assignment():
  147. dummies = DataFrame(
  148. {
  149. "col1_a": [1, 0, 1],
  150. "col1_b": [1, 1, 0],
  151. "col2_a": [0, 1, 0],
  152. "col2_b": [1, 0, 0],
  153. "col2_c": [0, 0, 1],
  154. },
  155. )
  156. with pytest.raises(
  157. ValueError,
  158. match=(
  159. r"Dummy DataFrame contains multi-assignment\(s\); "
  160. r"First instance in row: 0"
  161. ),
  162. ):
  163. from_dummies(dummies, sep="_")
  164. def test_roundtrip_series_to_dataframe():
  165. categories = Series(["a", "b", "c", "a"])
  166. dummies = get_dummies(categories)
  167. result = from_dummies(dummies)
  168. expected = DataFrame({"": ["a", "b", "c", "a"]})
  169. tm.assert_frame_equal(result, expected)
  170. def test_roundtrip_single_column_dataframe():
  171. categories = DataFrame({"": ["a", "b", "c", "a"]})
  172. dummies = get_dummies(categories)
  173. result = from_dummies(dummies, sep="_")
  174. expected = categories
  175. tm.assert_frame_equal(result, expected)
  176. def test_roundtrip_with_prefixes():
  177. categories = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
  178. dummies = get_dummies(categories)
  179. result = from_dummies(dummies, sep="_")
  180. expected = categories
  181. tm.assert_frame_equal(result, expected)
  182. def test_no_prefix_string_cats_basic():
  183. dummies = DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0], "c": [0, 0, 1, 0]})
  184. expected = DataFrame({"": ["a", "b", "c", "a"]})
  185. result = from_dummies(dummies)
  186. tm.assert_frame_equal(result, expected)
  187. def test_no_prefix_string_cats_basic_bool_values():
  188. dummies = DataFrame(
  189. {
  190. "a": [True, False, False, True],
  191. "b": [False, True, False, False],
  192. "c": [False, False, True, False],
  193. }
  194. )
  195. expected = DataFrame({"": ["a", "b", "c", "a"]})
  196. result = from_dummies(dummies)
  197. tm.assert_frame_equal(result, expected)
  198. def test_no_prefix_string_cats_basic_mixed_bool_values():
  199. dummies = DataFrame(
  200. {"a": [1, 0, 0, 1], "b": [False, True, False, False], "c": [0, 0, 1, 0]}
  201. )
  202. expected = DataFrame({"": ["a", "b", "c", "a"]})
  203. result = from_dummies(dummies)
  204. tm.assert_frame_equal(result, expected)
  205. def test_no_prefix_int_cats_basic():
  206. dummies = DataFrame(
  207. {1: [1, 0, 0, 0], 25: [0, 1, 0, 0], 2: [0, 0, 1, 0], 5: [0, 0, 0, 1]}
  208. )
  209. expected = DataFrame({"": [1, 25, 2, 5]}, dtype="object")
  210. result = from_dummies(dummies)
  211. tm.assert_frame_equal(result, expected)
  212. def test_no_prefix_float_cats_basic():
  213. dummies = DataFrame(
  214. {1.0: [1, 0, 0, 0], 25.0: [0, 1, 0, 0], 2.5: [0, 0, 1, 0], 5.84: [0, 0, 0, 1]}
  215. )
  216. expected = DataFrame({"": [1.0, 25.0, 2.5, 5.84]}, dtype="object")
  217. result = from_dummies(dummies)
  218. tm.assert_frame_equal(result, expected)
  219. def test_no_prefix_mixed_cats_basic():
  220. dummies = DataFrame(
  221. {
  222. 1.23: [1, 0, 0, 0, 0],
  223. "c": [0, 1, 0, 0, 0],
  224. 2: [0, 0, 1, 0, 0],
  225. False: [0, 0, 0, 1, 0],
  226. None: [0, 0, 0, 0, 1],
  227. }
  228. )
  229. expected = DataFrame({"": [1.23, "c", 2, False, None]}, dtype="object")
  230. result = from_dummies(dummies)
  231. tm.assert_frame_equal(result, expected)
  232. def test_no_prefix_string_cats_contains_get_dummies_NaN_column():
  233. dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0], "NaN": [0, 0, 1]})
  234. expected = DataFrame({"": ["a", "b", "NaN"]})
  235. result = from_dummies(dummies)
  236. tm.assert_frame_equal(result, expected)
  237. @pytest.mark.parametrize(
  238. "default_category, expected",
  239. [
  240. pytest.param(
  241. "c",
  242. DataFrame({"": ["a", "b", "c"]}),
  243. id="default_category is a str",
  244. ),
  245. pytest.param(
  246. 1,
  247. DataFrame({"": ["a", "b", 1]}),
  248. id="default_category is a int",
  249. ),
  250. pytest.param(
  251. 1.25,
  252. DataFrame({"": ["a", "b", 1.25]}),
  253. id="default_category is a float",
  254. ),
  255. pytest.param(
  256. 0,
  257. DataFrame({"": ["a", "b", 0]}),
  258. id="default_category is a 0",
  259. ),
  260. pytest.param(
  261. False,
  262. DataFrame({"": ["a", "b", False]}),
  263. id="default_category is a bool",
  264. ),
  265. pytest.param(
  266. (1, 2),
  267. DataFrame({"": ["a", "b", (1, 2)]}),
  268. id="default_category is a tuple",
  269. ),
  270. ],
  271. )
  272. def test_no_prefix_string_cats_default_category(default_category, expected):
  273. dummies = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
  274. result = from_dummies(dummies, default_category=default_category)
  275. tm.assert_frame_equal(result, expected)
  276. def test_with_prefix_basic(dummies_basic):
  277. expected = DataFrame({"col1": ["a", "b", "a"], "col2": ["b", "a", "c"]})
  278. result = from_dummies(dummies_basic, sep="_")
  279. tm.assert_frame_equal(result, expected)
  280. def test_with_prefix_contains_get_dummies_NaN_column():
  281. dummies = DataFrame(
  282. {
  283. "col1_a": [1, 0, 0],
  284. "col1_b": [0, 1, 0],
  285. "col1_NaN": [0, 0, 1],
  286. "col2_a": [0, 1, 0],
  287. "col2_b": [0, 0, 0],
  288. "col2_c": [0, 0, 1],
  289. "col2_NaN": [1, 0, 0],
  290. },
  291. )
  292. expected = DataFrame({"col1": ["a", "b", "NaN"], "col2": ["NaN", "a", "c"]})
  293. result = from_dummies(dummies, sep="_")
  294. tm.assert_frame_equal(result, expected)
  295. @pytest.mark.parametrize(
  296. "default_category, expected",
  297. [
  298. pytest.param(
  299. "x",
  300. DataFrame({"col1": ["a", "b", "x"], "col2": ["x", "a", "c"]}),
  301. id="default_category is a str",
  302. ),
  303. pytest.param(
  304. 0,
  305. DataFrame({"col1": ["a", "b", 0], "col2": [0, "a", "c"]}),
  306. id="default_category is a 0",
  307. ),
  308. pytest.param(
  309. False,
  310. DataFrame({"col1": ["a", "b", False], "col2": [False, "a", "c"]}),
  311. id="default_category is a False",
  312. ),
  313. pytest.param(
  314. {"col2": 1, "col1": 2.5},
  315. DataFrame({"col1": ["a", "b", 2.5], "col2": [1, "a", "c"]}),
  316. id="default_category is a dict with int and float values",
  317. ),
  318. pytest.param(
  319. {"col2": None, "col1": False},
  320. DataFrame({"col1": ["a", "b", False], "col2": [None, "a", "c"]}),
  321. id="default_category is a dict with bool and None values",
  322. ),
  323. pytest.param(
  324. {"col2": (1, 2), "col1": [1.25, False]},
  325. DataFrame({"col1": ["a", "b", [1.25, False]], "col2": [(1, 2), "a", "c"]}),
  326. id="default_category is a dict with list and tuple values",
  327. ),
  328. ],
  329. )
  330. def test_with_prefix_default_category(
  331. dummies_with_unassigned, default_category, expected
  332. ):
  333. result = from_dummies(
  334. dummies_with_unassigned, sep="_", default_category=default_category
  335. )
  336. tm.assert_frame_equal(result, expected)