123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397 |
- import numpy as np
- import pytest
- from pandas import (
- DataFrame,
- Series,
- from_dummies,
- get_dummies,
- )
- import pandas._testing as tm
@pytest.fixture
def dummies_basic():
    """Return a dummy-encoded frame with the prefixes ``col1`` and ``col2``.

    Every row carries exactly one ``1`` within each prefix group.
    """
    encoded_columns = {
        "col1_a": [1, 0, 1],
        "col1_b": [0, 1, 0],
        "col2_a": [0, 1, 0],
        "col2_b": [1, 0, 0],
        "col2_c": [0, 0, 1],
    }
    return DataFrame(encoded_columns)
@pytest.fixture
def dummies_with_unassigned():
    """Return a dummy-encoded frame in which some rows have no category set.

    Row 2 is all zeros across the ``col1`` columns and row 0 is all zeros
    across the ``col2`` columns.
    """
    encoded_columns = {
        "col1_a": [1, 0, 0],
        "col1_b": [0, 1, 0],
        "col2_a": [0, 1, 0],
        "col2_b": [0, 0, 0],
        "col2_c": [0, 0, 1],
    }
    return DataFrame(encoded_columns)
def test_error_wrong_data_type():
    """Passing a plain list as ``data`` is expected to raise ``TypeError``."""
    msg = r"Expected 'data' to be a 'DataFrame'; Received 'data' of type: list"
    not_a_frame = [0, 1, 0]
    with pytest.raises(TypeError, match=msg):
        from_dummies(not_a_frame)
def test_error_no_prefix_contains_unassigned():
    """An all-zero row (row 2) without a default category is expected to raise."""
    msg = (
        r"Dummy DataFrame contains unassigned value\(s\); "
        r"First instance in row: 2"
    )
    encoded = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
    with pytest.raises(ValueError, match=msg):
        from_dummies(encoded)
def test_error_no_prefix_wrong_default_category_type():
    """A list passed as ``default_category`` is expected to raise ``TypeError``."""
    msg = (
        r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
        r"Received 'default_category' of type: list"
    )
    encoded = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
    with pytest.raises(TypeError, match=msg):
        from_dummies(encoded, default_category=["c", "d"])
def test_error_no_prefix_multi_assignment():
    """A row with two categories set (row 2) is expected to raise ``ValueError``."""
    msg = (
        r"Dummy DataFrame contains multi-assignment\(s\); "
        r"First instance in row: 2"
    )
    encoded = DataFrame({"a": [1, 0, 1], "b": [0, 1, 1]})
    with pytest.raises(ValueError, match=msg):
        from_dummies(encoded)
def test_error_no_prefix_contains_nan():
    """A NaN inside column ``b`` is expected to raise ``ValueError``."""
    msg = r"Dummy DataFrame contains NA value in column: 'b'"
    encoded = DataFrame({"a": [1, 0, 0], "b": [0, 1, np.nan]})
    with pytest.raises(ValueError, match=msg):
        from_dummies(encoded)
def test_error_contains_non_dummies():
    """A frame holding non-0/1 integers and strings is expected to raise."""
    msg = r"Passed DataFrame contains non-dummy data"
    not_dummies = DataFrame(
        {
            "a": [1, 6, 3, 1],
            "b": [0, 1, 0, 2],
            "c": ["c1", "c2", "c3", "c4"],
        }
    )
    with pytest.raises(TypeError, match=msg):
        from_dummies(not_dummies)
def test_error_with_prefix_multiple_seperators():
    """Columns that use ``-`` while ``sep="_"`` is given are expected to raise."""
    msg = r"Separator not specified for column: col2-a"
    mixed_separators = {
        "col1_a": [1, 0, 1],
        "col1_b": [0, 1, 0],
        "col2-a": [0, 1, 0],
        "col2-b": [1, 0, 1],
    }
    encoded = DataFrame(mixed_separators)
    with pytest.raises(ValueError, match=msg):
        from_dummies(encoded, sep="_")
def test_error_with_prefix_sep_wrong_type(dummies_basic):
    """A list passed as ``sep`` is expected to raise ``TypeError``."""
    msg = (
        r"Expected 'sep' to be of type 'str' or 'None'; "
        r"Received 'sep' of type: list"
    )
    with pytest.raises(TypeError, match=msg):
        from_dummies(dummies_basic, sep=["_"])
def test_error_with_prefix_contains_unassigned(dummies_with_unassigned):
    """Unassigned rows without a default category are expected to raise."""
    msg = (
        r"Dummy DataFrame contains unassigned value\(s\); "
        r"First instance in row: 2"
    )
    with pytest.raises(ValueError, match=msg):
        from_dummies(dummies_with_unassigned, sep="_")
def test_error_with_prefix_default_category_wrong_type(dummies_with_unassigned):
    """A list ``default_category`` with prefixed columns is expected to raise."""
    msg = (
        r"Expected 'default_category' to be of type 'None', 'Hashable', or 'dict'; "
        r"Received 'default_category' of type: list"
    )
    with pytest.raises(TypeError, match=msg):
        from_dummies(dummies_with_unassigned, sep="_", default_category=["x", "y"])
def test_error_with_prefix_default_category_dict_not_complete(
    dummies_with_unassigned,
):
    """A ``default_category`` dict with one key for two prefixes should raise."""
    msg = (
        r"Length of 'default_category' \(1\) did not match "
        r"the length of the columns being encoded \(2\)"
    )
    incomplete_defaults = {"col1": "x"}
    with pytest.raises(ValueError, match=msg):
        from_dummies(
            dummies_with_unassigned, sep="_", default_category=incomplete_defaults
        )
def test_error_with_prefix_contains_nan(dummies_basic):
    """Check that a NaN in a prefixed dummy column raises ``ValueError``.

    Parameters
    ----------
    dummies_basic : DataFrame
        Fixture frame; its ``col2_c`` column is modified in place here.
    """
    # Cast to float64 first so that writing NaN does not rely on an implicit
    # upcast of the integer column during setitem (deprecated by PDEP-6).
    dummies_basic["col2_c"] = dummies_basic["col2_c"].astype("float64")
    dummies_basic.loc[2, "col2_c"] = np.nan
    with pytest.raises(
        ValueError, match=r"Dummy DataFrame contains NA value in column: 'col2_c'"
    ):
        from_dummies(dummies_basic, sep="_")
def test_error_with_prefix_contains_non_dummies(dummies_basic):
    """Check that a string in a prefixed dummy column raises ``TypeError``.

    Parameters
    ----------
    dummies_basic : DataFrame
        Fixture frame; its ``col2_c`` column is modified in place here.
    """
    # Cast to object first so that writing a string does not rely on an
    # implicit upcast of the integer column during setitem (deprecated by PDEP-6).
    dummies_basic["col2_c"] = dummies_basic["col2_c"].astype(object)
    dummies_basic.loc[2, "col2_c"] = "str"
    with pytest.raises(TypeError, match=r"Passed DataFrame contains non-dummy data"):
        from_dummies(dummies_basic, sep="_")
def test_error_with_prefix_double_assignment():
    """Two ``col1`` categories set in row 0 are expected to raise ``ValueError``."""
    msg = (
        r"Dummy DataFrame contains multi-assignment\(s\); "
        r"First instance in row: 0"
    )
    doubly_assigned = {
        "col1_a": [1, 0, 1],
        "col1_b": [1, 1, 0],
        "col2_a": [0, 1, 0],
        "col2_b": [1, 0, 0],
        "col2_c": [0, 0, 1],
    }
    encoded = DataFrame(doubly_assigned)
    with pytest.raises(ValueError, match=msg):
        from_dummies(encoded, sep="_")
def test_roundtrip_series_to_dataframe():
    """Encode a Series with ``get_dummies`` and decode it with ``from_dummies``.

    The decoded result is compared against a one-column frame named ``""``.
    """
    labels = ["a", "b", "c", "a"]
    encoded = get_dummies(Series(labels))
    decoded = from_dummies(encoded)
    tm.assert_frame_equal(decoded, DataFrame({"": labels}))
def test_roundtrip_single_column_dataframe():
    """Round-trip a one-column frame named ``""`` through both functions."""
    original = DataFrame({"": ["a", "b", "c", "a"]})
    decoded = from_dummies(get_dummies(original), sep="_")
    tm.assert_frame_equal(decoded, original)
def test_roundtrip_with_prefixes():
    """Round-trip a two-column frame through ``get_dummies``/``from_dummies``."""
    original = DataFrame(
        {
            "col1": ["a", "b", "a"],
            "col2": ["b", "a", "c"],
        }
    )
    decoded = from_dummies(get_dummies(original), sep="_")
    tm.assert_frame_equal(decoded, original)
def test_no_prefix_string_cats_basic():
    """Decode unprefixed 0/1 columns whose labels are strings."""
    encoded = DataFrame(
        {
            "a": [1, 0, 0, 1],
            "b": [0, 1, 0, 0],
            "c": [0, 0, 1, 0],
        }
    )
    decoded = from_dummies(encoded)
    tm.assert_frame_equal(decoded, DataFrame({"": ["a", "b", "c", "a"]}))
def test_no_prefix_string_cats_basic_bool_values():
    """Decode unprefixed dummy columns whose values are booleans."""
    bool_columns = {
        "a": [True, False, False, True],
        "b": [False, True, False, False],
        "c": [False, False, True, False],
    }
    decoded = from_dummies(DataFrame(bool_columns))
    tm.assert_frame_equal(decoded, DataFrame({"": ["a", "b", "c", "a"]}))
def test_no_prefix_string_cats_basic_mixed_bool_values():
    """Decode unprefixed dummies that mix integer and boolean columns."""
    mixed_columns = {
        "a": [1, 0, 0, 1],
        "b": [False, True, False, False],
        "c": [0, 0, 1, 0],
    }
    decoded = from_dummies(DataFrame(mixed_columns))
    tm.assert_frame_equal(decoded, DataFrame({"": ["a", "b", "c", "a"]}))
def test_no_prefix_int_cats_basic():
    """Decode unprefixed dummies whose column labels are integers.

    The expected frame is built with ``dtype="object"``.
    """
    int_labelled = {
        1: [1, 0, 0, 0],
        25: [0, 1, 0, 0],
        2: [0, 0, 1, 0],
        5: [0, 0, 0, 1],
    }
    decoded = from_dummies(DataFrame(int_labelled))
    wanted = DataFrame({"": [1, 25, 2, 5]}, dtype="object")
    tm.assert_frame_equal(decoded, wanted)
def test_no_prefix_float_cats_basic():
    """Decode unprefixed dummies whose column labels are floats.

    The expected frame is built with ``dtype="object"``.
    """
    float_labelled = {
        1.0: [1, 0, 0, 0],
        25.0: [0, 1, 0, 0],
        2.5: [0, 0, 1, 0],
        5.84: [0, 0, 0, 1],
    }
    decoded = from_dummies(DataFrame(float_labelled))
    wanted = DataFrame({"": [1.0, 25.0, 2.5, 5.84]}, dtype="object")
    tm.assert_frame_equal(decoded, wanted)
def test_no_prefix_mixed_cats_basic():
    """Decode unprefixed dummies whose labels are float, str, int, bool and None."""
    mixed_labelled = DataFrame(
        {
            1.23: [1, 0, 0, 0, 0],
            "c": [0, 1, 0, 0, 0],
            2: [0, 0, 1, 0, 0],
            False: [0, 0, 0, 1, 0],
            None: [0, 0, 0, 0, 1],
        }
    )
    decoded = from_dummies(mixed_labelled)
    wanted = DataFrame({"": [1.23, "c", 2, False, None]}, dtype="object")
    tm.assert_frame_equal(decoded, wanted)
def test_no_prefix_string_cats_contains_get_dummies_NaN_column():
    """A column labelled ``"NaN"`` is expected to decode to the string ``"NaN"``."""
    encoded = DataFrame(
        {
            "a": [1, 0, 0],
            "b": [0, 1, 0],
            "NaN": [0, 0, 1],
        }
    )
    decoded = from_dummies(encoded)
    tm.assert_frame_equal(decoded, DataFrame({"": ["a", "b", "NaN"]}))
@pytest.mark.parametrize(
    "default_category, expected",
    [
        # Each case expects the unassigned third row to take the default value.
        pytest.param(fill, DataFrame({"": ["a", "b", fill]}), id=case_id)
        for fill, case_id in [
            ("c", "default_category is a str"),
            (1, "default_category is a int"),
            (1.25, "default_category is a float"),
            (0, "default_category is a 0"),
            (False, "default_category is a bool"),
            ((1, 2), "default_category is a tuple"),
        ]
    ],
)
def test_no_prefix_string_cats_default_category(default_category, expected):
    """Decode a frame with an all-zero last row using ``default_category``."""
    encoded = DataFrame({"a": [1, 0, 0], "b": [0, 1, 0]})
    decoded = from_dummies(encoded, default_category=default_category)
    tm.assert_frame_equal(decoded, expected)
def test_with_prefix_basic(dummies_basic):
    """Decode the ``dummies_basic`` fixture using ``"_"`` as separator."""
    decoded = from_dummies(dummies_basic, sep="_")
    wanted = DataFrame(
        {
            "col1": ["a", "b", "a"],
            "col2": ["b", "a", "c"],
        }
    )
    tm.assert_frame_equal(decoded, wanted)
def test_with_prefix_contains_get_dummies_NaN_column():
    """Prefixed ``*_NaN`` columns are expected to decode to the string ``"NaN"``."""
    encoded_columns = {
        "col1_a": [1, 0, 0],
        "col1_b": [0, 1, 0],
        "col1_NaN": [0, 0, 1],
        "col2_a": [0, 1, 0],
        "col2_b": [0, 0, 0],
        "col2_c": [0, 0, 1],
        "col2_NaN": [1, 0, 0],
    }
    decoded = from_dummies(DataFrame(encoded_columns), sep="_")
    wanted = DataFrame(
        {
            "col1": ["a", "b", "NaN"],
            "col2": ["NaN", "a", "c"],
        }
    )
    tm.assert_frame_equal(decoded, wanted)
@pytest.mark.parametrize(
    "default_category, expected",
    [
        # col1_fill / col2_fill are the values expected in the unassigned
        # slots: the last row of col1 and the first row of col2.
        pytest.param(
            default,
            DataFrame({"col1": ["a", "b", col1_fill], "col2": [col2_fill, "a", "c"]}),
            id=case_id,
        )
        for default, col1_fill, col2_fill, case_id in [
            ("x", "x", "x", "default_category is a str"),
            (0, 0, 0, "default_category is a 0"),
            (False, False, False, "default_category is a False"),
            (
                {"col2": 1, "col1": 2.5},
                2.5,
                1,
                "default_category is a dict with int and float values",
            ),
            (
                {"col2": None, "col1": False},
                False,
                None,
                "default_category is a dict with bool and None values",
            ),
            (
                {"col2": (1, 2), "col1": [1.25, False]},
                [1.25, False],
                (1, 2),
                "default_category is a dict with list and tuple values",
            ),
        ]
    ],
)
def test_with_prefix_default_category(
    dummies_with_unassigned, default_category, expected
):
    """Decode the unassigned fixture with scalar or per-prefix default categories."""
    decoded = from_dummies(
        dummies_with_unassigned, sep="_", default_category=default_category
    )
    tm.assert_frame_equal(decoded, expected)
|