123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165 |
- """
- Tests that duplicate columns are handled appropriately when parsed by the
- CSV engine. In general, the expected result is that they are either thoroughly
- de-duplicated (if mangling requested) or ignored otherwise.
- """
- from io import StringIO
- import pytest
- from pandas import DataFrame
- import pandas._testing as tm
- skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
- @skip_pyarrow
- def test_basic(all_parsers):
- parser = all_parsers
- data = "a,a,b,b,b\n1,2,3,4,5"
- result = parser.read_csv(StringIO(data), sep=",")
- expected = DataFrame([[1, 2, 3, 4, 5]], columns=["a", "a.1", "b", "b.1", "b.2"])
- tm.assert_frame_equal(result, expected)
- @skip_pyarrow
- def test_basic_names(all_parsers):
- # See gh-7160
- parser = all_parsers
- data = "a,b,a\n0,1,2\n3,4,5"
- expected = DataFrame([[0, 1, 2], [3, 4, 5]], columns=["a", "b", "a.1"])
- result = parser.read_csv(StringIO(data))
- tm.assert_frame_equal(result, expected)
- def test_basic_names_raise(all_parsers):
- # See gh-7160
- parser = all_parsers
- data = "0,1,2\n3,4,5"
- with pytest.raises(ValueError, match="Duplicate names"):
- parser.read_csv(StringIO(data), names=["a", "b", "a"])
- @skip_pyarrow
- @pytest.mark.parametrize(
- "data,expected",
- [
- ("a,a,a.1\n1,2,3", DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"])),
- (
- "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
- DataFrame(
- [[1, 2, 3, 4, 5, 6]],
- columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
- ),
- ),
- (
- "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
- DataFrame(
- [[1, 2, 3, 4, 5, 6, 7]],
- columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"],
- ),
- ),
- ],
- )
- def test_thorough_mangle_columns(all_parsers, data, expected):
- # see gh-17060
- parser = all_parsers
- result = parser.read_csv(StringIO(data))
- tm.assert_frame_equal(result, expected)
- @skip_pyarrow
- @pytest.mark.parametrize(
- "data,names,expected",
- [
- (
- "a,b,b\n1,2,3",
- ["a.1", "a.1", "a.1.1"],
- DataFrame(
- [["a", "b", "b"], ["1", "2", "3"]], columns=["a.1", "a.1.1", "a.1.1.1"]
- ),
- ),
- (
- "a,b,c,d,e,f\n1,2,3,4,5,6",
- ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
- DataFrame(
- [["a", "b", "c", "d", "e", "f"], ["1", "2", "3", "4", "5", "6"]],
- columns=["a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1", "a.1.1.1.1.1"],
- ),
- ),
- (
- "a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
- ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
- DataFrame(
- [
- ["a", "b", "c", "d", "e", "f", "g"],
- ["1", "2", "3", "4", "5", "6", "7"],
- ],
- columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
- ),
- ),
- ],
- )
- def test_thorough_mangle_names(all_parsers, data, names, expected):
- # see gh-17095
- parser = all_parsers
- with pytest.raises(ValueError, match="Duplicate names"):
- parser.read_csv(StringIO(data), names=names)
- @skip_pyarrow
- def test_mangled_unnamed_placeholders(all_parsers):
- # xref gh-13017
- orig_key = "0"
- parser = all_parsers
- orig_value = [1, 2, 3]
- df = DataFrame({orig_key: orig_value})
- # This test recursively updates `df`.
- for i in range(3):
- expected = DataFrame()
- for j in range(i + 1):
- col_name = "Unnamed: 0" + f".{1*j}" * min(j, 1)
- expected.insert(loc=0, column=col_name, value=[0, 1, 2])
- expected[orig_key] = orig_value
- df = parser.read_csv(StringIO(df.to_csv()))
- tm.assert_frame_equal(df, expected)
- @skip_pyarrow
- def test_mangle_dupe_cols_already_exists(all_parsers):
- # GH#14704
- parser = all_parsers
- data = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7"
- result = parser.read_csv(StringIO(data))
- expected = DataFrame(
- [[1, 2, 3, 4, 5, 6, 7]],
- columns=["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"],
- )
- tm.assert_frame_equal(result, expected)
- @skip_pyarrow
- def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
- # GH#14704
- parser = all_parsers
- data = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4"
- result = parser.read_csv(StringIO(data))
- expected = DataFrame(
- [[1, 2, 3, 4]],
- columns=["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"],
- )
- tm.assert_frame_equal(result, expected)
|