test_mangle_dupes.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165
  1. """
  2. Tests that duplicate columns are handled appropriately when parsed by the
  3. CSV engine. In general, the expected result is that they are either thoroughly
  4. de-duplicated (if mangling requested) or ignored otherwise.
  5. """
  6. from io import StringIO
  7. import pytest
  8. from pandas import DataFrame
  9. import pandas._testing as tm
  # Marker that requests the "pyarrow_skip" fixture for every test it decorates.
  # NOTE(review): the fixture itself is not defined in this file; presumably it
  # skips the test for the pyarrow engine -- confirm against the shared conftest.
  10. skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
  @skip_pyarrow
  def test_basic(all_parsers):
      """Repeated header labels are expected to come back with ``.N`` suffixes."""
      csv_text = "a,a,b,b,b\n1,2,3,4,5"
      mangled_columns = ["a", "a.1", "b", "b.1", "b.2"]
      expected = DataFrame([[1, 2, 3, 4, 5]], columns=mangled_columns)

      result = all_parsers.read_csv(StringIO(csv_text), sep=",")

      tm.assert_frame_equal(result, expected)
  @skip_pyarrow
  def test_basic_names(all_parsers):
      """A header with a repeated ``a`` is expected to parse as ``a``, ``b``, ``a.1``.

      See gh-7160.
      """
      csv_text = "a,b,a\n0,1,2\n3,4,5"
      rows = [[0, 1, 2], [3, 4, 5]]
      expected = DataFrame(rows, columns=["a", "b", "a.1"])

      result = all_parsers.read_csv(StringIO(csv_text))

      tm.assert_frame_equal(result, expected)
  def test_basic_names_raise(all_parsers):
      """Duplicate labels passed via ``names`` are expected to raise ``ValueError``.

      See gh-7160.
      """
      headerless_csv = StringIO("0,1,2\n3,4,5")
      duplicate_names = ["a", "b", "a"]

      with pytest.raises(ValueError, match="Duplicate names"):
          all_parsers.read_csv(headerless_csv, names=duplicate_names)
  @skip_pyarrow
  @pytest.mark.parametrize(
      "data,expected",
      [
          # "a.1" already appears in the header, so the repeated "a" is
          # expected to be renamed to "a.2" instead.
          (
              "a,a,a.1\n1,2,3",
              DataFrame([[1, 2, 3]], columns=["a", "a.2", "a.1"]),
          ),
          # Labels that already carry dotted suffixes are expected to be kept
          # unchanged; only the repeated "a" is renamed.
          (
              "a,a,a.1,a.1.1,a.1.1.1,a.1.1.1.1\n1,2,3,4,5,6",
              DataFrame(
                  [[1, 2, 3, 4, 5, 6]],
                  columns=["a", "a.2", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
              ),
          ),
          # "a.1", "a.2" and "a.3" are all present, so the three repeated "a"
          # labels are expected to become "a.4", "a.5" and "a.6".
          (
              "a,a,a.3,a.1,a.2,a,a\n1,2,3,4,5,6,7",
              DataFrame(
                  [[1, 2, 3, 4, 5, 6, 7]],
                  columns=["a", "a.4", "a.3", "a.1", "a.2", "a.5", "a.6"],
              ),
          ),
      ],
  )
  def test_thorough_mangle_columns(all_parsers, data, expected):
      """Parse each header and compare against the expected de-duplicated labels.

      See gh-17060.
      """
      parsed = all_parsers.read_csv(StringIO(data))
      tm.assert_frame_equal(parsed, expected)
  @skip_pyarrow
  @pytest.mark.parametrize(
      "data,names,expected",
      [
          (
              "a,b,b\n1,2,3",
              ["a.1", "a.1", "a.1.1"],
              DataFrame(
                  [["a", "b", "b"], ["1", "2", "3"]],
                  columns=["a.1", "a.1.1", "a.1.1.1"],
              ),
          ),
          (
              "a,b,c,d,e,f\n1,2,3,4,5,6",
              ["a", "a", "a.1", "a.1.1", "a.1.1.1", "a.1.1.1.1"],
              DataFrame(
                  [
                      ["a", "b", "c", "d", "e", "f"],
                      ["1", "2", "3", "4", "5", "6"],
                  ],
                  columns=[
                      "a",
                      "a.1",
                      "a.1.1",
                      "a.1.1.1",
                      "a.1.1.1.1",
                      "a.1.1.1.1.1",
                  ],
              ),
          ),
          (
              "a,b,c,d,e,f,g\n1,2,3,4,5,6,7",
              ["a", "a", "a.3", "a.1", "a.2", "a", "a"],
              DataFrame(
                  [
                      ["a", "b", "c", "d", "e", "f", "g"],
                      ["1", "2", "3", "4", "5", "6", "7"],
                  ],
                  columns=["a", "a.1", "a.3", "a.1.1", "a.2", "a.2.1", "a.3.1"],
              ),
          ),
      ],
  )
  def test_thorough_mangle_names(all_parsers, data, names, expected):
      """Every parametrized ``names`` list contains duplicates and must raise.

      See gh-17095.

      NOTE: ``expected`` is supplied by the parametrization but is not compared
      here; the body only checks that ``read_csv`` raises ``ValueError``.
      """
      buffer = StringIO(data)

      with pytest.raises(ValueError, match="Duplicate names"):
          all_parsers.read_csv(buffer, names=names)
  @skip_pyarrow
  def test_mangled_unnamed_placeholders(all_parsers):
      """Round-trip a frame through ``to_csv``/``read_csv`` three times.

      After round ``i`` the parsed frame is expected to hold ``i + 1``
      "Unnamed: 0" placeholder columns (the first bare, the rest suffixed
      ``.1``, ``.2``, ...) ahead of the original ``"0"`` column.

      xref gh-13017
      """
      orig_key = "0"
      orig_value = [1, 2, 3]
      df = DataFrame({orig_key: orig_value})

      # Each round re-parses the CSV written from the previous round's result,
      # so ``df`` is updated cumulatively.
      for round_no in range(3):
          expected = DataFrame()

          for dup in range(round_no + 1):
              # The first placeholder has no suffix; later ones get ".<dup>".
              suffix = f".{dup}" if dup else ""
              # Inserting at position 0 puts the newest placeholder first.
              expected.insert(loc=0, column="Unnamed: 0" + suffix, value=[0, 1, 2])

          expected[orig_key] = orig_value

          df = all_parsers.read_csv(StringIO(df.to_csv()))
          tm.assert_frame_equal(df, expected)
  @skip_pyarrow
  def test_mangle_dupe_cols_already_exists(all_parsers):
      """Renamed duplicates are expected to avoid labels already in the header.

      GH#14704
      """
      csv_text = "a,a,a.1,a,a.3,a.1,a.1.1\n1,2,3,4,5,6,7"
      # "a.1", "a.3" and "a.1.1" are taken, so the repeats of "a" are expected
      # to become "a.2" and "a.4", and the repeat of "a.1" to become "a.1.2".
      mangled_columns = ["a", "a.2", "a.1", "a.4", "a.3", "a.1.2", "a.1.1"]
      expected = DataFrame([[1, 2, 3, 4, 5, 6, 7]], columns=mangled_columns)

      result = all_parsers.read_csv(StringIO(csv_text))

      tm.assert_frame_equal(result, expected)
  @skip_pyarrow
  def test_mangle_dupe_cols_already_exists_unnamed_col(all_parsers):
      """Blank header cells are expected not to clash with literal "Unnamed: N" labels.

      GH#14704
      """
      csv_text = ",Unnamed: 0,,Unnamed: 2\n1,2,3,4"
      # The blank cells at positions 0 and 2 are expected to receive ".1"
      # suffixes because "Unnamed: 0" and "Unnamed: 2" already appear literally.
      mangled_columns = ["Unnamed: 0.1", "Unnamed: 0", "Unnamed: 2.1", "Unnamed: 2"]
      expected = DataFrame([[1, 2, 3, 4]], columns=mangled_columns)

      result = all_parsers.read_csv(StringIO(csv_text))

      tm.assert_frame_equal(result, expected)