test_dialect.py 4.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
"""
Tests that dialects are properly handled during parsing
for all of the parsers defined in parsers.py
"""
  5. import csv
  6. from io import StringIO
  7. import pytest
  8. from pandas.errors import ParserWarning
  9. from pandas import DataFrame
  10. import pandas._testing as tm
# Module-level mark: every test in this file requests the "pyarrow_skip"
# fixture. The fixture is defined outside this file; presumably it skips
# the pyarrow engine -- TODO confirm in conftest.
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
  12. @pytest.fixture
  13. def custom_dialect():
  14. dialect_name = "weird"
  15. dialect_kwargs = {
  16. "doublequote": False,
  17. "escapechar": "~",
  18. "delimiter": ":",
  19. "skipinitialspace": False,
  20. "quotechar": "~",
  21. "quoting": 3,
  22. }
  23. return dialect_name, dialect_kwargs
  24. def test_dialect(all_parsers):
  25. parser = all_parsers
  26. data = """\
  27. label1,label2,label3
  28. index1,"a,c,e
  29. index2,b,d,f
  30. """
  31. dia = csv.excel()
  32. dia.quoting = csv.QUOTE_NONE
  33. df = parser.read_csv(StringIO(data), dialect=dia)
  34. data = """\
  35. label1,label2,label3
  36. index1,a,c,e
  37. index2,b,d,f
  38. """
  39. exp = parser.read_csv(StringIO(data))
  40. exp.replace("a", '"a', inplace=True)
  41. tm.assert_frame_equal(df, exp)
  42. def test_dialect_str(all_parsers):
  43. dialect_name = "mydialect"
  44. parser = all_parsers
  45. data = """\
  46. fruit:vegetable
  47. apple:broccoli
  48. pear:tomato
  49. """
  50. exp = DataFrame({"fruit": ["apple", "pear"], "vegetable": ["broccoli", "tomato"]})
  51. with tm.with_csv_dialect(dialect_name, delimiter=":"):
  52. df = parser.read_csv(StringIO(data), dialect=dialect_name)
  53. tm.assert_frame_equal(df, exp)
  54. def test_invalid_dialect(all_parsers):
  55. class InvalidDialect:
  56. pass
  57. data = "a\n1"
  58. parser = all_parsers
  59. msg = "Invalid dialect"
  60. with pytest.raises(ValueError, match=msg):
  61. parser.read_csv(StringIO(data), dialect=InvalidDialect)
  62. @pytest.mark.parametrize(
  63. "arg",
  64. [None, "doublequote", "escapechar", "skipinitialspace", "quotechar", "quoting"],
  65. )
  66. @pytest.mark.parametrize("value", ["dialect", "default", "other"])
  67. def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value):
  68. # see gh-23761.
  69. dialect_name, dialect_kwargs = custom_dialect
  70. parser = all_parsers
  71. expected = DataFrame({"a": [1], "b": [2]})
  72. data = "a:b\n1:2"
  73. warning_klass = None
  74. kwds = {}
  75. # arg=None tests when we pass in the dialect without any other arguments.
  76. if arg is not None:
  77. if value == "dialect": # No conflict --> no warning.
  78. kwds[arg] = dialect_kwargs[arg]
  79. elif value == "default": # Default --> no warning.
  80. from pandas.io.parsers.base_parser import parser_defaults
  81. kwds[arg] = parser_defaults[arg]
  82. else: # Non-default + conflict with dialect --> warning.
  83. warning_klass = ParserWarning
  84. kwds[arg] = "blah"
  85. with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
  86. result = parser.read_csv_check_warnings(
  87. warning_klass,
  88. "Conflicting values for",
  89. StringIO(data),
  90. dialect=dialect_name,
  91. **kwds,
  92. )
  93. tm.assert_frame_equal(result, expected)
  94. @pytest.mark.parametrize(
  95. "kwargs,warning_klass",
  96. [
  97. ({"sep": ","}, None), # sep is default --> sep_override=True
  98. ({"sep": "."}, ParserWarning), # sep isn't default --> sep_override=False
  99. ({"delimiter": ":"}, None), # No conflict
  100. ({"delimiter": None}, None), # Default arguments --> sep_override=True
  101. ({"delimiter": ","}, ParserWarning), # Conflict
  102. ({"delimiter": "."}, ParserWarning), # Conflict
  103. ],
  104. ids=[
  105. "sep-override-true",
  106. "sep-override-false",
  107. "delimiter-no-conflict",
  108. "delimiter-default-arg",
  109. "delimiter-conflict",
  110. "delimiter-conflict2",
  111. ],
  112. )
  113. def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass):
  114. # see gh-23761.
  115. dialect_name, dialect_kwargs = custom_dialect
  116. parser = all_parsers
  117. expected = DataFrame({"a": [1], "b": [2]})
  118. data = "a:b\n1:2"
  119. with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
  120. result = parser.read_csv_check_warnings(
  121. warning_klass,
  122. "Conflicting values for 'delimiter'",
  123. StringIO(data),
  124. dialect=dialect_name,
  125. **kwargs,
  126. )
  127. tm.assert_frame_equal(result, expected)