123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156 |
- """
- Tests that dialects are properly handled during parsing
- for all of the parsers defined in parsers.py
- """
- import csv
- from io import StringIO
- import pytest
- from pandas.errors import ParserWarning
- from pandas import DataFrame
- import pandas._testing as tm
# Module-level pytest marker: every test in this file requests the
# "pyarrow_skip" fixture.
# NOTE(review): the fixture is defined outside this file; presumably it skips
# the pyarrow engine -- confirm against the conftest.
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
@pytest.fixture
def custom_dialect():
    """
    Provide the name and settings of an unusual CSV dialect.

    Returns
    -------
    tuple of (str, dict)
        The dialect name ``"weird"`` and the keyword arguments that
        describe the dialect.
    """
    name = "weird"
    settings = dict(
        doublequote=False,
        escapechar="~",
        delimiter=":",
        skipinitialspace=False,
        quotechar="~",
        quoting=csv.QUOTE_NONE,  # the integer 3
    )
    return name, settings
def test_dialect(all_parsers):
    """
    Check that a ``csv.Dialect`` instance passed as ``dialect`` is honored.

    The dialect has quoting disabled, so the lone double quote in the data
    is expected to come through as part of the value.
    """
    parser = all_parsers

    # Excel dialect with quoting switched off.
    unquoted_dialect = csv.excel()
    unquoted_dialect.quoting = csv.QUOTE_NONE

    quoted_text = (
        "label1,label2,label3\n"
        'index1,"a,c,e\n'
        "index2,b,d,f\n"
    )
    result = parser.read_csv(StringIO(quoted_text), dialect=unquoted_dialect)

    # Expected frame: parse the same data without the quote character, then
    # put the quote back into the affected value.
    plain_text = (
        "label1,label2,label3\n"
        "index1,a,c,e\n"
        "index2,b,d,f\n"
    )
    expected = parser.read_csv(StringIO(plain_text))
    expected = expected.replace("a", '"a')

    tm.assert_frame_equal(result, expected)
def test_dialect_str(all_parsers):
    """
    Check that ``dialect`` may be given as a dialect name (a string).

    The name is used inside the ``tm.with_csv_dialect`` context, which is
    entered with ``delimiter=":"``.
    """
    parser = all_parsers
    registered_name = "mydialect"
    colon_text = "fruit:vegetable\napple:broccoli\npear:tomato\n"
    expected = DataFrame(
        {
            "fruit": ["apple", "pear"],
            "vegetable": ["broccoli", "tomato"],
        }
    )

    with tm.with_csv_dialect(registered_name, delimiter=":"):
        result = parser.read_csv(StringIO(colon_text), dialect=registered_name)
        tm.assert_frame_equal(result, expected)
def test_invalid_dialect(all_parsers):
    """
    Check that an object without any dialect attributes is rejected.

    Passing an empty class as ``dialect`` must raise ``ValueError`` whose
    message matches "Invalid dialect".
    """

    class InvalidDialect:
        pass

    parser = all_parsers
    buffer = StringIO("a\n1")

    with pytest.raises(ValueError, match="Invalid dialect"):
        parser.read_csv(buffer, dialect=InvalidDialect)
@pytest.mark.parametrize(
    "arg",
    [
        None,
        "doublequote",
        "escapechar",
        "skipinitialspace",
        "quotechar",
        "quoting",
    ],
)
@pytest.mark.parametrize("value", ["dialect", "default", "other"])
def test_dialect_conflict_except_delimiter(all_parsers, custom_dialect, arg, value):
    """
    Check warnings when a non-delimiter keyword accompanies a dialect.

    See gh-23761.

    Parameters
    ----------
    all_parsers : fixture
        Parser under test.
    custom_dialect : fixture
        ``(name, kwargs)`` pair describing the dialect to use.
    arg : str or None
        Keyword passed next to ``dialect``; ``None`` means only the dialect
        is passed.
    value : str
        Which value ``arg`` receives: the dialect's own value
        (``"dialect"``), the parser default (``"default"``), or a
        conflicting placeholder (anything else).
    """
    dialect_name, dialect_kwargs = custom_dialect
    parser = all_parsers
    text = "a:b\n1:2"
    expected = DataFrame({"a": [1], "b": [2]})

    expected_warning = None
    overrides = {}

    # With arg=None nothing is added: the dialect is passed on its own.
    if arg is not None and value == "dialect":
        # No conflict --> no warning.
        overrides[arg] = dialect_kwargs[arg]
    elif arg is not None and value == "default":
        # Default --> no warning.
        from pandas.io.parsers.base_parser import parser_defaults

        overrides[arg] = parser_defaults[arg]
    elif arg is not None:
        # Non-default + conflict with dialect --> warning.
        expected_warning = ParserWarning
        overrides[arg] = "blah"

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        result = parser.read_csv_check_warnings(
            expected_warning,
            "Conflicting values for",
            StringIO(text),
            dialect=dialect_name,
            **overrides,
        )
        tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "kwargs,warning_klass",
    [
        # sep is default --> sep_override=True
        pytest.param({"sep": ","}, None, id="sep-override-true"),
        # sep isn't default --> sep_override=False
        pytest.param({"sep": "."}, ParserWarning, id="sep-override-false"),
        # No conflict
        pytest.param({"delimiter": ":"}, None, id="delimiter-no-conflict"),
        # Default arguments --> sep_override=True
        pytest.param({"delimiter": None}, None, id="delimiter-default-arg"),
        # Conflict
        pytest.param({"delimiter": ","}, ParserWarning, id="delimiter-conflict"),
        # Conflict
        pytest.param({"delimiter": "."}, ParserWarning, id="delimiter-conflict2"),
    ],
)
def test_dialect_conflict_delimiter(all_parsers, custom_dialect, kwargs, warning_klass):
    """
    Check warnings when ``sep``/``delimiter`` accompanies a dialect.

    See gh-23761.

    Parameters
    ----------
    all_parsers : fixture
        Parser under test.
    custom_dialect : fixture
        ``(name, kwargs)`` pair describing the dialect to use.
    kwargs : dict
        Extra ``sep`` or ``delimiter`` keyword forwarded to the reader.
    warning_klass : type or None
        Warning class handed to ``read_csv_check_warnings`` for this case;
        ``None`` for the cases commented as non-conflicting.
    """
    dialect_name, dialect_kwargs = custom_dialect
    parser = all_parsers
    text = "a:b\n1:2"
    expected = DataFrame({"a": [1], "b": [2]})

    with tm.with_csv_dialect(dialect_name, **dialect_kwargs):
        result = parser.read_csv_check_warnings(
            warning_klass,
            "Conflicting values for 'delimiter'",
            StringIO(text),
            dialect=dialect_name,
            **kwargs,
        )
        tm.assert_frame_equal(result, expected)
|