123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167 |
- """
- Tests that quoting specifications are properly handled
- during parsing for all of the parsers defined in parsers.py
- """
- import csv
- from io import StringIO
- import pytest
- from pandas.compat import PY311
- from pandas.errors import ParserError
- from pandas import DataFrame
- import pandas._testing as tm
# Module-level pytest mark: requests the "pyarrow_skip" fixture for every
# test collected from this file.
# NOTE(review): the fixture is defined outside this file (presumably in a
# conftest); by its name it skips the pyarrow engine -- confirm.
- pytestmark = pytest.mark.usefixtures("pyarrow_skip")
@pytest.mark.parametrize(
    "kwargs,msg",
    [
        # Multi-character quote string.
        ({"quotechar": "foo"}, '"quotechar" must be a(n)? 1-character string'),
        # No quote character although quoting is switched on.
        (
            {"quotechar": None, "quoting": csv.QUOTE_MINIMAL},
            "quotechar must be set if quoting enabled",
        ),
        # Quote character of the wrong type.
        ({"quotechar": 2}, '"quotechar" must be string( or None)?, not int'),
    ],
)
def test_bad_quote_char(all_parsers, kwargs, msg):
    """
    Check that an invalid ``quotechar`` makes ``read_csv`` raise ``TypeError``.

    ``msg`` is a regular expression matched against the error message.
    """
    buffer = StringIO("1,2,3")

    with pytest.raises(TypeError, match=msg):
        all_parsers.read_csv(buffer, **kwargs)
@pytest.mark.parametrize(
    "quoting,msg",
    [
        ("foo", '"quoting" must be an integer|Argument'),
        (5, 'bad "quoting" value'),  # quoting must be in the range [0, 3]
    ],
)
def test_bad_quoting(all_parsers, quoting, msg):
    """
    Check that an invalid ``quoting`` value makes ``read_csv`` raise ``TypeError``.

    ``msg`` is a regular expression matched against the error message.
    """
    buffer = StringIO("1,2,3")

    with pytest.raises(TypeError, match=msg):
        all_parsers.read_csv(buffer, quoting=quoting)
def test_quote_char_basic(all_parsers):
    """Check that a field wrapped in the explicit ``quotechar`` is read unquoted."""
    csv_text = 'a,b,c\n1,2,"cat"'

    result = all_parsers.read_csv(StringIO(csv_text), quotechar='"')

    expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quote_char", ["~", "*", "%", "$", "@", "P"])
def test_quote_char_various(all_parsers, quote_char):
    """Check that several non-default quote characters are stripped from a field."""
    # Same payload as the basic test, with "cat" wrapped in the custom
    # quote character instead of a double quote.
    csv_text = f"a,b,c\n1,2,{quote_char}cat{quote_char}"

    result = all_parsers.read_csv(StringIO(csv_text), quotechar=quote_char)

    expected = DataFrame([[1, 2, "cat"]], columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
@pytest.mark.parametrize("quote_char", ["", None])
def test_null_quote_char(all_parsers, quoting, quote_char):
    """
    Check how an empty or ``None`` ``quotechar`` interacts with ``quoting``.

    With quoting enabled the call must raise ``TypeError``. With
    ``csv.QUOTE_NONE`` the data must parse normally, except for the python
    engine on Python 3.11+, where nothing is asserted.
    """
    parser = all_parsers
    csv_text = "a,b,c\n1,2,3"
    read_kwargs = {"quotechar": quote_char, "quoting": quoting}

    if quoting != csv.QUOTE_NONE:
        # Sanity checking.
        if PY311 and all_parsers.engine == "python" and quote_char == "":
            msg = '"quotechar" must be a 1-character string'
        else:
            msg = "quotechar must be set if quoting enabled"

        with pytest.raises(TypeError, match=msg):
            parser.read_csv(StringIO(csv_text), **read_kwargs)
        return

    if PY311 and all_parsers.engine == "python":
        # Python 3.11+ doesn't support null/blank quote chars in their csv
        # parsers, so there is nothing to verify for this combination.
        return

    result = parser.read_csv(StringIO(csv_text), **read_kwargs)
    expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "kwargs,exp_data",
    [
        # Default behaviour: the quotes around "foo" are stripped.
        ({}, [[1, 2, "foo"]]),
        # QUOTE_MINIMAL only applies to CSV writing, so no effect on reading.
        ({"quotechar": '"', "quoting": csv.QUOTE_MINIMAL}, [[1, 2, "foo"]]),
        # QUOTE_ALL only applies to CSV writing, so no effect on reading.
        ({"quotechar": '"', "quoting": csv.QUOTE_ALL}, [[1, 2, "foo"]]),
        # QUOTE_NONE tells the reader to do no special handling
        # of quote characters and leave them alone.
        ({"quotechar": '"', "quoting": csv.QUOTE_NONE}, [[1, 2, '"foo"']]),
        # QUOTE_NONNUMERIC tells the reader to cast
        # all non-quoted fields to float.
        ({"quotechar": '"', "quoting": csv.QUOTE_NONNUMERIC}, [[1.0, 2.0, "foo"]]),
    ],
)
def test_quoting_various(all_parsers, kwargs, exp_data):
    """Check the parsed values of one quoted row under each ``quoting`` mode."""
    names = ["a", "b", "c"]
    expected = DataFrame(exp_data, columns=names)

    result = all_parsers.read_csv(StringIO('1,2,"foo"'), names=names, **kwargs)

    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "doublequote,exp_data", [(True, [[3, '4 " 5']]), (False, [[3, '4 " 5"']])]
)
def test_double_quote(all_parsers, doublequote, exp_data):
    """
    Check how a doubled quote inside a quoted field is read.

    With ``doublequote=True`` the pair collapses to a single quote. With
    ``doublequote=False`` the expected value keeps a trailing quote.
    """
    csv_text = 'a,b\n3,"4 "" 5"'

    result = all_parsers.read_csv(
        StringIO(csv_text), quotechar='"', doublequote=doublequote
    )

    expected = DataFrame(exp_data, columns=["a", "b"])
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("quotechar", ['"', "\u0001"])
def test_quotechar_unicode(all_parsers, quotechar):
    """Check that a ``quotechar`` given as a unicode escape is accepted."""
    # see gh-14477
    result = all_parsers.read_csv(StringIO("a\n1"), quotechar=quotechar)

    expected = DataFrame({"a": [1]})
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("balanced", [True, False])
def test_unbalanced_quoting(all_parsers, balanced):
    """
    Check that an unterminated quoted field raises ``ParserError``.

    When ``balanced`` is true the closing quote is appended first and the
    data must parse cleanly instead.
    """
    # see gh-22789.
    parser = all_parsers
    unterminated = 'a,b,c\n1,2,"3'

    if not balanced:
        # The expected error message depends on the engine.
        if parser.engine == "c":
            msg = "EOF inside string starting at row 1"
        else:
            msg = "unexpected end of data"

        with pytest.raises(ParserError, match=msg):
            parser.read_csv(StringIO(unterminated))
        return

    # Re-balance the quoting and read in without errors.
    result = parser.read_csv(StringIO(unterminated + '"'))
    expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"])
    tm.assert_frame_equal(result, expected)
|