123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212 |
- """
Tests that features that are currently unsupported in
either the Python or C parser are actually enforced
and are clearly communicated to the user.

Ultimately, the goal is to remove test cases from this
test suite as new feature support is added to the parsers.
- """
- from io import StringIO
- import os
- from pathlib import Path
- import pytest
- from pandas.compat import (
- is_ci_environment,
- is_platform_mac,
- is_platform_windows,
- )
- from pandas.errors import ParserError
- import pandas._testing as tm
- from pandas.io.parsers import read_csv
- import pandas.io.parsers.readers as parsers
@pytest.fixture(params=("python", "python-fwf"), ids=str)
def python_engine(request):
    """
    Parametrized fixture supplying one Python-based engine name per run.

    Tests requesting it execute once with ``"python"`` and once with
    ``"python-fwf"``; the engine name itself is used as the test id.
    """
    engine_name = request.param
    return engine_name
class TestUnsupportedFeatures:
    """Checks that options an engine cannot honor are reported to the caller."""

    def test_mangle_dupe_cols_false(self):
        # see gh-12935
        csv_text = "a b c\n1 2 3"

        # The keyword is expected to be refused whichever engine is requested.
        for engine_name in ("c", "python"):
            buffer = StringIO(csv_text)
            with pytest.raises(TypeError, match="unexpected keyword"):
                read_csv(buffer, engine=engine_name, mangle_dupe_cols=True)

    def test_c_engine(self):
        # see gh-6607
        sample = "a b c\n1 2 3"

        # Option combinations exercised both with and without engine="c".
        c_unsupported_options = (
            {"sep": None, "delim_whitespace": False},
            {"sep": r"\s"},
            {"sep": "\t", "quotechar": chr(128)},
            {"skipfooter": 1},
        )

        # specify C engine with unsupported options (raise)
        for options in c_unsupported_options:
            with pytest.raises(ValueError, match="does not support"):
                read_csv(StringIO(sample), engine="c", **options)

        # specify C-unsupported options without python-unsupported options
        for options in c_unsupported_options:
            with tm.assert_produces_warning(parsers.ParserWarning):
                read_csv(StringIO(sample), **options)

        # Rows with differing field counts must fail tokenization, with the
        # default engine as well as with an explicit engine="c".
        ragged = (
            " A B C D E\n"
            "one two three four\n"
            "a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640\n"
            "a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744\n"
            "x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"
        )
        for extra in ({}, {"engine": "c"}):
            with pytest.raises(ParserError, match="Error tokenizing data"):
                read_csv(StringIO(ragged), sep="\\s+", **extra)

        # A thousands marker must be exactly one character long.
        piped = "A|B|C\n1|2,334|5\n10|13|10.\n"
        for marker in (",,", ""):
            with pytest.raises(
                ValueError, match="Only length-1 thousands markers supported"
            ):
                read_csv(StringIO(piped), thousands=marker)

        # The same restriction applies to the line terminator.
        tilde_rows = "a,b,c~~1,2,3~~4,5,6"
        with pytest.raises(
            ValueError, match="Only length-1 line terminators supported"
        ):
            read_csv(StringIO(tilde_rows), lineterminator="~~")

    def test_python_engine(self, python_engine):
        from pandas.io.parsers.readers import _python_unsupported as py_unsupported

        rows = "1,2,3,,\n1,2,3,4,\n1,2,3,4,5\n1,2,,,\n1,2,3,4,"

        # Each listed option, given any value, must be refused by name.
        for option in py_unsupported:
            expected = (
                f"The {option!r} option is not "
                f"supported with the {python_engine!r} engine"
            )
            kwargs = {option: object()}
            with pytest.raises(ValueError, match=expected):
                read_csv(StringIO(rows), engine=python_engine, **kwargs)

    def test_python_engine_file_no_iter(self, python_engine):
        # see gh-16530
        class NoNextBuffer:
            """File-like stand-in that defines ``__next__`` but no ``__iter__``."""

            def __init__(self, csv_data) -> None:
                self.data = csv_data

            def __next__(self):
                return self.data.__next__()

            def read(self):
                return self.data

            def readline(self):
                return self.data

        source = NoNextBuffer("a\n1")
        regex = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator"

        with pytest.raises(TypeError, match=regex):
            read_csv(source, engine=python_engine)

    def test_pyarrow_engine(self):
        from pandas.io.parsers.readers import _pyarrow_unsupported as pa_unsupported

        rows = "1,2,3,,\n1,2,3,4,\n1,2,3,4,5\n1,2,,,\n1,2,3,4,"

        # Options that are passed a specific value instead of the bare
        # ``object()`` sentinel used for everything else.
        special_values = {
            "dialect": "excel",  # test a random dialect
            "warn_bad_lines": True,
            "error_bad_lines": True,
            "on_bad_lines": "warn",
        }

        for option in pa_unsupported:
            expected = (
                f"The {option!r} option is not "
                "supported with the 'pyarrow' engine"
            )
            kwargs = {option: special_values.get(option, object())}
            with pytest.raises(ValueError, match=expected):
                read_csv(StringIO(rows), engine="pyarrow", **kwargs)

    def test_on_bad_lines_callable_python_only(self, all_parsers):
        # GH 5686
        parser = all_parsers
        buffer = StringIO("a,b\n1,2")

        def echo_bad_line(fields):
            return fields

        # Only the python engine is expected to accept a callable here.
        if parser.engine == "python":
            parser.read_csv(buffer, on_bad_lines=echo_bad_line)
            return

        expected = "on_bad_line can only be a callable function if engine='python'"
        with pytest.raises(ValueError, match=expected):
            parser.read_csv(buffer, on_bad_lines=echo_bad_line)
def test_close_file_handle_on_invalid_usecols(all_parsers):
    """
    Check that reading with an unknown ``usecols`` entry raises cleanly.

    The read must raise an error mentioning the missing column, emit no
    warning, and leave the file removable afterwards.
    """
    # GH 45384
    parser = all_parsers

    error = ValueError
    if parser.engine == "pyarrow":
        pyarrow = pytest.importorskip("pyarrow")
        error = pyarrow.lib.ArrowKeyError
        # NOTE(review): indentation was lost in the source; this skip is assumed
        # to belong to the pyarrow branch -- confirm against upstream.
        if is_ci_environment() and (is_platform_windows() or is_platform_mac()):
            # GH#45547 causes timeouts on windows/mac builds
            pytest.skip("GH#45547 causing timeouts on windows/mac builds 2022-01-22")

    with tm.ensure_clean("test.csv") as fname:
        # Explicit encoding keeps the fixture file independent of the locale
        # and avoids EncodingWarning when default-encoding warnings are enabled.
        Path(fname).write_text("col1,col2\na,b\n1,2", encoding="utf-8")
        with tm.assert_produces_warning(False):
            with pytest.raises(error, match="col3"):
                parser.read_csv(fname, usecols=["col1", "col2", "col3"])
        # unlink fails on windows if file handles still point to it
        os.unlink(fname)
def test_invalid_file_inputs(request, all_parsers):
    # GH#45957
    # A list passed as the file argument should be refused with ValueError.
    parser = all_parsers

    engine_accepts_lists = parser.engine == "python"
    if engine_accepts_lists:
        # Mark as an expected failure rather than skipping, so the body
        # below still runs for this engine.
        xfail_mark = pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.")
        request.node.add_marker(xfail_mark)

    empty_input = []
    with pytest.raises(ValueError, match="Invalid"):
        parser.read_csv(empty_input)
def test_invalid_dtype_backend(all_parsers):
    # An unrecognised ``dtype_backend`` value must raise ValueError
    # with a message listing the allowed choices.
    expected = (
        "dtype_backend numpy is invalid, "
        "only 'numpy_nullable' and 'pyarrow' are allowed."
    )
    parser = all_parsers

    with pytest.raises(ValueError, match=expected):
        parser.read_csv("test", dtype_backend="numpy")
|