test_unsupported.py 7.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. """
  2. Tests that features that are currently unsupported in
  3. either the Python or C parser are actually enforced
  4. and are clearly communicated to the user.
  5. Ultimately, the goal is to remove test cases from this
  6. test suite as new feature support is added to the parsers.
  7. """
  8. from io import StringIO
  9. import os
  10. from pathlib import Path
  11. import pytest
  12. from pandas.compat import (
  13. is_ci_environment,
  14. is_platform_mac,
  15. is_platform_windows,
  16. )
  17. from pandas.errors import ParserError
  18. import pandas._testing as tm
  19. from pandas.io.parsers import read_csv
  20. import pandas.io.parsers.readers as parsers
  21. @pytest.fixture(params=["python", "python-fwf"], ids=lambda val: val)
  22. def python_engine(request):
  23. return request.param
  24. class TestUnsupportedFeatures:
  25. def test_mangle_dupe_cols_false(self):
  26. # see gh-12935
  27. data = "a b c\n1 2 3"
  28. for engine in ("c", "python"):
  29. with pytest.raises(TypeError, match="unexpected keyword"):
  30. read_csv(StringIO(data), engine=engine, mangle_dupe_cols=True)
  31. def test_c_engine(self):
  32. # see gh-6607
  33. data = "a b c\n1 2 3"
  34. msg = "does not support"
  35. # specify C engine with unsupported options (raise)
  36. with pytest.raises(ValueError, match=msg):
  37. read_csv(StringIO(data), engine="c", sep=None, delim_whitespace=False)
  38. with pytest.raises(ValueError, match=msg):
  39. read_csv(StringIO(data), engine="c", sep=r"\s")
  40. with pytest.raises(ValueError, match=msg):
  41. read_csv(StringIO(data), engine="c", sep="\t", quotechar=chr(128))
  42. with pytest.raises(ValueError, match=msg):
  43. read_csv(StringIO(data), engine="c", skipfooter=1)
  44. # specify C-unsupported options without python-unsupported options
  45. with tm.assert_produces_warning(parsers.ParserWarning):
  46. read_csv(StringIO(data), sep=None, delim_whitespace=False)
  47. with tm.assert_produces_warning(parsers.ParserWarning):
  48. read_csv(StringIO(data), sep=r"\s")
  49. with tm.assert_produces_warning(parsers.ParserWarning):
  50. read_csv(StringIO(data), sep="\t", quotechar=chr(128))
  51. with tm.assert_produces_warning(parsers.ParserWarning):
  52. read_csv(StringIO(data), skipfooter=1)
  53. text = """ A B C D E
  54. one two three four
  55. a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
  56. a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
  57. x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
  58. msg = "Error tokenizing data"
  59. with pytest.raises(ParserError, match=msg):
  60. read_csv(StringIO(text), sep="\\s+")
  61. with pytest.raises(ParserError, match=msg):
  62. read_csv(StringIO(text), engine="c", sep="\\s+")
  63. msg = "Only length-1 thousands markers supported"
  64. data = """A|B|C
  65. 1|2,334|5
  66. 10|13|10.
  67. """
  68. with pytest.raises(ValueError, match=msg):
  69. read_csv(StringIO(data), thousands=",,")
  70. with pytest.raises(ValueError, match=msg):
  71. read_csv(StringIO(data), thousands="")
  72. msg = "Only length-1 line terminators supported"
  73. data = "a,b,c~~1,2,3~~4,5,6"
  74. with pytest.raises(ValueError, match=msg):
  75. read_csv(StringIO(data), lineterminator="~~")
  76. def test_python_engine(self, python_engine):
  77. from pandas.io.parsers.readers import _python_unsupported as py_unsupported
  78. data = """1,2,3,,
  79. 1,2,3,4,
  80. 1,2,3,4,5
  81. 1,2,,,
  82. 1,2,3,4,"""
  83. for default in py_unsupported:
  84. msg = (
  85. f"The {repr(default)} option is not "
  86. f"supported with the {repr(python_engine)} engine"
  87. )
  88. kwargs = {default: object()}
  89. with pytest.raises(ValueError, match=msg):
  90. read_csv(StringIO(data), engine=python_engine, **kwargs)
  91. def test_python_engine_file_no_iter(self, python_engine):
  92. # see gh-16530
  93. class NoNextBuffer:
  94. def __init__(self, csv_data) -> None:
  95. self.data = csv_data
  96. def __next__(self):
  97. return self.data.__next__()
  98. def read(self):
  99. return self.data
  100. def readline(self):
  101. return self.data
  102. data = "a\n1"
  103. msg = "'NoNextBuffer' object is not iterable|argument 1 must be an iterator"
  104. with pytest.raises(TypeError, match=msg):
  105. read_csv(NoNextBuffer(data), engine=python_engine)
  106. def test_pyarrow_engine(self):
  107. from pandas.io.parsers.readers import _pyarrow_unsupported as pa_unsupported
  108. data = """1,2,3,,
  109. 1,2,3,4,
  110. 1,2,3,4,5
  111. 1,2,,,
  112. 1,2,3,4,"""
  113. for default in pa_unsupported:
  114. msg = (
  115. f"The {repr(default)} option is not "
  116. f"supported with the 'pyarrow' engine"
  117. )
  118. kwargs = {default: object()}
  119. default_needs_bool = {"warn_bad_lines", "error_bad_lines"}
  120. if default == "dialect":
  121. kwargs[default] = "excel" # test a random dialect
  122. elif default in default_needs_bool:
  123. kwargs[default] = True
  124. elif default == "on_bad_lines":
  125. kwargs[default] = "warn"
  126. with pytest.raises(ValueError, match=msg):
  127. read_csv(StringIO(data), engine="pyarrow", **kwargs)
  128. def test_on_bad_lines_callable_python_only(self, all_parsers):
  129. # GH 5686
  130. sio = StringIO("a,b\n1,2")
  131. bad_lines_func = lambda x: x
  132. parser = all_parsers
  133. if all_parsers.engine != "python":
  134. msg = "on_bad_line can only be a callable function if engine='python'"
  135. with pytest.raises(ValueError, match=msg):
  136. parser.read_csv(sio, on_bad_lines=bad_lines_func)
  137. else:
  138. parser.read_csv(sio, on_bad_lines=bad_lines_func)
  139. def test_close_file_handle_on_invalid_usecols(all_parsers):
  140. # GH 45384
  141. parser = all_parsers
  142. error = ValueError
  143. if parser.engine == "pyarrow":
  144. pyarrow = pytest.importorskip("pyarrow")
  145. error = pyarrow.lib.ArrowKeyError
  146. if is_ci_environment() and (is_platform_windows() or is_platform_mac()):
  147. # GH#45547 causes timeouts on windows/mac builds
  148. pytest.skip("GH#45547 causing timeouts on windows/mac builds 2022-01-22")
  149. with tm.ensure_clean("test.csv") as fname:
  150. Path(fname).write_text("col1,col2\na,b\n1,2")
  151. with tm.assert_produces_warning(False):
  152. with pytest.raises(error, match="col3"):
  153. parser.read_csv(fname, usecols=["col1", "col2", "col3"])
  154. # unlink fails on windows if file handles still point to it
  155. os.unlink(fname)
  156. def test_invalid_file_inputs(request, all_parsers):
  157. # GH#45957
  158. parser = all_parsers
  159. if parser.engine == "python":
  160. request.node.add_marker(
  161. pytest.mark.xfail(reason=f"{parser.engine} engine supports lists.")
  162. )
  163. with pytest.raises(ValueError, match="Invalid"):
  164. parser.read_csv([])
  165. def test_invalid_dtype_backend(all_parsers):
  166. parser = all_parsers
  167. msg = (
  168. "dtype_backend numpy is invalid, only 'numpy_nullable' and "
  169. "'pyarrow' are allowed."
  170. )
  171. with pytest.raises(ValueError, match=msg):
  172. parser.read_csv("test", dtype_backend="numpy")