test_python_parser_only.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490
  1. """
  2. Tests that apply specifically to the Python parser. Unless specifically
  3. stated as a Python-specific issue, the goal is to eventually move as many of
  4. these tests out of this module as soon as the C parser can accept further
  5. arguments when parsing.
  6. """
  7. from __future__ import annotations
  8. import csv
  9. from io import (
  10. BytesIO,
  11. StringIO,
  12. TextIOWrapper,
  13. )
  14. from typing import Iterator
  15. import pytest
  16. from pandas.errors import (
  17. ParserError,
  18. ParserWarning,
  19. )
  20. from pandas import (
  21. DataFrame,
  22. Index,
  23. MultiIndex,
  24. )
  25. import pandas._testing as tm
  26. def test_default_separator(python_parser_only):
  27. # see gh-17333
  28. #
  29. # csv.Sniffer in Python treats "o" as separator.
  30. data = "aob\n1o2\n3o4"
  31. parser = python_parser_only
  32. expected = DataFrame({"a": [1, 3], "b": [2, 4]})
  33. result = parser.read_csv(StringIO(data), sep=None)
  34. tm.assert_frame_equal(result, expected)
  35. @pytest.mark.parametrize("skipfooter", ["foo", 1.5, True])
  36. def test_invalid_skipfooter_non_int(python_parser_only, skipfooter):
  37. # see gh-15925 (comment)
  38. data = "a\n1\n2"
  39. parser = python_parser_only
  40. msg = "skipfooter must be an integer"
  41. with pytest.raises(ValueError, match=msg):
  42. parser.read_csv(StringIO(data), skipfooter=skipfooter)
  43. def test_invalid_skipfooter_negative(python_parser_only):
  44. # see gh-15925 (comment)
  45. data = "a\n1\n2"
  46. parser = python_parser_only
  47. msg = "skipfooter cannot be negative"
  48. with pytest.raises(ValueError, match=msg):
  49. parser.read_csv(StringIO(data), skipfooter=-1)
  50. @pytest.mark.parametrize("kwargs", [{"sep": None}, {"delimiter": "|"}])
  51. def test_sniff_delimiter(python_parser_only, kwargs):
  52. data = """index|A|B|C
  53. foo|1|2|3
  54. bar|4|5|6
  55. baz|7|8|9
  56. """
  57. parser = python_parser_only
  58. result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
  59. expected = DataFrame(
  60. [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
  61. columns=["A", "B", "C"],
  62. index=Index(["foo", "bar", "baz"], name="index"),
  63. )
  64. tm.assert_frame_equal(result, expected)
  65. def test_sniff_delimiter_comment(python_parser_only):
  66. data = """# comment line
  67. index|A|B|C
  68. # comment line
  69. foo|1|2|3 # ignore | this
  70. bar|4|5|6
  71. baz|7|8|9
  72. """
  73. parser = python_parser_only
  74. result = parser.read_csv(StringIO(data), index_col=0, sep=None, comment="#")
  75. expected = DataFrame(
  76. [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
  77. columns=["A", "B", "C"],
  78. index=Index(["foo", "bar", "baz"], name="index"),
  79. )
  80. tm.assert_frame_equal(result, expected)
  81. @pytest.mark.parametrize("encoding", [None, "utf-8"])
  82. def test_sniff_delimiter_encoding(python_parser_only, encoding):
  83. parser = python_parser_only
  84. data = """ignore this
  85. ignore this too
  86. index|A|B|C
  87. foo|1|2|3
  88. bar|4|5|6
  89. baz|7|8|9
  90. """
  91. if encoding is not None:
  92. data = data.encode(encoding)
  93. data = BytesIO(data)
  94. data = TextIOWrapper(data, encoding=encoding)
  95. else:
  96. data = StringIO(data)
  97. result = parser.read_csv(data, index_col=0, sep=None, skiprows=2, encoding=encoding)
  98. expected = DataFrame(
  99. [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
  100. columns=["A", "B", "C"],
  101. index=Index(["foo", "bar", "baz"], name="index"),
  102. )
  103. tm.assert_frame_equal(result, expected)
  104. def test_single_line(python_parser_only):
  105. # see gh-6607: sniff separator
  106. parser = python_parser_only
  107. result = parser.read_csv(StringIO("1,2"), names=["a", "b"], header=None, sep=None)
  108. expected = DataFrame({"a": [1], "b": [2]})
  109. tm.assert_frame_equal(result, expected)
  110. @pytest.mark.parametrize("kwargs", [{"skipfooter": 2}, {"nrows": 3}])
  111. def test_skipfooter(python_parser_only, kwargs):
  112. # see gh-6607
  113. data = """A,B,C
  114. 1,2,3
  115. 4,5,6
  116. 7,8,9
  117. want to skip this
  118. also also skip this
  119. """
  120. parser = python_parser_only
  121. result = parser.read_csv(StringIO(data), **kwargs)
  122. expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["A", "B", "C"])
  123. tm.assert_frame_equal(result, expected)
  124. @pytest.mark.parametrize(
  125. "compression,klass", [("gzip", "GzipFile"), ("bz2", "BZ2File")]
  126. )
  127. def test_decompression_regex_sep(python_parser_only, csv1, compression, klass):
  128. # see gh-6607
  129. parser = python_parser_only
  130. with open(csv1, "rb") as f:
  131. data = f.read()
  132. data = data.replace(b",", b"::")
  133. expected = parser.read_csv(csv1)
  134. module = pytest.importorskip(compression)
  135. klass = getattr(module, klass)
  136. with tm.ensure_clean() as path:
  137. with klass(path, mode="wb") as tmp:
  138. tmp.write(data)
  139. result = parser.read_csv(path, sep="::", compression=compression)
  140. tm.assert_frame_equal(result, expected)
  141. def test_read_csv_buglet_4x_multi_index(python_parser_only):
  142. # see gh-6607
  143. data = """ A B C D E
  144. one two three four
  145. a b 10.0032 5 -0.5109 -2.3358 -0.4645 0.05076 0.3640
  146. a q 20 4 0.4473 1.4152 0.2834 1.00661 0.1744
  147. x q 30 3 -0.6662 -0.5243 -0.3580 0.89145 2.5838"""
  148. parser = python_parser_only
  149. expected = DataFrame(
  150. [
  151. [-0.5109, -2.3358, -0.4645, 0.05076, 0.3640],
  152. [0.4473, 1.4152, 0.2834, 1.00661, 0.1744],
  153. [-0.6662, -0.5243, -0.3580, 0.89145, 2.5838],
  154. ],
  155. columns=["A", "B", "C", "D", "E"],
  156. index=MultiIndex.from_tuples(
  157. [("a", "b", 10.0032, 5), ("a", "q", 20, 4), ("x", "q", 30, 3)],
  158. names=["one", "two", "three", "four"],
  159. ),
  160. )
  161. result = parser.read_csv(StringIO(data), sep=r"\s+")
  162. tm.assert_frame_equal(result, expected)
  163. def test_read_csv_buglet_4x_multi_index2(python_parser_only):
  164. # see gh-6893
  165. data = " A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9"
  166. parser = python_parser_only
  167. expected = DataFrame.from_records(
  168. [(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)],
  169. columns=list("abcABC"),
  170. index=list("abc"),
  171. )
  172. result = parser.read_csv(StringIO(data), sep=r"\s+")
  173. tm.assert_frame_equal(result, expected)
  174. @pytest.mark.parametrize("add_footer", [True, False])
  175. def test_skipfooter_with_decimal(python_parser_only, add_footer):
  176. # see gh-6971
  177. data = "1#2\n3#4"
  178. parser = python_parser_only
  179. expected = DataFrame({"a": [1.2, 3.4]})
  180. if add_footer:
  181. # The stray footer line should not mess with the
  182. # casting of the first two lines if we skip it.
  183. kwargs = {"skipfooter": 1}
  184. data += "\nFooter"
  185. else:
  186. kwargs = {}
  187. result = parser.read_csv(StringIO(data), names=["a"], decimal="#", **kwargs)
  188. tm.assert_frame_equal(result, expected)
  189. @pytest.mark.parametrize(
  190. "sep", ["::", "#####", "!!!", "123", "#1!c5", "%!c!d", "@@#4:2", "_!pd#_"]
  191. )
  192. @pytest.mark.parametrize(
  193. "encoding", ["utf-16", "utf-16-be", "utf-16-le", "utf-32", "cp037"]
  194. )
  195. def test_encoding_non_utf8_multichar_sep(python_parser_only, sep, encoding):
  196. # see gh-3404
  197. expected = DataFrame({"a": [1], "b": [2]})
  198. parser = python_parser_only
  199. data = "1" + sep + "2"
  200. encoded_data = data.encode(encoding)
  201. result = parser.read_csv(
  202. BytesIO(encoded_data), sep=sep, names=["a", "b"], encoding=encoding
  203. )
  204. tm.assert_frame_equal(result, expected)
  205. @pytest.mark.parametrize("quoting", [csv.QUOTE_MINIMAL, csv.QUOTE_NONE])
  206. def test_multi_char_sep_quotes(python_parser_only, quoting):
  207. # see gh-13374
  208. kwargs = {"sep": ",,"}
  209. parser = python_parser_only
  210. data = 'a,,b\n1,,a\n2,,"2,,b"'
  211. if quoting == csv.QUOTE_NONE:
  212. msg = "Expected 2 fields in line 3, saw 3"
  213. with pytest.raises(ParserError, match=msg):
  214. parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
  215. else:
  216. msg = "ignored when a multi-char delimiter is used"
  217. with pytest.raises(ParserError, match=msg):
  218. parser.read_csv(StringIO(data), quoting=quoting, **kwargs)
  219. def test_none_delimiter(python_parser_only, capsys):
  220. # see gh-13374 and gh-17465
  221. parser = python_parser_only
  222. data = "a,b,c\n0,1,2\n3,4,5,6\n7,8,9"
  223. expected = DataFrame({"a": [0, 7], "b": [1, 8], "c": [2, 9]})
  224. # We expect the third line in the data to be
  225. # skipped because it is malformed, but we do
  226. # not expect any errors to occur.
  227. result = parser.read_csv(StringIO(data), header=0, sep=None, on_bad_lines="warn")
  228. tm.assert_frame_equal(result, expected)
  229. captured = capsys.readouterr()
  230. assert "Skipping line 3" in captured.err
  231. @pytest.mark.parametrize("data", ['a\n1\n"b"a', 'a,b,c\ncat,foo,bar\ndog,foo,"baz'])
  232. @pytest.mark.parametrize("skipfooter", [0, 1])
  233. def test_skipfooter_bad_row(python_parser_only, data, skipfooter):
  234. # see gh-13879 and gh-15910
  235. parser = python_parser_only
  236. if skipfooter:
  237. msg = "parsing errors in the skipped footer rows"
  238. with pytest.raises(ParserError, match=msg):
  239. parser.read_csv(StringIO(data), skipfooter=skipfooter)
  240. else:
  241. msg = "unexpected end of data|expected after"
  242. with pytest.raises(ParserError, match=msg):
  243. parser.read_csv(StringIO(data), skipfooter=skipfooter)
  244. def test_malformed_skipfooter(python_parser_only):
  245. parser = python_parser_only
  246. data = """ignore
  247. A,B,C
  248. 1,2,3 # comment
  249. 1,2,3,4,5
  250. 2,3,4
  251. footer
  252. """
  253. msg = "Expected 3 fields in line 4, saw 5"
  254. with pytest.raises(ParserError, match=msg):
  255. parser.read_csv(StringIO(data), header=1, comment="#", skipfooter=1)
  256. def test_python_engine_file_no_next(python_parser_only):
  257. parser = python_parser_only
  258. class NoNextBuffer:
  259. def __init__(self, csv_data) -> None:
  260. self.data = csv_data
  261. def __iter__(self) -> Iterator:
  262. return self.data.__iter__()
  263. def read(self):
  264. return self.data
  265. def readline(self):
  266. return self.data
  267. parser.read_csv(NoNextBuffer("a\n1"))
  268. @pytest.mark.parametrize("bad_line_func", [lambda x: ["2", "3"], lambda x: x[:2]])
  269. def test_on_bad_lines_callable(python_parser_only, bad_line_func):
  270. # GH 5686
  271. parser = python_parser_only
  272. data = """a,b
  273. 1,2
  274. 2,3,4,5,6
  275. 3,4
  276. """
  277. bad_sio = StringIO(data)
  278. result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
  279. expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
  280. tm.assert_frame_equal(result, expected)
  281. def test_on_bad_lines_callable_write_to_external_list(python_parser_only):
  282. # GH 5686
  283. parser = python_parser_only
  284. data = """a,b
  285. 1,2
  286. 2,3,4,5,6
  287. 3,4
  288. """
  289. bad_sio = StringIO(data)
  290. lst = []
  291. def bad_line_func(bad_line: list[str]) -> list[str]:
  292. lst.append(bad_line)
  293. return ["2", "3"]
  294. result = parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
  295. expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
  296. tm.assert_frame_equal(result, expected)
  297. assert lst == [["2", "3", "4", "5", "6"]]
  298. @pytest.mark.parametrize("bad_line_func", [lambda x: ["foo", "bar"], lambda x: x[:2]])
  299. @pytest.mark.parametrize("sep", [",", "111"])
  300. def test_on_bad_lines_callable_iterator_true(python_parser_only, bad_line_func, sep):
  301. # GH 5686
  302. # iterator=True has a separate code path than iterator=False
  303. parser = python_parser_only
  304. data = f"""
  305. 0{sep}1
  306. hi{sep}there
  307. foo{sep}bar{sep}baz
  308. good{sep}bye
  309. """
  310. bad_sio = StringIO(data)
  311. result_iter = parser.read_csv(
  312. bad_sio, on_bad_lines=bad_line_func, chunksize=1, iterator=True, sep=sep
  313. )
  314. expecteds = [
  315. {"0": "hi", "1": "there"},
  316. {"0": "foo", "1": "bar"},
  317. {"0": "good", "1": "bye"},
  318. ]
  319. for i, (result, expected) in enumerate(zip(result_iter, expecteds)):
  320. expected = DataFrame(expected, index=range(i, i + 1))
  321. tm.assert_frame_equal(result, expected)
  322. def test_on_bad_lines_callable_dont_swallow_errors(python_parser_only):
  323. # GH 5686
  324. parser = python_parser_only
  325. data = """a,b
  326. 1,2
  327. 2,3,4,5,6
  328. 3,4
  329. """
  330. bad_sio = StringIO(data)
  331. msg = "This function is buggy."
  332. def bad_line_func(bad_line):
  333. raise ValueError(msg)
  334. with pytest.raises(ValueError, match=msg):
  335. parser.read_csv(bad_sio, on_bad_lines=bad_line_func)
  336. def test_on_bad_lines_callable_not_expected_length(python_parser_only):
  337. # GH 5686
  338. parser = python_parser_only
  339. data = """a,b
  340. 1,2
  341. 2,3,4,5,6
  342. 3,4
  343. """
  344. bad_sio = StringIO(data)
  345. result = parser.read_csv_check_warnings(
  346. ParserWarning, "Length of header or names", bad_sio, on_bad_lines=lambda x: x
  347. )
  348. expected = DataFrame({"a": [1, 2, 3], "b": [2, 3, 4]})
  349. tm.assert_frame_equal(result, expected)
  350. def test_on_bad_lines_callable_returns_none(python_parser_only):
  351. # GH 5686
  352. parser = python_parser_only
  353. data = """a,b
  354. 1,2
  355. 2,3,4,5,6
  356. 3,4
  357. """
  358. bad_sio = StringIO(data)
  359. result = parser.read_csv(bad_sio, on_bad_lines=lambda x: None)
  360. expected = DataFrame({"a": [1, 3], "b": [2, 4]})
  361. tm.assert_frame_equal(result, expected)
  362. def test_on_bad_lines_index_col_inferred(python_parser_only):
  363. # GH 5686
  364. parser = python_parser_only
  365. data = """a,b
  366. 1,2,3
  367. 4,5,6
  368. """
  369. bad_sio = StringIO(data)
  370. result = parser.read_csv(bad_sio, on_bad_lines=lambda x: ["99", "99"])
  371. expected = DataFrame({"a": [2, 5], "b": [3, 6]}, index=[1, 4])
  372. tm.assert_frame_equal(result, expected)
  373. def test_index_col_false_and_header_none(python_parser_only):
  374. # GH#46955
  375. parser = python_parser_only
  376. data = """
  377. 0.5,0.03
  378. 0.1,0.2,0.3,2
  379. """
  380. result = parser.read_csv_check_warnings(
  381. ParserWarning,
  382. "Length of header",
  383. StringIO(data),
  384. sep=",",
  385. header=None,
  386. index_col=False,
  387. )
  388. expected = DataFrame({0: [0.5, 0.1], 1: [0.03, 0.2]})
  389. tm.assert_frame_equal(result, expected)
  390. def test_header_int_do_not_infer_multiindex_names_on_different_line(python_parser_only):
  391. # GH#46569
  392. parser = python_parser_only
  393. data = StringIO("a\na,b\nc,d,e\nf,g,h")
  394. result = parser.read_csv_check_warnings(
  395. ParserWarning, "Length of header", data, engine="python", index_col=False
  396. )
  397. expected = DataFrame({"a": ["a", "c", "f"]})
  398. tm.assert_frame_equal(result, expected)