test_c_parser_only.py

  1. """
  2. Tests that apply specifically to the CParser. Unless specifically stated
  3. as a CParser-specific issue, the goal is to eventually move as many of
  4. these tests out of this module as soon as the Python parser can accept
  5. further arguments when parsing.
  6. """
from decimal import Decimal
from io import (
    BytesIO,
    StringIO,
    TextIOWrapper,
)
import mmap
import os
import tarfile

import numpy as np
import pytest

from pandas.compat import (
    IS64,
    is_ci_environment,
)
from pandas.compat.numpy import np_version_gte1p24
from pandas.errors import ParserError
import pandas.util._test_decorators as td

from pandas import (
    DataFrame,
    concat,
)
import pandas._testing as tm


@pytest.mark.parametrize(
    "malformed",
    ["1\r1\r1\r 1\r 1\r", "1\r1\r1\r 1\r 1\r11\r", "1\r1\r1\r 1\r 1\r11\r1\r"],
    ids=["words pointer", "stream pointer", "lines pointer"],
)
def test_buffer_overflow(c_parser_only, malformed):
    # see gh-9205: test certain malformed input files that cause
    # buffer overflows in tokenizer.c
    msg = "Buffer overflow caught - possible malformed input file."
    parser = c_parser_only

    with pytest.raises(ParserError, match=msg):
        parser.read_csv(StringIO(malformed))


def test_buffer_rd_bytes(c_parser_only):
    # see gh-12098: src->buffer in the C parser can be freed twice leading
    # to a segfault if a corrupt gzip file is read with 'read_csv', and the
    # buffer is filled more than once before gzip raises an Exception.

    data = (
        "\x1F\x8B\x08\x00\x00\x00\x00\x00\x00\x03\xED\xC3\x41\x09"
        "\x00\x00\x08\x00\xB1\xB7\xB6\xBA\xFE\xA5\xCC\x21\x6C\xB0"
        "\xA6\x4D" + "\x55" * 267 + "\x7D\xF7\x00\x91\xE0\x47\x97\x14\x38\x04\x00"
        "\x1f\x8b\x08\x00VT\x97V\x00\x03\xed]\xefO"
    )
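    # Both "\x1f\x8b\x08" runs in `data` above are gzip member headers (the
    # two magic bytes plus the deflate method byte), so the text resembles a
    # multi-member gzip stream that fills the decompression buffer more than
    # once before gzip gives up.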
    parser = c_parser_only

    for _ in range(100):
        try:
            parser.read_csv_check_warnings(
                RuntimeWarning,
                "compression has no effect when passing a non-binary object as input",
                StringIO(data),
                compression="gzip",
                delim_whitespace=True,
            )
        except Exception:
            pass


def test_delim_whitespace_custom_terminator(c_parser_only):
    # See gh-12912
    data = "a b c~1 2 3~4 5 6~7 8 9"
    parser = c_parser_only

    df = parser.read_csv(StringIO(data), lineterminator="~", delim_whitespace=True)
    expected = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
    tm.assert_frame_equal(df, expected)


def test_dtype_and_names_error(c_parser_only):
    # see gh-8833: passing both dtype and names
    # resulting in an error reporting issue
    parser = c_parser_only
    data = """
1.0 1
2.0 2
3.0 3
"""
    # base cases
    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None)
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]])
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data), sep=r"\s+", header=None, names=["a", "b"])
    expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)

    # fallback casting
    result = parser.read_csv(
        StringIO(data), sep=r"\s+", header=None, names=["a", "b"], dtype={"a": np.int32}
    )
    expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=["a", "b"])
    expected["a"] = expected["a"].astype(np.int32)
    tm.assert_frame_equal(result, expected)

    data = """
1.0 1
nan 2
3.0 3
"""
    # fallback casting, but not castable
    warning = RuntimeWarning if np_version_gte1p24 else None
    with pytest.raises(ValueError, match="cannot safely convert"):
        with tm.assert_produces_warning(warning, check_stacklevel=False):
            parser.read_csv(
                StringIO(data),
                sep=r"\s+",
                header=None,
                names=["a", "b"],
                dtype={"a": np.int32},
            )


@pytest.mark.parametrize(
    "match,kwargs",
    [
        # For each of these cases, all of the dtypes are valid, just unsupported.
        (
            (
                "the dtype datetime64 is not supported for parsing, "
                "pass this column using parse_dates instead"
            ),
            {"dtype": {"A": "datetime64", "B": "float64"}},
        ),
        (
            (
                "the dtype datetime64 is not supported for parsing, "
                "pass this column using parse_dates instead"
            ),
            {"dtype": {"A": "datetime64", "B": "float64"}, "parse_dates": ["B"]},
        ),
        (
            "the dtype timedelta64 is not supported for parsing",
            {"dtype": {"A": "timedelta64", "B": "float64"}},
        ),
        (
            f"the dtype {tm.ENDIAN}U8 is not supported for parsing",
            {"dtype": {"A": "U8"}},
        ),
    ],
    ids=["dt64-0", "dt64-1", "td64", f"{tm.ENDIAN}U8"],
)
def test_unsupported_dtype(c_parser_only, match, kwargs):
    parser = c_parser_only
    df = DataFrame(
        np.random.rand(5, 2), columns=list("AB"), index=["1A", "1B", "1C", "1D", "1E"]
    )

    with tm.ensure_clean("__unsupported_dtype__.csv") as path:
        df.to_csv(path)

        with pytest.raises(TypeError, match=match):
            parser.read_csv(path, index_col=0, **kwargs)
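

# What the "pass this column using parse_dates instead" message above points
# at, as a minimal sketch: a hypothetical helper (not collected by pytest)
# that uses only documented read_csv arguments.
def _parse_dates_alternative():
    import pandas as pd

    data = "A,B\n2000-01-01,1.5\n2000-01-02,2.5"
    # Datetimes go through parse_dates; dtype still covers the numeric column.
    return pd.read_csv(StringIO(data), parse_dates=["A"], dtype={"B": "float64"})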


@td.skip_if_32bit
@pytest.mark.slow
def test_precise_conversion(c_parser_only):
    parser = c_parser_only

    normal_errors = []
    precise_errors = []

    def error(val: float, actual_val: Decimal) -> Decimal:
        return abs(Decimal(f"{val:.100}") - actual_val)

    # test numbers between 1 and 2
    for num in np.linspace(1.0, 2.0, num=500):
        # 25 decimal digits of precision
        text = f"a\n{num:.25}"

        normal_val = float(
            parser.read_csv(StringIO(text), float_precision="legacy")["a"][0]
        )
        precise_val = float(
            parser.read_csv(StringIO(text), float_precision="high")["a"][0]
        )
        roundtrip_val = float(
            parser.read_csv(StringIO(text), float_precision="round_trip")["a"][0]
        )
        actual_val = Decimal(text[2:])

        normal_errors.append(error(normal_val, actual_val))
        precise_errors.append(error(precise_val, actual_val))

        # round-trip should match float()
        assert roundtrip_val == float(text[2:])

    assert sum(precise_errors) <= sum(normal_errors)
    assert max(precise_errors) <= max(normal_errors)
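

# A minimal sketch of how the three float_precision modes compare on a single
# value: a hypothetical helper (not collected by pytest); per the assertion
# above, "round_trip" is the mode expected to match Python's float() exactly.
def _float_precision_demo():
    import pandas as pd

    text = "a\n1.2345678901234567890123456"
    results = {
        mode: float(
            pd.read_csv(StringIO(text), float_precision=mode, engine="c")["a"][0]
        )
        for mode in ("legacy", "high", "round_trip")
    }
    assert results["round_trip"] == float(text[2:])
    return results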


def test_usecols_dtypes(c_parser_only):
    parser = c_parser_only
    data = """\
1,2,3
4,5,6
7,8,9
10,11,12"""

    result = parser.read_csv(
        StringIO(data),
        usecols=(0, 1, 2),
        names=("a", "b", "c"),
        header=None,
        converters={"a": str},
        dtype={"b": int, "c": float},
    )
    result2 = parser.read_csv(
        StringIO(data),
        usecols=(0, 2),
        names=("a", "b", "c"),
        header=None,
        converters={"a": str},
        dtype={"b": int, "c": float},
    )

    assert (result.dtypes == [object, int, float]).all()
    assert (result2.dtypes == [object, float]).all()


def test_disable_bool_parsing(c_parser_only):
    # see gh-2090
    parser = c_parser_only
    data = """A,B,C
Yes,No,Yes
No,Yes,Yes
Yes,,Yes
No,No,No"""

    result = parser.read_csv(StringIO(data), dtype=object)
    assert (result.dtypes == object).all()

    result = parser.read_csv(StringIO(data), dtype=object, na_filter=False)
    assert result["B"][2] == ""


def test_custom_lineterminator(c_parser_only):
    parser = c_parser_only
    data = "a,b,c~1,2,3~4,5,6"

    result = parser.read_csv(StringIO(data), lineterminator="~")
    expected = parser.read_csv(StringIO(data.replace("~", "\n")))

    tm.assert_frame_equal(result, expected)


def test_parse_ragged_csv(c_parser_only):
    parser = c_parser_only
    data = """1,2,3
1,2,3,4
1,2,3,4,5
1,2
1,2,3,4"""

    nice_data = """1,2,3,,
1,2,3,4,
1,2,3,4,5
1,2,,,
1,2,3,4,"""
    result = parser.read_csv(
        StringIO(data), header=None, names=["a", "b", "c", "d", "e"]
    )
    expected = parser.read_csv(
        StringIO(nice_data), header=None, names=["a", "b", "c", "d", "e"]
    )
    tm.assert_frame_equal(result, expected)

    # too many columns: would cause a segfault if not handled carefully
    data = "1,2\n3,4,5"

    result = parser.read_csv(StringIO(data), header=None, names=range(50))
    expected = parser.read_csv(StringIO(data), header=None, names=range(3)).reindex(
        columns=range(50)
    )
    tm.assert_frame_equal(result, expected)


def test_tokenize_CR_with_quoting(c_parser_only):
    # see gh-3453
    parser = c_parser_only
    data = ' a,b,c\r"a,b","e,d","f,f"'

    result = parser.read_csv(StringIO(data), header=None)
    expected = parser.read_csv(StringIO(data.replace("\r", "\n")), header=None)
    tm.assert_frame_equal(result, expected)

    result = parser.read_csv(StringIO(data))
    expected = parser.read_csv(StringIO(data.replace("\r", "\n")))
    tm.assert_frame_equal(result, expected)


@pytest.mark.slow
def test_grow_boundary_at_cap(c_parser_only):
    # See gh-12494
    #
    # Cause of error was that the C parser was not increasing the buffer
    # size when the desired space would fill the buffer to capacity, which
    # would later cause a buffer overflow error when checking the EOF
    # terminator of the CSV stream.
    parser = c_parser_only

    def test_empty_header_read(count):
        with StringIO("," * count) as s:
            expected = DataFrame(columns=[f"Unnamed: {i}" for i in range(count + 1)])
            df = parser.read_csv(s)
        tm.assert_frame_equal(df, expected)

    for cnt in range(1, 101):
        test_empty_header_read(cnt)


def test_parse_trim_buffers(c_parser_only):
    # This test is part of a bugfix for gh-13703. It attempts to stress the
    # system memory allocator, to cause it to move the stream buffer and
    # either let the OS reclaim the region, or let other memory requests of
    # the parser otherwise modify the contents of the memory space where it
    # was formerly located.
    # This test is designed to cause a `segfault` with unpatched
    # `tokenizer.c`. Sometimes the test fails on `segfault`, other
    # times it fails due to memory corruption, which causes the
    # loaded DataFrame to differ from the expected one.
    parser = c_parser_only

    # Generate a large mixed-type CSV file on-the-fly (one record is
    # approx 1.5KiB).
    record_ = (
        """9999-9,99:99,,,,ZZ,ZZ,,,ZZZ-ZZZZ,.Z-ZZZZ,-9.99,,,9.99,Z"""
        """ZZZZ,,-99,9,ZZZ-ZZZZ,ZZ-ZZZZ,,9.99,ZZZ-ZZZZZ,ZZZ-ZZZZZ,"""
        """ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,9"""
        """99,ZZZ-ZZZZ,,ZZ-ZZZZ,,,,,ZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,,,9,9,"""
        """9,9,99,99,999,999,ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZ,9,ZZ-ZZZZ,9."""
        """99,ZZ-ZZZZ,ZZ-ZZZZ,,,,ZZZZ,,,ZZ,ZZ,,,,,,,,,,,,,9,,,999."""
        """99,999.99,,,ZZZZZ,,,Z9,,,,,,,ZZZ,ZZZ,,,,,,,,,,,ZZZZZ,ZZ"""
        """ZZZ,ZZZ-ZZZZZZ,ZZZ-ZZZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZZZ,ZZ-ZZ"""
        """ZZ,,,999999,999999,ZZZ,ZZZ,,,ZZZ,ZZZ,999.99,999.99,,,,Z"""
        """ZZ-ZZZ,ZZZ-ZZZ,-9.99,-9.99,9,9,,99,,9.99,9.99,9,9,9.99,"""
        """9.99,,,,9.99,9.99,,99,,99,9.99,9.99,,,ZZZ,ZZZ,,999.99,,"""
        """999.99,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,ZZZZZ,ZZZZZ,ZZZ,ZZZ,9,9,"""
        """,,,,,ZZZ-ZZZZ,ZZZ999Z,,,999.99,,999.99,ZZZ-ZZZZ,,,9.999"""
        """,9.999,9.999,9.999,-9.999,-9.999,-9.999,-9.999,9.999,9."""
        """999,9.999,9.999,9.999,9.999,9.999,9.999,99999,ZZZ-ZZZZ,"""
        """,9.99,ZZZ,,,,,,,,ZZZ,,,,,9,,,,9,,,,,,,,,,ZZZ-ZZZZ,ZZZ-Z"""
        """ZZZ,,ZZZZZ,ZZZZZ,ZZZZZ,ZZZZZ,,,9.99,,ZZ-ZZZZ,ZZ-ZZZZ,ZZ"""
        """,999,,,,ZZ-ZZZZ,ZZZ,ZZZ,ZZZ-ZZZZ,ZZZ-ZZZZ,,,99.99,99.99"""
        """,,,9.99,9.99,9.99,9.99,ZZZ-ZZZZ,,,ZZZ-ZZZZZ,,,,,-9.99,-"""
        """9.99,-9.99,-9.99,,,,,,,,,ZZZ-ZZZZ,,9,9.99,9.99,99ZZ,,-9"""
        """.99,-9.99,ZZZ-ZZZZ,,,,,,,ZZZ-ZZZZ,9.99,9.99,9999,,,,,,,"""
        """,,,-9.9,Z/Z-ZZZZ,999.99,9.99,,999.99,ZZ-ZZZZ,ZZ-ZZZZ,9."""
        """99,9.99,9.99,9.99,9.99,9.99,,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ-ZZ"""
        """ZZZ,ZZZ-ZZZZZ,ZZZ-ZZZZZ,ZZZ,ZZZ,ZZZ,ZZZ,9.99,,,-9.99,ZZ"""
        """-ZZZZ,-999.99,,-9999,,999.99,,,,999.99,99.99,,,ZZ-ZZZZZ"""
        """ZZZ,ZZ-ZZZZ-ZZZZZZZ,,,,ZZ-ZZ-ZZZZZZZZ,ZZZZZZZZ,ZZZ-ZZZZ"""
        """,9999,999.99,ZZZ-ZZZZ,-9.99,-9.99,ZZZ-ZZZZ,99:99:99,,99"""
        """,99,,9.99,,-99.99,,,,,,9.99,ZZZ-ZZZZ,-9.99,-9.99,9.99,9"""
        """.99,,ZZZ,,,,,,,ZZZ,ZZZ,,,,,"""
    )

    # Set the number of lines so that a call to `parser_trim_buffers`
    # is triggered: after a couple of full chunks are consumed a
    # relatively small 'residual' chunk would cause reallocation
    # within the parser.
    chunksize, n_lines = 128, 2 * 128 + 15
    csv_data = "\n".join([record_] * n_lines) + "\n"

    # We will use StringIO to load the CSV from this text buffer.
    # pd.read_csv() will iterate over the file in chunks and will
    # finally read a residual chunk of very small size.

    # Generate the expected output: manually create the dataframe
    # by splitting by comma and repeating it `n_lines` times.
    row = tuple(val_ if val_ else np.nan for val_ in record_.split(","))
    expected = DataFrame(
        [row for _ in range(n_lines)], dtype=object, columns=None, index=None
    )

    # Iterate over the CSV file in chunks of `chunksize` lines
    with parser.read_csv(
        StringIO(csv_data), header=None, dtype=object, chunksize=chunksize
    ) as chunks_:
        result = concat(chunks_, axis=0, ignore_index=True)

    # Check for data corruption if there was no segfault
    tm.assert_frame_equal(result, expected)

    # This extra test was added to replicate the fault in gh-5291.
    # Force 'utf-8' encoding, so that `_string_convert` would take
    # a different execution branch.
    with parser.read_csv(
        StringIO(csv_data),
        header=None,
        dtype=object,
        chunksize=chunksize,
        encoding="utf_8",
    ) as chunks_:
        result = concat(chunks_, axis=0, ignore_index=True)
    tm.assert_frame_equal(result, expected)
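

# Worked example of the chunk arithmetic used above: a hypothetical helper
# (not collected by pytest) showing that chunksize=128 and n_lines=271 yield
# chunks of 128, 128 and 15 rows; the short residual chunk is what exercises
# the trim/realloc path in `tokenizer.c`.
def _trim_buffers_chunk_sizes(chunksize=128, n_lines=2 * 128 + 15):
    full, residual = divmod(n_lines, chunksize)
    assert (full, residual) == (2, 15)
    return [chunksize] * full + ([residual] if residual else [])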


def test_internal_null_byte(c_parser_only):
    # see gh-14012
    #
    # The null byte ('\x00') should not be used as a
    # true line terminator, escape character, or comment
    # character, only as a placeholder to indicate that
    # none was specified.
    #
    # This test should be moved to test_common.py ONLY when
    # Python's csv class supports parsing '\x00'.
    parser = c_parser_only

    names = ["a", "b", "c"]
    data = "1,2,3\n4,\x00,6\n7,8,9"
    expected = DataFrame([[1, 2.0, 3], [4, np.nan, 6], [7, 8, 9]], columns=names)

    result = parser.read_csv(StringIO(data), names=names)
    tm.assert_frame_equal(result, expected)
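

# Why the test above is pinned to the C engine: the stdlib csv module, which
# backs the Python engine, has historically rejected NUL bytes with a
# "line contains NUL" error. A minimal sketch as a hypothetical helper (not
# collected by pytest); the exact behavior may vary across CPython versions.
def _stdlib_csv_nul_behavior():
    import csv

    try:
        return list(csv.reader(StringIO("1,2,3\n4,\x00,6")))
    except csv.Error as err:  # e.g. "line contains NUL" on older CPython
        return err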


def test_read_nrows_large(c_parser_only):
    # gh-7626: read only nrows of data for large inputs (>262144 bytes)
    parser = c_parser_only
    header_narrow = "\t".join(["COL_HEADER_" + str(i) for i in range(10)]) + "\n"
    data_narrow = "\t".join(["somedatasomedatasomedata1" for _ in range(10)]) + "\n"
    header_wide = "\t".join(["COL_HEADER_" + str(i) for i in range(15)]) + "\n"
    data_wide = "\t".join(["somedatasomedatasomedata2" for _ in range(15)]) + "\n"
    # 1050 narrow rows at 260 bytes each push the input well past the
    # 262144-byte buffer, so reading must stop at nrows before the wide rows.
    test_input = header_narrow + data_narrow * 1050 + header_wide + data_wide * 2

    df = parser.read_csv(StringIO(test_input), sep="\t", nrows=1010)

    assert df.size == 1010 * 10


def test_float_precision_round_trip_with_text(c_parser_only):
    # see gh-15140
    parser = c_parser_only
    df = parser.read_csv(StringIO("a"), header=None, float_precision="round_trip")
    tm.assert_frame_equal(df, DataFrame({0: ["a"]}))


def test_large_difference_in_columns(c_parser_only):
    # see gh-14125
    parser = c_parser_only

    count = 10000
    large_row = ("X," * count)[:-1] + "\n"
    normal_row = "XXXXXX XXXXXX,111111111111111\n"
    test_input = (large_row + normal_row * 6)[:-1]

    result = parser.read_csv(StringIO(test_input), header=None, usecols=[0])

    rows = test_input.split("\n")
    expected = DataFrame([row.split(",")[0] for row in rows])
    tm.assert_frame_equal(result, expected)


def test_data_after_quote(c_parser_only):
    # see gh-15910
    parser = c_parser_only

    data = 'a\n1\n"b"a'

    result = parser.read_csv(StringIO(data))
    expected = DataFrame({"a": ["1", "ba"]})

    tm.assert_frame_equal(result, expected)


def test_comment_whitespace_delimited(c_parser_only, capsys):
    parser = c_parser_only
    test_input = """\
1 2
2 2 3
3 2 3 # 3 fields
4 2 3# 3 fields
5 2 # 2 fields
6 2# 2 fields
7 # 1 field, NaN
8# 1 field, NaN
9 2 3 # skipped line
# comment"""
    df = parser.read_csv(
        StringIO(test_input),
        comment="#",
        header=None,
        delimiter="\\s+",
        skiprows=0,
        on_bad_lines="warn",
    )
    captured = capsys.readouterr()
    # skipped lines 2, 3, 4, 9
    for line_num in (2, 3, 4, 9):
        assert f"Skipping line {line_num}" in captured.err
    expected = DataFrame([[1, 2], [5, 2], [6, 2], [7, np.nan], [8, np.nan]])
    tm.assert_frame_equal(df, expected)


def test_file_like_no_next(c_parser_only):
    # gh-16530: the file-like need not have a "next" or "__next__"
    # attribute despite having an "__iter__" attribute.
    #
    # NOTE: This is only true for the C engine, not the Python engine.
    class NoNextBuffer(StringIO):
        def __next__(self):
            raise AttributeError("No next method")

        next = __next__

    parser = c_parser_only
    data = "a\n1"

    expected = DataFrame({"a": [1]})
    result = parser.read_csv(NoNextBuffer(data))

    tm.assert_frame_equal(result, expected)


def test_buffer_rd_bytes_bad_unicode(c_parser_only):
    # see gh-22748
    t = BytesIO(b"\xB0")
    t = TextIOWrapper(t, encoding="ascii", errors="surrogateescape")
    msg = "'utf-8' codec can't encode character"
    with pytest.raises(UnicodeError, match=msg):
        c_parser_only.read_csv(t, encoding="UTF-8")


@pytest.mark.parametrize("tar_suffix", [".tar", ".tar.gz"])
def test_read_tarfile(c_parser_only, csv_dir_path, tar_suffix):
    # see gh-16530
    #
    # Unfortunately, Python's CSV library can't handle
    # tarfile objects (expects string, not bytes when
    # iterating through a file-like).
    parser = c_parser_only
    tar_path = os.path.join(csv_dir_path, "tar_csv" + tar_suffix)

    with tarfile.open(tar_path, "r") as tar:
        data_file = tar.extractfile("tar_data.csv")

        out = parser.read_csv(data_file)
        expected = DataFrame({"a": [1]})
        tm.assert_frame_equal(out, expected)


@pytest.mark.single_cpu
@pytest.mark.skipif(is_ci_environment(), reason="Too memory intensive for CI.")
def test_bytes_exceed_2gb(c_parser_only):
    # see gh-16798
    #
    # Read from a "CSV" that has a column larger than 2GB.
    parser = c_parser_only

    if parser.low_memory:
        pytest.skip("not a low_memory test")

    csv = StringIO("strings\n" + "\n".join(["x" * (1 << 20) for _ in range(2100)]))
    df = parser.read_csv(csv)
    assert not df.empty


def test_chunk_whitespace_on_boundary(c_parser_only):
    # see gh-9735: this issue is C parser-specific (bug when
    # parsing whitespace and characters at chunk boundary)
    #
    # This test case has a field too large for the Python parser / CSV library.
    parser = c_parser_only

    chunk1 = "a" * (1024 * 256 - 2) + "\na"
    chunk2 = "\n a"
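    # Worked arithmetic: "a" * (1024 * 256 - 2), the newline, and the single
    # trailing "a" add up to exactly 262144 bytes (256 KiB), so the first
    # chunk ends right at the buffer boundary and chunk2's leading whitespace
    # opens the next one.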
    result = parser.read_csv(StringIO(chunk1 + chunk2), header=None)

    expected = DataFrame(["a" * (1024 * 256 - 2), "a", " a"])
    tm.assert_frame_equal(result, expected)


def test_file_handles_mmap(c_parser_only, csv1):
    # gh-14418
    #
    # Don't close user-provided file handles.
    parser = c_parser_only

    with open(csv1) as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as m:
            parser.read_csv(m)
            assert not m.closed


def test_file_binary_mode(c_parser_only):
    # see gh-23779
    parser = c_parser_only
    expected = DataFrame([[1, 2, 3], [4, 5, 6]])

    with tm.ensure_clean() as path:
        with open(path, "w") as f:
            f.write("1,2,3\n4,5,6")

        with open(path, "rb") as f:
            result = parser.read_csv(f, header=None)
            tm.assert_frame_equal(result, expected)


def test_unix_style_breaks(c_parser_only):
    # GH 11020
    parser = c_parser_only
    with tm.ensure_clean() as path:
        with open(path, "w", newline="\n") as f:
            f.write("blah\n\ncol_1,col_2,col_3\n\n")
        result = parser.read_csv(path, skiprows=2, encoding="utf-8", engine="c")
    expected = DataFrame(columns=["col_1", "col_2", "col_3"])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("float_precision", [None, "legacy", "high", "round_trip"])
@pytest.mark.parametrize(
    "data,thousands,decimal",
    [
        (
            """A|B|C
1|2,334.01|5
10|13|10.
""",
            ",",
            ".",
        ),
        (
            """A|B|C
1|2.334,01|5
10|13|10,
""",
            ".",
            ",",
        ),
    ],
)
def test_1000_sep_with_decimal(
    c_parser_only, data, thousands, decimal, float_precision
):
    parser = c_parser_only
    expected = DataFrame({"A": [1, 10], "B": [2334.01, 13], "C": [5, 10.0]})

    result = parser.read_csv(
        StringIO(data),
        sep="|",
        thousands=thousands,
        decimal=decimal,
        float_precision=float_precision,
    )
    tm.assert_frame_equal(result, expected)


def test_float_precision_options(c_parser_only):
    # GH 17154, 36228
    parser = c_parser_only
    s = "foo\n243.164\n"
    df = parser.read_csv(StringIO(s))
    df2 = parser.read_csv(StringIO(s), float_precision="high")
    tm.assert_frame_equal(df, df2)

    df3 = parser.read_csv(StringIO(s), float_precision="legacy")
    if IS64:
        assert not df.iloc[0, 0] == df3.iloc[0, 0]
    else:
        assert df.iloc[0, 0] == df3.iloc[0, 0]

    msg = "Unrecognized float_precision option: junk"
    with pytest.raises(ValueError, match=msg):
        parser.read_csv(StringIO(s), float_precision="junk")