test_common_basic.py 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864
  1. """
  2. Tests that work on both the Python and C engines but do not have a
  3. specific classification into the other test modules.
  4. """
  5. from datetime import datetime
  6. from inspect import signature
  7. from io import StringIO
  8. import os
  9. from pathlib import Path
  10. import sys
  11. import numpy as np
  12. import pytest
  13. from pandas.errors import (
  14. EmptyDataError,
  15. ParserError,
  16. ParserWarning,
  17. )
  18. from pandas import (
  19. DataFrame,
  20. Index,
  21. Timestamp,
  22. compat,
  23. )
  24. import pandas._testing as tm
  25. from pandas.io.parsers import TextFileReader
  26. from pandas.io.parsers.c_parser_wrapper import CParserWrapper
  27. xfail_pyarrow = pytest.mark.usefixtures("pyarrow_xfail")
  28. skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
def test_override_set_noconvert_columns():
    # see gh-17351
    #
    # Usecols needs to be sorted in _set_noconvert_columns based
    # on the test_usecols_with_parse_dates test from test_usecols.py
    class MyTextFileReader(TextFileReader):
        """TextFileReader stub whose __init__ skips file handling entirely."""

        def __init__(self) -> None:
            self._currow = 0
            self.squeeze = False

    class MyCParserWrapper(CParserWrapper):
        """CParserWrapper that feeds ``usecols`` to the base class reversed."""

        def _set_noconvert_columns(self):
            if self.usecols_dtype == "integer":
                # self.usecols is a set, which is documented as unordered
                # but in practice, a CPython set of integers is sorted.
                # In other implementations this assumption does not hold.
                # The following code simulates a different order, which
                # before GH 17351 would cause the wrong columns to be
                # converted via the parse_dates parameter
                self.usecols = list(self.usecols)
                self.usecols.reverse()
            return CParserWrapper._set_noconvert_columns(self)

    data = """a,b,c,d,e
0,1,2014-01-01,09:00,4
0,1,2014-01-02,10:00,4"""

    parse_dates = [[1, 2]]
    cols = {
        "a": [0, 0],
        "c_d": [Timestamp("2014-01-01 09:00:00"), Timestamp("2014-01-02 10:00:00")],
    }
    expected = DataFrame(cols, columns=["c_d", "a"])

    # Wire the pieces together by hand instead of going through read_csv,
    # so the custom wrapper above is actually used as the engine.
    parser = MyTextFileReader()
    parser.options = {
        "usecols": [0, 2, 3],
        "parse_dates": parse_dates,
        "delimiter": ",",
    }
    parser.engine = "c"
    parser._engine = MyCParserWrapper(StringIO(data), **parser.options)

    result = parser.read()
    tm.assert_frame_equal(result, expected)
def test_read_csv_local(all_parsers, csv1):
    # Reading through a file:// URL must match reading the plain path.
    prefix = "file:///" if compat.is_platform_windows() else "file://"
    parser = all_parsers
    fname = prefix + str(os.path.abspath(csv1))
    result = parser.read_csv(fname, index_col=0, parse_dates=True)
    # Expected values mirror the csv1 fixture file verbatim.
    expected = DataFrame(
        [
            [0.980269, 3.685731, -0.364216805298, -1.159738],
            [1.047916, -0.041232, -0.16181208307, 0.212549],
            [0.498581, 0.731168, -0.537677223318, 1.346270],
            [1.120202, 1.567621, 0.00364077397681, 0.675253],
            [-0.487094, 0.571455, -1.6116394093, 0.103469],
            [0.836649, 0.246462, 0.588542635376, 1.062782],
            [-0.157161, 1.340307, 1.1957779562, -1.097007],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(
            [
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5),
                datetime(2000, 1, 6),
                datetime(2000, 1, 7),
                datetime(2000, 1, 10),
                datetime(2000, 1, 11),
            ],
            name="index",
        ),
    )
    tm.assert_frame_equal(result, expected)
  99. @xfail_pyarrow
  100. def test_1000_sep(all_parsers):
  101. parser = all_parsers
  102. data = """A|B|C
  103. 1|2,334|5
  104. 10|13|10.
  105. """
  106. expected = DataFrame({"A": [1, 10], "B": [2334, 13], "C": [5, 10.0]})
  107. result = parser.read_csv(StringIO(data), sep="|", thousands=",")
  108. tm.assert_frame_equal(result, expected)
  109. @xfail_pyarrow
  110. def test_unnamed_columns(all_parsers):
  111. data = """A,B,C,,
  112. 1,2,3,4,5
  113. 6,7,8,9,10
  114. 11,12,13,14,15
  115. """
  116. parser = all_parsers
  117. expected = DataFrame(
  118. [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]],
  119. dtype=np.int64,
  120. columns=["A", "B", "C", "Unnamed: 3", "Unnamed: 4"],
  121. )
  122. result = parser.read_csv(StringIO(data))
  123. tm.assert_frame_equal(result, expected)
  124. def test_csv_mixed_type(all_parsers):
  125. data = """A,B,C
  126. a,1,2
  127. b,3,4
  128. c,4,5
  129. """
  130. parser = all_parsers
  131. expected = DataFrame({"A": ["a", "b", "c"], "B": [1, 3, 4], "C": [2, 4, 5]})
  132. result = parser.read_csv(StringIO(data))
  133. tm.assert_frame_equal(result, expected)
  134. @xfail_pyarrow
  135. def test_read_csv_low_memory_no_rows_with_index(all_parsers):
  136. # see gh-21141
  137. parser = all_parsers
  138. if not parser.low_memory:
  139. pytest.skip("This is a low-memory specific test")
  140. data = """A,B,C
  141. 1,1,1,2
  142. 2,2,3,4
  143. 3,3,4,5
  144. """
  145. result = parser.read_csv(StringIO(data), low_memory=True, index_col=0, nrows=0)
  146. expected = DataFrame(columns=["A", "B", "C"])
  147. tm.assert_frame_equal(result, expected)
def test_read_csv_dataframe(all_parsers, csv1):
    # Basic round-trip of the csv1 fixture file with a parsed datetime index.
    parser = all_parsers
    result = parser.read_csv(csv1, index_col=0, parse_dates=True)
    # Expected values mirror the csv1 fixture file verbatim.
    expected = DataFrame(
        [
            [0.980269, 3.685731, -0.364216805298, -1.159738],
            [1.047916, -0.041232, -0.16181208307, 0.212549],
            [0.498581, 0.731168, -0.537677223318, 1.346270],
            [1.120202, 1.567621, 0.00364077397681, 0.675253],
            [-0.487094, 0.571455, -1.6116394093, 0.103469],
            [0.836649, 0.246462, 0.588542635376, 1.062782],
            [-0.157161, 1.340307, 1.1957779562, -1.097007],
        ],
        columns=["A", "B", "C", "D"],
        index=Index(
            [
                datetime(2000, 1, 3),
                datetime(2000, 1, 4),
                datetime(2000, 1, 5),
                datetime(2000, 1, 6),
                datetime(2000, 1, 7),
                datetime(2000, 1, 10),
                datetime(2000, 1, 11),
            ],
            name="index",
        ),
    )
    tm.assert_frame_equal(result, expected)
  176. @xfail_pyarrow
  177. @pytest.mark.parametrize("nrows", [3, 3.0])
  178. def test_read_nrows(all_parsers, nrows):
  179. # see gh-10476
  180. data = """index,A,B,C,D
  181. foo,2,3,4,5
  182. bar,7,8,9,10
  183. baz,12,13,14,15
  184. qux,12,13,14,15
  185. foo2,12,13,14,15
  186. bar2,12,13,14,15
  187. """
  188. expected = DataFrame(
  189. [["foo", 2, 3, 4, 5], ["bar", 7, 8, 9, 10], ["baz", 12, 13, 14, 15]],
  190. columns=["index", "A", "B", "C", "D"],
  191. )
  192. parser = all_parsers
  193. result = parser.read_csv(StringIO(data), nrows=nrows)
  194. tm.assert_frame_equal(result, expected)
  195. @xfail_pyarrow
  196. @pytest.mark.parametrize("nrows", [1.2, "foo", -1])
  197. def test_read_nrows_bad(all_parsers, nrows):
  198. data = """index,A,B,C,D
  199. foo,2,3,4,5
  200. bar,7,8,9,10
  201. baz,12,13,14,15
  202. qux,12,13,14,15
  203. foo2,12,13,14,15
  204. bar2,12,13,14,15
  205. """
  206. msg = r"'nrows' must be an integer >=0"
  207. parser = all_parsers
  208. with pytest.raises(ValueError, match=msg):
  209. parser.read_csv(StringIO(data), nrows=nrows)
  210. def test_nrows_skipfooter_errors(all_parsers):
  211. msg = "'skipfooter' not supported with 'nrows'"
  212. data = "a\n1\n2\n3\n4\n5\n6"
  213. parser = all_parsers
  214. with pytest.raises(ValueError, match=msg):
  215. parser.read_csv(StringIO(data), skipfooter=1, nrows=5)
  216. @xfail_pyarrow
  217. def test_missing_trailing_delimiters(all_parsers):
  218. parser = all_parsers
  219. data = """A,B,C,D
  220. 1,2,3,4
  221. 1,3,3,
  222. 1,4,5"""
  223. result = parser.read_csv(StringIO(data))
  224. expected = DataFrame(
  225. [[1, 2, 3, 4], [1, 3, 3, np.nan], [1, 4, 5, np.nan]],
  226. columns=["A", "B", "C", "D"],
  227. )
  228. tm.assert_frame_equal(result, expected)
@xfail_pyarrow
def test_skip_initial_space(all_parsers):
    # skipinitialspace=True strips the padding after each delimiter so the
    # quoted strings and numbers below parse cleanly.
    data = (
        '"09-Apr-2012", "01:10:18.300", 2456026.548822908, 12849, '
        "1.00361, 1.12551, 330.65659, 0355626618.16711, 73.48821, "
        "314.11625, 1917.09447, 179.71425, 80.000, 240.000, -350, "
        "70.06056, 344.98370, 1, 1, -0.689265, -0.692787, "
        "0.212036, 14.7674, 41.605, -9999.0, -9999.0, "
        "-9999.0, -9999.0, -9999.0, -9999.0, 000, 012, 128"
    )
    parser = all_parsers
    result = parser.read_csv(
        StringIO(data),
        names=list(range(33)),
        header=None,
        na_values=["-9999.0"],
        skipinitialspace=True,
    )
    # Note the numeric coercions reflected below: "0355626618.16711" parses
    # as the float 355626618.16711, "000"/"012" as the ints 0/12, and every
    # -9999.0 sentinel becomes NaN via na_values.
    expected = DataFrame(
        [
            [
                "09-Apr-2012",
                "01:10:18.300",
                2456026.548822908,
                12849,
                1.00361,
                1.12551,
                330.65659,
                355626618.16711,
                73.48821,
                314.11625,
                1917.09447,
                179.71425,
                80.0,
                240.0,
                -350,
                70.06056,
                344.9837,
                1,
                1,
                -0.689265,
                -0.692787,
                0.212036,
                14.7674,
                41.605,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                np.nan,
                0,
                12,
                128,
            ]
        ]
    )
    tm.assert_frame_equal(result, expected)
  287. @xfail_pyarrow
  288. def test_trailing_delimiters(all_parsers):
  289. # see gh-2442
  290. data = """A,B,C
  291. 1,2,3,
  292. 4,5,6,
  293. 7,8,9,"""
  294. parser = all_parsers
  295. result = parser.read_csv(StringIO(data), index_col=False)
  296. expected = DataFrame({"A": [1, 4, 7], "B": [2, 5, 8], "C": [3, 6, 9]})
  297. tm.assert_frame_equal(result, expected)
  298. def test_escapechar(all_parsers):
  299. # https://stackoverflow.com/questions/13824840/feature-request-for-
  300. # pandas-read-csv
  301. data = '''SEARCH_TERM,ACTUAL_URL
  302. "bra tv board","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
  303. "tv p\xc3\xa5 hjul","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"
  304. "SLAGBORD, \\"Bergslagen\\", IKEA:s 1700-tals series","http://www.ikea.com/se/sv/catalog/categories/departments/living_room/10475/?se%7cps%7cnonbranded%7cvardagsrum%7cgoogle%7ctv_bord"''' # noqa:E501
  305. parser = all_parsers
  306. result = parser.read_csv(
  307. StringIO(data), escapechar="\\", quotechar='"', encoding="utf-8"
  308. )
  309. assert result["SEARCH_TERM"][2] == 'SLAGBORD, "Bergslagen", IKEA:s 1700-tals series'
  310. tm.assert_index_equal(result.columns, Index(["SEARCH_TERM", "ACTUAL_URL"]))
  311. @xfail_pyarrow
  312. def test_ignore_leading_whitespace(all_parsers):
  313. # see gh-3374, gh-6607
  314. parser = all_parsers
  315. data = " a b c\n 1 2 3\n 4 5 6\n 7 8 9"
  316. result = parser.read_csv(StringIO(data), sep=r"\s+")
  317. expected = DataFrame({"a": [1, 4, 7], "b": [2, 5, 8], "c": [3, 6, 9]})
  318. tm.assert_frame_equal(result, expected)
  319. @xfail_pyarrow
  320. @pytest.mark.parametrize("usecols", [None, [0, 1], ["a", "b"]])
  321. def test_uneven_lines_with_usecols(all_parsers, usecols):
  322. # see gh-12203
  323. parser = all_parsers
  324. data = r"""a,b,c
  325. 0,1,2
  326. 3,4,5,6,7
  327. 8,9,10"""
  328. if usecols is None:
  329. # Make sure that an error is still raised
  330. # when the "usecols" parameter is not provided.
  331. msg = r"Expected \d+ fields in line \d+, saw \d+"
  332. with pytest.raises(ParserError, match=msg):
  333. parser.read_csv(StringIO(data))
  334. else:
  335. expected = DataFrame({"a": [0, 3, 8], "b": [1, 4, 9]})
  336. result = parser.read_csv(StringIO(data), usecols=usecols)
  337. tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize(
    "data,kwargs,expected",
    [
        # First, check to see that the response of parser when faced with no
        # provided columns raises the correct error, with or without usecols.
        ("", {}, None),
        ("", {"usecols": ["X"]}, None),
        (
            ",,",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"], index=[0], dtype=np.float64),
        ),
        (
            "",
            {"names": ["Dummy", "X", "Dummy_2"], "usecols": ["X"]},
            DataFrame(columns=["X"]),
        ),
    ],
)
def test_read_empty_with_usecols(all_parsers, data, kwargs, expected):
    # see gh-12493
    parser = all_parsers
    if expected is None:
        # expected=None marks the parametrized cases that must raise.
        msg = "No columns to parse from file"
        with pytest.raises(EmptyDataError, match=msg):
            parser.read_csv(StringIO(data), **kwargs)
    else:
        result = parser.read_csv(StringIO(data), **kwargs)
        tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize(
    "kwargs,expected",
    [
        # gh-8661, gh-8679: this should ignore six lines, including
        # lines with trailing whitespace and blank lines.
        (
            {
                "header": None,
                "delim_whitespace": True,
                "skiprows": [0, 1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame([[1.0, 2.0, 4.0], [5.1, np.nan, 10.0]]),
        ),
        # gh-8983: test skipping set of rows after a row with trailing spaces.
        (
            {
                "delim_whitespace": True,
                "skiprows": [1, 2, 3, 5, 6],
                "skip_blank_lines": True,
            },
            DataFrame({"A": [1.0, 5.1], "B": [2.0, np.nan], "C": [4.0, 10]}),
        ),
    ],
)
def test_trailing_spaces(all_parsers, kwargs, expected):
    # The raw data uses commas so the same string can serve the
    # whitespace-delimited cases through the replace() below.
    data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n"  # noqa:E501
    parser = all_parsers
    result = parser.read_csv(StringIO(data.replace(",", " ")), **kwargs)
    tm.assert_frame_equal(result, expected)
  399. def test_raise_on_sep_with_delim_whitespace(all_parsers):
  400. # see gh-6607
  401. data = "a b c\n1 2 3"
  402. parser = all_parsers
  403. with pytest.raises(ValueError, match="you can only specify one"):
  404. parser.read_csv(StringIO(data), sep=r"\s", delim_whitespace=True)
  405. def test_read_filepath_or_buffer(all_parsers):
  406. # see gh-43366
  407. parser = all_parsers
  408. with pytest.raises(TypeError, match="Expected file path name or file-like"):
  409. parser.read_csv(filepath_or_buffer=b"input")
  410. @xfail_pyarrow
  411. @pytest.mark.parametrize("delim_whitespace", [True, False])
  412. def test_single_char_leading_whitespace(all_parsers, delim_whitespace):
  413. # see gh-9710
  414. parser = all_parsers
  415. data = """\
  416. MyColumn
  417. a
  418. b
  419. a
  420. b\n"""
  421. expected = DataFrame({"MyColumn": list("abab")})
  422. result = parser.read_csv(
  423. StringIO(data), skipinitialspace=True, delim_whitespace=delim_whitespace
  424. )
  425. tm.assert_frame_equal(result, expected)
# Skip for now, actually only one test fails though, but its tricky to xfail
@skip_pyarrow
@pytest.mark.parametrize(
    "sep,skip_blank_lines,exp_data",
    [
        (",", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (r"\s+", True, [[1.0, 2.0, 4.0], [5.0, np.nan, 10.0], [-70.0, 0.4, 1.0]]),
        (
            ",",
            False,
            # With skip_blank_lines=False each blank line becomes an
            # all-NaN row in the result.
            [
                [1.0, 2.0, 4.0],
                [np.nan, np.nan, np.nan],
                [np.nan, np.nan, np.nan],
                [5.0, np.nan, 10.0],
                [np.nan, np.nan, np.nan],
                [-70.0, 0.4, 1.0],
            ],
        ),
    ],
)
def test_empty_lines(all_parsers, sep, skip_blank_lines, exp_data):
    parser = all_parsers
    # Two blank lines follow the first record and one follows the second;
    # the skip_blank_lines=False case above depends on exactly that layout.
    data = """\
A,B,C
1,2.,4.


5.,NaN,10.0

-70,.4,1
"""

    if sep == r"\s+":
        data = data.replace(",", "  ")
    result = parser.read_csv(StringIO(data), sep=sep, skip_blank_lines=skip_blank_lines)
    expected = DataFrame(exp_data, columns=["A", "B", "C"])
    tm.assert_frame_equal(result, expected)
  460. @xfail_pyarrow
  461. def test_whitespace_lines(all_parsers):
  462. parser = all_parsers
  463. data = """
  464. \t \t\t
  465. \t
  466. A,B,C
  467. \t 1,2.,4.
  468. 5.,NaN,10.0
  469. """
  470. expected = DataFrame([[1, 2.0, 4.0], [5.0, np.nan, 10.0]], columns=["A", "B", "C"])
  471. result = parser.read_csv(StringIO(data))
  472. tm.assert_frame_equal(result, expected)
@xfail_pyarrow
@pytest.mark.parametrize(
    "data,expected",
    [
        # The header row has one fewer field than the data rows, so the
        # first data column becomes the (implicit) index, per `expected`.
        (
            """ A B C D
a 1 2 3 4
b 1 2 3 4
c 1 2 3 4
""",
            DataFrame(
                [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
                columns=["A", "B", "C", "D"],
                index=["a", "b", "c"],
            ),
        ),
        (
            " a b c\n1 2 3 \n4  5 6\n 7 8 9",
            DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"]),
        ),
    ],
)
def test_whitespace_regex_separator(all_parsers, data, expected):
    # see gh-6607
    parser = all_parsers
    result = parser.read_csv(StringIO(data), sep=r"\s+")
    tm.assert_frame_equal(result, expected)
  500. def test_sub_character(all_parsers, csv_dir_path):
  501. # see gh-16893
  502. filename = os.path.join(csv_dir_path, "sub_char.csv")
  503. expected = DataFrame([[1, 2, 3]], columns=["a", "\x1ab", "c"])
  504. parser = all_parsers
  505. result = parser.read_csv(filename)
  506. tm.assert_frame_equal(result, expected)
  507. @pytest.mark.parametrize("filename", ["sé-es-vé.csv", "ru-sй.csv", "中文文件名.csv"])
  508. def test_filename_with_special_chars(all_parsers, filename):
  509. # see gh-15086.
  510. parser = all_parsers
  511. df = DataFrame({"a": [1, 2, 3]})
  512. with tm.ensure_clean(filename) as path:
  513. df.to_csv(path, index=False)
  514. result = parser.read_csv(path)
  515. tm.assert_frame_equal(result, df)
  516. def test_read_table_same_signature_as_read_csv(all_parsers):
  517. # GH-34976
  518. parser = all_parsers
  519. table_sign = signature(parser.read_table)
  520. csv_sign = signature(parser.read_csv)
  521. assert table_sign.parameters.keys() == csv_sign.parameters.keys()
  522. assert table_sign.return_annotation == csv_sign.return_annotation
  523. for key, csv_param in csv_sign.parameters.items():
  524. table_param = table_sign.parameters[key]
  525. if key == "sep":
  526. assert csv_param.default == ","
  527. assert table_param.default == "\t"
  528. assert table_param.annotation == csv_param.annotation
  529. assert table_param.kind == csv_param.kind
  530. continue
  531. assert table_param == csv_param
  532. def test_read_table_equivalency_to_read_csv(all_parsers):
  533. # see gh-21948
  534. # As of 0.25.0, read_table is undeprecated
  535. parser = all_parsers
  536. data = "a\tb\n1\t2\n3\t4"
  537. expected = parser.read_csv(StringIO(data), sep="\t")
  538. result = parser.read_table(StringIO(data))
  539. tm.assert_frame_equal(result, expected)
  540. @pytest.mark.parametrize("read_func", ["read_csv", "read_table"])
  541. def test_read_csv_and_table_sys_setprofile(all_parsers, read_func):
  542. # GH#41069
  543. parser = all_parsers
  544. data = "a b\n0 1"
  545. sys.setprofile(lambda *a, **k: None)
  546. result = getattr(parser, read_func)(StringIO(data))
  547. sys.setprofile(None)
  548. expected = DataFrame({"a b": ["0 1"]})
  549. tm.assert_frame_equal(result, expected)
  550. @xfail_pyarrow
  551. def test_first_row_bom(all_parsers):
  552. # see gh-26545
  553. parser = all_parsers
  554. data = '''\ufeff"Head1"\t"Head2"\t"Head3"'''
  555. result = parser.read_csv(StringIO(data), delimiter="\t")
  556. expected = DataFrame(columns=["Head1", "Head2", "Head3"])
  557. tm.assert_frame_equal(result, expected)
  558. @xfail_pyarrow
  559. def test_first_row_bom_unquoted(all_parsers):
  560. # see gh-36343
  561. parser = all_parsers
  562. data = """\ufeffHead1\tHead2\tHead3"""
  563. result = parser.read_csv(StringIO(data), delimiter="\t")
  564. expected = DataFrame(columns=["Head1", "Head2", "Head3"])
  565. tm.assert_frame_equal(result, expected)
  566. @xfail_pyarrow
  567. @pytest.mark.parametrize("nrows", range(1, 6))
  568. def test_blank_lines_between_header_and_data_rows(all_parsers, nrows):
  569. # GH 28071
  570. ref = DataFrame(
  571. [[np.nan, np.nan], [np.nan, np.nan], [1, 2], [np.nan, np.nan], [3, 4]],
  572. columns=list("ab"),
  573. )
  574. csv = "\nheader\n\na,b\n\n\n1,2\n\n3,4"
  575. parser = all_parsers
  576. df = parser.read_csv(StringIO(csv), header=3, nrows=nrows, skip_blank_lines=False)
  577. tm.assert_frame_equal(df, ref[:nrows])
  578. @xfail_pyarrow
  579. def test_no_header_two_extra_columns(all_parsers):
  580. # GH 26218
  581. column_names = ["one", "two", "three"]
  582. ref = DataFrame([["foo", "bar", "baz"]], columns=column_names)
  583. stream = StringIO("foo,bar,baz,bam,blah")
  584. parser = all_parsers
  585. df = parser.read_csv_check_warnings(
  586. ParserWarning,
  587. "Length of header or names does not match length of data. "
  588. "This leads to a loss of data with index_col=False.",
  589. stream,
  590. header=None,
  591. names=column_names,
  592. index_col=False,
  593. )
  594. tm.assert_frame_equal(df, ref)
  595. def test_read_csv_names_not_accepting_sets(all_parsers):
  596. # GH 34946
  597. data = """\
  598. 1,2,3
  599. 4,5,6\n"""
  600. parser = all_parsers
  601. with pytest.raises(ValueError, match="Names should be an ordered collection."):
  602. parser.read_csv(StringIO(data), names=set("QAZ"))
  603. @xfail_pyarrow
  604. def test_read_table_delim_whitespace_default_sep(all_parsers):
  605. # GH: 35958
  606. f = StringIO("a b c\n1 -2 -3\n4 5 6")
  607. parser = all_parsers
  608. result = parser.read_table(f, delim_whitespace=True)
  609. expected = DataFrame({"a": [1, 4], "b": [-2, 5], "c": [-3, 6]})
  610. tm.assert_frame_equal(result, expected)
  611. @pytest.mark.parametrize("delimiter", [",", "\t"])
  612. def test_read_csv_delim_whitespace_non_default_sep(all_parsers, delimiter):
  613. # GH: 35958
  614. f = StringIO("a b c\n1 -2 -3\n4 5 6")
  615. parser = all_parsers
  616. msg = (
  617. "Specified a delimiter with both sep and "
  618. "delim_whitespace=True; you can only specify one."
  619. )
  620. with pytest.raises(ValueError, match=msg):
  621. parser.read_csv(f, delim_whitespace=True, sep=delimiter)
  622. with pytest.raises(ValueError, match=msg):
  623. parser.read_csv(f, delim_whitespace=True, delimiter=delimiter)
  624. def test_read_csv_delimiter_and_sep_no_default(all_parsers):
  625. # GH#39823
  626. f = StringIO("a,b\n1,2")
  627. parser = all_parsers
  628. msg = "Specified a sep and a delimiter; you can only specify one."
  629. with pytest.raises(ValueError, match=msg):
  630. parser.read_csv(f, sep=" ", delimiter=".")
  631. @pytest.mark.parametrize("kwargs", [{"delimiter": "\n"}, {"sep": "\n"}])
  632. def test_read_csv_line_break_as_separator(kwargs, all_parsers):
  633. # GH#43528
  634. parser = all_parsers
  635. data = """a,b,c
  636. 1,2,3
  637. """
  638. msg = (
  639. r"Specified \\n as separator or delimiter. This forces the python engine "
  640. r"which does not accept a line terminator. Hence it is not allowed to use "
  641. r"the line terminator as separator."
  642. )
  643. with pytest.raises(ValueError, match=msg):
  644. parser.read_csv(StringIO(data), **kwargs)
  645. @pytest.mark.parametrize("delimiter", [",", "\t"])
  646. def test_read_table_delim_whitespace_non_default_sep(all_parsers, delimiter):
  647. # GH: 35958
  648. f = StringIO("a b c\n1 -2 -3\n4 5 6")
  649. parser = all_parsers
  650. msg = (
  651. "Specified a delimiter with both sep and "
  652. "delim_whitespace=True; you can only specify one."
  653. )
  654. with pytest.raises(ValueError, match=msg):
  655. parser.read_table(f, delim_whitespace=True, sep=delimiter)
  656. with pytest.raises(ValueError, match=msg):
  657. parser.read_table(f, delim_whitespace=True, delimiter=delimiter)
  658. @xfail_pyarrow
  659. def test_dict_keys_as_names(all_parsers):
  660. # GH: 36928
  661. data = "1,2"
  662. keys = {"a": int, "b": int}.keys()
  663. parser = all_parsers
  664. result = parser.read_csv(StringIO(data), names=keys)
  665. expected = DataFrame({"a": [1], "b": [2]})
  666. tm.assert_frame_equal(result, expected)
  667. @xfail_pyarrow
  668. def test_encoding_surrogatepass(all_parsers):
  669. # GH39017
  670. parser = all_parsers
  671. content = b"\xed\xbd\xbf"
  672. decoded = content.decode("utf-8", errors="surrogatepass")
  673. expected = DataFrame({decoded: [decoded]}, index=[decoded * 2])
  674. expected.index.name = decoded * 2
  675. with tm.ensure_clean() as path:
  676. Path(path).write_bytes(
  677. content * 2 + b"," + content + b"\n" + content * 2 + b"," + content
  678. )
  679. df = parser.read_csv(path, encoding_errors="surrogatepass", index_col=0)
  680. tm.assert_frame_equal(df, expected)
  681. with pytest.raises(UnicodeDecodeError, match="'utf-8' codec can't decode byte"):
  682. parser.read_csv(path)
  683. def test_malformed_second_line(all_parsers):
  684. # see GH14782
  685. parser = all_parsers
  686. data = "\na\nb\n"
  687. result = parser.read_csv(StringIO(data), skip_blank_lines=False, header=1)
  688. expected = DataFrame({"a": ["b"]})
  689. tm.assert_frame_equal(result, expected)
  690. @xfail_pyarrow
  691. def test_short_single_line(all_parsers):
  692. # GH 47566
  693. parser = all_parsers
  694. columns = ["a", "b", "c"]
  695. data = "1,2"
  696. result = parser.read_csv(StringIO(data), header=None, names=columns)
  697. expected = DataFrame({"a": [1], "b": [2], "c": [np.nan]})
  698. tm.assert_frame_equal(result, expected)
  699. @xfail_pyarrow
  700. def test_short_multi_line(all_parsers):
  701. # GH 47566
  702. parser = all_parsers
  703. columns = ["a", "b", "c"]
  704. data = "1,2\n1,2"
  705. result = parser.read_csv(StringIO(data), header=None, names=columns)
  706. expected = DataFrame({"a": [1, 1], "b": [2, 2], "c": [np.nan, np.nan]})
  707. tm.assert_frame_equal(result, expected)
  708. def test_read_seek(all_parsers):
  709. # GH48646
  710. parser = all_parsers
  711. prefix = "### DATA\n"
  712. content = "nkey,value\ntables,rectangular\n"
  713. with tm.ensure_clean() as path:
  714. Path(path).write_text(prefix + content)
  715. with open(path, encoding="utf-8") as file:
  716. file.readline()
  717. actual = parser.read_csv(file)
  718. expected = parser.read_csv(StringIO(content))
  719. tm.assert_frame_equal(actual, expected)