test_header.py 18 KB


  1. """
  2. Tests that the file header is properly handled or inferred
  3. during parsing for all of the parsers defined in parsers.py
  4. """
  5. from collections import namedtuple
  6. from io import StringIO
  7. import numpy as np
  8. import pytest
  9. from pandas.errors import ParserError
  10. from pandas import (
  11. DataFrame,
  12. Index,
  13. MultiIndex,
  14. )
  15. import pandas._testing as tm
# Marker that attaches the "pyarrow_skip" fixture to a decorated test.
# NOTE(review): the fixture is defined outside this file; presumably it skips
# the test when the pyarrow engine is in use -- confirm against conftest.
# TODO(1.4): Change me to xfails at release time
skip_pyarrow = pytest.mark.usefixtures("pyarrow_skip")
  18. @skip_pyarrow
  19. def test_read_with_bad_header(all_parsers):
  20. parser = all_parsers
  21. msg = r"but only \d+ lines in file"
  22. with pytest.raises(ValueError, match=msg):
  23. s = StringIO(",,")
  24. parser.read_csv(s, header=[10])
  25. def test_negative_header(all_parsers):
  26. # see gh-27779
  27. parser = all_parsers
  28. data = """1,2,3,4,5
  29. 6,7,8,9,10
  30. 11,12,13,14,15
  31. """
  32. with pytest.raises(
  33. ValueError,
  34. match="Passing negative integer to header is invalid. "
  35. "For no header, use header=None instead",
  36. ):
  37. parser.read_csv(StringIO(data), header=-1)
  38. @pytest.mark.parametrize("header", [([-1, 2, 4]), ([-5, 0])])
  39. def test_negative_multi_index_header(all_parsers, header):
  40. # see gh-27779
  41. parser = all_parsers
  42. data = """1,2,3,4,5
  43. 6,7,8,9,10
  44. 11,12,13,14,15
  45. """
  46. with pytest.raises(
  47. ValueError, match="cannot specify multi-index header with negative integers"
  48. ):
  49. parser.read_csv(StringIO(data), header=header)
  50. @pytest.mark.parametrize("header", [True, False])
  51. def test_bool_header_arg(all_parsers, header):
  52. # see gh-6114
  53. parser = all_parsers
  54. data = """\
  55. MyColumn
  56. a
  57. b
  58. a
  59. b"""
  60. msg = "Passing a bool to header is invalid"
  61. with pytest.raises(TypeError, match=msg):
  62. parser.read_csv(StringIO(data), header=header)
  63. @skip_pyarrow
  64. def test_header_with_index_col(all_parsers):
  65. parser = all_parsers
  66. data = """foo,1,2,3
  67. bar,4,5,6
  68. baz,7,8,9
  69. """
  70. names = ["A", "B", "C"]
  71. result = parser.read_csv(StringIO(data), names=names)
  72. expected = DataFrame(
  73. [[1, 2, 3], [4, 5, 6], [7, 8, 9]],
  74. index=["foo", "bar", "baz"],
  75. columns=["A", "B", "C"],
  76. )
  77. tm.assert_frame_equal(result, expected)
  78. def test_header_not_first_line(all_parsers):
  79. parser = all_parsers
  80. data = """got,to,ignore,this,line
  81. got,to,ignore,this,line
  82. index,A,B,C,D
  83. foo,2,3,4,5
  84. bar,7,8,9,10
  85. baz,12,13,14,15
  86. """
  87. data2 = """index,A,B,C,D
  88. foo,2,3,4,5
  89. bar,7,8,9,10
  90. baz,12,13,14,15
  91. """
  92. result = parser.read_csv(StringIO(data), header=2, index_col=0)
  93. expected = parser.read_csv(StringIO(data2), header=0, index_col=0)
  94. tm.assert_frame_equal(result, expected)
  95. @skip_pyarrow
  96. def test_header_multi_index(all_parsers):
  97. parser = all_parsers
  98. expected = tm.makeCustomDataframe(5, 3, r_idx_nlevels=2, c_idx_nlevels=4)
  99. data = """\
  100. C0,,C_l0_g0,C_l0_g1,C_l0_g2
  101. C1,,C_l1_g0,C_l1_g1,C_l1_g2
  102. C2,,C_l2_g0,C_l2_g1,C_l2_g2
  103. C3,,C_l3_g0,C_l3_g1,C_l3_g2
  104. R0,R1,,,
  105. R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
  106. R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
  107. R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
  108. R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
  109. R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
  110. """
  111. result = parser.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1])
  112. tm.assert_frame_equal(result, expected)
  113. @pytest.mark.parametrize(
  114. "kwargs,msg",
  115. [
  116. (
  117. {"index_col": ["foo", "bar"]},
  118. (
  119. "index_col must only contain "
  120. "row numbers when specifying "
  121. "a multi-index header"
  122. ),
  123. ),
  124. (
  125. {"index_col": [0, 1], "names": ["foo", "bar"]},
  126. ("cannot specify names when specifying a multi-index header"),
  127. ),
  128. (
  129. {"index_col": [0, 1], "usecols": ["foo", "bar"]},
  130. ("cannot specify usecols when specifying a multi-index header"),
  131. ),
  132. ],
  133. )
  134. def test_header_multi_index_invalid(all_parsers, kwargs, msg):
  135. data = """\
  136. C0,,C_l0_g0,C_l0_g1,C_l0_g2
  137. C1,,C_l1_g0,C_l1_g1,C_l1_g2
  138. C2,,C_l2_g0,C_l2_g1,C_l2_g2
  139. C3,,C_l3_g0,C_l3_g1,C_l3_g2
  140. R0,R1,,,
  141. R_l0_g0,R_l1_g0,R0C0,R0C1,R0C2
  142. R_l0_g1,R_l1_g1,R1C0,R1C1,R1C2
  143. R_l0_g2,R_l1_g2,R2C0,R2C1,R2C2
  144. R_l0_g3,R_l1_g3,R3C0,R3C1,R3C2
  145. R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2
  146. """
  147. parser = all_parsers
  148. with pytest.raises(ValueError, match=msg):
  149. parser.read_csv(StringIO(data), header=[0, 1, 2, 3], **kwargs)
  150. _TestTuple = namedtuple("_TestTuple", ["first", "second"])
  151. @skip_pyarrow
  152. @pytest.mark.parametrize(
  153. "kwargs",
  154. [
  155. {"header": [0, 1]},
  156. {
  157. "skiprows": 3,
  158. "names": [
  159. ("a", "q"),
  160. ("a", "r"),
  161. ("a", "s"),
  162. ("b", "t"),
  163. ("c", "u"),
  164. ("c", "v"),
  165. ],
  166. },
  167. {
  168. "skiprows": 3,
  169. "names": [
  170. _TestTuple("a", "q"),
  171. _TestTuple("a", "r"),
  172. _TestTuple("a", "s"),
  173. _TestTuple("b", "t"),
  174. _TestTuple("c", "u"),
  175. _TestTuple("c", "v"),
  176. ],
  177. },
  178. ],
  179. )
  180. def test_header_multi_index_common_format1(all_parsers, kwargs):
  181. parser = all_parsers
  182. expected = DataFrame(
  183. [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
  184. index=["one", "two"],
  185. columns=MultiIndex.from_tuples(
  186. [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
  187. ),
  188. )
  189. data = """,a,a,a,b,c,c
  190. ,q,r,s,t,u,v
  191. ,,,,,,
  192. one,1,2,3,4,5,6
  193. two,7,8,9,10,11,12"""
  194. result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
  195. tm.assert_frame_equal(result, expected)
  196. @skip_pyarrow
  197. @pytest.mark.parametrize(
  198. "kwargs",
  199. [
  200. {"header": [0, 1]},
  201. {
  202. "skiprows": 2,
  203. "names": [
  204. ("a", "q"),
  205. ("a", "r"),
  206. ("a", "s"),
  207. ("b", "t"),
  208. ("c", "u"),
  209. ("c", "v"),
  210. ],
  211. },
  212. {
  213. "skiprows": 2,
  214. "names": [
  215. _TestTuple("a", "q"),
  216. _TestTuple("a", "r"),
  217. _TestTuple("a", "s"),
  218. _TestTuple("b", "t"),
  219. _TestTuple("c", "u"),
  220. _TestTuple("c", "v"),
  221. ],
  222. },
  223. ],
  224. )
  225. def test_header_multi_index_common_format2(all_parsers, kwargs):
  226. parser = all_parsers
  227. expected = DataFrame(
  228. [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
  229. index=["one", "two"],
  230. columns=MultiIndex.from_tuples(
  231. [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
  232. ),
  233. )
  234. data = """,a,a,a,b,c,c
  235. ,q,r,s,t,u,v
  236. one,1,2,3,4,5,6
  237. two,7,8,9,10,11,12"""
  238. result = parser.read_csv(StringIO(data), index_col=0, **kwargs)
  239. tm.assert_frame_equal(result, expected)
  240. @skip_pyarrow
  241. @pytest.mark.parametrize(
  242. "kwargs",
  243. [
  244. {"header": [0, 1]},
  245. {
  246. "skiprows": 2,
  247. "names": [
  248. ("a", "q"),
  249. ("a", "r"),
  250. ("a", "s"),
  251. ("b", "t"),
  252. ("c", "u"),
  253. ("c", "v"),
  254. ],
  255. },
  256. {
  257. "skiprows": 2,
  258. "names": [
  259. _TestTuple("a", "q"),
  260. _TestTuple("a", "r"),
  261. _TestTuple("a", "s"),
  262. _TestTuple("b", "t"),
  263. _TestTuple("c", "u"),
  264. _TestTuple("c", "v"),
  265. ],
  266. },
  267. ],
  268. )
  269. def test_header_multi_index_common_format3(all_parsers, kwargs):
  270. parser = all_parsers
  271. expected = DataFrame(
  272. [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
  273. index=["one", "two"],
  274. columns=MultiIndex.from_tuples(
  275. [("a", "q"), ("a", "r"), ("a", "s"), ("b", "t"), ("c", "u"), ("c", "v")]
  276. ),
  277. )
  278. expected = expected.reset_index(drop=True)
  279. data = """a,a,a,b,c,c
  280. q,r,s,t,u,v
  281. 1,2,3,4,5,6
  282. 7,8,9,10,11,12"""
  283. result = parser.read_csv(StringIO(data), index_col=None, **kwargs)
  284. tm.assert_frame_equal(result, expected)
  285. @skip_pyarrow
  286. def test_header_multi_index_common_format_malformed1(all_parsers):
  287. parser = all_parsers
  288. expected = DataFrame(
  289. np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
  290. index=Index([1, 7]),
  291. columns=MultiIndex(
  292. levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
  293. codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
  294. names=["a", "q"],
  295. ),
  296. )
  297. data = """a,a,a,b,c,c
  298. q,r,s,t,u,v
  299. 1,2,3,4,5,6
  300. 7,8,9,10,11,12"""
  301. result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
  302. tm.assert_frame_equal(expected, result)
  303. @skip_pyarrow
  304. def test_header_multi_index_common_format_malformed2(all_parsers):
  305. parser = all_parsers
  306. expected = DataFrame(
  307. np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype="int64"),
  308. index=Index([1, 7]),
  309. columns=MultiIndex(
  310. levels=[["a", "b", "c"], ["r", "s", "t", "u", "v"]],
  311. codes=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]],
  312. names=[None, "q"],
  313. ),
  314. )
  315. data = """,a,a,b,c,c
  316. q,r,s,t,u,v
  317. 1,2,3,4,5,6
  318. 7,8,9,10,11,12"""
  319. result = parser.read_csv(StringIO(data), header=[0, 1], index_col=0)
  320. tm.assert_frame_equal(expected, result)
  321. @skip_pyarrow
  322. def test_header_multi_index_common_format_malformed3(all_parsers):
  323. parser = all_parsers
  324. expected = DataFrame(
  325. np.array([[3, 4, 5, 6], [9, 10, 11, 12]], dtype="int64"),
  326. index=MultiIndex(levels=[[1, 7], [2, 8]], codes=[[0, 1], [0, 1]]),
  327. columns=MultiIndex(
  328. levels=[["a", "b", "c"], ["s", "t", "u", "v"]],
  329. codes=[[0, 1, 2, 2], [0, 1, 2, 3]],
  330. names=[None, "q"],
  331. ),
  332. )
  333. data = """,a,a,b,c,c
  334. q,r,s,t,u,v
  335. 1,2,3,4,5,6
  336. 7,8,9,10,11,12"""
  337. result = parser.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1])
  338. tm.assert_frame_equal(expected, result)
  339. @skip_pyarrow
  340. def test_header_multi_index_blank_line(all_parsers):
  341. # GH 40442
  342. parser = all_parsers
  343. data = [[None, None], [1, 2], [3, 4]]
  344. columns = MultiIndex.from_tuples([("a", "A"), ("b", "B")])
  345. expected = DataFrame(data, columns=columns)
  346. data = "a,b\nA,B\n,\n1,2\n3,4"
  347. result = parser.read_csv(StringIO(data), header=[0, 1])
  348. tm.assert_frame_equal(expected, result)
  349. @skip_pyarrow
  350. @pytest.mark.parametrize(
  351. "data,header", [("1,2,3\n4,5,6", None), ("foo,bar,baz\n1,2,3\n4,5,6", 0)]
  352. )
  353. def test_header_names_backward_compat(all_parsers, data, header):
  354. # see gh-2539
  355. parser = all_parsers
  356. expected = parser.read_csv(StringIO("1,2,3\n4,5,6"), names=["a", "b", "c"])
  357. result = parser.read_csv(StringIO(data), names=["a", "b", "c"], header=header)
  358. tm.assert_frame_equal(result, expected)
  359. @skip_pyarrow
  360. @pytest.mark.parametrize("kwargs", [{}, {"index_col": False}])
  361. def test_read_only_header_no_rows(all_parsers, kwargs):
  362. # See gh-7773
  363. parser = all_parsers
  364. expected = DataFrame(columns=["a", "b", "c"])
  365. result = parser.read_csv(StringIO("a,b,c"), **kwargs)
  366. tm.assert_frame_equal(result, expected)
  367. @pytest.mark.parametrize(
  368. "kwargs,names",
  369. [
  370. ({}, [0, 1, 2, 3, 4]),
  371. (
  372. {"names": ["foo", "bar", "baz", "quux", "panda"]},
  373. ["foo", "bar", "baz", "quux", "panda"],
  374. ),
  375. ],
  376. )
  377. def test_no_header(all_parsers, kwargs, names):
  378. parser = all_parsers
  379. data = """1,2,3,4,5
  380. 6,7,8,9,10
  381. 11,12,13,14,15
  382. """
  383. expected = DataFrame(
  384. [[1, 2, 3, 4, 5], [6, 7, 8, 9, 10], [11, 12, 13, 14, 15]], columns=names
  385. )
  386. result = parser.read_csv(StringIO(data), header=None, **kwargs)
  387. tm.assert_frame_equal(result, expected)
  388. @pytest.mark.parametrize("header", [["a", "b"], "string_header"])
  389. def test_non_int_header(all_parsers, header):
  390. # see gh-16338
  391. msg = "header must be integer or list of integers"
  392. data = """1,2\n3,4"""
  393. parser = all_parsers
  394. with pytest.raises(ValueError, match=msg):
  395. parser.read_csv(StringIO(data), header=header)
  396. @skip_pyarrow
  397. def test_singleton_header(all_parsers):
  398. # see gh-7757
  399. data = """a,b,c\n0,1,2\n1,2,3"""
  400. parser = all_parsers
  401. expected = DataFrame({"a": [0, 1], "b": [1, 2], "c": [2, 3]})
  402. result = parser.read_csv(StringIO(data), header=[0])
  403. tm.assert_frame_equal(result, expected)
  404. @skip_pyarrow
  405. @pytest.mark.parametrize(
  406. "data,expected",
  407. [
  408. (
  409. "A,A,A,B\none,one,one,two\n0,40,34,0.1",
  410. DataFrame(
  411. [[0, 40, 34, 0.1]],
  412. columns=MultiIndex.from_tuples(
  413. [("A", "one"), ("A", "one.1"), ("A", "one.2"), ("B", "two")]
  414. ),
  415. ),
  416. ),
  417. (
  418. "A,A,A,B\none,one,one.1,two\n0,40,34,0.1",
  419. DataFrame(
  420. [[0, 40, 34, 0.1]],
  421. columns=MultiIndex.from_tuples(
  422. [("A", "one"), ("A", "one.1"), ("A", "one.1.1"), ("B", "two")]
  423. ),
  424. ),
  425. ),
  426. (
  427. "A,A,A,B,B\none,one,one.1,two,two\n0,40,34,0.1,0.1",
  428. DataFrame(
  429. [[0, 40, 34, 0.1, 0.1]],
  430. columns=MultiIndex.from_tuples(
  431. [
  432. ("A", "one"),
  433. ("A", "one.1"),
  434. ("A", "one.1.1"),
  435. ("B", "two"),
  436. ("B", "two.1"),
  437. ]
  438. ),
  439. ),
  440. ),
  441. ],
  442. )
  443. def test_mangles_multi_index(all_parsers, data, expected):
  444. # see gh-18062
  445. parser = all_parsers
  446. result = parser.read_csv(StringIO(data), header=[0, 1])
  447. tm.assert_frame_equal(result, expected)
  448. @skip_pyarrow
  449. @pytest.mark.parametrize("index_col", [None, [0]])
  450. @pytest.mark.parametrize(
  451. "columns", [None, (["", "Unnamed"]), (["Unnamed", ""]), (["Unnamed", "NotUnnamed"])]
  452. )
  453. def test_multi_index_unnamed(all_parsers, index_col, columns):
  454. # see gh-23687
  455. #
  456. # When specifying a multi-index header, make sure that
  457. # we don't error just because one of the rows in our header
  458. # has ALL column names containing the string "Unnamed". The
  459. # correct condition to check is whether the row contains
  460. # ALL columns that did not have names (and instead were given
  461. # placeholder ones).
  462. parser = all_parsers
  463. header = [0, 1]
  464. if index_col is None:
  465. data = ",".join(columns or ["", ""]) + "\n0,1\n2,3\n4,5\n"
  466. else:
  467. data = ",".join([""] + (columns or ["", ""])) + "\n,0,1\n0,2,3\n1,4,5\n"
  468. result = parser.read_csv(StringIO(data), header=header, index_col=index_col)
  469. exp_columns = []
  470. if columns is None:
  471. columns = ["", "", ""]
  472. for i, col in enumerate(columns):
  473. if not col: # Unnamed.
  474. col = f"Unnamed: {i if index_col is None else i + 1}_level_0"
  475. exp_columns.append(col)
  476. columns = MultiIndex.from_tuples(zip(exp_columns, ["0", "1"]))
  477. expected = DataFrame([[2, 3], [4, 5]], columns=columns)
  478. tm.assert_frame_equal(result, expected)
  479. @skip_pyarrow
  480. def test_names_longer_than_header_but_equal_with_data_rows(all_parsers):
  481. # GH#38453
  482. parser = all_parsers
  483. data = """a, b
  484. 1,2,3
  485. 5,6,4
  486. """
  487. result = parser.read_csv(StringIO(data), header=0, names=["A", "B", "C"])
  488. expected = DataFrame({"A": [1, 5], "B": [2, 6], "C": [3, 4]})
  489. tm.assert_frame_equal(result, expected)
  490. @skip_pyarrow
  491. def test_read_csv_multiindex_columns(all_parsers):
  492. # GH#6051
  493. parser = all_parsers
  494. s1 = "Male, Male, Male, Female, Female\nR, R, L, R, R\n.86, .67, .88, .78, .81"
  495. s2 = (
  496. "Male, Male, Male, Female, Female\n"
  497. "R, R, L, R, R\n"
  498. ".86, .67, .88, .78, .81\n"
  499. ".86, .67, .88, .78, .82"
  500. )
  501. mi = MultiIndex.from_tuples(
  502. [
  503. ("Male", "R"),
  504. (" Male", " R"),
  505. (" Male", " L"),
  506. (" Female", " R"),
  507. (" Female", " R.1"),
  508. ]
  509. )
  510. expected = DataFrame(
  511. [[0.86, 0.67, 0.88, 0.78, 0.81], [0.86, 0.67, 0.88, 0.78, 0.82]], columns=mi
  512. )
  513. df1 = parser.read_csv(StringIO(s1), header=[0, 1])
  514. tm.assert_frame_equal(df1, expected.iloc[:1])
  515. df2 = parser.read_csv(StringIO(s2), header=[0, 1])
  516. tm.assert_frame_equal(df2, expected)
  517. @skip_pyarrow
  518. def test_read_csv_multi_header_length_check(all_parsers):
  519. # GH#43102
  520. parser = all_parsers
  521. case = """row11,row12,row13
  522. row21,row22, row23
  523. row31,row32
  524. """
  525. with pytest.raises(
  526. ParserError, match="Header rows must have an equal number of columns."
  527. ):
  528. parser.read_csv(StringIO(case), header=[0, 2])
  529. @skip_pyarrow
  530. def test_header_none_and_implicit_index(all_parsers):
  531. # GH#22144
  532. parser = all_parsers
  533. data = "x,1,5\ny,2\nz,3\n"
  534. result = parser.read_csv(StringIO(data), names=["a", "b"], header=None)
  535. expected = DataFrame(
  536. {"a": [1, 2, 3], "b": [5, np.nan, np.nan]}, index=["x", "y", "z"]
  537. )
  538. tm.assert_frame_equal(result, expected)
  539. @skip_pyarrow
  540. def test_header_none_and_implicit_index_in_second_row(all_parsers):
  541. # GH#22144
  542. parser = all_parsers
  543. data = "x,1\ny,2,5\nz,3\n"
  544. with pytest.raises(ParserError, match="Expected 2 fields in line 2, saw 3"):
  545. parser.read_csv(StringIO(data), names=["a", "b"], header=None)
  546. @skip_pyarrow
  547. def test_header_none_and_on_bad_lines_skip(all_parsers):
  548. # GH#22144
  549. parser = all_parsers
  550. data = "x,1\ny,2,5\nz,3\n"
  551. result = parser.read_csv(
  552. StringIO(data), names=["a", "b"], header=None, on_bad_lines="skip"
  553. )
  554. expected = DataFrame({"a": ["x", "z"], "b": [1, 3]})
  555. tm.assert_frame_equal(result, expected)
  556. @skip_pyarrow
  557. def test_header_missing_rows(all_parsers):
  558. # GH#47400
  559. parser = all_parsers
  560. data = """a,b
  561. 1,2
  562. """
  563. msg = r"Passed header=\[0,1,2\], len of 3, but only 2 lines in file"
  564. with pytest.raises(ValueError, match=msg):
  565. parser.read_csv(StringIO(data), header=[0, 1, 2])