# test_skiprows.py (7.7 KB)
"""
Tests that skipped rows are properly handled during
parsing for all of the parsers defined in parsers.py.
"""
from datetime import datetime
from io import StringIO

import numpy as np
import pytest

from pandas.errors import EmptyDataError

from pandas import (
    DataFrame,
    Index,
)
import pandas._testing as tm
# TODO: XFAIL ME PLS -- switch to xfail marks once the hanging-test issues
# have been identified.
# A module-level ``pytestmark`` applies the "pyarrow_skip" fixture to every
# test in this file.
# NOTE(review): presumably that fixture skips the pyarrow engine -- confirm
# against the fixture definition in conftest.
pytestmark = pytest.mark.usefixtures("pyarrow_skip")
  17. @pytest.mark.parametrize("skiprows", [list(range(6)), 6])
  18. def test_skip_rows_bug(all_parsers, skiprows):
  19. # see gh-505
  20. parser = all_parsers
  21. text = """#foo,a,b,c
  22. #foo,a,b,c
  23. #foo,a,b,c
  24. #foo,a,b,c
  25. #foo,a,b,c
  26. #foo,a,b,c
  27. 1/1/2000,1.,2.,3.
  28. 1/2/2000,4,5,6
  29. 1/3/2000,7,8,9
  30. """
  31. result = parser.read_csv(
  32. StringIO(text), skiprows=skiprows, header=None, index_col=0, parse_dates=True
  33. )
  34. index = Index(
  35. [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
  36. )
  37. expected = DataFrame(
  38. np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
  39. )
  40. tm.assert_frame_equal(result, expected)
  41. def test_deep_skip_rows(all_parsers):
  42. # see gh-4382
  43. parser = all_parsers
  44. data = "a,b,c\n" + "\n".join(
  45. [",".join([str(i), str(i + 1), str(i + 2)]) for i in range(10)]
  46. )
  47. condensed_data = "a,b,c\n" + "\n".join(
  48. [",".join([str(i), str(i + 1), str(i + 2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]
  49. )
  50. result = parser.read_csv(StringIO(data), skiprows=[6, 8])
  51. condensed_result = parser.read_csv(StringIO(condensed_data))
  52. tm.assert_frame_equal(result, condensed_result)
  53. def test_skip_rows_blank(all_parsers):
  54. # see gh-9832
  55. parser = all_parsers
  56. text = """#foo,a,b,c
  57. #foo,a,b,c
  58. #foo,a,b,c
  59. #foo,a,b,c
  60. 1/1/2000,1.,2.,3.
  61. 1/2/2000,4,5,6
  62. 1/3/2000,7,8,9
  63. """
  64. data = parser.read_csv(
  65. StringIO(text), skiprows=6, header=None, index_col=0, parse_dates=True
  66. )
  67. index = Index(
  68. [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)], name=0
  69. )
  70. expected = DataFrame(
  71. np.arange(1.0, 10.0).reshape((3, 3)), columns=[1, 2, 3], index=index
  72. )
  73. tm.assert_frame_equal(data, expected)
  74. @pytest.mark.parametrize(
  75. "data,kwargs,expected",
  76. [
  77. (
  78. """id,text,num_lines
  79. 1,"line 11
  80. line 12",2
  81. 2,"line 21
  82. line 22",2
  83. 3,"line 31",1""",
  84. {"skiprows": [1]},
  85. DataFrame(
  86. [[2, "line 21\nline 22", 2], [3, "line 31", 1]],
  87. columns=["id", "text", "num_lines"],
  88. ),
  89. ),
  90. (
  91. "a,b,c\n~a\n b~,~e\n d~,~f\n f~\n1,2,~12\n 13\n 14~",
  92. {"quotechar": "~", "skiprows": [2]},
  93. DataFrame([["a\n b", "e\n d", "f\n f"]], columns=["a", "b", "c"]),
  94. ),
  95. (
  96. (
  97. "Text,url\n~example\n "
  98. "sentence\n one~,url1\n~"
  99. "example\n sentence\n two~,url2\n~"
  100. "example\n sentence\n three~,url3"
  101. ),
  102. {"quotechar": "~", "skiprows": [1, 3]},
  103. DataFrame([["example\n sentence\n two", "url2"]], columns=["Text", "url"]),
  104. ),
  105. ],
  106. )
  107. def test_skip_row_with_newline(all_parsers, data, kwargs, expected):
  108. # see gh-12775 and gh-10911
  109. parser = all_parsers
  110. result = parser.read_csv(StringIO(data), **kwargs)
  111. tm.assert_frame_equal(result, expected)
  112. def test_skip_row_with_quote(all_parsers):
  113. # see gh-12775 and gh-10911
  114. parser = all_parsers
  115. data = """id,text,num_lines
  116. 1,"line '11' line 12",2
  117. 2,"line '21' line 22",2
  118. 3,"line '31' line 32",1"""
  119. exp_data = [[2, "line '21' line 22", 2], [3, "line '31' line 32", 1]]
  120. expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
  121. result = parser.read_csv(StringIO(data), skiprows=[1])
  122. tm.assert_frame_equal(result, expected)
  123. @pytest.mark.parametrize(
  124. "data,exp_data",
  125. [
  126. (
  127. """id,text,num_lines
  128. 1,"line \n'11' line 12",2
  129. 2,"line \n'21' line 22",2
  130. 3,"line \n'31' line 32",1""",
  131. [[2, "line \n'21' line 22", 2], [3, "line \n'31' line 32", 1]],
  132. ),
  133. (
  134. """id,text,num_lines
  135. 1,"line '11\n' line 12",2
  136. 2,"line '21\n' line 22",2
  137. 3,"line '31\n' line 32",1""",
  138. [[2, "line '21\n' line 22", 2], [3, "line '31\n' line 32", 1]],
  139. ),
  140. (
  141. """id,text,num_lines
  142. 1,"line '11\n' \r\tline 12",2
  143. 2,"line '21\n' \r\tline 22",2
  144. 3,"line '31\n' \r\tline 32",1""",
  145. [[2, "line '21\n' \r\tline 22", 2], [3, "line '31\n' \r\tline 32", 1]],
  146. ),
  147. ],
  148. )
  149. def test_skip_row_with_newline_and_quote(all_parsers, data, exp_data):
  150. # see gh-12775 and gh-10911
  151. parser = all_parsers
  152. result = parser.read_csv(StringIO(data), skiprows=[1])
  153. expected = DataFrame(exp_data, columns=["id", "text", "num_lines"])
  154. tm.assert_frame_equal(result, expected)
  155. @pytest.mark.parametrize(
  156. "lineterminator", ["\n", "\r\n", "\r"] # "LF" # "CRLF" # "CR"
  157. )
  158. def test_skiprows_lineterminator(all_parsers, lineterminator, request):
  159. # see gh-9079
  160. parser = all_parsers
  161. data = "\n".join(
  162. [
  163. "SMOSMANIA ThetaProbe-ML2X ",
  164. "2007/01/01 01:00 0.2140 U M ",
  165. "2007/01/01 02:00 0.2141 M O ",
  166. "2007/01/01 04:00 0.2142 D M ",
  167. ]
  168. )
  169. expected = DataFrame(
  170. [
  171. ["2007/01/01", "01:00", 0.2140, "U", "M"],
  172. ["2007/01/01", "02:00", 0.2141, "M", "O"],
  173. ["2007/01/01", "04:00", 0.2142, "D", "M"],
  174. ],
  175. columns=["date", "time", "var", "flag", "oflag"],
  176. )
  177. if parser.engine == "python" and lineterminator == "\r":
  178. mark = pytest.mark.xfail(reason="'CR' not respect with the Python parser yet")
  179. request.node.add_marker(mark)
  180. data = data.replace("\n", lineterminator)
  181. result = parser.read_csv(
  182. StringIO(data),
  183. skiprows=1,
  184. delim_whitespace=True,
  185. names=["date", "time", "var", "flag", "oflag"],
  186. )
  187. tm.assert_frame_equal(result, expected)
  188. def test_skiprows_infield_quote(all_parsers):
  189. # see gh-14459
  190. parser = all_parsers
  191. data = 'a"\nb"\na\n1'
  192. expected = DataFrame({"a": [1]})
  193. result = parser.read_csv(StringIO(data), skiprows=2)
  194. tm.assert_frame_equal(result, expected)
  195. @pytest.mark.parametrize(
  196. "kwargs,expected",
  197. [
  198. ({}, DataFrame({"1": [3, 5]})),
  199. ({"header": 0, "names": ["foo"]}, DataFrame({"foo": [3, 5]})),
  200. ],
  201. )
  202. def test_skip_rows_callable(all_parsers, kwargs, expected):
  203. parser = all_parsers
  204. data = "a\n1\n2\n3\n4\n5"
  205. result = parser.read_csv(StringIO(data), skiprows=lambda x: x % 2 == 0, **kwargs)
  206. tm.assert_frame_equal(result, expected)
  207. def test_skip_rows_callable_not_in(all_parsers):
  208. parser = all_parsers
  209. data = "0,a\n1,b\n2,c\n3,d\n4,e"
  210. expected = DataFrame([[1, "b"], [3, "d"]])
  211. result = parser.read_csv(
  212. StringIO(data), header=None, skiprows=lambda x: x not in [1, 3]
  213. )
  214. tm.assert_frame_equal(result, expected)
  215. def test_skip_rows_skip_all(all_parsers):
  216. parser = all_parsers
  217. data = "a\n1\n2\n3\n4\n5"
  218. msg = "No columns to parse from file"
  219. with pytest.raises(EmptyDataError, match=msg):
  220. parser.read_csv(StringIO(data), skiprows=lambda x: True)
  221. def test_skip_rows_bad_callable(all_parsers):
  222. msg = "by zero"
  223. parser = all_parsers
  224. data = "a\n1\n2\n3\n4\n5"
  225. with pytest.raises(ZeroDivisionError, match=msg):
  226. parser.read_csv(StringIO(data), skiprows=lambda x: 1 / 0)
  227. def test_skip_rows_and_n_rows(all_parsers):
  228. # GH#44021
  229. data = """a,b
  230. 1,a
  231. 2,b
  232. 3,c
  233. 4,d
  234. 5,e
  235. 6,f
  236. 7,g
  237. 8,h
  238. """
  239. parser = all_parsers
  240. result = parser.read_csv(StringIO(data), nrows=5, skiprows=[2, 4, 6])
  241. expected = DataFrame({"a": [1, 3, 5, 7, 8], "b": ["a", "c", "e", "g", "h"]})
  242. tm.assert_frame_equal(result, expected)