# test_textreader.py 10 KB


  1. """
  2. Tests the TextReader class in parsers.pyx, which
  3. is integral to the C engine in parsers.py
  4. """
  5. from io import (
  6. BytesIO,
  7. StringIO,
  8. )
  9. import numpy as np
  10. import pytest
  11. import pandas._libs.parsers as parser
  12. from pandas._libs.parsers import TextReader
  13. from pandas import DataFrame
  14. import pandas._testing as tm
  15. from pandas.io.parsers import (
  16. TextFileReader,
  17. read_csv,
  18. )
  19. from pandas.io.parsers.c_parser_wrapper import ensure_dtype_objs
  20. class TestTextReader:
  21. @pytest.fixture
  22. def csv_path(self, datapath):
  23. return datapath("io", "data", "csv", "test1.csv")
  24. def test_file_handle(self, csv_path):
  25. with open(csv_path, "rb") as f:
  26. reader = TextReader(f)
  27. reader.read()
  28. def test_file_handle_mmap(self, csv_path):
  29. # this was never using memory_map=True
  30. with open(csv_path, "rb") as f:
  31. reader = TextReader(f, header=None)
  32. reader.read()
  33. def test_StringIO(self, csv_path):
  34. with open(csv_path, "rb") as f:
  35. text = f.read()
  36. src = BytesIO(text)
  37. reader = TextReader(src, header=None)
  38. reader.read()
  39. def test_string_factorize(self):
  40. # should this be optional?
  41. data = "a\nb\na\nb\na"
  42. reader = TextReader(StringIO(data), header=None)
  43. result = reader.read()
  44. assert len(set(map(id, result[0]))) == 2
  45. def test_skipinitialspace(self):
  46. data = "a, b\na, b\na, b\na, b"
  47. reader = TextReader(StringIO(data), skipinitialspace=True, header=None)
  48. result = reader.read()
  49. tm.assert_numpy_array_equal(
  50. result[0], np.array(["a", "a", "a", "a"], dtype=np.object_)
  51. )
  52. tm.assert_numpy_array_equal(
  53. result[1], np.array(["b", "b", "b", "b"], dtype=np.object_)
  54. )
  55. def test_parse_booleans(self):
  56. data = "True\nFalse\nTrue\nTrue"
  57. reader = TextReader(StringIO(data), header=None)
  58. result = reader.read()
  59. assert result[0].dtype == np.bool_
  60. def test_delimit_whitespace(self):
  61. data = 'a b\na\t\t "b"\n"a"\t \t b'
  62. reader = TextReader(StringIO(data), delim_whitespace=True, header=None)
  63. result = reader.read()
  64. tm.assert_numpy_array_equal(
  65. result[0], np.array(["a", "a", "a"], dtype=np.object_)
  66. )
  67. tm.assert_numpy_array_equal(
  68. result[1], np.array(["b", "b", "b"], dtype=np.object_)
  69. )
  70. def test_embedded_newline(self):
  71. data = 'a\n"hello\nthere"\nthis'
  72. reader = TextReader(StringIO(data), header=None)
  73. result = reader.read()
  74. expected = np.array(["a", "hello\nthere", "this"], dtype=np.object_)
  75. tm.assert_numpy_array_equal(result[0], expected)
  76. def test_euro_decimal(self):
  77. data = "12345,67\n345,678"
  78. reader = TextReader(StringIO(data), delimiter=":", decimal=",", header=None)
  79. result = reader.read()
  80. expected = np.array([12345.67, 345.678])
  81. tm.assert_almost_equal(result[0], expected)
  82. def test_integer_thousands(self):
  83. data = "123,456\n12,500"
  84. reader = TextReader(StringIO(data), delimiter=":", thousands=",", header=None)
  85. result = reader.read()
  86. expected = np.array([123456, 12500], dtype=np.int64)
  87. tm.assert_almost_equal(result[0], expected)
  88. def test_integer_thousands_alt(self):
  89. data = "123.456\n12.500"
  90. reader = TextFileReader(
  91. StringIO(data), delimiter=":", thousands=".", header=None
  92. )
  93. result = reader.read()
  94. expected = DataFrame([123456, 12500])
  95. tm.assert_frame_equal(result, expected)
  96. def test_skip_bad_lines(self, capsys):
  97. # too many lines, see #2430 for why
  98. data = "a:b:c\nd:e:f\ng:h:i\nj:k:l:m\nl:m:n\no:p:q:r"
  99. reader = TextReader(StringIO(data), delimiter=":", header=None)
  100. msg = r"Error tokenizing data\. C error: Expected 3 fields in line 4, saw 4"
  101. with pytest.raises(parser.ParserError, match=msg):
  102. reader.read()
  103. reader = TextReader(
  104. StringIO(data), delimiter=":", header=None, on_bad_lines=2 # Skip
  105. )
  106. result = reader.read()
  107. expected = {
  108. 0: np.array(["a", "d", "g", "l"], dtype=object),
  109. 1: np.array(["b", "e", "h", "m"], dtype=object),
  110. 2: np.array(["c", "f", "i", "n"], dtype=object),
  111. }
  112. assert_array_dicts_equal(result, expected)
  113. reader = TextReader(
  114. StringIO(data), delimiter=":", header=None, on_bad_lines=1 # Warn
  115. )
  116. reader.read()
  117. captured = capsys.readouterr()
  118. assert "Skipping line 4" in captured.err
  119. assert "Skipping line 6" in captured.err
  120. def test_header_not_enough_lines(self):
  121. data = "skip this\nskip this\na,b,c\n1,2,3\n4,5,6"
  122. reader = TextReader(StringIO(data), delimiter=",", header=2)
  123. header = reader.header
  124. expected = [["a", "b", "c"]]
  125. assert header == expected
  126. recs = reader.read()
  127. expected = {
  128. 0: np.array([1, 4], dtype=np.int64),
  129. 1: np.array([2, 5], dtype=np.int64),
  130. 2: np.array([3, 6], dtype=np.int64),
  131. }
  132. assert_array_dicts_equal(recs, expected)
  133. def test_escapechar(self):
  134. data = '\\"hello world"\n\\"hello world"\n\\"hello world"'
  135. reader = TextReader(StringIO(data), delimiter=",", header=None, escapechar="\\")
  136. result = reader.read()
  137. expected = {0: np.array(['"hello world"'] * 3, dtype=object)}
  138. assert_array_dicts_equal(result, expected)
  139. def test_eof_has_eol(self):
  140. # handling of new line at EOF
  141. pass
  142. def test_na_substitution(self):
  143. pass
  144. def test_numpy_string_dtype(self):
  145. data = """\
  146. a,1
  147. aa,2
  148. aaa,3
  149. aaaa,4
  150. aaaaa,5"""
  151. def _make_reader(**kwds):
  152. if "dtype" in kwds:
  153. kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
  154. return TextReader(StringIO(data), delimiter=",", header=None, **kwds)
  155. reader = _make_reader(dtype="S5,i4")
  156. result = reader.read()
  157. assert result[0].dtype == "S5"
  158. ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaaa"], dtype="S5")
  159. assert (result[0] == ex_values).all()
  160. assert result[1].dtype == "i4"
  161. reader = _make_reader(dtype="S4")
  162. result = reader.read()
  163. assert result[0].dtype == "S4"
  164. ex_values = np.array(["a", "aa", "aaa", "aaaa", "aaaa"], dtype="S4")
  165. assert (result[0] == ex_values).all()
  166. assert result[1].dtype == "S4"
  167. def test_pass_dtype(self):
  168. data = """\
  169. one,two
  170. 1,a
  171. 2,b
  172. 3,c
  173. 4,d"""
  174. def _make_reader(**kwds):
  175. if "dtype" in kwds:
  176. kwds["dtype"] = ensure_dtype_objs(kwds["dtype"])
  177. return TextReader(StringIO(data), delimiter=",", **kwds)
  178. reader = _make_reader(dtype={"one": "u1", 1: "S1"})
  179. result = reader.read()
  180. assert result[0].dtype == "u1"
  181. assert result[1].dtype == "S1"
  182. reader = _make_reader(dtype={"one": np.uint8, 1: object})
  183. result = reader.read()
  184. assert result[0].dtype == "u1"
  185. assert result[1].dtype == "O"
  186. reader = _make_reader(dtype={"one": np.dtype("u1"), 1: np.dtype("O")})
  187. result = reader.read()
  188. assert result[0].dtype == "u1"
  189. assert result[1].dtype == "O"
  190. def test_usecols(self):
  191. data = """\
  192. a,b,c
  193. 1,2,3
  194. 4,5,6
  195. 7,8,9
  196. 10,11,12"""
  197. def _make_reader(**kwds):
  198. return TextReader(StringIO(data), delimiter=",", **kwds)
  199. reader = _make_reader(usecols=(1, 2))
  200. result = reader.read()
  201. exp = _make_reader().read()
  202. assert len(result) == 2
  203. assert (result[1] == exp[1]).all()
  204. assert (result[2] == exp[2]).all()
  205. @pytest.mark.parametrize(
  206. "text, kwargs",
  207. [
  208. ("a,b,c\r1,2,3\r4,5,6\r7,8,9\r10,11,12", {"delimiter": ","}),
  209. (
  210. "a b c\r1 2 3\r4 5 6\r7 8 9\r10 11 12",
  211. {"delim_whitespace": True},
  212. ),
  213. ("a,b,c\r1,2,3\r4,5,6\r,88,9\r10,11,12", {"delimiter": ","}),
  214. (
  215. (
  216. "A,B,C,D,E,F,G,H,I,J,K,L,M,N,O\r"
  217. "AAAAA,BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0\r"
  218. ",BBBBB,0,0,0,0,0,0,0,0,0,0,0,0,0"
  219. ),
  220. {"delimiter": ","},
  221. ),
  222. ("A B C\r 2 3\r4 5 6", {"delim_whitespace": True}),
  223. ("A B C\r2 3\r4 5 6", {"delim_whitespace": True}),
  224. ],
  225. )
  226. def test_cr_delimited(self, text, kwargs):
  227. nice_text = text.replace("\r", "\r\n")
  228. result = TextReader(StringIO(text), **kwargs).read()
  229. expected = TextReader(StringIO(nice_text), **kwargs).read()
  230. assert_array_dicts_equal(result, expected)
  231. def test_empty_field_eof(self):
  232. data = "a,b,c\n1,2,3\n4,,"
  233. result = TextReader(StringIO(data), delimiter=",").read()
  234. expected = {
  235. 0: np.array([1, 4], dtype=np.int64),
  236. 1: np.array(["2", ""], dtype=object),
  237. 2: np.array(["3", ""], dtype=object),
  238. }
  239. assert_array_dicts_equal(result, expected)
  240. # GH5664
  241. a = DataFrame([["b"], [np.nan]], columns=["a"], index=["a", "c"])
  242. b = DataFrame([[1, 1, 1, 0], [1, 1, 1, 0]], columns=list("abcd"), index=[1, 1])
  243. c = DataFrame(
  244. [
  245. [1, 2, 3, 4],
  246. [6, np.nan, np.nan, np.nan],
  247. [8, 9, 10, 11],
  248. [13, 14, np.nan, np.nan],
  249. ],
  250. columns=list("abcd"),
  251. index=[0, 5, 7, 12],
  252. )
  253. for _ in range(100):
  254. df = read_csv(StringIO("a,b\nc\n"), skiprows=0, names=["a"], engine="c")
  255. tm.assert_frame_equal(df, a)
  256. df = read_csv(
  257. StringIO("1,1,1,1,0\n" * 2 + "\n" * 2), names=list("abcd"), engine="c"
  258. )
  259. tm.assert_frame_equal(df, b)
  260. df = read_csv(
  261. StringIO("0,1,2,3,4\n5,6\n7,8,9,10,11\n12,13,14"),
  262. names=list("abcd"),
  263. engine="c",
  264. )
  265. tm.assert_frame_equal(df, c)
  266. def test_empty_csv_input(self):
  267. # GH14867
  268. with read_csv(
  269. StringIO(), chunksize=20, header=None, names=["a", "b", "c"]
  270. ) as df:
  271. assert isinstance(df, TextFileReader)
  272. def assert_array_dicts_equal(left, right):
  273. for k, v in left.items():
  274. tm.assert_numpy_array_equal(np.asarray(v), np.asarray(right[k]))