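"""Tests for line-delimited JSON (``lines=True``) in ``read_json``/``to_json``.

Line-delimited (JSON Lines) data stores one ``orient="records"`` object per
line, e.g.::

    {"a":1,"b":2}
    {"a":3,"b":4}

The tests below cover round-trips, chunked reading (``chunksize``), ``nrows``,
the ujson and pyarrow engines, and append mode (``mode="a"``). The ``engine``
and ``datapath`` fixtures come from the pandas test-suite conftest; a minimal
``engine`` stand-in is sketched after the ``lines_json_df`` fixture below.
"""
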
from io import StringIO
from pathlib import Path
from typing import Iterator

import pytest

import pandas as pd
from pandas import (
    DataFrame,
    read_json,
)
import pandas._testing as tm

from pandas.io.json._json import JsonReader


@pytest.fixture
def lines_json_df():
    df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    return df.to_json(lines=True, orient="records")
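

# The tests below request an ``engine`` fixture that is not defined in this
# file; in the pandas test suite it comes from the io/json conftest. A minimal
# stand-in under that assumption: parametrize over the two supported engines
# and skip the pyarrow runs when the optional dependency is missing.
@pytest.fixture(params=["ujson", "pyarrow"])
def engine(request):
    if request.param == "pyarrow":
        # Skip rather than fail when pyarrow is not installed.
        pytest.importorskip("pyarrow.json")
    return request.param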


def test_read_jsonl():
    # GH9180
    result = read_json('{"a": 1, "b": 2}\n{"b":2, "a" :1}\n', lines=True)
    expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)


def test_read_jsonl_engine_pyarrow(datapath, engine):
    result = read_json(
        datapath("io", "json", "data", "line_delimited.json"),
        lines=True,
        engine=engine,
    )
    expected = DataFrame({"a": [1, 3, 5], "b": [2, 4, 6]})
    tm.assert_frame_equal(result, expected)


def test_read_datetime(request, engine):
    # GH33787
    if engine == "pyarrow":
        # GH 48893
        reason = "Pyarrow only supports a file path as an input and line delimited json"
        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))

    df = DataFrame(
        [([1, 2], ["2020-03-05", "2020-04-08T09:58:49+00:00"], "hector")],
        columns=["accounts", "date", "name"],
    )
    json_line = df.to_json(lines=True, orient="records")
    result = read_json(json_line, engine=engine)
    expected = DataFrame(
        [[1, "2020-03-05", "hector"], [2, "2020-04-08T09:58:49+00:00", "hector"]],
        columns=["accounts", "date", "name"],
    )
    tm.assert_frame_equal(result, expected)


def test_read_jsonl_unicode_chars():
    # GH15132: non-ascii unicode characters
    # \u201d == RIGHT DOUBLE QUOTATION MARK

    # simulate file handle
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    json = StringIO(json)
    result = read_json(json, lines=True)
    expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)

    # simulate string
    json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
    result = read_json(json, lines=True)
    expected = DataFrame([["foo\u201d", "bar"], ["foo", "bar"]], columns=["a", "b"])
    tm.assert_frame_equal(result, expected)


def test_to_jsonl():
    # GH9180
    df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":1,"b":2}\n{"a":1,"b":2}\n'
    assert result == expected

    df = DataFrame([["foo}", "bar"], ['foo"', "bar"]], columns=["a", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a":"foo}","b":"bar"}\n{"a":"foo\\"","b":"bar"}\n'
    assert result == expected
    tm.assert_frame_equal(read_json(result, lines=True), df)

    # GH15096: escaped characters in columns and data
    df = DataFrame([["foo\\", "bar"], ['foo"', "bar"]], columns=["a\\", "b"])
    result = df.to_json(orient="records", lines=True)
    expected = '{"a\\\\":"foo\\\\","b":"bar"}\n{"a\\\\":"foo\\"","b":"bar"}\n'
    assert result == expected
    tm.assert_frame_equal(read_json(result, lines=True), df)


def test_to_jsonl_count_new_lines():
    # GH36888
    df = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
    actual_new_lines_count = df.to_json(orient="records", lines=True).count("\n")
    expected_new_lines_count = 2
    assert actual_new_lines_count == expected_new_lines_count


@pytest.mark.parametrize("chunksize", [1, 1.0])
def test_readjson_chunks(request, lines_json_df, chunksize, engine):
    # Basic test that read_json with chunksize gives the same result as
    # read_json without chunksize
    # GH17048: memory usage when lines=True
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json "
            "and doesn't support chunksize parameter."
        )
        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))

    unchunked = read_json(StringIO(lines_json_df), lines=True)
    with read_json(
        StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
    ) as reader:
        chunked = pd.concat(reader)

    tm.assert_frame_equal(chunked, unchunked)


def test_readjson_chunksize_requires_lines(lines_json_df, engine):
    msg = "chunksize can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        with read_json(
            StringIO(lines_json_df), lines=False, chunksize=2, engine=engine
        ) as _:
            pass


def test_readjson_chunks_series(request, engine):
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json "
            "and doesn't support chunksize parameter."
        )
        request.node.add_marker(pytest.mark.xfail(reason=reason))

    # Test reading line-format JSON to Series with chunksize param
    s = pd.Series({"A": 1, "B": 2})
    strio = StringIO(s.to_json(lines=True, orient="records"))
    unchunked = read_json(strio, lines=True, typ="Series", engine=engine)

    strio = StringIO(s.to_json(lines=True, orient="records"))
    with read_json(
        strio, lines=True, typ="Series", chunksize=1, engine=engine
    ) as reader:
        chunked = pd.concat(reader)

    tm.assert_series_equal(chunked, unchunked)


def test_readjson_each_chunk(request, lines_json_df, engine):
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json "
            "and doesn't support chunksize parameter."
        )
        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))

    # Other tests check that the final result of read_json with chunksize
    # is correct. This checks the intermediate chunks.
    with read_json(
        StringIO(lines_json_df), lines=True, chunksize=2, engine=engine
    ) as reader:
        chunks = list(reader)

    assert chunks[0].shape == (2, 2)
    assert chunks[1].shape == (1, 2)


def test_readjson_chunks_from_file(request, engine):
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json "
            "and doesn't support chunksize parameter."
        )
        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))

    with tm.ensure_clean("test.json") as path:
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        with read_json(path, lines=True, chunksize=1, engine=engine) as reader:
            chunked = pd.concat(reader)
        unchunked = read_json(path, lines=True, engine=engine)
        tm.assert_frame_equal(unchunked, chunked)


@pytest.mark.parametrize("chunksize", [None, 1])
def test_readjson_chunks_closes(chunksize):
    with tm.ensure_clean("test.json") as path:
        df = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        df.to_json(path, lines=True, orient="records")
        # Construct JsonReader directly so the file handle can be inspected
        # even when chunksize=None (read_json would return a DataFrame then).
        reader = JsonReader(
            path,
            orient=None,
            typ="frame",
            dtype=True,
            convert_axes=True,
            convert_dates=True,
            keep_default_dates=True,
            precise_float=False,
            date_unit=None,
            encoding=None,
            lines=True,
            chunksize=chunksize,
            compression=None,
            nrows=None,
        )
        with reader:
            reader.read()
        assert (
            reader.handles.handle.closed
        ), f"didn't close stream with chunksize = {chunksize}"


@pytest.mark.parametrize("chunksize", [0, -1, 2.2, "foo"])
def test_readjson_invalid_chunksize(lines_json_df, chunksize, engine):
    msg = r"'chunksize' must be an integer >=1"
    with pytest.raises(ValueError, match=msg):
        with read_json(
            StringIO(lines_json_df), lines=True, chunksize=chunksize, engine=engine
        ) as _:
            pass


@pytest.mark.parametrize("chunksize", [None, 1, 2])
def test_readjson_chunks_multiple_empty_lines(chunksize):
    # Blank lines are interleaved between the records; parsing should skip
    # them (that is what this test exercises).
    j = """

{"A":1,"B":4}


{"A":2,"B":5}


{"A":3,"B":6}

"""
    orig = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    test = read_json(j, lines=True, chunksize=chunksize)
    if chunksize is not None:
        with test:
            test = pd.concat(test)
    tm.assert_frame_equal(orig, test, obj=f"chunksize: {chunksize}")


def test_readjson_unicode(request, monkeypatch, engine):
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json "
            "and doesn't support chunksize parameter."
        )
        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))

    with tm.ensure_clean("test.json") as path:
        monkeypatch.setattr("locale.getpreferredencoding", lambda do_setlocale: "cp949")
        with open(path, "w", encoding="utf-8") as f:
            f.write('{"£©µÀÆÖÞßéöÿ":["АБВГДабвгд가"]}')

        result = read_json(path, engine=engine)
        expected = DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
        tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows", [1, 2])
def test_readjson_nrows(nrows, engine):
    # GH 33916
    # Test reading line-format JSON with the nrows param
    jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
    result = read_json(jsonl, lines=True, nrows=nrows)
    expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
def test_readjson_nrows_chunks(request, nrows, chunksize, engine):
    # GH 33916
    # Test reading line-format JSON with the nrows and chunksize params
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json "
            "and doesn't support chunksize parameter."
        )
        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))

    jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
    with read_json(
        jsonl, lines=True, nrows=nrows, chunksize=chunksize, engine=engine
    ) as reader:
        chunked = pd.concat(reader)
    expected = DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
    tm.assert_frame_equal(chunked, expected)


def test_readjson_nrows_requires_lines(engine):
    # GH 33916
    # Test ValueError raised if nrows is set without setting lines in read_json
    jsonl = """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}"""
    msg = "nrows can only be passed if lines=True"
    with pytest.raises(ValueError, match=msg):
        read_json(jsonl, lines=False, nrows=2, engine=engine)


def test_readjson_lines_chunks_fileurl(request, datapath, engine):
    # GH 27135
    # Test reading line-format JSON from file url
    if engine == "pyarrow":
        # GH 48893
        reason = (
            "Pyarrow only supports a file path as an input and line delimited json "
            "and doesn't support chunksize parameter."
        )
        request.node.add_marker(pytest.mark.xfail(reason=reason, raises=ValueError))

    df_list_expected = [
        DataFrame([[1, 2]], columns=["a", "b"], index=[0]),
        DataFrame([[3, 4]], columns=["a", "b"], index=[1]),
        DataFrame([[5, 6]], columns=["a", "b"], index=[2]),
    ]
    os_path = datapath("io", "json", "data", "line_delimited.json")
    file_url = Path(os_path).as_uri()
    with read_json(file_url, lines=True, chunksize=1, engine=engine) as url_reader:
        for index, chunk in enumerate(url_reader):
            tm.assert_frame_equal(chunk, df_list_expected[index])


def test_chunksize_is_incremental():
    # See https://github.com/pandas-dev/pandas/issues/34548
    jsonl = (
        """{"a": 1, "b": 2}
{"a": 3, "b": 4}
{"a": 5, "b": 6}
{"a": 7, "b": 8}\n"""
        * 1000
    )

    class MyReader:
        # File-like wrapper that counts how often it is read from, so the
        # test can assert that the parser pulls data incrementally.
        def __init__(self, contents) -> None:
            self.read_count = 0
            self.stringio = StringIO(contents)

        def read(self, *args):
            self.read_count += 1
            return self.stringio.read(*args)

        def __iter__(self) -> Iterator:
            self.read_count += 1
            return iter(self.stringio)

    reader = MyReader(jsonl)
    assert len(list(read_json(reader, lines=True, chunksize=100))) > 1
    assert reader.read_count > 10


@pytest.mark.parametrize("orient_", ["split", "index", "table"])
def test_to_json_append_orient(orient_):
    # GH 35849
    # Test ValueError when orient is not 'records'
    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    msg = (
        r"mode='a' \(append\) is only supported when"
        "lines is True and orient is 'records'"
    )
    with pytest.raises(ValueError, match=msg):
        df.to_json(mode="a", orient=orient_)


def test_to_json_append_lines():
    # GH 35849
    # Test ValueError when lines is not True
    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    msg = (
        r"mode='a' \(append\) is only supported when"
        "lines is True and orient is 'records'"
    )
    with pytest.raises(ValueError, match=msg):
        df.to_json(mode="a", lines=False, orient="records")


@pytest.mark.parametrize("mode_", ["r", "x"])
def test_to_json_append_mode(mode_):
    # GH 35849
    # Test ValueError when mode is not a supported option
    df = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    msg = (
        f"mode={mode_} is not a valid option."
        "Only 'w' and 'a' are currently supported."
    )
    with pytest.raises(ValueError, match=msg):
        df.to_json(mode=mode_, lines=False, orient="records")


def test_to_json_append_output_consistent_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing same columns, new rows
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})

    expected = DataFrame({"col1": [1, 2, 3, 4], "col2": ["a", "b", "c", "d"]})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df1.to_json(path, lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")

        # Read the combined file back
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)


def test_to_json_append_output_inconsistent_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing one new column, one old column, new rows
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})

    expected = DataFrame(
        {
            "col1": [1, 2, None, None],
            "col2": ["a", "b", "e", "f"],
            "col3": [None, None, "!", "#"],
        }
    )
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df1.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")

        # Read the combined file back
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)


def test_to_json_append_output_different_columns():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing same, differing and new columns
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
    df4 = DataFrame({"col4": [True, False]})

    expected = DataFrame(
        {
            "col1": [1, 2, 3, 4, None, None, None, None],
            "col2": ["a", "b", "c", "d", "e", "f", None, None],
            "col3": [None, None, None, None, "!", "#", None, None],
            "col4": [None, None, None, None, None, None, True, False],
        }
    ).astype({"col4": "float"})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df1.to_json(path, mode="a", lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")
        df4.to_json(path, mode="a", lines=True, orient="records")

        # Read the combined file back
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)


def test_to_json_append_output_different_columns_reordered():
    # GH 35849
    # Testing that resulting output reads in as expected.
    # Testing specific result column order.
    df1 = DataFrame({"col1": [1, 2], "col2": ["a", "b"]})
    df2 = DataFrame({"col1": [3, 4], "col2": ["c", "d"]})
    df3 = DataFrame({"col2": ["e", "f"], "col3": ["!", "#"]})
    df4 = DataFrame({"col4": [True, False]})

    # df4, df3, df2, df1 (in that order)
    expected = DataFrame(
        {
            "col4": [True, False, None, None, None, None, None, None],
            "col2": [None, None, "e", "f", "c", "d", "a", "b"],
            "col3": [None, None, "!", "#", None, None, None, None],
            "col1": [None, None, None, None, 3, 4, 1, 2],
        }
    ).astype({"col4": "float"})
    with tm.ensure_clean("test.json") as path:
        # Save dataframes to the same file
        df4.to_json(path, mode="a", lines=True, orient="records")
        df3.to_json(path, mode="a", lines=True, orient="records")
        df2.to_json(path, mode="a", lines=True, orient="records")
        df1.to_json(path, mode="a", lines=True, orient="records")

        # Read the combined file back
        result = read_json(path, lines=True)
        tm.assert_frame_equal(result, expected)