test_to_csv.py 26 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732
  1. import io
  2. import os
  3. import sys
  4. from zipfile import ZipFile
  5. from _csv import Error
  6. import numpy as np
  7. import pytest
  8. import pandas as pd
  9. from pandas import (
  10. DataFrame,
  11. compat,
  12. )
  13. import pandas._testing as tm
  14. from pandas.tests.io.test_compression import _compression_to_extension
  15. class TestToCSV:
  16. def test_to_csv_with_single_column(self):
  17. # see gh-18676, https://bugs.python.org/issue32255
  18. #
  19. # Python's CSV library adds an extraneous '""'
  20. # before the newline when the NaN-value is in
  21. # the first row. Otherwise, only the newline
  22. # character is added. This behavior is inconsistent
  23. # and was patched in https://bugs.python.org/pull_request4672.
  24. df1 = DataFrame([None, 1])
  25. expected1 = """\
  26. ""
  27. 1.0
  28. """
  29. with tm.ensure_clean("test.csv") as path:
  30. df1.to_csv(path, header=None, index=None)
  31. with open(path) as f:
  32. assert f.read() == expected1
  33. df2 = DataFrame([1, None])
  34. expected2 = """\
  35. 1.0
  36. ""
  37. """
  38. with tm.ensure_clean("test.csv") as path:
  39. df2.to_csv(path, header=None, index=None)
  40. with open(path) as f:
  41. assert f.read() == expected2
  42. def test_to_csv_default_encoding(self):
  43. # GH17097
  44. df = DataFrame({"col": ["AAAAA", "ÄÄÄÄÄ", "ßßßßß", "聞聞聞聞聞"]})
  45. with tm.ensure_clean("test.csv") as path:
  46. # the default to_csv encoding is uft-8.
  47. df.to_csv(path)
  48. tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)
  49. def test_to_csv_quotechar(self):
  50. df = DataFrame({"col": [1, 2]})
  51. expected = """\
  52. "","col"
  53. "0","1"
  54. "1","2"
  55. """
  56. with tm.ensure_clean("test.csv") as path:
  57. df.to_csv(path, quoting=1) # 1=QUOTE_ALL
  58. with open(path) as f:
  59. assert f.read() == expected
  60. expected = """\
  61. $$,$col$
  62. $0$,$1$
  63. $1$,$2$
  64. """
  65. with tm.ensure_clean("test.csv") as path:
  66. df.to_csv(path, quoting=1, quotechar="$")
  67. with open(path) as f:
  68. assert f.read() == expected
  69. with tm.ensure_clean("test.csv") as path:
  70. with pytest.raises(TypeError, match="quotechar"):
  71. df.to_csv(path, quoting=1, quotechar=None)
  72. def test_to_csv_doublequote(self):
  73. df = DataFrame({"col": ['a"a', '"bb"']})
  74. expected = '''\
  75. "","col"
  76. "0","a""a"
  77. "1","""bb"""
  78. '''
  79. with tm.ensure_clean("test.csv") as path:
  80. df.to_csv(path, quoting=1, doublequote=True) # QUOTE_ALL
  81. with open(path) as f:
  82. assert f.read() == expected
  83. with tm.ensure_clean("test.csv") as path:
  84. with pytest.raises(Error, match="escapechar"):
  85. df.to_csv(path, doublequote=False) # no escapechar set
  86. def test_to_csv_escapechar(self):
  87. df = DataFrame({"col": ['a"a', '"bb"']})
  88. expected = """\
  89. "","col"
  90. "0","a\\"a"
  91. "1","\\"bb\\""
  92. """
  93. with tm.ensure_clean("test.csv") as path: # QUOTE_ALL
  94. df.to_csv(path, quoting=1, doublequote=False, escapechar="\\")
  95. with open(path) as f:
  96. assert f.read() == expected
  97. df = DataFrame({"col": ["a,a", ",bb,"]})
  98. expected = """\
  99. ,col
  100. 0,a\\,a
  101. 1,\\,bb\\,
  102. """
  103. with tm.ensure_clean("test.csv") as path:
  104. df.to_csv(path, quoting=3, escapechar="\\") # QUOTE_NONE
  105. with open(path) as f:
  106. assert f.read() == expected
  107. def test_csv_to_string(self):
  108. df = DataFrame({"col": [1, 2]})
  109. expected_rows = [",col", "0,1", "1,2"]
  110. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  111. assert df.to_csv() == expected
  112. def test_to_csv_decimal(self):
  113. # see gh-781
  114. df = DataFrame({"col1": [1], "col2": ["a"], "col3": [10.1]})
  115. expected_rows = [",col1,col2,col3", "0,1,a,10.1"]
  116. expected_default = tm.convert_rows_list_to_csv_str(expected_rows)
  117. assert df.to_csv() == expected_default
  118. expected_rows = [";col1;col2;col3", "0;1;a;10,1"]
  119. expected_european_excel = tm.convert_rows_list_to_csv_str(expected_rows)
  120. assert df.to_csv(decimal=",", sep=";") == expected_european_excel
  121. expected_rows = [",col1,col2,col3", "0,1,a,10.10"]
  122. expected_float_format_default = tm.convert_rows_list_to_csv_str(expected_rows)
  123. assert df.to_csv(float_format="%.2f") == expected_float_format_default
  124. expected_rows = [";col1;col2;col3", "0;1;a;10,10"]
  125. expected_float_format = tm.convert_rows_list_to_csv_str(expected_rows)
  126. assert (
  127. df.to_csv(decimal=",", sep=";", float_format="%.2f")
  128. == expected_float_format
  129. )
  130. # see gh-11553: testing if decimal is taken into account for '0.0'
  131. df = DataFrame({"a": [0, 1.1], "b": [2.2, 3.3], "c": 1})
  132. expected_rows = ["a,b,c", "0^0,2^2,1", "1^1,3^3,1"]
  133. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  134. assert df.to_csv(index=False, decimal="^") == expected
  135. # same but for an index
  136. assert df.set_index("a").to_csv(decimal="^") == expected
  137. # same for a multi-index
  138. assert df.set_index(["a", "b"]).to_csv(decimal="^") == expected
  139. def test_to_csv_float_format(self):
  140. # testing if float_format is taken into account for the index
  141. # GH 11553
  142. df = DataFrame({"a": [0, 1], "b": [2.2, 3.3], "c": 1})
  143. expected_rows = ["a,b,c", "0,2.20,1", "1,3.30,1"]
  144. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  145. assert df.set_index("a").to_csv(float_format="%.2f") == expected
  146. # same for a multi-index
  147. assert df.set_index(["a", "b"]).to_csv(float_format="%.2f") == expected
  148. def test_to_csv_na_rep(self):
  149. # see gh-11553
  150. #
  151. # Testing if NaN values are correctly represented in the index.
  152. df = DataFrame({"a": [0, np.NaN], "b": [0, 1], "c": [2, 3]})
  153. expected_rows = ["a,b,c", "0.0,0,2", "_,1,3"]
  154. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  155. assert df.set_index("a").to_csv(na_rep="_") == expected
  156. assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
  157. # now with an index containing only NaNs
  158. df = DataFrame({"a": np.NaN, "b": [0, 1], "c": [2, 3]})
  159. expected_rows = ["a,b,c", "_,0,2", "_,1,3"]
  160. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  161. assert df.set_index("a").to_csv(na_rep="_") == expected
  162. assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
  163. # check if na_rep parameter does not break anything when no NaN
  164. df = DataFrame({"a": 0, "b": [0, 1], "c": [2, 3]})
  165. expected_rows = ["a,b,c", "0,0,2", "0,1,3"]
  166. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  167. assert df.set_index("a").to_csv(na_rep="_") == expected
  168. assert df.set_index(["a", "b"]).to_csv(na_rep="_") == expected
  169. csv = pd.Series(["a", pd.NA, "c"]).to_csv(na_rep="ZZZZZ")
  170. expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"])
  171. assert expected == csv
  172. def test_to_csv_na_rep_nullable_string(self, nullable_string_dtype):
  173. # GH 29975
  174. # Make sure full na_rep shows up when a dtype is provided
  175. expected = tm.convert_rows_list_to_csv_str([",0", "0,a", "1,ZZZZZ", "2,c"])
  176. csv = pd.Series(["a", pd.NA, "c"], dtype=nullable_string_dtype).to_csv(
  177. na_rep="ZZZZZ"
  178. )
  179. assert expected == csv
  180. def test_to_csv_date_format(self):
  181. # GH 10209
  182. df_sec = DataFrame({"A": pd.date_range("20130101", periods=5, freq="s")})
  183. df_day = DataFrame({"A": pd.date_range("20130101", periods=5, freq="d")})
  184. expected_rows = [
  185. ",A",
  186. "0,2013-01-01 00:00:00",
  187. "1,2013-01-01 00:00:01",
  188. "2,2013-01-01 00:00:02",
  189. "3,2013-01-01 00:00:03",
  190. "4,2013-01-01 00:00:04",
  191. ]
  192. expected_default_sec = tm.convert_rows_list_to_csv_str(expected_rows)
  193. assert df_sec.to_csv() == expected_default_sec
  194. expected_rows = [
  195. ",A",
  196. "0,2013-01-01 00:00:00",
  197. "1,2013-01-02 00:00:00",
  198. "2,2013-01-03 00:00:00",
  199. "3,2013-01-04 00:00:00",
  200. "4,2013-01-05 00:00:00",
  201. ]
  202. expected_ymdhms_day = tm.convert_rows_list_to_csv_str(expected_rows)
  203. assert df_day.to_csv(date_format="%Y-%m-%d %H:%M:%S") == expected_ymdhms_day
  204. expected_rows = [
  205. ",A",
  206. "0,2013-01-01",
  207. "1,2013-01-01",
  208. "2,2013-01-01",
  209. "3,2013-01-01",
  210. "4,2013-01-01",
  211. ]
  212. expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
  213. assert df_sec.to_csv(date_format="%Y-%m-%d") == expected_ymd_sec
  214. expected_rows = [
  215. ",A",
  216. "0,2013-01-01",
  217. "1,2013-01-02",
  218. "2,2013-01-03",
  219. "3,2013-01-04",
  220. "4,2013-01-05",
  221. ]
  222. expected_default_day = tm.convert_rows_list_to_csv_str(expected_rows)
  223. assert df_day.to_csv() == expected_default_day
  224. assert df_day.to_csv(date_format="%Y-%m-%d") == expected_default_day
  225. # see gh-7791
  226. #
  227. # Testing if date_format parameter is taken into account
  228. # for multi-indexed DataFrames.
  229. df_sec["B"] = 0
  230. df_sec["C"] = 1
  231. expected_rows = ["A,B,C", "2013-01-01,0,1.0"]
  232. expected_ymd_sec = tm.convert_rows_list_to_csv_str(expected_rows)
  233. df_sec_grouped = df_sec.groupby([pd.Grouper(key="A", freq="1h"), "B"])
  234. assert df_sec_grouped.mean().to_csv(date_format="%Y-%m-%d") == expected_ymd_sec
  235. def test_to_csv_different_datetime_formats(self):
  236. # GH#21734
  237. df = DataFrame(
  238. {
  239. "date": pd.to_datetime("1970-01-01"),
  240. "datetime": pd.date_range("1970-01-01", periods=2, freq="H"),
  241. }
  242. )
  243. expected_rows = [
  244. "date,datetime",
  245. "1970-01-01,1970-01-01 00:00:00",
  246. "1970-01-01,1970-01-01 01:00:00",
  247. ]
  248. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  249. assert df.to_csv(index=False) == expected
  250. def test_to_csv_date_format_in_categorical(self):
  251. # GH#40754
  252. ser = pd.Series(pd.to_datetime(["2021-03-27", pd.NaT], format="%Y-%m-%d"))
  253. ser = ser.astype("category")
  254. expected = tm.convert_rows_list_to_csv_str(["0", "2021-03-27", '""'])
  255. assert ser.to_csv(index=False) == expected
  256. ser = pd.Series(
  257. pd.date_range(
  258. start="2021-03-27", freq="D", periods=1, tz="Europe/Berlin"
  259. ).append(pd.DatetimeIndex([pd.NaT]))
  260. )
  261. ser = ser.astype("category")
  262. assert ser.to_csv(index=False, date_format="%Y-%m-%d") == expected
  263. def test_to_csv_float_ea_float_format(self):
  264. # GH#45991
  265. df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"})
  266. df["a"] = df["a"].astype("Float64")
  267. result = df.to_csv(index=False, float_format="%.5f")
  268. expected = tm.convert_rows_list_to_csv_str(
  269. ["a,b", "1.10000,c", "2.02000,c", ",c", "6.00001,c"]
  270. )
  271. assert result == expected
  272. def test_to_csv_float_ea_no_float_format(self):
  273. # GH#45991
  274. df = DataFrame({"a": [1.1, 2.02, pd.NA, 6.000006], "b": "c"})
  275. df["a"] = df["a"].astype("Float64")
  276. result = df.to_csv(index=False)
  277. expected = tm.convert_rows_list_to_csv_str(
  278. ["a,b", "1.1,c", "2.02,c", ",c", "6.000006,c"]
  279. )
  280. assert result == expected
  281. def test_to_csv_multi_index(self):
  282. # see gh-6618
  283. df = DataFrame([1], columns=pd.MultiIndex.from_arrays([[1], [2]]))
  284. exp_rows = [",1", ",2", "0,1"]
  285. exp = tm.convert_rows_list_to_csv_str(exp_rows)
  286. assert df.to_csv() == exp
  287. exp_rows = ["1", "2", "1"]
  288. exp = tm.convert_rows_list_to_csv_str(exp_rows)
  289. assert df.to_csv(index=False) == exp
  290. df = DataFrame(
  291. [1],
  292. columns=pd.MultiIndex.from_arrays([[1], [2]]),
  293. index=pd.MultiIndex.from_arrays([[1], [2]]),
  294. )
  295. exp_rows = [",,1", ",,2", "1,2,1"]
  296. exp = tm.convert_rows_list_to_csv_str(exp_rows)
  297. assert df.to_csv() == exp
  298. exp_rows = ["1", "2", "1"]
  299. exp = tm.convert_rows_list_to_csv_str(exp_rows)
  300. assert df.to_csv(index=False) == exp
  301. df = DataFrame([1], columns=pd.MultiIndex.from_arrays([["foo"], ["bar"]]))
  302. exp_rows = [",foo", ",bar", "0,1"]
  303. exp = tm.convert_rows_list_to_csv_str(exp_rows)
  304. assert df.to_csv() == exp
  305. exp_rows = ["foo", "bar", "1"]
  306. exp = tm.convert_rows_list_to_csv_str(exp_rows)
  307. assert df.to_csv(index=False) == exp
  308. @pytest.mark.parametrize(
  309. "ind,expected",
  310. [
  311. (
  312. pd.MultiIndex(levels=[[1.0]], codes=[[0]], names=["x"]),
  313. "x,data\n1.0,1\n",
  314. ),
  315. (
  316. pd.MultiIndex(
  317. levels=[[1.0], [2.0]], codes=[[0], [0]], names=["x", "y"]
  318. ),
  319. "x,y,data\n1.0,2.0,1\n",
  320. ),
  321. ],
  322. )
  323. def test_to_csv_single_level_multi_index(self, ind, expected, frame_or_series):
  324. # see gh-19589
  325. obj = frame_or_series(pd.Series([1], ind, name="data"))
  326. result = obj.to_csv(lineterminator="\n", header=True)
  327. assert result == expected
  328. def test_to_csv_string_array_ascii(self):
  329. # GH 10813
  330. str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
  331. df = DataFrame(str_array)
  332. expected_ascii = """\
  333. ,names
  334. 0,"['foo', 'bar']"
  335. 1,"['baz', 'qux']"
  336. """
  337. with tm.ensure_clean("str_test.csv") as path:
  338. df.to_csv(path, encoding="ascii")
  339. with open(path) as f:
  340. assert f.read() == expected_ascii
  341. def test_to_csv_string_array_utf8(self):
  342. # GH 10813
  343. str_array = [{"names": ["foo", "bar"]}, {"names": ["baz", "qux"]}]
  344. df = DataFrame(str_array)
  345. expected_utf8 = """\
  346. ,names
  347. 0,"['foo', 'bar']"
  348. 1,"['baz', 'qux']"
  349. """
  350. with tm.ensure_clean("unicode_test.csv") as path:
  351. df.to_csv(path, encoding="utf-8")
  352. with open(path) as f:
  353. assert f.read() == expected_utf8
  354. def test_to_csv_string_with_lf(self):
  355. # GH 20353
  356. data = {"int": [1, 2, 3], "str_lf": ["abc", "d\nef", "g\nh\n\ni"]}
  357. df = DataFrame(data)
  358. with tm.ensure_clean("lf_test.csv") as path:
  359. # case 1: The default line terminator(=os.linesep)(PR 21406)
  360. os_linesep = os.linesep.encode("utf-8")
  361. expected_noarg = (
  362. b"int,str_lf"
  363. + os_linesep
  364. + b"1,abc"
  365. + os_linesep
  366. + b'2,"d\nef"'
  367. + os_linesep
  368. + b'3,"g\nh\n\ni"'
  369. + os_linesep
  370. )
  371. df.to_csv(path, index=False)
  372. with open(path, "rb") as f:
  373. assert f.read() == expected_noarg
  374. with tm.ensure_clean("lf_test.csv") as path:
  375. # case 2: LF as line terminator
  376. expected_lf = b'int,str_lf\n1,abc\n2,"d\nef"\n3,"g\nh\n\ni"\n'
  377. df.to_csv(path, lineterminator="\n", index=False)
  378. with open(path, "rb") as f:
  379. assert f.read() == expected_lf
  380. with tm.ensure_clean("lf_test.csv") as path:
  381. # case 3: CRLF as line terminator
  382. # 'lineterminator' should not change inner element
  383. expected_crlf = b'int,str_lf\r\n1,abc\r\n2,"d\nef"\r\n3,"g\nh\n\ni"\r\n'
  384. df.to_csv(path, lineterminator="\r\n", index=False)
  385. with open(path, "rb") as f:
  386. assert f.read() == expected_crlf
  387. def test_to_csv_string_with_crlf(self):
  388. # GH 20353
  389. data = {"int": [1, 2, 3], "str_crlf": ["abc", "d\r\nef", "g\r\nh\r\n\r\ni"]}
  390. df = DataFrame(data)
  391. with tm.ensure_clean("crlf_test.csv") as path:
  392. # case 1: The default line terminator(=os.linesep)(PR 21406)
  393. os_linesep = os.linesep.encode("utf-8")
  394. expected_noarg = (
  395. b"int,str_crlf"
  396. + os_linesep
  397. + b"1,abc"
  398. + os_linesep
  399. + b'2,"d\r\nef"'
  400. + os_linesep
  401. + b'3,"g\r\nh\r\n\r\ni"'
  402. + os_linesep
  403. )
  404. df.to_csv(path, index=False)
  405. with open(path, "rb") as f:
  406. assert f.read() == expected_noarg
  407. with tm.ensure_clean("crlf_test.csv") as path:
  408. # case 2: LF as line terminator
  409. expected_lf = b'int,str_crlf\n1,abc\n2,"d\r\nef"\n3,"g\r\nh\r\n\r\ni"\n'
  410. df.to_csv(path, lineterminator="\n", index=False)
  411. with open(path, "rb") as f:
  412. assert f.read() == expected_lf
  413. with tm.ensure_clean("crlf_test.csv") as path:
  414. # case 3: CRLF as line terminator
  415. # 'lineterminator' should not change inner element
  416. expected_crlf = (
  417. b"int,str_crlf\r\n"
  418. b"1,abc\r\n"
  419. b'2,"d\r\nef"\r\n'
  420. b'3,"g\r\nh\r\n\r\ni"\r\n'
  421. )
  422. df.to_csv(path, lineterminator="\r\n", index=False)
  423. with open(path, "rb") as f:
  424. assert f.read() == expected_crlf
  425. def test_to_csv_stdout_file(self, capsys):
  426. # GH 21561
  427. df = DataFrame([["foo", "bar"], ["baz", "qux"]], columns=["name_1", "name_2"])
  428. expected_rows = [",name_1,name_2", "0,foo,bar", "1,baz,qux"]
  429. expected_ascii = tm.convert_rows_list_to_csv_str(expected_rows)
  430. df.to_csv(sys.stdout, encoding="ascii")
  431. captured = capsys.readouterr()
  432. assert captured.out == expected_ascii
  433. assert not sys.stdout.closed
  434. @pytest.mark.xfail(
  435. compat.is_platform_windows(),
  436. reason=(
  437. "Especially in Windows, file stream should not be passed"
  438. "to csv writer without newline='' option."
  439. "(https://docs.python.org/3.6/library/csv.html#csv.writer)"
  440. ),
  441. )
  442. def test_to_csv_write_to_open_file(self):
  443. # GH 21696
  444. df = DataFrame({"a": ["x", "y", "z"]})
  445. expected = """\
  446. manual header
  447. x
  448. y
  449. z
  450. """
  451. with tm.ensure_clean("test.txt") as path:
  452. with open(path, "w") as f:
  453. f.write("manual header\n")
  454. df.to_csv(f, header=None, index=None)
  455. with open(path) as f:
  456. assert f.read() == expected
  457. def test_to_csv_write_to_open_file_with_newline_py3(self):
  458. # see gh-21696
  459. # see gh-20353
  460. df = DataFrame({"a": ["x", "y", "z"]})
  461. expected_rows = ["x", "y", "z"]
  462. expected = "manual header\n" + tm.convert_rows_list_to_csv_str(expected_rows)
  463. with tm.ensure_clean("test.txt") as path:
  464. with open(path, "w", newline="") as f:
  465. f.write("manual header\n")
  466. df.to_csv(f, header=None, index=None)
  467. with open(path, "rb") as f:
  468. assert f.read() == bytes(expected, "utf-8")
  469. @pytest.mark.parametrize("to_infer", [True, False])
  470. @pytest.mark.parametrize("read_infer", [True, False])
  471. def test_to_csv_compression(self, compression_only, read_infer, to_infer):
  472. # see gh-15008
  473. compression = compression_only
  474. # We'll complete file extension subsequently.
  475. filename = "test."
  476. filename += _compression_to_extension[compression]
  477. df = DataFrame({"A": [1]})
  478. to_compression = "infer" if to_infer else compression
  479. read_compression = "infer" if read_infer else compression
  480. with tm.ensure_clean(filename) as path:
  481. df.to_csv(path, compression=to_compression)
  482. result = pd.read_csv(path, index_col=0, compression=read_compression)
  483. tm.assert_frame_equal(result, df)
  484. def test_to_csv_compression_dict(self, compression_only):
  485. # GH 26023
  486. method = compression_only
  487. df = DataFrame({"ABC": [1]})
  488. filename = "to_csv_compress_as_dict."
  489. extension = {
  490. "gzip": "gz",
  491. "zstd": "zst",
  492. }.get(method, method)
  493. filename += extension
  494. with tm.ensure_clean(filename) as path:
  495. df.to_csv(path, compression={"method": method})
  496. read_df = pd.read_csv(path, index_col=0)
  497. tm.assert_frame_equal(read_df, df)
  498. def test_to_csv_compression_dict_no_method_raises(self):
  499. # GH 26023
  500. df = DataFrame({"ABC": [1]})
  501. compression = {"some_option": True}
  502. msg = "must have key 'method'"
  503. with tm.ensure_clean("out.zip") as path:
  504. with pytest.raises(ValueError, match=msg):
  505. df.to_csv(path, compression=compression)
  506. @pytest.mark.parametrize("compression", ["zip", "infer"])
  507. @pytest.mark.parametrize("archive_name", ["test_to_csv.csv", "test_to_csv.zip"])
  508. def test_to_csv_zip_arguments(self, compression, archive_name):
  509. # GH 26023
  510. df = DataFrame({"ABC": [1]})
  511. with tm.ensure_clean("to_csv_archive_name.zip") as path:
  512. df.to_csv(
  513. path, compression={"method": compression, "archive_name": archive_name}
  514. )
  515. with ZipFile(path) as zp:
  516. assert len(zp.filelist) == 1
  517. archived_file = zp.filelist[0].filename
  518. assert archived_file == archive_name
  519. @pytest.mark.parametrize(
  520. "filename,expected_arcname",
  521. [
  522. ("archive.csv", "archive.csv"),
  523. ("archive.tsv", "archive.tsv"),
  524. ("archive.csv.zip", "archive.csv"),
  525. ("archive.tsv.zip", "archive.tsv"),
  526. ("archive.zip", "archive"),
  527. ],
  528. )
  529. def test_to_csv_zip_infer_name(self, tmp_path, filename, expected_arcname):
  530. # GH 39465
  531. df = DataFrame({"ABC": [1]})
  532. path = tmp_path / filename
  533. df.to_csv(path, compression="zip")
  534. with ZipFile(path) as zp:
  535. assert len(zp.filelist) == 1
  536. archived_file = zp.filelist[0].filename
  537. assert archived_file == expected_arcname
  538. @pytest.mark.parametrize("df_new_type", ["Int64"])
  539. def test_to_csv_na_rep_long_string(self, df_new_type):
  540. # see gh-25099
  541. df = DataFrame({"c": [float("nan")] * 3})
  542. df = df.astype(df_new_type)
  543. expected_rows = ["c", "mynull", "mynull", "mynull"]
  544. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  545. result = df.to_csv(index=False, na_rep="mynull", encoding="ascii")
  546. assert expected == result
  547. def test_to_csv_timedelta_precision(self):
  548. # GH 6783
  549. s = pd.Series([1, 1]).astype("timedelta64[ns]")
  550. buf = io.StringIO()
  551. s.to_csv(buf)
  552. result = buf.getvalue()
  553. expected_rows = [
  554. ",0",
  555. "0,0 days 00:00:00.000000001",
  556. "1,0 days 00:00:00.000000001",
  557. ]
  558. expected = tm.convert_rows_list_to_csv_str(expected_rows)
  559. assert result == expected
  560. def test_na_rep_truncated(self):
  561. # https://github.com/pandas-dev/pandas/issues/31447
  562. result = pd.Series(range(8, 12)).to_csv(na_rep="-")
  563. expected = tm.convert_rows_list_to_csv_str([",0", "0,8", "1,9", "2,10", "3,11"])
  564. assert result == expected
  565. result = pd.Series([True, False]).to_csv(na_rep="nan")
  566. expected = tm.convert_rows_list_to_csv_str([",0", "0,True", "1,False"])
  567. assert result == expected
  568. result = pd.Series([1.1, 2.2]).to_csv(na_rep=".")
  569. expected = tm.convert_rows_list_to_csv_str([",0", "0,1.1", "1,2.2"])
  570. assert result == expected
  571. @pytest.mark.parametrize("errors", ["surrogatepass", "ignore", "replace"])
  572. def test_to_csv_errors(self, errors):
  573. # GH 22610
  574. data = ["\ud800foo"]
  575. ser = pd.Series(data, index=pd.Index(data))
  576. with tm.ensure_clean("test.csv") as path:
  577. ser.to_csv(path, errors=errors)
  578. # No use in reading back the data as it is not the same anymore
  579. # due to the error handling
  580. @pytest.mark.parametrize("mode", ["wb", "w"])
  581. def test_to_csv_binary_handle(self, mode):
  582. """
  583. Binary file objects should work (if 'mode' contains a 'b') or even without
  584. it in most cases.
  585. GH 35058 and GH 19827
  586. """
  587. df = tm.makeDataFrame()
  588. with tm.ensure_clean() as path:
  589. with open(path, mode="w+b") as handle:
  590. df.to_csv(handle, mode=mode)
  591. tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
  592. @pytest.mark.parametrize("mode", ["wb", "w"])
  593. def test_to_csv_encoding_binary_handle(self, mode):
  594. """
  595. Binary file objects should honor a specified encoding.
  596. GH 23854 and GH 13068 with binary handles
  597. """
  598. # example from GH 23854
  599. content = "a, b, 🐟".encode("utf-8-sig")
  600. buffer = io.BytesIO(content)
  601. df = pd.read_csv(buffer, encoding="utf-8-sig")
  602. buffer = io.BytesIO()
  603. df.to_csv(buffer, mode=mode, encoding="utf-8-sig", index=False)
  604. buffer.seek(0) # tests whether file handle wasn't closed
  605. assert buffer.getvalue().startswith(content)
  606. # example from GH 13068
  607. with tm.ensure_clean() as path:
  608. with open(path, "w+b") as handle:
  609. DataFrame().to_csv(handle, mode=mode, encoding="utf-8-sig")
  610. handle.seek(0)
  611. assert handle.read().startswith(b'\xef\xbb\xbf""')
  612. def test_to_csv_iterative_compression_name(compression):
  613. # GH 38714
  614. df = tm.makeDataFrame()
  615. with tm.ensure_clean() as path:
  616. df.to_csv(path, compression=compression, chunksize=1)
  617. tm.assert_frame_equal(
  618. pd.read_csv(path, compression=compression, index_col=0), df
  619. )
  620. def test_to_csv_iterative_compression_buffer(compression):
  621. # GH 38714
  622. df = tm.makeDataFrame()
  623. with io.BytesIO() as buffer:
  624. df.to_csv(buffer, compression=compression, chunksize=1)
  625. buffer.seek(0)
  626. tm.assert_frame_equal(
  627. pd.read_csv(buffer, compression=compression, index_col=0), df
  628. )
  629. assert not buffer.closed