test_compression.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340
  1. import gzip
  2. import io
  3. import os
  4. from pathlib import Path
  5. import subprocess
  6. import sys
  7. import tarfile
  8. import textwrap
  9. import time
  10. import zipfile
  11. import pytest
  12. from pandas.compat import is_platform_windows
  13. import pandas as pd
  14. import pandas._testing as tm
  15. import pandas.io.common as icom
  16. _compression_to_extension = {
  17. value: key for key, value in icom.extension_to_compression.items()
  18. }
  19. @pytest.mark.parametrize(
  20. "obj",
  21. [
  22. pd.DataFrame(
  23. 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  24. columns=["X", "Y", "Z"],
  25. ),
  26. pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
  27. ],
  28. )
  29. @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
  30. def test_compression_size(obj, method, compression_only):
  31. if compression_only == "tar":
  32. compression_only = {"method": "tar", "mode": "w:gz"}
  33. with tm.ensure_clean() as path:
  34. getattr(obj, method)(path, compression=compression_only)
  35. compressed_size = os.path.getsize(path)
  36. getattr(obj, method)(path, compression=None)
  37. uncompressed_size = os.path.getsize(path)
  38. assert uncompressed_size > compressed_size
  39. @pytest.mark.parametrize(
  40. "obj",
  41. [
  42. pd.DataFrame(
  43. 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  44. columns=["X", "Y", "Z"],
  45. ),
  46. pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
  47. ],
  48. )
  49. @pytest.mark.parametrize("method", ["to_csv", "to_json"])
  50. def test_compression_size_fh(obj, method, compression_only):
  51. with tm.ensure_clean() as path:
  52. with icom.get_handle(
  53. path,
  54. "w:gz" if compression_only == "tar" else "w",
  55. compression=compression_only,
  56. ) as handles:
  57. getattr(obj, method)(handles.handle)
  58. assert not handles.handle.closed
  59. compressed_size = os.path.getsize(path)
  60. with tm.ensure_clean() as path:
  61. with icom.get_handle(path, "w", compression=None) as handles:
  62. getattr(obj, method)(handles.handle)
  63. assert not handles.handle.closed
  64. uncompressed_size = os.path.getsize(path)
  65. assert uncompressed_size > compressed_size
  66. @pytest.mark.parametrize(
  67. "write_method, write_kwargs, read_method",
  68. [
  69. ("to_csv", {"index": False}, pd.read_csv),
  70. ("to_json", {}, pd.read_json),
  71. ("to_pickle", {}, pd.read_pickle),
  72. ],
  73. )
  74. def test_dataframe_compression_defaults_to_infer(
  75. write_method, write_kwargs, read_method, compression_only
  76. ):
  77. # GH22004
  78. input = pd.DataFrame([[1.0, 0, -4], [3.4, 5, 2]], columns=["X", "Y", "Z"])
  79. extension = _compression_to_extension[compression_only]
  80. with tm.ensure_clean("compressed" + extension) as path:
  81. getattr(input, write_method)(path, **write_kwargs)
  82. output = read_method(path, compression=compression_only)
  83. tm.assert_frame_equal(output, input)
  84. @pytest.mark.parametrize(
  85. "write_method,write_kwargs,read_method,read_kwargs",
  86. [
  87. ("to_csv", {"index": False, "header": True}, pd.read_csv, {"squeeze": True}),
  88. ("to_json", {}, pd.read_json, {"typ": "series"}),
  89. ("to_pickle", {}, pd.read_pickle, {}),
  90. ],
  91. )
  92. def test_series_compression_defaults_to_infer(
  93. write_method, write_kwargs, read_method, read_kwargs, compression_only
  94. ):
  95. # GH22004
  96. input = pd.Series([0, 5, -2, 10], name="X")
  97. extension = _compression_to_extension[compression_only]
  98. with tm.ensure_clean("compressed" + extension) as path:
  99. getattr(input, write_method)(path, **write_kwargs)
  100. if "squeeze" in read_kwargs:
  101. kwargs = read_kwargs.copy()
  102. del kwargs["squeeze"]
  103. output = read_method(path, compression=compression_only, **kwargs).squeeze(
  104. "columns"
  105. )
  106. else:
  107. output = read_method(path, compression=compression_only, **read_kwargs)
  108. tm.assert_series_equal(output, input, check_names=False)
  109. def test_compression_warning(compression_only):
  110. # Assert that passing a file object to to_csv while explicitly specifying a
  111. # compression protocol triggers a RuntimeWarning, as per GH21227.
  112. df = pd.DataFrame(
  113. 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  114. columns=["X", "Y", "Z"],
  115. )
  116. with tm.ensure_clean() as path:
  117. with icom.get_handle(path, "w", compression=compression_only) as handles:
  118. with tm.assert_produces_warning(RuntimeWarning):
  119. df.to_csv(handles.handle, compression=compression_only)
  120. def test_compression_binary(compression_only):
  121. """
  122. Binary file handles support compression.
  123. GH22555
  124. """
  125. df = tm.makeDataFrame()
  126. # with a file
  127. with tm.ensure_clean() as path:
  128. with open(path, mode="wb") as file:
  129. df.to_csv(file, mode="wb", compression=compression_only)
  130. file.seek(0) # file shouldn't be closed
  131. tm.assert_frame_equal(
  132. df, pd.read_csv(path, index_col=0, compression=compression_only)
  133. )
  134. # with BytesIO
  135. file = io.BytesIO()
  136. df.to_csv(file, mode="wb", compression=compression_only)
  137. file.seek(0) # file shouldn't be closed
  138. tm.assert_frame_equal(
  139. df, pd.read_csv(file, index_col=0, compression=compression_only)
  140. )
  141. def test_gzip_reproducibility_file_name():
  142. """
  143. Gzip should create reproducible archives with mtime.
  144. Note: Archives created with different filenames will still be different!
  145. GH 28103
  146. """
  147. df = tm.makeDataFrame()
  148. compression_options = {"method": "gzip", "mtime": 1}
  149. # test for filename
  150. with tm.ensure_clean() as path:
  151. path = Path(path)
  152. df.to_csv(path, compression=compression_options)
  153. time.sleep(2)
  154. output = path.read_bytes()
  155. df.to_csv(path, compression=compression_options)
  156. assert output == path.read_bytes()
  157. def test_gzip_reproducibility_file_object():
  158. """
  159. Gzip should create reproducible archives with mtime.
  160. GH 28103
  161. """
  162. df = tm.makeDataFrame()
  163. compression_options = {"method": "gzip", "mtime": 1}
  164. # test for file object
  165. buffer = io.BytesIO()
  166. df.to_csv(buffer, compression=compression_options, mode="wb")
  167. output = buffer.getvalue()
  168. time.sleep(2)
  169. buffer = io.BytesIO()
  170. df.to_csv(buffer, compression=compression_options, mode="wb")
  171. assert output == buffer.getvalue()
  172. def test_with_missing_lzma():
  173. """Tests if import pandas works when lzma is not present."""
  174. # https://github.com/pandas-dev/pandas/issues/27575
  175. code = textwrap.dedent(
  176. """\
  177. import sys
  178. sys.modules['lzma'] = None
  179. import pandas
  180. """
  181. )
  182. subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
  183. def test_with_missing_lzma_runtime():
  184. """Tests if RuntimeError is hit when calling lzma without
  185. having the module available.
  186. """
  187. code = textwrap.dedent(
  188. """
  189. import sys
  190. import pytest
  191. sys.modules['lzma'] = None
  192. import pandas as pd
  193. df = pd.DataFrame()
  194. with pytest.raises(RuntimeError, match='lzma module'):
  195. df.to_csv('foo.csv', compression='xz')
  196. """
  197. )
  198. subprocess.check_output([sys.executable, "-c", code], stderr=subprocess.PIPE)
  199. @pytest.mark.parametrize(
  200. "obj",
  201. [
  202. pd.DataFrame(
  203. 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  204. columns=["X", "Y", "Z"],
  205. ),
  206. pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
  207. ],
  208. )
  209. @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
  210. def test_gzip_compression_level(obj, method):
  211. # GH33196
  212. with tm.ensure_clean() as path:
  213. getattr(obj, method)(path, compression="gzip")
  214. compressed_size_default = os.path.getsize(path)
  215. getattr(obj, method)(path, compression={"method": "gzip", "compresslevel": 1})
  216. compressed_size_fast = os.path.getsize(path)
  217. assert compressed_size_default < compressed_size_fast
  218. @pytest.mark.parametrize(
  219. "obj",
  220. [
  221. pd.DataFrame(
  222. 100 * [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]],
  223. columns=["X", "Y", "Z"],
  224. ),
  225. pd.Series(100 * [0.123456, 0.234567, 0.567567], name="X"),
  226. ],
  227. )
  228. @pytest.mark.parametrize("method", ["to_pickle", "to_json", "to_csv"])
  229. def test_bzip_compression_level(obj, method):
  230. """GH33196 bzip needs file size > 100k to show a size difference between
  231. compression levels, so here we just check if the call works when
  232. compression is passed as a dict.
  233. """
  234. with tm.ensure_clean() as path:
  235. getattr(obj, method)(path, compression={"method": "bz2", "compresslevel": 1})
  236. @pytest.mark.parametrize(
  237. "suffix,archive",
  238. [
  239. (".zip", zipfile.ZipFile),
  240. (".tar", tarfile.TarFile),
  241. ],
  242. )
  243. def test_empty_archive_zip(suffix, archive):
  244. with tm.ensure_clean(filename=suffix) as path:
  245. with archive(path, "w"):
  246. pass
  247. with pytest.raises(ValueError, match="Zero files found"):
  248. pd.read_csv(path)
  249. def test_ambiguous_archive_zip():
  250. with tm.ensure_clean(filename=".zip") as path:
  251. with zipfile.ZipFile(path, "w") as file:
  252. file.writestr("a.csv", "foo,bar")
  253. file.writestr("b.csv", "foo,bar")
  254. with pytest.raises(ValueError, match="Multiple files found in ZIP file"):
  255. pd.read_csv(path)
  256. def test_ambiguous_archive_tar(tmp_path):
  257. csvAPath = tmp_path / "a.csv"
  258. with open(csvAPath, "w") as a:
  259. a.write("foo,bar\n")
  260. csvBPath = tmp_path / "b.csv"
  261. with open(csvBPath, "w") as b:
  262. b.write("foo,bar\n")
  263. tarpath = tmp_path / "archive.tar"
  264. with tarfile.TarFile(tarpath, "w") as tar:
  265. tar.add(csvAPath, "a.csv")
  266. tar.add(csvBPath, "b.csv")
  267. with pytest.raises(ValueError, match="Multiple files found in TAR archive"):
  268. pd.read_csv(tarpath)
  269. def test_tar_gz_to_different_filename():
  270. with tm.ensure_clean(filename=".foo") as file:
  271. pd.DataFrame(
  272. [["1", "2"]],
  273. columns=["foo", "bar"],
  274. ).to_csv(file, compression={"method": "tar", "mode": "w:gz"}, index=False)
  275. with gzip.open(file) as uncompressed:
  276. with tarfile.TarFile(fileobj=uncompressed) as archive:
  277. members = archive.getmembers()
  278. assert len(members) == 1
  279. content = archive.extractfile(members[0]).read().decode("utf8")
  280. if is_platform_windows():
  281. expected = "foo,bar\r\n1,2\r\n"
  282. else:
  283. expected = "foo,bar\n1,2\n"
  284. assert content == expected
  285. def test_tar_no_error_on_close():
  286. with io.BytesIO() as buffer:
  287. with icom._BytesTarFile(fileobj=buffer, mode="w"):
  288. pass