test_pickle.py 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590
"""
Manage legacy pickle tests.

How to add pickle tests:

1. Install the pandas version intended to output the pickle.

2. Execute "generate_legacy_storage_files.py" to create the pickle:
   $ python generate_legacy_storage_files.py <output_dir> pickle

3. Move the created pickle to the "data/legacy_pickle/<version>" directory.
"""
  9. from array import array
  10. import bz2
  11. import datetime
  12. import functools
  13. from functools import partial
  14. import glob
  15. import gzip
  16. import io
  17. import os
  18. from pathlib import Path
  19. import pickle
  20. import shutil
  21. import tarfile
  22. import uuid
  23. from warnings import catch_warnings
  24. import zipfile
  25. import numpy as np
  26. import pytest
  27. from pandas.compat import (
  28. get_lzma_file,
  29. is_platform_little_endian,
  30. )
  31. from pandas.compat._optional import import_optional_dependency
  32. from pandas.compat.compressors import flatten_buffer
  33. import pandas.util._test_decorators as td
  34. import pandas as pd
  35. from pandas import (
  36. Index,
  37. Series,
  38. period_range,
  39. )
  40. import pandas._testing as tm
  41. import pandas.io.common as icom
  42. from pandas.tseries.offsets import (
  43. Day,
  44. MonthEnd,
  45. )
  46. @pytest.fixture(scope="module")
  47. def current_pickle_data():
  48. # our current version pickle data
  49. from pandas.tests.io.generate_legacy_storage_files import create_pickle_data
  50. with catch_warnings():
  51. return create_pickle_data()
  52. # ---------------------
  53. # comparison functions
  54. # ---------------------
  55. def compare_element(result, expected, typ):
  56. if isinstance(expected, Index):
  57. tm.assert_index_equal(expected, result)
  58. return
  59. if typ.startswith("sp_"):
  60. tm.assert_equal(result, expected)
  61. elif typ == "timestamp":
  62. if expected is pd.NaT:
  63. assert result is pd.NaT
  64. else:
  65. assert result == expected
  66. else:
  67. comparator = getattr(tm, f"assert_{typ}_equal", tm.assert_almost_equal)
  68. comparator(result, expected)
  69. legacy_dirname = os.path.join(os.path.dirname(__file__), "data", "legacy_pickle")
  70. files = glob.glob(os.path.join(legacy_dirname, "*", "*.pickle"))
  71. @pytest.fixture(params=files)
  72. def legacy_pickle(request, datapath):
  73. return datapath(request.param)
  74. # ---------------------
  75. # tests
  76. # ---------------------
  77. @pytest.mark.parametrize(
  78. "data",
  79. [
  80. b"123",
  81. b"123456",
  82. bytearray(b"123"),
  83. memoryview(b"123"),
  84. pickle.PickleBuffer(b"123"),
  85. array("I", [1, 2, 3]),
  86. memoryview(b"123456").cast("B", (3, 2)),
  87. memoryview(b"123456").cast("B", (3, 2))[::2],
  88. np.arange(12).reshape((3, 4), order="C"),
  89. np.arange(12).reshape((3, 4), order="F"),
  90. np.arange(12).reshape((3, 4), order="C")[:, ::2],
  91. ],
  92. )
  93. def test_flatten_buffer(data):
  94. result = flatten_buffer(data)
  95. expected = memoryview(data).tobytes("A")
  96. assert result == expected
  97. if isinstance(data, (bytes, bytearray)):
  98. assert result is data
  99. elif isinstance(result, memoryview):
  100. assert result.ndim == 1
  101. assert result.format == "B"
  102. assert result.contiguous
  103. assert result.shape == (result.nbytes,)
  104. def test_pickles(legacy_pickle):
  105. if not is_platform_little_endian():
  106. pytest.skip("known failure on non-little endian")
  107. data = pd.read_pickle(legacy_pickle)
  108. for typ, dv in data.items():
  109. for dt, result in dv.items():
  110. expected = data[typ][dt]
  111. if typ == "series" and dt == "ts":
  112. # GH 7748
  113. tm.assert_series_equal(result, expected)
  114. assert result.index.freq == expected.index.freq
  115. assert not result.index.freq.normalize
  116. tm.assert_series_equal(result > 0, expected > 0)
  117. # GH 9291
  118. freq = result.index.freq
  119. assert freq + Day(1) == Day(2)
  120. res = freq + pd.Timedelta(hours=1)
  121. assert isinstance(res, pd.Timedelta)
  122. assert res == pd.Timedelta(days=1, hours=1)
  123. res = freq + pd.Timedelta(nanoseconds=1)
  124. assert isinstance(res, pd.Timedelta)
  125. assert res == pd.Timedelta(days=1, nanoseconds=1)
  126. elif typ == "index" and dt == "period":
  127. tm.assert_index_equal(result, expected)
  128. assert isinstance(result.freq, MonthEnd)
  129. assert result.freq == MonthEnd()
  130. assert result.freqstr == "M"
  131. tm.assert_index_equal(result.shift(2), expected.shift(2))
  132. elif typ == "series" and dt in ("dt_tz", "cat"):
  133. tm.assert_series_equal(result, expected)
  134. elif typ == "frame" and dt in (
  135. "dt_mixed_tzs",
  136. "cat_onecol",
  137. "cat_and_float",
  138. ):
  139. tm.assert_frame_equal(result, expected)
  140. else:
  141. compare_element(result, expected, typ)
  142. def python_pickler(obj, path):
  143. with open(path, "wb") as fh:
  144. pickle.dump(obj, fh, protocol=-1)
  145. def python_unpickler(path):
  146. with open(path, "rb") as fh:
  147. fh.seek(0)
  148. return pickle.load(fh)
  149. @pytest.mark.parametrize(
  150. "pickle_writer",
  151. [
  152. pytest.param(python_pickler, id="python"),
  153. pytest.param(pd.to_pickle, id="pandas_proto_default"),
  154. pytest.param(
  155. functools.partial(pd.to_pickle, protocol=pickle.HIGHEST_PROTOCOL),
  156. id="pandas_proto_highest",
  157. ),
  158. pytest.param(functools.partial(pd.to_pickle, protocol=4), id="pandas_proto_4"),
  159. pytest.param(
  160. functools.partial(pd.to_pickle, protocol=5),
  161. id="pandas_proto_5",
  162. ),
  163. ],
  164. )
  165. @pytest.mark.parametrize("writer", [pd.to_pickle, python_pickler])
  166. def test_round_trip_current(current_pickle_data, pickle_writer, writer):
  167. data = current_pickle_data
  168. for typ, dv in data.items():
  169. for dt, expected in dv.items():
  170. with tm.ensure_clean() as path:
  171. # test writing with each pickler
  172. pickle_writer(expected, path)
  173. # test reading with each unpickler
  174. result = pd.read_pickle(path)
  175. compare_element(result, expected, typ)
  176. result = python_unpickler(path)
  177. compare_element(result, expected, typ)
  178. # and the same for file objects (GH 35679)
  179. with open(path, mode="wb") as handle:
  180. writer(expected, path)
  181. handle.seek(0) # shouldn't close file handle
  182. with open(path, mode="rb") as handle:
  183. result = pd.read_pickle(handle)
  184. handle.seek(0) # shouldn't close file handle
  185. compare_element(result, expected, typ)
  186. def test_pickle_path_pathlib():
  187. df = tm.makeDataFrame()
  188. result = tm.round_trip_pathlib(df.to_pickle, pd.read_pickle)
  189. tm.assert_frame_equal(df, result)
  190. def test_pickle_path_localpath():
  191. df = tm.makeDataFrame()
  192. result = tm.round_trip_localpath(df.to_pickle, pd.read_pickle)
  193. tm.assert_frame_equal(df, result)
  194. # ---------------------
  195. # test pickle compression
  196. # ---------------------
  197. @pytest.fixture
  198. def get_random_path():
  199. return f"__{uuid.uuid4()}__.pickle"
  200. class TestCompression:
  201. _extension_to_compression = icom.extension_to_compression
  202. def compress_file(self, src_path, dest_path, compression):
  203. if compression is None:
  204. shutil.copyfile(src_path, dest_path)
  205. return
  206. if compression == "gzip":
  207. f = gzip.open(dest_path, "w")
  208. elif compression == "bz2":
  209. f = bz2.BZ2File(dest_path, "w")
  210. elif compression == "zip":
  211. with zipfile.ZipFile(dest_path, "w", compression=zipfile.ZIP_DEFLATED) as f:
  212. f.write(src_path, os.path.basename(src_path))
  213. elif compression == "tar":
  214. with open(src_path, "rb") as fh:
  215. with tarfile.open(dest_path, mode="w") as tar:
  216. tarinfo = tar.gettarinfo(src_path, os.path.basename(src_path))
  217. tar.addfile(tarinfo, fh)
  218. elif compression == "xz":
  219. f = get_lzma_file()(dest_path, "w")
  220. elif compression == "zstd":
  221. f = import_optional_dependency("zstandard").open(dest_path, "wb")
  222. else:
  223. msg = f"Unrecognized compression type: {compression}"
  224. raise ValueError(msg)
  225. if compression not in ["zip", "tar"]:
  226. with open(src_path, "rb") as fh:
  227. with f:
  228. f.write(fh.read())
  229. def test_write_explicit(self, compression, get_random_path):
  230. base = get_random_path
  231. path1 = base + ".compressed"
  232. path2 = base + ".raw"
  233. with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
  234. df = tm.makeDataFrame()
  235. # write to compressed file
  236. df.to_pickle(p1, compression=compression)
  237. # decompress
  238. with tm.decompress_file(p1, compression=compression) as f:
  239. with open(p2, "wb") as fh:
  240. fh.write(f.read())
  241. # read decompressed file
  242. df2 = pd.read_pickle(p2, compression=None)
  243. tm.assert_frame_equal(df, df2)
  244. @pytest.mark.parametrize("compression", ["", "None", "bad", "7z"])
  245. def test_write_explicit_bad(self, compression, get_random_path):
  246. with pytest.raises(ValueError, match="Unrecognized compression type"):
  247. with tm.ensure_clean(get_random_path) as path:
  248. df = tm.makeDataFrame()
  249. df.to_pickle(path, compression=compression)
  250. def test_write_infer(self, compression_ext, get_random_path):
  251. base = get_random_path
  252. path1 = base + compression_ext
  253. path2 = base + ".raw"
  254. compression = self._extension_to_compression.get(compression_ext.lower())
  255. with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
  256. df = tm.makeDataFrame()
  257. # write to compressed file by inferred compression method
  258. df.to_pickle(p1)
  259. # decompress
  260. with tm.decompress_file(p1, compression=compression) as f:
  261. with open(p2, "wb") as fh:
  262. fh.write(f.read())
  263. # read decompressed file
  264. df2 = pd.read_pickle(p2, compression=None)
  265. tm.assert_frame_equal(df, df2)
  266. def test_read_explicit(self, compression, get_random_path):
  267. base = get_random_path
  268. path1 = base + ".raw"
  269. path2 = base + ".compressed"
  270. with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
  271. df = tm.makeDataFrame()
  272. # write to uncompressed file
  273. df.to_pickle(p1, compression=None)
  274. # compress
  275. self.compress_file(p1, p2, compression=compression)
  276. # read compressed file
  277. df2 = pd.read_pickle(p2, compression=compression)
  278. tm.assert_frame_equal(df, df2)
  279. def test_read_infer(self, compression_ext, get_random_path):
  280. base = get_random_path
  281. path1 = base + ".raw"
  282. path2 = base + compression_ext
  283. compression = self._extension_to_compression.get(compression_ext.lower())
  284. with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2:
  285. df = tm.makeDataFrame()
  286. # write to uncompressed file
  287. df.to_pickle(p1, compression=None)
  288. # compress
  289. self.compress_file(p1, p2, compression=compression)
  290. # read compressed file by inferred compression method
  291. df2 = pd.read_pickle(p2)
  292. tm.assert_frame_equal(df, df2)
# ---------------------
# test pickle protocol
# ---------------------
  296. class TestProtocol:
  297. @pytest.mark.parametrize("protocol", [-1, 0, 1, 2])
  298. def test_read(self, protocol, get_random_path):
  299. with tm.ensure_clean(get_random_path) as path:
  300. df = tm.makeDataFrame()
  301. df.to_pickle(path, protocol=protocol)
  302. df2 = pd.read_pickle(path)
  303. tm.assert_frame_equal(df, df2)
  304. @pytest.mark.parametrize(
  305. ["pickle_file", "excols"],
  306. [
  307. ("test_py27.pkl", Index(["a", "b", "c"])),
  308. (
  309. "test_mi_py27.pkl",
  310. pd.MultiIndex.from_arrays([["a", "b", "c"], ["A", "B", "C"]]),
  311. ),
  312. ],
  313. )
  314. def test_unicode_decode_error(datapath, pickle_file, excols):
  315. # pickle file written with py27, should be readable without raising
  316. # UnicodeDecodeError, see GH#28645 and GH#31988
  317. path = datapath("io", "data", "pickle", pickle_file)
  318. df = pd.read_pickle(path)
  319. # just test the columns are correct since the values are random
  320. tm.assert_index_equal(df.columns, excols)
  321. # ---------------------
  322. # tests for buffer I/O
  323. # ---------------------
  324. def test_pickle_buffer_roundtrip():
  325. with tm.ensure_clean() as path:
  326. df = tm.makeDataFrame()
  327. with open(path, "wb") as fh:
  328. df.to_pickle(fh)
  329. with open(path, "rb") as fh:
  330. result = pd.read_pickle(fh)
  331. tm.assert_frame_equal(df, result)
  332. # ---------------------
  333. # tests for URL I/O
  334. # ---------------------
  335. @pytest.mark.parametrize(
  336. "mockurl", ["http://url.com", "ftp://test.com", "http://gzip.com"]
  337. )
  338. def test_pickle_generalurl_read(monkeypatch, mockurl):
  339. def python_pickler(obj, path):
  340. with open(path, "wb") as fh:
  341. pickle.dump(obj, fh, protocol=-1)
  342. class MockReadResponse:
  343. def __init__(self, path) -> None:
  344. self.file = open(path, "rb")
  345. if "gzip" in path:
  346. self.headers = {"Content-Encoding": "gzip"}
  347. else:
  348. self.headers = {"Content-Encoding": ""}
  349. def __enter__(self):
  350. return self
  351. def __exit__(self, *args):
  352. self.close()
  353. def read(self):
  354. return self.file.read()
  355. def close(self):
  356. return self.file.close()
  357. with tm.ensure_clean() as path:
  358. def mock_urlopen_read(*args, **kwargs):
  359. return MockReadResponse(path)
  360. df = tm.makeDataFrame()
  361. python_pickler(df, path)
  362. monkeypatch.setattr("urllib.request.urlopen", mock_urlopen_read)
  363. result = pd.read_pickle(mockurl)
  364. tm.assert_frame_equal(df, result)
  365. @td.skip_if_no("fsspec")
  366. def test_pickle_fsspec_roundtrip():
  367. with tm.ensure_clean():
  368. mockurl = "memory://afile"
  369. df = tm.makeDataFrame()
  370. df.to_pickle(mockurl)
  371. result = pd.read_pickle(mockurl)
  372. tm.assert_frame_equal(df, result)
  373. class MyTz(datetime.tzinfo):
  374. def __init__(self) -> None:
  375. pass
  376. def test_read_pickle_with_subclass():
  377. # GH 12163
  378. expected = Series(dtype=object), MyTz()
  379. result = tm.round_trip_pickle(expected)
  380. tm.assert_series_equal(result[0], expected[0])
  381. assert isinstance(result[1], MyTz)
  382. def test_pickle_binary_object_compression(compression):
  383. """
  384. Read/write from binary file-objects w/wo compression.
  385. GH 26237, GH 29054, and GH 29570
  386. """
  387. df = tm.makeDataFrame()
  388. # reference for compression
  389. with tm.ensure_clean() as path:
  390. df.to_pickle(path, compression=compression)
  391. reference = Path(path).read_bytes()
  392. # write
  393. buffer = io.BytesIO()
  394. df.to_pickle(buffer, compression=compression)
  395. buffer.seek(0)
  396. # gzip and zip safe the filename: cannot compare the compressed content
  397. assert buffer.getvalue() == reference or compression in ("gzip", "zip", "tar")
  398. # read
  399. read_df = pd.read_pickle(buffer, compression=compression)
  400. buffer.seek(0)
  401. tm.assert_frame_equal(df, read_df)
  402. def test_pickle_dataframe_with_multilevel_index(
  403. multiindex_year_month_day_dataframe_random_data,
  404. multiindex_dataframe_random_data,
  405. ):
  406. ymd = multiindex_year_month_day_dataframe_random_data
  407. frame = multiindex_dataframe_random_data
  408. def _test_roundtrip(frame):
  409. unpickled = tm.round_trip_pickle(frame)
  410. tm.assert_frame_equal(frame, unpickled)
  411. _test_roundtrip(frame)
  412. _test_roundtrip(frame.T)
  413. _test_roundtrip(ymd)
  414. _test_roundtrip(ymd.T)
  415. def test_pickle_timeseries_periodindex():
  416. # GH#2891
  417. prng = period_range("1/1/2011", "1/1/2012", freq="M")
  418. ts = Series(np.random.randn(len(prng)), prng)
  419. new_ts = tm.round_trip_pickle(ts)
  420. assert new_ts.index.freq == "M"
  421. @pytest.mark.parametrize(
  422. "name", [777, 777.0, "name", datetime.datetime(2001, 11, 11), (1, 2)]
  423. )
  424. def test_pickle_preserve_name(name):
  425. unpickled = tm.round_trip_pickle(tm.makeTimeSeries(name=name))
  426. assert unpickled.name == name
  427. def test_pickle_datetimes(datetime_series):
  428. unp_ts = tm.round_trip_pickle(datetime_series)
  429. tm.assert_series_equal(unp_ts, datetime_series)
  430. def test_pickle_strings(string_series):
  431. unp_series = tm.round_trip_pickle(string_series)
  432. tm.assert_series_equal(unp_series, string_series)
  433. @td.skip_array_manager_invalid_test
  434. def test_pickle_preserves_block_ndim():
  435. # GH#37631
  436. ser = Series(list("abc")).astype("category").iloc[[0]]
  437. res = tm.round_trip_pickle(ser)
  438. assert res._mgr.blocks[0].ndim == 1
  439. assert res._mgr.blocks[0].shape == (1,)
  440. # GH#37631 OP issue was about indexing, underlying problem was pickle
  441. tm.assert_series_equal(res[[True]], ser)
  442. @pytest.mark.parametrize("protocol", [pickle.DEFAULT_PROTOCOL, pickle.HIGHEST_PROTOCOL])
  443. def test_pickle_big_dataframe_compression(protocol, compression):
  444. # GH#39002
  445. df = pd.DataFrame(range(100000))
  446. result = tm.round_trip_pathlib(
  447. partial(df.to_pickle, protocol=protocol, compression=compression),
  448. partial(pd.read_pickle, compression=compression),
  449. )
  450. tm.assert_frame_equal(df, result)
  451. def test_pickle_frame_v124_unpickle_130():
  452. # GH#42345 DataFrame created in 1.2.x, unpickle in 1.3.x
  453. path = os.path.join(legacy_dirname, "1.2.4", "empty_frame_v1_2_4-GH#42345.pkl")
  454. with open(path, "rb") as fd:
  455. df = pickle.load(fd)
  456. expected = pd.DataFrame(index=[], columns=[])
  457. tm.assert_frame_equal(df, expected)