# test_file_handling.py — pandas HDFStore file-handling tests
  1. import os
  2. import numpy as np
  3. import pytest
  4. from pandas.compat import (
  5. PY311,
  6. is_ci_environment,
  7. is_platform_linux,
  8. is_platform_little_endian,
  9. )
  10. from pandas.errors import (
  11. ClosedFileError,
  12. PossibleDataLossError,
  13. )
  14. from pandas import (
  15. DataFrame,
  16. HDFStore,
  17. Series,
  18. _testing as tm,
  19. read_hdf,
  20. )
  21. from pandas.tests.io.pytables.common import (
  22. _maybe_remove,
  23. ensure_clean_store,
  24. tables,
  25. )
  26. from pandas.io import pytables
  27. from pandas.io.pytables import Term
  28. pytestmark = pytest.mark.single_cpu
  29. @pytest.mark.parametrize("mode", ["r", "r+", "a", "w"])
  30. def test_mode(setup_path, tmp_path, mode):
  31. df = tm.makeTimeDataFrame()
  32. msg = r"[\S]* does not exist"
  33. path = tmp_path / setup_path
  34. # constructor
  35. if mode in ["r", "r+"]:
  36. with pytest.raises(OSError, match=msg):
  37. HDFStore(path, mode=mode)
  38. else:
  39. with HDFStore(path, mode=mode) as store:
  40. assert store._handle.mode == mode
  41. path = tmp_path / setup_path
  42. # context
  43. if mode in ["r", "r+"]:
  44. with pytest.raises(OSError, match=msg):
  45. with HDFStore(path, mode=mode) as store:
  46. pass
  47. else:
  48. with HDFStore(path, mode=mode) as store:
  49. assert store._handle.mode == mode
  50. path = tmp_path / setup_path
  51. # conv write
  52. if mode in ["r", "r+"]:
  53. with pytest.raises(OSError, match=msg):
  54. df.to_hdf(path, "df", mode=mode)
  55. df.to_hdf(path, "df", mode="w")
  56. else:
  57. df.to_hdf(path, "df", mode=mode)
  58. # conv read
  59. if mode in ["w"]:
  60. msg = (
  61. "mode w is not allowed while performing a read. "
  62. r"Allowed modes are r, r\+ and a."
  63. )
  64. with pytest.raises(ValueError, match=msg):
  65. read_hdf(path, "df", mode=mode)
  66. else:
  67. result = read_hdf(path, "df", mode=mode)
  68. tm.assert_frame_equal(result, df)
  69. def test_default_mode(tmp_path, setup_path):
  70. # read_hdf uses default mode
  71. df = tm.makeTimeDataFrame()
  72. path = tmp_path / setup_path
  73. df.to_hdf(path, "df", mode="w")
  74. result = read_hdf(path, "df")
  75. tm.assert_frame_equal(result, df)
  76. def test_reopen_handle(tmp_path, setup_path):
  77. path = tmp_path / setup_path
  78. store = HDFStore(path, mode="a")
  79. store["a"] = tm.makeTimeSeries()
  80. msg = (
  81. r"Re-opening the file \[[\S]*\] with mode \[a\] will delete the "
  82. "current file!"
  83. )
  84. # invalid mode change
  85. with pytest.raises(PossibleDataLossError, match=msg):
  86. store.open("w")
  87. store.close()
  88. assert not store.is_open
  89. # truncation ok here
  90. store.open("w")
  91. assert store.is_open
  92. assert len(store) == 0
  93. store.close()
  94. assert not store.is_open
  95. store = HDFStore(path, mode="a")
  96. store["a"] = tm.makeTimeSeries()
  97. # reopen as read
  98. store.open("r")
  99. assert store.is_open
  100. assert len(store) == 1
  101. assert store._mode == "r"
  102. store.close()
  103. assert not store.is_open
  104. # reopen as append
  105. store.open("a")
  106. assert store.is_open
  107. assert len(store) == 1
  108. assert store._mode == "a"
  109. store.close()
  110. assert not store.is_open
  111. # reopen as append (again)
  112. store.open("a")
  113. assert store.is_open
  114. assert len(store) == 1
  115. assert store._mode == "a"
  116. store.close()
  117. assert not store.is_open
  118. def test_open_args(setup_path):
  119. with tm.ensure_clean(setup_path) as path:
  120. df = tm.makeDataFrame()
  121. # create an in memory store
  122. store = HDFStore(
  123. path, mode="a", driver="H5FD_CORE", driver_core_backing_store=0
  124. )
  125. store["df"] = df
  126. store.append("df2", df)
  127. tm.assert_frame_equal(store["df"], df)
  128. tm.assert_frame_equal(store["df2"], df)
  129. store.close()
  130. # the file should not have actually been written
  131. assert not os.path.exists(path)
  132. def test_flush(setup_path):
  133. with ensure_clean_store(setup_path) as store:
  134. store["a"] = tm.makeTimeSeries()
  135. store.flush()
  136. store.flush(fsync=True)
  137. def test_complibs_default_settings(tmp_path, setup_path):
  138. # GH15943
  139. df = tm.makeDataFrame()
  140. # Set complevel and check if complib is automatically set to
  141. # default value
  142. tmpfile = tmp_path / setup_path
  143. df.to_hdf(tmpfile, "df", complevel=9)
  144. result = read_hdf(tmpfile, "df")
  145. tm.assert_frame_equal(result, df)
  146. with tables.open_file(tmpfile, mode="r") as h5file:
  147. for node in h5file.walk_nodes(where="/df", classname="Leaf"):
  148. assert node.filters.complevel == 9
  149. assert node.filters.complib == "zlib"
  150. # Set complib and check to see if compression is disabled
  151. tmpfile = tmp_path / setup_path
  152. df.to_hdf(tmpfile, "df", complib="zlib")
  153. result = read_hdf(tmpfile, "df")
  154. tm.assert_frame_equal(result, df)
  155. with tables.open_file(tmpfile, mode="r") as h5file:
  156. for node in h5file.walk_nodes(where="/df", classname="Leaf"):
  157. assert node.filters.complevel == 0
  158. assert node.filters.complib is None
  159. # Check if not setting complib or complevel results in no compression
  160. tmpfile = tmp_path / setup_path
  161. df.to_hdf(tmpfile, "df")
  162. result = read_hdf(tmpfile, "df")
  163. tm.assert_frame_equal(result, df)
  164. with tables.open_file(tmpfile, mode="r") as h5file:
  165. for node in h5file.walk_nodes(where="/df", classname="Leaf"):
  166. assert node.filters.complevel == 0
  167. assert node.filters.complib is None
  168. def test_complibs_default_settings_override(tmp_path, setup_path):
  169. # Check if file-defaults can be overridden on a per table basis
  170. df = tm.makeDataFrame()
  171. tmpfile = tmp_path / setup_path
  172. store = HDFStore(tmpfile)
  173. store.append("dfc", df, complevel=9, complib="blosc")
  174. store.append("df", df)
  175. store.close()
  176. with tables.open_file(tmpfile, mode="r") as h5file:
  177. for node in h5file.walk_nodes(where="/df", classname="Leaf"):
  178. assert node.filters.complevel == 0
  179. assert node.filters.complib is None
  180. for node in h5file.walk_nodes(where="/dfc", classname="Leaf"):
  181. assert node.filters.complevel == 9
  182. assert node.filters.complib == "blosc"
  183. @pytest.mark.parametrize("lvl", range(10))
  184. @pytest.mark.parametrize("lib", tables.filters.all_complibs)
  185. @pytest.mark.filterwarnings("ignore:object name is not a valid")
  186. @pytest.mark.skipif(
  187. not PY311 and is_ci_environment() and is_platform_linux(),
  188. reason="Segfaulting in a CI environment"
  189. # with xfail, would sometimes raise UnicodeDecodeError
  190. # invalid state byte
  191. )
  192. def test_complibs(tmp_path, lvl, lib):
  193. # GH14478
  194. df = DataFrame(
  195. np.ones((30, 4)), columns=list("ABCD"), index=np.arange(30).astype(np.str_)
  196. )
  197. # Remove lzo if its not available on this platform
  198. if not tables.which_lib_version("lzo"):
  199. pytest.skip("lzo not available")
  200. # Remove bzip2 if its not available on this platform
  201. if not tables.which_lib_version("bzip2"):
  202. pytest.skip("bzip2 not available")
  203. tmpfile = tmp_path / f"{lvl}_{lib}.h5"
  204. gname = f"{lvl}_{lib}"
  205. # Write and read file to see if data is consistent
  206. df.to_hdf(tmpfile, gname, complib=lib, complevel=lvl)
  207. result = read_hdf(tmpfile, gname)
  208. tm.assert_frame_equal(result, df)
  209. # Open file and check metadata for correct amount of compression
  210. with tables.open_file(tmpfile, mode="r") as h5table:
  211. for node in h5table.walk_nodes(where="/" + gname, classname="Leaf"):
  212. assert node.filters.complevel == lvl
  213. if lvl == 0:
  214. assert node.filters.complib is None
  215. else:
  216. assert node.filters.complib == lib
  217. @pytest.mark.skipif(
  218. not is_platform_little_endian(), reason="reason platform is not little endian"
  219. )
  220. def test_encoding(setup_path):
  221. with ensure_clean_store(setup_path) as store:
  222. df = DataFrame({"A": "foo", "B": "bar"}, index=range(5))
  223. df.loc[2, "A"] = np.nan
  224. df.loc[3, "B"] = np.nan
  225. _maybe_remove(store, "df")
  226. store.append("df", df, encoding="ascii")
  227. tm.assert_frame_equal(store["df"], df)
  228. expected = df.reindex(columns=["A"])
  229. result = store.select("df", Term("columns=A", encoding="ascii"))
  230. tm.assert_frame_equal(result, expected)
  231. @pytest.mark.parametrize(
  232. "val",
  233. [
  234. [b"E\xc9, 17", b"", b"a", b"b", b"c"],
  235. [b"E\xc9, 17", b"a", b"b", b"c"],
  236. [b"EE, 17", b"", b"a", b"b", b"c"],
  237. [b"E\xc9, 17", b"\xf8\xfc", b"a", b"b", b"c"],
  238. [b"", b"a", b"b", b"c"],
  239. [b"\xf8\xfc", b"a", b"b", b"c"],
  240. [b"A\xf8\xfc", b"", b"a", b"b", b"c"],
  241. [np.nan, b"", b"b", b"c"],
  242. [b"A\xf8\xfc", np.nan, b"", b"b", b"c"],
  243. ],
  244. )
  245. @pytest.mark.parametrize("dtype", ["category", object])
  246. def test_latin_encoding(tmp_path, setup_path, dtype, val):
  247. enc = "latin-1"
  248. nan_rep = ""
  249. key = "data"
  250. val = [x.decode(enc) if isinstance(x, bytes) else x for x in val]
  251. ser = Series(val, dtype=dtype)
  252. store = tmp_path / setup_path
  253. ser.to_hdf(store, key, format="table", encoding=enc, nan_rep=nan_rep)
  254. retr = read_hdf(store, key)
  255. s_nan = ser.replace(nan_rep, np.nan)
  256. tm.assert_series_equal(s_nan, retr)
  257. def test_multiple_open_close(tmp_path, setup_path):
  258. # gh-4409: open & close multiple times
  259. path = tmp_path / setup_path
  260. df = tm.makeDataFrame()
  261. df.to_hdf(path, "df", mode="w", format="table")
  262. # single
  263. store = HDFStore(path)
  264. assert "CLOSED" not in store.info()
  265. assert store.is_open
  266. store.close()
  267. assert "CLOSED" in store.info()
  268. assert not store.is_open
  269. path = tmp_path / setup_path
  270. if pytables._table_file_open_policy_is_strict:
  271. # multiples
  272. store1 = HDFStore(path)
  273. msg = (
  274. r"The file [\S]* is already opened\. Please close it before "
  275. r"reopening in write mode\."
  276. )
  277. with pytest.raises(ValueError, match=msg):
  278. HDFStore(path)
  279. store1.close()
  280. else:
  281. # multiples
  282. store1 = HDFStore(path)
  283. store2 = HDFStore(path)
  284. assert "CLOSED" not in store1.info()
  285. assert "CLOSED" not in store2.info()
  286. assert store1.is_open
  287. assert store2.is_open
  288. store1.close()
  289. assert "CLOSED" in store1.info()
  290. assert not store1.is_open
  291. assert "CLOSED" not in store2.info()
  292. assert store2.is_open
  293. store2.close()
  294. assert "CLOSED" in store1.info()
  295. assert "CLOSED" in store2.info()
  296. assert not store1.is_open
  297. assert not store2.is_open
  298. # nested close
  299. store = HDFStore(path, mode="w")
  300. store.append("df", df)
  301. store2 = HDFStore(path)
  302. store2.append("df2", df)
  303. store2.close()
  304. assert "CLOSED" in store2.info()
  305. assert not store2.is_open
  306. store.close()
  307. assert "CLOSED" in store.info()
  308. assert not store.is_open
  309. # double closing
  310. store = HDFStore(path, mode="w")
  311. store.append("df", df)
  312. store2 = HDFStore(path)
  313. store.close()
  314. assert "CLOSED" in store.info()
  315. assert not store.is_open
  316. store2.close()
  317. assert "CLOSED" in store2.info()
  318. assert not store2.is_open
  319. # ops on a closed store
  320. path = tmp_path / setup_path
  321. df = tm.makeDataFrame()
  322. df.to_hdf(path, "df", mode="w", format="table")
  323. store = HDFStore(path)
  324. store.close()
  325. msg = r"[\S]* file is not open!"
  326. with pytest.raises(ClosedFileError, match=msg):
  327. store.keys()
  328. with pytest.raises(ClosedFileError, match=msg):
  329. "df" in store
  330. with pytest.raises(ClosedFileError, match=msg):
  331. len(store)
  332. with pytest.raises(ClosedFileError, match=msg):
  333. store["df"]
  334. with pytest.raises(ClosedFileError, match=msg):
  335. store.select("df")
  336. with pytest.raises(ClosedFileError, match=msg):
  337. store.get("df")
  338. with pytest.raises(ClosedFileError, match=msg):
  339. store.append("df2", df)
  340. with pytest.raises(ClosedFileError, match=msg):
  341. store.put("df3", df)
  342. with pytest.raises(ClosedFileError, match=msg):
  343. store.get_storer("df2")
  344. with pytest.raises(ClosedFileError, match=msg):
  345. store.remove("df2")
  346. with pytest.raises(ClosedFileError, match=msg):
  347. store.select("df")
  348. msg = "'HDFStore' object has no attribute 'df'"
  349. with pytest.raises(AttributeError, match=msg):
  350. store.df
  351. def test_fspath():
  352. with tm.ensure_clean("foo.h5") as path:
  353. with HDFStore(path) as store:
  354. assert os.fspath(store) == str(path)