# test_read.py
  1. from contextlib import closing
  2. from pathlib import Path
  3. import re
  4. import numpy as np
  5. import pytest
  6. from pandas._libs.tslibs import Timestamp
  7. from pandas.compat import is_platform_windows
  8. import pandas as pd
  9. from pandas import (
  10. DataFrame,
  11. HDFStore,
  12. Index,
  13. Series,
  14. _testing as tm,
  15. read_hdf,
  16. )
  17. from pandas.tests.io.pytables.common import (
  18. _maybe_remove,
  19. ensure_clean_store,
  20. )
  21. from pandas.util import _test_decorators as td
  22. from pandas.io.pytables import TableIterator
  23. pytestmark = pytest.mark.single_cpu
  24. def test_read_missing_key_close_store(tmp_path, setup_path):
  25. # GH 25766
  26. path = tmp_path / setup_path
  27. df = DataFrame({"a": range(2), "b": range(2)})
  28. df.to_hdf(path, "k1")
  29. with pytest.raises(KeyError, match="'No object named k2 in the file'"):
  30. read_hdf(path, "k2")
  31. # smoke test to test that file is properly closed after
  32. # read with KeyError before another write
  33. df.to_hdf(path, "k2")
  34. def test_read_missing_key_opened_store(tmp_path, setup_path):
  35. # GH 28699
  36. path = tmp_path / setup_path
  37. df = DataFrame({"a": range(2), "b": range(2)})
  38. df.to_hdf(path, "k1")
  39. with HDFStore(path, "r") as store:
  40. with pytest.raises(KeyError, match="'No object named k2 in the file'"):
  41. read_hdf(store, "k2")
  42. # Test that the file is still open after a KeyError and that we can
  43. # still read from it.
  44. read_hdf(store, "k1")
  45. def test_read_column(setup_path):
  46. df = tm.makeTimeDataFrame()
  47. with ensure_clean_store(setup_path) as store:
  48. _maybe_remove(store, "df")
  49. # GH 17912
  50. # HDFStore.select_column should raise a KeyError
  51. # exception if the key is not a valid store
  52. with pytest.raises(KeyError, match="No object named df in the file"):
  53. store.select_column("df", "index")
  54. store.append("df", df)
  55. # error
  56. with pytest.raises(
  57. KeyError, match=re.escape("'column [foo] not found in the table'")
  58. ):
  59. store.select_column("df", "foo")
  60. msg = re.escape("select_column() got an unexpected keyword argument 'where'")
  61. with pytest.raises(TypeError, match=msg):
  62. store.select_column("df", "index", where=["index>5"])
  63. # valid
  64. result = store.select_column("df", "index")
  65. tm.assert_almost_equal(result.values, Series(df.index).values)
  66. assert isinstance(result, Series)
  67. # not a data indexable column
  68. msg = re.escape(
  69. "column [values_block_0] can not be extracted individually; "
  70. "it is not data indexable"
  71. )
  72. with pytest.raises(ValueError, match=msg):
  73. store.select_column("df", "values_block_0")
  74. # a data column
  75. df2 = df.copy()
  76. df2["string"] = "foo"
  77. store.append("df2", df2, data_columns=["string"])
  78. result = store.select_column("df2", "string")
  79. tm.assert_almost_equal(result.values, df2["string"].values)
  80. # a data column with NaNs, result excludes the NaNs
  81. df3 = df.copy()
  82. df3["string"] = "foo"
  83. df3.loc[df3.index[4:6], "string"] = np.nan
  84. store.append("df3", df3, data_columns=["string"])
  85. result = store.select_column("df3", "string")
  86. tm.assert_almost_equal(result.values, df3["string"].values)
  87. # start/stop
  88. result = store.select_column("df3", "string", start=2)
  89. tm.assert_almost_equal(result.values, df3["string"].values[2:])
  90. result = store.select_column("df3", "string", start=-2)
  91. tm.assert_almost_equal(result.values, df3["string"].values[-2:])
  92. result = store.select_column("df3", "string", stop=2)
  93. tm.assert_almost_equal(result.values, df3["string"].values[:2])
  94. result = store.select_column("df3", "string", stop=-2)
  95. tm.assert_almost_equal(result.values, df3["string"].values[:-2])
  96. result = store.select_column("df3", "string", start=2, stop=-2)
  97. tm.assert_almost_equal(result.values, df3["string"].values[2:-2])
  98. result = store.select_column("df3", "string", start=-2, stop=2)
  99. tm.assert_almost_equal(result.values, df3["string"].values[-2:2])
  100. # GH 10392 - make sure column name is preserved
  101. df4 = DataFrame({"A": np.random.randn(10), "B": "foo"})
  102. store.append("df4", df4, data_columns=True)
  103. expected = df4["B"]
  104. result = store.select_column("df4", "B")
  105. tm.assert_series_equal(result, expected)
  106. def test_pytables_native_read(datapath):
  107. with ensure_clean_store(
  108. datapath("io", "data", "legacy_hdf/pytables_native.h5"), mode="r"
  109. ) as store:
  110. d2 = store["detector/readout"]
  111. assert isinstance(d2, DataFrame)
  112. @pytest.mark.skipif(is_platform_windows(), reason="native2 read fails oddly on windows")
  113. def test_pytables_native2_read(datapath):
  114. with ensure_clean_store(
  115. datapath("io", "data", "legacy_hdf", "pytables_native2.h5"), mode="r"
  116. ) as store:
  117. str(store)
  118. d1 = store["detector"]
  119. assert isinstance(d1, DataFrame)
  120. def test_legacy_table_fixed_format_read_py2(datapath):
  121. # GH 24510
  122. # legacy table with fixed format written in Python 2
  123. with ensure_clean_store(
  124. datapath("io", "data", "legacy_hdf", "legacy_table_fixed_py2.h5"), mode="r"
  125. ) as store:
  126. result = store.select("df")
  127. expected = DataFrame(
  128. [[1, 2, 3, "D"]],
  129. columns=["A", "B", "C", "D"],
  130. index=Index(["ABC"], name="INDEX_NAME"),
  131. )
  132. tm.assert_frame_equal(expected, result)
  133. def test_legacy_table_fixed_format_read_datetime_py2(datapath):
  134. # GH 31750
  135. # legacy table with fixed format and datetime64 column written in Python 2
  136. with ensure_clean_store(
  137. datapath("io", "data", "legacy_hdf", "legacy_table_fixed_datetime_py2.h5"),
  138. mode="r",
  139. ) as store:
  140. result = store.select("df")
  141. expected = DataFrame(
  142. [[Timestamp("2020-02-06T18:00")]],
  143. columns=["A"],
  144. index=Index(["date"]),
  145. )
  146. tm.assert_frame_equal(expected, result)
  147. def test_legacy_table_read_py2(datapath):
  148. # issue: 24925
  149. # legacy table written in Python 2
  150. with ensure_clean_store(
  151. datapath("io", "data", "legacy_hdf", "legacy_table_py2.h5"), mode="r"
  152. ) as store:
  153. result = store.select("table")
  154. expected = DataFrame({"a": ["a", "b"], "b": [2, 3]})
  155. tm.assert_frame_equal(expected, result)
  156. def test_read_hdf_open_store(tmp_path, setup_path):
  157. # GH10330
  158. # No check for non-string path_or-buf, and no test of open store
  159. df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
  160. df.index.name = "letters"
  161. df = df.set_index(keys="E", append=True)
  162. path = tmp_path / setup_path
  163. df.to_hdf(path, "df", mode="w")
  164. direct = read_hdf(path, "df")
  165. with HDFStore(path, mode="r") as store:
  166. indirect = read_hdf(store, "df")
  167. tm.assert_frame_equal(direct, indirect)
  168. assert store.is_open
  169. def test_read_hdf_index_not_view(tmp_path, setup_path):
  170. # GH 37441
  171. # Ensure that the index of the DataFrame is not a view
  172. # into the original recarray that pytables reads in
  173. df = DataFrame(np.random.rand(4, 5), index=[0, 1, 2, 3], columns=list("ABCDE"))
  174. path = tmp_path / setup_path
  175. df.to_hdf(path, "df", mode="w", format="table")
  176. df2 = read_hdf(path, "df")
  177. assert df2.index._data.base is None
  178. tm.assert_frame_equal(df, df2)
  179. def test_read_hdf_iterator(tmp_path, setup_path):
  180. df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
  181. df.index.name = "letters"
  182. df = df.set_index(keys="E", append=True)
  183. path = tmp_path / setup_path
  184. df.to_hdf(path, "df", mode="w", format="t")
  185. direct = read_hdf(path, "df")
  186. iterator = read_hdf(path, "df", iterator=True)
  187. with closing(iterator.store):
  188. assert isinstance(iterator, TableIterator)
  189. indirect = next(iterator.__iter__())
  190. tm.assert_frame_equal(direct, indirect)
  191. def test_read_nokey(tmp_path, setup_path):
  192. # GH10443
  193. df = DataFrame(np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE"))
  194. # Categorical dtype not supported for "fixed" format. So no need
  195. # to test with that dtype in the dataframe here.
  196. path = tmp_path / setup_path
  197. df.to_hdf(path, "df", mode="a")
  198. reread = read_hdf(path)
  199. tm.assert_frame_equal(df, reread)
  200. df.to_hdf(path, "df2", mode="a")
  201. msg = "key must be provided when HDF5 file contains multiple datasets."
  202. with pytest.raises(ValueError, match=msg):
  203. read_hdf(path)
  204. def test_read_nokey_table(tmp_path, setup_path):
  205. # GH13231
  206. df = DataFrame({"i": range(5), "c": Series(list("abacd"), dtype="category")})
  207. path = tmp_path / setup_path
  208. df.to_hdf(path, "df", mode="a", format="table")
  209. reread = read_hdf(path)
  210. tm.assert_frame_equal(df, reread)
  211. df.to_hdf(path, "df2", mode="a", format="table")
  212. msg = "key must be provided when HDF5 file contains multiple datasets."
  213. with pytest.raises(ValueError, match=msg):
  214. read_hdf(path)
  215. def test_read_nokey_empty(tmp_path, setup_path):
  216. path = tmp_path / setup_path
  217. store = HDFStore(path)
  218. store.close()
  219. msg = re.escape(
  220. "Dataset(s) incompatible with Pandas data types, not table, or no "
  221. "datasets found in HDF5 file."
  222. )
  223. with pytest.raises(ValueError, match=msg):
  224. read_hdf(path)
  225. def test_read_from_pathlib_path(tmp_path, setup_path):
  226. # GH11773
  227. expected = DataFrame(
  228. np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")
  229. )
  230. filename = tmp_path / setup_path
  231. path_obj = Path(filename)
  232. expected.to_hdf(path_obj, "df", mode="a")
  233. actual = read_hdf(path_obj, "df")
  234. tm.assert_frame_equal(expected, actual)
  235. @td.skip_if_no("py.path")
  236. def test_read_from_py_localpath(tmp_path, setup_path):
  237. # GH11773
  238. from py.path import local as LocalPath
  239. expected = DataFrame(
  240. np.random.rand(4, 5), index=list("abcd"), columns=list("ABCDE")
  241. )
  242. filename = tmp_path / setup_path
  243. path_obj = LocalPath(filename)
  244. expected.to_hdf(path_obj, "df", mode="a")
  245. actual = read_hdf(path_obj, "df")
  246. tm.assert_frame_equal(expected, actual)
  247. @pytest.mark.parametrize("format", ["fixed", "table"])
  248. def test_read_hdf_series_mode_r(tmp_path, format, setup_path):
  249. # GH 16583
  250. # Tests that reading a Series saved to an HDF file
  251. # still works if a mode='r' argument is supplied
  252. series = tm.makeFloatSeries()
  253. path = tmp_path / setup_path
  254. series.to_hdf(path, key="data", format=format)
  255. result = read_hdf(path, key="data", mode="r")
  256. tm.assert_series_equal(result, series)
  257. def test_read_py2_hdf_file_in_py3(datapath):
  258. # GH 16781
  259. # tests reading a PeriodIndex DataFrame written in Python2 in Python3
  260. # the file was generated in Python 2.7 like so:
  261. #
  262. # df = DataFrame([1.,2,3], index=pd.PeriodIndex(
  263. # ['2015-01-01', '2015-01-02', '2015-01-05'], freq='B'))
  264. # df.to_hdf('periodindex_0.20.1_x86_64_darwin_2.7.13.h5', 'p')
  265. expected = DataFrame(
  266. [1.0, 2, 3],
  267. index=pd.PeriodIndex(["2015-01-01", "2015-01-02", "2015-01-05"], freq="B"),
  268. )
  269. with ensure_clean_store(
  270. datapath(
  271. "io", "data", "legacy_hdf", "periodindex_0.20.1_x86_64_darwin_2.7.13.h5"
  272. ),
  273. mode="r",
  274. ) as store:
  275. result = store["p"]
  276. tm.assert_frame_equal(result, expected)