test_put.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. import datetime
  2. import re
  3. from warnings import (
  4. catch_warnings,
  5. simplefilter,
  6. )
  7. import numpy as np
  8. import pytest
  9. from pandas._libs.tslibs import Timestamp
  10. import pandas as pd
  11. from pandas import (
  12. DataFrame,
  13. HDFStore,
  14. Index,
  15. MultiIndex,
  16. Series,
  17. _testing as tm,
  18. concat,
  19. )
  20. from pandas.tests.io.pytables.common import (
  21. _maybe_remove,
  22. ensure_clean_store,
  23. )
  24. from pandas.util import _test_decorators as td
# Module-level pytest marker: tags every test collected from this file with
# ``single_cpu``.  NOTE(review): how that marker is scheduled is decided by the
# project's pytest configuration, which is not visible here.
pytestmark = pytest.mark.single_cpu
  26. def test_format_type(tmp_path, setup_path):
  27. df = DataFrame({"A": [1, 2]})
  28. with HDFStore(tmp_path / setup_path) as store:
  29. store.put("a", df, format="fixed")
  30. store.put("b", df, format="table")
  31. assert store.get_storer("a").format_type == "fixed"
  32. assert store.get_storer("b").format_type == "table"
  33. def test_format_kwarg_in_constructor(tmp_path, setup_path):
  34. # GH 13291
  35. msg = "format is not a defined argument for HDFStore"
  36. with pytest.raises(ValueError, match=msg):
  37. HDFStore(tmp_path / setup_path, format="table")
  38. def test_api_default_format(tmp_path, setup_path):
  39. # default_format option
  40. with ensure_clean_store(setup_path) as store:
  41. df = tm.makeDataFrame()
  42. with pd.option_context("io.hdf.default_format", "fixed"):
  43. _maybe_remove(store, "df")
  44. store.put("df", df)
  45. assert not store.get_storer("df").is_table
  46. msg = "Can only append to Tables"
  47. with pytest.raises(ValueError, match=msg):
  48. store.append("df2", df)
  49. with pd.option_context("io.hdf.default_format", "table"):
  50. _maybe_remove(store, "df")
  51. store.put("df", df)
  52. assert store.get_storer("df").is_table
  53. _maybe_remove(store, "df2")
  54. store.append("df2", df)
  55. assert store.get_storer("df").is_table
  56. path = tmp_path / setup_path
  57. df = tm.makeDataFrame()
  58. with pd.option_context("io.hdf.default_format", "fixed"):
  59. df.to_hdf(path, "df")
  60. with HDFStore(path) as store:
  61. assert not store.get_storer("df").is_table
  62. with pytest.raises(ValueError, match=msg):
  63. df.to_hdf(path, "df2", append=True)
  64. with pd.option_context("io.hdf.default_format", "table"):
  65. df.to_hdf(path, "df3")
  66. with HDFStore(path) as store:
  67. assert store.get_storer("df3").is_table
  68. df.to_hdf(path, "df4", append=True)
  69. with HDFStore(path) as store:
  70. assert store.get_storer("df4").is_table
  71. def test_put(setup_path):
  72. with ensure_clean_store(setup_path) as store:
  73. ts = tm.makeTimeSeries()
  74. df = tm.makeTimeDataFrame()
  75. store["a"] = ts
  76. store["b"] = df[:10]
  77. store["foo/bar/bah"] = df[:10]
  78. store["foo"] = df[:10]
  79. store["/foo"] = df[:10]
  80. store.put("c", df[:10], format="table")
  81. # not OK, not a table
  82. msg = "Can only append to Tables"
  83. with pytest.raises(ValueError, match=msg):
  84. store.put("b", df[10:], append=True)
  85. # node does not currently exist, test _is_table_type returns False
  86. # in this case
  87. _maybe_remove(store, "f")
  88. with pytest.raises(ValueError, match=msg):
  89. store.put("f", df[10:], append=True)
  90. # can't put to a table (use append instead)
  91. with pytest.raises(ValueError, match=msg):
  92. store.put("c", df[10:], append=True)
  93. # overwrite table
  94. store.put("c", df[:10], format="table", append=False)
  95. tm.assert_frame_equal(df[:10], store["c"])
  96. def test_put_string_index(setup_path):
  97. with ensure_clean_store(setup_path) as store:
  98. index = Index([f"I am a very long string index: {i}" for i in range(20)])
  99. s = Series(np.arange(20), index=index)
  100. df = DataFrame({"A": s, "B": s})
  101. store["a"] = s
  102. tm.assert_series_equal(store["a"], s)
  103. store["b"] = df
  104. tm.assert_frame_equal(store["b"], df)
  105. # mixed length
  106. index = Index(
  107. ["abcdefghijklmnopqrstuvwxyz1234567890"]
  108. + [f"I am a very long string index: {i}" for i in range(20)]
  109. )
  110. s = Series(np.arange(21), index=index)
  111. df = DataFrame({"A": s, "B": s})
  112. store["a"] = s
  113. tm.assert_series_equal(store["a"], s)
  114. store["b"] = df
  115. tm.assert_frame_equal(store["b"], df)
  116. def test_put_compression(setup_path):
  117. with ensure_clean_store(setup_path) as store:
  118. df = tm.makeTimeDataFrame()
  119. store.put("c", df, format="table", complib="zlib")
  120. tm.assert_frame_equal(store["c"], df)
  121. # can't compress if format='fixed'
  122. msg = "Compression not supported on Fixed format stores"
  123. with pytest.raises(ValueError, match=msg):
  124. store.put("b", df, format="fixed", complib="zlib")
  125. @td.skip_if_windows
  126. def test_put_compression_blosc(setup_path):
  127. df = tm.makeTimeDataFrame()
  128. with ensure_clean_store(setup_path) as store:
  129. # can't compress if format='fixed'
  130. msg = "Compression not supported on Fixed format stores"
  131. with pytest.raises(ValueError, match=msg):
  132. store.put("b", df, format="fixed", complib="blosc")
  133. store.put("c", df, format="table", complib="blosc")
  134. tm.assert_frame_equal(store["c"], df)
  135. def test_put_mixed_type(setup_path):
  136. df = tm.makeTimeDataFrame()
  137. df["obj1"] = "foo"
  138. df["obj2"] = "bar"
  139. df["bool1"] = df["A"] > 0
  140. df["bool2"] = df["B"] > 0
  141. df["bool3"] = True
  142. df["int1"] = 1
  143. df["int2"] = 2
  144. df["timestamp1"] = Timestamp("20010102")
  145. df["timestamp2"] = Timestamp("20010103")
  146. df["datetime1"] = datetime.datetime(2001, 1, 2, 0, 0)
  147. df["datetime2"] = datetime.datetime(2001, 1, 3, 0, 0)
  148. df.loc[df.index[3:6], ["obj1"]] = np.nan
  149. df = df._consolidate()
  150. with ensure_clean_store(setup_path) as store:
  151. _maybe_remove(store, "df")
  152. # PerformanceWarning
  153. with catch_warnings(record=True):
  154. simplefilter("ignore", pd.errors.PerformanceWarning)
  155. store.put("df", df)
  156. expected = store.get("df")
  157. tm.assert_frame_equal(expected, df)
  158. @pytest.mark.parametrize(
  159. "format, index",
  160. [
  161. ["table", tm.makeFloatIndex],
  162. ["table", tm.makeStringIndex],
  163. ["table", tm.makeIntIndex],
  164. ["table", tm.makeDateIndex],
  165. ["fixed", tm.makeFloatIndex],
  166. ["fixed", tm.makeStringIndex],
  167. ["fixed", tm.makeIntIndex],
  168. ["fixed", tm.makeDateIndex],
  169. ["table", tm.makePeriodIndex], # GH#7796
  170. ["fixed", tm.makePeriodIndex],
  171. ],
  172. )
  173. def test_store_index_types(setup_path, format, index):
  174. # GH5386
  175. # test storing various index types
  176. with ensure_clean_store(setup_path) as store:
  177. df = DataFrame(np.random.randn(10, 2), columns=list("AB"))
  178. df.index = index(len(df))
  179. _maybe_remove(store, "df")
  180. store.put("df", df, format=format)
  181. tm.assert_frame_equal(df, store["df"])
  182. def test_column_multiindex(setup_path):
  183. # GH 4710
  184. # recreate multi-indexes properly
  185. index = MultiIndex.from_tuples(
  186. [("A", "a"), ("A", "b"), ("B", "a"), ("B", "b")], names=["first", "second"]
  187. )
  188. df = DataFrame(np.arange(12).reshape(3, 4), columns=index)
  189. expected = df.set_axis(df.index.to_numpy())
  190. with ensure_clean_store(setup_path) as store:
  191. store.put("df", df)
  192. tm.assert_frame_equal(
  193. store["df"], expected, check_index_type=True, check_column_type=True
  194. )
  195. store.put("df1", df, format="table")
  196. tm.assert_frame_equal(
  197. store["df1"], expected, check_index_type=True, check_column_type=True
  198. )
  199. msg = re.escape("cannot use a multi-index on axis [1] with data_columns ['A']")
  200. with pytest.raises(ValueError, match=msg):
  201. store.put("df2", df, format="table", data_columns=["A"])
  202. msg = re.escape("cannot use a multi-index on axis [1] with data_columns True")
  203. with pytest.raises(ValueError, match=msg):
  204. store.put("df3", df, format="table", data_columns=True)
  205. # appending multi-column on existing table (see GH 6167)
  206. with ensure_clean_store(setup_path) as store:
  207. store.append("df2", df)
  208. store.append("df2", df)
  209. tm.assert_frame_equal(store["df2"], concat((df, df)))
  210. # non_index_axes name
  211. df = DataFrame(np.arange(12).reshape(3, 4), columns=Index(list("ABCD"), name="foo"))
  212. expected = df.set_axis(df.index.to_numpy())
  213. with ensure_clean_store(setup_path) as store:
  214. store.put("df1", df, format="table")
  215. tm.assert_frame_equal(
  216. store["df1"], expected, check_index_type=True, check_column_type=True
  217. )
  218. def test_store_multiindex(setup_path):
  219. # validate multi-index names
  220. # GH 5527
  221. with ensure_clean_store(setup_path) as store:
  222. def make_index(names=None):
  223. return MultiIndex.from_tuples(
  224. [
  225. (datetime.datetime(2013, 12, d), s, t)
  226. for d in range(1, 3)
  227. for s in range(2)
  228. for t in range(3)
  229. ],
  230. names=names,
  231. )
  232. # no names
  233. _maybe_remove(store, "df")
  234. df = DataFrame(np.zeros((12, 2)), columns=["a", "b"], index=make_index())
  235. store.append("df", df)
  236. tm.assert_frame_equal(store.select("df"), df)
  237. # partial names
  238. _maybe_remove(store, "df")
  239. df = DataFrame(
  240. np.zeros((12, 2)),
  241. columns=["a", "b"],
  242. index=make_index(["date", None, None]),
  243. )
  244. store.append("df", df)
  245. tm.assert_frame_equal(store.select("df"), df)
  246. # series
  247. _maybe_remove(store, "s")
  248. s = Series(np.zeros(12), index=make_index(["date", None, None]))
  249. store.append("s", s)
  250. xp = Series(np.zeros(12), index=make_index(["date", "level_1", "level_2"]))
  251. tm.assert_series_equal(store.select("s"), xp)
  252. # dup with column
  253. _maybe_remove(store, "df")
  254. df = DataFrame(
  255. np.zeros((12, 2)),
  256. columns=["a", "b"],
  257. index=make_index(["date", "a", "t"]),
  258. )
  259. msg = "duplicate names/columns in the multi-index when storing as a table"
  260. with pytest.raises(ValueError, match=msg):
  261. store.append("df", df)
  262. # dup within level
  263. _maybe_remove(store, "df")
  264. df = DataFrame(
  265. np.zeros((12, 2)),
  266. columns=["a", "b"],
  267. index=make_index(["date", "date", "date"]),
  268. )
  269. with pytest.raises(ValueError, match=msg):
  270. store.append("df", df)
  271. # fully names
  272. _maybe_remove(store, "df")
  273. df = DataFrame(
  274. np.zeros((12, 2)),
  275. columns=["a", "b"],
  276. index=make_index(["date", "s", "t"]),
  277. )
  278. store.append("df", df)
  279. tm.assert_frame_equal(store.select("df"), df)
  280. @pytest.mark.parametrize("format", ["fixed", "table"])
  281. def test_store_periodindex(tmp_path, setup_path, format):
  282. # GH 7796
  283. # test of PeriodIndex in HDFStore
  284. df = DataFrame(
  285. np.random.randn(5, 1), index=pd.period_range("20220101", freq="M", periods=5)
  286. )
  287. path = tmp_path / setup_path
  288. df.to_hdf(path, "df", mode="w", format=format)
  289. expected = pd.read_hdf(path, "df")
  290. tm.assert_frame_equal(df, expected)