import datetime
import re
from warnings import (
    catch_warnings,
    simplefilter,
)

import numpy as np
import pytest

from pandas._libs.tslibs import Timestamp
from pandas.compat import is_platform_windows

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    Series,
    _testing as tm,
    bdate_range,
    read_hdf,
)
from pandas.tests.io.pytables.common import (
    _maybe_remove,
    ensure_clean_store,
)
from pandas.util import _test_decorators as td

_default_compressor = "blosc"

pytestmark = pytest.mark.single_cpu
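
# Fixtures used throughout this module: `tmp_path` is pytest's built-in temporary
# directory fixture, while `setup_path` and `multiindex_dataframe_random_data` are
# assumed to come from the pytables/pandas test conftests (a per-test HDF5 file
# name and a sample MultiIndex frame, respectively).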


def test_conv_read_write():
    with tm.ensure_clean() as path:

        def roundtrip(key, obj, **kwargs):
            obj.to_hdf(path, key, **kwargs)
            return read_hdf(path, key)

        o = tm.makeTimeSeries()
        tm.assert_series_equal(o, roundtrip("series", o))

        o = tm.makeStringSeries()
        tm.assert_series_equal(o, roundtrip("string_series", o))

        o = tm.makeDataFrame()
        tm.assert_frame_equal(o, roundtrip("frame", o))

        # table
        df = DataFrame({"A": range(5), "B": range(5)})
        df.to_hdf(path, "table", append=True)
        result = read_hdf(path, "table", where=["index>2"])
        tm.assert_frame_equal(df[df.index > 2], result)


def test_long_strings(setup_path):
    # GH6166
    df = DataFrame(
        {"a": tm.rands_array(100, size=10)}, index=tm.rands_array(100, size=10)
    )

    with ensure_clean_store(setup_path) as store:
        store.append("df", df, data_columns=["a"])

        result = store.select("df")
        tm.assert_frame_equal(df, result)


def test_api(tmp_path, setup_path):
    # GH4584
    # API issue when to_hdf doesn't accept append AND format args
    path = tmp_path / setup_path

    df = tm.makeDataFrame()
    df.iloc[:10].to_hdf(path, "df", append=True, format="table")
    df.iloc[10:].to_hdf(path, "df", append=True, format="table")
    tm.assert_frame_equal(read_hdf(path, "df"), df)

    # append to False
    df.iloc[:10].to_hdf(path, "df", append=False, format="table")
    df.iloc[10:].to_hdf(path, "df", append=True, format="table")
    tm.assert_frame_equal(read_hdf(path, "df"), df)


def test_api_append(tmp_path, setup_path):
    path = tmp_path / setup_path

    df = tm.makeDataFrame()
    df.iloc[:10].to_hdf(path, "df", append=True)
    df.iloc[10:].to_hdf(path, "df", append=True, format="table")
    tm.assert_frame_equal(read_hdf(path, "df"), df)

    # append to False
    df.iloc[:10].to_hdf(path, "df", append=False, format="table")
    df.iloc[10:].to_hdf(path, "df", append=True)
    tm.assert_frame_equal(read_hdf(path, "df"), df)


def test_api_2(tmp_path, setup_path):
    path = tmp_path / setup_path

    df = tm.makeDataFrame()
    df.to_hdf(path, "df", append=False, format="fixed")
    tm.assert_frame_equal(read_hdf(path, "df"), df)

    df.to_hdf(path, "df", append=False, format="f")
    tm.assert_frame_equal(read_hdf(path, "df"), df)

    df.to_hdf(path, "df", append=False)
    tm.assert_frame_equal(read_hdf(path, "df"), df)

    df.to_hdf(path, "df")
    tm.assert_frame_equal(read_hdf(path, "df"), df)

    with ensure_clean_store(setup_path) as store:
        df = tm.makeDataFrame()

        _maybe_remove(store, "df")
        store.append("df", df.iloc[:10], append=True, format="table")
        store.append("df", df.iloc[10:], append=True, format="table")
        tm.assert_frame_equal(store.select("df"), df)

        # append to False
        _maybe_remove(store, "df")
        store.append("df", df.iloc[:10], append=False, format="table")
        store.append("df", df.iloc[10:], append=True, format="table")
        tm.assert_frame_equal(store.select("df"), df)

        # formats
        _maybe_remove(store, "df")
        store.append("df", df.iloc[:10], append=False, format="table")
        store.append("df", df.iloc[10:], append=True, format="table")
        tm.assert_frame_equal(store.select("df"), df)

        _maybe_remove(store, "df")
        store.append("df", df.iloc[:10], append=False, format="table")
        store.append("df", df.iloc[10:], append=True, format=None)
        tm.assert_frame_equal(store.select("df"), df)
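
# "fixed" (alias "f") stores are write-once and cannot be appended to or queried,
# whereas "table" stores support append and `where` selection; the invalid
# combinations exercised below are expected to raise.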


def test_api_invalid(tmp_path, setup_path):
    path = tmp_path / setup_path
    # Invalid.
    df = tm.makeDataFrame()

    msg = "Can only append to Tables"

    with pytest.raises(ValueError, match=msg):
        df.to_hdf(path, "df", append=True, format="f")

    with pytest.raises(ValueError, match=msg):
        df.to_hdf(path, "df", append=True, format="fixed")

    msg = r"invalid HDFStore format specified \[foo\]"

    with pytest.raises(TypeError, match=msg):
        df.to_hdf(path, "df", append=True, format="foo")

    with pytest.raises(TypeError, match=msg):
        df.to_hdf(path, "df", append=False, format="foo")

    # File path doesn't exist
    path = ""

    msg = f"File {path} does not exist"

    with pytest.raises(FileNotFoundError, match=msg):
        read_hdf(path, "df")


def test_get(setup_path):
    with ensure_clean_store(setup_path) as store:
        store["a"] = tm.makeTimeSeries()
        left = store.get("a")
        right = store["a"]
        tm.assert_series_equal(left, right)

        left = store.get("/a")
        right = store["/a"]
        tm.assert_series_equal(left, right)

        with pytest.raises(KeyError, match="'No object named b in the file'"):
            store.get("b")


def test_put_integer(setup_path):
    # non-date, non-string index
    df = DataFrame(np.random.randn(50, 100))
    _check_roundtrip(df, tm.assert_frame_equal, setup_path)


def test_table_values_dtypes_roundtrip(setup_path):
    with ensure_clean_store(setup_path) as store:
        df1 = DataFrame({"a": [1, 2, 3]}, dtype="f8")
        store.append("df_f8", df1)
        tm.assert_series_equal(df1.dtypes, store["df_f8"].dtypes)

        df2 = DataFrame({"a": [1, 2, 3]}, dtype="i8")
        store.append("df_i8", df2)
        tm.assert_series_equal(df2.dtypes, store["df_i8"].dtypes)

        # incompatible dtype
        msg = re.escape(
            "invalid combination of [values_axes] on appending data "
            "[name->values_block_0,cname->values_block_0,"
            "dtype->float64,kind->float,shape->(1, 3)] vs "
            "current table [name->values_block_0,"
            "cname->values_block_0,dtype->int64,kind->integer,"
            "shape->None]"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df_i8", df1)

        # check creation/storage/retrieval of float32 (a bit hacky to
        # actually create them though)
        df1 = DataFrame(np.array([[1], [2], [3]], dtype="f4"), columns=["A"])
        store.append("df_f4", df1)
        tm.assert_series_equal(df1.dtypes, store["df_f4"].dtypes)
        assert df1.dtypes[0] == "float32"

        # check with mixed dtypes
        df1 = DataFrame(
            {
                c: Series(np.random.randint(5), dtype=c)
                for c in ["float32", "float64", "int32", "int64", "int16", "int8"]
            }
        )
        df1["string"] = "foo"
        df1["float322"] = 1.0
        df1["float322"] = df1["float322"].astype("float32")
        df1["bool"] = df1["float32"] > 0
        df1["time1"] = Timestamp("20130101")
        df1["time2"] = Timestamp("20130102")

        store.append("df_mixed_dtypes1", df1)
        result = store.select("df_mixed_dtypes1").dtypes.value_counts()
        result.index = [str(i) for i in result.index]
        expected = Series(
            {
                "float32": 2,
                "float64": 1,
                "int32": 1,
                "bool": 1,
                "int16": 1,
                "int8": 1,
                "int64": 1,
                "object": 1,
                "datetime64[ns]": 2,
            },
            name="count",
        )
        result = result.sort_index()
        expected = expected.sort_index()
        tm.assert_series_equal(result, expected)


@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
def test_series(setup_path):
    s = tm.makeStringSeries()
    _check_roundtrip(s, tm.assert_series_equal, path=setup_path)

    ts = tm.makeTimeSeries()
    _check_roundtrip(ts, tm.assert_series_equal, path=setup_path)

    ts2 = Series(ts.index, Index(ts.index, dtype=object))
    _check_roundtrip(ts2, tm.assert_series_equal, path=setup_path)

    ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object))
    _check_roundtrip(
        ts3, tm.assert_series_equal, path=setup_path, check_index_type=False
    )


def test_float_index(setup_path):
    # GH #454
    index = np.random.randn(10)
    s = Series(np.random.randn(10), index=index)
    _check_roundtrip(s, tm.assert_series_equal, path=setup_path)


def test_tuple_index(setup_path):
    # GH #492
    col = np.arange(10)
    idx = [(0.0, 1.0), (2.0, 3.0), (4.0, 5.0)]
    data = np.random.randn(30).reshape((3, 10))
    DF = DataFrame(data, index=idx, columns=col)

    with catch_warnings(record=True):
        simplefilter("ignore", pd.errors.PerformanceWarning)
        _check_roundtrip(DF, tm.assert_frame_equal, path=setup_path)


@pytest.mark.filterwarnings("ignore::pandas.errors.PerformanceWarning")
def test_index_types(setup_path):
    with catch_warnings(record=True):
        values = np.random.randn(2)

        func = lambda lhs, rhs: tm.assert_series_equal(lhs, rhs, check_index_type=True)

    with catch_warnings(record=True):
        ser = Series(values, [0, "y"])
        _check_roundtrip(ser, func, path=setup_path)

    with catch_warnings(record=True):
        ser = Series(values, [datetime.datetime.today(), 0])
        _check_roundtrip(ser, func, path=setup_path)

    with catch_warnings(record=True):
        ser = Series(values, ["y", 0])
        _check_roundtrip(ser, func, path=setup_path)

    with catch_warnings(record=True):
        ser = Series(values, [datetime.date.today(), "a"])
        _check_roundtrip(ser, func, path=setup_path)

    with catch_warnings(record=True):
        ser = Series(values, [0, "y"])
        _check_roundtrip(ser, func, path=setup_path)

        ser = Series(values, [datetime.datetime.today(), 0])
        _check_roundtrip(ser, func, path=setup_path)

        ser = Series(values, ["y", 0])
        _check_roundtrip(ser, func, path=setup_path)

        ser = Series(values, [datetime.date.today(), "a"])
        _check_roundtrip(ser, func, path=setup_path)

        ser = Series(values, [1.23, "b"])
        _check_roundtrip(ser, func, path=setup_path)

        ser = Series(values, [1, 1.53])
        _check_roundtrip(ser, func, path=setup_path)

        ser = Series(values, [1, 5])
        _check_roundtrip(ser, func, path=setup_path)

        ser = Series(
            values, [datetime.datetime(2012, 1, 1), datetime.datetime(2012, 1, 2)]
        )
        _check_roundtrip(ser, func, path=setup_path)


def test_timeseries_preepoch(setup_path, request):
    dr = bdate_range("1/1/1940", "1/1/1960")
    ts = Series(np.random.randn(len(dr)), index=dr)
    try:
        _check_roundtrip(ts, tm.assert_series_equal, path=setup_path)
    except OverflowError:
        if is_platform_windows():
            request.node.add_marker(
                pytest.mark.xfail(reason="known failure on some windows platforms")
            )
        raise


@pytest.mark.parametrize(
    "compression", [False, pytest.param(True, marks=td.skip_if_windows)]
)
def test_frame(compression, setup_path):
    df = tm.makeDataFrame()

    # put in some random NAs
    df.iloc[0, 0] = np.nan
    df.iloc[5, 3] = np.nan

    _check_roundtrip_table(
        df, tm.assert_frame_equal, path=setup_path, compression=compression
    )
    _check_roundtrip(
        df, tm.assert_frame_equal, path=setup_path, compression=compression
    )

    tdf = tm.makeTimeDataFrame()
    _check_roundtrip(
        tdf, tm.assert_frame_equal, path=setup_path, compression=compression
    )

    with ensure_clean_store(setup_path) as store:
        # not consolidated
        df["foo"] = np.random.randn(len(df))
        store["df"] = df
        recons = store["df"]
        assert recons._mgr.is_consolidated()

    # empty
    _check_roundtrip(df[:0], tm.assert_frame_equal, path=setup_path)


def test_empty_series_frame(setup_path):
    s0 = Series(dtype=object)
    s1 = Series(name="myseries", dtype=object)
    df0 = DataFrame()
    df1 = DataFrame(index=["a", "b", "c"])
    df2 = DataFrame(columns=["d", "e", "f"])

    _check_roundtrip(s0, tm.assert_series_equal, path=setup_path)
    _check_roundtrip(s1, tm.assert_series_equal, path=setup_path)
    _check_roundtrip(df0, tm.assert_frame_equal, path=setup_path)
    _check_roundtrip(df1, tm.assert_frame_equal, path=setup_path)
    _check_roundtrip(df2, tm.assert_frame_equal, path=setup_path)


@pytest.mark.parametrize("dtype", [np.int64, np.float64, object, "m8[ns]", "M8[ns]"])
def test_empty_series(dtype, setup_path):
    s = Series(dtype=dtype)
    _check_roundtrip(s, tm.assert_series_equal, path=setup_path)


def test_can_serialize_dates(setup_path):
    rng = [x.date() for x in bdate_range("1/1/2000", "1/30/2000")]
    frame = DataFrame(np.random.randn(len(rng), 4), index=rng)

    _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)


def test_store_hierarchical(setup_path, multiindex_dataframe_random_data):
    frame = multiindex_dataframe_random_data

    _check_roundtrip(frame, tm.assert_frame_equal, path=setup_path)
    _check_roundtrip(frame.T, tm.assert_frame_equal, path=setup_path)
    _check_roundtrip(frame["A"], tm.assert_series_equal, path=setup_path)

    # check that the names are stored
    with ensure_clean_store(setup_path) as store:
        store["frame"] = frame
        recons = store["frame"]
        tm.assert_frame_equal(recons, frame)


@pytest.mark.parametrize(
    "compression", [False, pytest.param(True, marks=td.skip_if_windows)]
)
def test_store_mixed(compression, setup_path):
    def _make_one():
        df = tm.makeDataFrame()
        df["obj1"] = "foo"
        df["obj2"] = "bar"
        df["bool1"] = df["A"] > 0
        df["bool2"] = df["B"] > 0
        df["int1"] = 1
        df["int2"] = 2
        return df._consolidate()

    df1 = _make_one()
    df2 = _make_one()

    _check_roundtrip(df1, tm.assert_frame_equal, path=setup_path)
    _check_roundtrip(df2, tm.assert_frame_equal, path=setup_path)

    with ensure_clean_store(setup_path) as store:
        store["obj"] = df1
        tm.assert_frame_equal(store["obj"], df1)
        store["obj"] = df2
        tm.assert_frame_equal(store["obj"], df2)

    # check that can store Series of all of these types
    _check_roundtrip(
        df1["obj1"],
        tm.assert_series_equal,
        path=setup_path,
        compression=compression,
    )
    _check_roundtrip(
        df1["bool1"],
        tm.assert_series_equal,
        path=setup_path,
        compression=compression,
    )
    _check_roundtrip(
        df1["int1"],
        tm.assert_series_equal,
        path=setup_path,
        compression=compression,
    )
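
# Shared round-trip helpers used by the tests above: write `obj` into a fresh
# HDFStore (optionally compressed with the module-level `_default_compressor`),
# read it back, and let the supplied comparator assert equality.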


def _check_roundtrip(obj, comparator, path, compression=False, **kwargs):
    options = {}
    if compression:
        options["complib"] = _default_compressor

    with ensure_clean_store(path, "w", **options) as store:
        store["obj"] = obj
        retrieved = store["obj"]
        comparator(retrieved, obj, **kwargs)


def _check_roundtrip_table(obj, comparator, path, compression=False):
    options = {}
    if compression:
        options["complib"] = _default_compressor

    with ensure_clean_store(path, "w", **options) as store:
        store.put("obj", obj, format="table")
        retrieved = store["obj"]
        comparator(retrieved, obj)


def test_unicode_index(setup_path):
    unicode_values = ["\u03c3", "\u03c3\u03c3"]

    # PerformanceWarning
    with catch_warnings(record=True):
        simplefilter("ignore", pd.errors.PerformanceWarning)
        s = Series(np.random.randn(len(unicode_values)), unicode_values)
        _check_roundtrip(s, tm.assert_series_equal, path=setup_path)


def test_unicode_longer_encoded(setup_path):
    # GH 11234
    char = "\u0394"
    df = DataFrame({"A": [char]})
    with ensure_clean_store(setup_path) as store:
        store.put("df", df, format="table", encoding="utf-8")
        result = store.get("df")
        tm.assert_frame_equal(result, df)

    df = DataFrame({"A": ["a", char], "B": ["b", "b"]})
    with ensure_clean_store(setup_path) as store:
        store.put("df", df, format="table", encoding="utf-8")
        result = store.get("df")
        tm.assert_frame_equal(result, df)


def test_store_datetime_mixed(setup_path):
    df = DataFrame({"a": [1, 2, 3], "b": [1.0, 2.0, 3.0], "c": ["a", "b", "c"]})
    ts = tm.makeTimeSeries()
    df["d"] = ts.index[:3]
    _check_roundtrip(df, tm.assert_frame_equal, path=setup_path)


def test_round_trip_equals(tmp_path, setup_path):
    # GH 9330
    df = DataFrame({"B": [1, 2], "A": ["x", "y"]})

    path = tmp_path / setup_path
    df.to_hdf(path, "df", format="table")
    other = read_hdf(path, "df")
    tm.assert_frame_equal(df, other)
    assert df.equals(other)
    assert other.equals(df)