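"""
Tests for appending to HDFStore tables: ``HDFStore.append``,
``put(format="table")``, and ``append_to_multiple`` / ``select_as_multiple``.

The typical round-trip exercised throughout this module:

    with ensure_clean_store(setup_path) as store:
        store.append("df", df[:10])
        store.append("df", df[10:])
        tm.assert_frame_equal(store["df"], df)
"""
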
import datetime
from datetime import timedelta
import re
from warnings import catch_warnings

import numpy as np
import pytest

from pandas._libs.tslibs import Timestamp
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
    DataFrame,
    Series,
    _testing as tm,
    concat,
    date_range,
    read_hdf,
)
from pandas.tests.io.pytables.common import (
    _maybe_remove,
    ensure_clean_store,
)

pytestmark = pytest.mark.single_cpu


def test_append(setup_path):
    with ensure_clean_store(setup_path) as store:
        # this is allowed, but you almost always don't want to do it
        # (it raises tables.NaturalNameWarning)
        with catch_warnings(record=True):
            df = tm.makeTimeDataFrame()
            _maybe_remove(store, "df1")
            store.append("df1", df[:10])
            store.append("df1", df[10:])
            tm.assert_frame_equal(store["df1"], df)

            _maybe_remove(store, "df2")
            store.put("df2", df[:10], format="table")
            store.append("df2", df[10:])
            tm.assert_frame_equal(store["df2"], df)

            _maybe_remove(store, "df3")
            store.append("/df3", df[:10])
            store.append("/df3", df[10:])
            tm.assert_frame_equal(store["df3"], df)

            # this is allowed, but you almost always don't want to do it
            # (it raises tables.NaturalNameWarning)
            _maybe_remove(store, "/df3 foo")
            store.append("/df3 foo", df[:10])
            store.append("/df3 foo", df[10:])
            tm.assert_frame_equal(store["df3 foo"], df)

            # dtype issues - mixed type in a single object column
            df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
            df["mixed_column"] = "testing"
            df.loc[2, "mixed_column"] = np.nan
            _maybe_remove(store, "df")
            store.append("df", df)
            tm.assert_frame_equal(store["df"], df)

            # uints - test storage of uints
            uint_data = DataFrame(
                {
                    "u08": Series(
                        np.random.randint(0, high=255, size=5), dtype=np.uint8
                    ),
                    "u16": Series(
                        np.random.randint(0, high=65535, size=5), dtype=np.uint16
                    ),
                    "u32": Series(
                        np.random.randint(0, high=2**30, size=5), dtype=np.uint32
                    ),
                    "u64": Series(
                        [2**58, 2**59, 2**60, 2**61, 2**62],
                        dtype=np.uint64,
                    ),
                },
                index=np.arange(5),
            )
            _maybe_remove(store, "uints")
            store.append("uints", uint_data)
            tm.assert_frame_equal(store["uints"], uint_data, check_index_type=True)

            # uints - test storage of uints in indexable columns
            _maybe_remove(store, "uints")
            # 64-bit indices not yet supported
            store.append("uints", uint_data, data_columns=["u08", "u16", "u32"])
            tm.assert_frame_equal(store["uints"], uint_data, check_index_type=True)


def test_append_series(setup_path):
    with ensure_clean_store(setup_path) as store:
        # basic
        ss = tm.makeStringSeries()
        ts = tm.makeTimeSeries()
        ns = Series(np.arange(100))

        store.append("ss", ss)
        result = store["ss"]
        tm.assert_series_equal(result, ss)
        assert result.name is None

        store.append("ts", ts)
        result = store["ts"]
        tm.assert_series_equal(result, ts)
        assert result.name is None

        ns.name = "foo"
        store.append("ns", ns)
        result = store["ns"]
        tm.assert_series_equal(result, ns)
        assert result.name == ns.name

        # select on the values
        expected = ns[ns > 60]
        result = store.select("ns", "foo>60")
        tm.assert_series_equal(result, expected)

        # select on the index and values
        expected = ns[(ns > 70) & (ns.index < 90)]
        result = store.select("ns", "foo>70 and index<90")
        tm.assert_series_equal(result, expected, check_index_type=True)

        # multi-index
        mi = DataFrame(np.random.randn(5, 1), columns=["A"])
        mi["B"] = np.arange(len(mi))
        mi["C"] = "foo"
        mi.loc[3:5, "C"] = "bar"
        mi.set_index(["C", "B"], inplace=True)
        s = mi.stack()
        s.index = s.index.droplevel(2)
        store.append("mi", s)
        tm.assert_series_equal(store["mi"], s, check_index_type=True)


def test_append_some_nans(setup_path):
    with ensure_clean_store(setup_path) as store:
        df = DataFrame(
            {
                "A": Series(np.random.randn(20)).astype("int32"),
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
                "D": Timestamp("20010101"),
                "E": datetime.datetime(2001, 1, 2, 0, 0),
            },
            index=np.arange(20),
        )
        # some nans
        _maybe_remove(store, "df1")
        df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan
        store.append("df1", df[:10])
        store.append("df1", df[10:])
        tm.assert_frame_equal(store["df1"], df, check_index_type=True)

        # first column
        df1 = df.copy()
        df1["A1"] = np.nan
        _maybe_remove(store, "df1")
        store.append("df1", df1[:10])
        store.append("df1", df1[10:])
        tm.assert_frame_equal(store["df1"], df1, check_index_type=True)

        # 2nd column
        df2 = df.copy()
        df2["A2"] = np.nan
        _maybe_remove(store, "df2")
        store.append("df2", df2[:10])
        store.append("df2", df2[10:])
        tm.assert_frame_equal(store["df2"], df2, check_index_type=True)

        # datetimes
        df3 = df.copy()
        df3["E"] = np.nan
        _maybe_remove(store, "df3")
        store.append("df3", df3[:10])
        store.append("df3", df3[10:])
        tm.assert_frame_equal(store["df3"], df3, check_index_type=True)


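# append(..., dropna=True) skips rows that are entirely NaN across the
# appended block; the io.hdf.dropna_table option sets the default. Rows that
# contain string or datetime values are still written even when all the
# float columns are NaN (exercised further down in this test).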
def test_append_all_nans(setup_path):
    with ensure_clean_store(setup_path) as store:
        df = DataFrame(
            {"A1": np.random.randn(20), "A2": np.random.randn(20)},
            index=np.arange(20),
        )
        df.loc[0:15, :] = np.nan

        # nan some entire rows (dropna=True)
        _maybe_remove(store, "df")
        store.append("df", df[:10], dropna=True)
        store.append("df", df[10:], dropna=True)
        tm.assert_frame_equal(store["df"], df[-4:], check_index_type=True)

        # nan some entire rows (dropna=False)
        _maybe_remove(store, "df2")
        store.append("df2", df[:10], dropna=False)
        store.append("df2", df[10:], dropna=False)
        tm.assert_frame_equal(store["df2"], df, check_index_type=True)

        # tests the option io.hdf.dropna_table
        with pd.option_context("io.hdf.dropna_table", False):
            _maybe_remove(store, "df3")
            store.append("df3", df[:10])
            store.append("df3", df[10:])
            tm.assert_frame_equal(store["df3"], df)

        with pd.option_context("io.hdf.dropna_table", True):
            _maybe_remove(store, "df4")
            store.append("df4", df[:10])
            store.append("df4", df[10:])
            tm.assert_frame_equal(store["df4"], df[-4:])

        # nan some entire rows (strings are still written!)
        df = DataFrame(
            {
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
            },
            index=np.arange(20),
        )
        df.loc[0:15, :] = np.nan

        _maybe_remove(store, "df")
        store.append("df", df[:10], dropna=True)
        store.append("df", df[10:], dropna=True)
        tm.assert_frame_equal(store["df"], df, check_index_type=True)

        _maybe_remove(store, "df2")
        store.append("df2", df[:10], dropna=False)
        store.append("df2", df[10:], dropna=False)
        tm.assert_frame_equal(store["df2"], df, check_index_type=True)

        # nan some entire rows (but since we have dates they are still
        # written!)
        df = DataFrame(
            {
                "A1": np.random.randn(20),
                "A2": np.random.randn(20),
                "B": "foo",
                "C": "bar",
                "D": Timestamp("20010101"),
                "E": datetime.datetime(2001, 1, 2, 0, 0),
            },
            index=np.arange(20),
        )
        df.loc[0:15, :] = np.nan

        _maybe_remove(store, "df")
        store.append("df", df[:10], dropna=True)
        store.append("df", df[10:], dropna=True)
        tm.assert_frame_equal(store["df"], df, check_index_type=True)

        _maybe_remove(store, "df2")
        store.append("df2", df[:10], dropna=False)
        store.append("df2", df[10:], dropna=False)
        tm.assert_frame_equal(store["df2"], df, check_index_type=True)


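# axes=["columns"] makes the columns axis the appended axis, so the second
# append below adds the remaining columns rather than rows; filterable
# conditions are only supported on the indexable axis, hence the TypeError
# asserted at the end.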
def test_append_frame_column_oriented(setup_path):
    with ensure_clean_store(setup_path) as store:
        # column oriented
        df = tm.makeTimeDataFrame()
        df.index = df.index._with_freq(None)  # freq doesn't round-trip

        _maybe_remove(store, "df1")
        store.append("df1", df.iloc[:, :2], axes=["columns"])
        store.append("df1", df.iloc[:, 2:])
        tm.assert_frame_equal(store["df1"], df)

        result = store.select("df1", "columns=A")
        expected = df.reindex(columns=["A"])
        tm.assert_frame_equal(expected, result)

        # selection on the non-indexable
        result = store.select("df1", ("columns=A", "index=df.index[0:4]"))
        expected = df.reindex(columns=["A"], index=df.index[0:4])
        tm.assert_frame_equal(expected, result)

        # this isn't supported
        msg = re.escape(
            "passing a filterable condition to a non-table indexer "
            "[Filter: Not Initialized]"
        )
        with pytest.raises(TypeError, match=msg):
            store.select("df1", "columns=A and index>df.index[4]")


def test_append_with_different_block_ordering(setup_path):
    # GH 4096; using same frames, but different block orderings
    with ensure_clean_store(setup_path) as store:
        for i in range(10):
            df = DataFrame(np.random.randn(10, 2), columns=list("AB"))
            df["index"] = range(10)
            df["index"] += i * 10
            df["int64"] = Series([1] * len(df), dtype="int64")
            df["int16"] = Series([1] * len(df), dtype="int16")

            if i % 2 == 0:
                del df["int64"]
                df["int64"] = Series([1] * len(df), dtype="int64")
            if i % 3 == 0:
                a = df.pop("A")
                df["A"] = a

            df.set_index("index", inplace=True)
            store.append("df", df)

    # test a different ordering but with more fields (like invalid
    # combinations)
    with ensure_clean_store(setup_path) as store:
        df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64")
        df["int64"] = Series([1] * len(df), dtype="int64")
        df["int16"] = Series([1] * len(df), dtype="int16")
        store.append("df", df)

        # store additional fields in different blocks
        df["int16_2"] = Series([1] * len(df), dtype="int16")
        msg = re.escape(
            "cannot match existing table structure for [int16] on appending data"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df)

        # store multiple additional fields in different blocks
        df["float_3"] = Series([1.0] * len(df), dtype="float64")
        msg = re.escape(
            "cannot match existing table structure for [A,B] on appending data"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df)


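# PyTables stores strings in fixed-width columns whose itemsize is inferred
# from the first append; min_itemsize presets a larger width so that longer
# strings appended later still fit.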
def test_append_with_strings(setup_path):
    with ensure_clean_store(setup_path) as store:
        with catch_warnings(record=True):

            def check_col(key, name, size):
                assert (
                    getattr(store.get_storer(key).table.description, name).itemsize
                    == size
                )

            # avoid truncation on elements
            df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
            store.append("df_big", df)
            tm.assert_frame_equal(store.select("df_big"), df)
            check_col("df_big", "values_block_1", 15)

            # appending smaller string ok
            df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
            store.append("df_big", df2)
            expected = concat([df, df2])
            tm.assert_frame_equal(store.select("df_big"), expected)
            check_col("df_big", "values_block_1", 15)

            # avoid truncation on elements
            df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
            store.append("df_big2", df, min_itemsize={"values": 50})
            tm.assert_frame_equal(store.select("df_big2"), df)
            check_col("df_big2", "values_block_1", 50)

            # bigger string on next append
            store.append("df_new", df)
            df_new = DataFrame(
                [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]]
            )
            msg = (
                r"Trying to store a string with len \[26\] in "
                r"\[values_block_1\] column but\n"
                r"this column has a limit of \[15\]!\n"
                "Consider using min_itemsize to preset the sizes on these "
                "columns"
            )
            with pytest.raises(ValueError, match=msg):
                store.append("df_new", df_new)

            # min_itemsize on Series index (GH 11412)
            df = tm.makeMixedDataFrame().set_index("C")
            store.append("ss", df["B"], min_itemsize={"index": 4})
            tm.assert_series_equal(store.select("ss"), df["B"])

            # same as above, with data_columns=True
            store.append(
                "ss2", df["B"], data_columns=True, min_itemsize={"index": 4}
            )
            tm.assert_series_equal(store.select("ss2"), df["B"])

            # min_itemsize in index without appending (GH 10381)
            store.put("ss3", df, format="table", min_itemsize={"index": 6})
            # just make sure there is a longer string:
            df2 = df.copy().reset_index().assign(C="longer").set_index("C")
            store.append("ss3", df2)
            tm.assert_frame_equal(store.select("ss3"), concat([df, df2]))

            # same as above, with a Series
            store.put("ss4", df["B"], format="table", min_itemsize={"index": 6})
            store.append("ss4", df2["B"])
            tm.assert_series_equal(
                store.select("ss4"), concat([df["B"], df2["B"]])
            )

            # with nans
            _maybe_remove(store, "df")
            df = tm.makeTimeDataFrame()
            df["string"] = "foo"
            df.loc[df.index[1:4], "string"] = np.nan
            df["string2"] = "bar"
            df.loc[df.index[4:8], "string2"] = np.nan
            df["string3"] = "bah"
            df.loc[df.index[1:], "string3"] = np.nan
            store.append("df", df)
            result = store.select("df")
            tm.assert_frame_equal(result, df)

    with ensure_clean_store(setup_path) as store:
        df = DataFrame({"A": "foo", "B": "bar"}, index=range(10))

        # a min_itemsize that creates a data_column
        _maybe_remove(store, "df")
        store.append("df", df, min_itemsize={"A": 200})
        check_col("df", "A", 200)
        assert store.get_storer("df").data_columns == ["A"]

        # a min_itemsize that creates a data_column2
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=["B"], min_itemsize={"A": 200})
        check_col("df", "A", 200)
        assert store.get_storer("df").data_columns == ["B", "A"]

        # a min_itemsize that creates a data_column2
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=["B"], min_itemsize={"values": 200})
        check_col("df", "B", 200)
        check_col("df", "values_block_0", 200)
        assert store.get_storer("df").data_columns == ["B"]

        # infer the .typ on subsequent appends
        _maybe_remove(store, "df")
        store.append("df", df[:5], min_itemsize=200)
        store.append("df", df[5:], min_itemsize=200)
        tm.assert_frame_equal(store["df"], df)

        # invalid min_itemsize keys
        df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
        _maybe_remove(store, "df")
        msg = re.escape(
            "min_itemsize has the key [foo] which is not an axis or data_column"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df, min_itemsize={"foo": 20, "foobar": 20})


def test_append_with_empty_string(setup_path):
    with ensure_clean_store(setup_path) as store:
        # with all empty strings (GH 12242)
        df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]})
        store.append("df", df[:-1], min_itemsize={"x": 1})
        store.append("df", df[-1:], min_itemsize={"x": 1})
        tm.assert_frame_equal(store.select("df"), df)


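# data_columns materializes the listed columns as individually stored,
# indexable columns on disk, which is what allows where-clauses such as
# store.select("df", "B>0") against them.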
def test_append_with_data_columns(setup_path):
    with ensure_clean_store(setup_path) as store:
        df = tm.makeTimeDataFrame()
        df.iloc[0, df.columns.get_loc("B")] = 1.0
        _maybe_remove(store, "df")
        store.append("df", df[:2], data_columns=["B"])
        store.append("df", df[2:])
        tm.assert_frame_equal(store["df"], df)

        # check that we have indices created
        assert store._handle.root.df.table.cols.index.is_indexed is True
        assert store._handle.root.df.table.cols.B.is_indexed is True

        # data column searching
        result = store.select("df", "B>0")
        expected = df[df.B > 0]
        tm.assert_frame_equal(result, expected)

        # data column searching (with an indexable and a data_columns)
        result = store.select("df", "B>0 and index>df.index[3]")
        df_new = df.reindex(index=df.index[4:])
        expected = df_new[df_new.B > 0]
        tm.assert_frame_equal(result, expected)

        # data column selection with a string data_column
        df_new = df.copy()
        df_new["string"] = "foo"
        df_new.loc[df_new.index[1:4], "string"] = np.nan
        df_new.loc[df_new.index[5:6], "string"] = "bar"
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"])
        result = store.select("df", "string='foo'")
        expected = df_new[df_new.string == "foo"]
        tm.assert_frame_equal(result, expected)

    # using min_itemsize and a data column
    def check_col(key, name, size):
        assert (
            getattr(store.get_storer(key).table.description, name).itemsize == size
        )

    with ensure_clean_store(setup_path) as store:
        _maybe_remove(store, "df")
        store.append(
            "df", df_new, data_columns=["string"], min_itemsize={"string": 30}
        )
        check_col("df", "string", 30)
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["string"], min_itemsize=30)
        check_col("df", "string", 30)
        _maybe_remove(store, "df")
        store.append(
            "df", df_new, data_columns=["string"], min_itemsize={"values": 30}
        )
        check_col("df", "string", 30)

    with ensure_clean_store(setup_path) as store:
        df_new["string2"] = "foobarbah"
        df_new["string_block1"] = "foobarbah1"
        df_new["string_block2"] = "foobarbah2"
        _maybe_remove(store, "df")
        store.append(
            "df",
            df_new,
            data_columns=["string", "string2"],
            min_itemsize={"string": 30, "string2": 40, "values": 50},
        )
        check_col("df", "string", 30)
        check_col("df", "string2", 40)
        check_col("df", "values_block_1", 50)

    with ensure_clean_store(setup_path) as store:
        # multiple data columns
        df_new = df.copy()
        df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0
        df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0
        df_new["string"] = "foo"

        sl = df_new.columns.get_loc("string")
        df_new.iloc[1:4, sl] = np.nan
        df_new.iloc[5:6, sl] = "bar"

        df_new["string2"] = "foo"
        sl = df_new.columns.get_loc("string2")
        df_new.iloc[2:5, sl] = np.nan
        df_new.iloc[7:8, sl] = "bar"
        _maybe_remove(store, "df")
        store.append("df", df_new, data_columns=["A", "B", "string", "string2"])
        result = store.select(
            "df", "string='foo' and string2='foo' and A>0 and B<0"
        )
        expected = df_new[
            (df_new.string == "foo")
            & (df_new.string2 == "foo")
            & (df_new.A > 0)
            & (df_new.B < 0)
        ]
        tm.assert_frame_equal(result, expected, check_freq=False)
        # FIXME: 2020-05-07 freq check randomly fails in the CI

        # yield an empty frame
        result = store.select("df", "string='foo' and string2='cool'")
        expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")]
        tm.assert_frame_equal(result, expected)

    with ensure_clean_store(setup_path) as store:
        # doc example
        df_dc = df.copy()
        df_dc["string"] = "foo"
        df_dc.loc[df_dc.index[4:6], "string"] = np.nan
        df_dc.loc[df_dc.index[7:9], "string"] = "bar"
        df_dc["string2"] = "cool"
        df_dc["datetime"] = Timestamp("20010102")
        df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan

        _maybe_remove(store, "df_dc")
        store.append(
            "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"]
        )
        result = store.select("df_dc", "B>0")
        expected = df_dc[df_dc.B > 0]
        tm.assert_frame_equal(result, expected)

        result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"])
        expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
        tm.assert_frame_equal(result, expected, check_freq=False)
        # FIXME: 2020-12-07 intermittent build failures here with freq of
        # None instead of BDay(4)

    with ensure_clean_store(setup_path) as store:
        # doc example part 2
        np.random.seed(1234)
        index = date_range("1/1/2000", periods=8)
        df_dc = DataFrame(
            np.random.randn(8, 3), index=index, columns=["A", "B", "C"]
        )
        df_dc["string"] = "foo"
        df_dc.loc[df_dc.index[4:6], "string"] = np.nan
        df_dc.loc[df_dc.index[7:9], "string"] = "bar"
        df_dc[["B", "C"]] = df_dc[["B", "C"]].abs()
        df_dc["string2"] = "cool"

        # on-disk operations
        store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"])

        result = store.select("df_dc", "B>0")
        expected = df_dc[df_dc.B > 0]
        tm.assert_frame_equal(result, expected)

        result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"'])
        expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
        tm.assert_frame_equal(result, expected)


def test_append_hierarchical(tmp_path, setup_path, multiindex_dataframe_random_data):
    df = multiindex_dataframe_random_data
    df.columns.name = None

    with ensure_clean_store(setup_path) as store:
        store.append("mi", df)
        result = store.select("mi")
        tm.assert_frame_equal(result, df)

        # GH 3748
        result = store.select("mi", columns=["A", "B"])
        expected = df.reindex(columns=["A", "B"])
        tm.assert_frame_equal(result, expected)

    path = tmp_path / "test.hdf"
    df.to_hdf(path, "df", format="table")
    result = read_hdf(path, "df", columns=["A", "B"])
    expected = df.reindex(columns=["A", "B"])
    tm.assert_frame_equal(result, expected)


def test_append_misc(setup_path):
    with ensure_clean_store(setup_path) as store:
        df = tm.makeDataFrame()
        store.append("df", df, chunksize=1)
        result = store.select("df")
        tm.assert_frame_equal(result, df)

        store.append("df1", df, expectedrows=10)
        result = store.select("df1")
        tm.assert_frame_equal(result, df)


@pytest.mark.parametrize("chunksize", [10, 200, 1000])
def test_append_misc_chunksize(setup_path, chunksize):
    # more chunksize in append tests
    df = tm.makeDataFrame()
    df["string"] = "foo"
    df["float322"] = 1.0
    df["float322"] = df["float322"].astype("float32")
    df["bool"] = df["float322"] > 0
    df["time1"] = Timestamp("20130101")
    df["time2"] = Timestamp("20130102")
    with ensure_clean_store(setup_path, mode="w") as store:
        store.append("obj", df, chunksize=chunksize)
        result = store.select("obj")
        tm.assert_frame_equal(result, df)


def test_append_misc_empty_frame(setup_path):
    # empty frame, GH4273
    with ensure_clean_store(setup_path) as store:
        # 0 len
        df_empty = DataFrame(columns=list("ABC"))
        store.append("df", df_empty)
        with pytest.raises(KeyError, match="'No object named df in the file'"):
            store.select("df")

        # repeated append of 0/non-zero frames
        df = DataFrame(np.random.rand(10, 3), columns=list("ABC"))
        store.append("df", df)
        tm.assert_frame_equal(store.select("df"), df)
        store.append("df", df_empty)
        tm.assert_frame_equal(store.select("df"), df)

        # store
        df = DataFrame(columns=list("ABC"))
        store.put("df2", df)
        tm.assert_frame_equal(store.select("df2"), df)


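# Invalid inputs (unserializable object columns, value types other than
# Series/DataFrame, or frames whose structure no longer matches the existing
# table) should raise with informative messages rather than corrupt the table.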
# TODO(ArrayManager) currently we rely on falling back to BlockManager, but
# the conversion from AM->BM converts the invalid object dtype column into
# a datetime64 column no longer raising an error
@td.skip_array_manager_not_yet_implemented
def test_append_raise(setup_path):
    with ensure_clean_store(setup_path) as store:
        # test append with invalid input to get good error messages

        # list in column
        df = tm.makeDataFrame()
        df["invalid"] = [["a"]] * len(df)
        assert df.dtypes["invalid"] == np.object_
        msg = re.escape(
            """Cannot serialize the column [invalid]
because its data contents are not [string] but [mixed] object dtype"""
        )
        with pytest.raises(TypeError, match=msg):
            store.append("df", df)

        # multiple invalid columns
        df["invalid2"] = [["a"]] * len(df)
        df["invalid3"] = [["a"]] * len(df)
        with pytest.raises(TypeError, match=msg):
            store.append("df", df)

        # datetime with embedded nans as object
        df = tm.makeDataFrame()
        s = Series(datetime.datetime(2001, 1, 2), index=df.index)
        s = s.astype(object)
        s[0:5] = np.nan
        df["invalid"] = s
        assert df.dtypes["invalid"] == np.object_
        msg = "too many timezones in this block, create separate data columns"
        with pytest.raises(TypeError, match=msg):
            store.append("df", df)

        # directly ndarray
        msg = "value must be None, Series, or DataFrame"
        with pytest.raises(TypeError, match=msg):
            store.append("df", np.arange(10))

        # series directly
        msg = re.escape(
            "cannot properly create the storer for: "
            "[group->df,value-><class 'pandas.core.series.Series'>]"
        )
        with pytest.raises(TypeError, match=msg):
            store.append("df", Series(np.arange(10)))

        # appending an incompatible table
        df = tm.makeDataFrame()
        store.append("df", df)

        df["foo"] = "foo"
        msg = re.escape(
            "invalid combination of [non_index_axes] on appending data "
            "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table "
            "[(1, ['A', 'B', 'C', 'D'])]"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df)

        # incompatible type (GH 41897)
        _maybe_remove(store, "df")
        df["foo"] = Timestamp("20130101")
        store.append("df", df)
        df["foo"] = "bar"
        msg = re.escape(
            "invalid combination of [values_axes] on appending data "
            "[name->values_block_1,cname->values_block_1,"
            "dtype->bytes24,kind->string,shape->(1, 30)] "
            "vs current table "
            "[name->values_block_1,cname->values_block_1,"
            "dtype->datetime64,kind->datetime64,shape->None]"
        )
        with pytest.raises(ValueError, match=msg):
            store.append("df", df)


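# timedelta64 columns round-trip through the table format and can be queried
# with pd.Timedelta expressions or offset-style strings such as '-3D'.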
def test_append_with_timedelta(setup_path):
    # GH 3577
    # append timedelta
    df = DataFrame(
        {
            "A": Timestamp("20130101"),
            "B": [
                Timestamp("20130101") + timedelta(days=i, seconds=10)
                for i in range(10)
            ],
        }
    )
    df["C"] = df["A"] - df["B"]
    df.loc[3:5, "C"] = np.nan

    with ensure_clean_store(setup_path) as store:
        # table
        _maybe_remove(store, "df")
        store.append("df", df, data_columns=True)
        result = store.select("df")
        tm.assert_frame_equal(result, df)

        result = store.select("df", where="C<100000")
        tm.assert_frame_equal(result, df)

        result = store.select("df", where="C<pd.Timedelta('-3D')")
        tm.assert_frame_equal(result, df.iloc[3:])

        result = store.select("df", "C<'-3D'")
        tm.assert_frame_equal(result, df.iloc[3:])

        # a bit hacky here as we don't really deal with the NaT properly
        result = store.select("df", "C<'-500000s'")
        result = result.dropna(subset=["C"])
        tm.assert_frame_equal(result, df.iloc[6:])

        result = store.select("df", "C<'-3.5D'")
        result = result.iloc[1:]
        tm.assert_frame_equal(result, df.iloc[4:])

        # fixed
        _maybe_remove(store, "df2")
        store.put("df2", df)
        result = store.select("df2")
        tm.assert_frame_equal(result, df)


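# append_to_multiple splits one frame across several tables: the dict maps
# table name -> column subset, with None taking the remaining columns, and
# the selector table drives row selection; select_as_multiple joins the
# tables back together.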
def test_append_to_multiple(setup_path):
    df1 = tm.makeTimeDataFrame()
    df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    df2["foo"] = "bar"
    df = concat([df1, df2], axis=1)

    with ensure_clean_store(setup_path) as store:
        # exceptions
        msg = "append_to_multiple requires a selector that is in passed dict"
        with pytest.raises(ValueError, match=msg):
            store.append_to_multiple(
                {"df1": ["A", "B"], "df2": None}, df, selector="df3"
            )

        with pytest.raises(ValueError, match=msg):
            store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3")

        msg = (
            "append_to_multiple must have a dictionary specified as the way to "
            "split the value"
        )
        with pytest.raises(ValueError, match=msg):
            store.append_to_multiple("df1", df, "df1")

        # regular operation
        store.append_to_multiple(
            {"df1": ["A", "B"], "df2": None}, df, selector="df1"
        )
        result = store.select_as_multiple(
            ["df1", "df2"], where=["A>0", "B>0"], selector="df1"
        )
        expected = df[(df.A > 0) & (df.B > 0)]
        tm.assert_frame_equal(result, expected)


def test_append_to_multiple_dropna(setup_path):
    df1 = tm.makeTimeDataFrame()
    df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
    df = concat([df1, df2], axis=1)

    with ensure_clean_store(setup_path) as store:
        # dropna=True should guarantee rows are synchronized
        store.append_to_multiple(
            {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True
        )
        result = store.select_as_multiple(["df1", "df2"])
        expected = df.dropna()
        tm.assert_frame_equal(result, expected, check_index_type=True)
        tm.assert_index_equal(store.select("df1").index, store.select("df2").index)


def test_append_to_multiple_dropna_false(setup_path):
    df1 = tm.makeTimeDataFrame()
    df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
    df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
    df = concat([df1, df2], axis=1)

    with ensure_clean_store(setup_path) as store, pd.option_context(
        "io.hdf.dropna_table", True
    ):
        # dropna=False shouldn't synchronize row indexes
        store.append_to_multiple(
            {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False
        )

        msg = "all tables must have exactly the same nrows!"
        with pytest.raises(ValueError, match=msg):
            store.select_as_multiple(["df1a", "df2a"])

        assert not store.select("df1a").index.equals(store.select("df2a").index)


def test_append_to_multiple_min_itemsize(setup_path):
    # GH 11238
    df = DataFrame(
        {
            "IX": np.arange(1, 21),
            "Num": np.arange(1, 21),
            "BigNum": np.arange(1, 21) * 88,
            "Str": ["a" for _ in range(20)],
            "LongStr": ["abcde" for _ in range(20)],
        }
    )
    expected = df.iloc[[0]]

    with ensure_clean_store(setup_path) as store:
        store.append_to_multiple(
            {
                "index": ["IX"],
                "nums": ["Num", "BigNum"],
                "strs": ["Str", "LongStr"],
            },
            df.iloc[[0]],
            "index",
            min_itemsize={"Str": 10, "LongStr": 100, "Num": 2},
        )
        result = store.select_as_multiple(["index", "nums", "strs"])
        tm.assert_frame_equal(result, expected, check_index_type=True)