123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910 |
- import datetime
- from datetime import timedelta
- import re
- from warnings import catch_warnings
- import numpy as np
- import pytest
- from pandas._libs.tslibs import Timestamp
- import pandas.util._test_decorators as td
- import pandas as pd
- from pandas import (
- DataFrame,
- Series,
- _testing as tm,
- concat,
- date_range,
- read_hdf,
- )
- from pandas.tests.io.pytables.common import (
- _maybe_remove,
- ensure_clean_store,
- )
- pytestmark = pytest.mark.single_cpu
- def test_append(setup_path):
- with ensure_clean_store(setup_path) as store:
- # this is allowed by almost always don't want to do it
- # tables.NaturalNameWarning):
- with catch_warnings(record=True):
- df = tm.makeTimeDataFrame()
- _maybe_remove(store, "df1")
- store.append("df1", df[:10])
- store.append("df1", df[10:])
- tm.assert_frame_equal(store["df1"], df)
- _maybe_remove(store, "df2")
- store.put("df2", df[:10], format="table")
- store.append("df2", df[10:])
- tm.assert_frame_equal(store["df2"], df)
- _maybe_remove(store, "df3")
- store.append("/df3", df[:10])
- store.append("/df3", df[10:])
- tm.assert_frame_equal(store["df3"], df)
- # this is allowed by almost always don't want to do it
- # tables.NaturalNameWarning
- _maybe_remove(store, "/df3 foo")
- store.append("/df3 foo", df[:10])
- store.append("/df3 foo", df[10:])
- tm.assert_frame_equal(store["df3 foo"], df)
- # dtype issues - mizxed type in a single object column
- df = DataFrame(data=[[1, 2], [0, 1], [1, 2], [0, 0]])
- df["mixed_column"] = "testing"
- df.loc[2, "mixed_column"] = np.nan
- _maybe_remove(store, "df")
- store.append("df", df)
- tm.assert_frame_equal(store["df"], df)
- # uints - test storage of uints
- uint_data = DataFrame(
- {
- "u08": Series(
- np.random.randint(0, high=255, size=5), dtype=np.uint8
- ),
- "u16": Series(
- np.random.randint(0, high=65535, size=5), dtype=np.uint16
- ),
- "u32": Series(
- np.random.randint(0, high=2**30, size=5), dtype=np.uint32
- ),
- "u64": Series(
- [2**58, 2**59, 2**60, 2**61, 2**62],
- dtype=np.uint64,
- ),
- },
- index=np.arange(5),
- )
- _maybe_remove(store, "uints")
- store.append("uints", uint_data)
- tm.assert_frame_equal(store["uints"], uint_data, check_index_type=True)
- # uints - test storage of uints in indexable columns
- _maybe_remove(store, "uints")
- # 64-bit indices not yet supported
- store.append("uints", uint_data, data_columns=["u08", "u16", "u32"])
- tm.assert_frame_equal(store["uints"], uint_data, check_index_type=True)
- def test_append_series(setup_path):
- with ensure_clean_store(setup_path) as store:
- # basic
- ss = tm.makeStringSeries()
- ts = tm.makeTimeSeries()
- ns = Series(np.arange(100))
- store.append("ss", ss)
- result = store["ss"]
- tm.assert_series_equal(result, ss)
- assert result.name is None
- store.append("ts", ts)
- result = store["ts"]
- tm.assert_series_equal(result, ts)
- assert result.name is None
- ns.name = "foo"
- store.append("ns", ns)
- result = store["ns"]
- tm.assert_series_equal(result, ns)
- assert result.name == ns.name
- # select on the values
- expected = ns[ns > 60]
- result = store.select("ns", "foo>60")
- tm.assert_series_equal(result, expected)
- # select on the index and values
- expected = ns[(ns > 70) & (ns.index < 90)]
- result = store.select("ns", "foo>70 and index<90")
- tm.assert_series_equal(result, expected, check_index_type=True)
- # multi-index
- mi = DataFrame(np.random.randn(5, 1), columns=["A"])
- mi["B"] = np.arange(len(mi))
- mi["C"] = "foo"
- mi.loc[3:5, "C"] = "bar"
- mi.set_index(["C", "B"], inplace=True)
- s = mi.stack()
- s.index = s.index.droplevel(2)
- store.append("mi", s)
- tm.assert_series_equal(store["mi"], s, check_index_type=True)
- def test_append_some_nans(setup_path):
- with ensure_clean_store(setup_path) as store:
- df = DataFrame(
- {
- "A": Series(np.random.randn(20)).astype("int32"),
- "A1": np.random.randn(20),
- "A2": np.random.randn(20),
- "B": "foo",
- "C": "bar",
- "D": Timestamp("20010101"),
- "E": datetime.datetime(2001, 1, 2, 0, 0),
- },
- index=np.arange(20),
- )
- # some nans
- _maybe_remove(store, "df1")
- df.loc[0:15, ["A1", "B", "D", "E"]] = np.nan
- store.append("df1", df[:10])
- store.append("df1", df[10:])
- tm.assert_frame_equal(store["df1"], df, check_index_type=True)
- # first column
- df1 = df.copy()
- df1["A1"] = np.nan
- _maybe_remove(store, "df1")
- store.append("df1", df1[:10])
- store.append("df1", df1[10:])
- tm.assert_frame_equal(store["df1"], df1, check_index_type=True)
- # 2nd column
- df2 = df.copy()
- df2["A2"] = np.nan
- _maybe_remove(store, "df2")
- store.append("df2", df2[:10])
- store.append("df2", df2[10:])
- tm.assert_frame_equal(store["df2"], df2, check_index_type=True)
- # datetimes
- df3 = df.copy()
- df3["E"] = np.nan
- _maybe_remove(store, "df3")
- store.append("df3", df3[:10])
- store.append("df3", df3[10:])
- tm.assert_frame_equal(store["df3"], df3, check_index_type=True)
- def test_append_all_nans(setup_path):
- with ensure_clean_store(setup_path) as store:
- df = DataFrame(
- {"A1": np.random.randn(20), "A2": np.random.randn(20)},
- index=np.arange(20),
- )
- df.loc[0:15, :] = np.nan
- # nan some entire rows (dropna=True)
- _maybe_remove(store, "df")
- store.append("df", df[:10], dropna=True)
- store.append("df", df[10:], dropna=True)
- tm.assert_frame_equal(store["df"], df[-4:], check_index_type=True)
- # nan some entire rows (dropna=False)
- _maybe_remove(store, "df2")
- store.append("df2", df[:10], dropna=False)
- store.append("df2", df[10:], dropna=False)
- tm.assert_frame_equal(store["df2"], df, check_index_type=True)
- # tests the option io.hdf.dropna_table
- with pd.option_context("io.hdf.dropna_table", False):
- _maybe_remove(store, "df3")
- store.append("df3", df[:10])
- store.append("df3", df[10:])
- tm.assert_frame_equal(store["df3"], df)
- with pd.option_context("io.hdf.dropna_table", True):
- _maybe_remove(store, "df4")
- store.append("df4", df[:10])
- store.append("df4", df[10:])
- tm.assert_frame_equal(store["df4"], df[-4:])
- # nan some entire rows (string are still written!)
- df = DataFrame(
- {
- "A1": np.random.randn(20),
- "A2": np.random.randn(20),
- "B": "foo",
- "C": "bar",
- },
- index=np.arange(20),
- )
- df.loc[0:15, :] = np.nan
- _maybe_remove(store, "df")
- store.append("df", df[:10], dropna=True)
- store.append("df", df[10:], dropna=True)
- tm.assert_frame_equal(store["df"], df, check_index_type=True)
- _maybe_remove(store, "df2")
- store.append("df2", df[:10], dropna=False)
- store.append("df2", df[10:], dropna=False)
- tm.assert_frame_equal(store["df2"], df, check_index_type=True)
- # nan some entire rows (but since we have dates they are still
- # written!)
- df = DataFrame(
- {
- "A1": np.random.randn(20),
- "A2": np.random.randn(20),
- "B": "foo",
- "C": "bar",
- "D": Timestamp("20010101"),
- "E": datetime.datetime(2001, 1, 2, 0, 0),
- },
- index=np.arange(20),
- )
- df.loc[0:15, :] = np.nan
- _maybe_remove(store, "df")
- store.append("df", df[:10], dropna=True)
- store.append("df", df[10:], dropna=True)
- tm.assert_frame_equal(store["df"], df, check_index_type=True)
- _maybe_remove(store, "df2")
- store.append("df2", df[:10], dropna=False)
- store.append("df2", df[10:], dropna=False)
- tm.assert_frame_equal(store["df2"], df, check_index_type=True)
- def test_append_frame_column_oriented(setup_path):
- with ensure_clean_store(setup_path) as store:
- # column oriented
- df = tm.makeTimeDataFrame()
- df.index = df.index._with_freq(None) # freq doesn't round-trip
- _maybe_remove(store, "df1")
- store.append("df1", df.iloc[:, :2], axes=["columns"])
- store.append("df1", df.iloc[:, 2:])
- tm.assert_frame_equal(store["df1"], df)
- result = store.select("df1", "columns=A")
- expected = df.reindex(columns=["A"])
- tm.assert_frame_equal(expected, result)
- # selection on the non-indexable
- result = store.select("df1", ("columns=A", "index=df.index[0:4]"))
- expected = df.reindex(columns=["A"], index=df.index[0:4])
- tm.assert_frame_equal(expected, result)
- # this isn't supported
- msg = re.escape(
- "passing a filterable condition to a non-table indexer "
- "[Filter: Not Initialized]"
- )
- with pytest.raises(TypeError, match=msg):
- store.select("df1", "columns=A and index>df.index[4]")
- def test_append_with_different_block_ordering(setup_path):
- # GH 4096; using same frames, but different block orderings
- with ensure_clean_store(setup_path) as store:
- for i in range(10):
- df = DataFrame(np.random.randn(10, 2), columns=list("AB"))
- df["index"] = range(10)
- df["index"] += i * 10
- df["int64"] = Series([1] * len(df), dtype="int64")
- df["int16"] = Series([1] * len(df), dtype="int16")
- if i % 2 == 0:
- del df["int64"]
- df["int64"] = Series([1] * len(df), dtype="int64")
- if i % 3 == 0:
- a = df.pop("A")
- df["A"] = a
- df.set_index("index", inplace=True)
- store.append("df", df)
- # test a different ordering but with more fields (like invalid
- # combinations)
- with ensure_clean_store(setup_path) as store:
- df = DataFrame(np.random.randn(10, 2), columns=list("AB"), dtype="float64")
- df["int64"] = Series([1] * len(df), dtype="int64")
- df["int16"] = Series([1] * len(df), dtype="int16")
- store.append("df", df)
- # store additional fields in different blocks
- df["int16_2"] = Series([1] * len(df), dtype="int16")
- msg = re.escape(
- "cannot match existing table structure for [int16] on appending data"
- )
- with pytest.raises(ValueError, match=msg):
- store.append("df", df)
- # store multiple additional fields in different blocks
- df["float_3"] = Series([1.0] * len(df), dtype="float64")
- msg = re.escape(
- "cannot match existing table structure for [A,B] on appending data"
- )
- with pytest.raises(ValueError, match=msg):
- store.append("df", df)
- def test_append_with_strings(setup_path):
- with ensure_clean_store(setup_path) as store:
- with catch_warnings(record=True):
- def check_col(key, name, size):
- assert (
- getattr(store.get_storer(key).table.description, name).itemsize
- == size
- )
- # avoid truncation on elements
- df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
- store.append("df_big", df)
- tm.assert_frame_equal(store.select("df_big"), df)
- check_col("df_big", "values_block_1", 15)
- # appending smaller string ok
- df2 = DataFrame([[124, "asdqy"], [346, "dggnhefbdfb"]])
- store.append("df_big", df2)
- expected = concat([df, df2])
- tm.assert_frame_equal(store.select("df_big"), expected)
- check_col("df_big", "values_block_1", 15)
- # avoid truncation on elements
- df = DataFrame([[123, "asdqwerty"], [345, "dggnhebbsdfbdfb"]])
- store.append("df_big2", df, min_itemsize={"values": 50})
- tm.assert_frame_equal(store.select("df_big2"), df)
- check_col("df_big2", "values_block_1", 50)
- # bigger string on next append
- store.append("df_new", df)
- df_new = DataFrame(
- [[124, "abcdefqhij"], [346, "abcdefghijklmnopqrtsuvwxyz"]]
- )
- msg = (
- r"Trying to store a string with len \[26\] in "
- r"\[values_block_1\] column but\n"
- r"this column has a limit of \[15\]!\n"
- "Consider using min_itemsize to preset the sizes on these "
- "columns"
- )
- with pytest.raises(ValueError, match=msg):
- store.append("df_new", df_new)
- # min_itemsize on Series index (GH 11412)
- df = tm.makeMixedDataFrame().set_index("C")
- store.append("ss", df["B"], min_itemsize={"index": 4})
- tm.assert_series_equal(store.select("ss"), df["B"])
- # same as above, with data_columns=True
- store.append("ss2", df["B"], data_columns=True, min_itemsize={"index": 4})
- tm.assert_series_equal(store.select("ss2"), df["B"])
- # min_itemsize in index without appending (GH 10381)
- store.put("ss3", df, format="table", min_itemsize={"index": 6})
- # just make sure there is a longer string:
- df2 = df.copy().reset_index().assign(C="longer").set_index("C")
- store.append("ss3", df2)
- tm.assert_frame_equal(store.select("ss3"), concat([df, df2]))
- # same as above, with a Series
- store.put("ss4", df["B"], format="table", min_itemsize={"index": 6})
- store.append("ss4", df2["B"])
- tm.assert_series_equal(store.select("ss4"), concat([df["B"], df2["B"]]))
- # with nans
- _maybe_remove(store, "df")
- df = tm.makeTimeDataFrame()
- df["string"] = "foo"
- df.loc[df.index[1:4], "string"] = np.nan
- df["string2"] = "bar"
- df.loc[df.index[4:8], "string2"] = np.nan
- df["string3"] = "bah"
- df.loc[df.index[1:], "string3"] = np.nan
- store.append("df", df)
- result = store.select("df")
- tm.assert_frame_equal(result, df)
- with ensure_clean_store(setup_path) as store:
- df = DataFrame({"A": "foo", "B": "bar"}, index=range(10))
- # a min_itemsize that creates a data_column
- _maybe_remove(store, "df")
- store.append("df", df, min_itemsize={"A": 200})
- check_col("df", "A", 200)
- assert store.get_storer("df").data_columns == ["A"]
- # a min_itemsize that creates a data_column2
- _maybe_remove(store, "df")
- store.append("df", df, data_columns=["B"], min_itemsize={"A": 200})
- check_col("df", "A", 200)
- assert store.get_storer("df").data_columns == ["B", "A"]
- # a min_itemsize that creates a data_column2
- _maybe_remove(store, "df")
- store.append("df", df, data_columns=["B"], min_itemsize={"values": 200})
- check_col("df", "B", 200)
- check_col("df", "values_block_0", 200)
- assert store.get_storer("df").data_columns == ["B"]
- # infer the .typ on subsequent appends
- _maybe_remove(store, "df")
- store.append("df", df[:5], min_itemsize=200)
- store.append("df", df[5:], min_itemsize=200)
- tm.assert_frame_equal(store["df"], df)
- # invalid min_itemsize keys
- df = DataFrame(["foo", "foo", "foo", "barh", "barh", "barh"], columns=["A"])
- _maybe_remove(store, "df")
- msg = re.escape(
- "min_itemsize has the key [foo] which is not an axis or data_column"
- )
- with pytest.raises(ValueError, match=msg):
- store.append("df", df, min_itemsize={"foo": 20, "foobar": 20})
- def test_append_with_empty_string(setup_path):
- with ensure_clean_store(setup_path) as store:
- # with all empty strings (GH 12242)
- df = DataFrame({"x": ["a", "b", "c", "d", "e", "f", ""]})
- store.append("df", df[:-1], min_itemsize={"x": 1})
- store.append("df", df[-1:], min_itemsize={"x": 1})
- tm.assert_frame_equal(store.select("df"), df)
- def test_append_with_data_columns(setup_path):
- with ensure_clean_store(setup_path) as store:
- df = tm.makeTimeDataFrame()
- df.iloc[0, df.columns.get_loc("B")] = 1.0
- _maybe_remove(store, "df")
- store.append("df", df[:2], data_columns=["B"])
- store.append("df", df[2:])
- tm.assert_frame_equal(store["df"], df)
- # check that we have indices created
- assert store._handle.root.df.table.cols.index.is_indexed is True
- assert store._handle.root.df.table.cols.B.is_indexed is True
- # data column searching
- result = store.select("df", "B>0")
- expected = df[df.B > 0]
- tm.assert_frame_equal(result, expected)
- # data column searching (with an indexable and a data_columns)
- result = store.select("df", "B>0 and index>df.index[3]")
- df_new = df.reindex(index=df.index[4:])
- expected = df_new[df_new.B > 0]
- tm.assert_frame_equal(result, expected)
- # data column selection with a string data_column
- df_new = df.copy()
- df_new["string"] = "foo"
- df_new.loc[df_new.index[1:4], "string"] = np.nan
- df_new.loc[df_new.index[5:6], "string"] = "bar"
- _maybe_remove(store, "df")
- store.append("df", df_new, data_columns=["string"])
- result = store.select("df", "string='foo'")
- expected = df_new[df_new.string == "foo"]
- tm.assert_frame_equal(result, expected)
- # using min_itemsize and a data column
- def check_col(key, name, size):
- assert (
- getattr(store.get_storer(key).table.description, name).itemsize == size
- )
- with ensure_clean_store(setup_path) as store:
- _maybe_remove(store, "df")
- store.append("df", df_new, data_columns=["string"], min_itemsize={"string": 30})
- check_col("df", "string", 30)
- _maybe_remove(store, "df")
- store.append("df", df_new, data_columns=["string"], min_itemsize=30)
- check_col("df", "string", 30)
- _maybe_remove(store, "df")
- store.append("df", df_new, data_columns=["string"], min_itemsize={"values": 30})
- check_col("df", "string", 30)
- with ensure_clean_store(setup_path) as store:
- df_new["string2"] = "foobarbah"
- df_new["string_block1"] = "foobarbah1"
- df_new["string_block2"] = "foobarbah2"
- _maybe_remove(store, "df")
- store.append(
- "df",
- df_new,
- data_columns=["string", "string2"],
- min_itemsize={"string": 30, "string2": 40, "values": 50},
- )
- check_col("df", "string", 30)
- check_col("df", "string2", 40)
- check_col("df", "values_block_1", 50)
- with ensure_clean_store(setup_path) as store:
- # multiple data columns
- df_new = df.copy()
- df_new.iloc[0, df_new.columns.get_loc("A")] = 1.0
- df_new.iloc[0, df_new.columns.get_loc("B")] = -1.0
- df_new["string"] = "foo"
- sl = df_new.columns.get_loc("string")
- df_new.iloc[1:4, sl] = np.nan
- df_new.iloc[5:6, sl] = "bar"
- df_new["string2"] = "foo"
- sl = df_new.columns.get_loc("string2")
- df_new.iloc[2:5, sl] = np.nan
- df_new.iloc[7:8, sl] = "bar"
- _maybe_remove(store, "df")
- store.append("df", df_new, data_columns=["A", "B", "string", "string2"])
- result = store.select("df", "string='foo' and string2='foo' and A>0 and B<0")
- expected = df_new[
- (df_new.string == "foo")
- & (df_new.string2 == "foo")
- & (df_new.A > 0)
- & (df_new.B < 0)
- ]
- tm.assert_frame_equal(result, expected, check_freq=False)
- # FIXME: 2020-05-07 freq check randomly fails in the CI
- # yield an empty frame
- result = store.select("df", "string='foo' and string2='cool'")
- expected = df_new[(df_new.string == "foo") & (df_new.string2 == "cool")]
- tm.assert_frame_equal(result, expected)
- with ensure_clean_store(setup_path) as store:
- # doc example
- df_dc = df.copy()
- df_dc["string"] = "foo"
- df_dc.loc[df_dc.index[4:6], "string"] = np.nan
- df_dc.loc[df_dc.index[7:9], "string"] = "bar"
- df_dc["string2"] = "cool"
- df_dc["datetime"] = Timestamp("20010102")
- df_dc.loc[df_dc.index[3:5], ["A", "B", "datetime"]] = np.nan
- _maybe_remove(store, "df_dc")
- store.append(
- "df_dc", df_dc, data_columns=["B", "C", "string", "string2", "datetime"]
- )
- result = store.select("df_dc", "B>0")
- expected = df_dc[df_dc.B > 0]
- tm.assert_frame_equal(result, expected)
- result = store.select("df_dc", ["B > 0", "C > 0", "string == foo"])
- expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
- tm.assert_frame_equal(result, expected, check_freq=False)
- # FIXME: 2020-12-07 intermittent build failures here with freq of
- # None instead of BDay(4)
- with ensure_clean_store(setup_path) as store:
- # doc example part 2
- np.random.seed(1234)
- index = date_range("1/1/2000", periods=8)
- df_dc = DataFrame(np.random.randn(8, 3), index=index, columns=["A", "B", "C"])
- df_dc["string"] = "foo"
- df_dc.loc[df_dc.index[4:6], "string"] = np.nan
- df_dc.loc[df_dc.index[7:9], "string"] = "bar"
- df_dc[["B", "C"]] = df_dc[["B", "C"]].abs()
- df_dc["string2"] = "cool"
- # on-disk operations
- store.append("df_dc", df_dc, data_columns=["B", "C", "string", "string2"])
- result = store.select("df_dc", "B>0")
- expected = df_dc[df_dc.B > 0]
- tm.assert_frame_equal(result, expected)
- result = store.select("df_dc", ["B > 0", "C > 0", 'string == "foo"'])
- expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == "foo")]
- tm.assert_frame_equal(result, expected)
- def test_append_hierarchical(tmp_path, setup_path, multiindex_dataframe_random_data):
- df = multiindex_dataframe_random_data
- df.columns.name = None
- with ensure_clean_store(setup_path) as store:
- store.append("mi", df)
- result = store.select("mi")
- tm.assert_frame_equal(result, df)
- # GH 3748
- result = store.select("mi", columns=["A", "B"])
- expected = df.reindex(columns=["A", "B"])
- tm.assert_frame_equal(result, expected)
- path = tmp_path / "test.hdf"
- df.to_hdf(path, "df", format="table")
- result = read_hdf(path, "df", columns=["A", "B"])
- expected = df.reindex(columns=["A", "B"])
- tm.assert_frame_equal(result, expected)
- def test_append_misc(setup_path):
- with ensure_clean_store(setup_path) as store:
- df = tm.makeDataFrame()
- store.append("df", df, chunksize=1)
- result = store.select("df")
- tm.assert_frame_equal(result, df)
- store.append("df1", df, expectedrows=10)
- result = store.select("df1")
- tm.assert_frame_equal(result, df)
- @pytest.mark.parametrize("chunksize", [10, 200, 1000])
- def test_append_misc_chunksize(setup_path, chunksize):
- # more chunksize in append tests
- df = tm.makeDataFrame()
- df["string"] = "foo"
- df["float322"] = 1.0
- df["float322"] = df["float322"].astype("float32")
- df["bool"] = df["float322"] > 0
- df["time1"] = Timestamp("20130101")
- df["time2"] = Timestamp("20130102")
- with ensure_clean_store(setup_path, mode="w") as store:
- store.append("obj", df, chunksize=chunksize)
- result = store.select("obj")
- tm.assert_frame_equal(result, df)
- def test_append_misc_empty_frame(setup_path):
- # empty frame, GH4273
- with ensure_clean_store(setup_path) as store:
- # 0 len
- df_empty = DataFrame(columns=list("ABC"))
- store.append("df", df_empty)
- with pytest.raises(KeyError, match="'No object named df in the file'"):
- store.select("df")
- # repeated append of 0/non-zero frames
- df = DataFrame(np.random.rand(10, 3), columns=list("ABC"))
- store.append("df", df)
- tm.assert_frame_equal(store.select("df"), df)
- store.append("df", df_empty)
- tm.assert_frame_equal(store.select("df"), df)
- # store
- df = DataFrame(columns=list("ABC"))
- store.put("df2", df)
- tm.assert_frame_equal(store.select("df2"), df)
- # TODO(ArrayManager) currently we rely on falling back to BlockManager, but
- # the conversion from AM->BM converts the invalid object dtype column into
- # a datetime64 column no longer raising an error
- @td.skip_array_manager_not_yet_implemented
- def test_append_raise(setup_path):
- with ensure_clean_store(setup_path) as store:
- # test append with invalid input to get good error messages
- # list in column
- df = tm.makeDataFrame()
- df["invalid"] = [["a"]] * len(df)
- assert df.dtypes["invalid"] == np.object_
- msg = re.escape(
- """Cannot serialize the column [invalid]
- because its data contents are not [string] but [mixed] object dtype"""
- )
- with pytest.raises(TypeError, match=msg):
- store.append("df", df)
- # multiple invalid columns
- df["invalid2"] = [["a"]] * len(df)
- df["invalid3"] = [["a"]] * len(df)
- with pytest.raises(TypeError, match=msg):
- store.append("df", df)
- # datetime with embedded nans as object
- df = tm.makeDataFrame()
- s = Series(datetime.datetime(2001, 1, 2), index=df.index)
- s = s.astype(object)
- s[0:5] = np.nan
- df["invalid"] = s
- assert df.dtypes["invalid"] == np.object_
- msg = "too many timezones in this block, create separate data columns"
- with pytest.raises(TypeError, match=msg):
- store.append("df", df)
- # directly ndarray
- msg = "value must be None, Series, or DataFrame"
- with pytest.raises(TypeError, match=msg):
- store.append("df", np.arange(10))
- # series directly
- msg = re.escape(
- "cannot properly create the storer for: "
- "[group->df,value-><class 'pandas.core.series.Series'>]"
- )
- with pytest.raises(TypeError, match=msg):
- store.append("df", Series(np.arange(10)))
- # appending an incompatible table
- df = tm.makeDataFrame()
- store.append("df", df)
- df["foo"] = "foo"
- msg = re.escape(
- "invalid combination of [non_index_axes] on appending data "
- "[(1, ['A', 'B', 'C', 'D', 'foo'])] vs current table "
- "[(1, ['A', 'B', 'C', 'D'])]"
- )
- with pytest.raises(ValueError, match=msg):
- store.append("df", df)
- # incompatible type (GH 41897)
- _maybe_remove(store, "df")
- df["foo"] = Timestamp("20130101")
- store.append("df", df)
- df["foo"] = "bar"
- msg = re.escape(
- "invalid combination of [values_axes] on appending data "
- "[name->values_block_1,cname->values_block_1,"
- "dtype->bytes24,kind->string,shape->(1, 30)] "
- "vs current table "
- "[name->values_block_1,cname->values_block_1,"
- "dtype->datetime64,kind->datetime64,shape->None]"
- )
- with pytest.raises(ValueError, match=msg):
- store.append("df", df)
- def test_append_with_timedelta(setup_path):
- # GH 3577
- # append timedelta
- df = DataFrame(
- {
- "A": Timestamp("20130101"),
- "B": [
- Timestamp("20130101") + timedelta(days=i, seconds=10) for i in range(10)
- ],
- }
- )
- df["C"] = df["A"] - df["B"]
- df.loc[3:5, "C"] = np.nan
- with ensure_clean_store(setup_path) as store:
- # table
- _maybe_remove(store, "df")
- store.append("df", df, data_columns=True)
- result = store.select("df")
- tm.assert_frame_equal(result, df)
- result = store.select("df", where="C<100000")
- tm.assert_frame_equal(result, df)
- result = store.select("df", where="C<pd.Timedelta('-3D')")
- tm.assert_frame_equal(result, df.iloc[3:])
- result = store.select("df", "C<'-3D'")
- tm.assert_frame_equal(result, df.iloc[3:])
- # a bit hacky here as we don't really deal with the NaT properly
- result = store.select("df", "C<'-500000s'")
- result = result.dropna(subset=["C"])
- tm.assert_frame_equal(result, df.iloc[6:])
- result = store.select("df", "C<'-3.5D'")
- result = result.iloc[1:]
- tm.assert_frame_equal(result, df.iloc[4:])
- # fixed
- _maybe_remove(store, "df2")
- store.put("df2", df)
- result = store.select("df2")
- tm.assert_frame_equal(result, df)
- def test_append_to_multiple(setup_path):
- df1 = tm.makeTimeDataFrame()
- df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
- df2["foo"] = "bar"
- df = concat([df1, df2], axis=1)
- with ensure_clean_store(setup_path) as store:
- # exceptions
- msg = "append_to_multiple requires a selector that is in passed dict"
- with pytest.raises(ValueError, match=msg):
- store.append_to_multiple(
- {"df1": ["A", "B"], "df2": None}, df, selector="df3"
- )
- with pytest.raises(ValueError, match=msg):
- store.append_to_multiple({"df1": None, "df2": None}, df, selector="df3")
- msg = (
- "append_to_multiple must have a dictionary specified as the way to "
- "split the value"
- )
- with pytest.raises(ValueError, match=msg):
- store.append_to_multiple("df1", df, "df1")
- # regular operation
- store.append_to_multiple({"df1": ["A", "B"], "df2": None}, df, selector="df1")
- result = store.select_as_multiple(
- ["df1", "df2"], where=["A>0", "B>0"], selector="df1"
- )
- expected = df[(df.A > 0) & (df.B > 0)]
- tm.assert_frame_equal(result, expected)
- def test_append_to_multiple_dropna(setup_path):
- df1 = tm.makeTimeDataFrame()
- df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
- df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
- df = concat([df1, df2], axis=1)
- with ensure_clean_store(setup_path) as store:
- # dropna=True should guarantee rows are synchronized
- store.append_to_multiple(
- {"df1": ["A", "B"], "df2": None}, df, selector="df1", dropna=True
- )
- result = store.select_as_multiple(["df1", "df2"])
- expected = df.dropna()
- tm.assert_frame_equal(result, expected, check_index_type=True)
- tm.assert_index_equal(store.select("df1").index, store.select("df2").index)
- def test_append_to_multiple_dropna_false(setup_path):
- df1 = tm.makeTimeDataFrame()
- df2 = tm.makeTimeDataFrame().rename(columns="{}_2".format)
- df1.iloc[1, df1.columns.get_indexer(["A", "B"])] = np.nan
- df = concat([df1, df2], axis=1)
- with ensure_clean_store(setup_path) as store, pd.option_context(
- "io.hdf.dropna_table", True
- ):
- # dropna=False shouldn't synchronize row indexes
- store.append_to_multiple(
- {"df1a": ["A", "B"], "df2a": None}, df, selector="df1a", dropna=False
- )
- msg = "all tables must have exactly the same nrows!"
- with pytest.raises(ValueError, match=msg):
- store.select_as_multiple(["df1a", "df2a"])
- assert not store.select("df1a").index.equals(store.select("df2a").index)
- def test_append_to_multiple_min_itemsize(setup_path):
- # GH 11238
- df = DataFrame(
- {
- "IX": np.arange(1, 21),
- "Num": np.arange(1, 21),
- "BigNum": np.arange(1, 21) * 88,
- "Str": ["a" for _ in range(20)],
- "LongStr": ["abcde" for _ in range(20)],
- }
- )
- expected = df.iloc[[0]]
- with ensure_clean_store(setup_path) as store:
- store.append_to_multiple(
- {
- "index": ["IX"],
- "nums": ["Num", "BigNum"],
- "strs": ["Str", "LongStr"],
- },
- df.iloc[[0]],
- "index",
- min_itemsize={"Str": 10, "LongStr": 100, "Num": 2},
- )
- result = store.select_as_multiple(["index", "nums", "strs"])
- tm.assert_frame_equal(result, expected, check_index_type=True)
|