123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216 |
- import numpy as np
- import pytest
- from pandas import (
- Categorical,
- DataFrame,
- Series,
- _testing as tm,
- concat,
- read_hdf,
- )
- from pandas.tests.io.pytables.common import (
- _maybe_remove,
- ensure_clean_store,
- )
# Module-level pytest marks: every test collected from this file is tagged
# with ``single_cpu``.
pytestmark = [pytest.mark.single_cpu]
def test_categorical(setup_path):
    """Exercise categorical data in an HDFStore from write to removal.

    The scenarios run in sequence against a single store:

    * round trip of unordered and ordered categorical Series, and of a
      frame holding a categorical column;
    * numeric categories, with and without a NaN entry;
    * a frame with more than one categorical column, including a check of
      the metadata nodes listed by ``store.info()``;
    * ``where`` queries against a categorical data column;
    * appending with identical categories (allowed) and with different
      categories (must raise ``ValueError``);
    * removing a node also removes the category metadata stored below it.
    """
    letters = ["a", "b", "b", "a", "a", "c"]
    letter_categories = ["a", "b", "c", "d"]

    def make_letter_series(ordered):
        # "d" is declared as a category but never appears in the values.
        return Series(
            Categorical(
                list(letters),
                categories=list(letter_categories),
                ordered=ordered,
            )
        )

    with ensure_clean_store(setup_path) as store:
        # Basic: unordered Series
        _maybe_remove(store, "s")
        unordered = make_letter_series(ordered=False)
        store.append("s", unordered, format="table")
        roundtrip = store.select("s")
        tm.assert_series_equal(unordered, roundtrip)

        # Basic: ordered Series
        _maybe_remove(store, "s_ordered")
        ordered = make_letter_series(ordered=True)
        store.append("s_ordered", ordered, format="table")
        roundtrip = store.select("s_ordered")
        tm.assert_series_equal(ordered, roundtrip)

        # Basic: frame built around the ordered Series
        _maybe_remove(store, "df")
        frame = DataFrame({"s": ordered, "vals": [1, 2, 3, 4, 5, 6]})
        store.append("df", frame, format="table")
        roundtrip = store.select("df")
        tm.assert_frame_equal(roundtrip, frame)

        # Dtypes: integer categories
        _maybe_remove(store, "si")
        int_cat = Series([1, 1, 2, 2, 3, 4, 5]).astype("category")
        store.append("si", int_cat)
        roundtrip = store.select("si")
        tm.assert_series_equal(roundtrip, int_cat)

        # Dtypes: numeric categories with a missing value
        _maybe_remove(store, "si2")
        nan_cat = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category")
        store.append("si2", nan_cat)
        roundtrip = store.select("si2")
        tm.assert_series_equal(roundtrip, nan_cat)

        # Multiple categorical columns in one frame
        _maybe_remove(store, "df2")
        two_cat_frame = frame.copy()
        two_cat_frame["s2"] = Series(list("abcdefg")).astype("category")
        store.append("df2", two_cat_frame)
        roundtrip = store.select("df2")
        tm.assert_frame_equal(roundtrip, two_cat_frame)

        # Make sure the metadata is OK
        store_info = store.info()
        assert "/df2 " in store_info
        # Blocks 0 and 2 of two_cat_frame._mgr.blocks are the Categorical ones
        assert "/df2/meta/values_block_0/meta" in store_info
        assert "/df2/meta/values_block_2/meta" in store_info

        # Unordered, stored under a different key
        _maybe_remove(store, "s2")
        unordered = make_letter_series(ordered=False)
        store.append("s2", unordered, format="table")
        roundtrip = store.select("s2")
        tm.assert_series_equal(roundtrip, unordered)

        # Query on the categorical data column. "d" is a category without
        # any rows and "f" is not a category, so both select nothing.
        _maybe_remove(store, "df3")
        store.append("df3", frame, data_columns=["s"])
        query_cases = [
            (["b", "c"], 's in ["b","c"]'),
            (["b", "c"], 's = ["b","c"]'),
            (["d"], 's in ["d"]'),
            (["f"], 's in ["f"]'),
        ]
        for members, clause in query_cases:
            expected = frame[frame.s.isin(members)]
            roundtrip = store.select("df3", where=[clause])
            tm.assert_frame_equal(roundtrip, expected)

        # Appending with same categories is ok
        store.append("df3", frame)
        frame = concat([frame, frame])
        expected = frame[frame.s.isin(["b", "c"])]
        roundtrip = store.select("df3", where=['s in ["b","c"]'])
        tm.assert_frame_equal(roundtrip, expected)

        # Appending must have the same categories
        mismatched = frame.copy()
        mismatched["s"] = mismatched["s"].cat.remove_unused_categories()
        msg = "cannot append a categorical with different categories to the existing"
        with pytest.raises(ValueError, match=msg):
            store.append("df3", mismatched)

        # Remove, and make sure the metadata is removed as well (it's a
        # recursive removal, so it should be).
        assert store.select("df3/meta/s/meta") is not None
        store.remove("df3")
        with pytest.raises(
            KeyError, match="'No object named df3/meta/s/meta in the file'"
        ):
            store.select("df3/meta/s/meta")
def test_categorical_conversion(tmp_path, setup_path):
    """Check that a ``where`` clause matching nothing returns no rows.

    GH13322: ``read_hdf`` on a frame with categorical columns must not
    return rows when the ``where`` criteria isn't met. The same check is run
    first with plain string columns and then with the string columns
    converted to ``category``.

    Parameters
    ----------
    tmp_path
        Directory in which the HDF5 file is created; joined with
        ``setup_path`` using ``/``.
    setup_path
        File name of the HDF5 file inside ``tmp_path``.
    """
    obsids = ["ESP_012345_6789", "ESP_987654_3210"]
    imgids = ["APF00006np", "APF0001imm"]
    data = [4.3, 9.8]

    # Both halves of the test write to the same file and key.
    path = tmp_path / setup_path

    # Test without categories
    df = DataFrame({"obsids": obsids, "imgids": imgids, "data": data})

    # We are expecting an empty DataFrame matching types of df
    expected = df.iloc[[], :]
    # ``key`` is passed by keyword: positional use is deprecated in to_hdf.
    df.to_hdf(path, key="df", format="table", data_columns=True)
    result = read_hdf(path, "df", where="obsids=B")
    tm.assert_frame_equal(result, expected)

    # Test with categories
    df["obsids"] = df["obsids"].astype("category")
    df["imgids"] = df["imgids"].astype("category")

    # We are expecting an empty DataFrame matching types of df
    expected = df.iloc[[], :]
    df.to_hdf(path, key="df", format="table", data_columns=True)
    result = read_hdf(path, "df", where="obsids=B")
    tm.assert_frame_equal(result, expected)
def test_categorical_nan_only_columns(tmp_path, setup_path):
    """Check that NaN-only categorical columns survive an HDF round trip.

    GH18413: a frame whose categorical columns contain only missing values
    must be readable with ``read_hdf`` and compare equal to what was written.

    Parameters
    ----------
    tmp_path
        Directory in which the HDF5 file is created; joined with
        ``setup_path`` using ``/``.
    setup_path
        File name of the HDF5 file inside ``tmp_path``.
    """
    df = DataFrame(
        {
            "a": ["a", "b", "c", np.nan],
            "b": [np.nan, np.nan, np.nan, np.nan],
            "c": [1, 2, 3, 4],
            "d": Series([None] * 4, dtype=object),
        }
    )
    df["a"] = df.a.astype("category")
    df["b"] = df.b.astype("category")
    # NOTE(review): "d" is overwritten with a conversion of column "b", not
    # of "d" itself, so the object-dtype values built above are never used.
    # Kept as-is because switching to ``df.d`` may change the expected
    # category dtype -- confirm the intent before touching this line.
    df["d"] = df.b.astype("category")
    expected = df

    path = tmp_path / setup_path
    # ``key`` is passed by keyword: positional use is deprecated in to_hdf.
    df.to_hdf(path, key="df", format="table", data_columns=True)
    result = read_hdf(path, "df")
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "where, df, expected",
    [
        ('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})),
        ('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})),
    ],
)
def test_convert_value(
    tmp_path, setup_path, where: str, df: DataFrame, expected: DataFrame
):
    """Check that ``read_hdf`` can filter a categorical column via ``where``.

    GH39420. One case filters on a value that is absent from the column
    (``"q"``) and one on a value that is present (``"a"``).

    Parameters
    ----------
    tmp_path
        Directory in which the HDF5 file is created; joined with
        ``setup_path`` using ``/``.
    setup_path
        File name of the HDF5 file inside ``tmp_path``.
    where
        Selection expression handed to ``read_hdf``.
    df
        Frame to write; its ``col`` column is converted to ``category``
        in place before writing.
    expected
        Rows expected back; its ``col`` column is converted in place to a
        categorical carrying the sorted unique values of ``df.col`` as
        categories.
    """
    df["col"] = df["col"].astype("category")
    max_widths = {"col": 1}
    categorical_values = sorted(df["col"].unique())
    # Give ``expected`` the full category set of the written column,
    # whichever rows the filter keeps.
    expected["col"] = expected["col"].astype("category")
    expected["col"] = expected["col"].cat.set_categories(categorical_values)

    path = tmp_path / setup_path
    # ``key`` is passed by keyword: positional use is deprecated in to_hdf.
    df.to_hdf(path, key="df", format="table", min_itemsize=max_widths)
    # No key is passed to read_hdf; "df" is the only object written to the file.
    result = read_hdf(path, where=where)
    tm.assert_frame_equal(result, expected)
|