123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257 |
- """ test feather-format compat """
- import numpy as np
- import pytest
- import pandas as pd
- import pandas._testing as tm
- from pandas.core.arrays import (
- ArrowStringArray,
- StringArray,
- )
- from pandas.io.feather_format import read_feather, to_feather # isort:skip
- pyarrow = pytest.importorskip("pyarrow", minversion="1.0.1")
- @pytest.mark.single_cpu
- class TestFeather:
- def check_error_on_write(self, df, exc, err_msg):
- # check that we are raising the exception
- # on writing
- with pytest.raises(exc, match=err_msg):
- with tm.ensure_clean() as path:
- to_feather(df, path)
- def check_external_error_on_write(self, df):
- # check that we are raising the exception
- # on writing
- with tm.external_error_raised(Exception):
- with tm.ensure_clean() as path:
- to_feather(df, path)
- def check_round_trip(self, df, expected=None, write_kwargs={}, **read_kwargs):
- if expected is None:
- expected = df
- with tm.ensure_clean() as path:
- to_feather(df, path, **write_kwargs)
- result = read_feather(path, **read_kwargs)
- tm.assert_frame_equal(result, expected)
- def test_error(self):
- msg = "feather only support IO with DataFrames"
- for obj in [
- pd.Series([1, 2, 3]),
- 1,
- "foo",
- pd.Timestamp("20130101"),
- np.array([1, 2, 3]),
- ]:
- self.check_error_on_write(obj, ValueError, msg)
- def test_basic(self):
- df = pd.DataFrame(
- {
- "string": list("abc"),
- "int": list(range(1, 4)),
- "uint": np.arange(3, 6).astype("u1"),
- "float": np.arange(4.0, 7.0, dtype="float64"),
- "float_with_null": [1.0, np.nan, 3],
- "bool": [True, False, True],
- "bool_with_null": [True, np.nan, False],
- "cat": pd.Categorical(list("abc")),
- "dt": pd.DatetimeIndex(
- list(pd.date_range("20130101", periods=3)), freq=None
- ),
- "dttz": pd.DatetimeIndex(
- list(pd.date_range("20130101", periods=3, tz="US/Eastern")),
- freq=None,
- ),
- "dt_with_null": [
- pd.Timestamp("20130101"),
- pd.NaT,
- pd.Timestamp("20130103"),
- ],
- "dtns": pd.DatetimeIndex(
- list(pd.date_range("20130101", periods=3, freq="ns")), freq=None
- ),
- }
- )
- df["periods"] = pd.period_range("2013", freq="M", periods=3)
- df["timedeltas"] = pd.timedelta_range("1 day", periods=3)
- df["intervals"] = pd.interval_range(0, 3, 3)
- assert df.dttz.dtype.tz.zone == "US/Eastern"
- self.check_round_trip(df)
- def test_duplicate_columns(self):
- # https://github.com/wesm/feather/issues/53
- # not currently able to handle duplicate columns
- df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
- self.check_external_error_on_write(df)
- def test_stringify_columns(self):
- df = pd.DataFrame(np.arange(12).reshape(4, 3)).copy()
- msg = "feather must have string column names"
- self.check_error_on_write(df, ValueError, msg)
- def test_read_columns(self):
- # GH 24025
- df = pd.DataFrame(
- {
- "col1": list("abc"),
- "col2": list(range(1, 4)),
- "col3": list("xyz"),
- "col4": list(range(4, 7)),
- }
- )
- columns = ["col1", "col3"]
- self.check_round_trip(df, expected=df[columns], columns=columns)
- def test_read_columns_different_order(self):
- # GH 33878
- df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"], "C": [True, False]})
- expected = df[["B", "A"]]
- self.check_round_trip(df, expected, columns=["B", "A"])
- def test_unsupported_other(self):
- # mixed python objects
- df = pd.DataFrame({"a": ["a", 1, 2.0]})
- self.check_external_error_on_write(df)
- def test_rw_use_threads(self):
- df = pd.DataFrame({"A": np.arange(100000)})
- self.check_round_trip(df, use_threads=True)
- self.check_round_trip(df, use_threads=False)
- def test_write_with_index(self):
- df = pd.DataFrame({"A": [1, 2, 3]})
- self.check_round_trip(df)
- msg = (
- r"feather does not support serializing .* for the index; "
- r"you can \.reset_index\(\) to make the index into column\(s\)"
- )
- # non-default index
- for index in [
- [2, 3, 4],
- pd.date_range("20130101", periods=3),
- list("abc"),
- [1, 3, 4],
- pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)]),
- ]:
- df.index = index
- self.check_error_on_write(df, ValueError, msg)
- # index with meta-data
- df.index = [0, 1, 2]
- df.index.name = "foo"
- msg = "feather does not serialize index meta-data on a default index"
- self.check_error_on_write(df, ValueError, msg)
- # column multi-index
- df.index = [0, 1, 2]
- df.columns = pd.MultiIndex.from_tuples([("a", 1)])
- msg = "feather must have string column names"
- self.check_error_on_write(df, ValueError, msg)
- def test_path_pathlib(self):
- df = tm.makeDataFrame().reset_index()
- result = tm.round_trip_pathlib(df.to_feather, read_feather)
- tm.assert_frame_equal(df, result)
- def test_path_localpath(self):
- df = tm.makeDataFrame().reset_index()
- result = tm.round_trip_localpath(df.to_feather, read_feather)
- tm.assert_frame_equal(df, result)
- def test_passthrough_keywords(self):
- df = tm.makeDataFrame().reset_index()
- self.check_round_trip(df, write_kwargs={"version": 1})
- @pytest.mark.network
- @tm.network(
- url=(
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/data/feather/feather-0_3_1.feather"
- ),
- check_before_test=True,
- )
- def test_http_path(self, feather_file):
- # GH 29055
- url = (
- "https://raw.githubusercontent.com/pandas-dev/pandas/main/"
- "pandas/tests/io/data/feather/feather-0_3_1.feather"
- )
- expected = read_feather(feather_file)
- res = read_feather(url)
- tm.assert_frame_equal(expected, res)
- def test_read_feather_dtype_backend(self, string_storage, dtype_backend):
- # GH#50765
- pa = pytest.importorskip("pyarrow")
- df = pd.DataFrame(
- {
- "a": pd.Series([1, np.nan, 3], dtype="Int64"),
- "b": pd.Series([1, 2, 3], dtype="Int64"),
- "c": pd.Series([1.5, np.nan, 2.5], dtype="Float64"),
- "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
- "e": [True, False, None],
- "f": [True, False, True],
- "g": ["a", "b", "c"],
- "h": ["a", "b", None],
- }
- )
- if string_storage == "python":
- string_array = StringArray(np.array(["a", "b", "c"], dtype=np.object_))
- string_array_na = StringArray(np.array(["a", "b", pd.NA], dtype=np.object_))
- else:
- string_array = ArrowStringArray(pa.array(["a", "b", "c"]))
- string_array_na = ArrowStringArray(pa.array(["a", "b", None]))
- with tm.ensure_clean() as path:
- to_feather(df, path)
- with pd.option_context("mode.string_storage", string_storage):
- result = read_feather(path, dtype_backend=dtype_backend)
- expected = pd.DataFrame(
- {
- "a": pd.Series([1, np.nan, 3], dtype="Int64"),
- "b": pd.Series([1, 2, 3], dtype="Int64"),
- "c": pd.Series([1.5, np.nan, 2.5], dtype="Float64"),
- "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
- "e": pd.Series([True, False, pd.NA], dtype="boolean"),
- "f": pd.Series([True, False, True], dtype="boolean"),
- "g": string_array,
- "h": string_array_na,
- }
- )
- if dtype_backend == "pyarrow":
- from pandas.arrays import ArrowExtensionArray
- expected = pd.DataFrame(
- {
- col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
- for col in expected.columns
- }
- )
- tm.assert_frame_equal(result, expected)
- def test_invalid_dtype_backend(self):
- msg = (
- "dtype_backend numpy is invalid, only 'numpy_nullable' and "
- "'pyarrow' are allowed."
- )
- df = pd.DataFrame({"int": list(range(1, 4))})
- with tm.ensure_clean("tmp.feather") as path:
- df.to_feather(path)
- with pytest.raises(ValueError, match=msg):
- read_feather(path, dtype_backend="numpy")
|