123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278 |
- import numpy as np
- import pytest
- import pandas.util._test_decorators as td
- from pandas import (
- DataFrame,
- NaT,
- Series,
- Timestamp,
- date_range,
- period_range,
- )
- import pandas._testing as tm
- class TestDataFrameValues:
- @td.skip_array_manager_invalid_test
- def test_values(self, float_frame, using_copy_on_write):
- if using_copy_on_write:
- with pytest.raises(ValueError, match="read-only"):
- float_frame.values[:, 0] = 5.0
- assert (float_frame.values[:, 0] != 5).all()
- else:
- float_frame.values[:, 0] = 5.0
- assert (float_frame.values[:, 0] == 5).all()
- def test_more_values(self, float_string_frame):
- values = float_string_frame.values
- assert values.shape[1] == len(float_string_frame.columns)
- def test_values_mixed_dtypes(self, float_frame, float_string_frame):
- frame = float_frame
- arr = frame.values
- frame_cols = frame.columns
- for i, row in enumerate(arr):
- for j, value in enumerate(row):
- col = frame_cols[j]
- if np.isnan(value):
- assert np.isnan(frame[col][i])
- else:
- assert value == frame[col][i]
- # mixed type
- arr = float_string_frame[["foo", "A"]].values
- assert arr[0, 0] == "bar"
- df = DataFrame({"complex": [1j, 2j, 3j], "real": [1, 2, 3]})
- arr = df.values
- assert arr[0, 0] == 1j
- def test_values_duplicates(self):
- df = DataFrame(
- [[1, 2, "a", "b"], [1, 2, "a", "b"]], columns=["one", "one", "two", "two"]
- )
- result = df.values
- expected = np.array([[1, 2, "a", "b"], [1, 2, "a", "b"]], dtype=object)
- tm.assert_numpy_array_equal(result, expected)
- def test_values_with_duplicate_columns(self):
- df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=["x", "x"])
- result = df.values
- expected = np.array([[1, 2.5], [3, 4.5]])
- assert (result == expected).all().all()
- @pytest.mark.parametrize("constructor", [date_range, period_range])
- def test_values_casts_datetimelike_to_object(self, constructor):
- series = Series(constructor("2000-01-01", periods=10, freq="D"))
- expected = series.astype("object")
- df = DataFrame({"a": series, "b": np.random.randn(len(series))})
- result = df.values.squeeze()
- assert (result[:, 0] == expected.values).all()
- df = DataFrame({"a": series, "b": ["foo"] * len(series)})
- result = df.values.squeeze()
- assert (result[:, 0] == expected.values).all()
- def test_frame_values_with_tz(self):
- tz = "US/Central"
- df = DataFrame({"A": date_range("2000", periods=4, tz=tz)})
- result = df.values
- expected = np.array(
- [
- [Timestamp("2000-01-01", tz=tz)],
- [Timestamp("2000-01-02", tz=tz)],
- [Timestamp("2000-01-03", tz=tz)],
- [Timestamp("2000-01-04", tz=tz)],
- ]
- )
- tm.assert_numpy_array_equal(result, expected)
- # two columns, homogeneous
- df["B"] = df["A"]
- result = df.values
- expected = np.concatenate([expected, expected], axis=1)
- tm.assert_numpy_array_equal(result, expected)
- # three columns, heterogeneous
- est = "US/Eastern"
- df["C"] = df["A"].dt.tz_convert(est)
- new = np.array(
- [
- [Timestamp("2000-01-01T01:00:00", tz=est)],
- [Timestamp("2000-01-02T01:00:00", tz=est)],
- [Timestamp("2000-01-03T01:00:00", tz=est)],
- [Timestamp("2000-01-04T01:00:00", tz=est)],
- ]
- )
- expected = np.concatenate([expected, new], axis=1)
- result = df.values
- tm.assert_numpy_array_equal(result, expected)
- def test_interleave_with_tzaware(self, timezone_frame):
- # interleave with object
- result = timezone_frame.assign(D="foo").values
- expected = np.array(
- [
- [
- Timestamp("2013-01-01 00:00:00"),
- Timestamp("2013-01-02 00:00:00"),
- Timestamp("2013-01-03 00:00:00"),
- ],
- [
- Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
- NaT,
- Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
- ],
- [
- Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
- NaT,
- Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
- ],
- ["foo", "foo", "foo"],
- ],
- dtype=object,
- ).T
- tm.assert_numpy_array_equal(result, expected)
- # interleave with only datetime64[ns]
- result = timezone_frame.values
- expected = np.array(
- [
- [
- Timestamp("2013-01-01 00:00:00"),
- Timestamp("2013-01-02 00:00:00"),
- Timestamp("2013-01-03 00:00:00"),
- ],
- [
- Timestamp("2013-01-01 00:00:00-0500", tz="US/Eastern"),
- NaT,
- Timestamp("2013-01-03 00:00:00-0500", tz="US/Eastern"),
- ],
- [
- Timestamp("2013-01-01 00:00:00+0100", tz="CET"),
- NaT,
- Timestamp("2013-01-03 00:00:00+0100", tz="CET"),
- ],
- ],
- dtype=object,
- ).T
- tm.assert_numpy_array_equal(result, expected)
- def test_values_interleave_non_unique_cols(self):
- df = DataFrame(
- [[Timestamp("20130101"), 3.5], [Timestamp("20130102"), 4.5]],
- columns=["x", "x"],
- index=[1, 2],
- )
- df_unique = df.copy()
- df_unique.columns = ["x", "y"]
- assert df_unique.values.shape == df.values.shape
- tm.assert_numpy_array_equal(df_unique.values[0], df.values[0])
- tm.assert_numpy_array_equal(df_unique.values[1], df.values[1])
- def test_values_numeric_cols(self, float_frame):
- float_frame["foo"] = "bar"
- values = float_frame[["A", "B", "C", "D"]].values
- assert values.dtype == np.float64
- def test_values_lcd(self, mixed_float_frame, mixed_int_frame):
- # mixed lcd
- values = mixed_float_frame[["A", "B", "C", "D"]].values
- assert values.dtype == np.float64
- values = mixed_float_frame[["A", "B", "C"]].values
- assert values.dtype == np.float32
- values = mixed_float_frame[["C"]].values
- assert values.dtype == np.float16
- # GH#10364
- # B uint64 forces float because there are other signed int types
- values = mixed_int_frame[["A", "B", "C", "D"]].values
- assert values.dtype == np.float64
- values = mixed_int_frame[["A", "D"]].values
- assert values.dtype == np.int64
- # B uint64 forces float because there are other signed int types
- values = mixed_int_frame[["A", "B", "C"]].values
- assert values.dtype == np.float64
- # as B and C are both unsigned, no forcing to float is needed
- values = mixed_int_frame[["B", "C"]].values
- assert values.dtype == np.uint64
- values = mixed_int_frame[["A", "C"]].values
- assert values.dtype == np.int32
- values = mixed_int_frame[["C", "D"]].values
- assert values.dtype == np.int64
- values = mixed_int_frame[["A"]].values
- assert values.dtype == np.int32
- values = mixed_int_frame[["C"]].values
- assert values.dtype == np.uint8
- class TestPrivateValues:
- @td.skip_array_manager_invalid_test
- def test_private_values_dt64tz(self, using_copy_on_write):
- dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1)
- df = DataFrame(dta, columns=["A"])
- tm.assert_equal(df._values, dta)
- if using_copy_on_write:
- assert not np.shares_memory(df._values._ndarray, dta._ndarray)
- else:
- # we have a view
- assert np.shares_memory(df._values._ndarray, dta._ndarray)
- # TimedeltaArray
- tda = dta - dta
- df2 = df - df
- tm.assert_equal(df2._values, tda)
- @td.skip_array_manager_invalid_test
- def test_private_values_dt64tz_multicol(self, using_copy_on_write):
- dta = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2)
- df = DataFrame(dta, columns=["A", "B"])
- tm.assert_equal(df._values, dta)
- if using_copy_on_write:
- assert not np.shares_memory(df._values._ndarray, dta._ndarray)
- else:
- # we have a view
- assert np.shares_memory(df._values._ndarray, dta._ndarray)
- # TimedeltaArray
- tda = dta - dta
- df2 = df - df
- tm.assert_equal(df2._values, tda)
- def test_private_values_dt64_multiblock(self):
- dta = date_range("2000", periods=8)._data
- df = DataFrame({"A": dta[:4]}, copy=False)
- df["B"] = dta[4:]
- assert len(df._mgr.arrays) == 2
- result = df._values
- expected = dta.reshape(2, 4).T
- tm.assert_equal(result, expected)
|