123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341 |
- import numpy as np
- import pytest
- import pandas as pd
- from pandas import (
- DataFrame,
- DatetimeIndex,
- Index,
- Period,
- PeriodIndex,
- Series,
- Timedelta,
- TimedeltaIndex,
- Timestamp,
- )
- import pandas._testing as tm
- from pandas.tests.copy_view.util import get_array
# -----------------------------------------------------------------------------
# Copy/view behaviour for Series / DataFrame constructors
@pytest.mark.parametrize("dtype", [None, "int64"])
def test_series_from_series(dtype, using_copy_on_write):
    # Case: constructing a Series from another Series object follows CoW rules:
    # a new object is returned and thus mutations are not propagated
    ser = Series([1, 2, 3], name="name")

    # default is copy=False -> new Series is a shallow copy / view of original
    result = Series(ser, dtype=dtype)

    # the shallow copy still shares memory
    assert np.shares_memory(get_array(ser), get_array(result))
    if using_copy_on_write:
        assert result._mgr.blocks[0].refs.has_reference()

    # mutate the new Series; the write itself is identical in both modes,
    # only the observable effect on the parent differs
    result.iloc[0] = 0
    if using_copy_on_write:
        # mutating new series copy doesn't mutate original
        assert ser.iloc[0] == 1
        # mutating triggered a copy-on-write -> no longer shares memory
        assert not np.shares_memory(get_array(ser), get_array(result))
    else:
        # mutating shallow copy does mutate original
        assert ser.iloc[0] == 0
        # and still shares memory
        assert np.shares_memory(get_array(ser), get_array(result))

    # the same when modifying the parent
    result = Series(ser, dtype=dtype)
    parent_before = ser.iloc[0]

    # Write a value that has not been used so far. Without CoW the parent
    # already holds 0 at this position, so writing 0 again would make the
    # "mutation propagates" assertion pass even if nothing propagated.
    ser.iloc[0] = 10
    if using_copy_on_write:
        # mutating original doesn't mutate new series
        assert result.iloc[0] == parent_before
    else:
        # mutating original does mutate shallow copy
        assert result.iloc[0] == 10
def test_series_from_series_with_reindex(using_copy_on_write):
    # Case: constructing a Series from another Series with specifying an index
    # that potentially requires a reindex of the values
    ser = Series([1, 2, 3], name="name")

    # passing an index that doesn't actually require a reindex of the values
    # -> without CoW we get an actual mutating view
    equivalent_indexes = [
        ser.index,
        ser.index.copy(),
        list(ser.index),
        ser.index.rename("idx"),
    ]
    # Each iteration writes a different value: writing the same value every
    # time would make every non-CoW check after the first one pass trivially,
    # because the parent would already hold that value.
    for new_value, index in enumerate(equivalent_indexes, start=10):
        result = Series(ser, index=index)
        assert np.shares_memory(ser.values, result.values)
        result.iloc[0] = new_value
        if using_copy_on_write:
            # parent is never touched under CoW
            assert ser.iloc[0] == 1
        else:
            # the view writes through to the parent
            assert ser.iloc[0] == new_value

    # ensure that if an actual reindex is needed, we don't have any refs
    # (mutating the result wouldn't trigger CoW)
    result = Series(ser, index=[0, 1, 2, 3])
    assert not np.shares_memory(ser.values, result.values)
    if using_copy_on_write:
        assert not result._mgr.blocks[0].refs.has_reference()
@pytest.mark.parametrize("fastpath", [False, True])
@pytest.mark.parametrize("dtype", [None, "int64"])
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
@pytest.mark.parametrize(
    "arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")]
)
def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr):
    # Case: constructing a Series from a numpy array / masked extension array.
    # With CoW the Series must not share memory with the input, without CoW
    # it is a view and mutating the input is visible in the Series.

    # The parametrized arrays are created once at collection time and shared
    # by every parameter combination. Work on a private copy so that the
    # in-place write below cannot leak into the other test invocations (which
    # would otherwise start from [100, 2, 3] and make the write a no-op).
    arr = arr.copy()

    # fastpath is only exercised together with an explicit index and no dtype
    if idx is None or dtype is not None:
        fastpath = False
    ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath)
    ser_orig = ser.copy()

    # use the `_data` attribute as the backing buffer when the input has one
    data = getattr(arr, "_data", arr)
    if using_copy_on_write:
        assert not np.shares_memory(get_array(ser), data)
    else:
        assert np.shares_memory(get_array(ser), data)

    arr[0] = 100
    if using_copy_on_write:
        # the Series is unaffected by the write to the input
        tm.assert_series_equal(ser, ser_orig)
    else:
        # the write to the input shows up in the Series
        expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype)
        tm.assert_series_equal(ser, expected)
@pytest.mark.parametrize("copy", [True, False, None])
def test_series_from_array_different_dtype(using_copy_on_write, copy):
    # Case: the requested dtype (int32) differs from the array's dtype (int64)
    # -> for every value of `copy` the Series must not share memory with the
    # input array
    source = np.array([1, 2, 3], dtype="int64")
    converted = Series(source, dtype="int32", copy=copy)

    shares = np.shares_memory(get_array(converted), source)
    assert not shares
@pytest.mark.parametrize(
    "idx",
    [
        Index([1, 2]),
        DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]),
        PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]),
        TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]),
    ],
)
def test_series_from_index(using_copy_on_write, idx):
    # Case: constructing a Series from an Index. Whatever the mode, writing
    # into the Series must leave the Index untouched.
    ser = Series(idx)
    expected = idx.copy(deep=True)

    shares = np.shares_memory(get_array(ser), get_array(idx))
    if not using_copy_on_write:
        # without CoW the constructor does not hand out a view of the Index
        assert not shares
    else:
        # with CoW the data is shared and the manager reports a reference
        assert shares
        assert not ser._mgr._has_no_reference(0)

    # overwrite the first element with the second one
    ser.iloc[0] = ser.iloc[1]
    tm.assert_index_equal(idx, expected)
def test_series_from_index_different_dtypes(using_copy_on_write):
    # Case: constructing a Series from an int64 Index while requesting int32
    # -> the result must not share memory with the Index and, with CoW, the
    # manager must report no reference for block 0
    source = Index([1, 2, 3], dtype="int64")
    converted = Series(source, dtype="int32")

    shares = np.shares_memory(get_array(converted), get_array(source))
    assert not shares
    if not using_copy_on_write:
        return
    assert converted._mgr._has_no_reference(0)
@pytest.mark.parametrize("fastpath", [False, True])
@pytest.mark.parametrize("dtype", [None, "int64"])
@pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
def test_series_from_block_manager(using_copy_on_write, idx, dtype, fastpath):
    # Case: constructing a Series directly from another Series' manager.
    # The new Series shares memory with the source; with CoW a write to the
    # new Series must not reach the source, without CoW it does.
    source = Series([1, 2, 3], dtype="int64")
    source_before = source.copy()

    derived = Series(source._mgr, dtype=dtype, fastpath=fastpath, index=idx)
    assert np.shares_memory(get_array(source), get_array(derived))
    if using_copy_on_write:
        assert not derived._mgr._has_no_reference(0)

    derived.iloc[0] = 100

    # with CoW the source keeps its original values, otherwise it sees the write
    expected = source_before if using_copy_on_write else Series([100, 2, 3])
    tm.assert_series_equal(source, expected)
def test_series_from_block_manager_different_dtype(using_copy_on_write):
    # Case: constructing a Series from a manager while requesting another
    # dtype (int64 -> int32) -> the result must not share memory with the
    # source and, with CoW, the manager must report no reference for block 0
    source = Series([1, 2, 3], dtype="int64")
    converted = Series(source._mgr, dtype="int32")

    shares = np.shares_memory(get_array(source), get_array(converted))
    assert not shares
    if not using_copy_on_write:
        return
    assert converted._mgr._has_no_reference(0)
@pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr])
@pytest.mark.parametrize("columns", [None, ["a"]])
def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func):
    # Case: constructing a DataFrame from another DataFrame or from its
    # manager (selected through `func`). The new frame initially shares memory
    # with the source.
    # NOTE(review): `columns` is parametrized but never used in the body, so
    # both of its values run the exact same code -- confirm whether it was
    # meant to be passed to the constructor.
    source = DataFrame({"a": [1, 2, 3]})
    source_before = source.copy()

    derived = DataFrame(func(source))
    assert np.shares_memory(get_array(source, "a"), get_array(derived, "a"))

    # overwrite the whole first row of the new frame
    derived.iloc[0] = 100

    still_shared = np.shares_memory(get_array(source, "a"), get_array(derived, "a"))
    if using_copy_on_write:
        # the write detached the new frame; the source is unchanged
        assert not still_shared
        tm.assert_frame_equal(source, source_before)
    else:
        # both frames still look at the same data
        assert still_shared
        tm.assert_frame_equal(source, derived)
@pytest.mark.parametrize("dtype", [None, "int64", "Int64"])
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
@pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]])
def test_dataframe_from_dict_of_series(
    request, using_copy_on_write, columns, index, dtype
):
    # Case: constructing a DataFrame from Series objects with copy=False
    # has to do a lazy copy following CoW rules
    # (the default for DataFrame(dict) is still to copy to ensure consolidation)

    # constructor arguments shared by every frame built in this test
    frame_kwargs = {"index": index, "columns": columns, "dtype": dtype}

    left = Series([1, 2, 3])
    right = Series([4, 5, 6])
    left_before = left.copy()

    expected = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}, **frame_kwargs)
    result = DataFrame({"a": left, "b": right}, copy=False, **frame_kwargs)

    # the shallow copy still shares memory
    assert np.shares_memory(get_array(result, "a"), get_array(left))

    # mutate the new dataframe
    result.iloc[0, 0] = 10
    if using_copy_on_write:
        # the write detached the frame and left the parent Series alone
        assert not np.shares_memory(get_array(result, "a"), get_array(left))
        tm.assert_series_equal(left, left_before)
    else:
        # the write went through to the parent Series
        assert left.iloc[0] == 10

    # the same when modifying the parent series
    left = Series([1, 2, 3])
    right = Series([4, 5, 6])
    result = DataFrame({"a": left, "b": right}, copy=False, **frame_kwargs)

    left.iloc[0] = 10
    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "a"), get_array(left))
        tm.assert_frame_equal(result, expected)
    else:
        assert result.iloc[0, 0] == 10
@pytest.mark.parametrize("dtype", [None, "int64"])
def test_dataframe_from_dict_of_series_with_reindex(dtype):
    # Case: constructing a DataFrame from Series objects with copy=False
    # and passing an index that requires an actual (no-view) reindex -> need
    # to ensure the result doesn't have refs set up to unnecessarily trigger
    # a copy on write
    left = Series([1, 2, 3])
    right = Series([4, 5, 6])
    frame = DataFrame(
        {"a": left, "b": right}, index=[1, 2, 3], dtype=dtype, copy=False
    )

    # the frame should own its memory ...
    before = get_array(frame, "a")
    assert not np.shares_memory(before, get_array(left))

    # ... so writing into it must keep the very same buffer (no copy triggered)
    frame.iloc[0, 0] = 100
    after = get_array(frame, "a")
    assert np.shares_memory(before, after)
@pytest.mark.parametrize("cons", [Series, Index])
@pytest.mark.parametrize(
    "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)]
)
def test_dataframe_from_series_or_index(using_copy_on_write, data, dtype, cons):
    # Case: constructing a single-column DataFrame from a Series or an Index.
    # The frame shares memory with the input; with CoW, writing into the
    # frame must leave the input unchanged.
    source = cons(data, dtype=dtype)
    source_before = source.copy()

    frame = DataFrame(source, dtype=dtype)
    assert np.shares_memory(get_array(source), get_array(frame, 0))
    if using_copy_on_write:
        assert not frame._mgr._has_no_reference(0)

    # overwrite the first cell with the last input value
    frame.iloc[0, 0] = data[-1]

    # nothing further is checked without CoW
    if not using_copy_on_write:
        return
    tm.assert_equal(source, source_before)
@pytest.mark.parametrize("cons", [Series, Index])
def test_dataframe_from_series_or_index_different_dtype(using_copy_on_write, cons):
    # Case: constructing a DataFrame from an int64 Series/Index while
    # requesting int32 -> the frame must not share memory with the input and,
    # with CoW, the manager must report no reference for block 0
    source = cons([1, 2], dtype="int64")
    frame = DataFrame(source, dtype="int32")

    shares = np.shares_memory(get_array(source), get_array(frame, 0))
    assert not shares
    if not using_copy_on_write:
        return
    assert frame._mgr._has_no_reference(0)
def test_dataframe_from_series_infer_datetime(using_copy_on_write):
    # Case: constructing a DataFrame from an object-dtype Series that holds
    # Timestamps -> the frame must not share memory with the Series and, with
    # CoW, the manager must report no reference for block 0
    stamps = [Timestamp("2019-12-31"), Timestamp("2020-12-31")]
    source = Series(stamps, dtype=object)
    frame = DataFrame(source)

    shares = np.shares_memory(get_array(source), get_array(frame, 0))
    assert not shares
    if not using_copy_on_write:
        return
    assert frame._mgr._has_no_reference(0)
@pytest.mark.parametrize("index", [None, [0, 1, 2]])
def test_dataframe_from_dict_of_series_with_dtype(index):
    # Case: constructing a DataFrame from Series objects with copy=False
    # while passing a dtype that causes a copy
    # -> need to ensure the result doesn't have refs set up to unnecessarily
    # trigger a copy on write
    floats = Series([1.0, 2.0, 3.0])
    ints = Series([4, 5, 6])
    frame = DataFrame(
        {"a": floats, "b": ints}, index=index, dtype="int64", copy=False
    )

    # the frame should own its memory ...
    before = get_array(frame, "a")
    assert not np.shares_memory(before, get_array(floats))

    # ... so writing into it must keep the very same buffer (no copy triggered)
    frame.iloc[0, 0] = 100
    after = get_array(frame, "a")
    assert np.shares_memory(before, after)
@pytest.mark.parametrize("copy", [False, None, True])
def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager):
    # Case: constructing a DataFrame from a 2D numpy array with the different
    # `copy` settings, and checking whether the first column shares memory
    # with the input array
    source = np.array([[1, 2], [3, 4]])
    frame = DataFrame(source, copy=copy)

    # No shared memory is expected when:
    #   * CoW is enabled and copy was not explicitly disabled, or
    #   * a copy was explicitly requested, or
    #   * the array manager is used with the default copy=None
    expect_copy = (
        (using_copy_on_write and copy is not False)
        or copy is True
        or (using_array_manager and copy is None)
    )

    shares = np.shares_memory(get_array(frame, 0), source)
    if expect_copy:
        assert not shares
    else:
        assert shares
def test_dataframe_from_records_with_dataframe(using_copy_on_write):
    # Case: DataFrame.from_records called with a DataFrame. The result shares
    # memory with the input; with CoW a write to the result must not reach
    # the input, without CoW both frames stay identical.
    source = DataFrame({"a": [1, 2, 3]})
    source_before = source.copy()

    derived = DataFrame.from_records(source)
    if using_copy_on_write:
        assert not source._mgr._has_no_reference(0)
    assert np.shares_memory(get_array(source, "a"), get_array(derived, "a"))

    derived.iloc[0, 0] = 100

    # with CoW the source keeps its original values, otherwise it mirrors the
    # derived frame
    expected = source_before if using_copy_on_write else derived
    tm.assert_frame_equal(source, expected)
|