123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364 |
- import numpy as np
- import pytest
- from pandas import (
- Categorical,
- DataFrame,
- )
- import pandas._testing as tm
- from pandas.tests.copy_view.util import get_array
@pytest.mark.parametrize(
    "replace_kwargs",
    [
        # Both integer columns are targeted.
        {"to_replace": {"a": 1, "b": 4}, "value": -1},
        # Only one column is targeted: checks that CoW splits blocks so the
        # unchanged columns are not copied.
        {"to_replace": {"a": 1}, "value": -1},
        {"to_replace": {"b": 4}, "value": -1},
        {"to_replace": {"b": {4: 1}}},
        # TODO: Add these in a further optimization.
        # We would need to see which columns got replaced in the mask,
        # which could be expensive:
        # {"to_replace": {"b": 1}},
        # 1
    ],
)
def test_replace(using_copy_on_write, replace_kwargs):
    """Check memory sharing between a frame and the result of ``replace``.

    Under copy-on-write, column "c" must share memory with the original
    right after the replace, and column "b" must as well whenever its values
    were left unchanged. A later write into the result must un-share the
    written column and must never modify the original frame.
    """
    source = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
    snapshot = source.copy()

    result = source.replace(**replace_kwargs)

    if using_copy_on_write:
        b_unchanged = (result["b"] == source["b"]).all()
        if b_unchanged:
            assert np.shares_memory(get_array(result, "b"), get_array(source, "b"))
        assert np.shares_memory(get_array(result, "c"), get_array(source, "c"))

    # Writing into the replaced frame must trigger a copy of that column
    # under copy-on-write.
    result.loc[0, "c"] = -1
    if using_copy_on_write:
        assert not np.shares_memory(get_array(result, "c"), get_array(source, "c"))

    targeted_columns = replace_kwargs["to_replace"]
    if "a" in targeted_columns:
        # Column "a" was already replaced, so a further write is expected to
        # reuse the same buffer.
        a_before_write = get_array(result, "a")
        result.loc[0, "a"] = 100
        assert np.shares_memory(get_array(result, "a"), a_before_write)

    tm.assert_frame_equal(source, snapshot)
def test_replace_regex_inplace_refs(using_copy_on_write):
    """Check an in-place regex replace while another frame views the data.

    Under copy-on-write the replaced frame must get a fresh, unreferenced
    buffer and the view must keep the original values; otherwise the
    original buffer is expected to be reused.
    """
    frame = DataFrame({"a": ["aaa", "bbb"]})
    snapshot = frame.copy()
    alias = frame[:]
    values_before = get_array(frame, "a")

    frame.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)

    if not using_copy_on_write:
        assert np.shares_memory(values_before, get_array(frame, "a"))
        return

    assert not np.shares_memory(values_before, get_array(frame, "a"))
    assert frame._mgr._has_no_reference(0)
    tm.assert_frame_equal(alias, snapshot)
def test_replace_regex_inplace(using_copy_on_write):
    """Check regex replace on a frame that has no other references.

    The in-place call must keep using the original buffer; a subsequent
    non-in-place call that matches a value must return data that does not
    share memory and must leave the source frame untouched.
    """
    frame = DataFrame({"a": ["aaa", "bbb"]})
    values_before = get_array(frame, "a")

    frame.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
    if using_copy_on_write:
        assert frame._mgr._has_no_reference(0)
    assert np.shares_memory(values_before, get_array(frame, "a"))

    snapshot = frame.copy()
    replaced = frame.replace(to_replace=r"^b.*$", value="new", regex=True)
    tm.assert_frame_equal(snapshot, frame)
    still_shared = np.shares_memory(get_array(replaced, "a"), get_array(frame, "a"))
    assert not still_shared
def test_replace_regex_inplace_no_op(using_copy_on_write):
    """Check regex replace on an integer column.

    The in-place call must keep the original buffer. The non-in-place call
    must leave the source frame unchanged, and its result shares memory with
    the source only under copy-on-write.
    """
    frame = DataFrame({"a": [1, 2]})
    values_before = get_array(frame, "a")

    frame.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True)
    if using_copy_on_write:
        assert frame._mgr._has_no_reference(0)
    assert np.shares_memory(values_before, get_array(frame, "a"))

    snapshot = frame.copy()
    replaced = frame.replace(to_replace=r"^x.$", value="new", regex=True)
    tm.assert_frame_equal(snapshot, frame)

    # Sharing is expected exactly when copy-on-write is enabled.
    shared = np.shares_memory(get_array(replaced, "a"), get_array(frame, "a"))
    assert shared == bool(using_copy_on_write)
def test_replace_mask_all_false_second_block(using_copy_on_write):
    """Check replace when only column "a" contains the value to replace.

    Column "a" must never share memory with the original afterwards.
    Column "c" shares memory only under copy-on-write, and a later write to
    it must un-share it without touching the original frame.
    """
    frame = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
    snapshot = frame.copy()

    replaced = frame.replace(to_replace=1.5, value=55.5)

    # TODO: Block splitting would allow us to avoid copying b
    c_shared = np.shares_memory(get_array(frame, "c"), get_array(replaced, "c"))
    assert c_shared == bool(using_copy_on_write)
    a_shared = np.shares_memory(get_array(frame, "a"), get_array(replaced, "a"))
    assert not a_shared

    replaced.loc[0, "c"] = 1
    tm.assert_frame_equal(frame, snapshot)  # Original is unchanged

    if using_copy_on_write:
        assert not np.shares_memory(get_array(frame, "c"), get_array(replaced, "c"))
        # TODO: This should split and not copy the whole block
        # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d"))
def test_replace_coerce_single_column(using_copy_on_write, using_array_manager):
    """Check replace of a float in column "a" with a string value.

    Column "b" must keep sharing memory with the original while column "a"
    must not. This is asserted under copy-on-write and for the default
    manager; the array-manager, non-CoW combination makes no sharing
    assertion. Under copy-on-write, a later write to "b" must un-share it
    and leave the original frame unchanged.
    """
    frame = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
    snapshot = frame.copy()

    replaced = frame.replace(to_replace=1.5, value="a")

    if using_copy_on_write or not using_array_manager:
        assert np.shares_memory(get_array(frame, "b"), get_array(replaced, "b"))
        assert not np.shares_memory(get_array(frame, "a"), get_array(replaced, "a"))

    if not using_copy_on_write:
        return

    replaced.loc[0, "b"] = 0.5
    tm.assert_frame_equal(frame, snapshot)  # Original is unchanged
    assert not np.shares_memory(get_array(frame, "b"), get_array(replaced, "b"))
def test_replace_to_replace_wrong_dtype(using_copy_on_write):
    """Check replace of a string value in an all-float frame.

    Both columns share memory with the original only under copy-on-write.
    A later write to "b" must leave the original unchanged and, under
    copy-on-write, must un-share that column.
    """
    frame = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
    snapshot = frame.copy()

    replaced = frame.replace(to_replace="xxx", value=1.5)

    expect_shared = bool(using_copy_on_write)
    b_shared = np.shares_memory(get_array(frame, "b"), get_array(replaced, "b"))
    assert b_shared == expect_shared
    a_shared = np.shares_memory(get_array(frame, "a"), get_array(replaced, "a"))
    assert a_shared == expect_shared

    replaced.loc[0, "b"] = 0.5
    tm.assert_frame_equal(frame, snapshot)  # Original is unchanged

    if using_copy_on_write:
        assert not np.shares_memory(get_array(frame, "b"), get_array(replaced, "b"))
def test_replace_list_categorical(using_copy_on_write):
    """Check list-based replace on a categorical column.

    The in-place call must keep the original codes buffer; the
    non-in-place call must produce codes that do not share memory with it
    and must leave the source frame unchanged.
    """
    frame = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
    cat_before = get_array(frame, "a")

    frame.replace(["c"], value="a", inplace=True)
    assert np.shares_memory(cat_before.codes, get_array(frame, "a").codes)
    if using_copy_on_write:
        assert frame._mgr._has_no_reference(0)

    snapshot = frame.copy()
    replaced = frame.replace(["b"], value="a")
    replaced_codes = get_array(replaced, "a").codes
    assert not np.shares_memory(cat_before.codes, replaced_codes)

    tm.assert_frame_equal(frame, snapshot)
def test_replace_list_inplace_refs_categorical(using_copy_on_write):
    """Check in-place list replace on a categorical column with a live view.

    In both modes the view's codes must not share memory with the replaced
    frame's codes; under copy-on-write the view must also still hold the
    original values.
    """
    frame = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
    alias = frame[:]
    snapshot = frame.copy()

    frame.replace(["c"], value="a", inplace=True)

    # Without copy-on-write this could be done in place, but currently the
    # codes are not shared in either mode.
    codes_shared = np.shares_memory(
        get_array(alias, "a").codes, get_array(frame, "a").codes
    )
    assert not codes_shared
    if using_copy_on_write:
        tm.assert_frame_equal(snapshot, alias)
@pytest.mark.parametrize("to_replace", [1.5, [1.5], []])
def test_replace_inplace(using_copy_on_write, to_replace):
    """Check that in-place replace on an unreferenced frame avoids a copy.

    Parametrized over a scalar, a one-element list and an empty list for
    ``to_replace``. In every case the column must still use its original
    buffer afterwards, and under copy-on-write the block must not be
    referenced by any other object.
    """
    df = DataFrame({"a": [1.5, 2, 3]})
    arr_a = get_array(df, "a")
    # Pass the parametrized value through; a hard-coded scalar here would make
    # all three parametrizations exercise the same code path.
    df.replace(to_replace=to_replace, value=15.5, inplace=True)

    assert np.shares_memory(get_array(df, "a"), arr_a)
    if using_copy_on_write:
        assert df._mgr._has_no_reference(0)
@pytest.mark.parametrize("to_replace", [1.5, [1.5]])
def test_replace_inplace_reference(using_copy_on_write, to_replace):
    """Check in-place replace of a present value while a view exists.

    Under copy-on-write the frame must move to a new buffer and neither the
    frame nor the view may hold a referenced block afterwards; without it
    the original buffer is reused.
    """
    frame = DataFrame({"a": [1.5, 2, 3]})
    values_before = get_array(frame, "a")
    alias = frame[:]

    frame.replace(to_replace=to_replace, value=15.5, inplace=True)

    if not using_copy_on_write:
        assert np.shares_memory(get_array(frame, "a"), values_before)
        return

    assert not np.shares_memory(get_array(frame, "a"), values_before)
    assert frame._mgr._has_no_reference(0)
    assert alias._mgr._has_no_reference(0)
@pytest.mark.parametrize("to_replace", ["a", 100.5])
def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace):
    """Check in-place replace of an absent value while a view exists.

    The frame must keep its original buffer, and under copy-on-write both
    the frame and the view must still report a referenced block.
    """
    frame = DataFrame({"a": [1.5, 2, 3]})
    values_before = get_array(frame, "a")
    alias = frame[:]

    frame.replace(to_replace=to_replace, value=15.5, inplace=True)

    buffer_kept = np.shares_memory(get_array(frame, "a"), values_before)
    assert buffer_kept
    if using_copy_on_write:
        for obj in (frame, alias):
            assert not obj._mgr._has_no_reference(0)
@pytest.mark.parametrize("to_replace", [1, [1]])
@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace):
    """Check in-place replace on a categorical column while a view exists.

    Under copy-on-write the frame's codes must move to a new buffer, neither
    object may hold a referenced block, and the view must keep the original
    values; without it the codes buffer is reused.
    """
    frame = DataFrame({"a": Categorical([1, 2, 3])})
    snapshot = frame.copy()
    cat_before = get_array(frame, "a")
    alias = frame[:]

    frame.replace(to_replace=to_replace, value=val, inplace=True)

    if not using_copy_on_write:
        assert np.shares_memory(get_array(frame, "a").codes, cat_before.codes)
        return

    assert not np.shares_memory(get_array(frame, "a").codes, cat_before.codes)
    assert frame._mgr._has_no_reference(0)
    assert alias._mgr._has_no_reference(0)
    tm.assert_frame_equal(alias, snapshot)
@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical_inplace(using_copy_on_write, val):
    """Check in-place replace on an unreferenced categorical column.

    The codes buffer must be reused, the block must be unreferenced under
    copy-on-write, and the frame must equal one built directly from the
    replaced values.
    """
    frame = DataFrame({"a": Categorical([1, 2, 3])})
    cat_before = get_array(frame, "a")

    frame.replace(to_replace=1, value=val, inplace=True)

    codes_after = get_array(frame, "a").codes
    assert np.shares_memory(codes_after, cat_before.codes)
    if using_copy_on_write:
        assert frame._mgr._has_no_reference(0)

    want = DataFrame({"a": Categorical([val, 2, 3])})
    tm.assert_frame_equal(frame, want)
@pytest.mark.parametrize("val", [1, 1.5])
def test_replace_categorical(using_copy_on_write, val):
    """Check non-in-place replace on a categorical column.

    The result's codes must not share memory with the source, the source
    must be unchanged, and under copy-on-write neither frame may hold a
    referenced block. A later positional write into the result must keep
    using the result's existing codes buffer.
    """
    frame = DataFrame({"a": Categorical([1, 2, 3])})
    snapshot = frame.copy()

    replaced = frame.replace(to_replace=1, value=val)

    if using_copy_on_write:
        for obj in (frame, replaced):
            assert obj._mgr._has_no_reference(0)
    codes_shared = np.shares_memory(
        get_array(frame, "a").codes, get_array(replaced, "a").codes
    )
    assert not codes_shared
    tm.assert_frame_equal(frame, snapshot)

    codes_before_write = get_array(replaced, "a").codes
    replaced.iloc[0, 0] = 2.0
    assert np.shares_memory(get_array(replaced, "a").codes, codes_before_write)
@pytest.mark.parametrize("method", ["where", "mask"])
def test_masking_inplace(using_copy_on_write, method):
    """Check in-place ``where``/``mask`` while a view of the frame exists.

    Under copy-on-write the frame must move to a new buffer, neither object
    may hold a referenced block, and the view must keep the original
    values; without it the original buffer is reused.
    """
    frame = DataFrame({"a": [1.5, 2, 3]})
    snapshot = frame.copy()
    values_before = get_array(frame, "a")
    alias = frame[:]

    # Look up the bound method by name instead of rebinding the parameter.
    apply_masking = getattr(frame, method)
    apply_masking(frame["a"] > 1.6, -1, inplace=True)

    if not using_copy_on_write:
        assert np.shares_memory(get_array(frame, "a"), values_before)
        return

    assert not np.shares_memory(get_array(frame, "a"), values_before)
    assert frame._mgr._has_no_reference(0)
    assert alias._mgr._has_no_reference(0)
    tm.assert_frame_equal(alias, snapshot)
def test_replace_empty_list(using_copy_on_write):
    """Check replace with empty ``to_replace`` and ``value`` lists.

    The returned frame shares memory with the source only under
    copy-on-write, in which case the source block is reported as
    referenced. After a second such call both frames must still report a
    referenced block.
    """
    frame = DataFrame({"a": [1, 2]})

    first_result = frame.replace([], [])
    shared = np.shares_memory(get_array(first_result, "a"), get_array(frame, "a"))
    if using_copy_on_write:
        assert shared
        assert not frame._mgr._has_no_reference(0)
    else:
        assert not shared

    values_before = get_array(frame, "a")
    # NOTE(review): this call is not in-place and its return value is
    # discarded; confirm whether ``inplace=True`` was intended here.
    frame.replace([], [])
    if using_copy_on_write:
        assert np.shares_memory(get_array(frame, "a"), values_before)
        assert not frame._mgr._has_no_reference(0)
        assert not first_result._mgr._has_no_reference(0)
@pytest.mark.parametrize("value", ["d", None])
def test_replace_object_list_inplace(using_copy_on_write, value):
    """Check in-place list replace on an unreferenced object column.

    The original buffer must be reused under copy-on-write or when the
    replacement value is ``None``; otherwise a new buffer is currently
    produced. Under copy-on-write the block must end up unreferenced.
    """
    frame = DataFrame({"a": ["a", "b", "c"]})
    values_before = get_array(frame, "a")

    frame.replace(["c"], value, inplace=True)

    buffer_kept = np.shares_memory(values_before, get_array(frame, "a"))
    if using_copy_on_write or value is None:
        assert buffer_kept
    else:
        # This could be inplace
        assert not buffer_kept

    if using_copy_on_write:
        assert frame._mgr._has_no_reference(0)
def test_replace_list_multiple_elements_inplace(using_copy_on_write):
    """Check in-place replace of several values given as a list.

    Without copy-on-write the original buffer is reused. With it, the data
    currently ends up in a new, unreferenced buffer.
    """
    frame = DataFrame({"a": [1, 2, 3]})
    values_before = get_array(frame, "a")

    frame.replace([1, 2], 4, inplace=True)

    if not using_copy_on_write:
        assert np.shares_memory(values_before, get_array(frame, "a"))
        return

    # TODO(CoW): This should share memory
    assert not np.shares_memory(values_before, get_array(frame, "a"))
    assert frame._mgr._has_no_reference(0)
def test_replace_list_none(using_copy_on_write):
    """Check list replace with ``value=None`` on an object column.

    The source frame must be unchanged and the result must not share memory
    with it.
    """
    frame = DataFrame({"a": ["a", "b", "c"]})
    snapshot = frame.copy()

    replaced = frame.replace(["b"], value=None)

    tm.assert_frame_equal(frame, snapshot)
    shared = np.shares_memory(get_array(frame, "a"), get_array(replaced, "a"))
    assert not shared
def test_replace_list_none_inplace_refs(using_copy_on_write):
    """Check in-place list replace with ``value=None`` while a view exists.

    Under copy-on-write the frame must end up with a new, unreferenced
    buffer and the view must keep the original values; without it the
    original buffer is reused.
    """
    frame = DataFrame({"a": ["a", "b", "c"]})
    values_before = get_array(frame, "a")
    snapshot = frame.copy()
    alias = frame[:]

    frame.replace(["a"], value=None, inplace=True)

    if not using_copy_on_write:
        assert np.shares_memory(values_before, get_array(frame, "a"))
        return

    assert frame._mgr._has_no_reference(0)
    assert not np.shares_memory(values_before, get_array(frame, "a"))
    tm.assert_frame_equal(snapshot, alias)
|