123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631 |
- from string import ascii_letters as letters
- import numpy as np
- import pytest
- from pandas.errors import (
- SettingWithCopyError,
- SettingWithCopyWarning,
- )
- import pandas.util._test_decorators as td
- import pandas as pd
- from pandas import (
- DataFrame,
- Series,
- Timestamp,
- date_range,
- option_context,
- )
- import pandas._testing as tm
- msg = "A value is trying to be set on a copy of a slice from a DataFrame"
- def random_text(nobs=100):
- # Construct a DataFrame where each row is a random slice from 'letters'
- idxs = np.random.randint(len(letters), size=(nobs, 2))
- idxs.sort(axis=1)
- strings = [letters[x[0] : x[1]] for x in idxs]
- return DataFrame(strings, columns=["letters"])
- class TestCaching:
- def test_slice_consolidate_invalidate_item_cache(self, using_copy_on_write):
- # this is chained assignment, but will 'work'
- with option_context("chained_assignment", None):
- # #3970
- df = DataFrame({"aa": np.arange(5), "bb": [2.2] * 5})
- # Creates a second float block
- df["cc"] = 0.0
- # caches a reference to the 'bb' series
- df["bb"]
- # repr machinery triggers consolidation
- repr(df)
- # Assignment to wrong series
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df["bb"].iloc[0] = 0.17
- else:
- df["bb"].iloc[0] = 0.17
- df._clear_item_cache()
- if not using_copy_on_write:
- tm.assert_almost_equal(df["bb"][0], 0.17)
- else:
- # with ArrayManager, parent is not mutated with chained assignment
- tm.assert_almost_equal(df["bb"][0], 2.2)
- @pytest.mark.parametrize("do_ref", [True, False])
- def test_setitem_cache_updating(self, do_ref):
- # GH 5424
- cont = ["one", "two", "three", "four", "five", "six", "seven"]
- df = DataFrame({"a": cont, "b": cont[3:] + cont[:3], "c": np.arange(7)})
- # ref the cache
- if do_ref:
- df.loc[0, "c"]
- # set it
- df.loc[7, "c"] = 1
- assert df.loc[0, "c"] == 0.0
- assert df.loc[7, "c"] == 1.0
- def test_setitem_cache_updating_slices(self, using_copy_on_write):
- # GH 7084
- # not updating cache on series setting with slices
- expected = DataFrame(
- {"A": [600, 600, 600]}, index=date_range("5/7/2014", "5/9/2014")
- )
- out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014"))
- df = DataFrame({"C": ["A", "A", "A"], "D": [100, 200, 300]})
- # loop through df to update out
- six = Timestamp("5/7/2014")
- eix = Timestamp("5/9/2014")
- for ix, row in df.iterrows():
- out.loc[six:eix, row["C"]] = out.loc[six:eix, row["C"]] + row["D"]
- tm.assert_frame_equal(out, expected)
- tm.assert_series_equal(out["A"], expected["A"])
- # try via a chain indexing
- # this actually works
- out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014"))
- out_original = out.copy()
- for ix, row in df.iterrows():
- v = out[row["C"]][six:eix] + row["D"]
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- out[row["C"]][six:eix] = v
- else:
- out[row["C"]][six:eix] = v
- if not using_copy_on_write:
- tm.assert_frame_equal(out, expected)
- tm.assert_series_equal(out["A"], expected["A"])
- else:
- tm.assert_frame_equal(out, out_original)
- tm.assert_series_equal(out["A"], out_original["A"])
- out = DataFrame({"A": [0, 0, 0]}, index=date_range("5/7/2014", "5/9/2014"))
- for ix, row in df.iterrows():
- out.loc[six:eix, row["C"]] += row["D"]
- tm.assert_frame_equal(out, expected)
- tm.assert_series_equal(out["A"], expected["A"])
- def test_altering_series_clears_parent_cache(self, using_copy_on_write):
- # GH #33675
- df = DataFrame([[1, 2], [3, 4]], index=["a", "b"], columns=["A", "B"])
- ser = df["A"]
- if using_copy_on_write:
- assert "A" not in df._item_cache
- else:
- assert "A" in df._item_cache
- # Adding a new entry to ser swaps in a new array, so "A" needs to
- # be removed from df._item_cache
- ser["c"] = 5
- assert len(ser) == 3
- assert "A" not in df._item_cache
- assert df["A"] is not ser
- assert len(df["A"]) == 2
- class TestChaining:
- def test_setitem_chained_setfault(self, using_copy_on_write):
- # GH6026
- data = ["right", "left", "left", "left", "right", "left", "timeout"]
- mdata = ["right", "left", "left", "left", "right", "left", "none"]
- df = DataFrame({"response": np.array(data)})
- mask = df.response == "timeout"
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df.response[mask] = "none"
- tm.assert_frame_equal(df, DataFrame({"response": data}))
- else:
- df.response[mask] = "none"
- tm.assert_frame_equal(df, DataFrame({"response": mdata}))
- recarray = np.rec.fromarrays([data], names=["response"])
- df = DataFrame(recarray)
- mask = df.response == "timeout"
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df.response[mask] = "none"
- tm.assert_frame_equal(df, DataFrame({"response": data}))
- else:
- df.response[mask] = "none"
- tm.assert_frame_equal(df, DataFrame({"response": mdata}))
- df = DataFrame({"response": data, "response1": data})
- df_original = df.copy()
- mask = df.response == "timeout"
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df.response[mask] = "none"
- tm.assert_frame_equal(df, df_original)
- else:
- df.response[mask] = "none"
- tm.assert_frame_equal(df, DataFrame({"response": mdata, "response1": data}))
- # GH 6056
- expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]})
- df = DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])})
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df["A"].iloc[0] = np.nan
- expected = DataFrame({"A": ["foo", "bar", "bah", "foo", "bar"]})
- else:
- df["A"].iloc[0] = np.nan
- expected = DataFrame({"A": [np.nan, "bar", "bah", "foo", "bar"]})
- result = df.head()
- tm.assert_frame_equal(result, expected)
- df = DataFrame({"A": np.array(["foo", "bar", "bah", "foo", "bar"])})
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df.A.iloc[0] = np.nan
- else:
- df.A.iloc[0] = np.nan
- result = df.head()
- tm.assert_frame_equal(result, expected)
- @pytest.mark.arm_slow
- def test_detect_chained_assignment(self, using_copy_on_write):
- with option_context("chained_assignment", "raise"):
- # work with the chain
- expected = DataFrame([[-5, 1], [-6, 3]], columns=list("AB"))
- df = DataFrame(
- np.arange(4).reshape(2, 2), columns=list("AB"), dtype="int64"
- )
- df_original = df.copy()
- assert df._is_copy is None
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df["A"][0] = -5
- with tm.raises_chained_assignment_error():
- df["A"][1] = -6
- tm.assert_frame_equal(df, df_original)
- else:
- df["A"][0] = -5
- df["A"][1] = -6
- tm.assert_frame_equal(df, expected)
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_raises(
- self, using_array_manager, using_copy_on_write
- ):
- # test with the chaining
- df = DataFrame(
- {
- "A": Series(range(2), dtype="int64"),
- "B": np.array(np.arange(2, 4), dtype=np.float64),
- }
- )
- df_original = df.copy()
- assert df._is_copy is None
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df["A"][0] = -5
- with tm.raises_chained_assignment_error():
- df["A"][1] = -6
- tm.assert_frame_equal(df, df_original)
- elif not using_array_manager:
- with pytest.raises(SettingWithCopyError, match=msg):
- df["A"][0] = -5
- with pytest.raises(SettingWithCopyError, match=msg):
- df["A"][1] = np.nan
- assert df["A"]._is_copy is None
- else:
- # INFO(ArrayManager) for ArrayManager it doesn't matter that it's
- # a mixed dataframe
- df["A"][0] = -5
- df["A"][1] = -6
- expected = DataFrame([[-5, 2], [-6, 3]], columns=list("AB"))
- expected["B"] = expected["B"].astype("float64")
- tm.assert_frame_equal(df, expected)
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_fails(self, using_copy_on_write):
- # Using a copy (the chain), fails
- df = DataFrame(
- {
- "A": Series(range(2), dtype="int64"),
- "B": np.array(np.arange(2, 4), dtype=np.float64),
- }
- )
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df.loc[0]["A"] = -5
- else:
- with pytest.raises(SettingWithCopyError, match=msg):
- df.loc[0]["A"] = -5
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_doc_example(self, using_copy_on_write):
- # Doc example
- df = DataFrame(
- {
- "a": ["one", "one", "two", "three", "two", "one", "six"],
- "c": Series(range(7), dtype="int64"),
- }
- )
- assert df._is_copy is None
- if using_copy_on_write:
- indexer = df.a.str.startswith("o")
- with tm.raises_chained_assignment_error():
- df[indexer]["c"] = 42
- else:
- with pytest.raises(SettingWithCopyError, match=msg):
- indexer = df.a.str.startswith("o")
- df[indexer]["c"] = 42
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_object_dtype(
- self, using_array_manager, using_copy_on_write
- ):
- expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]})
- df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})
- df_original = df.copy()
- if not using_copy_on_write:
- with pytest.raises(SettingWithCopyError, match=msg):
- df.loc[0]["A"] = 111
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df["A"][0] = 111
- tm.assert_frame_equal(df, df_original)
- elif not using_array_manager:
- with pytest.raises(SettingWithCopyError, match=msg):
- df["A"][0] = 111
- df.loc[0, "A"] = 111
- tm.assert_frame_equal(df, expected)
- else:
- # INFO(ArrayManager) for ArrayManager it doesn't matter that it's
- # a mixed dataframe
- df["A"][0] = 111
- tm.assert_frame_equal(df, expected)
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_is_copy_pickle(self):
- # gh-5475: Make sure that is_copy is picked up reconstruction
- df = DataFrame({"A": [1, 2]})
- assert df._is_copy is None
- with tm.ensure_clean("__tmp__pickle") as path:
- df.to_pickle(path)
- df2 = pd.read_pickle(path)
- df2["B"] = df2["A"]
- df2["B"] = df2["A"]
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_setting_entire_column(self):
- # gh-5597: a spurious raise as we are setting the entire column here
- df = random_text(100000)
- # Always a copy
- x = df.iloc[[0, 1, 2]]
- assert x._is_copy is not None
- x = df.iloc[[0, 1, 2, 4]]
- assert x._is_copy is not None
- # Explicitly copy
- indexer = df.letters.apply(lambda x: len(x) > 10)
- df = df.loc[indexer].copy()
- assert df._is_copy is None
- df["letters"] = df["letters"].apply(str.lower)
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_implicit_take(self):
- # Implicitly take
- df = random_text(100000)
- indexer = df.letters.apply(lambda x: len(x) > 10)
- df = df.loc[indexer]
- assert df._is_copy is not None
- df["letters"] = df["letters"].apply(str.lower)
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_implicit_take2(self, using_copy_on_write):
- if using_copy_on_write:
- pytest.skip("_is_copy is not always set for CoW")
- # Implicitly take 2
- df = random_text(100000)
- indexer = df.letters.apply(lambda x: len(x) > 10)
- df = df.loc[indexer]
- assert df._is_copy is not None
- df.loc[:, "letters"] = df["letters"].apply(str.lower)
- # with the enforcement of #45333 in 2.0, the .loc[:, letters] setting
- # is inplace, so df._is_copy remains non-None.
- assert df._is_copy is not None
- df["letters"] = df["letters"].apply(str.lower)
- assert df._is_copy is None
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_str(self):
- df = random_text(100000)
- indexer = df.letters.apply(lambda x: len(x) > 10)
- df.loc[indexer, "letters"] = df.loc[indexer, "letters"].apply(str.lower)
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_is_copy(self):
- # an identical take, so no copy
- df = DataFrame({"a": [1]}).dropna()
- assert df._is_copy is None
- df["a"] += 1
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_sorting(self):
- df = DataFrame(np.random.randn(10, 4))
- ser = df.iloc[:, 0].sort_values()
- tm.assert_series_equal(ser, df.iloc[:, 0].sort_values())
- tm.assert_series_equal(ser, df[0].sort_values())
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_false_positives(self):
- # see gh-6025: false positives
- df = DataFrame({"column1": ["a", "a", "a"], "column2": [4, 8, 9]})
- str(df)
- df["column1"] = df["column1"] + "b"
- str(df)
- df = df[df["column2"] != 8]
- str(df)
- df["column1"] = df["column1"] + "c"
- str(df)
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_undefined_column(self, using_copy_on_write):
- # from SO:
- # https://stackoverflow.com/questions/24054495/potential-bug-setting-value-for-undefined-column-using-iloc
- df = DataFrame(np.arange(0, 9), columns=["count"])
- df["group"] = "b"
- df_original = df.copy()
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df.iloc[0:5]["group"] = "a"
- tm.assert_frame_equal(df, df_original)
- else:
- with pytest.raises(SettingWithCopyError, match=msg):
- df.iloc[0:5]["group"] = "a"
- @pytest.mark.arm_slow
- def test_detect_chained_assignment_changing_dtype(
- self, using_array_manager, using_copy_on_write
- ):
- # Mixed type setting but same dtype & changing dtype
- df = DataFrame(
- {
- "A": date_range("20130101", periods=5),
- "B": np.random.randn(5),
- "C": np.arange(5, dtype="int64"),
- "D": ["a", "b", "c", "d", "e"],
- }
- )
- df_original = df.copy()
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df.loc[2]["D"] = "foo"
- with tm.raises_chained_assignment_error():
- df.loc[2]["C"] = "foo"
- with tm.raises_chained_assignment_error():
- df["C"][2] = "foo"
- tm.assert_frame_equal(df, df_original)
- if not using_copy_on_write:
- with pytest.raises(SettingWithCopyError, match=msg):
- df.loc[2]["D"] = "foo"
- with pytest.raises(SettingWithCopyError, match=msg):
- df.loc[2]["C"] = "foo"
- if not using_array_manager:
- with pytest.raises(SettingWithCopyError, match=msg):
- df["C"][2] = "foo"
- else:
- # INFO(ArrayManager) for ArrayManager it doesn't matter if it's
- # changing the dtype or not
- df["C"][2] = "foo"
- assert df.loc[2, "C"] == "foo"
- def test_setting_with_copy_bug(self, using_copy_on_write):
- # operating on a copy
- df = DataFrame(
- {"a": list(range(4)), "b": list("ab.."), "c": ["a", "b", np.nan, "d"]}
- )
- df_original = df.copy()
- mask = pd.isna(df.c)
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df[["c"]][mask] = df[["b"]][mask]
- tm.assert_frame_equal(df, df_original)
- else:
- with pytest.raises(SettingWithCopyError, match=msg):
- df[["c"]][mask] = df[["b"]][mask]
- def test_setting_with_copy_bug_no_warning(self):
- # invalid warning as we are returning a new object
- # GH 8730
- df1 = DataFrame({"x": Series(["a", "b", "c"]), "y": Series(["d", "e", "f"])})
- df2 = df1[["x"]]
- # this should not raise
- df2["y"] = ["g", "h", "i"]
- def test_detect_chained_assignment_warnings_errors(self, using_copy_on_write):
- df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df.loc[0]["A"] = 111
- return
- with option_context("chained_assignment", "warn"):
- with tm.assert_produces_warning(SettingWithCopyWarning):
- df.loc[0]["A"] = 111
- with option_context("chained_assignment", "raise"):
- with pytest.raises(SettingWithCopyError, match=msg):
- df.loc[0]["A"] = 111
- @pytest.mark.parametrize("rhs", [3, DataFrame({0: [1, 2, 3, 4]})])
- def test_detect_chained_assignment_warning_stacklevel(
- self, rhs, using_copy_on_write
- ):
- # GH#42570
- df = DataFrame(np.arange(25).reshape(5, 5))
- df_original = df.copy()
- chained = df.loc[:3]
- with option_context("chained_assignment", "warn"):
- if not using_copy_on_write:
- with tm.assert_produces_warning(SettingWithCopyWarning) as t:
- chained[2] = rhs
- assert t[0].filename == __file__
- else:
- # INFO(CoW) no warning, and original dataframe not changed
- with tm.assert_produces_warning(None):
- chained[2] = rhs
- tm.assert_frame_equal(df, df_original)
- # TODO(ArrayManager) fast_xs with array-like scalars is not yet working
- @td.skip_array_manager_not_yet_implemented
- def test_chained_getitem_with_lists(self):
- # GH6394
- # Regression in chained getitem indexing with embedded list-like from
- # 0.12
- df = DataFrame({"A": 5 * [np.zeros(3)], "B": 5 * [np.ones(3)]})
- expected = df["A"].iloc[2]
- result = df.loc[2, "A"]
- tm.assert_numpy_array_equal(result, expected)
- result2 = df.iloc[2]["A"]
- tm.assert_numpy_array_equal(result2, expected)
- result3 = df["A"].loc[2]
- tm.assert_numpy_array_equal(result3, expected)
- result4 = df["A"].iloc[2]
- tm.assert_numpy_array_equal(result4, expected)
- def test_cache_updating(self):
- # GH 4939, make sure to update the cache on setitem
- df = tm.makeDataFrame()
- df["A"] # cache series
- df.loc["Hello Friend"] = df.iloc[0]
- assert "Hello Friend" in df["A"].index
- assert "Hello Friend" in df["B"].index
- def test_cache_updating2(self, using_copy_on_write):
- # 10264
- df = DataFrame(
- np.zeros((5, 5), dtype="int64"),
- columns=["a", "b", "c", "d", "e"],
- index=range(5),
- )
- df["f"] = 0
- df_orig = df.copy()
- if using_copy_on_write:
- with pytest.raises(ValueError, match="read-only"):
- df.f.values[3] = 1
- tm.assert_frame_equal(df, df_orig)
- return
- df.f.values[3] = 1
- df.f.values[3] = 2
- expected = DataFrame(
- np.zeros((5, 6), dtype="int64"),
- columns=["a", "b", "c", "d", "e", "f"],
- index=range(5),
- )
- expected.at[3, "f"] = 2
- tm.assert_frame_equal(df, expected)
- expected = Series([0, 0, 0, 2, 0], name="f")
- tm.assert_series_equal(df.f, expected)
- def test_iloc_setitem_chained_assignment(self, using_copy_on_write):
- # GH#3970
- with option_context("chained_assignment", None):
- df = DataFrame({"aa": range(5), "bb": [2.2] * 5})
- df["cc"] = 0.0
- ck = [True] * len(df)
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df["bb"].iloc[0] = 0.13
- else:
- df["bb"].iloc[0] = 0.13
- # GH#3970 this lookup used to break the chained setting to 0.15
- df.iloc[ck]
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df["bb"].iloc[0] = 0.15
- else:
- df["bb"].iloc[0] = 0.15
- if not using_copy_on_write:
- assert df["bb"].iloc[0] == 0.15
- else:
- assert df["bb"].iloc[0] == 2.2
- def test_getitem_loc_assignment_slice_state(self, using_copy_on_write):
- # GH 13569
- df = DataFrame({"a": [10, 20, 30]})
- if using_copy_on_write:
- with tm.raises_chained_assignment_error():
- df["a"].loc[4] = 40
- else:
- df["a"].loc[4] = 40
- tm.assert_frame_equal(df, DataFrame({"a": [10, 20, 30]}))
- tm.assert_series_equal(df["a"], Series([10, 20, 30], name="a"))
|