# test_replace.py (12 KB)

  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. Categorical,
  5. DataFrame,
  6. )
  7. import pandas._testing as tm
  8. from pandas.tests.copy_view.util import get_array
  9. @pytest.mark.parametrize(
  10. "replace_kwargs",
  11. [
  12. {"to_replace": {"a": 1, "b": 4}, "value": -1},
  13. # Test CoW splits blocks to avoid copying unchanged columns
  14. {"to_replace": {"a": 1}, "value": -1},
  15. {"to_replace": {"b": 4}, "value": -1},
  16. {"to_replace": {"b": {4: 1}}},
  17. # TODO: Add these in a further optimization
  18. # We would need to see which columns got replaced in the mask
  19. # which could be expensive
  20. # {"to_replace": {"b": 1}},
  21. # 1
  22. ],
  23. )
  24. def test_replace(using_copy_on_write, replace_kwargs):
  25. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": ["foo", "bar", "baz"]})
  26. df_orig = df.copy()
  27. df_replaced = df.replace(**replace_kwargs)
  28. if using_copy_on_write:
  29. if (df_replaced["b"] == df["b"]).all():
  30. assert np.shares_memory(get_array(df_replaced, "b"), get_array(df, "b"))
  31. assert np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
  32. # mutating squeezed df triggers a copy-on-write for that column/block
  33. df_replaced.loc[0, "c"] = -1
  34. if using_copy_on_write:
  35. assert not np.shares_memory(get_array(df_replaced, "c"), get_array(df, "c"))
  36. if "a" in replace_kwargs["to_replace"]:
  37. arr = get_array(df_replaced, "a")
  38. df_replaced.loc[0, "a"] = 100
  39. assert np.shares_memory(get_array(df_replaced, "a"), arr)
  40. tm.assert_frame_equal(df, df_orig)
  41. def test_replace_regex_inplace_refs(using_copy_on_write):
  42. df = DataFrame({"a": ["aaa", "bbb"]})
  43. df_orig = df.copy()
  44. view = df[:]
  45. arr = get_array(df, "a")
  46. df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
  47. if using_copy_on_write:
  48. assert not np.shares_memory(arr, get_array(df, "a"))
  49. assert df._mgr._has_no_reference(0)
  50. tm.assert_frame_equal(view, df_orig)
  51. else:
  52. assert np.shares_memory(arr, get_array(df, "a"))
  53. def test_replace_regex_inplace(using_copy_on_write):
  54. df = DataFrame({"a": ["aaa", "bbb"]})
  55. arr = get_array(df, "a")
  56. df.replace(to_replace=r"^a.*$", value="new", inplace=True, regex=True)
  57. if using_copy_on_write:
  58. assert df._mgr._has_no_reference(0)
  59. assert np.shares_memory(arr, get_array(df, "a"))
  60. df_orig = df.copy()
  61. df2 = df.replace(to_replace=r"^b.*$", value="new", regex=True)
  62. tm.assert_frame_equal(df_orig, df)
  63. assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
  64. def test_replace_regex_inplace_no_op(using_copy_on_write):
  65. df = DataFrame({"a": [1, 2]})
  66. arr = get_array(df, "a")
  67. df.replace(to_replace=r"^a.$", value="new", inplace=True, regex=True)
  68. if using_copy_on_write:
  69. assert df._mgr._has_no_reference(0)
  70. assert np.shares_memory(arr, get_array(df, "a"))
  71. df_orig = df.copy()
  72. df2 = df.replace(to_replace=r"^x.$", value="new", regex=True)
  73. tm.assert_frame_equal(df_orig, df)
  74. if using_copy_on_write:
  75. assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
  76. else:
  77. assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
  78. def test_replace_mask_all_false_second_block(using_copy_on_write):
  79. df = DataFrame({"a": [1.5, 2, 3], "b": 100.5, "c": 1, "d": 2})
  80. df_orig = df.copy()
  81. df2 = df.replace(to_replace=1.5, value=55.5)
  82. if using_copy_on_write:
  83. # TODO: Block splitting would allow us to avoid copying b
  84. assert np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
  85. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  86. else:
  87. assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
  88. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  89. df2.loc[0, "c"] = 1
  90. tm.assert_frame_equal(df, df_orig) # Original is unchanged
  91. if using_copy_on_write:
  92. assert not np.shares_memory(get_array(df, "c"), get_array(df2, "c"))
  93. # TODO: This should split and not copy the whole block
  94. # assert np.shares_memory(get_array(df, "d"), get_array(df2, "d"))
  95. def test_replace_coerce_single_column(using_copy_on_write, using_array_manager):
  96. df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
  97. df_orig = df.copy()
  98. df2 = df.replace(to_replace=1.5, value="a")
  99. if using_copy_on_write:
  100. assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  101. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  102. elif not using_array_manager:
  103. assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  104. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  105. if using_copy_on_write:
  106. df2.loc[0, "b"] = 0.5
  107. tm.assert_frame_equal(df, df_orig) # Original is unchanged
  108. assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  109. def test_replace_to_replace_wrong_dtype(using_copy_on_write):
  110. df = DataFrame({"a": [1.5, 2, 3], "b": 100.5})
  111. df_orig = df.copy()
  112. df2 = df.replace(to_replace="xxx", value=1.5)
  113. if using_copy_on_write:
  114. assert np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  115. assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  116. else:
  117. assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  118. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  119. df2.loc[0, "b"] = 0.5
  120. tm.assert_frame_equal(df, df_orig) # Original is unchanged
  121. if using_copy_on_write:
  122. assert not np.shares_memory(get_array(df, "b"), get_array(df2, "b"))
  123. def test_replace_list_categorical(using_copy_on_write):
  124. df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
  125. arr = get_array(df, "a")
  126. df.replace(["c"], value="a", inplace=True)
  127. assert np.shares_memory(arr.codes, get_array(df, "a").codes)
  128. if using_copy_on_write:
  129. assert df._mgr._has_no_reference(0)
  130. df_orig = df.copy()
  131. df2 = df.replace(["b"], value="a")
  132. assert not np.shares_memory(arr.codes, get_array(df2, "a").codes)
  133. tm.assert_frame_equal(df, df_orig)
  134. def test_replace_list_inplace_refs_categorical(using_copy_on_write):
  135. df = DataFrame({"a": ["a", "b", "c"]}, dtype="category")
  136. view = df[:]
  137. df_orig = df.copy()
  138. df.replace(["c"], value="a", inplace=True)
  139. if using_copy_on_write:
  140. assert not np.shares_memory(
  141. get_array(view, "a").codes, get_array(df, "a").codes
  142. )
  143. tm.assert_frame_equal(df_orig, view)
  144. else:
  145. # This could be inplace
  146. assert not np.shares_memory(
  147. get_array(view, "a").codes, get_array(df, "a").codes
  148. )
  149. @pytest.mark.parametrize("to_replace", [1.5, [1.5], []])
  150. def test_replace_inplace(using_copy_on_write, to_replace):
  151. df = DataFrame({"a": [1.5, 2, 3]})
  152. arr_a = get_array(df, "a")
  153. df.replace(to_replace=1.5, value=15.5, inplace=True)
  154. assert np.shares_memory(get_array(df, "a"), arr_a)
  155. if using_copy_on_write:
  156. assert df._mgr._has_no_reference(0)
  157. @pytest.mark.parametrize("to_replace", [1.5, [1.5]])
  158. def test_replace_inplace_reference(using_copy_on_write, to_replace):
  159. df = DataFrame({"a": [1.5, 2, 3]})
  160. arr_a = get_array(df, "a")
  161. view = df[:]
  162. df.replace(to_replace=to_replace, value=15.5, inplace=True)
  163. if using_copy_on_write:
  164. assert not np.shares_memory(get_array(df, "a"), arr_a)
  165. assert df._mgr._has_no_reference(0)
  166. assert view._mgr._has_no_reference(0)
  167. else:
  168. assert np.shares_memory(get_array(df, "a"), arr_a)
  169. @pytest.mark.parametrize("to_replace", ["a", 100.5])
  170. def test_replace_inplace_reference_no_op(using_copy_on_write, to_replace):
  171. df = DataFrame({"a": [1.5, 2, 3]})
  172. arr_a = get_array(df, "a")
  173. view = df[:]
  174. df.replace(to_replace=to_replace, value=15.5, inplace=True)
  175. assert np.shares_memory(get_array(df, "a"), arr_a)
  176. if using_copy_on_write:
  177. assert not df._mgr._has_no_reference(0)
  178. assert not view._mgr._has_no_reference(0)
  179. @pytest.mark.parametrize("to_replace", [1, [1]])
  180. @pytest.mark.parametrize("val", [1, 1.5])
  181. def test_replace_categorical_inplace_reference(using_copy_on_write, val, to_replace):
  182. df = DataFrame({"a": Categorical([1, 2, 3])})
  183. df_orig = df.copy()
  184. arr_a = get_array(df, "a")
  185. view = df[:]
  186. df.replace(to_replace=to_replace, value=val, inplace=True)
  187. if using_copy_on_write:
  188. assert not np.shares_memory(get_array(df, "a").codes, arr_a.codes)
  189. assert df._mgr._has_no_reference(0)
  190. assert view._mgr._has_no_reference(0)
  191. tm.assert_frame_equal(view, df_orig)
  192. else:
  193. assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
  194. @pytest.mark.parametrize("val", [1, 1.5])
  195. def test_replace_categorical_inplace(using_copy_on_write, val):
  196. df = DataFrame({"a": Categorical([1, 2, 3])})
  197. arr_a = get_array(df, "a")
  198. df.replace(to_replace=1, value=val, inplace=True)
  199. assert np.shares_memory(get_array(df, "a").codes, arr_a.codes)
  200. if using_copy_on_write:
  201. assert df._mgr._has_no_reference(0)
  202. expected = DataFrame({"a": Categorical([val, 2, 3])})
  203. tm.assert_frame_equal(df, expected)
  204. @pytest.mark.parametrize("val", [1, 1.5])
  205. def test_replace_categorical(using_copy_on_write, val):
  206. df = DataFrame({"a": Categorical([1, 2, 3])})
  207. df_orig = df.copy()
  208. df2 = df.replace(to_replace=1, value=val)
  209. if using_copy_on_write:
  210. assert df._mgr._has_no_reference(0)
  211. assert df2._mgr._has_no_reference(0)
  212. assert not np.shares_memory(get_array(df, "a").codes, get_array(df2, "a").codes)
  213. tm.assert_frame_equal(df, df_orig)
  214. arr_a = get_array(df2, "a").codes
  215. df2.iloc[0, 0] = 2.0
  216. assert np.shares_memory(get_array(df2, "a").codes, arr_a)
  217. @pytest.mark.parametrize("method", ["where", "mask"])
  218. def test_masking_inplace(using_copy_on_write, method):
  219. df = DataFrame({"a": [1.5, 2, 3]})
  220. df_orig = df.copy()
  221. arr_a = get_array(df, "a")
  222. view = df[:]
  223. method = getattr(df, method)
  224. method(df["a"] > 1.6, -1, inplace=True)
  225. if using_copy_on_write:
  226. assert not np.shares_memory(get_array(df, "a"), arr_a)
  227. assert df._mgr._has_no_reference(0)
  228. assert view._mgr._has_no_reference(0)
  229. tm.assert_frame_equal(view, df_orig)
  230. else:
  231. assert np.shares_memory(get_array(df, "a"), arr_a)
  232. def test_replace_empty_list(using_copy_on_write):
  233. df = DataFrame({"a": [1, 2]})
  234. df2 = df.replace([], [])
  235. if using_copy_on_write:
  236. assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
  237. assert not df._mgr._has_no_reference(0)
  238. else:
  239. assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
  240. arr_a = get_array(df, "a")
  241. df.replace([], [])
  242. if using_copy_on_write:
  243. assert np.shares_memory(get_array(df, "a"), arr_a)
  244. assert not df._mgr._has_no_reference(0)
  245. assert not df2._mgr._has_no_reference(0)
  246. @pytest.mark.parametrize("value", ["d", None])
  247. def test_replace_object_list_inplace(using_copy_on_write, value):
  248. df = DataFrame({"a": ["a", "b", "c"]})
  249. arr = get_array(df, "a")
  250. df.replace(["c"], value, inplace=True)
  251. if using_copy_on_write or value is None:
  252. assert np.shares_memory(arr, get_array(df, "a"))
  253. else:
  254. # This could be inplace
  255. assert not np.shares_memory(arr, get_array(df, "a"))
  256. if using_copy_on_write:
  257. assert df._mgr._has_no_reference(0)
  258. def test_replace_list_multiple_elements_inplace(using_copy_on_write):
  259. df = DataFrame({"a": [1, 2, 3]})
  260. arr = get_array(df, "a")
  261. df.replace([1, 2], 4, inplace=True)
  262. if using_copy_on_write:
  263. # TODO(CoW): This should share memory
  264. assert not np.shares_memory(arr, get_array(df, "a"))
  265. assert df._mgr._has_no_reference(0)
  266. else:
  267. assert np.shares_memory(arr, get_array(df, "a"))
  268. def test_replace_list_none(using_copy_on_write):
  269. df = DataFrame({"a": ["a", "b", "c"]})
  270. df_orig = df.copy()
  271. df2 = df.replace(["b"], value=None)
  272. tm.assert_frame_equal(df, df_orig)
  273. assert not np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  274. def test_replace_list_none_inplace_refs(using_copy_on_write):
  275. df = DataFrame({"a": ["a", "b", "c"]})
  276. arr = get_array(df, "a")
  277. df_orig = df.copy()
  278. view = df[:]
  279. df.replace(["a"], value=None, inplace=True)
  280. if using_copy_on_write:
  281. assert df._mgr._has_no_reference(0)
  282. assert not np.shares_memory(arr, get_array(df, "a"))
  283. tm.assert_frame_equal(df_orig, view)
  284. else:
  285. assert np.shares_memory(arr, get_array(df, "a"))