# test_indexing.py (35 KB)
  1. import numpy as np
  2. import pytest
  3. from pandas.errors import SettingWithCopyWarning
  4. from pandas.core.dtypes.common import is_float_dtype
  5. import pandas as pd
  6. from pandas import (
  7. DataFrame,
  8. Series,
  9. )
  10. import pandas._testing as tm
  11. from pandas.tests.copy_view.util import get_array
  12. @pytest.fixture(params=["numpy", "nullable"])
  13. def backend(request):
  14. if request.param == "numpy":
  15. def make_dataframe(*args, **kwargs):
  16. return DataFrame(*args, **kwargs)
  17. def make_series(*args, **kwargs):
  18. return Series(*args, **kwargs)
  19. elif request.param == "nullable":
  20. def make_dataframe(*args, **kwargs):
  21. df = DataFrame(*args, **kwargs)
  22. df_nullable = df.convert_dtypes()
  23. # convert_dtypes will try to cast float to int if there is no loss in
  24. # precision -> undo that change
  25. for col in df.columns:
  26. if is_float_dtype(df[col].dtype) and not is_float_dtype(
  27. df_nullable[col].dtype
  28. ):
  29. df_nullable[col] = df_nullable[col].astype("Float64")
  30. # copy final result to ensure we start with a fully self-owning DataFrame
  31. return df_nullable.copy()
  32. def make_series(*args, **kwargs):
  33. ser = Series(*args, **kwargs)
  34. return ser.convert_dtypes().copy()
  35. return request.param, make_dataframe, make_series
  36. # -----------------------------------------------------------------------------
  37. # Indexing operations taking subset + modifying the subset/parent
  38. def test_subset_column_selection(backend, using_copy_on_write):
  39. # Case: taking a subset of the columns of a DataFrame
  40. # + afterwards modifying the subset
  41. _, DataFrame, _ = backend
  42. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
  43. df_orig = df.copy()
  44. subset = df[["a", "c"]]
  45. if using_copy_on_write:
  46. # the subset shares memory ...
  47. assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
  48. # ... but uses CoW when being modified
  49. subset.iloc[0, 0] = 0
  50. else:
  51. assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
  52. # INFO this no longer raise warning since pandas 1.4
  53. # with pd.option_context("chained_assignment", "warn"):
  54. # with tm.assert_produces_warning(SettingWithCopyWarning):
  55. subset.iloc[0, 0] = 0
  56. assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
  57. expected = DataFrame({"a": [0, 2, 3], "c": [0.1, 0.2, 0.3]})
  58. tm.assert_frame_equal(subset, expected)
  59. tm.assert_frame_equal(df, df_orig)
  60. def test_subset_column_selection_modify_parent(backend, using_copy_on_write):
  61. # Case: taking a subset of the columns of a DataFrame
  62. # + afterwards modifying the parent
  63. _, DataFrame, _ = backend
  64. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
  65. subset = df[["a", "c"]]
  66. if using_copy_on_write:
  67. # the subset shares memory ...
  68. assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
  69. # ... but parent uses CoW parent when it is modified
  70. df.iloc[0, 0] = 0
  71. assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
  72. if using_copy_on_write:
  73. # different column/block still shares memory
  74. assert np.shares_memory(get_array(subset, "c"), get_array(df, "c"))
  75. expected = DataFrame({"a": [1, 2, 3], "c": [0.1, 0.2, 0.3]})
  76. tm.assert_frame_equal(subset, expected)
  77. def test_subset_row_slice(backend, using_copy_on_write):
  78. # Case: taking a subset of the rows of a DataFrame using a slice
  79. # + afterwards modifying the subset
  80. _, DataFrame, _ = backend
  81. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
  82. df_orig = df.copy()
  83. subset = df[1:3]
  84. subset._mgr._verify_integrity()
  85. assert np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
  86. if using_copy_on_write:
  87. subset.iloc[0, 0] = 0
  88. assert not np.shares_memory(get_array(subset, "a"), get_array(df, "a"))
  89. else:
  90. # INFO this no longer raise warning since pandas 1.4
  91. # with pd.option_context("chained_assignment", "warn"):
  92. # with tm.assert_produces_warning(SettingWithCopyWarning):
  93. subset.iloc[0, 0] = 0
  94. subset._mgr._verify_integrity()
  95. expected = DataFrame({"a": [0, 3], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3))
  96. tm.assert_frame_equal(subset, expected)
  97. if using_copy_on_write:
  98. # original parent dataframe is not modified (CoW)
  99. tm.assert_frame_equal(df, df_orig)
  100. else:
  101. # original parent dataframe is actually updated
  102. df_orig.iloc[1, 0] = 0
  103. tm.assert_frame_equal(df, df_orig)
  104. @pytest.mark.parametrize(
  105. "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
  106. )
  107. def test_subset_column_slice(backend, using_copy_on_write, using_array_manager, dtype):
  108. # Case: taking a subset of the columns of a DataFrame using a slice
  109. # + afterwards modifying the subset
  110. dtype_backend, DataFrame, _ = backend
  111. single_block = (
  112. dtype == "int64" and dtype_backend == "numpy"
  113. ) and not using_array_manager
  114. df = DataFrame(
  115. {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
  116. )
  117. df_orig = df.copy()
  118. subset = df.iloc[:, 1:]
  119. subset._mgr._verify_integrity()
  120. if using_copy_on_write:
  121. assert np.shares_memory(get_array(subset, "b"), get_array(df, "b"))
  122. subset.iloc[0, 0] = 0
  123. assert not np.shares_memory(get_array(subset, "b"), get_array(df, "b"))
  124. else:
  125. # we only get a warning in case of a single block
  126. warn = SettingWithCopyWarning if single_block else None
  127. with pd.option_context("chained_assignment", "warn"):
  128. with tm.assert_produces_warning(warn):
  129. subset.iloc[0, 0] = 0
  130. expected = DataFrame({"b": [0, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)})
  131. tm.assert_frame_equal(subset, expected)
  132. # original parent dataframe is not modified (also not for BlockManager case,
  133. # except for single block)
  134. if not using_copy_on_write and (using_array_manager or single_block):
  135. df_orig.iloc[0, 1] = 0
  136. tm.assert_frame_equal(df, df_orig)
  137. else:
  138. tm.assert_frame_equal(df, df_orig)
  139. @pytest.mark.parametrize(
  140. "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
  141. )
  142. @pytest.mark.parametrize(
  143. "row_indexer",
  144. [slice(1, 2), np.array([False, True, True]), np.array([1, 2])],
  145. ids=["slice", "mask", "array"],
  146. )
  147. @pytest.mark.parametrize(
  148. "column_indexer",
  149. [slice("b", "c"), np.array([False, True, True]), ["b", "c"]],
  150. ids=["slice", "mask", "array"],
  151. )
  152. def test_subset_loc_rows_columns(
  153. backend,
  154. dtype,
  155. row_indexer,
  156. column_indexer,
  157. using_array_manager,
  158. using_copy_on_write,
  159. ):
  160. # Case: taking a subset of the rows+columns of a DataFrame using .loc
  161. # + afterwards modifying the subset
  162. # Generic test for several combinations of row/column indexers, not all
  163. # of those could actually return a view / need CoW (so this test is not
  164. # checking memory sharing, only ensuring subsequent mutation doesn't
  165. # affect the parent dataframe)
  166. dtype_backend, DataFrame, _ = backend
  167. df = DataFrame(
  168. {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
  169. )
  170. df_orig = df.copy()
  171. subset = df.loc[row_indexer, column_indexer]
  172. # modifying the subset never modifies the parent
  173. subset.iloc[0, 0] = 0
  174. expected = DataFrame(
  175. {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3)
  176. )
  177. tm.assert_frame_equal(subset, expected)
  178. # a few corner cases _do_ actually modify the parent (with both row and column
  179. # slice, and in case of ArrayManager or BlockManager with single block)
  180. if (
  181. isinstance(row_indexer, slice)
  182. and isinstance(column_indexer, slice)
  183. and (
  184. using_array_manager
  185. or (
  186. dtype == "int64"
  187. and dtype_backend == "numpy"
  188. and not using_copy_on_write
  189. )
  190. )
  191. ):
  192. df_orig.iloc[1, 1] = 0
  193. tm.assert_frame_equal(df, df_orig)
  194. @pytest.mark.parametrize(
  195. "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
  196. )
  197. @pytest.mark.parametrize(
  198. "row_indexer",
  199. [slice(1, 3), np.array([False, True, True]), np.array([1, 2])],
  200. ids=["slice", "mask", "array"],
  201. )
  202. @pytest.mark.parametrize(
  203. "column_indexer",
  204. [slice(1, 3), np.array([False, True, True]), [1, 2]],
  205. ids=["slice", "mask", "array"],
  206. )
  207. def test_subset_iloc_rows_columns(
  208. backend,
  209. dtype,
  210. row_indexer,
  211. column_indexer,
  212. using_array_manager,
  213. using_copy_on_write,
  214. ):
  215. # Case: taking a subset of the rows+columns of a DataFrame using .iloc
  216. # + afterwards modifying the subset
  217. # Generic test for several combinations of row/column indexers, not all
  218. # of those could actually return a view / need CoW (so this test is not
  219. # checking memory sharing, only ensuring subsequent mutation doesn't
  220. # affect the parent dataframe)
  221. dtype_backend, DataFrame, _ = backend
  222. df = DataFrame(
  223. {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
  224. )
  225. df_orig = df.copy()
  226. subset = df.iloc[row_indexer, column_indexer]
  227. # modifying the subset never modifies the parent
  228. subset.iloc[0, 0] = 0
  229. expected = DataFrame(
  230. {"b": [0, 6], "c": np.array([8, 9], dtype=dtype)}, index=range(1, 3)
  231. )
  232. tm.assert_frame_equal(subset, expected)
  233. # a few corner cases _do_ actually modify the parent (with both row and column
  234. # slice, and in case of ArrayManager or BlockManager with single block)
  235. if (
  236. isinstance(row_indexer, slice)
  237. and isinstance(column_indexer, slice)
  238. and (
  239. using_array_manager
  240. or (
  241. dtype == "int64"
  242. and dtype_backend == "numpy"
  243. and not using_copy_on_write
  244. )
  245. )
  246. ):
  247. df_orig.iloc[1, 1] = 0
  248. tm.assert_frame_equal(df, df_orig)
  249. @pytest.mark.parametrize(
  250. "indexer",
  251. [slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
  252. ids=["slice", "mask", "array"],
  253. )
  254. def test_subset_set_with_row_indexer(backend, indexer_si, indexer, using_copy_on_write):
  255. # Case: setting values with a row indexer on a viewing subset
  256. # subset[indexer] = value and subset.iloc[indexer] = value
  257. _, DataFrame, _ = backend
  258. df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]})
  259. df_orig = df.copy()
  260. subset = df[1:4]
  261. if (
  262. indexer_si is tm.setitem
  263. and isinstance(indexer, np.ndarray)
  264. and indexer.dtype == "int"
  265. ):
  266. pytest.skip("setitem with labels selects on columns")
  267. if using_copy_on_write:
  268. indexer_si(subset)[indexer] = 0
  269. else:
  270. # INFO iloc no longer raises warning since pandas 1.4
  271. warn = SettingWithCopyWarning if indexer_si is tm.setitem else None
  272. with pd.option_context("chained_assignment", "warn"):
  273. with tm.assert_produces_warning(warn):
  274. indexer_si(subset)[indexer] = 0
  275. expected = DataFrame(
  276. {"a": [0, 0, 4], "b": [0, 0, 7], "c": [0.0, 0.0, 0.4]}, index=range(1, 4)
  277. )
  278. tm.assert_frame_equal(subset, expected)
  279. if using_copy_on_write:
  280. # original parent dataframe is not modified (CoW)
  281. tm.assert_frame_equal(df, df_orig)
  282. else:
  283. # original parent dataframe is actually updated
  284. df_orig[1:3] = 0
  285. tm.assert_frame_equal(df, df_orig)
  286. def test_subset_set_with_mask(backend, using_copy_on_write):
  287. # Case: setting values with a mask on a viewing subset: subset[mask] = value
  288. _, DataFrame, _ = backend
  289. df = DataFrame({"a": [1, 2, 3, 4], "b": [4, 5, 6, 7], "c": [0.1, 0.2, 0.3, 0.4]})
  290. df_orig = df.copy()
  291. subset = df[1:4]
  292. mask = subset > 3
  293. if using_copy_on_write:
  294. subset[mask] = 0
  295. else:
  296. with pd.option_context("chained_assignment", "warn"):
  297. with tm.assert_produces_warning(SettingWithCopyWarning):
  298. subset[mask] = 0
  299. expected = DataFrame(
  300. {"a": [2, 3, 0], "b": [0, 0, 0], "c": [0.20, 0.3, 0.4]}, index=range(1, 4)
  301. )
  302. tm.assert_frame_equal(subset, expected)
  303. if using_copy_on_write:
  304. # original parent dataframe is not modified (CoW)
  305. tm.assert_frame_equal(df, df_orig)
  306. else:
  307. # original parent dataframe is actually updated
  308. df_orig.loc[3, "a"] = 0
  309. df_orig.loc[1:3, "b"] = 0
  310. tm.assert_frame_equal(df, df_orig)
  311. def test_subset_set_column(backend, using_copy_on_write):
  312. # Case: setting a single column on a viewing subset -> subset[col] = value
  313. dtype_backend, DataFrame, _ = backend
  314. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
  315. df_orig = df.copy()
  316. subset = df[1:3]
  317. if dtype_backend == "numpy":
  318. arr = np.array([10, 11], dtype="int64")
  319. else:
  320. arr = pd.array([10, 11], dtype="Int64")
  321. if using_copy_on_write:
  322. subset["a"] = arr
  323. else:
  324. with pd.option_context("chained_assignment", "warn"):
  325. with tm.assert_produces_warning(SettingWithCopyWarning):
  326. subset["a"] = arr
  327. subset._mgr._verify_integrity()
  328. expected = DataFrame(
  329. {"a": [10, 11], "b": [5, 6], "c": [0.2, 0.3]}, index=range(1, 3)
  330. )
  331. tm.assert_frame_equal(subset, expected)
  332. tm.assert_frame_equal(df, df_orig)
  333. @pytest.mark.parametrize(
  334. "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
  335. )
  336. def test_subset_set_column_with_loc(
  337. backend, using_copy_on_write, using_array_manager, dtype
  338. ):
  339. # Case: setting a single column with loc on a viewing subset
  340. # -> subset.loc[:, col] = value
  341. _, DataFrame, _ = backend
  342. df = DataFrame(
  343. {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
  344. )
  345. df_orig = df.copy()
  346. subset = df[1:3]
  347. if using_copy_on_write:
  348. subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
  349. else:
  350. with pd.option_context("chained_assignment", "warn"):
  351. with tm.assert_produces_warning(
  352. None,
  353. raise_on_extra_warnings=not using_array_manager,
  354. ):
  355. subset.loc[:, "a"] = np.array([10, 11], dtype="int64")
  356. subset._mgr._verify_integrity()
  357. expected = DataFrame(
  358. {"a": [10, 11], "b": [5, 6], "c": np.array([8, 9], dtype=dtype)},
  359. index=range(1, 3),
  360. )
  361. tm.assert_frame_equal(subset, expected)
  362. if using_copy_on_write:
  363. # original parent dataframe is not modified (CoW)
  364. tm.assert_frame_equal(df, df_orig)
  365. else:
  366. # original parent dataframe is actually updated
  367. df_orig.loc[1:3, "a"] = np.array([10, 11], dtype="int64")
  368. tm.assert_frame_equal(df, df_orig)
  369. def test_subset_set_column_with_loc2(backend, using_copy_on_write, using_array_manager):
  370. # Case: setting a single column with loc on a viewing subset
  371. # -> subset.loc[:, col] = value
  372. # separate test for case of DataFrame of a single column -> takes a separate
  373. # code path
  374. _, DataFrame, _ = backend
  375. df = DataFrame({"a": [1, 2, 3]})
  376. df_orig = df.copy()
  377. subset = df[1:3]
  378. if using_copy_on_write:
  379. subset.loc[:, "a"] = 0
  380. else:
  381. with pd.option_context("chained_assignment", "warn"):
  382. with tm.assert_produces_warning(
  383. None,
  384. raise_on_extra_warnings=not using_array_manager,
  385. ):
  386. subset.loc[:, "a"] = 0
  387. subset._mgr._verify_integrity()
  388. expected = DataFrame({"a": [0, 0]}, index=range(1, 3))
  389. tm.assert_frame_equal(subset, expected)
  390. if using_copy_on_write:
  391. # original parent dataframe is not modified (CoW)
  392. tm.assert_frame_equal(df, df_orig)
  393. else:
  394. # original parent dataframe is actually updated
  395. df_orig.loc[1:3, "a"] = 0
  396. tm.assert_frame_equal(df, df_orig)
  397. @pytest.mark.parametrize(
  398. "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
  399. )
  400. def test_subset_set_columns(backend, using_copy_on_write, dtype):
  401. # Case: setting multiple columns on a viewing subset
  402. # -> subset[[col1, col2]] = value
  403. dtype_backend, DataFrame, _ = backend
  404. df = DataFrame(
  405. {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
  406. )
  407. df_orig = df.copy()
  408. subset = df[1:3]
  409. if using_copy_on_write:
  410. subset[["a", "c"]] = 0
  411. else:
  412. with pd.option_context("chained_assignment", "warn"):
  413. with tm.assert_produces_warning(SettingWithCopyWarning):
  414. subset[["a", "c"]] = 0
  415. subset._mgr._verify_integrity()
  416. if using_copy_on_write:
  417. # first and third column should certainly have no references anymore
  418. assert all(subset._mgr._has_no_reference(i) for i in [0, 2])
  419. expected = DataFrame({"a": [0, 0], "b": [5, 6], "c": [0, 0]}, index=range(1, 3))
  420. if dtype_backend == "nullable":
  421. # there is not yet a global option, so overriding a column by setting a scalar
  422. # defaults to numpy dtype even if original column was nullable
  423. expected["a"] = expected["a"].astype("int64")
  424. expected["c"] = expected["c"].astype("int64")
  425. tm.assert_frame_equal(subset, expected)
  426. tm.assert_frame_equal(df, df_orig)
  427. @pytest.mark.parametrize(
  428. "indexer",
  429. [slice("a", "b"), np.array([True, True, False]), ["a", "b"]],
  430. ids=["slice", "mask", "array"],
  431. )
  432. def test_subset_set_with_column_indexer(backend, indexer, using_copy_on_write):
  433. # Case: setting multiple columns with a column indexer on a viewing subset
  434. # -> subset.loc[:, [col1, col2]] = value
  435. _, DataFrame, _ = backend
  436. df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "c": [4, 5, 6]})
  437. df_orig = df.copy()
  438. subset = df[1:3]
  439. if using_copy_on_write:
  440. subset.loc[:, indexer] = 0
  441. else:
  442. with pd.option_context("chained_assignment", "warn"):
  443. # As of 2.0, this setitem attempts (successfully) to set values
  444. # inplace, so the assignment is not chained.
  445. subset.loc[:, indexer] = 0
  446. subset._mgr._verify_integrity()
  447. expected = DataFrame({"a": [0, 0], "b": [0.0, 0.0], "c": [5, 6]}, index=range(1, 3))
  448. tm.assert_frame_equal(subset, expected)
  449. if using_copy_on_write:
  450. tm.assert_frame_equal(df, df_orig)
  451. else:
  452. # pre-2.0, in the mixed case with BlockManager, only column "a"
  453. # would be mutated in the parent frame. this changed with the
  454. # enforcement of GH#45333
  455. df_orig.loc[1:2, ["a", "b"]] = 0
  456. tm.assert_frame_equal(df, df_orig)
  457. @pytest.mark.parametrize(
  458. "method",
  459. [
  460. lambda df: df[["a", "b"]][0:2],
  461. lambda df: df[0:2][["a", "b"]],
  462. lambda df: df[["a", "b"]].iloc[0:2],
  463. lambda df: df[["a", "b"]].loc[0:1],
  464. lambda df: df[0:2].iloc[:, 0:2],
  465. lambda df: df[0:2].loc[:, "a":"b"], # type: ignore[misc]
  466. ],
  467. ids=[
  468. "row-getitem-slice",
  469. "column-getitem",
  470. "row-iloc-slice",
  471. "row-loc-slice",
  472. "column-iloc-slice",
  473. "column-loc-slice",
  474. ],
  475. )
  476. @pytest.mark.parametrize(
  477. "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
  478. )
  479. def test_subset_chained_getitem(
  480. request, backend, method, dtype, using_copy_on_write, using_array_manager
  481. ):
  482. # Case: creating a subset using multiple, chained getitem calls using views
  483. # still needs to guarantee proper CoW behaviour
  484. _, DataFrame, _ = backend
  485. df = DataFrame(
  486. {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
  487. )
  488. df_orig = df.copy()
  489. # when not using CoW, it depends on whether we have a single block or not
  490. # and whether we are slicing the columns -> in that case we have a view
  491. test_callspec = request.node.callspec.id
  492. if not using_array_manager:
  493. subset_is_view = test_callspec in (
  494. "numpy-single-block-column-iloc-slice",
  495. "numpy-single-block-column-loc-slice",
  496. )
  497. else:
  498. # with ArrayManager, it doesn't matter whether we have
  499. # single vs mixed block or numpy vs nullable dtypes
  500. subset_is_view = test_callspec.endswith(
  501. "column-iloc-slice"
  502. ) or test_callspec.endswith("column-loc-slice")
  503. # modify subset -> don't modify parent
  504. subset = method(df)
  505. subset.iloc[0, 0] = 0
  506. if using_copy_on_write or (not subset_is_view):
  507. tm.assert_frame_equal(df, df_orig)
  508. else:
  509. assert df.iloc[0, 0] == 0
  510. # modify parent -> don't modify subset
  511. subset = method(df)
  512. df.iloc[0, 0] = 0
  513. expected = DataFrame({"a": [1, 2], "b": [4, 5]})
  514. if using_copy_on_write or not subset_is_view:
  515. tm.assert_frame_equal(subset, expected)
  516. else:
  517. assert subset.iloc[0, 0] == 0
  518. @pytest.mark.parametrize(
  519. "dtype", ["int64", "float64"], ids=["single-block", "mixed-block"]
  520. )
  521. def test_subset_chained_getitem_column(backend, dtype, using_copy_on_write):
  522. # Case: creating a subset using multiple, chained getitem calls using views
  523. # still needs to guarantee proper CoW behaviour
  524. _, DataFrame, Series = backend
  525. df = DataFrame(
  526. {"a": [1, 2, 3], "b": [4, 5, 6], "c": np.array([7, 8, 9], dtype=dtype)}
  527. )
  528. df_orig = df.copy()
  529. # modify subset -> don't modify parent
  530. subset = df[:]["a"][0:2]
  531. df._clear_item_cache()
  532. subset.iloc[0] = 0
  533. if using_copy_on_write:
  534. tm.assert_frame_equal(df, df_orig)
  535. else:
  536. assert df.iloc[0, 0] == 0
  537. # modify parent -> don't modify subset
  538. subset = df[:]["a"][0:2]
  539. df._clear_item_cache()
  540. df.iloc[0, 0] = 0
  541. expected = Series([1, 2], name="a")
  542. if using_copy_on_write:
  543. tm.assert_series_equal(subset, expected)
  544. else:
  545. assert subset.iloc[0] == 0
  546. @pytest.mark.parametrize(
  547. "method",
  548. [
  549. lambda s: s["a":"c"]["a":"b"], # type: ignore[misc]
  550. lambda s: s.iloc[0:3].iloc[0:2],
  551. lambda s: s.loc["a":"c"].loc["a":"b"], # type: ignore[misc]
  552. lambda s: s.loc["a":"c"] # type: ignore[misc]
  553. .iloc[0:3]
  554. .iloc[0:2]
  555. .loc["a":"b"] # type: ignore[misc]
  556. .iloc[0:1],
  557. ],
  558. ids=["getitem", "iloc", "loc", "long-chain"],
  559. )
  560. def test_subset_chained_getitem_series(backend, method, using_copy_on_write):
  561. # Case: creating a subset using multiple, chained getitem calls using views
  562. # still needs to guarantee proper CoW behaviour
  563. _, _, Series = backend
  564. s = Series([1, 2, 3], index=["a", "b", "c"])
  565. s_orig = s.copy()
  566. # modify subset -> don't modify parent
  567. subset = method(s)
  568. subset.iloc[0] = 0
  569. if using_copy_on_write:
  570. tm.assert_series_equal(s, s_orig)
  571. else:
  572. assert s.iloc[0] == 0
  573. # modify parent -> don't modify subset
  574. subset = s.iloc[0:3].iloc[0:2]
  575. s.iloc[0] = 0
  576. expected = Series([1, 2], index=["a", "b"])
  577. if using_copy_on_write:
  578. tm.assert_series_equal(subset, expected)
  579. else:
  580. assert subset.iloc[0] == 0
  581. def test_subset_chained_single_block_row(using_copy_on_write, using_array_manager):
  582. # not parametrizing this for dtype backend, since this explicitly tests single block
  583. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
  584. df_orig = df.copy()
  585. # modify subset -> don't modify parent
  586. subset = df[:].iloc[0].iloc[0:2]
  587. subset.iloc[0] = 0
  588. if using_copy_on_write or using_array_manager:
  589. tm.assert_frame_equal(df, df_orig)
  590. else:
  591. assert df.iloc[0, 0] == 0
  592. # modify parent -> don't modify subset
  593. subset = df[:].iloc[0].iloc[0:2]
  594. df.iloc[0, 0] = 0
  595. expected = Series([1, 4], index=["a", "b"], name=0)
  596. if using_copy_on_write or using_array_manager:
  597. tm.assert_series_equal(subset, expected)
  598. else:
  599. assert subset.iloc[0] == 0
  600. @pytest.mark.parametrize(
  601. "method",
  602. [
  603. lambda df: df[:],
  604. lambda df: df.loc[:, :],
  605. lambda df: df.loc[:],
  606. lambda df: df.iloc[:, :],
  607. lambda df: df.iloc[:],
  608. ],
  609. ids=["getitem", "loc", "loc-rows", "iloc", "iloc-rows"],
  610. )
  611. def test_null_slice(backend, method, using_copy_on_write):
  612. # Case: also all variants of indexing with a null slice (:) should return
  613. # new objects to ensure we correctly use CoW for the results
  614. _, DataFrame, _ = backend
  615. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
  616. df_orig = df.copy()
  617. df2 = method(df)
  618. # we always return new objects (shallow copy), regardless of CoW or not
  619. assert df2 is not df
  620. # and those trigger CoW when mutated
  621. df2.iloc[0, 0] = 0
  622. if using_copy_on_write:
  623. tm.assert_frame_equal(df, df_orig)
  624. else:
  625. assert df.iloc[0, 0] == 0
  626. @pytest.mark.parametrize(
  627. "method",
  628. [
  629. lambda s: s[:],
  630. lambda s: s.loc[:],
  631. lambda s: s.iloc[:],
  632. ],
  633. ids=["getitem", "loc", "iloc"],
  634. )
  635. def test_null_slice_series(backend, method, using_copy_on_write):
  636. _, _, Series = backend
  637. s = Series([1, 2, 3], index=["a", "b", "c"])
  638. s_orig = s.copy()
  639. s2 = method(s)
  640. # we always return new objects, regardless of CoW or not
  641. assert s2 is not s
  642. # and those trigger CoW when mutated
  643. s2.iloc[0] = 0
  644. if using_copy_on_write:
  645. tm.assert_series_equal(s, s_orig)
  646. else:
  647. assert s.iloc[0] == 0
  648. # TODO add more tests modifying the parent
  649. # -----------------------------------------------------------------------------
  650. # Series -- Indexing operations taking subset + modifying the subset/parent
  651. def test_series_getitem_slice(backend, using_copy_on_write):
  652. # Case: taking a slice of a Series + afterwards modifying the subset
  653. _, _, Series = backend
  654. s = Series([1, 2, 3], index=["a", "b", "c"])
  655. s_orig = s.copy()
  656. subset = s[:]
  657. assert np.shares_memory(get_array(subset), get_array(s))
  658. subset.iloc[0] = 0
  659. if using_copy_on_write:
  660. assert not np.shares_memory(get_array(subset), get_array(s))
  661. expected = Series([0, 2, 3], index=["a", "b", "c"])
  662. tm.assert_series_equal(subset, expected)
  663. if using_copy_on_write:
  664. # original parent series is not modified (CoW)
  665. tm.assert_series_equal(s, s_orig)
  666. else:
  667. # original parent series is actually updated
  668. assert s.iloc[0] == 0
  669. @pytest.mark.parametrize(
  670. "indexer",
  671. [slice(0, 2), np.array([True, True, False]), np.array([0, 1])],
  672. ids=["slice", "mask", "array"],
  673. )
  674. def test_series_subset_set_with_indexer(
  675. backend, indexer_si, indexer, using_copy_on_write
  676. ):
  677. # Case: setting values in a viewing Series with an indexer
  678. _, _, Series = backend
  679. s = Series([1, 2, 3], index=["a", "b", "c"])
  680. s_orig = s.copy()
  681. subset = s[:]
  682. indexer_si(subset)[indexer] = 0
  683. expected = Series([0, 0, 3], index=["a", "b", "c"])
  684. tm.assert_series_equal(subset, expected)
  685. if using_copy_on_write:
  686. tm.assert_series_equal(s, s_orig)
  687. else:
  688. tm.assert_series_equal(s, expected)
  689. # -----------------------------------------------------------------------------
  690. # del operator
  691. def test_del_frame(backend, using_copy_on_write):
  692. # Case: deleting a column with `del` on a viewing child dataframe should
  693. # not modify parent + update the references
  694. _, DataFrame, _ = backend
  695. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
  696. df_orig = df.copy()
  697. df2 = df[:]
  698. assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  699. del df2["b"]
  700. assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  701. tm.assert_frame_equal(df, df_orig)
  702. tm.assert_frame_equal(df2, df_orig[["a", "c"]])
  703. df2._mgr._verify_integrity()
  704. # TODO in theory modifying column "b" of the parent wouldn't need a CoW
  705. # but the weakref is still alive and so we still perform CoW
  706. df2.loc[0, "a"] = 100
  707. if using_copy_on_write:
  708. # modifying child after deleting a column still doesn't update parent
  709. tm.assert_frame_equal(df, df_orig)
  710. else:
  711. assert df.loc[0, "a"] == 100
  712. def test_del_series(backend):
  713. _, _, Series = backend
  714. s = Series([1, 2, 3], index=["a", "b", "c"])
  715. s_orig = s.copy()
  716. s2 = s[:]
  717. assert np.shares_memory(get_array(s), get_array(s2))
  718. del s2["a"]
  719. assert not np.shares_memory(get_array(s), get_array(s2))
  720. tm.assert_series_equal(s, s_orig)
  721. tm.assert_series_equal(s2, s_orig[["b", "c"]])
  722. # modifying s2 doesn't need copy on write (due to `del`, s2 is backed by new array)
  723. values = s2.values
  724. s2.loc["b"] = 100
  725. assert values[0] == 100
# -----------------------------------------------------------------------------
# Accessing column as Series
  728. def test_column_as_series(backend, using_copy_on_write, using_array_manager):
  729. # Case: selecting a single column now also uses Copy-on-Write
  730. dtype_backend, DataFrame, Series = backend
  731. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
  732. df_orig = df.copy()
  733. s = df["a"]
  734. assert np.shares_memory(get_array(s, "a"), get_array(df, "a"))
  735. if using_copy_on_write or using_array_manager:
  736. s[0] = 0
  737. else:
  738. warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
  739. with pd.option_context("chained_assignment", "warn"):
  740. with tm.assert_produces_warning(warn):
  741. s[0] = 0
  742. expected = Series([0, 2, 3], name="a")
  743. tm.assert_series_equal(s, expected)
  744. if using_copy_on_write:
  745. # assert not np.shares_memory(s.values, get_array(df, "a"))
  746. tm.assert_frame_equal(df, df_orig)
  747. # ensure cached series on getitem is not the changed series
  748. tm.assert_series_equal(df["a"], df_orig["a"])
  749. else:
  750. df_orig.iloc[0, 0] = 0
  751. tm.assert_frame_equal(df, df_orig)
  752. def test_column_as_series_set_with_upcast(
  753. backend, using_copy_on_write, using_array_manager
  754. ):
  755. # Case: selecting a single column now also uses Copy-on-Write -> when
  756. # setting a value causes an upcast, we don't need to update the parent
  757. # DataFrame through the cache mechanism
  758. dtype_backend, DataFrame, Series = backend
  759. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
  760. df_orig = df.copy()
  761. s = df["a"]
  762. if dtype_backend == "nullable":
  763. with pytest.raises(TypeError, match="Invalid value"):
  764. s[0] = "foo"
  765. expected = Series([1, 2, 3], name="a")
  766. elif using_copy_on_write or using_array_manager:
  767. s[0] = "foo"
  768. expected = Series(["foo", 2, 3], dtype=object, name="a")
  769. else:
  770. with pd.option_context("chained_assignment", "warn"):
  771. with tm.assert_produces_warning(SettingWithCopyWarning):
  772. s[0] = "foo"
  773. expected = Series(["foo", 2, 3], dtype=object, name="a")
  774. tm.assert_series_equal(s, expected)
  775. if using_copy_on_write:
  776. tm.assert_frame_equal(df, df_orig)
  777. # ensure cached series on getitem is not the changed series
  778. tm.assert_series_equal(df["a"], df_orig["a"])
  779. else:
  780. df_orig["a"] = expected
  781. tm.assert_frame_equal(df, df_orig)
  782. @pytest.mark.parametrize(
  783. "method",
  784. [
  785. lambda df: df["a"],
  786. lambda df: df.loc[:, "a"],
  787. lambda df: df.iloc[:, 0],
  788. ],
  789. ids=["getitem", "loc", "iloc"],
  790. )
  791. def test_column_as_series_no_item_cache(
  792. request, backend, method, using_copy_on_write, using_array_manager
  793. ):
  794. # Case: selecting a single column (which now also uses Copy-on-Write to protect
  795. # the view) should always give a new object (i.e. not make use of a cache)
  796. dtype_backend, DataFrame, _ = backend
  797. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
  798. df_orig = df.copy()
  799. s1 = method(df)
  800. s2 = method(df)
  801. is_iloc = "iloc" in request.node.name
  802. if using_copy_on_write or is_iloc:
  803. assert s1 is not s2
  804. else:
  805. assert s1 is s2
  806. if using_copy_on_write or using_array_manager:
  807. s1.iloc[0] = 0
  808. else:
  809. warn = SettingWithCopyWarning if dtype_backend == "numpy" else None
  810. with pd.option_context("chained_assignment", "warn"):
  811. with tm.assert_produces_warning(warn):
  812. s1.iloc[0] = 0
  813. if using_copy_on_write:
  814. tm.assert_series_equal(s2, df_orig["a"])
  815. tm.assert_frame_equal(df, df_orig)
  816. else:
  817. assert s2.iloc[0] == 0
# TODO add tests for other indexing methods on the Series
  819. def test_dataframe_add_column_from_series(backend):
  820. # Case: adding a new column to a DataFrame from an existing column/series
  821. # -> always already takes a copy on assignment
  822. # (no change in behaviour here)
  823. # TODO can we achieve the same behaviour with Copy-on-Write?
  824. _, DataFrame, Series = backend
  825. df = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3]})
  826. s = Series([10, 11, 12])
  827. df["new"] = s
  828. assert not np.shares_memory(get_array(df, "new"), s.values)
  829. # editing series -> doesn't modify column in frame
  830. s[0] = 0
  831. expected = DataFrame({"a": [1, 2, 3], "b": [0.1, 0.2, 0.3], "new": [10, 11, 12]})
  832. tm.assert_frame_equal(df, expected)
  833. # editing column in frame -> doesn't modify series
  834. df.loc[2, "new"] = 100
  835. expected_s = Series([0, 11, 12])
  836. tm.assert_series_equal(s, expected_s)
  837. @pytest.mark.parametrize("val", [100, "a"])
  838. @pytest.mark.parametrize(
  839. "indexer_func, indexer",
  840. [
  841. (tm.loc, (0, "a")),
  842. (tm.iloc, (0, 0)),
  843. (tm.loc, ([0], "a")),
  844. (tm.iloc, ([0], 0)),
  845. (tm.loc, (slice(None), "a")),
  846. (tm.iloc, (slice(None), 0)),
  847. ],
  848. )
  849. def test_set_value_copy_only_necessary_column(
  850. using_copy_on_write, indexer_func, indexer, val
  851. ):
  852. # When setting inplace, only copy column that is modified instead of the whole
  853. # block (by splitting the block)
  854. # TODO multi-block only for now
  855. df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
  856. df_orig = df.copy()
  857. view = df[:]
  858. indexer_func(df)[indexer] = val
  859. if using_copy_on_write:
  860. assert np.shares_memory(get_array(df, "b"), get_array(view, "b"))
  861. assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
  862. tm.assert_frame_equal(view, df_orig)
  863. else:
  864. assert np.shares_memory(get_array(df, "c"), get_array(view, "c"))
  865. if val == "a":
  866. assert not np.shares_memory(get_array(df, "a"), get_array(view, "a"))
  867. else:
  868. assert np.shares_memory(get_array(df, "a"), get_array(view, "a"))
  869. def test_series_midx_slice(using_copy_on_write):
  870. ser = Series([1, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]]))
  871. result = ser[1]
  872. assert np.shares_memory(get_array(ser), get_array(result))
  873. result.iloc[0] = 100
  874. if using_copy_on_write:
  875. expected = Series(
  876. [1, 2, 3], index=pd.MultiIndex.from_arrays([[1, 1, 2], [3, 4, 5]])
  877. )
  878. tm.assert_series_equal(ser, expected)
  879. def test_getitem_midx_slice(using_copy_on_write, using_array_manager):
  880. df = DataFrame({("a", "x"): [1, 2], ("a", "y"): 1, ("b", "x"): 2})
  881. df_orig = df.copy()
  882. new_df = df[("a",)]
  883. if using_copy_on_write:
  884. assert not new_df._mgr._has_no_reference(0)
  885. if not using_array_manager:
  886. assert np.shares_memory(get_array(df, ("a", "x")), get_array(new_df, "x"))
  887. if using_copy_on_write:
  888. new_df.iloc[0, 0] = 100
  889. tm.assert_frame_equal(df_orig, df)
  890. def test_series_midx_tuples_slice(using_copy_on_write):
  891. ser = Series(
  892. [1, 2, 3],
  893. index=pd.MultiIndex.from_tuples([((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]),
  894. )
  895. result = ser[(1, 2)]
  896. assert np.shares_memory(get_array(ser), get_array(result))
  897. result.iloc[0] = 100
  898. if using_copy_on_write:
  899. expected = Series(
  900. [1, 2, 3],
  901. index=pd.MultiIndex.from_tuples([((1, 2), 3), ((1, 2), 4), ((2, 3), 4)]),
  902. )
  903. tm.assert_series_equal(ser, expected)