test_constructors.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. from pandas import (
  5. DataFrame,
  6. DatetimeIndex,
  7. Index,
  8. Period,
  9. PeriodIndex,
  10. Series,
  11. Timedelta,
  12. TimedeltaIndex,
  13. Timestamp,
  14. )
  15. import pandas._testing as tm
  16. from pandas.tests.copy_view.util import get_array
  17. # -----------------------------------------------------------------------------
  18. # Copy/view behaviour for Series / DataFrame constructors
  19. @pytest.mark.parametrize("dtype", [None, "int64"])
  20. def test_series_from_series(dtype, using_copy_on_write):
  21. # Case: constructing a Series from another Series object follows CoW rules:
  22. # a new object is returned and thus mutations are not propagated
  23. ser = Series([1, 2, 3], name="name")
  24. # default is copy=False -> new Series is a shallow copy / view of original
  25. result = Series(ser, dtype=dtype)
  26. # the shallow copy still shares memory
  27. assert np.shares_memory(get_array(ser), get_array(result))
  28. if using_copy_on_write:
  29. assert result._mgr.blocks[0].refs.has_reference()
  30. if using_copy_on_write:
  31. # mutating new series copy doesn't mutate original
  32. result.iloc[0] = 0
  33. assert ser.iloc[0] == 1
  34. # mutating triggered a copy-on-write -> no longer shares memory
  35. assert not np.shares_memory(get_array(ser), get_array(result))
  36. else:
  37. # mutating shallow copy does mutate original
  38. result.iloc[0] = 0
  39. assert ser.iloc[0] == 0
  40. # and still shares memory
  41. assert np.shares_memory(get_array(ser), get_array(result))
  42. # the same when modifying the parent
  43. result = Series(ser, dtype=dtype)
  44. if using_copy_on_write:
  45. # mutating original doesn't mutate new series
  46. ser.iloc[0] = 0
  47. assert result.iloc[0] == 1
  48. else:
  49. # mutating original does mutate shallow copy
  50. ser.iloc[0] = 0
  51. assert result.iloc[0] == 0
  52. def test_series_from_series_with_reindex(using_copy_on_write):
  53. # Case: constructing a Series from another Series with specifying an index
  54. # that potentially requires a reindex of the values
  55. ser = Series([1, 2, 3], name="name")
  56. # passing an index that doesn't actually require a reindex of the values
  57. # -> without CoW we get an actual mutating view
  58. for index in [
  59. ser.index,
  60. ser.index.copy(),
  61. list(ser.index),
  62. ser.index.rename("idx"),
  63. ]:
  64. result = Series(ser, index=index)
  65. assert np.shares_memory(ser.values, result.values)
  66. result.iloc[0] = 0
  67. if using_copy_on_write:
  68. assert ser.iloc[0] == 1
  69. else:
  70. assert ser.iloc[0] == 0
  71. # ensure that if an actual reindex is needed, we don't have any refs
  72. # (mutating the result wouldn't trigger CoW)
  73. result = Series(ser, index=[0, 1, 2, 3])
  74. assert not np.shares_memory(ser.values, result.values)
  75. if using_copy_on_write:
  76. assert not result._mgr.blocks[0].refs.has_reference()
  77. @pytest.mark.parametrize("fastpath", [False, True])
  78. @pytest.mark.parametrize("dtype", [None, "int64"])
  79. @pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
  80. @pytest.mark.parametrize(
  81. "arr", [np.array([1, 2, 3], dtype="int64"), pd.array([1, 2, 3], dtype="Int64")]
  82. )
  83. def test_series_from_array(using_copy_on_write, idx, dtype, fastpath, arr):
  84. if idx is None or dtype is not None:
  85. fastpath = False
  86. ser = Series(arr, dtype=dtype, index=idx, fastpath=fastpath)
  87. ser_orig = ser.copy()
  88. data = getattr(arr, "_data", arr)
  89. if using_copy_on_write:
  90. assert not np.shares_memory(get_array(ser), data)
  91. else:
  92. assert np.shares_memory(get_array(ser), data)
  93. arr[0] = 100
  94. if using_copy_on_write:
  95. tm.assert_series_equal(ser, ser_orig)
  96. else:
  97. expected = Series([100, 2, 3], dtype=dtype if dtype is not None else arr.dtype)
  98. tm.assert_series_equal(ser, expected)
  99. @pytest.mark.parametrize("copy", [True, False, None])
  100. def test_series_from_array_different_dtype(using_copy_on_write, copy):
  101. arr = np.array([1, 2, 3], dtype="int64")
  102. ser = Series(arr, dtype="int32", copy=copy)
  103. assert not np.shares_memory(get_array(ser), arr)
  104. @pytest.mark.parametrize(
  105. "idx",
  106. [
  107. Index([1, 2]),
  108. DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]),
  109. PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]),
  110. TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]),
  111. ],
  112. )
  113. def test_series_from_index(using_copy_on_write, idx):
  114. ser = Series(idx)
  115. expected = idx.copy(deep=True)
  116. if using_copy_on_write:
  117. assert np.shares_memory(get_array(ser), get_array(idx))
  118. assert not ser._mgr._has_no_reference(0)
  119. else:
  120. assert not np.shares_memory(get_array(ser), get_array(idx))
  121. ser.iloc[0] = ser.iloc[1]
  122. tm.assert_index_equal(idx, expected)
  123. def test_series_from_index_different_dtypes(using_copy_on_write):
  124. idx = Index([1, 2, 3], dtype="int64")
  125. ser = Series(idx, dtype="int32")
  126. assert not np.shares_memory(get_array(ser), get_array(idx))
  127. if using_copy_on_write:
  128. assert ser._mgr._has_no_reference(0)
  129. @pytest.mark.parametrize("fastpath", [False, True])
  130. @pytest.mark.parametrize("dtype", [None, "int64"])
  131. @pytest.mark.parametrize("idx", [None, pd.RangeIndex(start=0, stop=3, step=1)])
  132. def test_series_from_block_manager(using_copy_on_write, idx, dtype, fastpath):
  133. ser = Series([1, 2, 3], dtype="int64")
  134. ser_orig = ser.copy()
  135. ser2 = Series(ser._mgr, dtype=dtype, fastpath=fastpath, index=idx)
  136. assert np.shares_memory(get_array(ser), get_array(ser2))
  137. if using_copy_on_write:
  138. assert not ser2._mgr._has_no_reference(0)
  139. ser2.iloc[0] = 100
  140. if using_copy_on_write:
  141. tm.assert_series_equal(ser, ser_orig)
  142. else:
  143. expected = Series([100, 2, 3])
  144. tm.assert_series_equal(ser, expected)
  145. def test_series_from_block_manager_different_dtype(using_copy_on_write):
  146. ser = Series([1, 2, 3], dtype="int64")
  147. ser2 = Series(ser._mgr, dtype="int32")
  148. assert not np.shares_memory(get_array(ser), get_array(ser2))
  149. if using_copy_on_write:
  150. assert ser2._mgr._has_no_reference(0)
  151. @pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr])
  152. @pytest.mark.parametrize("columns", [None, ["a"]])
  153. def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func):
  154. df = DataFrame({"a": [1, 2, 3]})
  155. df_orig = df.copy()
  156. new_df = DataFrame(func(df))
  157. assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
  158. new_df.iloc[0] = 100
  159. if using_copy_on_write:
  160. assert not np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
  161. tm.assert_frame_equal(df, df_orig)
  162. else:
  163. assert np.shares_memory(get_array(df, "a"), get_array(new_df, "a"))
  164. tm.assert_frame_equal(df, new_df)
  165. @pytest.mark.parametrize("dtype", [None, "int64", "Int64"])
  166. @pytest.mark.parametrize("index", [None, [0, 1, 2]])
  167. @pytest.mark.parametrize("columns", [None, ["a", "b"], ["a", "b", "c"]])
  168. def test_dataframe_from_dict_of_series(
  169. request, using_copy_on_write, columns, index, dtype
  170. ):
  171. # Case: constructing a DataFrame from Series objects with copy=False
  172. # has to do a lazy following CoW rules
  173. # (the default for DataFrame(dict) is still to copy to ensure consolidation)
  174. s1 = Series([1, 2, 3])
  175. s2 = Series([4, 5, 6])
  176. s1_orig = s1.copy()
  177. expected = DataFrame(
  178. {"a": [1, 2, 3], "b": [4, 5, 6]}, index=index, columns=columns, dtype=dtype
  179. )
  180. result = DataFrame(
  181. {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False
  182. )
  183. # the shallow copy still shares memory
  184. assert np.shares_memory(get_array(result, "a"), get_array(s1))
  185. # mutating the new dataframe doesn't mutate original
  186. result.iloc[0, 0] = 10
  187. if using_copy_on_write:
  188. assert not np.shares_memory(get_array(result, "a"), get_array(s1))
  189. tm.assert_series_equal(s1, s1_orig)
  190. else:
  191. assert s1.iloc[0] == 10
  192. # the same when modifying the parent series
  193. s1 = Series([1, 2, 3])
  194. s2 = Series([4, 5, 6])
  195. result = DataFrame(
  196. {"a": s1, "b": s2}, index=index, columns=columns, dtype=dtype, copy=False
  197. )
  198. s1.iloc[0] = 10
  199. if using_copy_on_write:
  200. assert not np.shares_memory(get_array(result, "a"), get_array(s1))
  201. tm.assert_frame_equal(result, expected)
  202. else:
  203. assert result.iloc[0, 0] == 10
  204. @pytest.mark.parametrize("dtype", [None, "int64"])
  205. def test_dataframe_from_dict_of_series_with_reindex(dtype):
  206. # Case: constructing a DataFrame from Series objects with copy=False
  207. # and passing an index that requires an actual (no-view) reindex -> need
  208. # to ensure the result doesn't have refs set up to unnecessarily trigger
  209. # a copy on write
  210. s1 = Series([1, 2, 3])
  211. s2 = Series([4, 5, 6])
  212. df = DataFrame({"a": s1, "b": s2}, index=[1, 2, 3], dtype=dtype, copy=False)
  213. # df should own its memory, so mutating shouldn't trigger a copy
  214. arr_before = get_array(df, "a")
  215. assert not np.shares_memory(arr_before, get_array(s1))
  216. df.iloc[0, 0] = 100
  217. arr_after = get_array(df, "a")
  218. assert np.shares_memory(arr_before, arr_after)
  219. @pytest.mark.parametrize("cons", [Series, Index])
  220. @pytest.mark.parametrize(
  221. "data, dtype", [([1, 2], None), ([1, 2], "int64"), (["a", "b"], None)]
  222. )
  223. def test_dataframe_from_series_or_index(using_copy_on_write, data, dtype, cons):
  224. obj = cons(data, dtype=dtype)
  225. obj_orig = obj.copy()
  226. df = DataFrame(obj, dtype=dtype)
  227. assert np.shares_memory(get_array(obj), get_array(df, 0))
  228. if using_copy_on_write:
  229. assert not df._mgr._has_no_reference(0)
  230. df.iloc[0, 0] = data[-1]
  231. if using_copy_on_write:
  232. tm.assert_equal(obj, obj_orig)
  233. @pytest.mark.parametrize("cons", [Series, Index])
  234. def test_dataframe_from_series_or_index_different_dtype(using_copy_on_write, cons):
  235. obj = cons([1, 2], dtype="int64")
  236. df = DataFrame(obj, dtype="int32")
  237. assert not np.shares_memory(get_array(obj), get_array(df, 0))
  238. if using_copy_on_write:
  239. assert df._mgr._has_no_reference(0)
  240. def test_dataframe_from_series_infer_datetime(using_copy_on_write):
  241. ser = Series([Timestamp("2019-12-31"), Timestamp("2020-12-31")], dtype=object)
  242. df = DataFrame(ser)
  243. assert not np.shares_memory(get_array(ser), get_array(df, 0))
  244. if using_copy_on_write:
  245. assert df._mgr._has_no_reference(0)
  246. @pytest.mark.parametrize("index", [None, [0, 1, 2]])
  247. def test_dataframe_from_dict_of_series_with_dtype(index):
  248. # Variant of above, but now passing a dtype that causes a copy
  249. # -> need to ensure the result doesn't have refs set up to unnecessarily
  250. # trigger a copy on write
  251. s1 = Series([1.0, 2.0, 3.0])
  252. s2 = Series([4, 5, 6])
  253. df = DataFrame({"a": s1, "b": s2}, index=index, dtype="int64", copy=False)
  254. # df should own its memory, so mutating shouldn't trigger a copy
  255. arr_before = get_array(df, "a")
  256. assert not np.shares_memory(arr_before, get_array(s1))
  257. df.iloc[0, 0] = 100
  258. arr_after = get_array(df, "a")
  259. assert np.shares_memory(arr_before, arr_after)
  260. @pytest.mark.parametrize("copy", [False, None, True])
  261. def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager):
  262. arr = np.array([[1, 2], [3, 4]])
  263. df = DataFrame(arr, copy=copy)
  264. if (
  265. using_copy_on_write
  266. and copy is not False
  267. or copy is True
  268. or (using_array_manager and copy is None)
  269. ):
  270. assert not np.shares_memory(get_array(df, 0), arr)
  271. else:
  272. assert np.shares_memory(get_array(df, 0), arr)
  273. def test_dataframe_from_records_with_dataframe(using_copy_on_write):
  274. df = DataFrame({"a": [1, 2, 3]})
  275. df_orig = df.copy()
  276. df2 = DataFrame.from_records(df)
  277. if using_copy_on_write:
  278. assert not df._mgr._has_no_reference(0)
  279. assert np.shares_memory(get_array(df, "a"), get_array(df2, "a"))
  280. df2.iloc[0, 0] = 100
  281. if using_copy_on_write:
  282. tm.assert_frame_equal(df, df_orig)
  283. else:
  284. tm.assert_frame_equal(df, df2)