test_append.py 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. import datetime as dt
  2. from itertools import combinations
  3. import dateutil
  4. import numpy as np
  5. import pytest
  6. import pandas as pd
  7. from pandas import (
  8. DataFrame,
  9. Index,
  10. Series,
  11. Timestamp,
  12. concat,
  13. isna,
  14. )
  15. import pandas._testing as tm
  16. class TestAppend:
  17. def test_append(self, sort, float_frame):
  18. mixed_frame = float_frame.copy()
  19. mixed_frame["foo"] = "bar"
  20. begin_index = float_frame.index[:5]
  21. end_index = float_frame.index[5:]
  22. begin_frame = float_frame.reindex(begin_index)
  23. end_frame = float_frame.reindex(end_index)
  24. appended = begin_frame._append(end_frame)
  25. tm.assert_almost_equal(appended["A"], float_frame["A"])
  26. del end_frame["A"]
  27. partial_appended = begin_frame._append(end_frame, sort=sort)
  28. assert "A" in partial_appended
  29. partial_appended = end_frame._append(begin_frame, sort=sort)
  30. assert "A" in partial_appended
  31. # mixed type handling
  32. appended = mixed_frame[:5]._append(mixed_frame[5:])
  33. tm.assert_frame_equal(appended, mixed_frame)
  34. # what to test here
  35. mixed_appended = mixed_frame[:5]._append(float_frame[5:], sort=sort)
  36. mixed_appended2 = float_frame[:5]._append(mixed_frame[5:], sort=sort)
  37. # all equal except 'foo' column
  38. tm.assert_frame_equal(
  39. mixed_appended.reindex(columns=["A", "B", "C", "D"]),
  40. mixed_appended2.reindex(columns=["A", "B", "C", "D"]),
  41. )
  42. def test_append_empty(self, float_frame):
  43. empty = DataFrame()
  44. appended = float_frame._append(empty)
  45. tm.assert_frame_equal(float_frame, appended)
  46. assert appended is not float_frame
  47. appended = empty._append(float_frame)
  48. tm.assert_frame_equal(float_frame, appended)
  49. assert appended is not float_frame
  50. def test_append_overlap_raises(self, float_frame):
  51. msg = "Indexes have overlapping values"
  52. with pytest.raises(ValueError, match=msg):
  53. float_frame._append(float_frame, verify_integrity=True)
  54. def test_append_new_columns(self):
  55. # see gh-6129: new columns
  56. df = DataFrame({"a": {"x": 1, "y": 2}, "b": {"x": 3, "y": 4}})
  57. row = Series([5, 6, 7], index=["a", "b", "c"], name="z")
  58. expected = DataFrame(
  59. {
  60. "a": {"x": 1, "y": 2, "z": 5},
  61. "b": {"x": 3, "y": 4, "z": 6},
  62. "c": {"z": 7},
  63. }
  64. )
  65. result = df._append(row)
  66. tm.assert_frame_equal(result, expected)
  67. def test_append_length0_frame(self, sort):
  68. df = DataFrame(columns=["A", "B", "C"])
  69. df3 = DataFrame(index=[0, 1], columns=["A", "B"])
  70. df5 = df._append(df3, sort=sort)
  71. expected = DataFrame(index=[0, 1], columns=["A", "B", "C"])
  72. tm.assert_frame_equal(df5, expected)
  73. def test_append_records(self):
  74. arr1 = np.zeros((2,), dtype=("i4,f4,a10"))
  75. arr1[:] = [(1, 2.0, "Hello"), (2, 3.0, "World")]
  76. arr2 = np.zeros((3,), dtype=("i4,f4,a10"))
  77. arr2[:] = [(3, 4.0, "foo"), (5, 6.0, "bar"), (7.0, 8.0, "baz")]
  78. df1 = DataFrame(arr1)
  79. df2 = DataFrame(arr2)
  80. result = df1._append(df2, ignore_index=True)
  81. expected = DataFrame(np.concatenate((arr1, arr2)))
  82. tm.assert_frame_equal(result, expected)
  83. # rewrite sort fixture, since we also want to test default of None
  84. def test_append_sorts(self, sort):
  85. df1 = DataFrame({"a": [1, 2], "b": [1, 2]}, columns=["b", "a"])
  86. df2 = DataFrame({"a": [1, 2], "c": [3, 4]}, index=[2, 3])
  87. result = df1._append(df2, sort=sort)
  88. # for None / True
  89. expected = DataFrame(
  90. {"b": [1, 2, None, None], "a": [1, 2, 1, 2], "c": [None, None, 3, 4]},
  91. columns=["a", "b", "c"],
  92. )
  93. if sort is False:
  94. expected = expected[["b", "a", "c"]]
  95. tm.assert_frame_equal(result, expected)
  96. def test_append_different_columns(self, sort):
  97. df = DataFrame(
  98. {
  99. "bools": np.random.randn(10) > 0,
  100. "ints": np.random.randint(0, 10, 10),
  101. "floats": np.random.randn(10),
  102. "strings": ["foo", "bar"] * 5,
  103. }
  104. )
  105. a = df[:5].loc[:, ["bools", "ints", "floats"]]
  106. b = df[5:].loc[:, ["strings", "ints", "floats"]]
  107. appended = a._append(b, sort=sort)
  108. assert isna(appended["strings"][0:4]).all()
  109. assert isna(appended["bools"][5:]).all()
  110. def test_append_many(self, sort, float_frame):
  111. chunks = [
  112. float_frame[:5],
  113. float_frame[5:10],
  114. float_frame[10:15],
  115. float_frame[15:],
  116. ]
  117. result = chunks[0]._append(chunks[1:])
  118. tm.assert_frame_equal(result, float_frame)
  119. chunks[-1] = chunks[-1].copy()
  120. chunks[-1]["foo"] = "bar"
  121. result = chunks[0]._append(chunks[1:], sort=sort)
  122. tm.assert_frame_equal(result.loc[:, float_frame.columns], float_frame)
  123. assert (result["foo"][15:] == "bar").all()
  124. assert result["foo"][:15].isna().all()
  125. def test_append_preserve_index_name(self):
  126. # #980
  127. df1 = DataFrame(columns=["A", "B", "C"])
  128. df1 = df1.set_index(["A"])
  129. df2 = DataFrame(data=[[1, 4, 7], [2, 5, 8], [3, 6, 9]], columns=["A", "B", "C"])
  130. df2 = df2.set_index(["A"])
  131. result = df1._append(df2)
  132. assert result.index.name == "A"
  133. indexes_can_append = [
  134. pd.RangeIndex(3),
  135. Index([4, 5, 6]),
  136. Index([4.5, 5.5, 6.5]),
  137. Index(list("abc")),
  138. pd.CategoricalIndex("A B C".split()),
  139. pd.CategoricalIndex("D E F".split(), ordered=True),
  140. pd.IntervalIndex.from_breaks([7, 8, 9, 10]),
  141. pd.DatetimeIndex(
  142. [
  143. dt.datetime(2013, 1, 3, 0, 0),
  144. dt.datetime(2013, 1, 3, 6, 10),
  145. dt.datetime(2013, 1, 3, 7, 12),
  146. ]
  147. ),
  148. pd.MultiIndex.from_arrays(["A B C".split(), "D E F".split()]),
  149. ]
  150. @pytest.mark.parametrize(
  151. "index", indexes_can_append, ids=lambda x: type(x).__name__
  152. )
  153. def test_append_same_columns_type(self, index):
  154. # GH18359
  155. # df wider than ser
  156. df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=index)
  157. ser_index = index[:2]
  158. ser = Series([7, 8], index=ser_index, name=2)
  159. result = df._append(ser)
  160. expected = DataFrame(
  161. [[1, 2, 3.0], [4, 5, 6], [7, 8, np.nan]], index=[0, 1, 2], columns=index
  162. )
  163. # integer dtype is preserved for columns present in ser.index
  164. assert expected.dtypes.iloc[0].kind == "i"
  165. assert expected.dtypes.iloc[1].kind == "i"
  166. tm.assert_frame_equal(result, expected)
  167. # ser wider than df
  168. ser_index = index
  169. index = index[:2]
  170. df = DataFrame([[1, 2], [4, 5]], columns=index)
  171. ser = Series([7, 8, 9], index=ser_index, name=2)
  172. result = df._append(ser)
  173. expected = DataFrame(
  174. [[1, 2, np.nan], [4, 5, np.nan], [7, 8, 9]],
  175. index=[0, 1, 2],
  176. columns=ser_index,
  177. )
  178. tm.assert_frame_equal(result, expected)
  179. @pytest.mark.parametrize(
  180. "df_columns, series_index",
  181. combinations(indexes_can_append, r=2),
  182. ids=lambda x: type(x).__name__,
  183. )
  184. def test_append_different_columns_types(self, df_columns, series_index):
  185. # GH18359
  186. # See also test 'test_append_different_columns_types_raises' below
  187. # for errors raised when appending
  188. df = DataFrame([[1, 2, 3], [4, 5, 6]], columns=df_columns)
  189. ser = Series([7, 8, 9], index=series_index, name=2)
  190. result = df._append(ser)
  191. idx_diff = ser.index.difference(df_columns)
  192. combined_columns = Index(df_columns.tolist()).append(idx_diff)
  193. expected = DataFrame(
  194. [
  195. [1.0, 2.0, 3.0, np.nan, np.nan, np.nan],
  196. [4, 5, 6, np.nan, np.nan, np.nan],
  197. [np.nan, np.nan, np.nan, 7, 8, 9],
  198. ],
  199. index=[0, 1, 2],
  200. columns=combined_columns,
  201. )
  202. tm.assert_frame_equal(result, expected)
  203. def test_append_dtype_coerce(self, sort):
  204. # GH 4993
  205. # appending with datetime will incorrectly convert datetime64
  206. df1 = DataFrame(
  207. index=[1, 2],
  208. data=[dt.datetime(2013, 1, 1, 0, 0), dt.datetime(2013, 1, 2, 0, 0)],
  209. columns=["start_time"],
  210. )
  211. df2 = DataFrame(
  212. index=[4, 5],
  213. data=[
  214. [dt.datetime(2013, 1, 3, 0, 0), dt.datetime(2013, 1, 3, 6, 10)],
  215. [dt.datetime(2013, 1, 4, 0, 0), dt.datetime(2013, 1, 4, 7, 10)],
  216. ],
  217. columns=["start_time", "end_time"],
  218. )
  219. expected = concat(
  220. [
  221. Series(
  222. [
  223. pd.NaT,
  224. pd.NaT,
  225. dt.datetime(2013, 1, 3, 6, 10),
  226. dt.datetime(2013, 1, 4, 7, 10),
  227. ],
  228. name="end_time",
  229. ),
  230. Series(
  231. [
  232. dt.datetime(2013, 1, 1, 0, 0),
  233. dt.datetime(2013, 1, 2, 0, 0),
  234. dt.datetime(2013, 1, 3, 0, 0),
  235. dt.datetime(2013, 1, 4, 0, 0),
  236. ],
  237. name="start_time",
  238. ),
  239. ],
  240. axis=1,
  241. sort=sort,
  242. )
  243. result = df1._append(df2, ignore_index=True, sort=sort)
  244. if sort:
  245. expected = expected[["end_time", "start_time"]]
  246. else:
  247. expected = expected[["start_time", "end_time"]]
  248. tm.assert_frame_equal(result, expected)
  249. def test_append_missing_column_proper_upcast(self, sort):
  250. df1 = DataFrame({"A": np.array([1, 2, 3, 4], dtype="i8")})
  251. df2 = DataFrame({"B": np.array([True, False, True, False], dtype=bool)})
  252. appended = df1._append(df2, ignore_index=True, sort=sort)
  253. assert appended["A"].dtype == "f8"
  254. assert appended["B"].dtype == "O"
  255. def test_append_empty_frame_to_series_with_dateutil_tz(self):
  256. # GH 23682
  257. date = Timestamp("2018-10-24 07:30:00", tz=dateutil.tz.tzutc())
  258. ser = Series({"a": 1.0, "b": 2.0, "date": date})
  259. df = DataFrame(columns=["c", "d"])
  260. result_a = df._append(ser, ignore_index=True)
  261. expected = DataFrame(
  262. [[np.nan, np.nan, 1.0, 2.0, date]], columns=["c", "d", "a", "b", "date"]
  263. )
  264. # These columns get cast to object after append
  265. expected["c"] = expected["c"].astype(object)
  266. expected["d"] = expected["d"].astype(object)
  267. tm.assert_frame_equal(result_a, expected)
  268. expected = DataFrame(
  269. [[np.nan, np.nan, 1.0, 2.0, date]] * 2, columns=["c", "d", "a", "b", "date"]
  270. )
  271. expected["c"] = expected["c"].astype(object)
  272. expected["d"] = expected["d"].astype(object)
  273. result_b = result_a._append(ser, ignore_index=True)
  274. tm.assert_frame_equal(result_b, expected)
  275. result = df._append([ser, ser], ignore_index=True)
  276. tm.assert_frame_equal(result, expected)
  277. def test_append_empty_tz_frame_with_datetime64ns(self):
  278. # https://github.com/pandas-dev/pandas/issues/35460
  279. df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
  280. # pd.NaT gets inferred as tz-naive, so append result is tz-naive
  281. result = df._append({"a": pd.NaT}, ignore_index=True)
  282. expected = DataFrame({"a": [pd.NaT]}).astype(object)
  283. tm.assert_frame_equal(result, expected)
  284. # also test with typed value to append
  285. df = DataFrame(columns=["a"]).astype("datetime64[ns, UTC]")
  286. other = Series({"a": pd.NaT}, dtype="datetime64[ns]")
  287. result = df._append(other, ignore_index=True)
  288. expected = DataFrame({"a": [pd.NaT]}).astype(object)
  289. tm.assert_frame_equal(result, expected)
  290. # mismatched tz
  291. other = Series({"a": pd.NaT}, dtype="datetime64[ns, US/Pacific]")
  292. result = df._append(other, ignore_index=True)
  293. expected = DataFrame({"a": [pd.NaT]}).astype(object)
  294. tm.assert_frame_equal(result, expected)
  295. @pytest.mark.parametrize(
  296. "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
  297. )
  298. @pytest.mark.parametrize("val", [1, "NaT"])
  299. def test_append_empty_frame_with_timedelta64ns_nat(self, dtype_str, val):
  300. # https://github.com/pandas-dev/pandas/issues/35460
  301. df = DataFrame(columns=["a"]).astype(dtype_str)
  302. other = DataFrame({"a": [np.timedelta64(val, "ns")]})
  303. result = df._append(other, ignore_index=True)
  304. expected = other.astype(object)
  305. tm.assert_frame_equal(result, expected)
  306. @pytest.mark.parametrize(
  307. "dtype_str", ["datetime64[ns, UTC]", "datetime64[ns]", "Int64", "int64"]
  308. )
  309. @pytest.mark.parametrize("val", [1, "NaT"])
  310. def test_append_frame_with_timedelta64ns_nat(self, dtype_str, val):
  311. # https://github.com/pandas-dev/pandas/issues/35460
  312. df = DataFrame({"a": pd.array([1], dtype=dtype_str)})
  313. other = DataFrame({"a": [np.timedelta64(val, "ns")]})
  314. result = df._append(other, ignore_index=True)
  315. expected = DataFrame({"a": [df.iloc[0, 0], other.iloc[0, 0]]}, dtype=object)
  316. tm.assert_frame_equal(result, expected)