test_interpolate.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422
  1. import numpy as np
  2. import pytest
  3. import pandas.util._test_decorators as td
  4. from pandas import (
  5. DataFrame,
  6. NaT,
  7. Series,
  8. date_range,
  9. )
  10. import pandas._testing as tm
  11. class TestDataFrameInterpolate:
  12. def test_interpolate_datetimelike_values(self, frame_or_series):
  13. # GH#11312, GH#51005
  14. orig = Series(date_range("2012-01-01", periods=5))
  15. ser = orig.copy()
  16. ser[2] = NaT
  17. res = frame_or_series(ser).interpolate()
  18. expected = frame_or_series(orig)
  19. tm.assert_equal(res, expected)
  20. # datetime64tz cast
  21. ser_tz = ser.dt.tz_localize("US/Pacific")
  22. res_tz = frame_or_series(ser_tz).interpolate()
  23. expected_tz = frame_or_series(orig.dt.tz_localize("US/Pacific"))
  24. tm.assert_equal(res_tz, expected_tz)
  25. # timedelta64 cast
  26. ser_td = ser - ser[0]
  27. res_td = frame_or_series(ser_td).interpolate()
  28. expected_td = frame_or_series(orig - orig[0])
  29. tm.assert_equal(res_td, expected_td)
  30. def test_interpolate_inplace(self, frame_or_series, using_array_manager, request):
  31. # GH#44749
  32. if using_array_manager and frame_or_series is DataFrame:
  33. mark = pytest.mark.xfail(reason=".values-based in-place check is invalid")
  34. request.node.add_marker(mark)
  35. obj = frame_or_series([1, np.nan, 2])
  36. orig = obj.values
  37. obj.interpolate(inplace=True)
  38. expected = frame_or_series([1, 1.5, 2])
  39. tm.assert_equal(obj, expected)
  40. # check we operated *actually* inplace
  41. assert np.shares_memory(orig, obj.values)
  42. assert orig.squeeze()[1] == 1.5
  43. def test_interp_basic(self, using_copy_on_write):
  44. df = DataFrame(
  45. {
  46. "A": [1, 2, np.nan, 4],
  47. "B": [1, 4, 9, np.nan],
  48. "C": [1, 2, 3, 5],
  49. "D": list("abcd"),
  50. }
  51. )
  52. expected = DataFrame(
  53. {
  54. "A": [1.0, 2.0, 3.0, 4.0],
  55. "B": [1.0, 4.0, 9.0, 9.0],
  56. "C": [1, 2, 3, 5],
  57. "D": list("abcd"),
  58. }
  59. )
  60. result = df.interpolate()
  61. tm.assert_frame_equal(result, expected)
  62. # check we didn't operate inplace GH#45791
  63. cvalues = df["C"]._values
  64. dvalues = df["D"].values
  65. if using_copy_on_write:
  66. assert np.shares_memory(cvalues, result["C"]._values)
  67. assert np.shares_memory(dvalues, result["D"]._values)
  68. else:
  69. assert not np.shares_memory(cvalues, result["C"]._values)
  70. assert not np.shares_memory(dvalues, result["D"]._values)
  71. res = df.interpolate(inplace=True)
  72. assert res is None
  73. tm.assert_frame_equal(df, expected)
  74. # check we DID operate inplace
  75. assert np.shares_memory(df["C"]._values, cvalues)
  76. assert np.shares_memory(df["D"]._values, dvalues)
  77. def test_interp_basic_with_non_range_index(self):
  78. df = DataFrame(
  79. {
  80. "A": [1, 2, np.nan, 4],
  81. "B": [1, 4, 9, np.nan],
  82. "C": [1, 2, 3, 5],
  83. "D": list("abcd"),
  84. }
  85. )
  86. result = df.set_index("C").interpolate()
  87. expected = df.set_index("C")
  88. expected.loc[3, "A"] = 3
  89. expected.loc[5, "B"] = 9
  90. tm.assert_frame_equal(result, expected)
  91. def test_interp_empty(self):
  92. # https://github.com/pandas-dev/pandas/issues/35598
  93. df = DataFrame()
  94. result = df.interpolate()
  95. assert result is not df
  96. expected = df
  97. tm.assert_frame_equal(result, expected)
  def test_interp_bad_method(self):
      """Check that an unknown ``method`` raises ValueError.

      The error message is matched against a pattern that lists the accepted
      method names and echoes the rejected one.
      """
      frame = DataFrame(
          {
              "A": [1, 2, np.nan, 4],
              "B": [1, 4, 9, np.nan],
              "C": [1, 2, 3, 5],
              "D": list("abcd"),
          }
      )
      pattern = (
          r"method must be one of \['linear', 'time', 'index', 'values', "
          r"'nearest', 'zero', 'slinear', 'quadratic', 'cubic', "
          r"'barycentric', 'krogh', 'spline', 'polynomial', "
          r"'from_derivatives', 'piecewise_polynomial', 'pchip', 'akima', "
          r"'cubicspline'\]. Got 'not_a_method' instead."
      )

      with pytest.raises(ValueError, match=pattern):
          frame.interpolate(method="not_a_method")
  116. def test_interp_combo(self):
  117. df = DataFrame(
  118. {
  119. "A": [1.0, 2.0, np.nan, 4.0],
  120. "B": [1, 4, 9, np.nan],
  121. "C": [1, 2, 3, 5],
  122. "D": list("abcd"),
  123. }
  124. )
  125. result = df["A"].interpolate()
  126. expected = Series([1.0, 2.0, 3.0, 4.0], name="A")
  127. tm.assert_series_equal(result, expected)
  128. result = df["A"].interpolate(downcast="infer")
  129. expected = Series([1, 2, 3, 4], name="A")
  130. tm.assert_series_equal(result, expected)
  def test_interp_nan_idx(self):
      """``method="values"`` raises NotImplementedError when the index holds a NaN."""
      frame = DataFrame(
          {"A": [1, 2, np.nan, 4], "B": [np.nan, 2, 3, 4]}
      ).set_index("A")

      pattern = (
          "Interpolation with NaNs in the index has not been implemented. "
          "Try filling those NaNs before interpolating."
      )
      with pytest.raises(NotImplementedError, match=pattern):
          frame.interpolate(method="values")
  140. @td.skip_if_no_scipy
  141. def test_interp_various(self):
  142. df = DataFrame(
  143. {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
  144. )
  145. df = df.set_index("C")
  146. expected = df.copy()
  147. result = df.interpolate(method="polynomial", order=1)
  148. expected.loc[3, "A"] = 2.66666667
  149. expected.loc[13, "A"] = 5.76923076
  150. tm.assert_frame_equal(result, expected)
  151. result = df.interpolate(method="cubic")
  152. # GH #15662.
  153. expected.loc[3, "A"] = 2.81547781
  154. expected.loc[13, "A"] = 5.52964175
  155. tm.assert_frame_equal(result, expected)
  156. result = df.interpolate(method="nearest")
  157. expected.loc[3, "A"] = 2
  158. expected.loc[13, "A"] = 5
  159. tm.assert_frame_equal(result, expected, check_dtype=False)
  160. result = df.interpolate(method="quadratic")
  161. expected.loc[3, "A"] = 2.82150771
  162. expected.loc[13, "A"] = 6.12648668
  163. tm.assert_frame_equal(result, expected)
  164. result = df.interpolate(method="slinear")
  165. expected.loc[3, "A"] = 2.66666667
  166. expected.loc[13, "A"] = 5.76923077
  167. tm.assert_frame_equal(result, expected)
  168. result = df.interpolate(method="zero")
  169. expected.loc[3, "A"] = 2.0
  170. expected.loc[13, "A"] = 5
  171. tm.assert_frame_equal(result, expected, check_dtype=False)
  172. @td.skip_if_no_scipy
  173. def test_interp_alt_scipy(self):
  174. df = DataFrame(
  175. {"A": [1, 2, np.nan, 4, 5, np.nan, 7], "C": [1, 2, 3, 5, 8, 13, 21]}
  176. )
  177. result = df.interpolate(method="barycentric")
  178. expected = df.copy()
  179. expected.loc[2, "A"] = 3
  180. expected.loc[5, "A"] = 6
  181. tm.assert_frame_equal(result, expected)
  182. result = df.interpolate(method="barycentric", downcast="infer")
  183. tm.assert_frame_equal(result, expected.astype(np.int64))
  184. result = df.interpolate(method="krogh")
  185. expectedk = df.copy()
  186. expectedk["A"] = expected["A"]
  187. tm.assert_frame_equal(result, expectedk)
  188. result = df.interpolate(method="pchip")
  189. expected.loc[2, "A"] = 3
  190. expected.loc[5, "A"] = 6.0
  191. tm.assert_frame_equal(result, expected)
  192. def test_interp_rowwise(self):
  193. df = DataFrame(
  194. {
  195. 0: [1, 2, np.nan, 4],
  196. 1: [2, 3, 4, np.nan],
  197. 2: [np.nan, 4, 5, 6],
  198. 3: [4, np.nan, 6, 7],
  199. 4: [1, 2, 3, 4],
  200. }
  201. )
  202. result = df.interpolate(axis=1)
  203. expected = df.copy()
  204. expected.loc[3, 1] = 5
  205. expected.loc[0, 2] = 3
  206. expected.loc[1, 3] = 3
  207. expected[4] = expected[4].astype(np.float64)
  208. tm.assert_frame_equal(result, expected)
  209. result = df.interpolate(axis=1, method="values")
  210. tm.assert_frame_equal(result, expected)
  211. result = df.interpolate(axis=0)
  212. expected = df.interpolate()
  213. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize(
      "axis_name, axis_number",
      [
          pytest.param("rows", 0, id="rows_0"),
          pytest.param("index", 0, id="index_0"),
          pytest.param("columns", 1, id="columns_1"),
      ],
  )
  def test_interp_axis_names(self, axis_name, axis_number):
      """Linear interpolation gives the same frame for an axis name and its number."""
      # GH 29132: test axis names
      frame = DataFrame(
          {0: [0, np.nan, 6], 1: [1, np.nan, 7], 2: [2, 5, 8]}, dtype=np.float64
      )

      by_name = frame.interpolate(axis=axis_name, method="linear")
      by_number = frame.interpolate(axis=axis_number, method="linear")

      tm.assert_frame_equal(by_name, by_number)
  229. def test_rowwise_alt(self):
  230. df = DataFrame(
  231. {
  232. 0: [0, 0.5, 1.0, np.nan, 4, 8, np.nan, np.nan, 64],
  233. 1: [1, 2, 3, 4, 3, 2, 1, 0, -1],
  234. }
  235. )
  236. df.interpolate(axis=0)
  237. # TODO: assert something?
  @pytest.mark.parametrize(
      "check_scipy", [False, pytest.param(True, marks=td.skip_if_no_scipy)]
  )
  def test_interp_leading_nans(self, check_scipy):
      """Check interpolation on columns that start with NaN.

      Only the interior NaN in "B" (row 3) is expected to change; the leading
      NaNs stay in the expectation. With ``check_scipy`` the same expectation
      is also asserted for ``method="polynomial", order=1``.
      """
      frame = DataFrame(
          {"A": [np.nan, np.nan, 0.5, 0.25, 0], "B": [np.nan, -3, -3.5, np.nan, -4]}
      )

      default_result = frame.interpolate()
      want = frame.copy()
      want.loc[3, "B"] = -3.75
      tm.assert_frame_equal(default_result, want)

      if not check_scipy:
          return

      poly_result = frame.interpolate(method="polynomial", order=1)
      tm.assert_frame_equal(poly_result, want)
  def test_interp_raise_on_only_mixed(self, axis):
      """A mixed frame cast to object dtype raises TypeError on interpolate()."""
      frame = DataFrame(
          {
              "A": [1, 2, np.nan, 4],
              "B": ["a", "b", "c", "d"],
              "C": [np.nan, 2, 5, 7],
              "D": [np.nan, np.nan, 9, 9],
              "E": [1, 2, 3, 4],
          }
      )
      pattern = (
          "Cannot interpolate with all object-dtype columns "
          "in the DataFrame. Try setting at least one "
          "column to a numeric dtype."
      )

      with pytest.raises(TypeError, match=pattern):
          frame.astype("object").interpolate(axis=axis)
  def test_interp_raise_on_all_object_dtype(self):
      """A frame built with ``dtype="object"`` raises TypeError on interpolate()."""
      # GH 22985
      frame = DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, dtype="object")
      pattern = (
          "Cannot interpolate with all object-dtype columns "
          "in the DataFrame. Try setting at least one "
          "column to a numeric dtype."
      )

      with pytest.raises(TypeError, match=pattern):
          frame.interpolate()
  279. def test_interp_inplace(self, using_copy_on_write):
  280. df = DataFrame({"a": [1.0, 2.0, np.nan, 4.0]})
  281. expected = DataFrame({"a": [1.0, 2.0, 3.0, 4.0]})
  282. expected_cow = df.copy()
  283. result = df.copy()
  284. return_value = result["a"].interpolate(inplace=True)
  285. assert return_value is None
  286. if using_copy_on_write:
  287. tm.assert_frame_equal(result, expected_cow)
  288. else:
  289. tm.assert_frame_equal(result, expected)
  290. result = df.copy()
  291. return_value = result["a"].interpolate(inplace=True, downcast="infer")
  292. assert return_value is None
  293. if using_copy_on_write:
  294. tm.assert_frame_equal(result, expected_cow)
  295. else:
  296. tm.assert_frame_equal(result, expected.astype("int64"))
  297. def test_interp_inplace_row(self):
  298. # GH 10395
  299. result = DataFrame(
  300. {"a": [1.0, 2.0, 3.0, 4.0], "b": [np.nan, 2.0, 3.0, 4.0], "c": [3, 2, 2, 2]}
  301. )
  302. expected = result.interpolate(method="linear", axis=1, inplace=False)
  303. return_value = result.interpolate(method="linear", axis=1, inplace=True)
  304. assert return_value is None
  305. tm.assert_frame_equal(result, expected)
  306. def test_interp_ignore_all_good(self):
  307. # GH
  308. df = DataFrame(
  309. {
  310. "A": [1, 2, np.nan, 4],
  311. "B": [1, 2, 3, 4],
  312. "C": [1.0, 2.0, np.nan, 4.0],
  313. "D": [1.0, 2.0, 3.0, 4.0],
  314. }
  315. )
  316. expected = DataFrame(
  317. {
  318. "A": np.array([1, 2, 3, 4], dtype="float64"),
  319. "B": np.array([1, 2, 3, 4], dtype="int64"),
  320. "C": np.array([1.0, 2.0, 3, 4.0], dtype="float64"),
  321. "D": np.array([1.0, 2.0, 3.0, 4.0], dtype="float64"),
  322. }
  323. )
  324. result = df.interpolate(downcast=None)
  325. tm.assert_frame_equal(result, expected)
  326. # all good
  327. result = df[["B", "D"]].interpolate(downcast=None)
  328. tm.assert_frame_equal(result, df[["B", "D"]])
  329. def test_interp_time_inplace_axis(self):
  330. # GH 9687
  331. periods = 5
  332. idx = date_range(start="2014-01-01", periods=periods)
  333. data = np.random.rand(periods, periods)
  334. data[data < 0.5] = np.nan
  335. expected = DataFrame(index=idx, columns=idx, data=data)
  336. result = expected.interpolate(axis=0, method="time")
  337. return_value = expected.interpolate(axis=0, method="time", inplace=True)
  338. assert return_value is None
  339. tm.assert_frame_equal(result, expected)
  @pytest.mark.parametrize("axis_name, axis_number", [("index", 0), ("columns", 1)])
  def test_interp_string_axis(self, axis_name, axis_number):
      """Linear interpolation gives the same frame for an axis name and its number.

      The frame holds ten identical rows of ``sin(x)`` and is reindexed onto
      the columns ``x * 1.005`` before interpolating.
      """
      # https://github.com/pandas-dev/pandas/issues/25190
      x = np.linspace(0, 100, 1000)
      rows = np.tile(np.sin(x), (10, 1))
      base = DataFrame(data=rows, index=np.arange(10), columns=x)
      frame = base.reindex(columns=x * 1.005)

      by_name = frame.interpolate(method="linear", axis=axis_name)
      by_number = frame.interpolate(method="linear", axis=axis_number)

      tm.assert_frame_equal(by_name, by_number)
  @pytest.mark.parametrize("method", ["ffill", "bfill", "pad"])
  def test_interp_fillna_methods(self, request, axis, method, using_array_manager):
      """``interpolate(method=...)`` matches ``fillna(method=...)`` on the same axis.

      Runs for the fill methods "ffill", "bfill" and "pad".
      """
      # GH 12918
      if axis in (1, "columns") and using_array_manager:
          # TODO(ArrayManager) support axis=1
          td.mark_array_manager_not_yet_implemented(request)

      frame = DataFrame(
          {
              "A": [1.0, 2.0, 3.0, 4.0, np.nan, 5.0],
              "B": [2.0, 4.0, 6.0, np.nan, 8.0, 10.0],
              "C": [3.0, 6.0, 9.0, np.nan, np.nan, 30.0],
          }
      )

      via_fillna = frame.fillna(axis=axis, method=method)
      via_interpolate = frame.interpolate(method=method, axis=axis)

      tm.assert_frame_equal(via_interpolate, via_fillna)