# test_compare.py (9.2 KB)

import re

import numpy as np
import pytest

from pandas.compat.numpy import np_version_gte1p25

import pandas as pd
import pandas._testing as tm


  6. @pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"])
  7. def test_compare_axis(align_axis):
  8. # GH#30429
  9. df = pd.DataFrame(
  10. {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
  11. columns=["col1", "col2", "col3"],
  12. )
  13. df2 = df.copy()
  14. df2.loc[0, "col1"] = "c"
  15. df2.loc[2, "col3"] = 4.0
  16. result = df.compare(df2, align_axis=align_axis)
  17. if align_axis in (1, "columns"):
  18. indices = pd.Index([0, 2])
  19. columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]])
  20. expected = pd.DataFrame(
  21. [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]],
  22. index=indices,
  23. columns=columns,
  24. )
  25. else:
  26. indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]])
  27. columns = pd.Index(["col1", "col3"])
  28. expected = pd.DataFrame(
  29. [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]],
  30. index=indices,
  31. columns=columns,
  32. )
  33. tm.assert_frame_equal(result, expected)
  34. @pytest.mark.parametrize(
  35. "keep_shape, keep_equal",
  36. [
  37. (True, False),
  38. (False, True),
  39. (True, True),
  40. # False, False case is already covered in test_compare_axis
  41. ],
  42. )
  43. def test_compare_various_formats(keep_shape, keep_equal):
  44. df = pd.DataFrame(
  45. {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
  46. columns=["col1", "col2", "col3"],
  47. )
  48. df2 = df.copy()
  49. df2.loc[0, "col1"] = "c"
  50. df2.loc[2, "col3"] = 4.0
  51. result = df.compare(df2, keep_shape=keep_shape, keep_equal=keep_equal)
  52. if keep_shape:
  53. indices = pd.Index([0, 1, 2])
  54. columns = pd.MultiIndex.from_product(
  55. [["col1", "col2", "col3"], ["self", "other"]]
  56. )
  57. if keep_equal:
  58. expected = pd.DataFrame(
  59. [
  60. ["a", "c", 1.0, 1.0, 1.0, 1.0],
  61. ["b", "b", 2.0, 2.0, 2.0, 2.0],
  62. ["c", "c", np.nan, np.nan, 3.0, 4.0],
  63. ],
  64. index=indices,
  65. columns=columns,
  66. )
  67. else:
  68. expected = pd.DataFrame(
  69. [
  70. ["a", "c", np.nan, np.nan, np.nan, np.nan],
  71. [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
  72. [np.nan, np.nan, np.nan, np.nan, 3.0, 4.0],
  73. ],
  74. index=indices,
  75. columns=columns,
  76. )
  77. else:
  78. indices = pd.Index([0, 2])
  79. columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]])
  80. expected = pd.DataFrame(
  81. [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]], index=indices, columns=columns
  82. )
  83. tm.assert_frame_equal(result, expected)
  84. def test_compare_with_equal_nulls():
  85. # We want to make sure two NaNs are considered the same
  86. # and dropped where applicable
  87. df = pd.DataFrame(
  88. {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
  89. columns=["col1", "col2", "col3"],
  90. )
  91. df2 = df.copy()
  92. df2.loc[0, "col1"] = "c"
  93. result = df.compare(df2)
  94. indices = pd.Index([0])
  95. columns = pd.MultiIndex.from_product([["col1"], ["self", "other"]])
  96. expected = pd.DataFrame([["a", "c"]], index=indices, columns=columns)
  97. tm.assert_frame_equal(result, expected)
  98. def test_compare_with_non_equal_nulls():
  99. # We want to make sure the relevant NaNs do not get dropped
  100. # even if the entire row or column are NaNs
  101. df = pd.DataFrame(
  102. {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
  103. columns=["col1", "col2", "col3"],
  104. )
  105. df2 = df.copy()
  106. df2.loc[0, "col1"] = "c"
  107. df2.loc[2, "col3"] = np.nan
  108. result = df.compare(df2)
  109. indices = pd.Index([0, 2])
  110. columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]])
  111. expected = pd.DataFrame(
  112. [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan]],
  113. index=indices,
  114. columns=columns,
  115. )
  116. tm.assert_frame_equal(result, expected)
  117. @pytest.mark.parametrize("align_axis", [0, 1])
  118. def test_compare_multi_index(align_axis):
  119. df = pd.DataFrame(
  120. {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}
  121. )
  122. df.columns = pd.MultiIndex.from_arrays([["a", "a", "b"], ["col1", "col2", "col3"]])
  123. df.index = pd.MultiIndex.from_arrays([["x", "x", "y"], [0, 1, 2]])
  124. df2 = df.copy()
  125. df2.iloc[0, 0] = "c"
  126. df2.iloc[2, 2] = 4.0
  127. result = df.compare(df2, align_axis=align_axis)
  128. if align_axis == 0:
  129. indices = pd.MultiIndex.from_arrays(
  130. [["x", "x", "y", "y"], [0, 0, 2, 2], ["self", "other", "self", "other"]]
  131. )
  132. columns = pd.MultiIndex.from_arrays([["a", "b"], ["col1", "col3"]])
  133. data = [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]]
  134. else:
  135. indices = pd.MultiIndex.from_arrays([["x", "y"], [0, 2]])
  136. columns = pd.MultiIndex.from_arrays(
  137. [
  138. ["a", "a", "b", "b"],
  139. ["col1", "col1", "col3", "col3"],
  140. ["self", "other", "self", "other"],
  141. ]
  142. )
  143. data = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]]
  144. expected = pd.DataFrame(data=data, index=indices, columns=columns)
  145. tm.assert_frame_equal(result, expected)
  146. def test_compare_unaligned_objects():
  147. # test DataFrames with different indices
  148. msg = (
  149. r"Can only compare identically-labeled \(both index and columns\) DataFrame "
  150. "objects"
  151. )
  152. with pytest.raises(ValueError, match=msg):
  153. df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"])
  154. df2 = pd.DataFrame([1, 2, 3], index=["a", "b", "d"])
  155. df1.compare(df2)
  156. # test DataFrames with different shapes
  157. msg = (
  158. r"Can only compare identically-labeled \(both index and columns\) DataFrame "
  159. "objects"
  160. )
  161. with pytest.raises(ValueError, match=msg):
  162. df1 = pd.DataFrame(np.ones((3, 3)))
  163. df2 = pd.DataFrame(np.zeros((2, 1)))
  164. df1.compare(df2)
  165. def test_compare_result_names():
  166. # GH 44354
  167. df1 = pd.DataFrame(
  168. {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
  169. )
  170. df2 = pd.DataFrame(
  171. {
  172. "col1": ["c", "b", "c"],
  173. "col2": [1.0, 2.0, np.nan],
  174. "col3": [1.0, 2.0, np.nan],
  175. },
  176. )
  177. result = df1.compare(df2, result_names=("left", "right"))
  178. expected = pd.DataFrame(
  179. {
  180. ("col1", "left"): {0: "a", 2: np.nan},
  181. ("col1", "right"): {0: "c", 2: np.nan},
  182. ("col3", "left"): {0: np.nan, 2: 3.0},
  183. ("col3", "right"): {0: np.nan, 2: np.nan},
  184. }
  185. )
  186. tm.assert_frame_equal(result, expected)
  187. @pytest.mark.parametrize(
  188. "result_names",
  189. [
  190. [1, 2],
  191. "HK",
  192. {"2": 2, "3": 3},
  193. 3,
  194. 3.0,
  195. ],
  196. )
  197. def test_invalid_input_result_names(result_names):
  198. # GH 44354
  199. df1 = pd.DataFrame(
  200. {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
  201. )
  202. df2 = pd.DataFrame(
  203. {
  204. "col1": ["c", "b", "c"],
  205. "col2": [1.0, 2.0, np.nan],
  206. "col3": [1.0, 2.0, np.nan],
  207. },
  208. )
  209. with pytest.raises(
  210. TypeError,
  211. match=(
  212. f"Passing 'result_names' as a {type(result_names)} is not "
  213. "supported. Provide 'result_names' as a tuple instead."
  214. ),
  215. ):
  216. df1.compare(df2, result_names=result_names)
  217. @pytest.mark.parametrize(
  218. "val1,val2",
  219. [(4, pd.NA), (pd.NA, pd.NA), (pd.NA, 4)],
  220. )
  221. def test_compare_ea_and_np_dtype(val1, val2):
  222. # GH 48966
  223. arr = [4.0, val1]
  224. ser = pd.Series([1, val2], dtype="Int64")
  225. df1 = pd.DataFrame({"a": arr, "b": [1.0, 2]})
  226. df2 = pd.DataFrame({"a": ser, "b": [1.0, 2]})
  227. expected = pd.DataFrame(
  228. {
  229. ("a", "self"): arr,
  230. ("a", "other"): ser,
  231. ("b", "self"): np.nan,
  232. ("b", "other"): np.nan,
  233. }
  234. )
  235. if val1 is pd.NA and np_version_gte1p25:
  236. # can't compare with numpy array if it contains pd.NA
  237. with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
  238. result = df1.compare(df2, keep_shape=True)
  239. else:
  240. result = df1.compare(df2, keep_shape=True)
  241. tm.assert_frame_equal(result, expected)
  242. @pytest.mark.parametrize(
  243. "df1_val,df2_val,diff_self,diff_other",
  244. [
  245. (4, 3, 4, 3),
  246. (4, 4, pd.NA, pd.NA),
  247. (4, pd.NA, 4, pd.NA),
  248. (pd.NA, pd.NA, pd.NA, pd.NA),
  249. ],
  250. )
  251. def test_compare_nullable_int64_dtype(df1_val, df2_val, diff_self, diff_other):
  252. # GH 48966
  253. df1 = pd.DataFrame({"a": pd.Series([df1_val, pd.NA], dtype="Int64"), "b": [1.0, 2]})
  254. df2 = df1.copy()
  255. df2.loc[0, "a"] = df2_val
  256. expected = pd.DataFrame(
  257. {
  258. ("a", "self"): pd.Series([diff_self, pd.NA], dtype="Int64"),
  259. ("a", "other"): pd.Series([diff_other, pd.NA], dtype="Int64"),
  260. ("b", "self"): np.nan,
  261. ("b", "other"): np.nan,
  262. }
  263. )
  264. result = df1.compare(df2, keep_shape=True)
  265. tm.assert_frame_equal(result, expected)