# Tests for DataFrame.compare.
import re

import numpy as np
import pytest

from pandas.compat.numpy import np_version_gte1p25

import pandas as pd
import pandas._testing as tm
@pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"])
def test_compare_axis(align_axis):
    """Check the layout of ``DataFrame.compare`` for each ``align_axis`` spelling.

    Two cells are changed in a copy of the frame (row 0 of ``col1`` and row 2
    of ``col3``). The result must contain only those rows and columns, with
    the ``self``/``other`` level placed on the columns for ``1``/``"columns"``
    and on the index otherwise.
    """
    # GH#30429
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"
    df2.loc[2, "col3"] = 4.0

    result = df.compare(df2, align_axis=align_axis)

    if align_axis in (1, "columns"):
        # self/other are laid out side by side as an inner column level
        indices = pd.Index([0, 2])
        columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]])
        expected = pd.DataFrame(
            [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]],
            index=indices,
            columns=columns,
        )
    else:
        # self/other are stacked as an inner index level
        indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]])
        columns = pd.Index(["col1", "col3"])
        expected = pd.DataFrame(
            [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]],
            index=indices,
            columns=columns,
        )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "keep_shape, keep_equal",
    [
        (True, False),
        (False, True),
        (True, True),
        # False, False case is already covered in test_compare_axis
    ],
)
def test_compare_various_formats(keep_shape, keep_equal):
    """Check ``DataFrame.compare`` output for ``keep_shape``/``keep_equal`` combinations.

    The second frame differs from the first in row 0 of ``col1`` and row 2 of
    ``col3``. The expected frame is built per combination of the two flags.
    """
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"
    df2.loc[2, "col3"] = 4.0

    result = df.compare(df2, keep_shape=keep_shape, keep_equal=keep_equal)

    if keep_shape:
        # All original rows and columns are expected to be present.
        indices = pd.Index([0, 1, 2])
        columns = pd.MultiIndex.from_product(
            [["col1", "col2", "col3"], ["self", "other"]]
        )
        if keep_equal:
            # Equal cells show their actual values.
            expected = pd.DataFrame(
                [
                    ["a", "c", 1.0, 1.0, 1.0, 1.0],
                    ["b", "b", 2.0, 2.0, 2.0, 2.0],
                    ["c", "c", np.nan, np.nan, 3.0, 4.0],
                ],
                index=indices,
                columns=columns,
            )
        else:
            # Equal cells are blanked out with NaN.
            expected = pd.DataFrame(
                [
                    ["a", "c", np.nan, np.nan, np.nan, np.nan],
                    [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
                    [np.nan, np.nan, np.nan, np.nan, 3.0, 4.0],
                ],
                index=indices,
                columns=columns,
            )
    else:
        # Only rows/columns containing a difference are expected; equal cells
        # inside them keep their values (this branch is only reached with
        # keep_equal=True, see the parametrization).
        indices = pd.Index([0, 2])
        columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]])
        expected = pd.DataFrame(
            [["a", "c", 1.0, 1.0], ["c", "c", 3.0, 4.0]], index=indices, columns=columns
        )
    tm.assert_frame_equal(result, expected)
def test_compare_with_equal_nulls():
    """NaNs at the same position in both frames are treated as equal.

    ``col2`` holds a NaN in both frames and must not show up in the result;
    only the changed ``col1`` cell in row 0 is reported.
    """
    # We want to make sure two NaNs are considered the same
    # and dropped where applicable
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"

    result = df.compare(df2)

    indices = pd.Index([0])
    columns = pd.MultiIndex.from_product([["col1"], ["self", "other"]])
    expected = pd.DataFrame([["a", "c"]], index=indices, columns=columns)
    tm.assert_frame_equal(result, expected)
def test_compare_with_non_equal_nulls():
    """A NaN that replaces a real value is reported as a difference.

    Row 2 of ``col3`` is ``3.0`` in the first frame and NaN in the second, so
    that row/column must be kept in the result alongside the ``col1`` change.
    """
    # We want to make sure the relevant NaNs do not get dropped
    # even if the entire row or column are NaNs
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
        columns=["col1", "col2", "col3"],
    )
    df2 = df.copy()
    df2.loc[0, "col1"] = "c"
    df2.loc[2, "col3"] = np.nan

    result = df.compare(df2)

    indices = pd.Index([0, 2])
    columns = pd.MultiIndex.from_product([["col1", "col3"], ["self", "other"]])
    expected = pd.DataFrame(
        [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, np.nan]],
        index=indices,
        columns=columns,
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("align_axis", [0, 1])
def test_compare_multi_index(align_axis):
    """Check ``DataFrame.compare`` on frames with MultiIndex rows and columns.

    The ``self``/``other`` labels are expected as an additional innermost
    level on the index (``align_axis=0``) or on the columns (``align_axis=1``).
    """
    df = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]}
    )
    df.columns = pd.MultiIndex.from_arrays([["a", "a", "b"], ["col1", "col2", "col3"]])
    df.index = pd.MultiIndex.from_arrays([["x", "x", "y"], [0, 1, 2]])

    df2 = df.copy()
    # Positional assignment: first cell and last cell of the frame.
    df2.iloc[0, 0] = "c"
    df2.iloc[2, 2] = 4.0

    result = df.compare(df2, align_axis=align_axis)

    if align_axis == 0:
        indices = pd.MultiIndex.from_arrays(
            [["x", "x", "y", "y"], [0, 0, 2, 2], ["self", "other", "self", "other"]]
        )
        columns = pd.MultiIndex.from_arrays([["a", "b"], ["col1", "col3"]])
        data = [["a", np.nan], ["c", np.nan], [np.nan, 3.0], [np.nan, 4.0]]
    else:
        indices = pd.MultiIndex.from_arrays([["x", "y"], [0, 2]])
        columns = pd.MultiIndex.from_arrays(
            [
                ["a", "a", "b", "b"],
                ["col1", "col1", "col3", "col3"],
                ["self", "other", "self", "other"],
            ]
        )
        data = [["a", "c", np.nan, np.nan], [np.nan, np.nan, 3.0, 4.0]]

    expected = pd.DataFrame(data=data, index=indices, columns=columns)
    tm.assert_frame_equal(result, expected)
def test_compare_unaligned_objects():
    """``DataFrame.compare`` raises ``ValueError`` when the frames' labels differ.

    Covers two situations: same shape with a different index label, and
    frames of different shapes.
    """
    msg = (
        r"Can only compare identically-labeled \(both index and columns\) DataFrame "
        "objects"
    )

    # Frames are built outside the ``pytest.raises`` blocks so that only the
    # ``compare`` call itself can satisfy the expected exception.

    # test DataFrames with different indices
    df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"])
    df2 = pd.DataFrame([1, 2, 3], index=["a", "b", "d"])
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)

    # test DataFrames with different shapes
    df1 = pd.DataFrame(np.ones((3, 3)))
    df2 = pd.DataFrame(np.zeros((2, 1)))
    with pytest.raises(ValueError, match=msg):
        df1.compare(df2)
def test_compare_result_names():
    """``result_names`` supplies the labels of the inner column level.

    With ``result_names=("left", "right")`` the differing columns are
    expected to be labelled ``left``/``right``.
    """
    # GH 44354
    df1 = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    df2 = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    result = df1.compare(df2, result_names=("left", "right"))

    expected = pd.DataFrame(
        {
            ("col1", "left"): {0: "a", 2: np.nan},
            ("col1", "right"): {0: "c", 2: np.nan},
            ("col3", "left"): {0: np.nan, 2: 3.0},
            ("col3", "right"): {0: np.nan, 2: np.nan},
        }
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "result_names",
    [
        [1, 2],
        "HK",
        {"2": 2, "3": 3},
        3,
        3.0,
    ],
)
def test_invalid_input_result_names(result_names):
    """``DataFrame.compare`` raises ``TypeError`` for non-tuple ``result_names``.

    The error message is expected to name the offending type.
    """
    # GH 44354
    df1 = pd.DataFrame(
        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 3.0]},
    )
    df2 = pd.DataFrame(
        {
            "col1": ["c", "b", "c"],
            "col2": [1.0, 2.0, np.nan],
            "col3": [1.0, 2.0, np.nan],
        },
    )

    msg = (
        f"Passing 'result_names' as a {type(result_names)} is not "
        "supported. Provide 'result_names' as a tuple instead."
    )
    # ``match`` is a regular expression; escape the message so that it is
    # compared literally (e.g. "." must not act as a wildcard).
    with pytest.raises(TypeError, match=re.escape(msg)):
        df1.compare(df2, result_names=result_names)
- @pytest.mark.parametrize(
- "val1,val2",
- [(4, pd.NA), (pd.NA, pd.NA), (pd.NA, 4)],
- )
- def test_compare_ea_and_np_dtype(val1, val2):
- # GH 48966
- arr = [4.0, val1]
- ser = pd.Series([1, val2], dtype="Int64")
- df1 = pd.DataFrame({"a": arr, "b": [1.0, 2]})
- df2 = pd.DataFrame({"a": ser, "b": [1.0, 2]})
- expected = pd.DataFrame(
- {
- ("a", "self"): arr,
- ("a", "other"): ser,
- ("b", "self"): np.nan,
- ("b", "other"): np.nan,
- }
- )
- if val1 is pd.NA and np_version_gte1p25:
- # can't compare with numpy array if it contains pd.NA
- with pytest.raises(TypeError, match="boolean value of NA is ambiguous"):
- result = df1.compare(df2, keep_shape=True)
- else:
- result = df1.compare(df2, keep_shape=True)
- tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize(
    "df1_val,df2_val,diff_self,diff_other",
    [
        (4, 3, 4, 3),
        (4, 4, pd.NA, pd.NA),
        (4, pd.NA, 4, pd.NA),
        (pd.NA, pd.NA, pd.NA, pd.NA),
    ],
)
def test_compare_nullable_int64_dtype(df1_val, df2_val, diff_self, diff_other):
    """Check ``DataFrame.compare`` when both frames hold an ``Int64`` column.

    Only row 0 of column ``a`` may differ between the frames. With
    ``keep_shape=True`` the expected frame keeps every row and column, the
    ``a`` columns stay ``Int64``, and the unchanged ``b`` columns are NaN.
    """
    # GH 48966
    df1 = pd.DataFrame({"a": pd.Series([df1_val, pd.NA], dtype="Int64"), "b": [1.0, 2]})
    df2 = df1.copy()
    df2.loc[0, "a"] = df2_val

    expected = pd.DataFrame(
        {
            ("a", "self"): pd.Series([diff_self, pd.NA], dtype="Int64"),
            ("a", "other"): pd.Series([diff_other, pd.NA], dtype="Int64"),
            ("b", "self"): np.nan,
            ("b", "other"): np.nan,
        }
    )
    result = df1.compare(df2, keep_shape=True)
    tm.assert_frame_equal(result, expected)
|