test_compare.py 4.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. import numpy as np
  2. import pytest
  3. import pandas as pd
  4. import pandas._testing as tm
  5. @pytest.mark.parametrize("align_axis", [0, 1, "index", "columns"])
  6. def test_compare_axis(align_axis):
  7. # GH#30429
  8. s1 = pd.Series(["a", "b", "c"])
  9. s2 = pd.Series(["x", "b", "z"])
  10. result = s1.compare(s2, align_axis=align_axis)
  11. if align_axis in (1, "columns"):
  12. indices = pd.Index([0, 2])
  13. columns = pd.Index(["self", "other"])
  14. expected = pd.DataFrame(
  15. [["a", "x"], ["c", "z"]], index=indices, columns=columns
  16. )
  17. tm.assert_frame_equal(result, expected)
  18. else:
  19. indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]])
  20. expected = pd.Series(["a", "x", "c", "z"], index=indices)
  21. tm.assert_series_equal(result, expected)
  22. @pytest.mark.parametrize(
  23. "keep_shape, keep_equal",
  24. [
  25. (True, False),
  26. (False, True),
  27. (True, True),
  28. # False, False case is already covered in test_compare_axis
  29. ],
  30. )
  31. def test_compare_various_formats(keep_shape, keep_equal):
  32. s1 = pd.Series(["a", "b", "c"])
  33. s2 = pd.Series(["x", "b", "z"])
  34. result = s1.compare(s2, keep_shape=keep_shape, keep_equal=keep_equal)
  35. if keep_shape:
  36. indices = pd.Index([0, 1, 2])
  37. columns = pd.Index(["self", "other"])
  38. if keep_equal:
  39. expected = pd.DataFrame(
  40. [["a", "x"], ["b", "b"], ["c", "z"]], index=indices, columns=columns
  41. )
  42. else:
  43. expected = pd.DataFrame(
  44. [["a", "x"], [np.nan, np.nan], ["c", "z"]],
  45. index=indices,
  46. columns=columns,
  47. )
  48. else:
  49. indices = pd.Index([0, 2])
  50. columns = pd.Index(["self", "other"])
  51. expected = pd.DataFrame(
  52. [["a", "x"], ["c", "z"]], index=indices, columns=columns
  53. )
  54. tm.assert_frame_equal(result, expected)
  55. def test_compare_with_equal_nulls():
  56. # We want to make sure two NaNs are considered the same
  57. # and dropped where applicable
  58. s1 = pd.Series(["a", "b", np.nan])
  59. s2 = pd.Series(["x", "b", np.nan])
  60. result = s1.compare(s2)
  61. expected = pd.DataFrame([["a", "x"]], columns=["self", "other"])
  62. tm.assert_frame_equal(result, expected)
  63. def test_compare_with_non_equal_nulls():
  64. # We want to make sure the relevant NaNs do not get dropped
  65. s1 = pd.Series(["a", "b", "c"])
  66. s2 = pd.Series(["x", "b", np.nan])
  67. result = s1.compare(s2, align_axis=0)
  68. indices = pd.MultiIndex.from_product([[0, 2], ["self", "other"]])
  69. expected = pd.Series(["a", "x", "c", np.nan], index=indices)
  70. tm.assert_series_equal(result, expected)
  71. def test_compare_multi_index():
  72. index = pd.MultiIndex.from_arrays([[0, 0, 1], [0, 1, 2]])
  73. s1 = pd.Series(["a", "b", "c"], index=index)
  74. s2 = pd.Series(["x", "b", "z"], index=index)
  75. result = s1.compare(s2, align_axis=0)
  76. indices = pd.MultiIndex.from_arrays(
  77. [[0, 0, 1, 1], [0, 0, 2, 2], ["self", "other", "self", "other"]]
  78. )
  79. expected = pd.Series(["a", "x", "c", "z"], index=indices)
  80. tm.assert_series_equal(result, expected)
  81. def test_compare_unaligned_objects():
  82. # test Series with different indices
  83. msg = "Can only compare identically-labeled Series objects"
  84. with pytest.raises(ValueError, match=msg):
  85. ser1 = pd.Series([1, 2, 3], index=["a", "b", "c"])
  86. ser2 = pd.Series([1, 2, 3], index=["a", "b", "d"])
  87. ser1.compare(ser2)
  88. # test Series with different lengths
  89. msg = "Can only compare identically-labeled Series objects"
  90. with pytest.raises(ValueError, match=msg):
  91. ser1 = pd.Series([1, 2, 3])
  92. ser2 = pd.Series([1, 2, 3, 4])
  93. ser1.compare(ser2)
  94. def test_compare_datetime64_and_string():
  95. # Issue https://github.com/pandas-dev/pandas/issues/45506
  96. # Catch OverflowError when comparing datetime64 and string
  97. data = [
  98. {"a": "2015-07-01", "b": "08335394550"},
  99. {"a": "2015-07-02", "b": "+49 (0) 0345 300033"},
  100. {"a": "2015-07-03", "b": "+49(0)2598 04457"},
  101. {"a": "2015-07-04", "b": "0741470003"},
  102. {"a": "2015-07-05", "b": "04181 83668"},
  103. ]
  104. dtypes = {"a": "datetime64[ns]", "b": "string"}
  105. df = pd.DataFrame(data=data).astype(dtypes)
  106. result_eq1 = df["a"].eq(df["b"])
  107. result_eq2 = df["a"] == df["b"]
  108. result_neq = df["a"] != df["b"]
  109. expected_eq = pd.Series([False] * 5) # For .eq and ==
  110. expected_neq = pd.Series([True] * 5) # For !=
  111. tm.assert_series_equal(result_eq1, expected_eq)
  112. tm.assert_series_equal(result_eq2, expected_eq)
  113. tm.assert_series_equal(result_neq, expected_neq)