test_join.py 8.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. import numpy as np
  2. import pytest
  3. from pandas import (
  4. DataFrame,
  5. Index,
  6. Interval,
  7. MultiIndex,
  8. Series,
  9. StringDtype,
  10. )
  11. import pandas._testing as tm
  12. @pytest.mark.parametrize(
  13. "other", [Index(["three", "one", "two"]), Index(["one"]), Index(["one", "three"])]
  14. )
  15. def test_join_level(idx, other, join_type):
  16. join_index, lidx, ridx = other.join(
  17. idx, how=join_type, level="second", return_indexers=True
  18. )
  19. exp_level = other.join(idx.levels[1], how=join_type)
  20. assert join_index.levels[0].equals(idx.levels[0])
  21. assert join_index.levels[1].equals(exp_level)
  22. # pare down levels
  23. mask = np.array([x[1] in exp_level for x in idx], dtype=bool)
  24. exp_values = idx.values[mask]
  25. tm.assert_numpy_array_equal(join_index.values, exp_values)
  26. if join_type in ("outer", "inner"):
  27. join_index2, ridx2, lidx2 = idx.join(
  28. other, how=join_type, level="second", return_indexers=True
  29. )
  30. assert join_index.equals(join_index2)
  31. tm.assert_numpy_array_equal(lidx, lidx2)
  32. tm.assert_numpy_array_equal(ridx, ridx2)
  33. tm.assert_numpy_array_equal(join_index2.values, exp_values)
  34. def test_join_level_corner_case(idx):
  35. # some corner cases
  36. index = Index(["three", "one", "two"])
  37. result = index.join(idx, level="second")
  38. assert isinstance(result, MultiIndex)
  39. with pytest.raises(TypeError, match="Join.*MultiIndex.*ambiguous"):
  40. idx.join(idx, level=1)
  41. def test_join_self(idx, join_type):
  42. joined = idx.join(idx, how=join_type)
  43. tm.assert_index_equal(joined, idx)
  44. def test_join_multi():
  45. # GH 10665
  46. midx = MultiIndex.from_product([np.arange(4), np.arange(4)], names=["a", "b"])
  47. idx = Index([1, 2, 5], name="b")
  48. # inner
  49. jidx, lidx, ridx = midx.join(idx, how="inner", return_indexers=True)
  50. exp_idx = MultiIndex.from_product([np.arange(4), [1, 2]], names=["a", "b"])
  51. exp_lidx = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.intp)
  52. exp_ridx = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.intp)
  53. tm.assert_index_equal(jidx, exp_idx)
  54. tm.assert_numpy_array_equal(lidx, exp_lidx)
  55. tm.assert_numpy_array_equal(ridx, exp_ridx)
  56. # flip
  57. jidx, ridx, lidx = idx.join(midx, how="inner", return_indexers=True)
  58. tm.assert_index_equal(jidx, exp_idx)
  59. tm.assert_numpy_array_equal(lidx, exp_lidx)
  60. tm.assert_numpy_array_equal(ridx, exp_ridx)
  61. # keep MultiIndex
  62. jidx, lidx, ridx = midx.join(idx, how="left", return_indexers=True)
  63. exp_ridx = np.array(
  64. [-1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1, -1, 0, 1, -1], dtype=np.intp
  65. )
  66. tm.assert_index_equal(jidx, midx)
  67. assert lidx is None
  68. tm.assert_numpy_array_equal(ridx, exp_ridx)
  69. # flip
  70. jidx, ridx, lidx = idx.join(midx, how="right", return_indexers=True)
  71. tm.assert_index_equal(jidx, midx)
  72. assert lidx is None
  73. tm.assert_numpy_array_equal(ridx, exp_ridx)
  74. def test_join_self_unique(idx, join_type):
  75. if idx.is_unique:
  76. joined = idx.join(idx, how=join_type)
  77. assert (idx == joined).all()
  78. def test_join_multi_wrong_order():
  79. # GH 25760
  80. # GH 28956
  81. midx1 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
  82. midx2 = MultiIndex.from_product([[1, 2], [3, 4]], names=["b", "a"])
  83. join_idx, lidx, ridx = midx1.join(midx2, return_indexers=True)
  84. exp_ridx = np.array([-1, -1, -1, -1], dtype=np.intp)
  85. tm.assert_index_equal(midx1, join_idx)
  86. assert lidx is None
  87. tm.assert_numpy_array_equal(ridx, exp_ridx)
  88. def test_join_multi_return_indexers():
  89. # GH 34074
  90. midx1 = MultiIndex.from_product([[1, 2], [3, 4], [5, 6]], names=["a", "b", "c"])
  91. midx2 = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])
  92. result = midx1.join(midx2, return_indexers=False)
  93. tm.assert_index_equal(result, midx1)
  94. def test_join_overlapping_interval_level():
  95. # GH 44096
  96. idx_1 = MultiIndex.from_tuples(
  97. [
  98. (1, Interval(0.0, 1.0)),
  99. (1, Interval(1.0, 2.0)),
  100. (1, Interval(2.0, 5.0)),
  101. (2, Interval(0.0, 1.0)),
  102. (2, Interval(1.0, 3.0)), # interval limit is here at 3.0, not at 2.0
  103. (2, Interval(3.0, 5.0)),
  104. ],
  105. names=["num", "interval"],
  106. )
  107. idx_2 = MultiIndex.from_tuples(
  108. [
  109. (1, Interval(2.0, 5.0)),
  110. (1, Interval(0.0, 1.0)),
  111. (1, Interval(1.0, 2.0)),
  112. (2, Interval(3.0, 5.0)),
  113. (2, Interval(0.0, 1.0)),
  114. (2, Interval(1.0, 3.0)),
  115. ],
  116. names=["num", "interval"],
  117. )
  118. expected = MultiIndex.from_tuples(
  119. [
  120. (1, Interval(0.0, 1.0)),
  121. (1, Interval(1.0, 2.0)),
  122. (1, Interval(2.0, 5.0)),
  123. (2, Interval(0.0, 1.0)),
  124. (2, Interval(1.0, 3.0)),
  125. (2, Interval(3.0, 5.0)),
  126. ],
  127. names=["num", "interval"],
  128. )
  129. result = idx_1.join(idx_2, how="outer")
  130. tm.assert_index_equal(result, expected)
  131. def test_join_midx_ea():
  132. # GH#49277
  133. midx = MultiIndex.from_arrays(
  134. [Series([1, 1, 3], dtype="Int64"), Series([1, 2, 3], dtype="Int64")],
  135. names=["a", "b"],
  136. )
  137. midx2 = MultiIndex.from_arrays(
  138. [Series([1], dtype="Int64"), Series([3], dtype="Int64")], names=["a", "c"]
  139. )
  140. result = midx.join(midx2, how="inner")
  141. expected = MultiIndex.from_arrays(
  142. [
  143. Series([1, 1], dtype="Int64"),
  144. Series([1, 2], dtype="Int64"),
  145. Series([3, 3], dtype="Int64"),
  146. ],
  147. names=["a", "b", "c"],
  148. )
  149. tm.assert_index_equal(result, expected)
  150. def test_join_midx_string():
  151. # GH#49277
  152. midx = MultiIndex.from_arrays(
  153. [
  154. Series(["a", "a", "c"], dtype=StringDtype()),
  155. Series(["a", "b", "c"], dtype=StringDtype()),
  156. ],
  157. names=["a", "b"],
  158. )
  159. midx2 = MultiIndex.from_arrays(
  160. [Series(["a"], dtype=StringDtype()), Series(["c"], dtype=StringDtype())],
  161. names=["a", "c"],
  162. )
  163. result = midx.join(midx2, how="inner")
  164. expected = MultiIndex.from_arrays(
  165. [
  166. Series(["a", "a"], dtype=StringDtype()),
  167. Series(["a", "b"], dtype=StringDtype()),
  168. Series(["c", "c"], dtype=StringDtype()),
  169. ],
  170. names=["a", "b", "c"],
  171. )
  172. tm.assert_index_equal(result, expected)
  173. def test_join_multi_with_nan():
  174. # GH29252
  175. df1 = DataFrame(
  176. data={"col1": [1.1, 1.2]},
  177. index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
  178. )
  179. df2 = DataFrame(
  180. data={"col2": [2.1, 2.2]},
  181. index=MultiIndex.from_product([["A"], [np.NaN, 2.0]], names=["id1", "id2"]),
  182. )
  183. result = df1.join(df2)
  184. expected = DataFrame(
  185. data={"col1": [1.1, 1.2], "col2": [np.nan, 2.2]},
  186. index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
  187. )
  188. tm.assert_frame_equal(result, expected)
  189. @pytest.mark.parametrize("val", [0, 5])
  190. def test_join_dtypes(any_numeric_ea_dtype, val):
  191. # GH#49830
  192. midx = MultiIndex.from_arrays([Series([1, 2], dtype=any_numeric_ea_dtype), [3, 4]])
  193. midx2 = MultiIndex.from_arrays(
  194. [Series([1, val, val], dtype=any_numeric_ea_dtype), [3, 4, 4]]
  195. )
  196. result = midx.join(midx2, how="outer")
  197. expected = MultiIndex.from_arrays(
  198. [Series([val, val, 1, 2], dtype=any_numeric_ea_dtype), [4, 4, 3, 4]]
  199. ).sort_values()
  200. tm.assert_index_equal(result, expected)
  201. def test_join_dtypes_all_nan(any_numeric_ea_dtype):
  202. # GH#49830
  203. midx = MultiIndex.from_arrays(
  204. [Series([1, 2], dtype=any_numeric_ea_dtype), [np.nan, np.nan]]
  205. )
  206. midx2 = MultiIndex.from_arrays(
  207. [Series([1, 0, 0], dtype=any_numeric_ea_dtype), [np.nan, np.nan, np.nan]]
  208. )
  209. result = midx.join(midx2, how="outer")
  210. expected = MultiIndex.from_arrays(
  211. [
  212. Series([0, 0, 1, 2], dtype=any_numeric_ea_dtype),
  213. [np.nan, np.nan, np.nan, np.nan],
  214. ]
  215. )
  216. tm.assert_index_equal(result, expected)
  217. def test_join_index_levels():
  218. # GH#53093
  219. midx = midx = MultiIndex.from_tuples([("a", "2019-02-01"), ("a", "2019-02-01")])
  220. midx2 = MultiIndex.from_tuples([("a", "2019-01-31")])
  221. result = midx.join(midx2, how="outer")
  222. expected = MultiIndex.from_tuples(
  223. [("a", "2019-01-31"), ("a", "2019-02-01"), ("a", "2019-02-01")]
  224. )
  225. tm.assert_index_equal(result.levels[1], expected.levels[1])
  226. tm.assert_index_equal(result, expected)