test_join.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380
  1. import numpy as np
  2. import pytest
  3. import pandas._testing as tm
  4. from pandas.core.indexes.api import Index
  class TestJoinInt64Index:
      """Tests for ``Index.join`` where the calling index holds int64 values."""

      def test_join_non_unique(self):
          """Joining a non-unique index with itself pairs up every equal label."""
          left = Index([4, 4, 3, 3])

          joined, lidx, ridx = left.join(left, return_indexers=True)

          exp_joined = Index([3, 3, 3, 3, 4, 4, 4, 4])
          tm.assert_index_equal(joined, exp_joined)

          exp_lidx = np.array([2, 2, 3, 3, 0, 0, 1, 1], dtype=np.intp)
          tm.assert_numpy_array_equal(lidx, exp_lidx)

          exp_ridx = np.array([2, 3, 2, 3, 0, 1, 0, 1], dtype=np.intp)
          tm.assert_numpy_array_equal(ridx, exp_ridx)

      def test_join_inner(self):
          """Inner join against a non-monotonic and a monotonic ``other``."""
          index = Index(range(0, 20, 2), dtype=np.int64)
          other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64)
          other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64)

          # not monotonic
          res, lidx, ridx = index.join(other, how="inner", return_indexers=True)

          # no guarantee of sortedness, so sort for comparison purposes
          ind = res.argsort()
          res = res.take(ind)
          lidx = lidx.take(ind)
          ridx = ridx.take(ind)

          eres = Index([2, 12], dtype=np.int64)
          elidx = np.array([1, 6], dtype=np.intp)
          eridx = np.array([4, 1], dtype=np.intp)

          assert isinstance(res, Index) and res.dtype == np.int64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)

          # monotonic
          res, lidx, ridx = index.join(other_mono, how="inner", return_indexers=True)

          # the inner join must agree with a plain intersection
          res2 = index.intersection(other_mono)
          tm.assert_index_equal(res, res2)

          elidx = np.array([1, 6], dtype=np.intp)
          eridx = np.array([1, 4], dtype=np.intp)
          assert isinstance(res, Index) and res.dtype == np.int64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)

      def test_join_left(self):
          """Left join: result equals the caller, ``lidx`` is None, ``ridx`` uses -1
          for labels missing from ``other``; also covers a non-unique ``other``."""
          index = Index(range(0, 20, 2), dtype=np.int64)
          other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64)
          other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64)

          # not monotonic
          res, lidx, ridx = index.join(other, how="left", return_indexers=True)
          eres = index
          eridx = np.array([-1, 4, -1, -1, -1, -1, 1, -1, -1, -1], dtype=np.intp)

          assert isinstance(res, Index) and res.dtype == np.int64
          tm.assert_index_equal(res, eres)
          assert lidx is None
          tm.assert_numpy_array_equal(ridx, eridx)

          # monotonic
          res, lidx, ridx = index.join(other_mono, how="left", return_indexers=True)
          eridx = np.array([-1, 1, -1, -1, -1, -1, 4, -1, -1, -1], dtype=np.intp)
          assert isinstance(res, Index) and res.dtype == np.int64
          tm.assert_index_equal(res, eres)
          assert lidx is None
          tm.assert_numpy_array_equal(ridx, eridx)

          # non-unique
          idx = Index([1, 1, 2, 5])
          idx2 = Index([1, 2, 5, 7, 9])
          res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True)

          # 1 appears twice in idx, so it should appear x2 in the result
          eres = Index([1, 1, 2, 5, 7, 9])
          eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
          elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)

      def test_join_right(self):
          """Right join: result equals ``other``, ``ridx`` is None, ``lidx`` uses -1
          for labels missing from the caller; also covers a non-unique caller."""
          index = Index(range(0, 20, 2), dtype=np.int64)
          other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64)
          other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64)

          # not monotonic
          res, lidx, ridx = index.join(other, how="right", return_indexers=True)
          eres = other
          elidx = np.array([-1, 6, -1, -1, 1, -1], dtype=np.intp)

          # check the join *result*, not the input, so the assertion is meaningful
          assert isinstance(res, Index) and res.dtype == np.int64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          assert ridx is None

          # monotonic
          res, lidx, ridx = index.join(other_mono, how="right", return_indexers=True)
          eres = other_mono
          elidx = np.array([-1, 1, -1, -1, 6, -1], dtype=np.intp)
          assert isinstance(res, Index) and res.dtype == np.int64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          assert ridx is None

          # non-unique
          idx = Index([1, 1, 2, 5])
          idx2 = Index([1, 2, 5, 7, 9])
          res, lidx, ridx = idx.join(idx2, how="right", return_indexers=True)

          # 1 appears twice in idx, so it should appear x2 in the result
          eres = Index([1, 1, 2, 5, 7, 9])
          elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
          eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)

      def test_join_non_int_index(self):
          """Join an int64 index with an object-dtype index, in both directions."""
          index = Index(range(0, 20, 2), dtype=np.int64)
          other = Index([3, 6, 7, 8, 10], dtype=object)

          outer = index.join(other, how="outer")
          outer2 = other.join(index, how="outer")
          expected = Index([0, 2, 3, 4, 6, 7, 8, 10, 12, 14, 16, 18])
          tm.assert_index_equal(outer, outer2)
          tm.assert_index_equal(outer, expected)

          inner = index.join(other, how="inner")
          inner2 = other.join(index, how="inner")
          expected = Index([6, 8, 10])
          tm.assert_index_equal(inner, inner2)
          tm.assert_index_equal(inner, expected)

          left = index.join(other, how="left")
          tm.assert_index_equal(left, index.astype(object))

          left2 = other.join(index, how="left")
          tm.assert_index_equal(left2, other)

          right = index.join(other, how="right")
          tm.assert_index_equal(right, other)

          right2 = other.join(index, how="right")
          tm.assert_index_equal(right2, index.astype(object))

      def test_join_outer(self):
          """Outer join against a non-monotonic and a monotonic ``other``."""
          index = Index(range(0, 20, 2), dtype=np.int64)
          other = Index([7, 12, 25, 1, 2, 5], dtype=np.int64)
          other_mono = Index([1, 2, 5, 7, 12, 25], dtype=np.int64)

          # not monotonic
          # guarantee of sortedness
          res, lidx, ridx = index.join(other, how="outer", return_indexers=True)
          # requesting indexers must not change the joined labels
          noidx_res = index.join(other, how="outer")
          tm.assert_index_equal(res, noidx_res)

          eres = Index([0, 1, 2, 4, 5, 6, 7, 8, 10, 12, 14, 16, 18, 25], dtype=np.int64)
          elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], dtype=np.intp)
          eridx = np.array(
              [-1, 3, 4, -1, 5, -1, 0, -1, -1, 1, -1, -1, -1, 2], dtype=np.intp
          )

          assert isinstance(res, Index) and res.dtype == np.int64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)

          # monotonic
          res, lidx, ridx = index.join(other_mono, how="outer", return_indexers=True)
          noidx_res = index.join(other_mono, how="outer")
          tm.assert_index_equal(res, noidx_res)

          elidx = np.array([0, -1, 1, 2, -1, 3, -1, 4, 5, 6, 7, 8, 9, -1], dtype=np.intp)
          eridx = np.array(
              [-1, 0, 1, -1, 2, -1, 3, -1, -1, 4, -1, -1, -1, 5], dtype=np.intp
          )
          assert isinstance(res, Index) and res.dtype == np.int64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)
  class TestJoinUInt64Index:
      """Tests for ``Index.join`` where the calling index holds uint64 values."""

      @pytest.fixture
      def index_large(self):
          """Return a uint64 index whose values all sit at or above 2**63."""
          # large values used in TestUInt64Index where no compat needed with int64/float64
          large = [2**63, 2**63 + 10, 2**63 + 15, 2**63 + 20, 2**63 + 25]
          return Index(large, dtype=np.uint64)

      def test_join_inner(self, index_large):
          """Inner join against a non-monotonic and a monotonic ``other``."""
          other = Index(2**63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64"))
          other_mono = Index(2**63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64"))

          # not monotonic
          res, lidx, ridx = index_large.join(other, how="inner", return_indexers=True)

          # no guarantee of sortedness, so sort for comparison purposes
          ind = res.argsort()
          res = res.take(ind)
          lidx = lidx.take(ind)
          ridx = ridx.take(ind)

          eres = Index(2**63 + np.array([10, 25], dtype="uint64"))
          elidx = np.array([1, 4], dtype=np.intp)
          eridx = np.array([5, 2], dtype=np.intp)

          assert isinstance(res, Index) and res.dtype == np.uint64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)

          # monotonic
          res, lidx, ridx = index_large.join(
              other_mono, how="inner", return_indexers=True
          )

          # the inner join must agree with a plain intersection
          res2 = index_large.intersection(other_mono)
          tm.assert_index_equal(res, res2)

          elidx = np.array([1, 4], dtype=np.intp)
          eridx = np.array([3, 5], dtype=np.intp)

          assert isinstance(res, Index) and res.dtype == np.uint64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)

      def test_join_left(self, index_large):
          """Left join: result equals the caller, ``lidx`` is None, ``ridx`` uses -1
          for labels missing from ``other``; also covers a non-unique ``other``."""
          other = Index(2**63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64"))
          other_mono = Index(2**63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64"))

          # not monotonic
          res, lidx, ridx = index_large.join(other, how="left", return_indexers=True)
          eres = index_large
          eridx = np.array([-1, 5, -1, -1, 2], dtype=np.intp)

          assert isinstance(res, Index) and res.dtype == np.uint64
          tm.assert_index_equal(res, eres)
          assert lidx is None
          tm.assert_numpy_array_equal(ridx, eridx)

          # monotonic
          res, lidx, ridx = index_large.join(other_mono, how="left", return_indexers=True)
          eridx = np.array([-1, 3, -1, -1, 5], dtype=np.intp)

          assert isinstance(res, Index) and res.dtype == np.uint64
          tm.assert_index_equal(res, eres)
          assert lidx is None
          tm.assert_numpy_array_equal(ridx, eridx)

          # non-unique
          idx = Index(2**63 + np.array([1, 1, 2, 5], dtype="uint64"))
          idx2 = Index(2**63 + np.array([1, 2, 5, 7, 9], dtype="uint64"))
          res, lidx, ridx = idx2.join(idx, how="left", return_indexers=True)

          # 2**63 + 1 appears twice in idx, so it should appear x2 in the result
          eres = Index(2**63 + np.array([1, 1, 2, 5, 7, 9], dtype="uint64"))
          eridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
          elidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)

          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)

      def test_join_right(self, index_large):
          """Right join: result equals ``other``, ``ridx`` is None, ``lidx`` uses -1
          for labels missing from the caller; also covers a non-unique caller."""
          other = Index(2**63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64"))
          other_mono = Index(2**63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64"))

          # not monotonic
          res, lidx, ridx = index_large.join(other, how="right", return_indexers=True)
          eres = other
          elidx = np.array([-1, -1, 4, -1, -1, 1], dtype=np.intp)

          # check the join *result*, not the input, so the assertion is meaningful
          assert isinstance(res, Index) and res.dtype == np.uint64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          assert ridx is None

          # monotonic
          res, lidx, ridx = index_large.join(
              other_mono, how="right", return_indexers=True
          )
          eres = other_mono
          elidx = np.array([-1, -1, -1, 1, -1, 4], dtype=np.intp)

          assert isinstance(res, Index) and res.dtype == np.uint64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          assert ridx is None

          # non-unique
          idx = Index(2**63 + np.array([1, 1, 2, 5], dtype="uint64"))
          idx2 = Index(2**63 + np.array([1, 2, 5, 7, 9], dtype="uint64"))
          res, lidx, ridx = idx.join(idx2, how="right", return_indexers=True)

          # 2**63 + 1 appears twice in idx, so it should appear x2 in the result
          eres = Index(2**63 + np.array([1, 1, 2, 5, 7, 9], dtype="uint64"))
          elidx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
          eridx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)

          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)

      def test_join_non_int_index(self, index_large):
          """Join a uint64 index with an object-dtype index, in both directions."""
          other = Index(
              2**63 + np.array([1, 5, 7, 10, 20], dtype="uint64"), dtype=object
          )

          outer = index_large.join(other, how="outer")
          outer2 = other.join(index_large, how="outer")
          expected = Index(
              2**63 + np.array([0, 1, 5, 7, 10, 15, 20, 25], dtype="uint64")
          )
          tm.assert_index_equal(outer, outer2)
          tm.assert_index_equal(outer, expected)

          inner = index_large.join(other, how="inner")
          inner2 = other.join(index_large, how="inner")
          expected = Index(2**63 + np.array([10, 20], dtype="uint64"))
          tm.assert_index_equal(inner, inner2)
          tm.assert_index_equal(inner, expected)

          left = index_large.join(other, how="left")
          tm.assert_index_equal(left, index_large.astype(object))

          left2 = other.join(index_large, how="left")
          tm.assert_index_equal(left2, other)

          right = index_large.join(other, how="right")
          tm.assert_index_equal(right, other)

          right2 = other.join(index_large, how="right")
          tm.assert_index_equal(right2, index_large.astype(object))

      def test_join_outer(self, index_large):
          """Outer join against a non-monotonic and a monotonic ``other``."""
          other = Index(2**63 + np.array([7, 12, 25, 1, 2, 10], dtype="uint64"))
          other_mono = Index(2**63 + np.array([1, 2, 7, 10, 12, 25], dtype="uint64"))

          # not monotonic
          # guarantee of sortedness
          res, lidx, ridx = index_large.join(other, how="outer", return_indexers=True)
          # requesting indexers must not change the joined labels
          noidx_res = index_large.join(other, how="outer")
          tm.assert_index_equal(res, noidx_res)

          eres = Index(
              2**63 + np.array([0, 1, 2, 7, 10, 12, 15, 20, 25], dtype="uint64")
          )
          elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp)
          eridx = np.array([-1, 3, 4, 0, 5, 1, -1, -1, 2], dtype=np.intp)

          assert isinstance(res, Index) and res.dtype == np.uint64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)

          # monotonic
          res, lidx, ridx = index_large.join(
              other_mono, how="outer", return_indexers=True
          )
          noidx_res = index_large.join(other_mono, how="outer")
          tm.assert_index_equal(res, noidx_res)

          elidx = np.array([0, -1, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp)
          eridx = np.array([-1, 0, 1, 2, 3, 4, -1, -1, 5], dtype=np.intp)

          assert isinstance(res, Index) and res.dtype == np.uint64
          tm.assert_index_equal(res, eres)
          tm.assert_numpy_array_equal(lidx, elidx)
          tm.assert_numpy_array_equal(ridx, eridx)