test_join.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390
  1. import numpy as np
  2. import pytest
  3. from pandas._libs import join as libjoin
  4. from pandas._libs.join import (
  5. inner_join,
  6. left_outer_join,
  7. )
  8. import pandas._testing as tm
  9. class TestIndexer:
  10. @pytest.mark.parametrize(
  11. "dtype", ["int32", "int64", "float32", "float64", "object"]
  12. )
  13. def test_outer_join_indexer(self, dtype):
  14. indexer = libjoin.outer_join_indexer
  15. left = np.arange(3, dtype=dtype)
  16. right = np.arange(2, 5, dtype=dtype)
  17. empty = np.array([], dtype=dtype)
  18. result, lindexer, rindexer = indexer(left, right)
  19. assert isinstance(result, np.ndarray)
  20. assert isinstance(lindexer, np.ndarray)
  21. assert isinstance(rindexer, np.ndarray)
  22. tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype))
  23. exp = np.array([0, 1, 2, -1, -1], dtype=np.intp)
  24. tm.assert_numpy_array_equal(lindexer, exp)
  25. exp = np.array([-1, -1, 0, 1, 2], dtype=np.intp)
  26. tm.assert_numpy_array_equal(rindexer, exp)
  27. result, lindexer, rindexer = indexer(empty, right)
  28. tm.assert_numpy_array_equal(result, right)
  29. exp = np.array([-1, -1, -1], dtype=np.intp)
  30. tm.assert_numpy_array_equal(lindexer, exp)
  31. exp = np.array([0, 1, 2], dtype=np.intp)
  32. tm.assert_numpy_array_equal(rindexer, exp)
  33. result, lindexer, rindexer = indexer(left, empty)
  34. tm.assert_numpy_array_equal(result, left)
  35. exp = np.array([0, 1, 2], dtype=np.intp)
  36. tm.assert_numpy_array_equal(lindexer, exp)
  37. exp = np.array([-1, -1, -1], dtype=np.intp)
  38. tm.assert_numpy_array_equal(rindexer, exp)
  39. def test_cython_left_outer_join(self):
  40. left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
  41. right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
  42. max_group = 5
  43. ls, rs = left_outer_join(left, right, max_group)
  44. exp_ls = left.argsort(kind="mergesort")
  45. exp_rs = right.argsort(kind="mergesort")
  46. exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
  47. exp_ri = np.array(
  48. [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]
  49. )
  50. exp_ls = exp_ls.take(exp_li)
  51. exp_ls[exp_li == -1] = -1
  52. exp_rs = exp_rs.take(exp_ri)
  53. exp_rs[exp_ri == -1] = -1
  54. tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
  55. tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
  56. def test_cython_right_outer_join(self):
  57. left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
  58. right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
  59. max_group = 5
  60. rs, ls = left_outer_join(right, left, max_group)
  61. exp_ls = left.argsort(kind="mergesort")
  62. exp_rs = right.argsort(kind="mergesort")
  63. # 0 1 1 1
  64. exp_li = np.array(
  65. [
  66. 0,
  67. 1,
  68. 2,
  69. 3,
  70. 4,
  71. 5,
  72. 3,
  73. 4,
  74. 5,
  75. 3,
  76. 4,
  77. 5,
  78. # 2 2 4
  79. 6,
  80. 7,
  81. 8,
  82. 6,
  83. 7,
  84. 8,
  85. -1,
  86. ]
  87. )
  88. exp_ri = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])
  89. exp_ls = exp_ls.take(exp_li)
  90. exp_ls[exp_li == -1] = -1
  91. exp_rs = exp_rs.take(exp_ri)
  92. exp_rs[exp_ri == -1] = -1
  93. tm.assert_numpy_array_equal(ls, exp_ls)
  94. tm.assert_numpy_array_equal(rs, exp_rs)
  95. def test_cython_inner_join(self):
  96. left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
  97. right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.intp)
  98. max_group = 5
  99. ls, rs = inner_join(left, right, max_group)
  100. exp_ls = left.argsort(kind="mergesort")
  101. exp_rs = right.argsort(kind="mergesort")
  102. exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
  103. exp_ri = np.array([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])
  104. exp_ls = exp_ls.take(exp_li)
  105. exp_ls[exp_li == -1] = -1
  106. exp_rs = exp_rs.take(exp_ri)
  107. exp_rs[exp_ri == -1] = -1
  108. tm.assert_numpy_array_equal(ls, exp_ls)
  109. tm.assert_numpy_array_equal(rs, exp_rs)
  110. @pytest.mark.parametrize("readonly", [True, False])
  111. def test_left_join_indexer_unique(readonly):
  112. a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
  113. b = np.array([2, 2, 3, 4, 4], dtype=np.int64)
  114. if readonly:
  115. # GH#37312, GH#37264
  116. a.setflags(write=False)
  117. b.setflags(write=False)
  118. result = libjoin.left_join_indexer_unique(b, a)
  119. expected = np.array([1, 1, 2, 3, 3], dtype=np.intp)
  120. tm.assert_numpy_array_equal(result, expected)
  121. def test_left_outer_join_bug():
  122. left = np.array(
  123. [
  124. 0,
  125. 1,
  126. 0,
  127. 1,
  128. 1,
  129. 2,
  130. 3,
  131. 1,
  132. 0,
  133. 2,
  134. 1,
  135. 2,
  136. 0,
  137. 1,
  138. 1,
  139. 2,
  140. 3,
  141. 2,
  142. 3,
  143. 2,
  144. 1,
  145. 1,
  146. 3,
  147. 0,
  148. 3,
  149. 2,
  150. 3,
  151. 0,
  152. 0,
  153. 2,
  154. 3,
  155. 2,
  156. 0,
  157. 3,
  158. 1,
  159. 3,
  160. 0,
  161. 1,
  162. 3,
  163. 0,
  164. 0,
  165. 1,
  166. 0,
  167. 3,
  168. 1,
  169. 0,
  170. 1,
  171. 0,
  172. 1,
  173. 1,
  174. 0,
  175. 2,
  176. 2,
  177. 2,
  178. 2,
  179. 2,
  180. 0,
  181. 3,
  182. 1,
  183. 2,
  184. 0,
  185. 0,
  186. 3,
  187. 1,
  188. 3,
  189. 2,
  190. 2,
  191. 0,
  192. 1,
  193. 3,
  194. 0,
  195. 2,
  196. 3,
  197. 2,
  198. 3,
  199. 3,
  200. 2,
  201. 3,
  202. 3,
  203. 1,
  204. 3,
  205. 2,
  206. 0,
  207. 0,
  208. 3,
  209. 1,
  210. 1,
  211. 1,
  212. 0,
  213. 2,
  214. 3,
  215. 3,
  216. 1,
  217. 2,
  218. 0,
  219. 3,
  220. 1,
  221. 2,
  222. 0,
  223. 2,
  224. ],
  225. dtype=np.intp,
  226. )
  227. right = np.array([3, 1], dtype=np.intp)
  228. max_groups = 4
  229. lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False)
  230. exp_lidx = np.arange(len(left), dtype=np.intp)
  231. exp_ridx = -np.ones(len(left), dtype=np.intp)
  232. exp_ridx[left == 1] = 1
  233. exp_ridx[left == 3] = 0
  234. tm.assert_numpy_array_equal(lidx, exp_lidx)
  235. tm.assert_numpy_array_equal(ridx, exp_ridx)
  236. def test_inner_join_indexer():
  237. a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
  238. b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
  239. index, ares, bres = libjoin.inner_join_indexer(a, b)
  240. index_exp = np.array([3, 5], dtype=np.int64)
  241. tm.assert_almost_equal(index, index_exp)
  242. aexp = np.array([2, 4], dtype=np.intp)
  243. bexp = np.array([1, 2], dtype=np.intp)
  244. tm.assert_almost_equal(ares, aexp)
  245. tm.assert_almost_equal(bres, bexp)
  246. a = np.array([5], dtype=np.int64)
  247. b = np.array([5], dtype=np.int64)
  248. index, ares, bres = libjoin.inner_join_indexer(a, b)
  249. tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
  250. tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
  251. tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
  252. def test_outer_join_indexer():
  253. a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
  254. b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
  255. index, ares, bres = libjoin.outer_join_indexer(a, b)
  256. index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64)
  257. tm.assert_almost_equal(index, index_exp)
  258. aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.intp)
  259. bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp)
  260. tm.assert_almost_equal(ares, aexp)
  261. tm.assert_almost_equal(bres, bexp)
  262. a = np.array([5], dtype=np.int64)
  263. b = np.array([5], dtype=np.int64)
  264. index, ares, bres = libjoin.outer_join_indexer(a, b)
  265. tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
  266. tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
  267. tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
  268. def test_left_join_indexer():
  269. a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
  270. b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
  271. index, ares, bres = libjoin.left_join_indexer(a, b)
  272. tm.assert_almost_equal(index, a)
  273. aexp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
  274. bexp = np.array([-1, -1, 1, -1, 2], dtype=np.intp)
  275. tm.assert_almost_equal(ares, aexp)
  276. tm.assert_almost_equal(bres, bexp)
  277. a = np.array([5], dtype=np.int64)
  278. b = np.array([5], dtype=np.int64)
  279. index, ares, bres = libjoin.left_join_indexer(a, b)
  280. tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
  281. tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
  282. tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
  283. def test_left_join_indexer2():
  284. idx = np.array([1, 1, 2, 5], dtype=np.int64)
  285. idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)
  286. res, lidx, ridx = libjoin.left_join_indexer(idx2, idx)
  287. exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)
  288. tm.assert_almost_equal(res, exp_res)
  289. exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
  290. tm.assert_almost_equal(lidx, exp_lidx)
  291. exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
  292. tm.assert_almost_equal(ridx, exp_ridx)
  293. def test_outer_join_indexer2():
  294. idx = np.array([1, 1, 2, 5], dtype=np.int64)
  295. idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)
  296. res, lidx, ridx = libjoin.outer_join_indexer(idx2, idx)
  297. exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)
  298. tm.assert_almost_equal(res, exp_res)
  299. exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
  300. tm.assert_almost_equal(lidx, exp_lidx)
  301. exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
  302. tm.assert_almost_equal(ridx, exp_ridx)
  303. def test_inner_join_indexer2():
  304. idx = np.array([1, 1, 2, 5], dtype=np.int64)
  305. idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)
  306. res, lidx, ridx = libjoin.inner_join_indexer(idx2, idx)
  307. exp_res = np.array([1, 1, 2, 5], dtype=np.int64)
  308. tm.assert_almost_equal(res, exp_res)
  309. exp_lidx = np.array([0, 0, 1, 2], dtype=np.intp)
  310. tm.assert_almost_equal(lidx, exp_lidx)
  311. exp_ridx = np.array([0, 1, 2, 3], dtype=np.intp)
  312. tm.assert_almost_equal(ridx, exp_ridx)