123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390 |
- import numpy as np
- import pytest
- from pandas._libs import join as libjoin
- from pandas._libs.join import (
- inner_join,
- left_outer_join,
- )
- import pandas._testing as tm
- class TestIndexer:
- @pytest.mark.parametrize(
- "dtype", ["int32", "int64", "float32", "float64", "object"]
- )
- def test_outer_join_indexer(self, dtype):
- indexer = libjoin.outer_join_indexer
- left = np.arange(3, dtype=dtype)
- right = np.arange(2, 5, dtype=dtype)
- empty = np.array([], dtype=dtype)
- result, lindexer, rindexer = indexer(left, right)
- assert isinstance(result, np.ndarray)
- assert isinstance(lindexer, np.ndarray)
- assert isinstance(rindexer, np.ndarray)
- tm.assert_numpy_array_equal(result, np.arange(5, dtype=dtype))
- exp = np.array([0, 1, 2, -1, -1], dtype=np.intp)
- tm.assert_numpy_array_equal(lindexer, exp)
- exp = np.array([-1, -1, 0, 1, 2], dtype=np.intp)
- tm.assert_numpy_array_equal(rindexer, exp)
- result, lindexer, rindexer = indexer(empty, right)
- tm.assert_numpy_array_equal(result, right)
- exp = np.array([-1, -1, -1], dtype=np.intp)
- tm.assert_numpy_array_equal(lindexer, exp)
- exp = np.array([0, 1, 2], dtype=np.intp)
- tm.assert_numpy_array_equal(rindexer, exp)
- result, lindexer, rindexer = indexer(left, empty)
- tm.assert_numpy_array_equal(result, left)
- exp = np.array([0, 1, 2], dtype=np.intp)
- tm.assert_numpy_array_equal(lindexer, exp)
- exp = np.array([-1, -1, -1], dtype=np.intp)
- tm.assert_numpy_array_equal(rindexer, exp)
- def test_cython_left_outer_join(self):
- left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
- right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
- max_group = 5
- ls, rs = left_outer_join(left, right, max_group)
- exp_ls = left.argsort(kind="mergesort")
- exp_rs = right.argsort(kind="mergesort")
- exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8, 9, 10])
- exp_ri = np.array(
- [0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5, -1, -1]
- )
- exp_ls = exp_ls.take(exp_li)
- exp_ls[exp_li == -1] = -1
- exp_rs = exp_rs.take(exp_ri)
- exp_rs[exp_ri == -1] = -1
- tm.assert_numpy_array_equal(ls, exp_ls, check_dtype=False)
- tm.assert_numpy_array_equal(rs, exp_rs, check_dtype=False)
- def test_cython_right_outer_join(self):
- left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
- right = np.array([1, 1, 0, 4, 2, 2, 1], dtype=np.intp)
- max_group = 5
- rs, ls = left_outer_join(right, left, max_group)
- exp_ls = left.argsort(kind="mergesort")
- exp_rs = right.argsort(kind="mergesort")
- # 0 1 1 1
- exp_li = np.array(
- [
- 0,
- 1,
- 2,
- 3,
- 4,
- 5,
- 3,
- 4,
- 5,
- 3,
- 4,
- 5,
- # 2 2 4
- 6,
- 7,
- 8,
- 6,
- 7,
- 8,
- -1,
- ]
- )
- exp_ri = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6])
- exp_ls = exp_ls.take(exp_li)
- exp_ls[exp_li == -1] = -1
- exp_rs = exp_rs.take(exp_ri)
- exp_rs[exp_ri == -1] = -1
- tm.assert_numpy_array_equal(ls, exp_ls)
- tm.assert_numpy_array_equal(rs, exp_rs)
- def test_cython_inner_join(self):
- left = np.array([0, 1, 2, 1, 2, 0, 0, 1, 2, 3, 3], dtype=np.intp)
- right = np.array([1, 1, 0, 4, 2, 2, 1, 4], dtype=np.intp)
- max_group = 5
- ls, rs = inner_join(left, right, max_group)
- exp_ls = left.argsort(kind="mergesort")
- exp_rs = right.argsort(kind="mergesort")
- exp_li = np.array([0, 1, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 7, 7, 8, 8])
- exp_ri = np.array([0, 0, 0, 1, 2, 3, 1, 2, 3, 1, 2, 3, 4, 5, 4, 5, 4, 5])
- exp_ls = exp_ls.take(exp_li)
- exp_ls[exp_li == -1] = -1
- exp_rs = exp_rs.take(exp_ri)
- exp_rs[exp_ri == -1] = -1
- tm.assert_numpy_array_equal(ls, exp_ls)
- tm.assert_numpy_array_equal(rs, exp_rs)
- @pytest.mark.parametrize("readonly", [True, False])
- def test_left_join_indexer_unique(readonly):
- a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
- b = np.array([2, 2, 3, 4, 4], dtype=np.int64)
- if readonly:
- # GH#37312, GH#37264
- a.setflags(write=False)
- b.setflags(write=False)
- result = libjoin.left_join_indexer_unique(b, a)
- expected = np.array([1, 1, 2, 3, 3], dtype=np.intp)
- tm.assert_numpy_array_equal(result, expected)
- def test_left_outer_join_bug():
- left = np.array(
- [
- 0,
- 1,
- 0,
- 1,
- 1,
- 2,
- 3,
- 1,
- 0,
- 2,
- 1,
- 2,
- 0,
- 1,
- 1,
- 2,
- 3,
- 2,
- 3,
- 2,
- 1,
- 1,
- 3,
- 0,
- 3,
- 2,
- 3,
- 0,
- 0,
- 2,
- 3,
- 2,
- 0,
- 3,
- 1,
- 3,
- 0,
- 1,
- 3,
- 0,
- 0,
- 1,
- 0,
- 3,
- 1,
- 0,
- 1,
- 0,
- 1,
- 1,
- 0,
- 2,
- 2,
- 2,
- 2,
- 2,
- 0,
- 3,
- 1,
- 2,
- 0,
- 0,
- 3,
- 1,
- 3,
- 2,
- 2,
- 0,
- 1,
- 3,
- 0,
- 2,
- 3,
- 2,
- 3,
- 3,
- 2,
- 3,
- 3,
- 1,
- 3,
- 2,
- 0,
- 0,
- 3,
- 1,
- 1,
- 1,
- 0,
- 2,
- 3,
- 3,
- 1,
- 2,
- 0,
- 3,
- 1,
- 2,
- 0,
- 2,
- ],
- dtype=np.intp,
- )
- right = np.array([3, 1], dtype=np.intp)
- max_groups = 4
- lidx, ridx = libjoin.left_outer_join(left, right, max_groups, sort=False)
- exp_lidx = np.arange(len(left), dtype=np.intp)
- exp_ridx = -np.ones(len(left), dtype=np.intp)
- exp_ridx[left == 1] = 1
- exp_ridx[left == 3] = 0
- tm.assert_numpy_array_equal(lidx, exp_lidx)
- tm.assert_numpy_array_equal(ridx, exp_ridx)
- def test_inner_join_indexer():
- a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
- b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
- index, ares, bres = libjoin.inner_join_indexer(a, b)
- index_exp = np.array([3, 5], dtype=np.int64)
- tm.assert_almost_equal(index, index_exp)
- aexp = np.array([2, 4], dtype=np.intp)
- bexp = np.array([1, 2], dtype=np.intp)
- tm.assert_almost_equal(ares, aexp)
- tm.assert_almost_equal(bres, bexp)
- a = np.array([5], dtype=np.int64)
- b = np.array([5], dtype=np.int64)
- index, ares, bres = libjoin.inner_join_indexer(a, b)
- tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
- tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
- tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
- def test_outer_join_indexer():
- a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
- b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
- index, ares, bres = libjoin.outer_join_indexer(a, b)
- index_exp = np.array([0, 1, 2, 3, 4, 5, 7, 9], dtype=np.int64)
- tm.assert_almost_equal(index, index_exp)
- aexp = np.array([-1, 0, 1, 2, 3, 4, -1, -1], dtype=np.intp)
- bexp = np.array([0, -1, -1, 1, -1, 2, 3, 4], dtype=np.intp)
- tm.assert_almost_equal(ares, aexp)
- tm.assert_almost_equal(bres, bexp)
- a = np.array([5], dtype=np.int64)
- b = np.array([5], dtype=np.int64)
- index, ares, bres = libjoin.outer_join_indexer(a, b)
- tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
- tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
- tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
- def test_left_join_indexer():
- a = np.array([1, 2, 3, 4, 5], dtype=np.int64)
- b = np.array([0, 3, 5, 7, 9], dtype=np.int64)
- index, ares, bres = libjoin.left_join_indexer(a, b)
- tm.assert_almost_equal(index, a)
- aexp = np.array([0, 1, 2, 3, 4], dtype=np.intp)
- bexp = np.array([-1, -1, 1, -1, 2], dtype=np.intp)
- tm.assert_almost_equal(ares, aexp)
- tm.assert_almost_equal(bres, bexp)
- a = np.array([5], dtype=np.int64)
- b = np.array([5], dtype=np.int64)
- index, ares, bres = libjoin.left_join_indexer(a, b)
- tm.assert_numpy_array_equal(index, np.array([5], dtype=np.int64))
- tm.assert_numpy_array_equal(ares, np.array([0], dtype=np.intp))
- tm.assert_numpy_array_equal(bres, np.array([0], dtype=np.intp))
- def test_left_join_indexer2():
- idx = np.array([1, 1, 2, 5], dtype=np.int64)
- idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)
- res, lidx, ridx = libjoin.left_join_indexer(idx2, idx)
- exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)
- tm.assert_almost_equal(res, exp_res)
- exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
- tm.assert_almost_equal(lidx, exp_lidx)
- exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
- tm.assert_almost_equal(ridx, exp_ridx)
- def test_outer_join_indexer2():
- idx = np.array([1, 1, 2, 5], dtype=np.int64)
- idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)
- res, lidx, ridx = libjoin.outer_join_indexer(idx2, idx)
- exp_res = np.array([1, 1, 2, 5, 7, 9], dtype=np.int64)
- tm.assert_almost_equal(res, exp_res)
- exp_lidx = np.array([0, 0, 1, 2, 3, 4], dtype=np.intp)
- tm.assert_almost_equal(lidx, exp_lidx)
- exp_ridx = np.array([0, 1, 2, 3, -1, -1], dtype=np.intp)
- tm.assert_almost_equal(ridx, exp_ridx)
- def test_inner_join_indexer2():
- idx = np.array([1, 1, 2, 5], dtype=np.int64)
- idx2 = np.array([1, 2, 5, 7, 9], dtype=np.int64)
- res, lidx, ridx = libjoin.inner_join_indexer(idx2, idx)
- exp_res = np.array([1, 1, 2, 5], dtype=np.int64)
- tm.assert_almost_equal(res, exp_res)
- exp_lidx = np.array([0, 0, 1, 2], dtype=np.intp)
- tm.assert_almost_equal(lidx, exp_lidx)
- exp_ridx = np.array([0, 1, 2, 3], dtype=np.intp)
- tm.assert_almost_equal(ridx, exp_ridx)
|