123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271 |
- import numpy as np
- import pytest
- from pandas import (
- DataFrame,
- Index,
- Interval,
- MultiIndex,
- Series,
- StringDtype,
- )
- import pandas._testing as tm
@pytest.mark.parametrize(
    "other", [Index(["three", "one", "two"]), Index(["one"]), Index(["one", "three"])]
)
def test_join_level(idx, other, join_type):
    # Join a flat Index against the "second" level of ``idx`` and check the
    # resulting levels and values; for "outer"/"inner" also check that the
    # flipped call (``idx.join(other)``) agrees.
    joined, other_indexer, idx_indexer = other.join(
        idx, level="second", how=join_type, return_indexers=True
    )

    expected_level = other.join(idx.levels[1], how=join_type)
    assert joined.levels[0].equals(idx.levels[0])
    assert joined.levels[1].equals(expected_level)

    # pare down levels: keep the entries whose second element is in the
    # expected level
    keep = np.array([entry[1] in expected_level for entry in idx], dtype=bool)
    expected_values = idx.values[keep]
    tm.assert_numpy_array_equal(joined.values, expected_values)

    if join_type not in ("outer", "inner"):
        return

    # flipped call: the indexers come back in swapped order
    flipped, flipped_idx_indexer, flipped_other_indexer = idx.join(
        other, level="second", how=join_type, return_indexers=True
    )

    assert joined.equals(flipped)
    tm.assert_numpy_array_equal(other_indexer, flipped_other_indexer)
    tm.assert_numpy_array_equal(idx_indexer, flipped_idx_indexer)
    tm.assert_numpy_array_equal(flipped.values, expected_values)
def test_join_level_corner_case(idx):
    # some corner cases
    # flat Index joined on a named level must produce a MultiIndex
    flat = Index(["three", "one", "two"])
    joined = flat.join(idx, level="second")
    assert isinstance(joined, MultiIndex)

    # a level join between two MultiIndexes is expected to raise
    ambiguous_msg = "Join.*MultiIndex.*ambiguous"
    with pytest.raises(TypeError, match=ambiguous_msg):
        idx.join(idx, level=1)
def test_join_self(idx, join_type):
    # joining an index with itself must give back an equal index
    self_joined = idx.join(idx, how=join_type)
    tm.assert_index_equal(self_joined, idx)
def test_join_multi():
    # GH 10665
    # Join a two-level MultiIndex with a flat Index named after its "b" level,
    # in both call directions.
    multi = MultiIndex.from_product([np.arange(4), np.arange(4)], names=["a", "b"])
    flat = Index([1, 2, 5], name="b")

    # inner
    inner_expected = MultiIndex.from_product(
        [np.arange(4), [1, 2]], names=["a", "b"]
    )
    inner_multi_pos = np.array([1, 2, 5, 6, 9, 10, 13, 14], dtype=np.intp)
    inner_flat_pos = np.array([0, 1, 0, 1, 0, 1, 0, 1], dtype=np.intp)

    joined, multi_pos, flat_pos = multi.join(flat, how="inner", return_indexers=True)
    tm.assert_index_equal(joined, inner_expected)
    tm.assert_numpy_array_equal(multi_pos, inner_multi_pos)
    tm.assert_numpy_array_equal(flat_pos, inner_flat_pos)

    # flip: the caller's indexer is returned first
    joined, flat_pos, multi_pos = flat.join(multi, how="inner", return_indexers=True)
    tm.assert_index_equal(joined, inner_expected)
    tm.assert_numpy_array_equal(multi_pos, inner_multi_pos)
    tm.assert_numpy_array_equal(flat_pos, inner_flat_pos)

    # keep MultiIndex
    outer_flat_pos = np.array([-1, 0, 1, -1] * 4, dtype=np.intp)

    joined, multi_pos, flat_pos = multi.join(flat, how="left", return_indexers=True)
    tm.assert_index_equal(joined, multi)
    assert multi_pos is None
    tm.assert_numpy_array_equal(flat_pos, outer_flat_pos)

    # flip
    joined, flat_pos, multi_pos = flat.join(multi, how="right", return_indexers=True)
    tm.assert_index_equal(joined, multi)
    assert multi_pos is None
    tm.assert_numpy_array_equal(flat_pos, outer_flat_pos)
def test_join_self_unique(idx, join_type):
    # only a unique index is checked; nothing is asserted otherwise
    if not idx.is_unique:
        return
    self_joined = idx.join(idx, how=join_type)
    elementwise_equal = idx == self_joined
    assert elementwise_equal.all()
def test_join_multi_wrong_order():
    # GH 25760
    # GH 28956
    # Two MultiIndexes built from the same values whose level names are in
    # swapped order.
    level_names = ["a", "b"]
    left = MultiIndex.from_product([[1, 2], [3, 4]], names=level_names)
    right = MultiIndex.from_product([[1, 2], [3, 4]], names=level_names[::-1])

    joined, left_indexer, right_indexer = left.join(right, return_indexers=True)

    expected_right_indexer = np.full(4, -1, dtype=np.intp)

    tm.assert_index_equal(left, joined)
    assert left_indexer is None
    tm.assert_numpy_array_equal(right_indexer, expected_right_indexer)
def test_join_multi_return_indexers():
    # GH 34074
    # With return_indexers=False the join result is compared directly
    # against the three-level caller.
    three_level = MultiIndex.from_product(
        [[1, 2], [3, 4], [5, 6]], names=["a", "b", "c"]
    )
    two_level = MultiIndex.from_product([[1, 2], [3, 4]], names=["a", "b"])

    joined = three_level.join(two_level, return_indexers=False)
    tm.assert_index_equal(joined, three_level)
def test_join_overlapping_interval_level():
    # GH 44096
    # Outer-join two MultiIndexes holding the same (num, Interval) entries in
    # a different order; the expected result has the order of the first one.
    level_names = ["num", "interval"]
    ordered_entries = [
        (1, Interval(0.0, 1.0)),
        (1, Interval(1.0, 2.0)),
        (1, Interval(2.0, 5.0)),
        (2, Interval(0.0, 1.0)),
        (2, Interval(1.0, 3.0)),  # interval limit is here at 3.0, not at 2.0
        (2, Interval(3.0, 5.0)),
    ]
    # same entries, rotated within each "num" group
    rotated_entries = [ordered_entries[pos] for pos in (2, 0, 1, 5, 3, 4)]

    idx_1 = MultiIndex.from_tuples(ordered_entries, names=level_names)
    idx_2 = MultiIndex.from_tuples(rotated_entries, names=level_names)
    expected = MultiIndex.from_tuples(ordered_entries, names=level_names)

    result = idx_1.join(idx_2, how="outer")
    tm.assert_index_equal(result, expected)
def test_join_midx_ea():
    # GH#49277
    # Inner join of MultiIndexes with "Int64" levels that share only level "a".
    def int64(values):
        return Series(values, dtype="Int64")

    left = MultiIndex.from_arrays(
        [int64([1, 1, 3]), int64([1, 2, 3])],
        names=["a", "b"],
    )
    right = MultiIndex.from_arrays([int64([1]), int64([3])], names=["a", "c"])

    joined = left.join(right, how="inner")

    expected = MultiIndex.from_arrays(
        [int64([1, 1]), int64([1, 2]), int64([3, 3])],
        names=["a", "b", "c"],
    )
    tm.assert_index_equal(joined, expected)
def test_join_midx_string():
    # GH#49277
    # Inner join of MultiIndexes with StringDtype levels that share only
    # level "a".
    def strings(values):
        return Series(values, dtype=StringDtype())

    left = MultiIndex.from_arrays(
        [strings(["a", "a", "c"]), strings(["a", "b", "c"])],
        names=["a", "b"],
    )
    right = MultiIndex.from_arrays(
        [strings(["a"]), strings(["c"])],
        names=["a", "c"],
    )

    joined = left.join(right, how="inner")

    expected = MultiIndex.from_arrays(
        [strings(["a", "a"]), strings(["a", "b"]), strings(["c", "c"])],
        names=["a", "b", "c"],
    )
    tm.assert_index_equal(joined, expected)
def test_join_multi_with_nan():
    # GH29252
    # DataFrame.join on MultiIndexes where the right index contains NaN in
    # its second level; the result keeps the left index and leaves col2
    # missing for the left row that has no match.
    df1 = DataFrame(
        data={"col1": [1.1, 1.2]},
        index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
    )
    df2 = DataFrame(
        data={"col2": [2.1, 2.2]},
        # np.nan, not the np.NaN alias: the alias was removed in NumPy 2.0
        index=MultiIndex.from_product([["A"], [np.nan, 2.0]], names=["id1", "id2"]),
    )
    result = df1.join(df2)
    expected = DataFrame(
        data={"col1": [1.1, 1.2], "col2": [np.nan, 2.2]},
        index=MultiIndex.from_product([["A"], [1.0, 2.0]], names=["id1", "id2"]),
    )
    tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("val", [0, 5])
def test_join_dtypes(any_numeric_ea_dtype, val):
    # GH#49830
    # Outer join where the first level uses the dtype supplied by the
    # ``any_numeric_ea_dtype`` fixture; compared against the sorted expected
    # index.
    def typed(values):
        return Series(values, dtype=any_numeric_ea_dtype)

    left = MultiIndex.from_arrays([typed([1, 2]), [3, 4]])
    right = MultiIndex.from_arrays([typed([1, val, val]), [3, 4, 4]])

    joined = left.join(right, how="outer")

    unsorted_expected = MultiIndex.from_arrays(
        [typed([val, val, 1, 2]), [4, 4, 3, 4]]
    )
    expected = unsorted_expected.sort_values()
    tm.assert_index_equal(joined, expected)
def test_join_dtypes_all_nan(any_numeric_ea_dtype):
    # GH#49830
    # Outer join where the second level consists only of NaN values.
    def typed(values):
        return Series(values, dtype=any_numeric_ea_dtype)

    left = MultiIndex.from_arrays([typed([1, 2]), [np.nan] * 2])
    right = MultiIndex.from_arrays([typed([1, 0, 0]), [np.nan] * 3])

    joined = left.join(right, how="outer")

    expected = MultiIndex.from_arrays([typed([0, 0, 1, 2]), [np.nan] * 4])
    tm.assert_index_equal(joined, expected)
- def test_join_index_levels():
- # GH#53093
- midx = midx = MultiIndex.from_tuples([("a", "2019-02-01"), ("a", "2019-02-01")])
- midx2 = MultiIndex.from_tuples([("a", "2019-01-31")])
- result = midx.join(midx2, how="outer")
- expected = MultiIndex.from_tuples(
- [("a", "2019-01-31"), ("a", "2019-02-01"), ("a", "2019-02-01")]
- )
- tm.assert_index_equal(result.levels[1], expected.levels[1])
- tm.assert_index_equal(result, expected)
|