123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994 |
- import numpy as np
- import pytest
- import pandas as pd
- from pandas import (
- Categorical,
- DataFrame,
- Index,
- MultiIndex,
- Series,
- Timestamp,
- concat,
- merge,
- )
- import pandas._testing as tm
- def get_test_data(ngroups=8, n=50):
- unique_groups = list(range(ngroups))
- arr = np.asarray(np.tile(unique_groups, n // ngroups))
- if len(arr) < n:
- arr = np.asarray(list(arr) + unique_groups[: n - len(arr)])
- np.random.shuffle(arr)
- return arr
- class TestJoin:
- # aggregate multiple columns
- @pytest.fixture
- def df(self):
- df = DataFrame(
- {
- "key1": get_test_data(),
- "key2": get_test_data(),
- "data1": np.random.randn(50),
- "data2": np.random.randn(50),
- }
- )
- # exclude a couple keys for fun
- df = df[df["key2"] > 1]
- return df
- @pytest.fixture
- def df2(self):
- return DataFrame(
- {
- "key1": get_test_data(n=10),
- "key2": get_test_data(ngroups=4, n=10),
- "value": np.random.randn(10),
- }
- )
- @pytest.fixture
- def target_source(self):
- index, data = tm.getMixedTypeDict()
- target = DataFrame(data, index=index)
- # Join on string value
- source = DataFrame(
- {"MergedA": data["A"], "MergedD": data["D"]}, index=data["C"]
- )
- return target, source
- def test_left_outer_join(self, df, df2):
- joined_key2 = merge(df, df2, on="key2")
- _check_join(df, df2, joined_key2, ["key2"], how="left")
- joined_both = merge(df, df2)
- _check_join(df, df2, joined_both, ["key1", "key2"], how="left")
- def test_right_outer_join(self, df, df2):
- joined_key2 = merge(df, df2, on="key2", how="right")
- _check_join(df, df2, joined_key2, ["key2"], how="right")
- joined_both = merge(df, df2, how="right")
- _check_join(df, df2, joined_both, ["key1", "key2"], how="right")
- def test_full_outer_join(self, df, df2):
- joined_key2 = merge(df, df2, on="key2", how="outer")
- _check_join(df, df2, joined_key2, ["key2"], how="outer")
- joined_both = merge(df, df2, how="outer")
- _check_join(df, df2, joined_both, ["key1", "key2"], how="outer")
- def test_inner_join(self, df, df2):
- joined_key2 = merge(df, df2, on="key2", how="inner")
- _check_join(df, df2, joined_key2, ["key2"], how="inner")
- joined_both = merge(df, df2, how="inner")
- _check_join(df, df2, joined_both, ["key1", "key2"], how="inner")
- def test_handle_overlap(self, df, df2):
- joined = merge(df, df2, on="key2", suffixes=(".foo", ".bar"))
- assert "key1.foo" in joined
- assert "key1.bar" in joined
- def test_handle_overlap_arbitrary_key(self, df, df2):
- joined = merge(
- df,
- df2,
- left_on="key2",
- right_on="key1",
- suffixes=(".foo", ".bar"),
- )
- assert "key1.foo" in joined
- assert "key2.bar" in joined
- def test_join_on(self, target_source):
- target, source = target_source
- merged = target.join(source, on="C")
- tm.assert_series_equal(merged["MergedA"], target["A"], check_names=False)
- tm.assert_series_equal(merged["MergedD"], target["D"], check_names=False)
- # join with duplicates (fix regression from DataFrame/Matrix merge)
- df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
- df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
- joined = df.join(df2, on="key")
- expected = DataFrame(
- {"key": ["a", "a", "b", "b", "c"], "value": [0, 0, 1, 1, 2]}
- )
- tm.assert_frame_equal(joined, expected)
- # Test when some are missing
- df_a = DataFrame([[1], [2], [3]], index=["a", "b", "c"], columns=["one"])
- df_b = DataFrame([["foo"], ["bar"]], index=[1, 2], columns=["two"])
- df_c = DataFrame([[1], [2]], index=[1, 2], columns=["three"])
- joined = df_a.join(df_b, on="one")
- joined = joined.join(df_c, on="one")
- assert np.isnan(joined["two"]["c"])
- assert np.isnan(joined["three"]["c"])
- # merge column not p resent
- with pytest.raises(KeyError, match="^'E'$"):
- target.join(source, on="E")
- # overlap
- source_copy = source.copy()
- source_copy["A"] = 0
- msg = (
- "You are trying to merge on float64 and object columns. If "
- "you wish to proceed you should use pd.concat"
- )
- with pytest.raises(ValueError, match=msg):
- target.join(source_copy, on="A")
- def test_join_on_fails_with_different_right_index(self):
- df = DataFrame(
- {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
- )
- df2 = DataFrame(
- {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
- index=tm.makeCustomIndex(10, 2),
- )
- msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
- with pytest.raises(ValueError, match=msg):
- merge(df, df2, left_on="a", right_index=True)
- def test_join_on_fails_with_different_left_index(self):
- df = DataFrame(
- {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)},
- index=tm.makeCustomIndex(3, 2),
- )
- df2 = DataFrame(
- {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)}
- )
- msg = r'len\(right_on\) must equal the number of levels in the index of "left"'
- with pytest.raises(ValueError, match=msg):
- merge(df, df2, right_on="b", left_index=True)
- def test_join_on_fails_with_different_column_counts(self):
- df = DataFrame(
- {"a": np.random.choice(["m", "f"], size=3), "b": np.random.randn(3)}
- )
- df2 = DataFrame(
- {"a": np.random.choice(["m", "f"], size=10), "b": np.random.randn(10)},
- index=tm.makeCustomIndex(10, 2),
- )
- msg = r"len\(right_on\) must equal len\(left_on\)"
- with pytest.raises(ValueError, match=msg):
- merge(df, df2, right_on="a", left_on=["a", "b"])
- @pytest.mark.parametrize("wrong_type", [2, "str", None, np.array([0, 1])])
- def test_join_on_fails_with_wrong_object_type(self, wrong_type):
- # GH12081 - original issue
- # GH21220 - merging of Series and DataFrame is now allowed
- # Edited test to remove the Series object from test parameters
- df = DataFrame({"a": [1, 1]})
- msg = (
- "Can only merge Series or DataFrame objects, "
- f"a {type(wrong_type)} was passed"
- )
- with pytest.raises(TypeError, match=msg):
- merge(wrong_type, df, left_on="a", right_on="a")
- with pytest.raises(TypeError, match=msg):
- merge(df, wrong_type, left_on="a", right_on="a")
- def test_join_on_pass_vector(self, target_source):
- target, source = target_source
- expected = target.join(source, on="C")
- del expected["C"]
- join_col = target.pop("C")
- result = target.join(source, on=join_col)
- tm.assert_frame_equal(result, expected)
- def test_join_with_len0(self, target_source):
- # nothing to merge
- target, source = target_source
- merged = target.join(source.reindex([]), on="C")
- for col in source:
- assert col in merged
- assert merged[col].isna().all()
- merged2 = target.join(source.reindex([]), on="C", how="inner")
- tm.assert_index_equal(merged2.columns, merged.columns)
- assert len(merged2) == 0
- def test_join_on_inner(self):
- df = DataFrame({"key": ["a", "a", "d", "b", "b", "c"]})
- df2 = DataFrame({"value": [0, 1]}, index=["a", "b"])
- joined = df.join(df2, on="key", how="inner")
- expected = df.join(df2, on="key")
- expected = expected[expected["value"].notna()]
- tm.assert_series_equal(joined["key"], expected["key"])
- tm.assert_series_equal(joined["value"], expected["value"], check_dtype=False)
- tm.assert_index_equal(joined.index, expected.index)
- def test_join_on_singlekey_list(self):
- df = DataFrame({"key": ["a", "a", "b", "b", "c"]})
- df2 = DataFrame({"value": [0, 1, 2]}, index=["a", "b", "c"])
- # corner cases
- joined = df.join(df2, on=["key"])
- expected = df.join(df2, on="key")
- tm.assert_frame_equal(joined, expected)
- def test_join_on_series(self, target_source):
- target, source = target_source
- result = target.join(source["MergedA"], on="C")
- expected = target.join(source[["MergedA"]], on="C")
- tm.assert_frame_equal(result, expected)
- def test_join_on_series_buglet(self):
- # GH #638
- df = DataFrame({"a": [1, 1]})
- ds = Series([2], index=[1], name="b")
- result = df.join(ds, on="a")
- expected = DataFrame({"a": [1, 1], "b": [2, 2]}, index=df.index)
- tm.assert_frame_equal(result, expected)
- def test_join_index_mixed(self, join_type):
- # no overlapping blocks
- df1 = DataFrame(index=np.arange(10))
- df1["bool"] = True
- df1["string"] = "foo"
- df2 = DataFrame(index=np.arange(5, 15))
- df2["int"] = 1
- df2["float"] = 1.0
- joined = df1.join(df2, how=join_type)
- expected = _join_by_hand(df1, df2, how=join_type)
- tm.assert_frame_equal(joined, expected)
- joined = df2.join(df1, how=join_type)
- expected = _join_by_hand(df2, df1, how=join_type)
- tm.assert_frame_equal(joined, expected)
- def test_join_index_mixed_overlap(self):
- df1 = DataFrame(
- {"A": 1.0, "B": 2, "C": "foo", "D": True},
- index=np.arange(10),
- columns=["A", "B", "C", "D"],
- )
- assert df1["B"].dtype == np.int64
- assert df1["D"].dtype == np.bool_
- df2 = DataFrame(
- {"A": 1.0, "B": 2, "C": "foo", "D": True},
- index=np.arange(0, 10, 2),
- columns=["A", "B", "C", "D"],
- )
- # overlap
- joined = df1.join(df2, lsuffix="_one", rsuffix="_two")
- expected_columns = [
- "A_one",
- "B_one",
- "C_one",
- "D_one",
- "A_two",
- "B_two",
- "C_two",
- "D_two",
- ]
- df1.columns = expected_columns[:4]
- df2.columns = expected_columns[4:]
- expected = _join_by_hand(df1, df2)
- tm.assert_frame_equal(joined, expected)
- def test_join_empty_bug(self):
- # generated an exception in 0.4.3
- x = DataFrame()
- x.join(DataFrame([3], index=[0], columns=["A"]), how="outer")
- def test_join_unconsolidated(self):
- # GH #331
- a = DataFrame(np.random.randn(30, 2), columns=["a", "b"])
- c = Series(np.random.randn(30))
- a["c"] = c
- d = DataFrame(np.random.randn(30, 1), columns=["q"])
- # it works!
- a.join(d)
- d.join(a)
- def test_join_multiindex(self):
- index1 = MultiIndex.from_arrays(
- [["a", "a", "a", "b", "b", "b"], [1, 2, 3, 1, 2, 3]],
- names=["first", "second"],
- )
- index2 = MultiIndex.from_arrays(
- [["b", "b", "b", "c", "c", "c"], [1, 2, 3, 1, 2, 3]],
- names=["first", "second"],
- )
- df1 = DataFrame(data=np.random.randn(6), index=index1, columns=["var X"])
- df2 = DataFrame(data=np.random.randn(6), index=index2, columns=["var Y"])
- df1 = df1.sort_index(level=0)
- df2 = df2.sort_index(level=0)
- joined = df1.join(df2, how="outer")
- ex_index = Index(index1.values).union(Index(index2.values))
- expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
- expected.index.names = index1.names
- tm.assert_frame_equal(joined, expected)
- assert joined.index.names == index1.names
- df1 = df1.sort_index(level=1)
- df2 = df2.sort_index(level=1)
- joined = df1.join(df2, how="outer").sort_index(level=0)
- ex_index = Index(index1.values).union(Index(index2.values))
- expected = df1.reindex(ex_index).join(df2.reindex(ex_index))
- expected.index.names = index1.names
- tm.assert_frame_equal(joined, expected)
- assert joined.index.names == index1.names
- def test_join_inner_multiindex(self, lexsorted_two_level_string_multiindex):
- key1 = ["bar", "bar", "bar", "foo", "foo", "baz", "baz", "qux", "qux", "snap"]
- key2 = [
- "two",
- "one",
- "three",
- "one",
- "two",
- "one",
- "two",
- "two",
- "three",
- "one",
- ]
- data = np.random.randn(len(key1))
- data = DataFrame({"key1": key1, "key2": key2, "data": data})
- index = lexsorted_two_level_string_multiindex
- to_join = DataFrame(
- np.random.randn(10, 3), index=index, columns=["j_one", "j_two", "j_three"]
- )
- joined = data.join(to_join, on=["key1", "key2"], how="inner")
- expected = merge(
- data,
- to_join.reset_index(),
- left_on=["key1", "key2"],
- right_on=["first", "second"],
- how="inner",
- sort=False,
- )
- expected2 = merge(
- to_join,
- data,
- right_on=["key1", "key2"],
- left_index=True,
- how="inner",
- sort=False,
- )
- tm.assert_frame_equal(joined, expected2.reindex_like(joined))
- expected2 = merge(
- to_join,
- data,
- right_on=["key1", "key2"],
- left_index=True,
- how="inner",
- sort=False,
- )
- expected = expected.drop(["first", "second"], axis=1)
- expected.index = joined.index
- assert joined.index.is_monotonic_increasing
- tm.assert_frame_equal(joined, expected)
- # _assert_same_contents(expected, expected2.loc[:, expected.columns])
- def test_join_hierarchical_mixed_raises(self):
- # GH 2024
- # GH 40993: For raising, enforced in 2.0
- df = DataFrame([(1, 2, 3), (4, 5, 6)], columns=["a", "b", "c"])
- new_df = df.groupby(["a"]).agg({"b": [np.mean, np.sum]})
- other_df = DataFrame([(1, 2, 3), (7, 10, 6)], columns=["a", "b", "d"])
- other_df.set_index("a", inplace=True)
- # GH 9455, 12219
- with pytest.raises(
- pd.errors.MergeError, match="Not allowed to merge between different levels"
- ):
- merge(new_df, other_df, left_index=True, right_index=True)
- def test_join_float64_float32(self):
- a = DataFrame(np.random.randn(10, 2), columns=["a", "b"], dtype=np.float64)
- b = DataFrame(np.random.randn(10, 1), columns=["c"], dtype=np.float32)
- joined = a.join(b)
- assert joined.dtypes["a"] == "float64"
- assert joined.dtypes["b"] == "float64"
- assert joined.dtypes["c"] == "float32"
- a = np.random.randint(0, 5, 100).astype("int64")
- b = np.random.random(100).astype("float64")
- c = np.random.random(100).astype("float32")
- df = DataFrame({"a": a, "b": b, "c": c})
- xpdf = DataFrame({"a": a, "b": b, "c": c})
- s = DataFrame(np.random.random(5).astype("float32"), columns=["md"])
- rs = df.merge(s, left_on="a", right_index=True)
- assert rs.dtypes["a"] == "int64"
- assert rs.dtypes["b"] == "float64"
- assert rs.dtypes["c"] == "float32"
- assert rs.dtypes["md"] == "float32"
- xp = xpdf.merge(s, left_on="a", right_index=True)
- tm.assert_frame_equal(rs, xp)
- def test_join_many_non_unique_index(self):
- df1 = DataFrame({"a": [1, 1], "b": [1, 1], "c": [10, 20]})
- df2 = DataFrame({"a": [1, 1], "b": [1, 2], "d": [100, 200]})
- df3 = DataFrame({"a": [1, 1], "b": [1, 2], "e": [1000, 2000]})
- idf1 = df1.set_index(["a", "b"])
- idf2 = df2.set_index(["a", "b"])
- idf3 = df3.set_index(["a", "b"])
- result = idf1.join([idf2, idf3], how="outer")
- df_partially_merged = merge(df1, df2, on=["a", "b"], how="outer")
- expected = merge(df_partially_merged, df3, on=["a", "b"], how="outer")
- result = result.reset_index()
- expected = expected[result.columns]
- expected["a"] = expected.a.astype("int64")
- expected["b"] = expected.b.astype("int64")
- tm.assert_frame_equal(result, expected)
- df1 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 1], "c": [10, 20, 30]})
- df2 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "d": [100, 200, 300]})
- df3 = DataFrame({"a": [1, 1, 1], "b": [1, 1, 2], "e": [1000, 2000, 3000]})
- idf1 = df1.set_index(["a", "b"])
- idf2 = df2.set_index(["a", "b"])
- idf3 = df3.set_index(["a", "b"])
- result = idf1.join([idf2, idf3], how="inner")
- df_partially_merged = merge(df1, df2, on=["a", "b"], how="inner")
- expected = merge(df_partially_merged, df3, on=["a", "b"], how="inner")
- result = result.reset_index()
- tm.assert_frame_equal(result, expected.loc[:, result.columns])
- # GH 11519
- df = DataFrame(
- {
- "A": ["foo", "bar", "foo", "bar", "foo", "bar", "foo", "foo"],
- "B": ["one", "one", "two", "three", "two", "two", "one", "three"],
- "C": np.random.randn(8),
- "D": np.random.randn(8),
- }
- )
- s = Series(
- np.repeat(np.arange(8), 2), index=np.repeat(np.arange(8), 2), name="TEST"
- )
- inner = df.join(s, how="inner")
- outer = df.join(s, how="outer")
- left = df.join(s, how="left")
- right = df.join(s, how="right")
- tm.assert_frame_equal(inner, outer)
- tm.assert_frame_equal(inner, left)
- tm.assert_frame_equal(inner, right)
- def test_join_sort(self):
- left = DataFrame({"key": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
- right = DataFrame({"value2": ["a", "b", "c"]}, index=["bar", "baz", "foo"])
- joined = left.join(right, on="key", sort=True)
- expected = DataFrame(
- {
- "key": ["bar", "baz", "foo", "foo"],
- "value": [2, 3, 1, 4],
- "value2": ["a", "b", "c", "c"],
- },
- index=[1, 2, 0, 3],
- )
- tm.assert_frame_equal(joined, expected)
- # smoke test
- joined = left.join(right, on="key", sort=False)
- tm.assert_index_equal(joined.index, Index(range(4)), exact=True)
- def test_join_mixed_non_unique_index(self):
- # GH 12814, unorderable types in py3 with a non-unique index
- df1 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 3, "a"])
- df2 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 3, 3, 4])
- result = df1.join(df2)
- expected = DataFrame(
- {"a": [1, 2, 3, 3, 4], "b": [5, np.nan, 6, 7, np.nan]},
- index=[1, 2, 3, 3, "a"],
- )
- tm.assert_frame_equal(result, expected)
- df3 = DataFrame({"a": [1, 2, 3, 4]}, index=[1, 2, 2, "a"])
- df4 = DataFrame({"b": [5, 6, 7, 8]}, index=[1, 2, 3, 4])
- result = df3.join(df4)
- expected = DataFrame(
- {"a": [1, 2, 3, 4], "b": [5, 6, 6, np.nan]}, index=[1, 2, 2, "a"]
- )
- tm.assert_frame_equal(result, expected)
- def test_join_non_unique_period_index(self):
- # GH #16871
- index = pd.period_range("2016-01-01", periods=16, freq="M")
- df = DataFrame(list(range(len(index))), index=index, columns=["pnum"])
- df2 = concat([df, df])
- result = df.join(df2, how="inner", rsuffix="_df2")
- expected = DataFrame(
- np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
- columns=["pnum", "pnum_df2"],
- index=df2.sort_index().index,
- )
- tm.assert_frame_equal(result, expected)
- def test_mixed_type_join_with_suffix(self):
- # GH #916
- df = DataFrame(np.random.randn(20, 6), columns=["a", "b", "c", "d", "e", "f"])
- df.insert(0, "id", 0)
- df.insert(5, "dt", "foo")
- grouped = df.groupby("id")
- with pytest.raises(TypeError, match="Could not convert"):
- grouped.mean()
- mn = grouped.mean(numeric_only=True)
- cn = grouped.count()
- # it works!
- mn.join(cn, rsuffix="_right")
- def test_join_many(self):
- df = DataFrame(np.random.randn(10, 6), columns=list("abcdef"))
- df_list = [df[["a", "b"]], df[["c", "d"]], df[["e", "f"]]]
- joined = df_list[0].join(df_list[1:])
- tm.assert_frame_equal(joined, df)
- df_list = [df[["a", "b"]][:-2], df[["c", "d"]][2:], df[["e", "f"]][1:9]]
- def _check_diff_index(df_list, result, exp_index):
- reindexed = [x.reindex(exp_index) for x in df_list]
- expected = reindexed[0].join(reindexed[1:])
- tm.assert_frame_equal(result, expected)
- # different join types
- joined = df_list[0].join(df_list[1:], how="outer")
- _check_diff_index(df_list, joined, df.index)
- joined = df_list[0].join(df_list[1:])
- _check_diff_index(df_list, joined, df_list[0].index)
- joined = df_list[0].join(df_list[1:], how="inner")
- _check_diff_index(df_list, joined, df.index[2:8])
- msg = "Joining multiple DataFrames only supported for joining on index"
- with pytest.raises(ValueError, match=msg):
- df_list[0].join(df_list[1:], on="a")
- def test_join_many_mixed(self):
- df = DataFrame(np.random.randn(8, 4), columns=["A", "B", "C", "D"])
- df["key"] = ["foo", "bar"] * 4
- df1 = df.loc[:, ["A", "B"]]
- df2 = df.loc[:, ["C", "D"]]
- df3 = df.loc[:, ["key"]]
- result = df1.join([df2, df3])
- tm.assert_frame_equal(result, df)
- def test_join_dups(self):
- # joining dups
- df = concat(
- [
- DataFrame(np.random.randn(10, 4), columns=["A", "A", "B", "B"]),
- DataFrame(
- np.random.randint(0, 10, size=20).reshape(10, 2), columns=["A", "C"]
- ),
- ],
- axis=1,
- )
- expected = concat([df, df], axis=1)
- result = df.join(df, rsuffix="_2")
- result.columns = expected.columns
- tm.assert_frame_equal(result, expected)
- # GH 4975, invalid join on dups
- w = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
- x = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
- y = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
- z = DataFrame(np.random.randn(4, 2), columns=["x", "y"])
- dta = x.merge(y, left_index=True, right_index=True).merge(
- z, left_index=True, right_index=True, how="outer"
- )
- # GH 40991: As of 2.0 causes duplicate columns
- with pytest.raises(
- pd.errors.MergeError,
- match="Passing 'suffixes' which cause duplicate columns",
- ):
- dta.merge(w, left_index=True, right_index=True)
- def test_join_multi_to_multi(self, join_type):
- # GH 20475
- leftindex = MultiIndex.from_product(
- [list("abc"), list("xy"), [1, 2]], names=["abc", "xy", "num"]
- )
- left = DataFrame({"v1": range(12)}, index=leftindex)
- rightindex = MultiIndex.from_product(
- [list("abc"), list("xy")], names=["abc", "xy"]
- )
- right = DataFrame({"v2": [100 * i for i in range(1, 7)]}, index=rightindex)
- result = left.join(right, on=["abc", "xy"], how=join_type)
- expected = (
- left.reset_index()
- .merge(right.reset_index(), on=["abc", "xy"], how=join_type)
- .set_index(["abc", "xy", "num"])
- )
- tm.assert_frame_equal(expected, result)
- msg = r'len\(left_on\) must equal the number of levels in the index of "right"'
- with pytest.raises(ValueError, match=msg):
- left.join(right, on="xy", how=join_type)
- with pytest.raises(ValueError, match=msg):
- right.join(left, on=["abc", "xy"], how=join_type)
- def test_join_on_tz_aware_datetimeindex(self):
- # GH 23931, 26335
- df1 = DataFrame(
- {
- "date": pd.date_range(
- start="2018-01-01", periods=5, tz="America/Chicago"
- ),
- "vals": list("abcde"),
- }
- )
- df2 = DataFrame(
- {
- "date": pd.date_range(
- start="2018-01-03", periods=5, tz="America/Chicago"
- ),
- "vals_2": list("tuvwx"),
- }
- )
- result = df1.join(df2.set_index("date"), on="date")
- expected = df1.copy()
- expected["vals_2"] = Series([np.nan] * 2 + list("tuv"), dtype=object)
- tm.assert_frame_equal(result, expected)
- def test_join_datetime_string(self):
- # GH 5647
- dfa = DataFrame(
- [
- ["2012-08-02", "L", 10],
- ["2012-08-02", "J", 15],
- ["2013-04-06", "L", 20],
- ["2013-04-06", "J", 25],
- ],
- columns=["x", "y", "a"],
- )
- dfa["x"] = pd.to_datetime(dfa["x"])
- dfb = DataFrame(
- [["2012-08-02", "J", 1], ["2013-04-06", "L", 2]],
- columns=["x", "y", "z"],
- index=[2, 4],
- )
- dfb["x"] = pd.to_datetime(dfb["x"])
- result = dfb.join(dfa.set_index(["x", "y"]), on=["x", "y"])
- expected = DataFrame(
- [
- [Timestamp("2012-08-02 00:00:00"), "J", 1, 15],
- [Timestamp("2013-04-06 00:00:00"), "L", 2, 20],
- ],
- index=[2, 4],
- columns=["x", "y", "z", "a"],
- )
- tm.assert_frame_equal(result, expected)
- def test_join_with_categorical_index(self):
- # GH47812
- ix = ["a", "b"]
- id1 = pd.CategoricalIndex(ix, categories=ix)
- id2 = pd.CategoricalIndex(reversed(ix), categories=reversed(ix))
- df1 = DataFrame({"c1": ix}, index=id1)
- df2 = DataFrame({"c2": reversed(ix)}, index=id2)
- result = df1.join(df2)
- expected = DataFrame(
- {"c1": ["a", "b"], "c2": ["a", "b"]},
- index=pd.CategoricalIndex(["a", "b"], categories=["a", "b"]),
- )
- tm.assert_frame_equal(result, expected)
- def _check_join(left, right, result, join_col, how="left", lsuffix="_x", rsuffix="_y"):
- # some smoke tests
- for c in join_col:
- assert result[c].notna().all()
- left_grouped = left.groupby(join_col)
- right_grouped = right.groupby(join_col)
- for group_key, group in result.groupby(
- join_col if len(join_col) > 1 else join_col[0]
- ):
- l_joined = _restrict_to_columns(group, left.columns, lsuffix)
- r_joined = _restrict_to_columns(group, right.columns, rsuffix)
- try:
- lgroup = left_grouped.get_group(group_key)
- except KeyError as err:
- if how in ("left", "inner"):
- raise AssertionError(
- f"key {group_key} should not have been in the join"
- ) from err
- _assert_all_na(l_joined, left.columns, join_col)
- else:
- _assert_same_contents(l_joined, lgroup)
- try:
- rgroup = right_grouped.get_group(group_key)
- except KeyError as err:
- if how in ("right", "inner"):
- raise AssertionError(
- f"key {group_key} should not have been in the join"
- ) from err
- _assert_all_na(r_joined, right.columns, join_col)
- else:
- _assert_same_contents(r_joined, rgroup)
- def _restrict_to_columns(group, columns, suffix):
- found = [
- c for c in group.columns if c in columns or c.replace(suffix, "") in columns
- ]
- # filter
- group = group.loc[:, found]
- # get rid of suffixes, if any
- group = group.rename(columns=lambda x: x.replace(suffix, ""))
- # put in the right order...
- group = group.loc[:, columns]
- return group
- def _assert_same_contents(join_chunk, source):
- NA_SENTINEL = -1234567 # drop_duplicates not so NA-friendly...
- jvalues = join_chunk.fillna(NA_SENTINEL).drop_duplicates().values
- svalues = source.fillna(NA_SENTINEL).drop_duplicates().values
- rows = {tuple(row) for row in jvalues}
- assert len(rows) == len(source)
- assert all(tuple(row) in rows for row in svalues)
- def _assert_all_na(join_chunk, source_columns, join_col):
- for c in source_columns:
- if c in join_col:
- continue
- assert join_chunk[c].isna().all()
- def _join_by_hand(a, b, how="left"):
- join_index = a.index.join(b.index, how=how)
- a_re = a.reindex(join_index)
- b_re = b.reindex(join_index)
- result_columns = a.columns.append(b.columns)
- for col, s in b_re.items():
- a_re[col] = s
- return a_re.reindex(columns=result_columns)
- def test_join_inner_multiindex_deterministic_order():
- # GH: 36910
- left = DataFrame(
- data={"e": 5},
- index=MultiIndex.from_tuples([(1, 2, 4)], names=("a", "b", "d")),
- )
- right = DataFrame(
- data={"f": 6}, index=MultiIndex.from_tuples([(2, 3)], names=("b", "c"))
- )
- result = left.join(right, how="inner")
- expected = DataFrame(
- {"e": [5], "f": [6]},
- index=MultiIndex.from_tuples([(2, 1, 4, 3)], names=("b", "a", "d", "c")),
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- ("input_col", "output_cols"), [("b", ["a", "b"]), ("a", ["a_x", "a_y"])]
- )
- def test_join_cross(input_col, output_cols):
- # GH#5401
- left = DataFrame({"a": [1, 3]})
- right = DataFrame({input_col: [3, 4]})
- result = left.join(right, how="cross", lsuffix="_x", rsuffix="_y")
- expected = DataFrame({output_cols[0]: [1, 1, 3, 3], output_cols[1]: [3, 4, 3, 4]})
- tm.assert_frame_equal(result, expected)
- def test_join_multiindex_one_level(join_type):
- # GH#36909
- left = DataFrame(
- data={"c": 3}, index=MultiIndex.from_tuples([(1, 2)], names=("a", "b"))
- )
- right = DataFrame(data={"d": 4}, index=MultiIndex.from_tuples([(2,)], names=("b",)))
- result = left.join(right, how=join_type)
- expected = DataFrame(
- {"c": [3], "d": [4]},
- index=MultiIndex.from_tuples([(2, 1)], names=["b", "a"]),
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "categories, values",
- [
- (["Y", "X"], ["Y", "X", "X"]),
- ([2, 1], [2, 1, 1]),
- ([2.5, 1.5], [2.5, 1.5, 1.5]),
- (
- [Timestamp("2020-12-31"), Timestamp("2019-12-31")],
- [Timestamp("2020-12-31"), Timestamp("2019-12-31"), Timestamp("2019-12-31")],
- ),
- ],
- )
- def test_join_multiindex_not_alphabetical_categorical(categories, values):
- # GH#38502
- left = DataFrame(
- {
- "first": ["A", "A"],
- "second": Categorical(categories, categories=categories),
- "value": [1, 2],
- }
- ).set_index(["first", "second"])
- right = DataFrame(
- {
- "first": ["A", "A", "B"],
- "second": Categorical(values, categories=categories),
- "value": [3, 4, 5],
- }
- ).set_index(["first", "second"])
- result = left.join(right, lsuffix="_left", rsuffix="_right")
- expected = DataFrame(
- {
- "first": ["A", "A"],
- "second": Categorical(categories, categories=categories),
- "value_left": [1, 2],
- "value_right": [3, 4],
- }
- ).set_index(["first", "second"])
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "left_empty, how, exp",
- [
- (False, "left", "left"),
- (False, "right", "empty"),
- (False, "inner", "empty"),
- (False, "outer", "left"),
- (False, "cross", "empty"),
- (True, "left", "empty"),
- (True, "right", "right"),
- (True, "inner", "empty"),
- (True, "outer", "right"),
- (True, "cross", "empty"),
- ],
- )
- def test_join_empty(left_empty, how, exp):
- left = DataFrame({"A": [2, 1], "B": [3, 4]}, dtype="int64").set_index("A")
- right = DataFrame({"A": [1], "C": [5]}, dtype="int64").set_index("A")
- if left_empty:
- left = left.head(0)
- else:
- right = right.head(0)
- result = left.join(right, how=how)
- if exp == "left":
- expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]})
- expected = expected.set_index("A")
- elif exp == "right":
- expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]})
- expected = expected.set_index("A")
- elif exp == "empty":
- expected = DataFrame(columns=["B", "C"], dtype="int64")
- if how != "cross":
- expected = expected.rename_axis("A")
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "how, values",
- [
- ("inner", [0, 1, 2]),
- ("outer", [0, 1, 2]),
- ("left", [0, 1, 2]),
- ("right", [0, 2, 1]),
- ],
- )
- def test_join_multiindex_categorical_output_index_dtype(how, values):
- # GH#50906
- df1 = DataFrame(
- {
- "a": Categorical([0, 1, 2]),
- "b": Categorical([0, 1, 2]),
- "c": [0, 1, 2],
- }
- ).set_index(["a", "b"])
- df2 = DataFrame(
- {
- "a": Categorical([0, 2, 1]),
- "b": Categorical([0, 2, 1]),
- "d": [0, 2, 1],
- }
- ).set_index(["a", "b"])
- expected = DataFrame(
- {
- "a": Categorical(values),
- "b": Categorical(values),
- "c": values,
- "d": values,
- }
- ).set_index(["a", "b"])
- result = df1.join(df2, how=how)
- tm.assert_frame_equal(result, expected)
|