123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571 |
- from datetime import datetime
- import numpy as np
- import pytest
- from pandas.errors import MergeError
- import pandas as pd
- from pandas import (
- DataFrame,
- Index,
- MultiIndex,
- date_range,
- period_range,
- )
- import pandas._testing as tm
- from pandas.core.reshape.concat import concat
@pytest.fixture
def frame_with_period_index():
    """Provide a 4x5 frame holding 0..19, indexed by four periods from "2000"."""
    period_idx = period_range(start="2000", freq="A", periods=4)
    values = np.arange(20).reshape(4, 5)
    return DataFrame(data=values, columns=list("abcde"), index=period_idx)
@pytest.fixture
def left():
    """Provide a single-column frame ("a") on the descending index [2, 1, 0]."""
    descending_index = [2, 1, 0]
    return DataFrame({"a": [20, 10, 0]}, index=descending_index)
@pytest.fixture
def right():
    """Provide a single-column frame ("b") on the unsorted index [3, 1, 2]."""
    unsorted_index = [3, 1, 2]
    return DataFrame({"b": [300, 100, 200]}, index=unsorted_index)
@pytest.fixture
def left_no_dup():
    """Provide a frame whose key column "a" holds four distinct values."""
    columns = {
        "a": ["a", "b", "c", "d"],
        "b": ["cat", "dog", "weasel", "horse"],
    }
    return DataFrame(columns, index=range(4))
@pytest.fixture
def right_no_dup():
    """Provide a frame with column "c", indexed by five distinct "a" keys."""
    unindexed = DataFrame(
        {
            "a": ["a", "b", "c", "d", "e"],
            "c": ["meow", "bark", "um... weasel noise?", "nay", "chirp"],
        },
        index=range(5),
    )
    return unindexed.set_index("a")
@pytest.fixture
def left_w_dups(left_no_dup):
    """Extend ``left_no_dup`` with one extra row that repeats the key "a"."""
    repeated_key_row = DataFrame({"a": ["a"], "b": ["cow"]}, index=[3])
    return concat([left_no_dup, repeated_key_row], sort=True)
@pytest.fixture
def right_w_dups(right_no_dup):
    """Extend ``right_no_dup`` with one extra row that repeats the key "e".

    The result is indexed by "a" and carries the single column "c"; the key
    "e" appears twice in the index.
    """
    repeated_key_row = DataFrame({"a": ["e"], "c": ["moo"]}, index=[3])
    # ``right_no_dup`` stores its keys in the index, not in a column. Move them
    # back into column "a" before concatenating; otherwise concat aligns on
    # columns, fills "a" with NaN for the original rows, and the duplicated
    # key after ``set_index`` would be NaN instead of "e".
    with_key_column = right_no_dup.reset_index()
    return concat([with_key_column, repeated_key_row]).set_index("a")
@pytest.mark.parametrize(
    "how, sort, expected",
    [
        (
            "inner",
            False,
            DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1]),
        ),
        (
            "inner",
            True,
            DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2]),
        ),
        (
            "left",
            False,
            DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]),
        ),
        (
            "left",
            True,
            DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]),
        ),
        (
            "right",
            False,
            DataFrame({"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]),
        ),
        (
            "right",
            True,
            DataFrame({"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]),
        ),
        (
            "outer",
            False,
            DataFrame(
                {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
                index=[0, 1, 2, 3],
            ),
        ),
        (
            "outer",
            True,
            DataFrame(
                {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
                index=[0, 1, 2, 3],
            ),
        ),
    ],
)
def test_join(left, right, how, sort, expected):
    """Join ``left`` with ``right`` on the index for each ``how``/``sort`` pair.

    Every case is run with ``validate="1:1"`` and compared against the
    parametrized ``expected`` frame.
    """
    joined = left.join(right, how=how, sort=sort, validate="1:1")
    tm.assert_frame_equal(joined, expected)
def test_suffix_on_list_join():
    """Joining a list of frames rejects suffixes and overlapping column names."""
    base = DataFrame({"key": [1, 2, 3, 4, 5]})
    overlapping = DataFrame({"key": [1, 8, 3, 2, 5], "v1": [1, 2, 3, 4, 5]})
    disjoint = DataFrame({"keys": [5, 2, 3, 4, 1], "v2": [1, 2, 3, 4, 5]})

    # check proper errors are raised
    suffix_msg = "Suffixes not supported when joining multiple DataFrames"
    rejected_calls = [
        ([overlapping], {"lsuffix": "y"}),
        ([overlapping, disjoint], {"rsuffix": "x"}),
        ([overlapping, disjoint], {"lsuffix": "y", "rsuffix": "x"}),
    ]
    for others, suffix_kwargs in rejected_calls:
        with pytest.raises(ValueError, match=suffix_msg):
            base.join(others, **suffix_kwargs)

    with pytest.raises(ValueError, match="Indexes have overlapping values"):
        base.join([overlapping, disjoint])

    # no errors should be raised
    list_joined = base.join([disjoint])
    plain_joined = base.join(disjoint)
    tm.assert_frame_equal(list_joined, plain_joined)
def test_join_invalid_validate(left_no_dup, right_no_dup):
    """``DataFrame.join`` rejects an unknown ``validate`` value with ValueError.

    The expected message lists every accepted spelling of the constraint.
    """
    # GH 46622
    # Check invalid arguments
    msg = (
        '"invalid" is not a valid argument. '
        "Valid arguments are:\n"
        '- "1:1"\n'
        '- "1:m"\n'
        '- "m:1"\n'
        '- "m:m"\n'
        '- "one_to_one"\n'
        '- "one_to_many"\n'
        '- "many_to_one"\n'
        '- "many_to_many"'
    )
    with pytest.raises(ValueError, match=msg):
        # Call ``join`` (the method this test is named for), not ``merge``.
        left_no_dup.join(right_no_dup, on="a", validate="invalid")
def test_join_on_single_col_dup_on_right(left_no_dup, right_w_dups):
    """Duplicate right keys pass ``one_to_many`` but fail ``one_to_one``."""
    # GH 46622
    # Dups on right allowed by one_to_many constraint
    left_no_dup.join(right_w_dups, on="a", validate="one_to_many")

    # Dups on right not allowed by one_to_one constraint
    err_msg = "Merge keys are not unique in right dataset; not a one-to-one merge"
    with pytest.raises(MergeError, match=err_msg):
        left_no_dup.join(right_w_dups, on="a", validate="one_to_one")
def test_join_on_single_col_dup_on_left(left_w_dups, right_no_dup):
    """Duplicate left keys pass ``many_to_one`` but fail ``one_to_one``."""
    # GH 46622
    # Dups on left allowed by many_to_one constraint
    left_w_dups.join(right_no_dup, on="a", validate="many_to_one")

    # Dups on left not allowed by one_to_one constraint
    err_msg = "Merge keys are not unique in left dataset; not a one-to-one merge"
    with pytest.raises(MergeError, match=err_msg):
        left_w_dups.join(right_no_dup, on="a", validate="one_to_one")
def test_join_on_single_col_dup_on_both(left_w_dups, right_w_dups):
    """With duplicate keys on both sides only ``many_to_many`` validates."""
    # GH 46622
    # Dups on both allowed by many_to_many constraint
    left_w_dups.join(right_w_dups, on="a", validate="many_to_many")

    # Dups on both not allowed by many_to_one constraint
    right_err = "Merge keys are not unique in right dataset; not a many-to-one merge"
    with pytest.raises(MergeError, match=right_err):
        left_w_dups.join(right_w_dups, on="a", validate="many_to_one")

    # Dups on both not allowed by one_to_many constraint
    left_err = "Merge keys are not unique in left dataset; not a one-to-many merge"
    with pytest.raises(MergeError, match=left_err):
        left_w_dups.join(right_w_dups, on="a", validate="one_to_many")
def test_join_on_multi_col_check_dup():
    """A two-level key that is unique only jointly satisfies ``validate="1:1"``."""
    # GH 46622
    # Two column join, dups in both, but jointly no dups
    key_levels = ["a", "b"]

    lhs_data = {
        "a": ["a", "a", "b", "b"],
        "b": [0, 1, 0, 1],
        "c": ["cat", "dog", "weasel", "horse"],
    }
    lhs = DataFrame(lhs_data, index=range(4)).set_index(key_levels)

    rhs_data = {
        "a": ["a", "a", "b"],
        "b": [0, 1, 0],
        "d": ["meow", "bark", "um... weasel noise?"],
    }
    rhs = DataFrame(rhs_data, index=range(3)).set_index(key_levels)

    expected_data = {
        "a": ["a", "a", "b"],
        "b": [0, 1, 0],
        "c": ["cat", "dog", "weasel"],
        "d": ["meow", "bark", "um... weasel noise?"],
    }
    expected_multi = DataFrame(expected_data, index=range(3)).set_index(key_levels)

    # Jointly no dups allowed by one_to_one constraint
    joined = lhs.join(rhs, how="inner", validate="1:1")
    tm.assert_frame_equal(joined, expected_multi)
def test_join_index(float_frame):
    """Check the resulting index and columns of an index join for each ``how``.

    Also checks that an unknown ``how`` and overlapping columns without a
    suffix both raise ValueError.
    """
    # left / right
    lhs = float_frame.loc[float_frame.index[:10], ["A", "B"]]
    rhs = float_frame.loc[float_frame.index[5:], ["C", "D"]].iloc[::-1]
    all_columns = Index(["A", "B", "C", "D"])

    default_joined = lhs.join(rhs)
    tm.assert_index_equal(lhs.index, default_joined.index)
    tm.assert_index_equal(default_joined.columns, all_columns)

    left_joined = lhs.join(rhs, how="left")
    tm.assert_index_equal(left_joined.index, lhs.index)
    tm.assert_index_equal(left_joined.columns, all_columns)

    right_joined = lhs.join(rhs, how="right")
    tm.assert_index_equal(right_joined.index, rhs.index)
    tm.assert_index_equal(right_joined.columns, all_columns)

    # inner
    inner_joined = lhs.join(rhs, how="inner")
    tm.assert_index_equal(inner_joined.index, lhs.index[5:10])
    tm.assert_index_equal(inner_joined.columns, all_columns)

    # outer
    outer_joined = lhs.join(rhs, how="outer")
    tm.assert_index_equal(outer_joined.index, float_frame.index.sort_values())
    tm.assert_index_equal(outer_joined.columns, all_columns)

    with pytest.raises(ValueError, match="join method"):
        lhs.join(rhs, how="foo")

    # corner case - overlapping columns
    overlap_msg = "columns overlap but no suffix"
    for method in ("outer", "left", "inner"):
        with pytest.raises(ValueError, match=overlap_msg):
            float_frame.join(float_frame, how=method)
def test_join_index_more(float_frame):
    """Join a full-length column slice with an every-other-row column slice."""
    full_ab = float_frame.loc[:, ["A", "B"]]
    sparse_cd = float_frame.loc[::2, ["C", "D"]]

    expected = full_ab.copy()
    for label in ("C", "D"):
        expected[label] = float_frame[label][::2]

    default_joined = full_ab.join(sparse_cd)
    tm.assert_frame_equal(default_joined, expected)

    right_joined = full_ab.join(sparse_cd, how="right")
    tm.assert_frame_equal(right_joined, expected[::2])

    swapped = sparse_cd.join(full_ab, how="right")
    tm.assert_frame_equal(swapped, expected.loc[:, swapped.columns])
def test_join_index_series(float_frame):
    """Joining a popped column back restores the frame; a nameless Series raises."""
    remaining = float_frame.copy()
    last_label = float_frame.columns[-1]
    popped = remaining.pop(last_label)

    rejoined = remaining.join(popped)
    tm.assert_frame_equal(rejoined, float_frame)

    popped.name = None
    with pytest.raises(ValueError, match="must have a name"):
        remaining.join(popped)
def test_join_overlap(float_frame):
    """Overlapping columns "B" and "C" receive the requested suffixes."""
    shared = ["B", "C"]
    lhs = float_frame.loc[:, ["A", "B", "C"]]
    rhs = float_frame.loc[:, ["B", "C", "D"]]

    joined = lhs.join(rhs, lsuffix="_df1", rsuffix="_df2")

    lhs_suffixed = lhs.loc[:, shared].add_suffix("_df1")
    rhs_suffixed = rhs.loc[:, shared].add_suffix("_df2")
    unshared = float_frame.loc[:, ["A", "D"]]
    expected = lhs_suffixed.join(rhs_suffixed).join(unshared)

    # column order not necessarily sorted
    tm.assert_frame_equal(joined, expected.loc[:, joined.columns])
def test_join_period_index(frame_with_period_index):
    """Join a period-indexed frame with a copy whose column labels are doubled."""

    def _doubled(key):
        # "a" -> "aa", so the renamed columns cannot overlap the originals
        return f"{key}{key}"

    frame = frame_with_period_index
    other = frame.rename(columns=_doubled)

    values = frame.values
    expected = DataFrame(
        data=np.concatenate([values, values], axis=1),
        columns=frame.columns.append(other.columns),
        index=frame.index,
    )

    joined = frame.join(other)
    tm.assert_frame_equal(joined, expected)
def test_join_left_sequence_non_unique_index():
    """Left-join a list of frames when one of them has a repeated index label."""
    # https://github.com/pandas-dev/pandas/issues/19607
    base = DataFrame({"a": [0, 10, 20]}, index=[1, 2, 3])
    unique_other = DataFrame({"b": [100, 200, 300]}, index=[4, 3, 2])
    repeated_other = DataFrame({"c": [400, 500, 600]}, index=[2, 2, 4])

    expected_columns = {
        "a": [0, 10, 10, 20],
        "b": [np.nan, 300, 300, 200],
        "c": [np.nan, 400, 500, np.nan],
    }
    expected = DataFrame(expected_columns, index=[1, 2, 2, 3])

    joined = base.join([unique_other, repeated_other], how="left")
    tm.assert_frame_equal(joined, expected)
def test_join_list_series(float_frame):
    """Joining column "A" with [Series "B", frame "C"/"D"] rebuilds the frame."""
    # GH#46850
    # Join a DataFrame with a list containing both a Series and a DataFrame
    base = float_frame.A.to_frame()
    others = [float_frame.B, float_frame[["C", "D"]]]
    joined = base.join(others)
    tm.assert_frame_equal(joined, float_frame)
@pytest.mark.parametrize("sort_kw", [True, False])
def test_suppress_future_warning_with_sort_kw(sort_kw):
    """An outer join of a list of frames emits no warning when ``sort`` is given."""
    base = DataFrame({"col1": [1, 2]}, index=["c", "a"])
    second = DataFrame({"col2": [4, 5]}, index=["b", "a"])
    third = DataFrame({"col3": [7, 8]}, index=["a", "b"])

    nan = float("nan")
    expected = DataFrame(
        {
            "col1": {"a": 2.0, "b": nan, "c": 1.0},
            "col2": {"a": 5.0, "b": 4.0, "c": nan},
            "col3": {"a": 7.0, "b": 8.0, "c": nan},
        }
    )
    if sort_kw is False:
        # unsorted result is expected in this label order
        expected = expected.reindex(index=["c", "a", "b"])

    with tm.assert_produces_warning(None):
        joined = base.join([second, third], how="outer", sort=sort_kw)
    tm.assert_frame_equal(joined, expected)
class TestDataFrameJoin:
    """Join tests covering MultiIndex, mixed key types and tz-aware indexes."""

    def test_join(self, multiindex_dataframe_random_data):
        """Outer-join two column slices and compare against the source frame.

        Positions that are NaN in the joined result are blanked in the
        expected values before comparing.
        """
        frame = multiindex_dataframe_random_data

        a = frame.loc[frame.index[:5], ["A"]]
        b = frame.loc[frame.index[2:], ["B", "C"]]

        joined = a.join(b, how="outer").reindex(frame.index)
        expected = frame.copy().values.copy()
        expected[np.isnan(joined.values)] = np.nan
        expected = DataFrame(expected, index=frame.index, columns=frame.columns)

        # guard against the comparison passing on an all-NaN result
        assert not np.isnan(joined.values).all()

        tm.assert_frame_equal(joined, expected)

    def test_join_segfault(self):
        """Join two frames on a shared two-level index; only checks it runs."""
        # GH#1532
        df1 = DataFrame({"a": [1, 1], "b": [1, 2], "x": [1, 2]})
        df2 = DataFrame({"a": [2, 2], "b": [1, 2], "y": [1, 2]})
        df1 = df1.set_index(["a", "b"])
        df2 = df2.set_index(["a", "b"])
        # it works!
        for how in ["left", "right", "outer"]:
            df1.join(df2, how=how)

    def test_join_str_datetime(self):
        """Join on a string column when the other frame has datetime columns."""
        str_dates = ["20120209", "20120222"]
        dt_dates = [datetime(2012, 2, 9), datetime(2012, 2, 22)]

        A = DataFrame(str_dates, index=range(2), columns=["aa"])
        C = DataFrame([[1, 2], [3, 4]], index=str_dates, columns=dt_dates)

        tst = A.join(C, on="aa")

        # one column from A plus the two datetime-labelled columns from C
        assert len(tst.columns) == 3

    def test_join_multiindex_leftright(self):
        """Left and right joins between a two-level and a one-level index agree."""
        # GH 10741
        df1 = DataFrame(
            [
                ["a", "x", 0.471780],
                ["a", "y", 0.774908],
                ["a", "z", 0.563634],
                ["b", "x", -0.353756],
                ["b", "y", 0.368062],
                ["b", "z", -1.721840],
                ["c", "x", 1],
                ["c", "y", 2],
                ["c", "z", 3],
            ],
            columns=["first", "second", "value1"],
        ).set_index(["first", "second"])

        df2 = DataFrame([["a", 10], ["b", 20]], columns=["first", "value2"]).set_index(
            ["first"]
        )

        exp = DataFrame(
            [
                [0.471780, 10],
                [0.774908, 10],
                [0.563634, 10],
                [-0.353756, 20],
                [0.368062, 20],
                [-1.721840, 20],
                [1.000000, np.nan],
                [2.000000, np.nan],
                [3.000000, np.nan],
            ],
            index=df1.index,
            columns=["value1", "value2"],
        )

        # these must be the same results (but columns are flipped)
        tm.assert_frame_equal(df1.join(df2, how="left"), exp)
        tm.assert_frame_equal(df2.join(df1, how="right"), exp[["value2", "value1"]])

        exp_idx = MultiIndex.from_product(
            [["a", "b"], ["x", "y", "z"]], names=["first", "second"]
        )
        exp = DataFrame(
            [
                [0.471780, 10],
                [0.774908, 10],
                [0.563634, 10],
                [-0.353756, 20],
                [0.368062, 20],
                [-1.721840, 20],
            ],
            index=exp_idx,
            columns=["value1", "value2"],
        )

        tm.assert_frame_equal(df1.join(df2, how="right"), exp)
        tm.assert_frame_equal(df2.join(df1, how="left"), exp[["value2", "value1"]])

    def test_join_multiindex_dates(self):
        """Join a list of frames sharing a MultiIndex that holds ``date`` objects."""
        # GH 33692
        date = pd.Timestamp(2000, 1, 1).date()

        df1_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        df1 = DataFrame({"col1": [0]}, index=df1_index)
        df2_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        df2 = DataFrame({"col2": [0]}, index=df2_index)
        df3_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        df3 = DataFrame({"col3": [0]}, index=df3_index)

        result = df1.join([df2, df3])

        expected_index = MultiIndex.from_tuples([(0, date)], names=["index_0", "date"])
        expected = DataFrame(
            {"col1": [0], "col2": [0], "col3": [0]}, index=expected_index
        )

        tm.assert_equal(result, expected)

    def test_merge_join_different_levels_raises(self):
        """Merging or joining frames with different column depths raises."""
        # GH#9455
        # GH 40993: For raising, enforced in 2.0

        # first dataframe
        df1 = DataFrame(columns=["a", "b"], data=[[1, 11], [0, 22]])

        # second dataframe
        columns = MultiIndex.from_tuples([("a", ""), ("c", "c1")])
        df2 = DataFrame(columns=columns, data=[[1, 33], [0, 44]])

        # merge
        with pytest.raises(
            MergeError, match="Not allowed to merge between different levels"
        ):
            pd.merge(df1, df2, on="a")

        # join, see discussion in GH#12219
        with pytest.raises(
            MergeError, match="Not allowed to merge between different levels"
        ):
            df1.join(df2, on="a")

    def test_frame_join_tzaware(self):
        """An outer join of tz-aware frames keeps the union index and its zone."""
        # "ms" is the non-deprecated spelling of the millisecond alias "L"
        test1 = DataFrame(
            np.zeros((6, 3)),
            index=date_range(
                "2012-11-15 00:00:00", periods=6, freq="100ms", tz="US/Central"
            ),
        )
        test2 = DataFrame(
            np.zeros((3, 3)),
            index=date_range(
                "2012-11-15 00:00:00", periods=3, freq="250ms", tz="US/Central"
            ),
            columns=range(3, 6),
        )

        result = test1.join(test2, how="outer")
        expected = test1.index.union(test2.index)

        tm.assert_index_equal(result.index, expected)
        # str() of the tzinfo gives the zone name for both pytz and zoneinfo
        # objects; the ``.zone`` attribute exists only on pytz timezones.
        assert str(result.index.tz) == "US/Central"
|