123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233123412351236123712381239124012411242124312441245124612471248124912501251125212531254125512561257125812591260126112621263126412651266126712681269127012711272127312741275127612771278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762277227822792280228122822283228422852286228722882289229022912292229322942295229622972298229923002301230223032304230523062307230823092310231123122313231423152316231723182319232023212322232323242325232623272328232923302331233223332334233523362337233823392340234123422343234423452346234723482349235023512352235323542355235623572358235923602361236223632364236523662367236823692370237123722373237423752376237723782379238023812382238323842385238623872388238923902391239223932394239523962397239823992400240124022403240424052406240724082409241024112412241324142415241624172418241924202421242224232424242524262427242824292430243124322433243424352436243724382439244024412442244324442445244624472448244924502451245224532454245524562457245824592460246124622463246424652466246724682469247024712472247324742475247624772478247924802481248224832484248524862487248824892490249124922493249424952496249724982499250025012502250325042505250625072508250925102511251225132514251525162517251825192520252125222523252425252526252725282529253025312532253325342535253625372538253925402541254225432544254525462547254825492550255125522553255425552556255725582559256025612562256325642565256625672568256925702571257225732574257525762577257825792580258125822583258425852586258725882589259025912592259325942595259625972598259926002601260226032604260526062607260826092610261126122613261426152616261726182619262026212622262326242625262626272628262926302631263226332634263526362637263826392640264126422643264426452646264726482649265026512652265326542655265626572658265926602661266226632664266526662667266826692670267126722673267426752676267726782679268026812682268326842685268626872688268926902691269226932694269526962697269826992700270127022703270427052706270727082709271027112712271327142715271627172718271927202721272227232724272527262727272827292730273127322733273427352736273727382739274027412742274327442745274627472748274927502751275227532754275527562757275827592760276127622763276427652766276727682769277027712772277327742775277627772778277927802781 |
- from datetime import (
- date,
- datetime,
- timedelta,
- )
- import re
- import numpy as np
- import pytest
- from pandas.core.dtypes.common import (
- is_categorical_dtype,
- is_object_dtype,
- )
- from pandas.core.dtypes.dtypes import CategoricalDtype
- import pandas as pd
- from pandas import (
- Categorical,
- CategoricalIndex,
- DataFrame,
- DatetimeIndex,
- Index,
- IntervalIndex,
- MultiIndex,
- PeriodIndex,
- RangeIndex,
- Series,
- TimedeltaIndex,
- )
- import pandas._testing as tm
- from pandas.api.types import CategoricalDtype as CDT
- from pandas.core.reshape.concat import concat
- from pandas.core.reshape.merge import (
- MergeError,
- merge,
- )
- def get_test_data(ngroups=8, n=50):
- unique_groups = list(range(ngroups))
- arr = np.asarray(np.tile(unique_groups, n // ngroups))
- if len(arr) < n:
- arr = np.asarray(list(arr) + unique_groups[: n - len(arr)])
- np.random.shuffle(arr)
- return arr
- def get_series():
- return [
- Series([1], dtype="int64"),
- Series([1], dtype="Int64"),
- Series([1.23]),
- Series(["foo"]),
- Series([True]),
- Series([pd.Timestamp("2018-01-01")]),
- Series([pd.Timestamp("2018-01-01", tz="US/Eastern")]),
- ]
- def get_series_na():
- return [
- Series([np.nan], dtype="Int64"),
- Series([np.nan], dtype="float"),
- Series([np.nan], dtype="object"),
- Series([pd.NaT]),
- ]
- @pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name)
- def series_of_dtype(request):
- """
- A parametrized fixture returning a variety of Series of different
- dtypes
- """
- return request.param
- @pytest.fixture(params=get_series(), ids=lambda x: x.dtype.name)
- def series_of_dtype2(request):
- """
- A duplicate of the series_of_dtype fixture, so that it can be used
- twice by a single function
- """
- return request.param
- @pytest.fixture(params=get_series_na(), ids=lambda x: x.dtype.name)
- def series_of_dtype_all_na(request):
- """
- A parametrized fixture returning a variety of Series with all NA
- values
- """
- return request.param
- @pytest.fixture
- def dfs_for_indicator():
- df1 = DataFrame({"col1": [0, 1], "col_conflict": [1, 2], "col_left": ["a", "b"]})
- df2 = DataFrame(
- {
- "col1": [1, 2, 3, 4, 5],
- "col_conflict": [1, 2, 3, 4, 5],
- "col_right": [2, 2, 2, 2, 2],
- }
- )
- return df1, df2
- class TestMerge:
- @pytest.fixture
- def df(self):
- df = DataFrame(
- {
- "key1": get_test_data(),
- "key2": get_test_data(),
- "data1": np.random.randn(50),
- "data2": np.random.randn(50),
- }
- )
- # exclude a couple keys for fun
- df = df[df["key2"] > 1]
- return df
- @pytest.fixture
- def df2(self):
- return DataFrame(
- {
- "key1": get_test_data(n=10),
- "key2": get_test_data(ngroups=4, n=10),
- "value": np.random.randn(10),
- }
- )
- @pytest.fixture
- def left(self):
- return DataFrame(
- {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)}
- )
- @pytest.fixture
- def right(self):
- return DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"])
- def test_merge_inner_join_empty(self):
- # GH 15328
- df_empty = DataFrame()
- df_a = DataFrame({"a": [1, 2]}, index=[0, 1], dtype="int64")
- result = merge(df_empty, df_a, left_index=True, right_index=True)
- expected = DataFrame({"a": []}, dtype="int64")
- tm.assert_frame_equal(result, expected)
- def test_merge_common(self, df, df2):
- joined = merge(df, df2)
- exp = merge(df, df2, on=["key1", "key2"])
- tm.assert_frame_equal(joined, exp)
- def test_merge_non_string_columns(self):
- # https://github.com/pandas-dev/pandas/issues/17962
- # Checks that method runs for non string column names
- left = DataFrame(
- {0: [1, 0, 1, 0], 1: [0, 1, 0, 0], 2: [0, 0, 2, 0], 3: [1, 0, 0, 3]}
- )
- right = left.astype(float)
- expected = left
- result = merge(left, right)
- tm.assert_frame_equal(expected, result)
- def test_merge_index_as_on_arg(self, df, df2):
- # GH14355
- left = df.set_index("key1")
- right = df2.set_index("key1")
- result = merge(left, right, on="key1")
- expected = merge(df, df2, on="key1").set_index("key1")
- tm.assert_frame_equal(result, expected)
- def test_merge_index_singlekey_right_vs_left(self):
- left = DataFrame(
- {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)}
- )
- right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"])
- merged1 = merge(
- left, right, left_on="key", right_index=True, how="left", sort=False
- )
- merged2 = merge(
- right, left, right_on="key", left_index=True, how="right", sort=False
- )
- tm.assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
- merged1 = merge(
- left, right, left_on="key", right_index=True, how="left", sort=True
- )
- merged2 = merge(
- right, left, right_on="key", left_index=True, how="right", sort=True
- )
- tm.assert_frame_equal(merged1, merged2.loc[:, merged1.columns])
- def test_merge_index_singlekey_inner(self):
- left = DataFrame(
- {"key": ["a", "b", "c", "d", "e", "e", "a"], "v1": np.random.randn(7)}
- )
- right = DataFrame({"v2": np.random.randn(4)}, index=["d", "b", "c", "a"])
- # inner join
- result = merge(left, right, left_on="key", right_index=True, how="inner")
- expected = left.join(right, on="key").loc[result.index]
- tm.assert_frame_equal(result, expected)
- result = merge(right, left, right_on="key", left_index=True, how="inner")
- expected = left.join(right, on="key").loc[result.index]
- tm.assert_frame_equal(result, expected.loc[:, result.columns])
- def test_merge_misspecified(self, df, df2, left, right):
- msg = "Must pass right_on or right_index=True"
- with pytest.raises(pd.errors.MergeError, match=msg):
- merge(left, right, left_index=True)
- msg = "Must pass left_on or left_index=True"
- with pytest.raises(pd.errors.MergeError, match=msg):
- merge(left, right, right_index=True)
- msg = (
- 'Can only pass argument "on" OR "left_on" and "right_on", not '
- "a combination of both"
- )
- with pytest.raises(pd.errors.MergeError, match=msg):
- merge(left, left, left_on="key", on="key")
- msg = r"len\(right_on\) must equal len\(left_on\)"
- with pytest.raises(ValueError, match=msg):
- merge(df, df2, left_on=["key1"], right_on=["key1", "key2"])
- def test_index_and_on_parameters_confusion(self, df, df2):
- msg = "right_index parameter must be of type bool, not <class 'list'>"
- with pytest.raises(ValueError, match=msg):
- merge(
- df,
- df2,
- how="left",
- left_index=False,
- right_index=["key1", "key2"],
- )
- msg = "left_index parameter must be of type bool, not <class 'list'>"
- with pytest.raises(ValueError, match=msg):
- merge(
- df,
- df2,
- how="left",
- left_index=["key1", "key2"],
- right_index=False,
- )
- with pytest.raises(ValueError, match=msg):
- merge(
- df,
- df2,
- how="left",
- left_index=["key1", "key2"],
- right_index=["key1", "key2"],
- )
- def test_merge_overlap(self, left):
- merged = merge(left, left, on="key")
- exp_len = (left["key"].value_counts() ** 2).sum()
- assert len(merged) == exp_len
- assert "v1_x" in merged
- assert "v1_y" in merged
- def test_merge_different_column_key_names(self):
- left = DataFrame({"lkey": ["foo", "bar", "baz", "foo"], "value": [1, 2, 3, 4]})
- right = DataFrame({"rkey": ["foo", "bar", "qux", "foo"], "value": [5, 6, 7, 8]})
- merged = left.merge(
- right, left_on="lkey", right_on="rkey", how="outer", sort=True
- )
- exp = Series(["bar", "baz", "foo", "foo", "foo", "foo", np.nan], name="lkey")
- tm.assert_series_equal(merged["lkey"], exp)
- exp = Series(["bar", np.nan, "foo", "foo", "foo", "foo", "qux"], name="rkey")
- tm.assert_series_equal(merged["rkey"], exp)
- exp = Series([2, 3, 1, 1, 4, 4, np.nan], name="value_x")
- tm.assert_series_equal(merged["value_x"], exp)
- exp = Series([6, np.nan, 5, 8, 5, 8, 7], name="value_y")
- tm.assert_series_equal(merged["value_y"], exp)
- def test_merge_copy(self):
- left = DataFrame({"a": 0, "b": 1}, index=range(10))
- right = DataFrame({"c": "foo", "d": "bar"}, index=range(10))
- merged = merge(left, right, left_index=True, right_index=True, copy=True)
- merged["a"] = 6
- assert (left["a"] == 0).all()
- merged["d"] = "peekaboo"
- assert (right["d"] == "bar").all()
- def test_merge_nocopy(self, using_array_manager):
- left = DataFrame({"a": 0, "b": 1}, index=range(10))
- right = DataFrame({"c": "foo", "d": "bar"}, index=range(10))
- merged = merge(left, right, left_index=True, right_index=True, copy=False)
- assert np.shares_memory(merged["a"]._values, left["a"]._values)
- assert np.shares_memory(merged["d"]._values, right["d"]._values)
- def test_intelligently_handle_join_key(self):
- # #733, be a bit more 1337 about not returning unconsolidated DataFrame
- left = DataFrame(
- {"key": [1, 1, 2, 2, 3], "value": list(range(5))}, columns=["value", "key"]
- )
- right = DataFrame({"key": [1, 1, 2, 3, 4, 5], "rvalue": list(range(6))})
- joined = merge(left, right, on="key", how="outer")
- expected = DataFrame(
- {
- "key": [1, 1, 1, 1, 2, 2, 3, 4, 5],
- "value": np.array([0, 0, 1, 1, 2, 3, 4, np.nan, np.nan]),
- "rvalue": [0, 1, 0, 1, 2, 2, 3, 4, 5],
- },
- columns=["value", "key", "rvalue"],
- )
- tm.assert_frame_equal(joined, expected)
- def test_merge_join_key_dtype_cast(self):
- # #8596
- df1 = DataFrame({"key": [1], "v1": [10]})
- df2 = DataFrame({"key": [2], "v1": [20]})
- df = merge(df1, df2, how="outer")
- assert df["key"].dtype == "int64"
- df1 = DataFrame({"key": [True], "v1": [1]})
- df2 = DataFrame({"key": [False], "v1": [0]})
- df = merge(df1, df2, how="outer")
- # GH13169
- # GH#40073
- assert df["key"].dtype == "bool"
- df1 = DataFrame({"val": [1]})
- df2 = DataFrame({"val": [2]})
- lkey = np.array([1])
- rkey = np.array([2])
- df = merge(df1, df2, left_on=lkey, right_on=rkey, how="outer")
- assert df["key_0"].dtype == np.int_
- def test_handle_join_key_pass_array(self):
- left = DataFrame(
- {"key": [1, 1, 2, 2, 3], "value": np.arange(5)},
- columns=["value", "key"],
- dtype="int64",
- )
- right = DataFrame({"rvalue": np.arange(6)}, dtype="int64")
- key = np.array([1, 1, 2, 3, 4, 5], dtype="int64")
- merged = merge(left, right, left_on="key", right_on=key, how="outer")
- merged2 = merge(right, left, left_on=key, right_on="key", how="outer")
- tm.assert_series_equal(merged["key"], merged2["key"])
- assert merged["key"].notna().all()
- assert merged2["key"].notna().all()
- left = DataFrame({"value": np.arange(5)}, columns=["value"])
- right = DataFrame({"rvalue": np.arange(6)})
- lkey = np.array([1, 1, 2, 2, 3])
- rkey = np.array([1, 1, 2, 3, 4, 5])
- merged = merge(left, right, left_on=lkey, right_on=rkey, how="outer")
- expected = Series([1, 1, 1, 1, 2, 2, 3, 4, 5], dtype=np.int_, name="key_0")
- tm.assert_series_equal(merged["key_0"], expected)
- left = DataFrame({"value": np.arange(3)})
- right = DataFrame({"rvalue": np.arange(6)})
- key = np.array([0, 1, 1, 2, 2, 3], dtype=np.int64)
- merged = merge(left, right, left_index=True, right_on=key, how="outer")
- tm.assert_series_equal(merged["key_0"], Series(key, name="key_0"))
- def test_no_overlap_more_informative_error(self):
- dt = datetime.now()
- df1 = DataFrame({"x": ["a"]}, index=[dt])
- df2 = DataFrame({"y": ["b", "c"]}, index=[dt, dt])
- msg = (
- "No common columns to perform merge on. "
- f"Merge options: left_on={None}, right_on={None}, "
- f"left_index={False}, right_index={False}"
- )
- with pytest.raises(MergeError, match=msg):
- merge(df1, df2)
- def test_merge_non_unique_indexes(self):
- dt = datetime(2012, 5, 1)
- dt2 = datetime(2012, 5, 2)
- dt3 = datetime(2012, 5, 3)
- dt4 = datetime(2012, 5, 4)
- df1 = DataFrame({"x": ["a"]}, index=[dt])
- df2 = DataFrame({"y": ["b", "c"]}, index=[dt, dt])
- _check_merge(df1, df2)
- # Not monotonic
- df1 = DataFrame({"x": ["a", "b", "q"]}, index=[dt2, dt, dt4])
- df2 = DataFrame(
- {"y": ["c", "d", "e", "f", "g", "h"]}, index=[dt3, dt3, dt2, dt2, dt, dt]
- )
- _check_merge(df1, df2)
- df1 = DataFrame({"x": ["a", "b"]}, index=[dt, dt])
- df2 = DataFrame({"y": ["c", "d"]}, index=[dt, dt])
- _check_merge(df1, df2)
- def test_merge_non_unique_index_many_to_many(self):
- dt = datetime(2012, 5, 1)
- dt2 = datetime(2012, 5, 2)
- dt3 = datetime(2012, 5, 3)
- df1 = DataFrame({"x": ["a", "b", "c", "d"]}, index=[dt2, dt2, dt, dt])
- df2 = DataFrame(
- {"y": ["e", "f", "g", " h", "i"]}, index=[dt2, dt2, dt3, dt, dt]
- )
- _check_merge(df1, df2)
- def test_left_merge_empty_dataframe(self):
- left = DataFrame({"key": [1], "value": [2]})
- right = DataFrame({"key": []})
- result = merge(left, right, on="key", how="left")
- tm.assert_frame_equal(result, left)
- result = merge(right, left, on="key", how="right")
- tm.assert_frame_equal(result, left)
- @pytest.mark.parametrize(
- "kwarg",
- [
- {"left_index": True, "right_index": True},
- {"left_index": True, "right_on": "x"},
- {"left_on": "a", "right_index": True},
- {"left_on": "a", "right_on": "x"},
- ],
- )
- def test_merge_left_empty_right_empty(self, join_type, kwarg):
- # GH 10824
- left = DataFrame(columns=["a", "b", "c"])
- right = DataFrame(columns=["x", "y", "z"])
- exp_in = DataFrame(columns=["a", "b", "c", "x", "y", "z"], dtype=object)
- result = merge(left, right, how=join_type, **kwarg)
- tm.assert_frame_equal(result, exp_in)
- def test_merge_left_empty_right_notempty(self):
- # GH 10824
- left = DataFrame(columns=["a", "b", "c"])
- right = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["x", "y", "z"])
- exp_out = DataFrame(
- {
- "a": np.array([np.nan] * 3, dtype=object),
- "b": np.array([np.nan] * 3, dtype=object),
- "c": np.array([np.nan] * 3, dtype=object),
- "x": [1, 4, 7],
- "y": [2, 5, 8],
- "z": [3, 6, 9],
- },
- columns=["a", "b", "c", "x", "y", "z"],
- )
- exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
- def check1(exp, kwarg):
- result = merge(left, right, how="inner", **kwarg)
- tm.assert_frame_equal(result, exp)
- result = merge(left, right, how="left", **kwarg)
- tm.assert_frame_equal(result, exp)
- def check2(exp, kwarg):
- result = merge(left, right, how="right", **kwarg)
- tm.assert_frame_equal(result, exp)
- result = merge(left, right, how="outer", **kwarg)
- tm.assert_frame_equal(result, exp)
- for kwarg in [
- {"left_index": True, "right_index": True},
- {"left_index": True, "right_on": "x"},
- ]:
- check1(exp_in, kwarg)
- check2(exp_out, kwarg)
- kwarg = {"left_on": "a", "right_index": True}
- check1(exp_in, kwarg)
- exp_out["a"] = [0, 1, 2]
- check2(exp_out, kwarg)
- kwarg = {"left_on": "a", "right_on": "x"}
- check1(exp_in, kwarg)
- exp_out["a"] = np.array([np.nan] * 3, dtype=object)
- check2(exp_out, kwarg)
- def test_merge_left_notempty_right_empty(self):
- # GH 10824
- left = DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], columns=["a", "b", "c"])
- right = DataFrame(columns=["x", "y", "z"])
- exp_out = DataFrame(
- {
- "a": [1, 4, 7],
- "b": [2, 5, 8],
- "c": [3, 6, 9],
- "x": np.array([np.nan] * 3, dtype=object),
- "y": np.array([np.nan] * 3, dtype=object),
- "z": np.array([np.nan] * 3, dtype=object),
- },
- columns=["a", "b", "c", "x", "y", "z"],
- )
- exp_in = exp_out[0:0] # make empty DataFrame keeping dtype
- # result will have object dtype
- exp_in.index = exp_in.index.astype(object)
- def check1(exp, kwarg):
- result = merge(left, right, how="inner", **kwarg)
- tm.assert_frame_equal(result, exp)
- result = merge(left, right, how="right", **kwarg)
- tm.assert_frame_equal(result, exp)
- def check2(exp, kwarg):
- result = merge(left, right, how="left", **kwarg)
- tm.assert_frame_equal(result, exp)
- result = merge(left, right, how="outer", **kwarg)
- tm.assert_frame_equal(result, exp)
- # TODO: should the next loop be un-indented? doing so breaks this test
- for kwarg in [
- {"left_index": True, "right_index": True},
- {"left_index": True, "right_on": "x"},
- {"left_on": "a", "right_index": True},
- {"left_on": "a", "right_on": "x"},
- ]:
- check1(exp_in, kwarg)
- check2(exp_out, kwarg)
- def test_merge_empty_frame(self, series_of_dtype, series_of_dtype2):
- # GH 25183
- df = DataFrame(
- {"key": series_of_dtype, "value": series_of_dtype2},
- columns=["key", "value"],
- )
- df_empty = df[:0]
- expected = DataFrame(
- {
- "value_x": Series(dtype=df.dtypes["value"]),
- "key": Series(dtype=df.dtypes["key"]),
- "value_y": Series(dtype=df.dtypes["value"]),
- },
- columns=["value_x", "key", "value_y"],
- )
- actual = df_empty.merge(df, on="key")
- tm.assert_frame_equal(actual, expected)
- def test_merge_all_na_column(self, series_of_dtype, series_of_dtype_all_na):
- # GH 25183
- df_left = DataFrame(
- {"key": series_of_dtype, "value": series_of_dtype_all_na},
- columns=["key", "value"],
- )
- df_right = DataFrame(
- {"key": series_of_dtype, "value": series_of_dtype_all_na},
- columns=["key", "value"],
- )
- expected = DataFrame(
- {
- "key": series_of_dtype,
- "value_x": series_of_dtype_all_na,
- "value_y": series_of_dtype_all_na,
- },
- columns=["key", "value_x", "value_y"],
- )
- actual = df_left.merge(df_right, on="key")
- tm.assert_frame_equal(actual, expected)
- def test_merge_nosort(self):
- # GH#2098
- d = {
- "var1": np.random.randint(0, 10, size=10),
- "var2": np.random.randint(0, 10, size=10),
- "var3": [
- datetime(2012, 1, 12),
- datetime(2011, 2, 4),
- datetime(2010, 2, 3),
- datetime(2012, 1, 12),
- datetime(2011, 2, 4),
- datetime(2012, 4, 3),
- datetime(2012, 3, 4),
- datetime(2008, 5, 1),
- datetime(2010, 2, 3),
- datetime(2012, 2, 3),
- ],
- }
- df = DataFrame.from_dict(d)
- var3 = df.var3.unique()
- var3 = np.sort(var3)
- new = DataFrame.from_dict({"var3": var3, "var8": np.random.random(7)})
- result = df.merge(new, on="var3", sort=False)
- exp = merge(df, new, on="var3", sort=False)
- tm.assert_frame_equal(result, exp)
- assert (df.var3.unique() == result.var3.unique()).all()
- @pytest.mark.parametrize(
- ("sort", "values"), [(False, [1, 1, 0, 1, 1]), (True, [0, 1, 1, 1, 1])]
- )
- @pytest.mark.parametrize("how", ["left", "right"])
- def test_merge_same_order_left_right(self, sort, values, how):
- # GH#35382
- df = DataFrame({"a": [1, 0, 1]})
- result = df.merge(df, on="a", how=how, sort=sort)
- expected = DataFrame(values, columns=["a"])
- tm.assert_frame_equal(result, expected)
- def test_merge_nan_right(self):
- df1 = DataFrame({"i1": [0, 1], "i2": [0, 1]})
- df2 = DataFrame({"i1": [0], "i3": [0]})
- result = df1.join(df2, on="i1", rsuffix="_")
- expected = (
- DataFrame(
- {
- "i1": {0: 0.0, 1: 1},
- "i2": {0: 0, 1: 1},
- "i1_": {0: 0, 1: np.nan},
- "i3": {0: 0.0, 1: np.nan},
- None: {0: 0, 1: 0},
- }
- )
- .set_index(None)
- .reset_index()[["i1", "i2", "i1_", "i3"]]
- )
- tm.assert_frame_equal(result, expected, check_dtype=False)
- def test_merge_nan_right2(self):
- df1 = DataFrame({"i1": [0, 1], "i2": [0.5, 1.5]})
- df2 = DataFrame({"i1": [0], "i3": [0.7]})
- result = df1.join(df2, rsuffix="_", on="i1")
- expected = DataFrame(
- {
- "i1": {0: 0, 1: 1},
- "i1_": {0: 0.0, 1: np.nan},
- "i2": {0: 0.5, 1: 1.5},
- "i3": {0: 0.69999999999999996, 1: np.nan},
- }
- )[["i1", "i2", "i1_", "i3"]]
- tm.assert_frame_equal(result, expected)
- def test_merge_type(self, df, df2):
- class NotADataFrame(DataFrame):
- @property
- def _constructor(self):
- return NotADataFrame
- nad = NotADataFrame(df)
- result = nad.merge(df2, on="key1")
- assert isinstance(result, NotADataFrame)
- def test_join_append_timedeltas(self, using_array_manager):
- # timedelta64 issues with join/merge
- # GH 5695
- d = DataFrame.from_dict(
- {"d": [datetime(2013, 11, 5, 5, 56)], "t": [timedelta(0, 22500)]}
- )
- df = DataFrame(columns=list("dt"))
- df = concat([df, d], ignore_index=True)
- result = concat([df, d], ignore_index=True)
- expected = DataFrame(
- {
- "d": [datetime(2013, 11, 5, 5, 56), datetime(2013, 11, 5, 5, 56)],
- "t": [timedelta(0, 22500), timedelta(0, 22500)],
- }
- )
- if using_array_manager:
- # TODO(ArrayManager) decide on exact casting rules in concat
- expected = expected.astype(object)
- tm.assert_frame_equal(result, expected)
- def test_join_append_timedeltas2(self):
- # timedelta64 issues with join/merge
- # GH 5695
- td = np.timedelta64(300000000)
- lhs = DataFrame(Series([td, td], index=["A", "B"]))
- rhs = DataFrame(Series([td], index=["A"]))
- result = lhs.join(rhs, rsuffix="r", how="left")
- expected = DataFrame(
- {
- "0": Series([td, td], index=list("AB")),
- "0r": Series([td, pd.NaT], index=list("AB")),
- }
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"])
- def test_other_datetime_unit(self, unit):
- # GH 13389
- df1 = DataFrame({"entity_id": [101, 102]})
- ser = Series([None, None], index=[101, 102], name="days")
- dtype = f"datetime64[{unit}]"
- if unit in ["D", "h", "m"]:
- # not supported so we cast to the nearest supported unit, seconds
- exp_dtype = "datetime64[s]"
- else:
- exp_dtype = dtype
- df2 = ser.astype(exp_dtype).to_frame("days")
- assert df2["days"].dtype == exp_dtype
- result = df1.merge(df2, left_on="entity_id", right_index=True)
- days = np.array(["nat", "nat"], dtype=exp_dtype)
- days = pd.core.arrays.DatetimeArray._simple_new(days, dtype=days.dtype)
- exp = DataFrame(
- {
- "entity_id": [101, 102],
- "days": days,
- },
- columns=["entity_id", "days"],
- )
- assert exp["days"].dtype == exp_dtype
- tm.assert_frame_equal(result, exp)
- @pytest.mark.parametrize("unit", ["D", "h", "m", "s", "ms", "us", "ns"])
- def test_other_timedelta_unit(self, unit):
- # GH 13389
- df1 = DataFrame({"entity_id": [101, 102]})
- ser = Series([None, None], index=[101, 102], name="days")
- dtype = f"m8[{unit}]"
- if unit in ["D", "h", "m"]:
- # We cannot astype, instead do nearest supported unit, i.e. "s"
- msg = "Supported resolutions are 's', 'ms', 'us', 'ns'"
- with pytest.raises(ValueError, match=msg):
- ser.astype(dtype)
- df2 = ser.astype("m8[s]").to_frame("days")
- else:
- df2 = ser.astype(dtype).to_frame("days")
- assert df2["days"].dtype == dtype
- result = df1.merge(df2, left_on="entity_id", right_index=True)
- exp = DataFrame(
- {"entity_id": [101, 102], "days": np.array(["nat", "nat"], dtype=dtype)},
- columns=["entity_id", "days"],
- )
- tm.assert_frame_equal(result, exp)
- def test_overlapping_columns_error_message(self):
- df = DataFrame({"key": [1, 2, 3], "v1": [4, 5, 6], "v2": [7, 8, 9]})
- df2 = DataFrame({"key": [1, 2, 3], "v1": [4, 5, 6], "v2": [7, 8, 9]})
- df.columns = ["key", "foo", "foo"]
- df2.columns = ["key", "bar", "bar"]
- expected = DataFrame(
- {
- "key": [1, 2, 3],
- "v1": [4, 5, 6],
- "v2": [7, 8, 9],
- "v3": [4, 5, 6],
- "v4": [7, 8, 9],
- }
- )
- expected.columns = ["key", "foo", "foo", "bar", "bar"]
- tm.assert_frame_equal(merge(df, df2), expected)
- # #2649, #10639
- df2.columns = ["key1", "foo", "foo"]
- msg = r"Data columns not unique: Index\(\['foo'\], dtype='object'\)"
- with pytest.raises(MergeError, match=msg):
- merge(df, df2)
- def test_merge_on_datetime64tz(self):
- # GH11405
- left = DataFrame(
- {
- "key": pd.date_range("20151010", periods=2, tz="US/Eastern"),
- "value": [1, 2],
- }
- )
- right = DataFrame(
- {
- "key": pd.date_range("20151011", periods=3, tz="US/Eastern"),
- "value": [1, 2, 3],
- }
- )
- expected = DataFrame(
- {
- "key": pd.date_range("20151010", periods=4, tz="US/Eastern"),
- "value_x": [1, 2, np.nan, np.nan],
- "value_y": [np.nan, 1, 2, 3],
- }
- )
- result = merge(left, right, on="key", how="outer")
- tm.assert_frame_equal(result, expected)
- def test_merge_datetime64tz_values(self):
- left = DataFrame(
- {
- "key": [1, 2],
- "value": pd.date_range("20151010", periods=2, tz="US/Eastern"),
- }
- )
- right = DataFrame(
- {
- "key": [2, 3],
- "value": pd.date_range("20151011", periods=2, tz="US/Eastern"),
- }
- )
- expected = DataFrame(
- {
- "key": [1, 2, 3],
- "value_x": list(pd.date_range("20151010", periods=2, tz="US/Eastern"))
- + [pd.NaT],
- "value_y": [pd.NaT]
- + list(pd.date_range("20151011", periods=2, tz="US/Eastern")),
- }
- )
- result = merge(left, right, on="key", how="outer")
- tm.assert_frame_equal(result, expected)
- assert result["value_x"].dtype == "datetime64[ns, US/Eastern]"
- assert result["value_y"].dtype == "datetime64[ns, US/Eastern]"
- def test_merge_on_datetime64tz_empty(self):
- # https://github.com/pandas-dev/pandas/issues/25014
- dtz = pd.DatetimeTZDtype(tz="UTC")
- right = DataFrame(
- {
- "date": [pd.Timestamp("2018", tz=dtz.tz)],
- "value": [4.0],
- "date2": [pd.Timestamp("2019", tz=dtz.tz)],
- },
- columns=["date", "value", "date2"],
- )
- left = right[:0]
- result = left.merge(right, on="date")
- expected = DataFrame(
- {
- "value_x": Series(dtype=float),
- "date2_x": Series(dtype=dtz),
- "date": Series(dtype=dtz),
- "value_y": Series(dtype=float),
- "date2_y": Series(dtype=dtz),
- },
- columns=["value_x", "date2_x", "date", "value_y", "date2_y"],
- )
- tm.assert_frame_equal(result, expected)
- def test_merge_datetime64tz_with_dst_transition(self):
- # GH 18885
- df1 = DataFrame(
- pd.date_range("2017-10-29 01:00", periods=4, freq="H", tz="Europe/Madrid"),
- columns=["date"],
- )
- df1["value"] = 1
- df2 = DataFrame(
- {
- "date": pd.to_datetime(
- [
- "2017-10-29 03:00:00",
- "2017-10-29 04:00:00",
- "2017-10-29 05:00:00",
- ]
- ),
- "value": 2,
- }
- )
- df2["date"] = df2["date"].dt.tz_localize("UTC").dt.tz_convert("Europe/Madrid")
- result = merge(df1, df2, how="outer", on="date")
- expected = DataFrame(
- {
- "date": pd.date_range(
- "2017-10-29 01:00", periods=7, freq="H", tz="Europe/Madrid"
- ),
- "value_x": [1] * 4 + [np.nan] * 3,
- "value_y": [np.nan] * 4 + [2] * 3,
- }
- )
- tm.assert_frame_equal(result, expected)
- def test_merge_non_unique_period_index(self):
- # GH #16871
- index = pd.period_range("2016-01-01", periods=16, freq="M")
- df = DataFrame(list(range(len(index))), index=index, columns=["pnum"])
- df2 = concat([df, df])
- result = df.merge(df2, left_index=True, right_index=True, how="inner")
- expected = DataFrame(
- np.tile(np.arange(16, dtype=np.int64).repeat(2).reshape(-1, 1), 2),
- columns=["pnum_x", "pnum_y"],
- index=df2.sort_index().index,
- )
- tm.assert_frame_equal(result, expected)
- def test_merge_on_periods(self):
- left = DataFrame(
- {"key": pd.period_range("20151010", periods=2, freq="D"), "value": [1, 2]}
- )
- right = DataFrame(
- {
- "key": pd.period_range("20151011", periods=3, freq="D"),
- "value": [1, 2, 3],
- }
- )
- expected = DataFrame(
- {
- "key": pd.period_range("20151010", periods=4, freq="D"),
- "value_x": [1, 2, np.nan, np.nan],
- "value_y": [np.nan, 1, 2, 3],
- }
- )
- result = merge(left, right, on="key", how="outer")
- tm.assert_frame_equal(result, expected)
- def test_merge_period_values(self):
- left = DataFrame(
- {"key": [1, 2], "value": pd.period_range("20151010", periods=2, freq="D")}
- )
- right = DataFrame(
- {"key": [2, 3], "value": pd.period_range("20151011", periods=2, freq="D")}
- )
- exp_x = pd.period_range("20151010", periods=2, freq="D")
- exp_y = pd.period_range("20151011", periods=2, freq="D")
- expected = DataFrame(
- {
- "key": [1, 2, 3],
- "value_x": list(exp_x) + [pd.NaT],
- "value_y": [pd.NaT] + list(exp_y),
- }
- )
- result = merge(left, right, on="key", how="outer")
- tm.assert_frame_equal(result, expected)
- assert result["value_x"].dtype == "Period[D]"
- assert result["value_y"].dtype == "Period[D]"
- def test_indicator(self, dfs_for_indicator):
- # PR #10054. xref #7412 and closes #8790.
- df1, df2 = dfs_for_indicator
- df1_copy = df1.copy()
- df2_copy = df2.copy()
- df_result = DataFrame(
- {
- "col1": [0, 1, 2, 3, 4, 5],
- "col_conflict_x": [1, 2, np.nan, np.nan, np.nan, np.nan],
- "col_left": ["a", "b", np.nan, np.nan, np.nan, np.nan],
- "col_conflict_y": [np.nan, 1, 2, 3, 4, 5],
- "col_right": [np.nan, 2, 2, 2, 2, 2],
- }
- )
- df_result["_merge"] = Categorical(
- [
- "left_only",
- "both",
- "right_only",
- "right_only",
- "right_only",
- "right_only",
- ],
- categories=["left_only", "right_only", "both"],
- )
- df_result = df_result[
- [
- "col1",
- "col_conflict_x",
- "col_left",
- "col_conflict_y",
- "col_right",
- "_merge",
- ]
- ]
- test = merge(df1, df2, on="col1", how="outer", indicator=True)
- tm.assert_frame_equal(test, df_result)
- test = df1.merge(df2, on="col1", how="outer", indicator=True)
- tm.assert_frame_equal(test, df_result)
- # No side effects
- tm.assert_frame_equal(df1, df1_copy)
- tm.assert_frame_equal(df2, df2_copy)
- # Check with custom name
- df_result_custom_name = df_result
- df_result_custom_name = df_result_custom_name.rename(
- columns={"_merge": "custom_name"}
- )
- test_custom_name = merge(
- df1, df2, on="col1", how="outer", indicator="custom_name"
- )
- tm.assert_frame_equal(test_custom_name, df_result_custom_name)
- test_custom_name = df1.merge(
- df2, on="col1", how="outer", indicator="custom_name"
- )
- tm.assert_frame_equal(test_custom_name, df_result_custom_name)
- def test_merge_indicator_arg_validation(self, dfs_for_indicator):
- # Check only accepts strings and booleans
- df1, df2 = dfs_for_indicator
- msg = "indicator option can only accept boolean or string arguments"
- with pytest.raises(ValueError, match=msg):
- merge(df1, df2, on="col1", how="outer", indicator=5)
- with pytest.raises(ValueError, match=msg):
- df1.merge(df2, on="col1", how="outer", indicator=5)
- def test_merge_indicator_result_integrity(self, dfs_for_indicator):
- # Check result integrity
- df1, df2 = dfs_for_indicator
- test2 = merge(df1, df2, on="col1", how="left", indicator=True)
- assert (test2._merge != "right_only").all()
- test2 = df1.merge(df2, on="col1", how="left", indicator=True)
- assert (test2._merge != "right_only").all()
- test3 = merge(df1, df2, on="col1", how="right", indicator=True)
- assert (test3._merge != "left_only").all()
- test3 = df1.merge(df2, on="col1", how="right", indicator=True)
- assert (test3._merge != "left_only").all()
- test4 = merge(df1, df2, on="col1", how="inner", indicator=True)
- assert (test4._merge == "both").all()
- test4 = df1.merge(df2, on="col1", how="inner", indicator=True)
- assert (test4._merge == "both").all()
- def test_merge_indicator_invalid(self, dfs_for_indicator):
- # Check if working name in df
- df1, _ = dfs_for_indicator
- for i in ["_right_indicator", "_left_indicator", "_merge"]:
- df_badcolumn = DataFrame({"col1": [1, 2], i: [2, 2]})
- msg = (
- "Cannot use `indicator=True` option when data contains a "
- f"column named {i}|"
- "Cannot use name of an existing column for indicator column"
- )
- with pytest.raises(ValueError, match=msg):
- merge(df1, df_badcolumn, on="col1", how="outer", indicator=True)
- with pytest.raises(ValueError, match=msg):
- df1.merge(df_badcolumn, on="col1", how="outer", indicator=True)
- # Check for name conflict with custom name
- df_badcolumn = DataFrame({"col1": [1, 2], "custom_column_name": [2, 2]})
- msg = "Cannot use name of an existing column for indicator column"
- with pytest.raises(ValueError, match=msg):
- merge(
- df1,
- df_badcolumn,
- on="col1",
- how="outer",
- indicator="custom_column_name",
- )
- with pytest.raises(ValueError, match=msg):
- df1.merge(
- df_badcolumn, on="col1", how="outer", indicator="custom_column_name"
- )
- def test_merge_indicator_multiple_columns(self):
- # Merge on multiple columns
- df3 = DataFrame({"col1": [0, 1], "col2": ["a", "b"]})
- df4 = DataFrame({"col1": [1, 1, 3], "col2": ["b", "x", "y"]})
- hand_coded_result = DataFrame(
- {"col1": [0, 1, 1, 3], "col2": ["a", "b", "x", "y"]}
- )
- hand_coded_result["_merge"] = Categorical(
- ["left_only", "both", "right_only", "right_only"],
- categories=["left_only", "right_only", "both"],
- )
- test5 = merge(df3, df4, on=["col1", "col2"], how="outer", indicator=True)
- tm.assert_frame_equal(test5, hand_coded_result)
- test5 = df3.merge(df4, on=["col1", "col2"], how="outer", indicator=True)
- tm.assert_frame_equal(test5, hand_coded_result)
- def test_validation(self):
- left = DataFrame(
- {"a": ["a", "b", "c", "d"], "b": ["cat", "dog", "weasel", "horse"]},
- index=range(4),
- )
- right = DataFrame(
- {
- "a": ["a", "b", "c", "d", "e"],
- "c": ["meow", "bark", "um... weasel noise?", "nay", "chirp"],
- },
- index=range(5),
- )
- # Make sure no side effects.
- left_copy = left.copy()
- right_copy = right.copy()
- result = merge(left, right, left_index=True, right_index=True, validate="1:1")
- tm.assert_frame_equal(left, left_copy)
- tm.assert_frame_equal(right, right_copy)
- # make sure merge still correct
- expected = DataFrame(
- {
- "a_x": ["a", "b", "c", "d"],
- "b": ["cat", "dog", "weasel", "horse"],
- "a_y": ["a", "b", "c", "d"],
- "c": ["meow", "bark", "um... weasel noise?", "nay"],
- },
- index=range(4),
- columns=["a_x", "b", "a_y", "c"],
- )
- result = merge(
- left, right, left_index=True, right_index=True, validate="one_to_one"
- )
- tm.assert_frame_equal(result, expected)
- expected_2 = DataFrame(
- {
- "a": ["a", "b", "c", "d"],
- "b": ["cat", "dog", "weasel", "horse"],
- "c": ["meow", "bark", "um... weasel noise?", "nay"],
- },
- index=range(4),
- )
- result = merge(left, right, on="a", validate="1:1")
- tm.assert_frame_equal(left, left_copy)
- tm.assert_frame_equal(right, right_copy)
- tm.assert_frame_equal(result, expected_2)
- result = merge(left, right, on="a", validate="one_to_one")
- tm.assert_frame_equal(result, expected_2)
- # One index, one column
- expected_3 = DataFrame(
- {
- "b": ["cat", "dog", "weasel", "horse"],
- "a": ["a", "b", "c", "d"],
- "c": ["meow", "bark", "um... weasel noise?", "nay"],
- },
- columns=["b", "a", "c"],
- index=range(4),
- )
- left_index_reset = left.set_index("a")
- result = merge(
- left_index_reset,
- right,
- left_index=True,
- right_on="a",
- validate="one_to_one",
- )
- tm.assert_frame_equal(result, expected_3)
- # Dups on right
- right_w_dups = concat([right, DataFrame({"a": ["e"], "c": ["moo"]}, index=[4])])
- merge(
- left,
- right_w_dups,
- left_index=True,
- right_index=True,
- validate="one_to_many",
- )
- msg = "Merge keys are not unique in right dataset; not a one-to-one merge"
- with pytest.raises(MergeError, match=msg):
- merge(
- left,
- right_w_dups,
- left_index=True,
- right_index=True,
- validate="one_to_one",
- )
- with pytest.raises(MergeError, match=msg):
- merge(left, right_w_dups, on="a", validate="one_to_one")
- # Dups on left
- left_w_dups = concat(
- [left, DataFrame({"a": ["a"], "c": ["cow"]}, index=[3])], sort=True
- )
- merge(
- left_w_dups,
- right,
- left_index=True,
- right_index=True,
- validate="many_to_one",
- )
- msg = "Merge keys are not unique in left dataset; not a one-to-one merge"
- with pytest.raises(MergeError, match=msg):
- merge(
- left_w_dups,
- right,
- left_index=True,
- right_index=True,
- validate="one_to_one",
- )
- with pytest.raises(MergeError, match=msg):
- merge(left_w_dups, right, on="a", validate="one_to_one")
- # Dups on both
- merge(left_w_dups, right_w_dups, on="a", validate="many_to_many")
- msg = "Merge keys are not unique in right dataset; not a many-to-one merge"
- with pytest.raises(MergeError, match=msg):
- merge(
- left_w_dups,
- right_w_dups,
- left_index=True,
- right_index=True,
- validate="many_to_one",
- )
- msg = "Merge keys are not unique in left dataset; not a one-to-many merge"
- with pytest.raises(MergeError, match=msg):
- merge(left_w_dups, right_w_dups, on="a", validate="one_to_many")
- # Check invalid arguments
- msg = (
- '"jibberish" is not a valid argument. '
- "Valid arguments are:\n"
- '- "1:1"\n'
- '- "1:m"\n'
- '- "m:1"\n'
- '- "m:m"\n'
- '- "one_to_one"\n'
- '- "one_to_many"\n'
- '- "many_to_one"\n'
- '- "many_to_many"'
- )
- with pytest.raises(ValueError, match=msg):
- merge(left, right, on="a", validate="jibberish")
- # Two column merge, dups in both, but jointly no dups.
- left = DataFrame(
- {
- "a": ["a", "a", "b", "b"],
- "b": [0, 1, 0, 1],
- "c": ["cat", "dog", "weasel", "horse"],
- },
- index=range(4),
- )
- right = DataFrame(
- {
- "a": ["a", "a", "b"],
- "b": [0, 1, 0],
- "d": ["meow", "bark", "um... weasel noise?"],
- },
- index=range(3),
- )
- expected_multi = DataFrame(
- {
- "a": ["a", "a", "b"],
- "b": [0, 1, 0],
- "c": ["cat", "dog", "weasel"],
- "d": ["meow", "bark", "um... weasel noise?"],
- },
- index=range(3),
- )
- msg = (
- "Merge keys are not unique in either left or right dataset; "
- "not a one-to-one merge"
- )
- with pytest.raises(MergeError, match=msg):
- merge(left, right, on="a", validate="1:1")
- result = merge(left, right, on=["a", "b"], validate="1:1")
- tm.assert_frame_equal(result, expected_multi)
- def test_merge_two_empty_df_no_division_error(self):
- # GH17776, PR #17846
- a = DataFrame({"a": [], "b": [], "c": []})
- with np.errstate(divide="raise"):
- merge(a, a, on=("a", "b"))
- @pytest.mark.parametrize("how", ["right", "outer"])
- @pytest.mark.parametrize(
- "index,expected_index",
- [
- (
- CategoricalIndex([1, 2, 4]),
- CategoricalIndex([1, 2, 4, None, None, None]),
- ),
- (
- DatetimeIndex(["2001-01-01", "2002-02-02", "2003-03-03"]),
- DatetimeIndex(
- ["2001-01-01", "2002-02-02", "2003-03-03", pd.NaT, pd.NaT, pd.NaT]
- ),
- ),
- *[
- (
- Index([1, 2, 3], dtype=dtyp),
- Index([1, 2, 3, None, None, None], dtype=np.float64),
- )
- for dtyp in tm.ALL_REAL_NUMPY_DTYPES
- ],
- (
- IntervalIndex.from_tuples([(1, 2), (2, 3), (3, 4)]),
- IntervalIndex.from_tuples(
- [(1, 2), (2, 3), (3, 4), np.nan, np.nan, np.nan]
- ),
- ),
- (
- PeriodIndex(["2001-01-01", "2001-01-02", "2001-01-03"], freq="D"),
- PeriodIndex(
- ["2001-01-01", "2001-01-02", "2001-01-03", pd.NaT, pd.NaT, pd.NaT],
- freq="D",
- ),
- ),
- (
- TimedeltaIndex(["1d", "2d", "3d"]),
- TimedeltaIndex(["1d", "2d", "3d", pd.NaT, pd.NaT, pd.NaT]),
- ),
- ],
- )
- def test_merge_on_index_with_more_values(self, how, index, expected_index):
- # GH 24212
- # pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
- # -1 is interpreted as a missing value instead of the last element
- df1 = DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index)
- df2 = DataFrame({"b": [0, 1, 2, 3, 4, 5]})
- result = df1.merge(df2, left_on="key", right_index=True, how=how)
- expected = DataFrame(
- [
- [0, 0, 0],
- [1, 1, 1],
- [2, 2, 2],
- [np.nan, 3, 3],
- [np.nan, 4, 4],
- [np.nan, 5, 5],
- ],
- columns=["a", "key", "b"],
- )
- expected.set_index(expected_index, inplace=True)
- tm.assert_frame_equal(result, expected)
- def test_merge_right_index_right(self):
- # Note: the expected output here is probably incorrect.
- # See https://github.com/pandas-dev/pandas/issues/17257 for more.
- # We include this as a regression test for GH-24897.
- left = DataFrame({"a": [1, 2, 3], "key": [0, 1, 1]})
- right = DataFrame({"b": [1, 2, 3]})
- expected = DataFrame(
- {"a": [1, 2, 3, None], "key": [0, 1, 1, 2], "b": [1, 2, 2, 3]},
- columns=["a", "key", "b"],
- index=[0, 1, 2, np.nan],
- )
- result = left.merge(right, left_on="key", right_index=True, how="right")
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("how", ["left", "right"])
- def test_merge_preserves_row_order(self, how):
- # GH 27453
- left_df = DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]})
- right_df = DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]})
- result = left_df.merge(right_df, on=["animal", "max_speed"], how=how)
- if how == "right":
- expected = DataFrame({"animal": ["quetzal", "pig"], "max_speed": [80, 11]})
- else:
- expected = DataFrame({"animal": ["dog", "pig"], "max_speed": [40, 11]})
- tm.assert_frame_equal(result, expected)
- def test_merge_take_missing_values_from_index_of_other_dtype(self):
- # GH 24212
- left = DataFrame(
- {
- "a": [1, 2, 3],
- "key": Categorical(["a", "a", "b"], categories=list("abc")),
- }
- )
- right = DataFrame({"b": [1, 2, 3]}, index=CategoricalIndex(["a", "b", "c"]))
- result = left.merge(right, left_on="key", right_index=True, how="right")
- expected = DataFrame(
- {
- "a": [1, 2, 3, None],
- "key": Categorical(["a", "a", "b", "c"]),
- "b": [1, 1, 2, 3],
- },
- index=[0, 1, 2, np.nan],
- )
- expected = expected.reindex(columns=["a", "key", "b"])
- tm.assert_frame_equal(result, expected)
- def test_merge_readonly(self):
- # https://github.com/pandas-dev/pandas/issues/27943
- data1 = DataFrame(
- np.arange(20).reshape((4, 5)) + 1, columns=["a", "b", "c", "d", "e"]
- )
- data2 = DataFrame(
- np.arange(20).reshape((5, 4)) + 1, columns=["a", "b", "x", "y"]
- )
- # make each underlying block array / column array read-only
- for arr in data1._mgr.arrays:
- arr.flags.writeable = False
- data1.merge(data2) # no error
- def _check_merge(x, y):
- for how in ["inner", "left", "outer"]:
- result = x.join(y, how=how)
- expected = merge(x.reset_index(), y.reset_index(), how=how, sort=True)
- expected = expected.set_index("index")
- # TODO check_names on merge?
- tm.assert_frame_equal(result, expected, check_names=False)
- class TestMergeDtypes:
- @pytest.mark.parametrize(
- "right_vals", [["foo", "bar"], Series(["foo", "bar"]).astype("category")]
- )
- def test_different(self, right_vals):
- left = DataFrame(
- {
- "A": ["foo", "bar"],
- "B": Series(["foo", "bar"]).astype("category"),
- "C": [1, 2],
- "D": [1.0, 2.0],
- "E": Series([1, 2], dtype="uint64"),
- "F": Series([1, 2], dtype="int32"),
- }
- )
- right = DataFrame({"A": right_vals})
- # GH 9780
- # We allow merging on object and categorical cols and cast
- # categorical cols to object
- result = merge(left, right, on="A")
- assert is_object_dtype(result.A.dtype)
- @pytest.mark.parametrize(
- "d1", [np.int64, np.int32, np.intc, np.int16, np.int8, np.uint8]
- )
- @pytest.mark.parametrize("d2", [np.int64, np.float64, np.float32, np.float16])
- def test_join_multi_dtypes(self, d1, d2):
- dtype1 = np.dtype(d1)
- dtype2 = np.dtype(d2)
- left = DataFrame(
- {
- "k1": np.array([0, 1, 2] * 8, dtype=dtype1),
- "k2": ["foo", "bar"] * 12,
- "v": np.array(np.arange(24), dtype=np.int64),
- }
- )
- index = MultiIndex.from_tuples([(2, "bar"), (1, "foo")])
- right = DataFrame({"v2": np.array([5, 7], dtype=dtype2)}, index=index)
- result = left.join(right, on=["k1", "k2"])
- expected = left.copy()
- if dtype2.kind == "i":
- dtype2 = np.dtype("float64")
- expected["v2"] = np.array(np.nan, dtype=dtype2)
- expected.loc[(expected.k1 == 2) & (expected.k2 == "bar"), "v2"] = 5
- expected.loc[(expected.k1 == 1) & (expected.k2 == "foo"), "v2"] = 7
- tm.assert_frame_equal(result, expected)
- result = left.join(right, on=["k1", "k2"], sort=True)
- expected.sort_values(["k1", "k2"], kind="mergesort", inplace=True)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "int_vals, float_vals, exp_vals",
- [
- ([1, 2, 3], [1.0, 2.0, 3.0], {"X": [1, 2, 3], "Y": [1.0, 2.0, 3.0]}),
- ([1, 2, 3], [1.0, 3.0], {"X": [1, 3], "Y": [1.0, 3.0]}),
- ([1, 2], [1.0, 2.0, 3.0], {"X": [1, 2], "Y": [1.0, 2.0]}),
- ],
- )
- def test_merge_on_ints_floats(self, int_vals, float_vals, exp_vals):
- # GH 16572
- # Check that float column is not cast to object if
- # merging on float and int columns
- A = DataFrame({"X": int_vals})
- B = DataFrame({"Y": float_vals})
- expected = DataFrame(exp_vals)
- result = A.merge(B, left_on="X", right_on="Y")
- tm.assert_frame_equal(result, expected)
- result = B.merge(A, left_on="Y", right_on="X")
- tm.assert_frame_equal(result, expected[["Y", "X"]])
- def test_merge_key_dtype_cast(self):
- # GH 17044
- df1 = DataFrame({"key": [1.0, 2.0], "v1": [10, 20]}, columns=["key", "v1"])
- df2 = DataFrame({"key": [2], "v2": [200]}, columns=["key", "v2"])
- result = df1.merge(df2, on="key", how="left")
- expected = DataFrame(
- {"key": [1.0, 2.0], "v1": [10, 20], "v2": [np.nan, 200.0]},
- columns=["key", "v1", "v2"],
- )
- tm.assert_frame_equal(result, expected)
- def test_merge_on_ints_floats_warning(self):
- # GH 16572
- # merge will produce a warning when merging on int and
- # float columns where the float values are not exactly
- # equal to their int representation
- A = DataFrame({"X": [1, 2, 3]})
- B = DataFrame({"Y": [1.1, 2.5, 3.0]})
- expected = DataFrame({"X": [3], "Y": [3.0]})
- with tm.assert_produces_warning(UserWarning):
- result = A.merge(B, left_on="X", right_on="Y")
- tm.assert_frame_equal(result, expected)
- with tm.assert_produces_warning(UserWarning):
- result = B.merge(A, left_on="Y", right_on="X")
- tm.assert_frame_equal(result, expected[["Y", "X"]])
- # test no warning if float has NaNs
- B = DataFrame({"Y": [np.nan, np.nan, 3.0]})
- with tm.assert_produces_warning(None):
- result = B.merge(A, left_on="Y", right_on="X")
- tm.assert_frame_equal(result, expected[["Y", "X"]])
- def test_merge_incompat_infer_boolean_object(self):
- # GH21119: bool + object bool merge OK
- df1 = DataFrame({"key": Series([True, False], dtype=object)})
- df2 = DataFrame({"key": [True, False]})
- expected = DataFrame({"key": [True, False]}, dtype=object)
- result = merge(df1, df2, on="key")
- tm.assert_frame_equal(result, expected)
- result = merge(df2, df1, on="key")
- tm.assert_frame_equal(result, expected)
- def test_merge_incompat_infer_boolean_object_with_missing(self):
- # GH21119: bool + object bool merge OK
- # with missing value
- df1 = DataFrame({"key": Series([True, False, np.nan], dtype=object)})
- df2 = DataFrame({"key": [True, False]})
- expected = DataFrame({"key": [True, False]}, dtype=object)
- result = merge(df1, df2, on="key")
- tm.assert_frame_equal(result, expected)
- result = merge(df2, df1, on="key")
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "df1_vals, df2_vals",
- [
- # merge on category coerces to object
- ([0, 1, 2], Series(["a", "b", "a"]).astype("category")),
- ([0.0, 1.0, 2.0], Series(["a", "b", "a"]).astype("category")),
- # no not infer
- ([0, 1], Series([False, True], dtype=object)),
- ([0, 1], Series([False, True], dtype=bool)),
- ],
- )
- def test_merge_incompat_dtypes_are_ok(self, df1_vals, df2_vals):
- # these are explicitly allowed incompat merges, that pass thru
- # the result type is dependent on if the values on the rhs are
- # inferred, otherwise these will be coerced to object
- df1 = DataFrame({"A": df1_vals})
- df2 = DataFrame({"A": df2_vals})
- result = merge(df1, df2, on=["A"])
- assert is_object_dtype(result.A.dtype)
- result = merge(df2, df1, on=["A"])
- assert is_object_dtype(result.A.dtype)
- @pytest.mark.parametrize(
- "df1_vals, df2_vals",
- [
- # do not infer to numeric
- (Series([1, 2], dtype="uint64"), ["a", "b", "c"]),
- (Series([1, 2], dtype="int32"), ["a", "b", "c"]),
- ([0, 1, 2], ["0", "1", "2"]),
- ([0.0, 1.0, 2.0], ["0", "1", "2"]),
- ([0, 1, 2], ["0", "1", "2"]),
- (
- pd.date_range("1/1/2011", periods=2, freq="D"),
- ["2011-01-01", "2011-01-02"],
- ),
- (pd.date_range("1/1/2011", periods=2, freq="D"), [0, 1]),
- (pd.date_range("1/1/2011", periods=2, freq="D"), [0.0, 1.0]),
- (
- pd.date_range("20130101", periods=3),
- pd.date_range("20130101", periods=3, tz="US/Eastern"),
- ),
- ],
- )
- def test_merge_incompat_dtypes_error(self, df1_vals, df2_vals):
- # GH 9780, GH 15800
- # Raise a ValueError when a user tries to merge on
- # dtypes that are incompatible (e.g., obj and int/float)
- df1 = DataFrame({"A": df1_vals})
- df2 = DataFrame({"A": df2_vals})
- msg = (
- f"You are trying to merge on {df1['A'].dtype} and "
- f"{df2['A'].dtype} columns. If you wish to proceed "
- "you should use pd.concat"
- )
- msg = re.escape(msg)
- with pytest.raises(ValueError, match=msg):
- merge(df1, df2, on=["A"])
- # Check that error still raised when swapping order of dataframes
- msg = (
- f"You are trying to merge on {df2['A'].dtype} and "
- f"{df1['A'].dtype} columns. If you wish to proceed "
- "you should use pd.concat"
- )
- msg = re.escape(msg)
- with pytest.raises(ValueError, match=msg):
- merge(df2, df1, on=["A"])
- @pytest.mark.parametrize(
- "expected_data, how",
- [
- ([1, 2], "outer"),
- ([], "inner"),
- ([2], "right"),
- ([1], "left"),
- ],
- )
- def test_merge_EA_dtype(self, any_numeric_ea_dtype, how, expected_data):
- # GH#40073
- d1 = DataFrame([(1,)], columns=["id"], dtype=any_numeric_ea_dtype)
- d2 = DataFrame([(2,)], columns=["id"], dtype=any_numeric_ea_dtype)
- result = merge(d1, d2, how=how)
- exp_index = RangeIndex(len(expected_data))
- expected = DataFrame(
- expected_data, index=exp_index, columns=["id"], dtype=any_numeric_ea_dtype
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "expected_data, how",
- [
- (["a", "b"], "outer"),
- ([], "inner"),
- (["b"], "right"),
- (["a"], "left"),
- ],
- )
- def test_merge_string_dtype(self, how, expected_data, any_string_dtype):
- # GH#40073
- d1 = DataFrame([("a",)], columns=["id"], dtype=any_string_dtype)
- d2 = DataFrame([("b",)], columns=["id"], dtype=any_string_dtype)
- result = merge(d1, d2, how=how)
- exp_idx = RangeIndex(len(expected_data))
- expected = DataFrame(
- expected_data, index=exp_idx, columns=["id"], dtype=any_string_dtype
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "how, expected_data",
- [
- ("inner", [[True, 1, 4], [False, 5, 3]]),
- ("outer", [[True, 1, 4], [False, 5, 3]]),
- ("left", [[True, 1, 4], [False, 5, 3]]),
- ("right", [[False, 5, 3], [True, 1, 4]]),
- ],
- )
- def test_merge_bool_dtype(self, how, expected_data):
- # GH#40073
- df1 = DataFrame({"A": [True, False], "B": [1, 5]})
- df2 = DataFrame({"A": [False, True], "C": [3, 4]})
- result = merge(df1, df2, how=how)
- expected = DataFrame(expected_data, columns=["A", "B", "C"])
- tm.assert_frame_equal(result, expected)
- def test_merge_ea_with_string(self, join_type, string_dtype):
- # GH 43734 Avoid the use of `assign` with multi-index
- df1 = DataFrame(
- data={
- ("lvl0", "lvl1-a"): ["1", "2", "3", "4", None],
- ("lvl0", "lvl1-b"): ["4", "5", "6", "7", "8"],
- },
- dtype=pd.StringDtype(),
- )
- df1_copy = df1.copy()
- df2 = DataFrame(
- data={
- ("lvl0", "lvl1-a"): ["1", "2", "3", pd.NA, "5"],
- ("lvl0", "lvl1-c"): ["7", "8", "9", pd.NA, "11"],
- },
- dtype=string_dtype,
- )
- df2_copy = df2.copy()
- merged = merge(left=df1, right=df2, on=[("lvl0", "lvl1-a")], how=join_type)
- # No change in df1 and df2
- tm.assert_frame_equal(df1, df1_copy)
- tm.assert_frame_equal(df2, df2_copy)
- # Check the expected types for the merged data frame
- expected = Series(
- [np.dtype("O"), pd.StringDtype(), np.dtype("O")],
- index=MultiIndex.from_tuples(
- [("lvl0", "lvl1-a"), ("lvl0", "lvl1-b"), ("lvl0", "lvl1-c")]
- ),
- )
- tm.assert_series_equal(merged.dtypes, expected)
- @pytest.mark.parametrize(
- "left_empty, how, exp",
- [
- (False, "left", "left"),
- (False, "right", "empty"),
- (False, "inner", "empty"),
- (False, "outer", "left"),
- (False, "cross", "empty_cross"),
- (True, "left", "empty"),
- (True, "right", "right"),
- (True, "inner", "empty"),
- (True, "outer", "right"),
- (True, "cross", "empty_cross"),
- ],
- )
- def test_merge_empty(self, left_empty, how, exp):
- left = DataFrame({"A": [2, 1], "B": [3, 4]})
- right = DataFrame({"A": [1], "C": [5]}, dtype="int64")
- if left_empty:
- left = left.head(0)
- else:
- right = right.head(0)
- result = left.merge(right, how=how)
- if exp == "left":
- expected = DataFrame({"A": [2, 1], "B": [3, 4], "C": [np.nan, np.nan]})
- elif exp == "right":
- expected = DataFrame({"B": [np.nan], "A": [1], "C": [5]})
- elif exp == "empty":
- expected = DataFrame(columns=["A", "B", "C"], dtype="int64")
- if left_empty:
- expected = expected[["B", "A", "C"]]
- elif exp == "empty_cross":
- expected = DataFrame(columns=["A_x", "B", "A_y", "C"], dtype="int64")
- tm.assert_frame_equal(result, expected)
- @pytest.fixture
- def left():
- np.random.seed(1234)
- return DataFrame(
- {
- "X": Series(np.random.choice(["foo", "bar"], size=(10,))).astype(
- CDT(["foo", "bar"])
- ),
- "Y": np.random.choice(["one", "two", "three"], size=(10,)),
- }
- )
- @pytest.fixture
- def right():
- np.random.seed(1234)
- return DataFrame(
- {"X": Series(["foo", "bar"]).astype(CDT(["foo", "bar"])), "Z": [1, 2]}
- )
- class TestMergeCategorical:
- def test_identical(self, left):
- # merging on the same, should preserve dtypes
- merged = merge(left, left, on="X")
- result = merged.dtypes.sort_index()
- expected = Series(
- [CategoricalDtype(categories=["foo", "bar"]), np.dtype("O"), np.dtype("O")],
- index=["X", "Y_x", "Y_y"],
- )
- tm.assert_series_equal(result, expected)
- def test_basic(self, left, right):
- # we have matching Categorical dtypes in X
- # so should preserve the merged column
- merged = merge(left, right, on="X")
- result = merged.dtypes.sort_index()
- expected = Series(
- [
- CategoricalDtype(categories=["foo", "bar"]),
- np.dtype("O"),
- np.dtype("int64"),
- ],
- index=["X", "Y", "Z"],
- )
- tm.assert_series_equal(result, expected)
- def test_merge_categorical(self):
- # GH 9426
- right = DataFrame(
- {
- "c": {0: "a", 1: "b", 2: "c", 3: "d", 4: "e"},
- "d": {0: "null", 1: "null", 2: "null", 3: "null", 4: "null"},
- }
- )
- left = DataFrame(
- {
- "a": {0: "f", 1: "f", 2: "f", 3: "f", 4: "f"},
- "b": {0: "g", 1: "g", 2: "g", 3: "g", 4: "g"},
- }
- )
- df = merge(left, right, how="left", left_on="b", right_on="c")
- # object-object
- expected = df.copy()
- # object-cat
- # note that we propagate the category
- # because we don't have any matching rows
- cright = right.copy()
- cright["d"] = cright["d"].astype("category")
- result = merge(left, cright, how="left", left_on="b", right_on="c")
- expected["d"] = expected["d"].astype(CategoricalDtype(["null"]))
- tm.assert_frame_equal(result, expected)
- # cat-object
- cleft = left.copy()
- cleft["b"] = cleft["b"].astype("category")
- result = merge(cleft, cright, how="left", left_on="b", right_on="c")
- tm.assert_frame_equal(result, expected)
- # cat-cat
- cright = right.copy()
- cright["d"] = cright["d"].astype("category")
- cleft = left.copy()
- cleft["b"] = cleft["b"].astype("category")
- result = merge(cleft, cright, how="left", left_on="b", right_on="c")
- tm.assert_frame_equal(result, expected)
- def tests_merge_categorical_unordered_equal(self):
- # GH-19551
- df1 = DataFrame(
- {
- "Foo": Categorical(["A", "B", "C"], categories=["A", "B", "C"]),
- "Left": ["A0", "B0", "C0"],
- }
- )
- df2 = DataFrame(
- {
- "Foo": Categorical(["C", "B", "A"], categories=["C", "B", "A"]),
- "Right": ["C1", "B1", "A1"],
- }
- )
- result = merge(df1, df2, on=["Foo"])
- expected = DataFrame(
- {
- "Foo": Categorical(["A", "B", "C"]),
- "Left": ["A0", "B0", "C0"],
- "Right": ["A1", "B1", "C1"],
- }
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("ordered", [True, False])
- def test_multiindex_merge_with_unordered_categoricalindex(self, ordered):
- # GH 36973
- pcat = CategoricalDtype(categories=["P2", "P1"], ordered=ordered)
- df1 = DataFrame(
- {
- "id": ["C", "C", "D"],
- "p": Categorical(["P2", "P1", "P2"], dtype=pcat),
- "a": [0, 1, 2],
- }
- ).set_index(["id", "p"])
- df2 = DataFrame(
- {
- "id": ["A", "C", "C"],
- "p": Categorical(["P2", "P2", "P1"], dtype=pcat),
- "d1": [10, 11, 12],
- }
- ).set_index(["id", "p"])
- result = merge(df1, df2, how="left", left_index=True, right_index=True)
- expected = DataFrame(
- {
- "id": ["C", "C", "D"],
- "p": Categorical(["P2", "P1", "P2"], dtype=pcat),
- "a": [0, 1, 2],
- "d1": [11.0, 12.0, np.nan],
- }
- ).set_index(["id", "p"])
- tm.assert_frame_equal(result, expected)
- def test_other_columns(self, left, right):
- # non-merge columns should preserve if possible
- right = right.assign(Z=right.Z.astype("category"))
- merged = merge(left, right, on="X")
- result = merged.dtypes.sort_index()
- expected = Series(
- [
- CategoricalDtype(categories=["foo", "bar"]),
- np.dtype("O"),
- CategoricalDtype(categories=[1, 2]),
- ],
- index=["X", "Y", "Z"],
- )
- tm.assert_series_equal(result, expected)
- # categories are preserved
- assert left.X.values._categories_match_up_to_permutation(merged.X.values)
- assert right.Z.values._categories_match_up_to_permutation(merged.Z.values)
- @pytest.mark.parametrize(
- "change",
- [
- lambda x: x,
- lambda x: x.astype(CDT(["foo", "bar", "bah"])),
- lambda x: x.astype(CDT(ordered=True)),
- ],
- )
- def test_dtype_on_merged_different(self, change, join_type, left, right):
- # our merging columns, X now has 2 different dtypes
- # so we must be object as a result
- X = change(right.X.astype("object"))
- right = right.assign(X=X)
- assert is_categorical_dtype(left.X.values.dtype)
- # assert not left.X.values._categories_match_up_to_permutation(right.X.values)
- merged = merge(left, right, on="X", how=join_type)
- result = merged.dtypes.sort_index()
- expected = Series(
- [np.dtype("O"), np.dtype("O"), np.dtype("int64")], index=["X", "Y", "Z"]
- )
- tm.assert_series_equal(result, expected)
- def test_self_join_multiple_categories(self):
- # GH 16767
- # non-duplicates should work with multiple categories
- m = 5
- df = DataFrame(
- {
- "a": ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j"] * m,
- "b": ["t", "w", "x", "y", "z"] * 2 * m,
- "c": [
- letter
- for each in ["m", "n", "u", "p", "o"]
- for letter in [each] * 2 * m
- ],
- "d": [
- letter
- for each in [
- "aa",
- "bb",
- "cc",
- "dd",
- "ee",
- "ff",
- "gg",
- "hh",
- "ii",
- "jj",
- ]
- for letter in [each] * m
- ],
- }
- )
- # change them all to categorical variables
- df = df.apply(lambda x: x.astype("category"))
- # self-join should equal ourselves
- result = merge(df, df, on=list(df.columns))
- tm.assert_frame_equal(result, df)
- def test_dtype_on_categorical_dates(self):
- # GH 16900
- # dates should not be coerced to ints
- df = DataFrame(
- [[date(2001, 1, 1), 1.1], [date(2001, 1, 2), 1.3]], columns=["date", "num2"]
- )
- df["date"] = df["date"].astype("category")
- df2 = DataFrame(
- [[date(2001, 1, 1), 1.3], [date(2001, 1, 3), 1.4]], columns=["date", "num4"]
- )
- df2["date"] = df2["date"].astype("category")
- expected_outer = DataFrame(
- [
- [pd.Timestamp("2001-01-01").date(), 1.1, 1.3],
- [pd.Timestamp("2001-01-02").date(), 1.3, np.nan],
- [pd.Timestamp("2001-01-03").date(), np.nan, 1.4],
- ],
- columns=["date", "num2", "num4"],
- )
- result_outer = merge(df, df2, how="outer", on=["date"])
- tm.assert_frame_equal(result_outer, expected_outer)
- expected_inner = DataFrame(
- [[pd.Timestamp("2001-01-01").date(), 1.1, 1.3]],
- columns=["date", "num2", "num4"],
- )
- result_inner = merge(df, df2, how="inner", on=["date"])
- tm.assert_frame_equal(result_inner, expected_inner)
- @pytest.mark.parametrize("ordered", [True, False])
- @pytest.mark.parametrize(
- "category_column,categories,expected_categories",
- [
- ([False, True, True, False], [True, False], [True, False]),
- ([2, 1, 1, 2], [1, 2], [1, 2]),
- (["False", "True", "True", "False"], ["True", "False"], ["True", "False"]),
- ],
- )
- def test_merging_with_bool_or_int_cateorical_column(
- self, category_column, categories, expected_categories, ordered
- ):
- # GH 17187
- # merging with a boolean/int categorical column
- df1 = DataFrame({"id": [1, 2, 3, 4], "cat": category_column})
- df1["cat"] = df1["cat"].astype(CDT(categories, ordered=ordered))
- df2 = DataFrame({"id": [2, 4], "num": [1, 9]})
- result = df1.merge(df2)
- expected = DataFrame({"id": [2, 4], "cat": expected_categories, "num": [1, 9]})
- expected["cat"] = expected["cat"].astype(CDT(categories, ordered=ordered))
- tm.assert_frame_equal(expected, result)
- def test_merge_on_int_array(self):
- # GH 23020
- df = DataFrame({"A": Series([1, 2, np.nan], dtype="Int64"), "B": 1})
- result = merge(df, df, on="A")
- expected = DataFrame(
- {"A": Series([1, 2, np.nan], dtype="Int64"), "B_x": 1, "B_y": 1}
- )
- tm.assert_frame_equal(result, expected)
- @pytest.fixture
- def left_df():
- return DataFrame({"a": [20, 10, 0]}, index=[2, 1, 0])
- @pytest.fixture
- def right_df():
- return DataFrame({"b": [300, 100, 200]}, index=[3, 1, 2])
- class TestMergeOnIndexes:
- @pytest.mark.parametrize(
- "how, sort, expected",
- [
- ("inner", False, DataFrame({"a": [20, 10], "b": [200, 100]}, index=[2, 1])),
- ("inner", True, DataFrame({"a": [10, 20], "b": [100, 200]}, index=[1, 2])),
- (
- "left",
- False,
- DataFrame({"a": [20, 10, 0], "b": [200, 100, np.nan]}, index=[2, 1, 0]),
- ),
- (
- "left",
- True,
- DataFrame({"a": [0, 10, 20], "b": [np.nan, 100, 200]}, index=[0, 1, 2]),
- ),
- (
- "right",
- False,
- DataFrame(
- {"a": [np.nan, 10, 20], "b": [300, 100, 200]}, index=[3, 1, 2]
- ),
- ),
- (
- "right",
- True,
- DataFrame(
- {"a": [10, 20, np.nan], "b": [100, 200, 300]}, index=[1, 2, 3]
- ),
- ),
- (
- "outer",
- False,
- DataFrame(
- {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
- index=[0, 1, 2, 3],
- ),
- ),
- (
- "outer",
- True,
- DataFrame(
- {"a": [0, 10, 20, np.nan], "b": [np.nan, 100, 200, 300]},
- index=[0, 1, 2, 3],
- ),
- ),
- ],
- )
- def test_merge_on_indexes(self, left_df, right_df, how, sort, expected):
- result = merge(
- left_df, right_df, left_index=True, right_index=True, how=how, sort=sort
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "index",
- [Index([1, 2], dtype=dtyp, name="index_col") for dtyp in tm.ALL_REAL_NUMPY_DTYPES]
- + [
- CategoricalIndex(["A", "B"], categories=["A", "B"], name="index_col"),
- RangeIndex(start=0, stop=2, name="index_col"),
- DatetimeIndex(["2018-01-01", "2018-01-02"], name="index_col"),
- ],
- ids=lambda x: f"{type(x).__name__}[{x.dtype}]",
- )
- def test_merge_index_types(index):
- # gh-20777
- # assert key access is consistent across index types
- left = DataFrame({"left_data": [1, 2]}, index=index)
- right = DataFrame({"right_data": [1.0, 2.0]}, index=index)
- result = left.merge(right, on=["index_col"])
- expected = DataFrame({"left_data": [1, 2], "right_data": [1.0, 2.0]}, index=index)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "on,left_on,right_on,left_index,right_index,nm",
- [
- (["outer", "inner"], None, None, False, False, "B"),
- (None, None, None, True, True, "B"),
- (None, ["outer", "inner"], None, False, True, "B"),
- (None, None, ["outer", "inner"], True, False, "B"),
- (["outer", "inner"], None, None, False, False, None),
- (None, None, None, True, True, None),
- (None, ["outer", "inner"], None, False, True, None),
- (None, None, ["outer", "inner"], True, False, None),
- ],
- )
- def test_merge_series(on, left_on, right_on, left_index, right_index, nm):
- # GH 21220
- a = DataFrame(
- {"A": [1, 2, 3, 4]},
- index=MultiIndex.from_product([["a", "b"], [0, 1]], names=["outer", "inner"]),
- )
- b = Series(
- [1, 2, 3, 4],
- index=MultiIndex.from_product([["a", "b"], [1, 2]], names=["outer", "inner"]),
- name=nm,
- )
- expected = DataFrame(
- {"A": [2, 4], "B": [1, 3]},
- index=MultiIndex.from_product([["a", "b"], [1]], names=["outer", "inner"]),
- )
- if nm is not None:
- result = merge(
- a,
- b,
- on=on,
- left_on=left_on,
- right_on=right_on,
- left_index=left_index,
- right_index=right_index,
- )
- tm.assert_frame_equal(result, expected)
- else:
- msg = "Cannot merge a Series without a name"
- with pytest.raises(ValueError, match=msg):
- result = merge(
- a,
- b,
- on=on,
- left_on=left_on,
- right_on=right_on,
- left_index=left_index,
- right_index=right_index,
- )
- def test_merge_series_multilevel():
- # GH#47946
- # GH 40993: For raising, enforced in 2.0
- a = DataFrame(
- {"A": [1, 2, 3, 4]},
- index=MultiIndex.from_product([["a", "b"], [0, 1]], names=["outer", "inner"]),
- )
- b = Series(
- [1, 2, 3, 4],
- index=MultiIndex.from_product([["a", "b"], [1, 2]], names=["outer", "inner"]),
- name=("B", "C"),
- )
- with pytest.raises(
- MergeError, match="Not allowed to merge between different levels"
- ):
- merge(a, b, on=["outer", "inner"])
- @pytest.mark.parametrize(
- "col1, col2, kwargs, expected_cols",
- [
- (0, 0, {"suffixes": ("", "_dup")}, ["0", "0_dup"]),
- (0, 0, {"suffixes": (None, "_dup")}, [0, "0_dup"]),
- (0, 0, {"suffixes": ("_x", "_y")}, ["0_x", "0_y"]),
- (0, 0, {"suffixes": ["_x", "_y"]}, ["0_x", "0_y"]),
- ("a", 0, {"suffixes": (None, "_y")}, ["a", 0]),
- (0.0, 0.0, {"suffixes": ("_x", None)}, ["0.0_x", 0.0]),
- ("b", "b", {"suffixes": (None, "_y")}, ["b", "b_y"]),
- ("a", "a", {"suffixes": ("_x", None)}, ["a_x", "a"]),
- ("a", "b", {"suffixes": ("_x", None)}, ["a", "b"]),
- ("a", "a", {"suffixes": (None, "_x")}, ["a", "a_x"]),
- (0, 0, {"suffixes": ("_a", None)}, ["0_a", 0]),
- ("a", "a", {}, ["a_x", "a_y"]),
- (0, 0, {}, ["0_x", "0_y"]),
- ],
- )
- def test_merge_suffix(col1, col2, kwargs, expected_cols):
- # issue: 24782
- a = DataFrame({col1: [1, 2, 3]})
- b = DataFrame({col2: [4, 5, 6]})
- expected = DataFrame([[1, 4], [2, 5], [3, 6]], columns=expected_cols)
- result = a.merge(b, left_index=True, right_index=True, **kwargs)
- tm.assert_frame_equal(result, expected)
- result = merge(a, b, left_index=True, right_index=True, **kwargs)
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "how,expected",
- [
- (
- "right",
- DataFrame(
- {"A": [100, 200, 300], "B1": [60, 70, np.nan], "B2": [600, 700, 800]}
- ),
- ),
- (
- "outer",
- DataFrame(
- {
- "A": [100, 200, 1, 300],
- "B1": [60, 70, 80, np.nan],
- "B2": [600, 700, np.nan, 800],
- }
- ),
- ),
- ],
- )
- def test_merge_duplicate_suffix(how, expected):
- left_df = DataFrame({"A": [100, 200, 1], "B": [60, 70, 80]})
- right_df = DataFrame({"A": [100, 200, 300], "B": [600, 700, 800]})
- result = merge(left_df, right_df, on="A", how=how, suffixes=("_x", "_x"))
- expected.columns = ["A", "B_x", "B_x"]
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize(
- "col1, col2, suffixes",
- [("a", "a", (None, None)), ("a", "a", ("", None)), (0, 0, (None, ""))],
- )
- def test_merge_suffix_error(col1, col2, suffixes):
- # issue: 24782
- a = DataFrame({col1: [1, 2, 3]})
- b = DataFrame({col2: [3, 4, 5]})
- # TODO: might reconsider current raise behaviour, see issue 24782
- msg = "columns overlap but no suffix specified"
- with pytest.raises(ValueError, match=msg):
- merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
- @pytest.mark.parametrize("suffixes", [{"left", "right"}, {"left": 0, "right": 0}])
- def test_merge_suffix_raises(suffixes):
- a = DataFrame({"a": [1, 2, 3]})
- b = DataFrame({"b": [3, 4, 5]})
- with pytest.raises(TypeError, match="Passing 'suffixes' as a"):
- merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
- @pytest.mark.parametrize(
- "col1, col2, suffixes, msg",
- [
- ("a", "a", ("a", "b", "c"), r"too many values to unpack \(expected 2\)"),
- ("a", "a", tuple("a"), r"not enough values to unpack \(expected 2, got 1\)"),
- ],
- )
- def test_merge_suffix_length_error(col1, col2, suffixes, msg):
- a = DataFrame({col1: [1, 2, 3]})
- b = DataFrame({col2: [3, 4, 5]})
- with pytest.raises(ValueError, match=msg):
- merge(a, b, left_index=True, right_index=True, suffixes=suffixes)
- @pytest.mark.parametrize("cat_dtype", ["one", "two"])
- @pytest.mark.parametrize("reverse", [True, False])
- def test_merge_equal_cat_dtypes(cat_dtype, reverse):
- # see gh-22501
- cat_dtypes = {
- "one": CategoricalDtype(categories=["a", "b", "c"], ordered=False),
- "two": CategoricalDtype(categories=["a", "b", "c"], ordered=False),
- }
- df1 = DataFrame(
- {"foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]), "left": [1, 2, 3]}
- ).set_index("foo")
- data_foo = ["a", "b", "c"]
- data_right = [1, 2, 3]
- if reverse:
- data_foo.reverse()
- data_right.reverse()
- df2 = DataFrame(
- {"foo": Series(data_foo).astype(cat_dtypes[cat_dtype]), "right": data_right}
- ).set_index("foo")
- result = df1.merge(df2, left_index=True, right_index=True)
- expected = DataFrame(
- {
- "left": [1, 2, 3],
- "right": [1, 2, 3],
- "foo": Series(["a", "b", "c"]).astype(cat_dtypes["one"]),
- }
- ).set_index("foo")
- tm.assert_frame_equal(result, expected)
- def test_merge_equal_cat_dtypes2():
- # see gh-22501
- cat_dtype = CategoricalDtype(categories=["a", "b", "c"], ordered=False)
- # Test Data
- df1 = DataFrame(
- {"foo": Series(["a", "b"]).astype(cat_dtype), "left": [1, 2]}
- ).set_index("foo")
- df2 = DataFrame(
- {"foo": Series(["a", "b", "c"]).astype(cat_dtype), "right": [3, 2, 1]}
- ).set_index("foo")
- result = df1.merge(df2, left_index=True, right_index=True)
- expected = DataFrame(
- {"left": [1, 2], "right": [3, 2], "foo": Series(["a", "b"]).astype(cat_dtype)}
- ).set_index("foo")
- tm.assert_frame_equal(result, expected)
- def test_merge_on_cat_and_ext_array():
- # GH 28668
- right = DataFrame(
- {"a": Series([pd.Interval(0, 1), pd.Interval(1, 2)], dtype="interval")}
- )
- left = right.copy()
- left["a"] = left["a"].astype("category")
- result = merge(left, right, how="inner", on="a")
- expected = right.copy()
- tm.assert_frame_equal(result, expected)
- def test_merge_multiindex_columns():
- # Issue #28518
- # Verify that merging two dataframes give the expected labels
- # The original cause of this issue come from a bug lexsort_depth and is tested in
- # test_lexsort_depth
- letters = ["a", "b", "c", "d"]
- numbers = ["1", "2", "3"]
- index = MultiIndex.from_product((letters, numbers), names=["outer", "inner"])
- frame_x = DataFrame(columns=index)
- frame_x["id"] = ""
- frame_y = DataFrame(columns=index)
- frame_y["id"] = ""
- l_suf = "_x"
- r_suf = "_y"
- result = frame_x.merge(frame_y, on="id", suffixes=((l_suf, r_suf)))
- # Constructing the expected results
- expected_labels = [letter + l_suf for letter in letters] + [
- letter + r_suf for letter in letters
- ]
- expected_index = MultiIndex.from_product(
- [expected_labels, numbers], names=["outer", "inner"]
- )
- expected = DataFrame(columns=expected_index)
- expected["id"] = ""
- tm.assert_frame_equal(result, expected)
- def test_merge_datetime_upcast_dtype():
- # https://github.com/pandas-dev/pandas/issues/31208
- df1 = DataFrame({"x": ["a", "b", "c"], "y": ["1", "2", "4"]})
- df2 = DataFrame(
- {"y": ["1", "2", "3"], "z": pd.to_datetime(["2000", "2001", "2002"])}
- )
- result = merge(df1, df2, how="left", on="y")
- expected = DataFrame(
- {
- "x": ["a", "b", "c"],
- "y": ["1", "2", "4"],
- "z": pd.to_datetime(["2000", "2001", "NaT"]),
- }
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("n_categories", [5, 128])
- def test_categorical_non_unique_monotonic(n_categories):
- # GH 28189
- # With n_categories as 5, we test the int8 case is hit in libjoin,
- # with n_categories as 128 we test the int16 case.
- left_index = CategoricalIndex([0] + list(range(n_categories)))
- df1 = DataFrame(range(n_categories + 1), columns=["value"], index=left_index)
- df2 = DataFrame(
- [[6]],
- columns=["value"],
- index=CategoricalIndex([0], categories=list(range(n_categories))),
- )
- result = merge(df1, df2, how="left", left_index=True, right_index=True)
- expected = DataFrame(
- [[i, 6.0] if i < 2 else [i, np.nan] for i in range(n_categories + 1)],
- columns=["value_x", "value_y"],
- index=left_index,
- )
- tm.assert_frame_equal(expected, result)
- def test_merge_join_categorical_multiindex():
- # From issue 16627
- a = {
- "Cat1": Categorical(["a", "b", "a", "c", "a", "b"], ["a", "b", "c"]),
- "Int1": [0, 1, 0, 1, 0, 0],
- }
- a = DataFrame(a)
- b = {
- "Cat": Categorical(["a", "b", "c", "a", "b", "c"], ["a", "b", "c"]),
- "Int": [0, 0, 0, 1, 1, 1],
- "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6],
- }
- b = DataFrame(b).set_index(["Cat", "Int"])["Factor"]
- expected = merge(
- a,
- b.reset_index(),
- left_on=["Cat1", "Int1"],
- right_on=["Cat", "Int"],
- how="left",
- )
- expected = expected.drop(["Cat", "Int"], axis=1)
- result = a.join(b, on=["Cat1", "Int1"])
- tm.assert_frame_equal(expected, result)
- # Same test, but with ordered categorical
- a = {
- "Cat1": Categorical(
- ["a", "b", "a", "c", "a", "b"], ["b", "a", "c"], ordered=True
- ),
- "Int1": [0, 1, 0, 1, 0, 0],
- }
- a = DataFrame(a)
- b = {
- "Cat": Categorical(
- ["a", "b", "c", "a", "b", "c"], ["b", "a", "c"], ordered=True
- ),
- "Int": [0, 0, 0, 1, 1, 1],
- "Factor": [1.1, 1.2, 1.3, 1.4, 1.5, 1.6],
- }
- b = DataFrame(b).set_index(["Cat", "Int"])["Factor"]
- expected = merge(
- a,
- b.reset_index(),
- left_on=["Cat1", "Int1"],
- right_on=["Cat", "Int"],
- how="left",
- )
- expected = expected.drop(["Cat", "Int"], axis=1)
- result = a.join(b, on=["Cat1", "Int1"])
- tm.assert_frame_equal(expected, result)
- @pytest.mark.parametrize("func", ["merge", "merge_asof"])
- @pytest.mark.parametrize(
- ("kwargs", "err_msg"),
- [
- ({"left_on": "a", "left_index": True}, ["left_on", "left_index"]),
- ({"right_on": "a", "right_index": True}, ["right_on", "right_index"]),
- ],
- )
- def test_merge_join_cols_error_reporting_duplicates(func, kwargs, err_msg):
- # GH: 16228
- left = DataFrame({"a": [1, 2], "b": [3, 4]})
- right = DataFrame({"a": [1, 1], "c": [5, 6]})
- msg = rf'Can only pass argument "{err_msg[0]}" OR "{err_msg[1]}" not both\.'
- with pytest.raises(MergeError, match=msg):
- getattr(pd, func)(left, right, **kwargs)
- @pytest.mark.parametrize("func", ["merge", "merge_asof"])
- @pytest.mark.parametrize(
- ("kwargs", "err_msg"),
- [
- ({"left_on": "a"}, ["right_on", "right_index"]),
- ({"right_on": "a"}, ["left_on", "left_index"]),
- ],
- )
- def test_merge_join_cols_error_reporting_missing(func, kwargs, err_msg):
- # GH: 16228
- left = DataFrame({"a": [1, 2], "b": [3, 4]})
- right = DataFrame({"a": [1, 1], "c": [5, 6]})
- msg = rf'Must pass "{err_msg[0]}" OR "{err_msg[1]}"\.'
- with pytest.raises(MergeError, match=msg):
- getattr(pd, func)(left, right, **kwargs)
- @pytest.mark.parametrize("func", ["merge", "merge_asof"])
- @pytest.mark.parametrize(
- "kwargs",
- [
- {"right_index": True},
- {"left_index": True},
- ],
- )
- def test_merge_join_cols_error_reporting_on_and_index(func, kwargs):
- # GH: 16228
- left = DataFrame({"a": [1, 2], "b": [3, 4]})
- right = DataFrame({"a": [1, 1], "c": [5, 6]})
- msg = (
- r'Can only pass argument "on" OR "left_index" '
- r'and "right_index", not a combination of both\.'
- )
- with pytest.raises(MergeError, match=msg):
- getattr(pd, func)(left, right, on="a", **kwargs)
- def test_merge_right_left_index():
- # GH#38616
- left = DataFrame({"x": [1, 1], "z": ["foo", "foo"]})
- right = DataFrame({"x": [1, 1], "z": ["foo", "foo"]})
- result = merge(left, right, how="right", left_index=True, right_on="x")
- expected = DataFrame(
- {
- "x": [1, 1],
- "x_x": [1, 1],
- "z_x": ["foo", "foo"],
- "x_y": [1, 1],
- "z_y": ["foo", "foo"],
- }
- )
- tm.assert_frame_equal(result, expected)
- def test_merge_result_empty_index_and_on():
- # GH#33814
- df1 = DataFrame({"a": [1], "b": [2]}).set_index(["a", "b"])
- df2 = DataFrame({"b": [1]}).set_index(["b"])
- expected = DataFrame({"a": [], "b": []}, dtype=np.int64).set_index(["a", "b"])
- result = merge(df1, df2, left_on=["b"], right_index=True)
- tm.assert_frame_equal(result, expected)
- result = merge(df2, df1, left_index=True, right_on=["b"])
- tm.assert_frame_equal(result, expected)
- def test_merge_suffixes_produce_dup_columns_raises():
- # GH#22818; Enforced in 2.0
- left = DataFrame({"a": [1, 2, 3], "b": 1, "b_x": 2})
- right = DataFrame({"a": [1, 2, 3], "b": 2})
- with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"):
- merge(left, right, on="a")
- with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"):
- merge(right, left, on="a", suffixes=("_y", "_x"))
- def test_merge_duplicate_columns_with_suffix_no_warning():
- # GH#22818
- # Do not raise warning when duplicates are caused by duplicates in origin
- left = DataFrame([[1, 1, 1], [2, 2, 2]], columns=["a", "b", "b"])
- right = DataFrame({"a": [1, 3], "b": 2})
- result = merge(left, right, on="a")
- expected = DataFrame([[1, 1, 1, 2]], columns=["a", "b_x", "b_x", "b_y"])
- tm.assert_frame_equal(result, expected)
- def test_merge_duplicate_columns_with_suffix_causing_another_duplicate_raises():
- # GH#22818, Enforced in 2.0
- # This should raise warning because suffixes cause another collision
- left = DataFrame([[1, 1, 1, 1], [2, 2, 2, 2]], columns=["a", "b", "b", "b_x"])
- right = DataFrame({"a": [1, 3], "b": 2})
- with pytest.raises(MergeError, match="Passing 'suffixes' which cause duplicate"):
- merge(left, right, on="a")
- def test_merge_string_float_column_result():
- # GH 13353
- df1 = DataFrame([[1, 2], [3, 4]], columns=Index(["a", 114.0]))
- df2 = DataFrame([[9, 10], [11, 12]], columns=["x", "y"])
- result = merge(df2, df1, how="inner", left_index=True, right_index=True)
- expected = DataFrame(
- [[9, 10, 1, 2], [11, 12, 3, 4]], columns=Index(["x", "y", "a", 114.0])
- )
- tm.assert_frame_equal(result, expected)
- def test_mergeerror_on_left_index_mismatched_dtypes():
- # GH 22449
- df_1 = DataFrame(data=["X"], columns=["C"], index=[22])
- df_2 = DataFrame(data=["X"], columns=["C"], index=[999])
- with pytest.raises(MergeError, match="Can only pass argument"):
- merge(df_1, df_2, on=["C"], left_index=True)
- def test_merge_on_left_categoricalindex():
- # GH#48464 don't raise when left_on is a CategoricalIndex
- ci = CategoricalIndex(range(3))
- right = DataFrame({"A": ci, "B": range(3)})
- left = DataFrame({"C": range(3, 6)})
- res = merge(left, right, left_on=ci, right_on="A")
- expected = merge(left, right, left_on=ci._data, right_on="A")
- tm.assert_frame_equal(res, expected)
- @pytest.mark.parametrize("dtype", [None, "Int64"])
- def test_merge_outer_with_NaN(dtype):
- # GH#43550
- left = DataFrame({"key": [1, 2], "col1": [1, 2]}, dtype=dtype)
- right = DataFrame({"key": [np.nan, np.nan], "col2": [3, 4]}, dtype=dtype)
- result = merge(left, right, on="key", how="outer")
- expected = DataFrame(
- {
- "key": [1, 2, np.nan, np.nan],
- "col1": [1, 2, np.nan, np.nan],
- "col2": [np.nan, np.nan, 3, 4],
- },
- dtype=dtype,
- )
- tm.assert_frame_equal(result, expected)
- # switch left and right
- result = merge(right, left, on="key", how="outer")
- expected = DataFrame(
- {
- "key": [np.nan, np.nan, 1, 2],
- "col2": [3, 4, np.nan, np.nan],
- "col1": [np.nan, np.nan, 1, 2],
- },
- dtype=dtype,
- )
- tm.assert_frame_equal(result, expected)
- def test_merge_different_index_names():
- # GH#45094
- left = DataFrame({"a": [1]}, index=Index([1], name="c"))
- right = DataFrame({"a": [1]}, index=Index([1], name="d"))
- result = merge(left, right, left_on="c", right_on="d")
- expected = DataFrame({"a_x": [1], "a_y": 1})
- tm.assert_frame_equal(result, expected)
- def test_merge_ea(any_numeric_ea_dtype, join_type):
- # GH#44240
- left = DataFrame({"a": [1, 2, 3], "b": 1}, dtype=any_numeric_ea_dtype)
- right = DataFrame({"a": [1, 2, 3], "c": 2}, dtype=any_numeric_ea_dtype)
- result = left.merge(right, how=join_type)
- expected = DataFrame({"a": [1, 2, 3], "b": 1, "c": 2}, dtype=any_numeric_ea_dtype)
- tm.assert_frame_equal(result, expected)
- def test_merge_ea_and_non_ea(any_numeric_ea_dtype, join_type):
- # GH#44240
- left = DataFrame({"a": [1, 2, 3], "b": 1}, dtype=any_numeric_ea_dtype)
- right = DataFrame({"a": [1, 2, 3], "c": 2}, dtype=any_numeric_ea_dtype.lower())
- result = left.merge(right, how=join_type)
- expected = DataFrame(
- {
- "a": Series([1, 2, 3], dtype=any_numeric_ea_dtype),
- "b": Series([1, 1, 1], dtype=any_numeric_ea_dtype),
- "c": Series([2, 2, 2], dtype=any_numeric_ea_dtype.lower()),
- }
- )
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("dtype", ["int64", "int64[pyarrow]"])
- def test_merge_arrow_and_numpy_dtypes(dtype):
- # GH#52406
- pytest.importorskip("pyarrow")
- df = DataFrame({"a": [1, 2]}, dtype=dtype)
- df2 = DataFrame({"a": [1, 2]}, dtype="int64[pyarrow]")
- result = df.merge(df2)
- expected = df.copy()
- tm.assert_frame_equal(result, expected)
- result = df2.merge(df)
- expected = df2.copy()
- tm.assert_frame_equal(result, expected)
- @pytest.mark.parametrize("how", ["inner", "left", "outer", "right"])
- @pytest.mark.parametrize("tz", [None, "America/Chicago"])
- def test_merge_datetime_different_resolution(tz, how):
- # https://github.com/pandas-dev/pandas/issues/53200
- vals = [
- pd.Timestamp(2023, 5, 12, tz=tz),
- pd.Timestamp(2023, 5, 13, tz=tz),
- pd.Timestamp(2023, 5, 14, tz=tz),
- ]
- df1 = DataFrame({"t": vals[:2], "a": [1.0, 2.0]})
- df1["t"] = df1["t"].dt.as_unit("ns")
- df2 = DataFrame({"t": vals[1:], "b": [1.0, 2.0]})
- df2["t"] = df2["t"].dt.as_unit("s")
- expected = DataFrame({"t": vals, "a": [1.0, 2.0, np.nan], "b": [np.nan, 1.0, 2.0]})
- expected["t"] = expected["t"].dt.as_unit("ns")
- if how == "inner":
- expected = expected.iloc[[1]].reset_index(drop=True)
- elif how == "left":
- expected = expected.iloc[[0, 1]]
- elif how == "right":
- expected = expected.iloc[[1, 2]].reset_index(drop=True)
- result = df1.merge(df2, on="t", how=how)
- tm.assert_frame_equal(result, expected)
|