  1. """
  2. SQL-style merge routines
  3. """
  4. from __future__ import annotations
  5. import copy as cp
  6. import datetime
  7. from functools import partial
  8. import string
  9. from typing import (
  10. TYPE_CHECKING,
  11. Hashable,
  12. Literal,
  13. Sequence,
  14. cast,
  15. )
  16. import uuid
  17. import warnings
  18. import numpy as np
  19. from pandas._libs import (
  20. Timedelta,
  21. hashtable as libhashtable,
  22. join as libjoin,
  23. lib,
  24. )
  25. from pandas._libs.lib import is_range_indexer
  26. from pandas._typing import (
  27. AnyArrayLike,
  28. ArrayLike,
  29. AxisInt,
  30. DtypeObj,
  31. IndexLabel,
  32. JoinHow,
  33. MergeHow,
  34. Shape,
  35. Suffixes,
  36. npt,
  37. )
  38. from pandas.errors import MergeError
  39. from pandas.util._decorators import (
  40. Appender,
  41. Substitution,
  42. cache_readonly,
  43. )
  44. from pandas.util._exceptions import find_stack_level
  45. from pandas.core.dtypes.base import ExtensionDtype
  46. from pandas.core.dtypes.cast import find_common_type
  47. from pandas.core.dtypes.common import (
  48. ensure_float64,
  49. ensure_int64,
  50. ensure_object,
  51. is_array_like,
  52. is_bool,
  53. is_bool_dtype,
  54. is_categorical_dtype,
  55. is_dtype_equal,
  56. is_extension_array_dtype,
  57. is_float_dtype,
  58. is_integer,
  59. is_integer_dtype,
  60. is_list_like,
  61. is_number,
  62. is_numeric_dtype,
  63. is_object_dtype,
  64. needs_i8_conversion,
  65. )
  66. from pandas.core.dtypes.dtypes import DatetimeTZDtype
  67. from pandas.core.dtypes.generic import (
  68. ABCDataFrame,
  69. ABCSeries,
  70. )
  71. from pandas.core.dtypes.missing import (
  72. isna,
  73. na_value_for_dtype,
  74. )
  75. from pandas import (
  76. ArrowDtype,
  77. Categorical,
  78. Index,
  79. MultiIndex,
  80. Series,
  81. )
  82. import pandas.core.algorithms as algos
  83. from pandas.core.arrays import (
  84. ArrowExtensionArray,
  85. BaseMaskedArray,
  86. ExtensionArray,
  87. )
  88. from pandas.core.arrays._mixins import NDArrayBackedExtensionArray
  89. import pandas.core.common as com
  90. from pandas.core.construction import (
  91. ensure_wrapped_if_datetimelike,
  92. extract_array,
  93. )
  94. from pandas.core.frame import _merge_doc
  95. from pandas.core.indexes.api import default_index
  96. from pandas.core.sorting import is_int64_overflow_possible
  97. if TYPE_CHECKING:
  98. from pandas import DataFrame
  99. from pandas.core import groupby
  100. from pandas.core.arrays import DatetimeArray
  101. _factorizers = {
  102. np.int64: libhashtable.Int64Factorizer,
  103. np.longlong: libhashtable.Int64Factorizer,
  104. np.int32: libhashtable.Int32Factorizer,
  105. np.int16: libhashtable.Int16Factorizer,
  106. np.int8: libhashtable.Int8Factorizer,
  107. np.uint64: libhashtable.UInt64Factorizer,
  108. np.uint32: libhashtable.UInt32Factorizer,
  109. np.uint16: libhashtable.UInt16Factorizer,
  110. np.uint8: libhashtable.UInt8Factorizer,
  111. np.bool_: libhashtable.UInt8Factorizer,
  112. np.float64: libhashtable.Float64Factorizer,
  113. np.float32: libhashtable.Float32Factorizer,
  114. np.complex64: libhashtable.Complex64Factorizer,
  115. np.complex128: libhashtable.Complex128Factorizer,
  116. np.object_: libhashtable.ObjectFactorizer,
  117. }
  118. # See https://github.com/pandas-dev/pandas/issues/52451
  119. if np.intc is not np.int32:
  120. _factorizers[np.intc] = libhashtable.Int64Factorizer
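
# _factorizers maps the NumPy dtype of the join keys to the libhashtable
# factorizer that encodes them as dense integer codes before joining; equal
# key values on the two sides receive the same code. Illustrative sketch of
# how a factorizer is used elsewhere in this module (lk/rk are hypothetical
# left/right key arrays of matching dtype):
#
#   >>> rizer = _factorizers[np.int64](max(len(lk), len(rk)))
#   >>> llab = rizer.factorize(lk)  # codes for the left keys
#   >>> rlab = rizer.factorize(rk)  # matching codes for the right keys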

@Substitution("\nleft : DataFrame or named Series")
@Appender(_merge_doc, indents=0)
def merge(
    left: DataFrame | Series,
    right: DataFrame | Series,
    how: MergeHow = "inner",
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool | None = None,
    indicator: str | bool = False,
    validate: str | None = None,
) -> DataFrame:
    op = _MergeOperation(
        left,
        right,
        how=how,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        sort=sort,
        suffixes=suffixes,
        indicator=indicator,
        validate=validate,
    )
    return op.get_result(copy=copy)
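
# merge() above is a thin wrapper: argument validation and the join itself
# happen in _MergeOperation.get_result(). A minimal doctest-style sketch
# (with pd = the pandas namespace):
#
#   >>> left = pd.DataFrame({"key": ["a", "b"], "lval": [1, 2]})
#   >>> right = pd.DataFrame({"key": ["b", "c"], "rval": [3, 4]})
#   >>> pd.merge(left, right, on="key", how="inner")
#     key  lval  rval
#   0   b     2     3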

def _groupby_and_merge(by, left: DataFrame, right: DataFrame, merge_pieces):
    """
    groupby & merge; we are always performing a left-by type operation

    Parameters
    ----------
    by: field to group
    left: DataFrame
    right: DataFrame
    merge_pieces: function for merging
    """
    pieces = []
    if not isinstance(by, (list, tuple)):
        by = [by]

    lby = left.groupby(by, sort=False)
    rby: groupby.DataFrameGroupBy | None = None

    # if we can groupby the rhs
    # then we can get vastly better perf
    if all(item in right.columns for item in by):
        rby = right.groupby(by, sort=False)

    for key, lhs in lby.grouper.get_iterator(lby._selected_obj, axis=lby.axis):
        if rby is None:
            rhs = right
        else:
            try:
                rhs = right.take(rby.indices[key])
            except KeyError:
                # key doesn't exist in left
                lcols = lhs.columns.tolist()
                cols = lcols + [r for r in right.columns if r not in set(lcols)]
                merged = lhs.reindex(columns=cols)
                merged.index = range(len(merged))
                pieces.append(merged)
                continue

        merged = merge_pieces(lhs, rhs)

        # make sure join keys are in the merged
        # TODO, should merge_pieces do this?
        merged[by] = key

        pieces.append(merged)

    # preserve the original order
    # if we have a missing piece this can be reset
    from pandas.core.reshape.concat import concat

    result = concat(pieces, ignore_index=True)
    result = result.reindex(columns=pieces[0].columns, copy=False)
    return result, lby
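
# _groupby_and_merge is the engine behind merge_ordered's left_by/right_by
# options (below): the grouped frame is split into pieces, each piece is
# merged against the other frame, and the results are concatenated back in
# the original group order.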

def merge_ordered(
    left: DataFrame,
    right: DataFrame,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_by=None,
    right_by=None,
    fill_method: str | None = None,
    suffixes: Suffixes = ("_x", "_y"),
    how: JoinHow = "outer",
) -> DataFrame:
    """
    Perform a merge for ordered data with optional filling/interpolation.

    Designed for ordered data like time series data. Optionally
    perform group-wise merge (see examples).

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label or list
        Field names to join on. Must be found in both DataFrames.
    left_on : label or list, or array-like
        Field names to join on in left DataFrame. Can be a vector or list of
        vectors of the length of the DataFrame to use a particular vector as
        the join key instead of columns.
    right_on : label or list, or array-like
        Field names to join on in right DataFrame or vector/list of vectors per
        left_on docs.
    left_by : column name or list of column names
        Group left DataFrame by group columns and merge piece by piece with
        right DataFrame. Must be None if either left or right are a Series.
    right_by : column name or list of column names
        Group right DataFrame by group columns and merge piece by piece with
        left DataFrame. Must be None if either left or right are a Series.
    fill_method : {'ffill', None}, default None
        Interpolation method for data.
    suffixes : list-like, default is ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in
        `left` and `right` respectively. Pass a value of `None` instead
        of a string to indicate that the column name from `left` or
        `right` should be left as-is, with no suffix. At least one of the
        values must not be None.
    how : {'left', 'right', 'outer', 'inner'}, default 'outer'
        * left: use only keys from left frame (SQL: left outer join)
        * right: use only keys from right frame (SQL: right outer join)
        * outer: use union of keys from both frames (SQL: full outer join)
        * inner: use intersection of keys from both frames (SQL: inner join).

    Returns
    -------
    DataFrame
        The merged DataFrame output type will be the same as
        'left', if it is a subclass of DataFrame.

    See Also
    --------
    merge : Merge with a database-style join.
    merge_asof : Merge on nearest keys.

    Examples
    --------
    >>> from pandas import merge_ordered
    >>> df1 = pd.DataFrame(
    ...     {
    ...         "key": ["a", "c", "e", "a", "c", "e"],
    ...         "lvalue": [1, 2, 3, 1, 2, 3],
    ...         "group": ["a", "a", "a", "b", "b", "b"]
    ...     }
    ... )
    >>> df1
      key  lvalue group
    0   a       1     a
    1   c       2     a
    2   e       3     a
    3   a       1     b
    4   c       2     b
    5   e       3     b

    >>> df2 = pd.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})
    >>> df2
      key  rvalue
    0   b       1
    1   c       2
    2   d       3

    >>> merge_ordered(df1, df2, fill_method="ffill", left_by="group")
      key  lvalue group  rvalue
    0   a       1     a     NaN
    1   b       1     a     1.0
    2   c       2     a     2.0
    3   d       2     a     3.0
    4   e       3     a     3.0
    5   a       1     b     NaN
    6   b       1     b     1.0
    7   c       2     b     2.0
    8   d       2     b     3.0
    9   e       3     b     3.0
    """

    def _merger(x, y) -> DataFrame:
        # perform the ordered merge operation
        op = _OrderedMerge(
            x,
            y,
            on=on,
            left_on=left_on,
            right_on=right_on,
            suffixes=suffixes,
            fill_method=fill_method,
            how=how,
        )
        return op.get_result()

    if left_by is not None and right_by is not None:
        raise ValueError("Can only group either left or right frames")
    if left_by is not None:
        if isinstance(left_by, str):
            left_by = [left_by]
        check = set(left_by).difference(left.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in left columns")
        result, _ = _groupby_and_merge(left_by, left, right, lambda x, y: _merger(x, y))
    elif right_by is not None:
        if isinstance(right_by, str):
            right_by = [right_by]
        check = set(right_by).difference(right.columns)
        if len(check) != 0:
            raise KeyError(f"{check} not found in right columns")
        result, _ = _groupby_and_merge(
            right_by, right, left, lambda x, y: _merger(y, x)
        )
    else:
        result = _merger(left, right)
    return result

def merge_asof(
    left: DataFrame | Series,
    right: DataFrame | Series,
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes: Suffixes = ("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform a merge by key distance.

    This is similar to a left-join except that we match on nearest
    key rather than equal keys. Both DataFrames must be sorted by the key.

    For each row in the left DataFrame:

      - A "backward" search selects the last row in the right DataFrame whose
        'on' key is less than or equal to the left's key.

      - A "forward" search selects the first row in the right DataFrame whose
        'on' key is greater than or equal to the left's key.

      - A "nearest" search selects the row in the right DataFrame whose 'on'
        key is closest in absolute distance to the left's key.

    The default is "backward" and matches the behavior of versions below
    0.20.0. The direction parameter was added in version 0.20.0 and
    introduces "forward" and "nearest".

    Optionally match on equivalent keys with 'by' before searching with 'on'.

    Parameters
    ----------
    left : DataFrame or named Series
    right : DataFrame or named Series
    on : label
        Field name to join on. Must be found in both DataFrames.
        The data MUST be ordered. Furthermore this must be a numeric column,
        such as datetimelike, integer, or float. On or left_on/right_on
        must be given.
    left_on : label
        Field name to join on in left DataFrame.
    right_on : label
        Field name to join on in right DataFrame.
    left_index : bool
        Use the index of the left DataFrame as the join key.
    right_index : bool
        Use the index of the right DataFrame as the join key.
    by : column name or list of column names
        Match on these columns before performing merge operation.
    left_by : column name
        Field names to match on in the left DataFrame.
    right_by : column name
        Field names to match on in the right DataFrame.
    suffixes : 2-length sequence (tuple, list, ...)
        Suffix to apply to overlapping column names in the left and right
        side, respectively.
    tolerance : int or Timedelta, optional, default None
        Select asof tolerance within this range; must be compatible
        with the merge index.
    allow_exact_matches : bool, default True

        - If True, allow matching with the same 'on' value
          (i.e. less-than-or-equal-to / greater-than-or-equal-to)
        - If False, don't match the same 'on' value
          (i.e., strictly less-than / strictly greater-than).

    direction : 'backward' (default), 'forward', or 'nearest'
        Whether to search for prior, subsequent, or closest matches.

    Returns
    -------
    DataFrame

    See Also
    --------
    merge : Merge with a database-style join.
    merge_ordered : Merge with optional filling/interpolation.

    Examples
    --------
    >>> left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    >>> left
        a left_val
    0   1        a
    1   5        b
    2  10        c

    >>> right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})
    >>> right
       a  right_val
    0  1          1
    1  2          2
    2  3          3
    3  6          6
    4  7          7

    >>> pd.merge_asof(left, right, on="a")
        a left_val  right_val
    0   1        a          1
    1   5        b          3
    2  10        c          7

    >>> pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        a left_val  right_val
    0   1        a        NaN
    1   5        b        3.0
    2  10        c        7.0

    >>> pd.merge_asof(left, right, on="a", direction="forward")
        a left_val  right_val
    0   1        a        1.0
    1   5        b        6.0
    2  10        c        NaN

    >>> pd.merge_asof(left, right, on="a", direction="nearest")
        a left_val  right_val
    0   1        a          1
    1   5        b          6
    2  10        c          7

    We can use indexed DataFrames as well.

    >>> left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    >>> left
       left_val
    1         a
    5         b
    10        c

    >>> right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])
    >>> right
       right_val
    1          1
    2          2
    3          3
    6          6
    7          7

    >>> pd.merge_asof(left, right, left_index=True, right_index=True)
       left_val  right_val
    1         a          1
    5         b          3
    10        c          7

    Here is a real-world time-series example

    >>> quotes = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.030"),
    ...             pd.Timestamp("2016-05-25 13:30:00.041"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.049"),
    ...             pd.Timestamp("2016-05-25 13:30:00.072"),
    ...             pd.Timestamp("2016-05-25 13:30:00.075")
    ...         ],
    ...         "ticker": [
    ...             "GOOG",
    ...             "MSFT",
    ...             "MSFT",
    ...             "MSFT",
    ...             "GOOG",
    ...             "AAPL",
    ...             "GOOG",
    ...             "MSFT"
    ...         ],
    ...         "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
    ...         "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03]
    ...     }
    ... )
    >>> quotes
                         time ticker     bid     ask
    0 2016-05-25 13:30:00.023   GOOG  720.50  720.93
    1 2016-05-25 13:30:00.023   MSFT   51.95   51.96
    2 2016-05-25 13:30:00.030   MSFT   51.97   51.98
    3 2016-05-25 13:30:00.041   MSFT   51.99   52.00
    4 2016-05-25 13:30:00.048   GOOG  720.50  720.93
    5 2016-05-25 13:30:00.049   AAPL   97.99   98.01
    6 2016-05-25 13:30:00.072   GOOG  720.50  720.88
    7 2016-05-25 13:30:00.075   MSFT   52.01   52.03

    >>> trades = pd.DataFrame(
    ...     {
    ...         "time": [
    ...             pd.Timestamp("2016-05-25 13:30:00.023"),
    ...             pd.Timestamp("2016-05-25 13:30:00.038"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048"),
    ...             pd.Timestamp("2016-05-25 13:30:00.048")
    ...         ],
    ...         "ticker": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
    ...         "price": [51.95, 51.95, 720.77, 720.92, 98.0],
    ...         "quantity": [75, 155, 100, 100, 100]
    ...     }
    ... )
    >>> trades
                         time ticker   price  quantity
    0 2016-05-25 13:30:00.023   MSFT   51.95        75
    1 2016-05-25 13:30:00.038   MSFT   51.95       155
    2 2016-05-25 13:30:00.048   GOOG  720.77       100
    3 2016-05-25 13:30:00.048   GOOG  720.92       100
    4 2016-05-25 13:30:00.048   AAPL   98.00       100

    By default we are taking the asof of the quotes

    >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155   51.97   51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 2ms between the quote time and the trade time

    >>> pd.merge_asof(
    ...     trades, quotes, on="time", by="ticker", tolerance=pd.Timedelta("2ms")
    ... )
                         time ticker   price  quantity     bid     ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75   51.95   51.96
    1 2016-05-25 13:30:00.038   MSFT   51.95       155     NaN     NaN
    2 2016-05-25 13:30:00.048   GOOG  720.77       100  720.50  720.93
    3 2016-05-25 13:30:00.048   GOOG  720.92       100  720.50  720.93
    4 2016-05-25 13:30:00.048   AAPL   98.00       100     NaN     NaN

    We only asof within 10ms between the quote time and the trade time
    and we exclude exact matches on time. However *prior* data will
    propagate forward

    >>> pd.merge_asof(
    ...     trades,
    ...     quotes,
    ...     on="time",
    ...     by="ticker",
    ...     tolerance=pd.Timedelta("10ms"),
    ...     allow_exact_matches=False
    ... )
                         time ticker   price  quantity    bid    ask
    0 2016-05-25 13:30:00.023   MSFT   51.95        75    NaN    NaN
    1 2016-05-25 13:30:00.038   MSFT   51.95       155  51.97  51.98
    2 2016-05-25 13:30:00.048   GOOG  720.77       100    NaN    NaN
    3 2016-05-25 13:30:00.048   GOOG  720.92       100    NaN    NaN
    4 2016-05-25 13:30:00.048   AAPL   98.00       100    NaN    NaN
    """
    op = _AsOfMerge(
        left,
        right,
        on=on,
        left_on=left_on,
        right_on=right_on,
        left_index=left_index,
        right_index=right_index,
        by=by,
        left_by=left_by,
        right_by=right_by,
        suffixes=suffixes,
        how="asof",
        tolerance=tolerance,
        allow_exact_matches=allow_exact_matches,
        direction=direction,
    )
    return op.get_result()

# TODO: transformations??
# TODO: only copy DataFrames when modification necessary
class _MergeOperation:
    """
    Perform a database (SQL) merge operation between two DataFrame or Series
    objects using either columns as keys or their row indexes
    """

    _merge_type = "merge"
    how: MergeHow | Literal["asof"]
    on: IndexLabel | None
    # left_on/right_on may be None when passed, but in validate_specification
    #  get replaced with non-None.
    left_on: Sequence[Hashable | AnyArrayLike]
    right_on: Sequence[Hashable | AnyArrayLike]
    left_index: bool
    right_index: bool
    axis: AxisInt
    bm_axis: AxisInt
    sort: bool
    suffixes: Suffixes
    copy: bool
    indicator: str | bool
    validate: str | None
    join_names: list[Hashable]
    right_join_keys: list[AnyArrayLike]
    left_join_keys: list[AnyArrayLike]

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        how: MergeHow | Literal["asof"] = "inner",
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        axis: AxisInt = 1,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = True,
        suffixes: Suffixes = ("_x", "_y"),
        indicator: str | bool = False,
        validate: str | None = None,
    ) -> None:
        _left = _validate_operand(left)
        _right = _validate_operand(right)
        self.left = self.orig_left = _left
        self.right = self.orig_right = _right
        self.how = how

        # bm_axis -> the axis on the BlockManager
        self.bm_axis = axis
        # axis --> the axis on the Series/DataFrame
        self.axis = 1 - axis if self.left.ndim == 2 else 0

        self.on = com.maybe_make_list(on)

        self.suffixes = suffixes
        self.sort = sort

        self.left_index = left_index
        self.right_index = right_index

        self.indicator = indicator

        if not is_bool(left_index):
            raise ValueError(
                f"left_index parameter must be of type bool, not {type(left_index)}"
            )
        if not is_bool(right_index):
            raise ValueError(
                f"right_index parameter must be of type bool, not {type(right_index)}"
            )

        # GH 40993: raise when merging between different levels; enforced in 2.0
        if _left.columns.nlevels != _right.columns.nlevels:
            msg = (
                "Not allowed to merge between different levels. "
                f"({_left.columns.nlevels} levels on the left, "
                f"{_right.columns.nlevels} on the right)"
            )
            raise MergeError(msg)

        self.left_on, self.right_on = self._validate_left_right_on(left_on, right_on)

        cross_col = None
        if self.how == "cross":
            (
                self.left,
                self.right,
                self.how,
                cross_col,
            ) = self._create_cross_configuration(self.left, self.right)
            self.left_on = self.right_on = [cross_col]
        self._cross = cross_col

        # note this function has side effects
        (
            self.left_join_keys,
            self.right_join_keys,
            self.join_names,
        ) = self._get_merge_keys()

        # validate the merge keys dtypes. We may need to coerce
        # to avoid incompatible dtypes
        self._maybe_coerce_merge_keys()

        # If argument passed to validate,
        # check if columns specified as unique
        # are in fact unique.
        if validate is not None:
            self._validate(validate)
    def _reindex_and_concat(
        self,
        join_index: Index,
        left_indexer: npt.NDArray[np.intp] | None,
        right_indexer: npt.NDArray[np.intp] | None,
        copy: bool | None,
    ) -> DataFrame:
        """
        reindex along index and concat along columns.
        """
        # Take views so we do not alter the originals
        left = self.left[:]
        right = self.right[:]

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        if left_indexer is not None and not is_range_indexer(left_indexer, len(left)):
            # Pinning the index here (and in the right code just below) is not
            #  necessary, but makes the `.take` more performant if we have e.g.
            #  a MultiIndex for left.index.
            lmgr = left._mgr.reindex_indexer(
                join_index,
                left_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            left = left._constructor(lmgr)
            left.index = join_index

        if right_indexer is not None and not is_range_indexer(
            right_indexer, len(right)
        ):
            rmgr = right._mgr.reindex_indexer(
                join_index,
                right_indexer,
                axis=1,
                copy=False,
                only_slice=True,
                allow_dups=True,
                use_na_proxy=True,
            )
            right = right._constructor(rmgr)
            right.index = join_index

        from pandas import concat

        left.columns = llabels
        right.columns = rlabels
        result = concat([left, right], axis=1, copy=copy)
        return result
    def get_result(self, copy: bool | None = True) -> DataFrame:
        if self.indicator:
            self.left, self.right = self._indicator_pre_merge(self.left, self.right)

        join_index, left_indexer, right_indexer = self._get_join_info()

        result = self._reindex_and_concat(
            join_index, left_indexer, right_indexer, copy=copy
        )
        result = result.__finalize__(self, method=self._merge_type)

        if self.indicator:
            result = self._indicator_post_merge(result)

        self._maybe_add_join_keys(result, left_indexer, right_indexer)

        self._maybe_restore_index_levels(result)

        self._maybe_drop_cross_column(result, self._cross)

        return result.__finalize__(self, method="merge")

    def _maybe_drop_cross_column(
        self, result: DataFrame, cross_col: str | None
    ) -> None:
        if cross_col is not None:
            del result[cross_col]

    @cache_readonly
    def _indicator_name(self) -> str | None:
        if isinstance(self.indicator, str):
            return self.indicator
        elif isinstance(self.indicator, bool):
            return "_merge" if self.indicator else None
        else:
            raise ValueError(
                "indicator option can only accept boolean or string arguments"
            )

    def _indicator_pre_merge(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame]:
        columns = left.columns.union(right.columns)

        for i in ["_left_indicator", "_right_indicator"]:
            if i in columns:
                raise ValueError(
                    "Cannot use `indicator=True` option when "
                    f"data contains a column named {i}"
                )
        if self._indicator_name in columns:
            raise ValueError(
                "Cannot use name of an existing column for indicator column"
            )

        left = left.copy()
        right = right.copy()

        left["_left_indicator"] = 1
        left["_left_indicator"] = left["_left_indicator"].astype("int8")

        right["_right_indicator"] = 2
        right["_right_indicator"] = right["_right_indicator"].astype("int8")

        return left, right
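
    # The helper columns added above encode provenance: left-only rows sum to
    # 1, right-only rows sum to 2, and matched rows sum to 1 + 2 = 3.
    # _indicator_post_merge below fills the missing side with 0 and maps the
    # sums [1, 2, 3] onto ["left_only", "right_only", "both"].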
    def _indicator_post_merge(self, result: DataFrame) -> DataFrame:
        result["_left_indicator"] = result["_left_indicator"].fillna(0)
        result["_right_indicator"] = result["_right_indicator"].fillna(0)

        result[self._indicator_name] = Categorical(
            (result["_left_indicator"] + result["_right_indicator"]),
            categories=[1, 2, 3],
        )
        result[self._indicator_name] = result[
            self._indicator_name
        ].cat.rename_categories(["left_only", "right_only", "both"])

        result = result.drop(labels=["_left_indicator", "_right_indicator"], axis=1)
        return result

    def _maybe_restore_index_levels(self, result: DataFrame) -> None:
        """
        Restore index levels specified as `on` parameters

        Here we check for cases where `self.left_on` and `self.right_on` pairs
        each reference an index level in their respective DataFrames. The
        joined columns corresponding to these pairs are then restored to the
        index of `result`.

        **Note:** This method has side effects. It modifies `result` in-place

        Parameters
        ----------
        result: DataFrame
            merge result

        Returns
        -------
        None
        """
        names_to_restore = []
        for name, left_key, right_key in zip(
            self.join_names, self.left_on, self.right_on
        ):
            if (
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                self.orig_left._is_level_reference(left_key)  # type: ignore[arg-type]
                # Argument 1 to "_is_level_reference" of "NDFrame" has incompatible
                # type "Union[Hashable, ExtensionArray, Index, Series]"; expected
                # "Hashable"
                and self.orig_right._is_level_reference(
                    right_key  # type: ignore[arg-type]
                )
                and left_key == right_key
                and name not in result.index.names
            ):
                names_to_restore.append(name)

        if names_to_restore:
            result.set_index(names_to_restore, inplace=True)

    def _maybe_add_join_keys(
        self,
        result: DataFrame,
        left_indexer: np.ndarray | None,
        right_indexer: np.ndarray | None,
    ) -> None:
        left_has_missing = None
        right_has_missing = None

        assert all(is_array_like(x) for x in self.left_join_keys)

        keys = zip(self.join_names, self.left_on, self.right_on)
        for i, (name, lname, rname) in enumerate(keys):
            if not _should_fill(lname, rname):
                continue

            take_left, take_right = None, None

            if name in result:
                if left_indexer is not None and right_indexer is not None:
                    if name in self.left:
                        if left_has_missing is None:
                            left_has_missing = (left_indexer == -1).any()

                        if left_has_missing:
                            take_right = self.right_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.left[name].dtype
                            ):
                                take_left = self.left[name]._values

                    elif name in self.right:
                        if right_has_missing is None:
                            right_has_missing = (right_indexer == -1).any()

                        if right_has_missing:
                            take_left = self.left_join_keys[i]

                            if not is_dtype_equal(
                                result[name].dtype, self.right[name].dtype
                            ):
                                take_right = self.right[name]._values

            elif left_indexer is not None:
                take_left = self.left_join_keys[i]
                take_right = self.right_join_keys[i]

            if take_left is not None or take_right is not None:
                if take_left is None:
                    lvals = result[name]._values
                else:
                    # TODO: can we pin down take_left's type earlier?
                    take_left = extract_array(take_left, extract_numpy=True)
                    lfill = na_value_for_dtype(take_left.dtype)
                    lvals = algos.take_nd(take_left, left_indexer, fill_value=lfill)

                if take_right is None:
                    rvals = result[name]._values
                else:
                    # TODO: can we pin down take_right's type earlier?
                    taker = extract_array(take_right, extract_numpy=True)
                    rfill = na_value_for_dtype(taker.dtype)
                    rvals = algos.take_nd(taker, right_indexer, fill_value=rfill)

                # if we have an all missing left_indexer
                # make sure to just use the right values or vice-versa
                mask_left = left_indexer == -1
                # error: Item "bool" of "Union[Any, bool]" has no attribute "all"
                if mask_left.all():  # type: ignore[union-attr]
                    key_col = Index(rvals)
                    result_dtype = rvals.dtype
                elif right_indexer is not None and (right_indexer == -1).all():
                    key_col = Index(lvals)
                    result_dtype = lvals.dtype
                else:
                    key_col = Index(lvals).where(~mask_left, rvals)
                    result_dtype = find_common_type([lvals.dtype, rvals.dtype])
                    if (
                        lvals.dtype.kind == "M"
                        and rvals.dtype.kind == "M"
                        and result_dtype.kind == "O"
                    ):
                        # TODO(non-nano) Workaround for common_type not dealing
                        # with different resolutions
                        result_dtype = key_col.dtype

                if result._is_label_reference(name):
                    result[name] = Series(
                        key_col, dtype=result_dtype, index=result.index
                    )
                elif result._is_level_reference(name):
                    if isinstance(result.index, MultiIndex):
                        key_col.name = name
                        idx_list = [
                            result.index.get_level_values(level_name)
                            if level_name != name
                            else key_col
                            for level_name in result.index.names
                        ]

                        result.set_index(idx_list, inplace=True)
                    else:
                        result.index = Index(key_col, name=name)
                else:
                    result.insert(i, name or f"key_{i}", key_col)

    def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        """return the join indexers"""
        return get_join_indexers(
            self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how
        )
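
    # left_indexer/right_indexer are positional (intp) indexers into the left
    # and right frames; positions with no match on a side are marked with the
    # sentinel -1. That sentinel is what _maybe_add_join_keys above and
    # _create_join_index below test against when filling in missing keys.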
    def _get_join_info(
        self,
    ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]:
        # make mypy happy
        assert self.how != "cross"
        left_ax = self.left.axes[self.axis]
        right_ax = self.right.axes[self.axis]

        if self.left_index and self.right_index and self.how != "asof":
            join_index, left_indexer, right_indexer = left_ax.join(
                right_ax, how=self.how, return_indexers=True, sort=self.sort
            )

        elif self.right_index and self.how == "left":
            join_index, left_indexer, right_indexer = _left_join_on_index(
                left_ax, right_ax, self.left_join_keys, sort=self.sort
            )

        elif self.left_index and self.how == "right":
            join_index, right_indexer, left_indexer = _left_join_on_index(
                right_ax, left_ax, self.right_join_keys, sort=self.sort
            )
        else:
            (left_indexer, right_indexer) = self._get_join_indexers()

            if self.right_index:
                if len(self.left) > 0:
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="right",
                    )
                else:
                    join_index = self.right.index.take(right_indexer)
            elif self.left_index:
                if self.how == "asof":
                    # GH#33463 asof should always behave like a left merge
                    join_index = self._create_join_index(
                        self.left.index,
                        self.right.index,
                        left_indexer,
                        how="left",
                    )

                elif len(self.right) > 0:
                    join_index = self._create_join_index(
                        self.right.index,
                        self.left.index,
                        right_indexer,
                        how="left",
                    )
                else:
                    join_index = self.left.index.take(left_indexer)
            else:
                join_index = default_index(len(left_indexer))

        if len(join_index) == 0 and not isinstance(join_index, MultiIndex):
            join_index = default_index(0).set_names(join_index.name)
        return join_index, left_indexer, right_indexer

    def _create_join_index(
        self,
        index: Index,
        other_index: Index,
        indexer: npt.NDArray[np.intp],
        how: JoinHow = "left",
    ) -> Index:
        """
        Create a join index by rearranging one index to match another

        Parameters
        ----------
        index : Index being rearranged
        other_index : Index used to supply values not found in index
        indexer : np.ndarray[np.intp] how to rearrange index
        how : str
            Replacement is only necessary if indexer based on other_index.

        Returns
        -------
        Index
        """
        if self.how in (how, "outer") and not isinstance(other_index, MultiIndex):
            # if final index requires values in other_index but not target
            # index, indexer may hold missing (-1) values, causing Index.take
            # to take the final value in target index. So, we set the last
            # element to be the desired fill value. We do not use allow_fill
            # and fill_value because it throws a ValueError on integer indices
            mask = indexer == -1
            if np.any(mask):
                fill_value = na_value_for_dtype(index.dtype, compat=False)
                index = index.append(Index([fill_value]))
        return index.take(indexer)
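
    # Illustrative sketch of the append-then-take trick above: with
    # index = Index([10, 20]) and indexer = [1, -1], a plain take would wrap
    # -1 around to the last real element, so a NaN is appended first and -1
    # then resolves to that fill value:
    #
    #   >>> Index([10, 20]).append(Index([np.nan])).take([1, -1])
    #   Index([20.0, nan], dtype='float64')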
    def _get_merge_keys(
        self,
    ) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]:
        """
        Note: has side effects (copy/delete key columns)

        Parameters
        ----------
        left
        right
        on

        Returns
        -------
        left_keys, right_keys, join_names
        """
        # left_keys, right_keys entries can actually be anything listlike
        # with a 'dtype' attr
        left_keys: list[AnyArrayLike] = []
        right_keys: list[AnyArrayLike] = []
        join_names: list[Hashable] = []
        right_drop: list[Hashable] = []
        left_drop: list[Hashable] = []

        left, right = self.left, self.right

        is_lkey = lambda x: is_array_like(x) and len(x) == len(left)
        is_rkey = lambda x: is_array_like(x) and len(x) == len(right)

        # Note that pd.merge_asof() has separate 'on' and 'by' parameters. A
        # user could, for example, request 'left_index' and 'left_by'. In a
        # regular pd.merge(), users cannot specify both 'left_index' and
        # 'left_on'. (Instead, users have a MultiIndex). That means the
        # self.left_on in this function is always empty in a pd.merge(), but
        # a pd.merge_asof(left_index=True, left_by=...) will result in a
        # self.left_on array with a None in the middle of it. This requires
        # a work-around as designated in the code below.
        # See _validate_left_right_on() for where this happens.

        # ugh, spaghetti re #733
        if _any(self.left_on) and _any(self.right_on):
            for lk, rk in zip(self.left_on, self.right_on):
                if is_lkey(lk):
                    lk = cast(AnyArrayLike, lk)
                    left_keys.append(lk)
                    if is_rkey(rk):
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                        join_names.append(None)  # what to do?
                    else:
                        # Then we're either Hashable or a wrong-length arraylike,
                        #  the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                            join_names.append(rk)
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                            join_names.append(right.index.name)
                else:
                    if not is_rkey(rk):
                        # Then we're either Hashable or a wrong-length arraylike,
                        #  the latter of which will raise
                        rk = cast(Hashable, rk)
                        if rk is not None:
                            right_keys.append(right._get_label_or_level_values(rk))
                        else:
                            # work-around for merge_asof(right_index=True)
                            right_keys.append(right.index)
                        if lk is not None and lk == rk:  # FIXME: what about other NAs?
                            # avoid key upcast in corner case (length-0)
                            lk = cast(Hashable, lk)
                            if len(left) > 0:
                                right_drop.append(rk)
                            else:
                                left_drop.append(lk)
                    else:
                        rk = cast(AnyArrayLike, rk)
                        right_keys.append(rk)
                    if lk is not None:
                        # Then we're either Hashable or a wrong-length arraylike,
                        #  the latter of which will raise
                        lk = cast(Hashable, lk)
                        left_keys.append(left._get_label_or_level_values(lk))
                        join_names.append(lk)
                    else:
                        # work-around for merge_asof(left_index=True)
                        left_keys.append(left.index)
                        join_names.append(left.index.name)
        elif _any(self.left_on):
            for k in self.left_on:
                if is_lkey(k):
                    k = cast(AnyArrayLike, k)
                    left_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    #  the latter of which will raise
                    k = cast(Hashable, k)
                    left_keys.append(left._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.right.index, MultiIndex):
                right_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.right.index.levels, self.right.index.codes
                    )
                ]
            else:
                right_keys = [self.right.index._values]
        elif _any(self.right_on):
            for k in self.right_on:
                if is_rkey(k):
                    k = cast(AnyArrayLike, k)
                    right_keys.append(k)
                    join_names.append(None)
                else:
                    # Then we're either Hashable or a wrong-length arraylike,
                    #  the latter of which will raise
                    k = cast(Hashable, k)
                    right_keys.append(right._get_label_or_level_values(k))
                    join_names.append(k)
            if isinstance(self.left.index, MultiIndex):
                left_keys = [
                    lev._values.take(lev_codes)
                    for lev, lev_codes in zip(
                        self.left.index.levels, self.left.index.codes
                    )
                ]
            else:
                left_keys = [self.left.index._values]

        if left_drop:
            self.left = self.left._drop_labels_or_levels(left_drop)

        if right_drop:
            self.right = self.right._drop_labels_or_levels(right_drop)

        return left_keys, right_keys, join_names
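
    # _maybe_coerce_merge_keys below reconciles key dtypes pairwise. As an
    # illustration of the int/float case it warns about: merging an int64 key
    # [1, 2] with a float64 key [1.0, 2.5] proceeds, but 2.5 has no exact int
    # representation, so a UserWarning is emitted; clearly incompatible pairs
    # (datetimelike vs. numeric, string-like vs. numeric) raise ValueError
    # instead.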
  1127. def _maybe_coerce_merge_keys(self) -> None:
  1128. # we have valid merges but we may have to further
  1129. # coerce these if they are originally incompatible types
  1130. #
  1131. # for example if these are categorical, but are not dtype_equal
  1132. # or if we have object and integer dtypes
  1133. for lk, rk, name in zip(
  1134. self.left_join_keys, self.right_join_keys, self.join_names
  1135. ):
  1136. if (len(lk) and not len(rk)) or (not len(lk) and len(rk)):
  1137. continue
  1138. lk = extract_array(lk, extract_numpy=True)
  1139. rk = extract_array(rk, extract_numpy=True)
  1140. lk_is_cat = is_categorical_dtype(lk.dtype)
  1141. rk_is_cat = is_categorical_dtype(rk.dtype)
  1142. lk_is_object = is_object_dtype(lk.dtype)
  1143. rk_is_object = is_object_dtype(rk.dtype)
  1144. # if either left or right is a categorical
  1145. # then the must match exactly in categories & ordered
  1146. if lk_is_cat and rk_is_cat:
  1147. lk = cast(Categorical, lk)
  1148. rk = cast(Categorical, rk)
  1149. if lk._categories_match_up_to_permutation(rk):
  1150. continue
  1151. elif lk_is_cat or rk_is_cat:
  1152. pass
  1153. elif is_dtype_equal(lk.dtype, rk.dtype):
  1154. continue
  1155. msg = (
  1156. f"You are trying to merge on {lk.dtype} and "
  1157. f"{rk.dtype} columns. If you wish to proceed you should use pd.concat"
  1158. )
  1159. # if we are numeric, then allow differing
  1160. # kinds to proceed, eg. int64 and int8, int and float
  1161. # further if we are object, but we infer to
  1162. # the same, then proceed
  1163. if is_numeric_dtype(lk.dtype) and is_numeric_dtype(rk.dtype):
  1164. if lk.dtype.kind == rk.dtype.kind:
  1165. continue
  1166. # check whether ints and floats
  1167. if is_integer_dtype(rk.dtype) and is_float_dtype(lk.dtype):
  1168. # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
  1169. with np.errstate(invalid="ignore"):
  1170. # error: Argument 1 to "astype" of "ndarray" has incompatible
  1171. # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
  1172. # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
  1173. casted = lk.astype(rk.dtype) # type: ignore[arg-type]
  1174. mask = ~np.isnan(lk)
  1175. match = lk == casted
  1176. if not match[mask].all():
  1177. warnings.warn(
  1178. "You are merging on int and float "
  1179. "columns where the float values "
  1180. "are not equal to their int representation.",
  1181. UserWarning,
                        stacklevel=find_stack_level(),
                    )
                continue

            if is_float_dtype(rk.dtype) and is_integer_dtype(lk.dtype):
                # GH 47391 numpy > 1.24 will raise a RuntimeError for nan -> int
                with np.errstate(invalid="ignore"):
                    # error: Argument 1 to "astype" of "ndarray" has incompatible
                    # type "Union[ExtensionDtype, Any, dtype[Any]]"; expected
                    # "Union[dtype[Any], Type[Any], _SupportsDType[dtype[Any]]]"
                    casted = rk.astype(lk.dtype)  # type: ignore[arg-type]

                    mask = ~np.isnan(rk)
                    match = rk == casted
                    if not match[mask].all():
                        warnings.warn(
                            "You are merging on int and float "
                            "columns where the float values "
                            "are not equal to their int representation.",
                            UserWarning,
                            stacklevel=find_stack_level(),
                        )
                continue

            # let's infer and see if we are ok
            if lib.infer_dtype(lk, skipna=False) == lib.infer_dtype(
                rk, skipna=False
            ):
                continue

            # Check if we are trying to merge on obviously
            # incompatible dtypes GH 9780, GH 15800

            # bool values are coerced to object
            elif (lk_is_object and is_bool_dtype(rk.dtype)) or (
                is_bool_dtype(lk.dtype) and rk_is_object
            ):
                pass

            # object values are allowed to be merged
            elif (lk_is_object and is_numeric_dtype(rk.dtype)) or (
                is_numeric_dtype(lk.dtype) and rk_is_object
            ):
                inferred_left = lib.infer_dtype(lk, skipna=False)
                inferred_right = lib.infer_dtype(rk, skipna=False)
                bool_types = ["integer", "mixed-integer", "boolean", "empty"]
                string_types = ["string", "unicode", "mixed", "bytes", "empty"]

                # inferred bool
                if inferred_left in bool_types and inferred_right in bool_types:
                    pass

                # unless we are merging non-string-like with string-like
                elif (
                    inferred_left in string_types
                    and inferred_right not in string_types
                ) or (
                    inferred_right in string_types
                    and inferred_left not in string_types
                ):
                    raise ValueError(msg)

            # datetimelikes must match exactly
            elif needs_i8_conversion(lk.dtype) and not needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif not needs_i8_conversion(lk.dtype) and needs_i8_conversion(rk.dtype):
                raise ValueError(msg)
            elif isinstance(lk.dtype, DatetimeTZDtype) and not isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif not isinstance(lk.dtype, DatetimeTZDtype) and isinstance(
                rk.dtype, DatetimeTZDtype
            ):
                raise ValueError(msg)
            elif (
                isinstance(lk.dtype, DatetimeTZDtype)
                and isinstance(rk.dtype, DatetimeTZDtype)
            ) or (lk.dtype.kind == "M" and rk.dtype.kind == "M"):
                # allows datetime with different resolutions
                continue

            elif lk_is_object and rk_is_object:
                continue

            # Houston, we have a problem!
            # let's coerce to object if the dtypes aren't
            # categorical, otherwise coerce to the category
            # dtype. If we coerced categories to object,
            # then we would lose type information on some
            # columns, and end up trying to merge
            # incompatible dtypes. See GH 16900.
            if name in self.left.columns:
                typ = cast(Categorical, lk).categories.dtype if lk_is_cat else object
                self.left = self.left.copy()
                self.left[name] = self.left[name].astype(typ)
            if name in self.right.columns:
                typ = cast(Categorical, rk).categories.dtype if rk_is_cat else object
                self.right = self.right.copy()
                self.right[name] = self.right[name].astype(typ)
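
    # Illustrative sketch (not part of the class) of the int/float branch
    # above, which warns when float keys are not exactly representable as
    # ints:
    # >>> left = pd.DataFrame({"a": [1, 2, 3]})
    # >>> right = pd.DataFrame({"a": [1.0, 2.5, 3.0]})
    # >>> left.merge(right, on="a")  # UserWarning: merging on int and float
    # ...                            # columns where the float values are not
    # ...                            # equal to their int representation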

    def _create_cross_configuration(
        self, left: DataFrame, right: DataFrame
    ) -> tuple[DataFrame, DataFrame, JoinHow, str]:
        """
        Creates the configuration to dispatch the cross operation to inner join,
        e.g. adding a join column and resetting parameters. Join column is added
        to a new object, no inplace modification

        Parameters
        ----------
        left : DataFrame
        right : DataFrame

        Returns
        -------
        a tuple (left, right, how, cross_col) representing the adjusted
        DataFrames with cross_col, the merge operation set to inner and the column
        to join over.
        """
        cross_col = f"_cross_{uuid.uuid4()}"
        how: JoinHow = "inner"
        return (
            left.assign(**{cross_col: 1}),
            right.assign(**{cross_col: 1}),
            how,
            cross_col,
        )
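
    # Usage sketch (illustrative): dispatching a cross merge this way is
    # equivalent to joining on a constant helper column and dropping it
    # afterwards, where "key" stands in for the uuid-based cross_col above:
    # >>> left.assign(key=1).merge(right.assign(key=1), on="key").drop(
    # ...     columns="key"
    # ... )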

    def _validate_left_right_on(self, left_on, right_on):
        left_on = com.maybe_make_list(left_on)
        right_on = com.maybe_make_list(right_on)

        if self.how == "cross":
            if (
                self.left_index
                or self.right_index
                or right_on is not None
                or left_on is not None
                or self.on is not None
            ):
                raise MergeError(
                    "Can not pass on, right_on, left_on or set right_index=True or "
                    "left_index=True"
                )
        # Hm, any way to make this logic less complicated??
        elif self.on is None and left_on is None and right_on is None:
            if self.left_index and self.right_index:
                left_on, right_on = (), ()
            elif self.left_index:
                raise MergeError("Must pass right_on or right_index=True")
            elif self.right_index:
                raise MergeError("Must pass left_on or left_index=True")
            else:
                # use the common columns
                left_cols = self.left.columns
                right_cols = self.right.columns
                common_cols = left_cols.intersection(right_cols)
                if len(common_cols) == 0:
                    raise MergeError(
                        "No common columns to perform merge on. "
                        f"Merge options: left_on={left_on}, "
                        f"right_on={right_on}, "
                        f"left_index={self.left_index}, "
                        f"right_index={self.right_index}"
                    )
                if (
                    not left_cols.join(common_cols, how="inner").is_unique
                    or not right_cols.join(common_cols, how="inner").is_unique
                ):
                    raise MergeError(f"Data columns not unique: {repr(common_cols)}")
                left_on = right_on = common_cols
        elif self.on is not None:
            if left_on is not None or right_on is not None:
                raise MergeError(
                    'Can only pass argument "on" OR "left_on" '
                    'and "right_on", not a combination of both.'
                )
            if self.left_index or self.right_index:
                raise MergeError(
                    'Can only pass argument "on" OR "left_index" '
                    'and "right_index", not a combination of both.'
                )
            left_on = right_on = self.on
        elif left_on is not None:
            if self.left_index:
                raise MergeError(
                    'Can only pass argument "left_on" OR "left_index" not both.'
                )
            if not self.right_index and right_on is None:
                raise MergeError('Must pass "right_on" OR "right_index".')
            n = len(left_on)
            if self.right_index:
                if len(left_on) != self.right.index.nlevels:
                    raise ValueError(
                        "len(left_on) must equal the number "
                        'of levels in the index of "right"'
                    )
                right_on = [None] * n
        elif right_on is not None:
            if self.right_index:
                raise MergeError(
                    'Can only pass argument "right_on" OR "right_index" not both.'
                )
            if not self.left_index and left_on is None:
                raise MergeError('Must pass "left_on" OR "left_index".')
            n = len(right_on)
            if self.left_index:
                if len(right_on) != self.left.index.nlevels:
                    raise ValueError(
                        "len(right_on) must equal the number "
                        'of levels in the index of "left"'
                    )
                left_on = [None] * n

        if self.how != "cross" and len(right_on) != len(left_on):
            raise ValueError("len(right_on) must equal len(left_on)")

        return left_on, right_on
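
    # Illustrative examples of combinations rejected above (a sketch, not
    # exhaustive):
    # >>> pd.merge(left, right, on="a", left_on="a")  # MergeError: "on" OR
    # ...                                             # "left_on"/"right_on"
    # >>> pd.merge(left, right, left_on="a")          # MergeError: must pass
    # ...                                             # "right_on" OR
    # ...                                             # "right_index"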

    def _validate(self, validate: str) -> None:
        # Check uniqueness of each
        if self.left_index:
            left_unique = self.orig_left.index.is_unique
        else:
            left_unique = MultiIndex.from_arrays(self.left_join_keys).is_unique

        if self.right_index:
            right_unique = self.orig_right.index.is_unique
        else:
            right_unique = MultiIndex.from_arrays(self.right_join_keys).is_unique

        # Check data integrity
        if validate in ["one_to_one", "1:1"]:
            if not left_unique and not right_unique:
                raise MergeError(
                    "Merge keys are not unique in either left "
                    "or right dataset; not a one-to-one merge"
                )
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-one merge"
                )
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; not a one-to-one merge"
                )

        elif validate in ["one_to_many", "1:m"]:
            if not left_unique:
                raise MergeError(
                    "Merge keys are not unique in left dataset; not a one-to-many merge"
                )

        elif validate in ["many_to_one", "m:1"]:
            if not right_unique:
                raise MergeError(
                    "Merge keys are not unique in right dataset; "
                    "not a many-to-one merge"
                )

        elif validate in ["many_to_many", "m:m"]:
            pass

        else:
            raise ValueError(
                f'"{validate}" is not a valid argument. '
                "Valid arguments are:\n"
                '- "1:1"\n'
                '- "1:m"\n'
                '- "m:1"\n'
                '- "m:m"\n'
                '- "one_to_one"\n'
                '- "one_to_many"\n'
                '- "many_to_one"\n'
                '- "many_to_many"'
            )
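
    # Usage sketch (illustrative): with validate="1:1", duplicate keys on
    # either side raise, e.g.
    # >>> left = pd.DataFrame({"a": [1, 1]})
    # >>> right = pd.DataFrame({"a": [1, 2]})
    # >>> left.merge(right, on="a", validate="1:1")  # MergeError: keys not
    # ...                                            # unique in left dataset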


def get_join_indexers(
    left_keys,
    right_keys,
    sort: bool = False,
    how: MergeHow | Literal["asof"] = "inner",
    **kwargs,
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """
    Parameters
    ----------
    left_keys : ndarray, Index, Series
    right_keys : ndarray, Index, Series
    sort : bool, default False
    how : {'inner', 'outer', 'left', 'right'}, default 'inner'

    Returns
    -------
    np.ndarray[np.intp]
        Indexer into the left_keys.
    np.ndarray[np.intp]
        Indexer into the right_keys.
    """
    assert len(left_keys) == len(
        right_keys
    ), "left_keys and right_keys must be the same length"

    # fast-path for empty left/right
    left_n = len(left_keys[0])
    right_n = len(right_keys[0])
    if left_n == 0:
        if how in ["left", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["right", "outer"]:
            return _get_no_sort_one_missing_indexer(right_n, True)
    elif right_n == 0:
        if how in ["right", "inner", "cross"]:
            return _get_empty_indexer()
        elif not sort and how in ["left", "outer"]:
            return _get_no_sort_one_missing_indexer(left_n, False)

    # get left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(left_keys[n], right_keys[n], sort=sort, how=how)
        for n in range(len(left_keys))
    )
    zipped = zip(*mapped)
    llab, rlab, shape = (list(x) for x in zipped)

    # get flat i8 keys from label lists
    lkey, rkey = _get_join_keys(llab, rlab, tuple(shape), sort)

    # factorize keys to a dense i8 space
    # `count` is the num. of unique keys
    # set(lkey) | set(rkey) == range(count)
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort, how=how)

    # preserve left frame order if how == 'left' and sort == False
    kwargs = cp.copy(kwargs)
    if how in ("left", "right"):
        kwargs["sort"] = sort
    join_func = {
        "inner": libjoin.inner_join,
        "left": libjoin.left_outer_join,
        "right": lambda x, y, count, **kwargs: libjoin.left_outer_join(
            y, x, count, **kwargs
        )[::-1],
        "outer": libjoin.full_outer_join,
    }[how]

    # error: Cannot call function of unknown type
    return join_func(lkey, rkey, count, **kwargs)  # type: ignore[operator]
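
# Illustrative sketch of the indexers returned above, for a small inner join
# (output shown for exposition only):
# >>> lk = [np.array([1, 2, 2])]
# >>> rk = [np.array([2, 3])]
# >>> get_join_indexers(lk, rk, how="inner")
# (array([1, 2]), array([0, 0]))
# i.e. left rows 1 and 2 each pair with right row 0 via the shared key 2.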


def restore_dropped_levels_multijoin(
    left: MultiIndex,
    right: MultiIndex,
    dropped_level_names,
    join_index: Index,
    lindexer: npt.NDArray[np.intp],
    rindexer: npt.NDArray[np.intp],
) -> tuple[list[Index], npt.NDArray[np.intp], list[Hashable]]:
    """
    *this is an internal non-public method*

    Returns the levels, labels and names of a multi-index to multi-index join.
    Depending on the type of join, this method restores the appropriate
    dropped levels of the joined multi-index.
    The method relies on lindexer, rindexer which hold the index positions of
    left and right, where a join was feasible

    Parameters
    ----------
    left : MultiIndex
        left index
    right : MultiIndex
        right index
    dropped_level_names : str array
        list of non-common level names
    join_index : Index
        the index of the join between the
        common levels of left and right
    lindexer : np.ndarray[np.intp]
        left indexer
    rindexer : np.ndarray[np.intp]
        right indexer

    Returns
    -------
    levels : list of Index
        levels of combined multiindexes
    labels : np.ndarray[np.intp]
        labels of combined multiindexes
    names : List[Hashable]
        names of combined multiindex levels
    """

    def _convert_to_multiindex(index: Index) -> MultiIndex:
        if isinstance(index, MultiIndex):
            return index
        else:
            return MultiIndex.from_arrays([index._values], names=[index.name])

    # For multi-multi joins with one overlapping level,
    # the returned index is of type Index
    # Assure that join_index is of type MultiIndex
    # so that dropped levels can be appended
    join_index = _convert_to_multiindex(join_index)

    join_levels = join_index.levels
    join_codes = join_index.codes
    join_names = join_index.names

    # Iterate through the levels that must be restored
    for dropped_level_name in dropped_level_names:
        if dropped_level_name in left.names:
            idx = left
            indexer = lindexer
        else:
            idx = right
            indexer = rindexer

        # The index of the level name to be restored
        name_idx = idx.names.index(dropped_level_name)
        restore_levels = idx.levels[name_idx]

        # Inject -1 in the codes list where a join was not possible
        # IOW indexer[i]=-1
        codes = idx.codes[name_idx]
        if indexer is None:
            restore_codes = codes
        else:
            restore_codes = algos.take_nd(codes, indexer, fill_value=-1)

        # error: Cannot determine type of "__add__"
        join_levels = join_levels + [restore_levels]  # type: ignore[has-type]
        join_codes = join_codes + [restore_codes]
        join_names = join_names + [dropped_level_name]

    return join_levels, join_codes, join_names


class _OrderedMerge(_MergeOperation):
    _merge_type = "ordered_merge"

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        left_index: bool = False,
        right_index: bool = False,
        axis: AxisInt = 1,
        suffixes: Suffixes = ("_x", "_y"),
        fill_method: str | None = None,
        how: JoinHow | Literal["asof"] = "outer",
    ) -> None:
        self.fill_method = fill_method
        _MergeOperation.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            left_index=left_index,
            right_index=right_index,
            right_on=right_on,
            axis=axis,
            how=how,
            suffixes=suffixes,
            sort=True,  # factorize sorts
        )

    def get_result(self, copy: bool | None = True) -> DataFrame:
        join_index, left_indexer, right_indexer = self._get_join_info()

        llabels, rlabels = _items_overlap_with_suffix(
            self.left._info_axis, self.right._info_axis, self.suffixes
        )

        left_join_indexer: np.ndarray | None
        right_join_indexer: np.ndarray | None

        if self.fill_method == "ffill":
            if left_indexer is None:
                raise TypeError("left_indexer cannot be None")
            left_indexer, right_indexer = cast(np.ndarray, left_indexer), cast(
                np.ndarray, right_indexer
            )
            left_join_indexer = libjoin.ffill_indexer(left_indexer)
            right_join_indexer = libjoin.ffill_indexer(right_indexer)
        else:
            left_join_indexer = left_indexer
            right_join_indexer = right_indexer

        result = self._reindex_and_concat(
            join_index, left_join_indexer, right_join_indexer, copy=copy
        )
        self._maybe_add_join_keys(result, left_indexer, right_indexer)
        return result
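
    # Usage sketch (illustrative): the ffill branch above backs
    # pd.merge_ordered(left, right, on="key", fill_method="ffill"),
    # which forward-fills the gaps that an outer ordered join introduces.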


def _asof_by_function(direction: str):
    name = f"asof_join_{direction}_on_X_by_Y"
    return getattr(libjoin, name, None)


_type_casters = {
    "int64_t": ensure_int64,
    "double": ensure_float64,
    "object": ensure_object,
}


def _get_cython_type_upcast(dtype: DtypeObj) -> str:
    """Upcast a dtype to 'int64_t', 'double', or 'object'"""
    if is_integer_dtype(dtype):
        return "int64_t"
    elif is_float_dtype(dtype):
        return "double"
    else:
        return "object"


class _AsOfMerge(_OrderedMerge):
    _merge_type = "asof_merge"

    def __init__(
        self,
        left: DataFrame | Series,
        right: DataFrame | Series,
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        left_index: bool = False,
        right_index: bool = False,
        by=None,
        left_by=None,
        right_by=None,
        axis: AxisInt = 1,
        suffixes: Suffixes = ("_x", "_y"),
        copy: bool = True,
        fill_method: str | None = None,
        how: Literal["asof"] = "asof",
        tolerance=None,
        allow_exact_matches: bool = True,
        direction: str = "backward",
    ) -> None:
        self.by = by
        self.left_by = left_by
        self.right_by = right_by
        self.tolerance = tolerance
        self.allow_exact_matches = allow_exact_matches
        self.direction = direction

        _OrderedMerge.__init__(
            self,
            left,
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            axis=axis,
            how=how,
            suffixes=suffixes,
            fill_method=fill_method,
        )

    def _validate_left_right_on(self, left_on, right_on):
        left_on, right_on = super()._validate_left_right_on(left_on, right_on)

        # we only allow "on" to be a single item
        if len(left_on) != 1 and not self.left_index:
            raise MergeError("can only asof on a key for left")

        if len(right_on) != 1 and not self.right_index:
            raise MergeError("can only asof on a key for right")

        if self.left_index and isinstance(self.left.index, MultiIndex):
            raise MergeError("left can only have one index")

        if self.right_index and isinstance(self.right.index, MultiIndex):
            raise MergeError("right can only have one index")

        # set 'by' columns
        if self.by is not None:
            if self.left_by is not None or self.right_by is not None:
                raise MergeError("Can only pass by OR left_by and right_by")
            self.left_by = self.right_by = self.by
        if self.left_by is None and self.right_by is not None:
            raise MergeError("missing left_by")
        if self.left_by is not None and self.right_by is None:
            raise MergeError("missing right_by")

        # GH#29130 Check that merge keys do not have dtype object
        if not self.left_index:
            left_on_0 = left_on[0]
            if is_array_like(left_on_0):
                lo_dtype = left_on_0.dtype
            else:
                lo_dtype = (
                    self.left._get_label_or_level_values(left_on_0).dtype
                    if left_on_0 in self.left.columns
                    else self.left.index.get_level_values(left_on_0)
                )
        else:
            lo_dtype = self.left.index.dtype

        if not self.right_index:
            right_on_0 = right_on[0]
            if is_array_like(right_on_0):
                ro_dtype = right_on_0.dtype
            else:
                ro_dtype = (
                    self.right._get_label_or_level_values(right_on_0).dtype
                    if right_on_0 in self.right.columns
                    else self.right.index.get_level_values(right_on_0)
                )
        else:
            ro_dtype = self.right.index.dtype

        if is_object_dtype(lo_dtype) or is_object_dtype(ro_dtype):
            raise MergeError(
                f"Incompatible merge dtype, {repr(ro_dtype)} and "
                f"{repr(lo_dtype)}, both sides must have numeric dtype"
            )

        # add 'by' to our key-list so we can have it in the
        # output as a key
        if self.left_by is not None:
            if not is_list_like(self.left_by):
                self.left_by = [self.left_by]
            if not is_list_like(self.right_by):
                self.right_by = [self.right_by]

            if len(self.left_by) != len(self.right_by):
                raise MergeError("left_by and right_by must be same length")

            left_on = self.left_by + list(left_on)
            right_on = self.right_by + list(right_on)

        # check 'direction' is valid
        if self.direction not in ["backward", "forward", "nearest"]:
            raise MergeError(f"direction invalid: {self.direction}")

        return left_on, right_on
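
    # Usage sketch (illustrative): with by= the group keys are prepended to
    # the asof key returned above, so
    # >>> pd.merge_asof(trades, quotes, on="time", by="ticker")
    # validates ["ticker", "time"] on both sides ("trades" and "quotes" are
    # hypothetical frames).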

    def _get_merge_keys(
        self,
    ) -> tuple[list[AnyArrayLike], list[AnyArrayLike], list[Hashable]]:
        # note this function has side effects
        (left_join_keys, right_join_keys, join_names) = super()._get_merge_keys()

        # validate index types are the same
        for i, (lk, rk) in enumerate(zip(left_join_keys, right_join_keys)):
            if not is_dtype_equal(lk.dtype, rk.dtype):
                if is_categorical_dtype(lk.dtype) and is_categorical_dtype(rk.dtype):
                    # The generic error message is confusing for categoricals.
                    #
                    # In this function, the join keys include both the original
                    # ones of the merge_asof() call, and also the keys passed
                    # to its by= argument. Unordered but equal categories
                    # are not supported for the former, but will fail
                    # later with a ValueError, so we don't *need* to check
                    # for them here.
                    msg = (
                        f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
                        f"{repr(rk.dtype)}, both sides category, but not equal ones"
                    )
                else:
                    msg = (
                        f"incompatible merge keys [{i}] {repr(lk.dtype)} and "
                        f"{repr(rk.dtype)}, must be the same type"
                    )
                raise MergeError(msg)

        # validate tolerance; datetime.timedelta or Timedelta if we have a DTI
        if self.tolerance is not None:
            if self.left_index:
                # Actually more specifically an Index
                lt = cast(AnyArrayLike, self.left.index)
            else:
                lt = left_join_keys[-1]

            msg = (
                f"incompatible tolerance {self.tolerance}, must be compat "
                f"with type {repr(lt.dtype)}"
            )

            if needs_i8_conversion(lt):
                if not isinstance(self.tolerance, datetime.timedelta):
                    raise MergeError(msg)
                if self.tolerance < Timedelta(0):
                    raise MergeError("tolerance must be positive")

            elif is_integer_dtype(lt):
                if not is_integer(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            elif is_float_dtype(lt):
                if not is_number(self.tolerance):
                    raise MergeError(msg)
                if self.tolerance < 0:
                    raise MergeError("tolerance must be positive")

            else:
                raise MergeError("key must be integer, timestamp or float")

        # validate allow_exact_matches
        if not is_bool(self.allow_exact_matches):
            msg = (
                "allow_exact_matches must be boolean, "
                f"passed {self.allow_exact_matches}"
            )
            raise MergeError(msg)

        return left_join_keys, right_join_keys, join_names
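
    # Illustrative tolerance sketch: the tolerance type must match the asof
    # key dtype checked above, e.g.
    # >>> pd.merge_asof(left, right, on="time", tolerance=pd.Timedelta("2ms"))
    # for a datetime key, or tolerance=2 for an integer key; a negative
    # tolerance raises "tolerance must be positive".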

    def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
        """return the join indexers"""

        def flip(xs) -> np.ndarray:
            """unlike np.transpose, this returns an array of tuples"""

            def injection(obj):
                if not is_extension_array_dtype(obj):
                    # ndarray
                    return obj
                obj = extract_array(obj)
                if isinstance(obj, NDArrayBackedExtensionArray):
                    # fastpath for e.g. dt64tz, categorical
                    return obj._ndarray
                # FIXME: returning obj._values_for_argsort() here doesn't
                # break in any existing test cases, but i (@jbrockmendel)
                # am pretty sure it should!
                # e.g.
                #  arr = pd.array([0, pd.NA, 255], dtype="UInt8")
                #  will have values_for_argsort (before GH#45434)
                #  np.array([0, 255, 255], dtype=np.uint8)
                #  and the non-injectivity should make a difference somehow
                #  shouldn't it?
                return np.asarray(obj)

            xs = [injection(x) for x in xs]
            labels = list(string.ascii_lowercase[: len(xs)])
            dtypes = [x.dtype for x in xs]
            labeled_dtypes = list(zip(labels, dtypes))
            return np.array(list(zip(*xs)), labeled_dtypes)

        # values to compare
        left_values = (
            self.left.index._values if self.left_index else self.left_join_keys[-1]
        )
        right_values = (
            self.right.index._values if self.right_index else self.right_join_keys[-1]
        )
        tolerance = self.tolerance

        # we require sortedness and non-null values in the join keys
        if not Index(left_values).is_monotonic_increasing:
            side = "left"
            if isna(left_values).any():
                raise ValueError(f"Merge keys contain null values on {side} side")
            raise ValueError(f"{side} keys must be sorted")

        if not Index(right_values).is_monotonic_increasing:
            side = "right"
            if isna(right_values).any():
                raise ValueError(f"Merge keys contain null values on {side} side")
            raise ValueError(f"{side} keys must be sorted")

        # initial type conversion as needed
        if needs_i8_conversion(left_values):
            if tolerance is not None:
                tolerance = Timedelta(tolerance)

                # TODO: we have no test cases with PeriodDtype here; probably
                #  need to adjust tolerance for that case.
                if left_values.dtype.kind in ["m", "M"]:
                    # Make sure the i8 representation for tolerance
                    # matches that for left_values/right_values.
                    lvs = ensure_wrapped_if_datetimelike(left_values)
                    tolerance = tolerance.as_unit(lvs.unit)

                tolerance = tolerance._value

            # TODO: require left_values.dtype == right_values.dtype, or at least
            #  comparable for e.g. dt64tz
            left_values = left_values.view("i8")
            right_values = right_values.view("i8")

        # a "by" parameter requires special handling
        if self.left_by is not None:
            # remove 'on' parameter from values if one existed
            if self.left_index and self.right_index:
                left_by_values = self.left_join_keys
                right_by_values = self.right_join_keys
            else:
                left_by_values = self.left_join_keys[0:-1]
                right_by_values = self.right_join_keys[0:-1]

            # get tuple representation of values if more than one
            if len(left_by_values) == 1:
                lbv = left_by_values[0]
                rbv = right_by_values[0]
            else:
                # We get here with non-ndarrays in test_merge_by_col_tz_aware
                #  and test_merge_groupby_multiple_column_with_categorical_column
                lbv = flip(left_by_values)
                rbv = flip(right_by_values)

            # upcast 'by' parameter because HashTable is limited
            by_type = _get_cython_type_upcast(lbv.dtype)
            by_type_caster = _type_casters[by_type]
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[generic]]", variable has type
            # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]")
            left_by_values = by_type_caster(lbv)  # type: ignore[assignment]
            # error: Incompatible types in assignment (expression has type
            # "ndarray[Any, dtype[generic]]", variable has type
            # "List[Union[Union[ExtensionArray, ndarray[Any, Any]], Index, Series]]")
            right_by_values = by_type_caster(rbv)  # type: ignore[assignment]

            # choose appropriate function by type
            func = _asof_by_function(self.direction)
            return func(
                left_values,
                right_values,
                left_by_values,
                right_by_values,
                self.allow_exact_matches,
                tolerance,
            )
        else:
            # choose appropriate function by type
            func = _asof_by_function(self.direction)
            # TODO(cython3):
            # Bug in beta1 preventing Cython from choosing
            #  right specialization when one fused memview is None
            # Doesn't matter what type we choose
            #  (nothing happens anyways since it is None)
            # GH 51640
            return func[f"{left_values.dtype}_t", object](
                left_values,
                right_values,
                None,
                None,
                self.allow_exact_matches,
                tolerance,
                False,
            )


def _get_multiindex_indexer(
    join_keys, index: MultiIndex, sort: bool
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    # left & right join labels and num. of levels at each location
    mapped = (
        _factorize_keys(index.levels[n], join_keys[n], sort=sort)
        for n in range(index.nlevels)
    )
    zipped = zip(*mapped)
    rcodes, lcodes, shape = (list(x) for x in zipped)
    if sort:
        rcodes = list(map(np.take, rcodes, index.codes))
    else:
        i8copy = lambda a: a.astype("i8", subok=False, copy=True)
        rcodes = list(map(i8copy, index.codes))

    # fix right labels if there were any nulls
    for i, join_key in enumerate(join_keys):
        mask = index.codes[i] == -1
        if mask.any():
            # check if there were already any nulls at this location
            # if there were, they are factorized to `shape[i] - 1`
            a = join_key[lcodes[i] == shape[i] - 1]
            if a.size == 0 or not a[0] != a[0]:
                shape[i] += 1

            rcodes[i][mask] = shape[i] - 1

    # get flat i8 join keys
    lkey, rkey = _get_join_keys(lcodes, rcodes, tuple(shape), sort)

    # factorize keys to a dense i8 space
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    return libjoin.left_outer_join(lkey, rkey, count, sort=sort)


def _get_single_indexer(
    join_key, index: Index, sort: bool = False
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    left_key, right_key, count = _factorize_keys(join_key, index._values, sort=sort)
    return libjoin.left_outer_join(left_key, right_key, count, sort=sort)


def _get_empty_indexer() -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """Return empty join indexers."""
    return (
        np.array([], dtype=np.intp),
        np.array([], dtype=np.intp),
    )


def _get_no_sort_one_missing_indexer(
    n: int, left_missing: bool
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    """
    Return join indexers where all of one side is selected without sorting
    and none of the other side is selected.

    Parameters
    ----------
    n : int
        Length of indexers to create.
    left_missing : bool
        If True, the left indexer will contain only -1's.
        If False, the right indexer will contain only -1's.

    Returns
    -------
    np.ndarray[np.intp]
        Left indexer
    np.ndarray[np.intp]
        Right indexer
    """
    idx = np.arange(n, dtype=np.intp)
    idx_missing = np.full(shape=n, fill_value=-1, dtype=np.intp)
    if left_missing:
        return idx_missing, idx
    return idx, idx_missing
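
# e.g. (illustrative): for a right join against an empty left frame,
# >>> _get_no_sort_one_missing_indexer(3, left_missing=True)
# (array([-1, -1, -1]), array([0, 1, 2]))
# selects every right row while marking all left positions as missing.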


def _left_join_on_index(
    left_ax: Index, right_ax: Index, join_keys, sort: bool = False
) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]:
    if len(join_keys) > 1:
        if not (
            isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels
        ):
            raise AssertionError(
                "If more than one join key is given then "
                "'right_ax' must be a MultiIndex and the "
                "number of join keys must be the number of levels in right_ax"
            )

        left_indexer, right_indexer = _get_multiindex_indexer(
            join_keys, right_ax, sort=sort
        )
    else:
        jkey = join_keys[0]

        left_indexer, right_indexer = _get_single_indexer(jkey, right_ax, sort=sort)

    if sort or len(left_ax) != len(left_indexer):
        # if asked to sort or there are 1-to-many matches
        join_index = left_ax.take(left_indexer)
        return join_index, left_indexer, right_indexer

    # left frame preserves order & length of its index
    return left_ax, None, right_indexer


def _factorize_keys(
    lk: ArrayLike,
    rk: ArrayLike,
    sort: bool = True,
    how: MergeHow | Literal["asof"] = "inner",
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp], int]:
    """
    Encode left and right keys as enumerated types.

    This is used to get the join indexers to be used when merging DataFrames.

    Parameters
    ----------
    lk : array-like
        Left key.
    rk : array-like
        Right key.
    sort : bool, defaults to True
        If True, the encoding is done such that the unique elements in the
        keys are sorted.
    how : {'left', 'right', 'outer', 'inner'}, default 'inner'
        Type of merge.

    Returns
    -------
    np.ndarray[np.intp]
        Left (resp. right if called with `how='right'`) labels, as enumerated type.
    np.ndarray[np.intp]
        Right (resp. left if called with `how='right'`) labels, as enumerated type.
    int
        Number of unique elements in union of left and right labels.

    See Also
    --------
    merge : Merge DataFrame or named Series objects
        with a database-style join.
    algorithms.factorize : Encode the object as an enumerated type
        or categorical variable.

    Examples
    --------
    >>> lk = np.array(["a", "c", "b"])
    >>> rk = np.array(["a", "c"])

    Here, the unique values are `'a', 'b', 'c'`. With the default
    `sort=True`, the encoding will be `{0: 'a', 1: 'b', 2: 'c'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk)
    (array([0, 2, 1]), array([0, 2]), 3)

    With `sort=False`, the encoding will correspond to the order
    in which the unique elements first appear: `{0: 'a', 1: 'c', 2: 'b'}`:

    >>> pd.core.reshape.merge._factorize_keys(lk, rk, sort=False)
    (array([0, 1, 2]), array([0, 1]), 3)
    """
    # Some pre-processing for non-ndarray lk / rk
    lk = extract_array(lk, extract_numpy=True, extract_range=True)
    rk = extract_array(rk, extract_numpy=True, extract_range=True)
    # TODO: if either is a RangeIndex, we can likely factorize more efficiently?

    if (
        isinstance(lk.dtype, DatetimeTZDtype) and isinstance(rk.dtype, DatetimeTZDtype)
    ) or (
        isinstance(lk.dtype, np.dtype)
        and lk.dtype.kind == "M"
        and isinstance(rk.dtype, np.dtype)
        and rk.dtype.kind == "M"
    ):
        # Extract the ndarray (UTC-localized) values
        # Note: we dont need the dtypes to match, as these can still be compared
        lk, rk = cast("DatetimeArray", lk)._ensure_matching_resos(rk)
        lk = cast("DatetimeArray", lk)._ndarray
        rk = cast("DatetimeArray", rk)._ndarray

    elif (
        is_categorical_dtype(lk.dtype)
        and is_categorical_dtype(rk.dtype)
        and is_dtype_equal(lk.dtype, rk.dtype)
    ):
        assert isinstance(lk, Categorical)
        assert isinstance(rk, Categorical)
        # Cast rk to encoding so we can compare codes with lk
        rk = lk._encode_with_my_categories(rk)

        lk = ensure_int64(lk.codes)
        rk = ensure_int64(rk.codes)

    elif isinstance(lk, ExtensionArray) and is_dtype_equal(lk.dtype, rk.dtype):
        if not isinstance(lk, BaseMaskedArray) and not (
            # exclude arrow dtypes that would get cast to object
            isinstance(lk.dtype, ArrowDtype)
            and is_numeric_dtype(lk.dtype.numpy_dtype)
        ):
            lk, _ = lk._values_for_factorize()

            # error: Item "ndarray" of "Union[Any, ndarray]" has no attribute
            # "_values_for_factorize"
            rk, _ = rk._values_for_factorize()  # type: ignore[union-attr]

    if needs_i8_conversion(lk.dtype) and lk.dtype == rk.dtype:
        # GH#23917 TODO: Needs tests for non-matching dtypes
        # GH#23917 TODO: needs tests for case where lk is integer-dtype
        #  and rk is datetime-dtype
        lk = np.asarray(lk, dtype=np.int64)
        rk = np.asarray(rk, dtype=np.int64)

    klass, lk, rk = _convert_arrays_and_get_rizer_klass(lk, rk)

    rizer = klass(max(len(lk), len(rk)))

    if isinstance(lk, BaseMaskedArray):
        assert isinstance(rk, BaseMaskedArray)
        llab = rizer.factorize(lk._data, mask=lk._mask)
        rlab = rizer.factorize(rk._data, mask=rk._mask)
    elif isinstance(lk, ArrowExtensionArray):
        assert isinstance(rk, ArrowExtensionArray)
        # we can only get here with numeric dtypes
        # TODO: Remove when we have a Factorizer for Arrow
        llab = rizer.factorize(
            lk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=lk.isna()
        )
        rlab = rizer.factorize(
            rk.to_numpy(na_value=1, dtype=lk.dtype.numpy_dtype), mask=rk.isna()
        )
    else:
        # Argument 1 to "factorize" of "ObjectFactorizer" has incompatible type
        # "Union[ndarray[Any, dtype[signedinteger[_64Bit]]],
        # ndarray[Any, dtype[object_]]]"; expected "ndarray[Any, dtype[object_]]"
        llab = rizer.factorize(lk)  # type: ignore[arg-type]
        rlab = rizer.factorize(rk)  # type: ignore[arg-type]
    assert llab.dtype == np.dtype(np.intp), llab.dtype
    assert rlab.dtype == np.dtype(np.intp), rlab.dtype

    count = rizer.get_count()

    if sort:
        uniques = rizer.uniques.to_array()
        llab, rlab = _sort_labels(uniques, llab, rlab)

    # NA group
    lmask = llab == -1
    lany = lmask.any()
    rmask = rlab == -1
    rany = rmask.any()

    if lany or rany:
        if lany:
            np.putmask(llab, lmask, count)
        if rany:
            np.putmask(rlab, rmask, count)
        count += 1

    if how == "right":
        return rlab, llab, count
    return llab, rlab, count
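
# Illustrative sketch of the how="right" swap above: reusing the docstring
# arrays, the labels come back right-side first (output for exposition only):
# >>> _factorize_keys(lk, rk, how="right")
# (array([0, 2]), array([0, 2, 1]), 3)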


def _convert_arrays_and_get_rizer_klass(
    lk: ArrayLike, rk: ArrayLike
) -> tuple[type[libhashtable.Factorizer], ArrayLike, ArrayLike]:
    klass: type[libhashtable.Factorizer]
    if is_numeric_dtype(lk.dtype):
        if not is_dtype_equal(lk, rk):
            dtype = find_common_type([lk.dtype, rk.dtype])
            if isinstance(dtype, ExtensionDtype):
                cls = dtype.construct_array_type()
                if not isinstance(lk, ExtensionArray):
                    lk = cls._from_sequence(lk, dtype=dtype, copy=False)
                else:
                    lk = lk.astype(dtype)

                if not isinstance(rk, ExtensionArray):
                    rk = cls._from_sequence(rk, dtype=dtype, copy=False)
                else:
                    rk = rk.astype(dtype)
            else:
                lk = lk.astype(dtype)
                rk = rk.astype(dtype)
        if isinstance(lk, BaseMaskedArray):
            # Invalid index type "type" for "Dict[Type[object], Type[Factorizer]]";
            # expected type "Type[object]"
            klass = _factorizers[lk.dtype.type]  # type: ignore[index]
        elif isinstance(lk.dtype, ArrowDtype):
            klass = _factorizers[lk.dtype.numpy_dtype.type]
        else:
            klass = _factorizers[lk.dtype.type]

    else:
        klass = libhashtable.ObjectFactorizer
        lk = ensure_object(lk)
        rk = ensure_object(rk)
    return klass, lk, rk


def _sort_labels(
    uniques: np.ndarray, left: npt.NDArray[np.intp], right: npt.NDArray[np.intp]
) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]:
    llength = len(left)
    labels = np.concatenate([left, right])

    _, new_labels = algos.safe_sort(uniques, labels, use_na_sentinel=True)
    new_left, new_right = new_labels[:llength], new_labels[llength:]

    return new_left, new_right


def _get_join_keys(
    llab: list[npt.NDArray[np.int64 | np.intp]],
    rlab: list[npt.NDArray[np.int64 | np.intp]],
    shape: Shape,
    sort: bool,
) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64]]:
    # how many levels can be done without overflow
    nlev = next(
        lev
        for lev in range(len(shape), 0, -1)
        if not is_int64_overflow_possible(shape[:lev])
    )

    # get keys for the first `nlev` levels
    stride = np.prod(shape[1:nlev], dtype="i8")
    lkey = stride * llab[0].astype("i8", subok=False, copy=False)
    rkey = stride * rlab[0].astype("i8", subok=False, copy=False)

    for i in range(1, nlev):
        with np.errstate(divide="ignore"):
            stride //= shape[i]
        lkey += llab[i] * stride
        rkey += rlab[i] * stride

    if nlev == len(shape):  # all done!
        return lkey, rkey

    # densify current keys to avoid overflow
    lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)

    llab = [lkey] + llab[nlev:]
    rlab = [rkey] + rlab[nlev:]
    shape = (count,) + shape[nlev:]

    return _get_join_keys(llab, rlab, shape, sort)
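
# Worked example of the flat-key encoding above (illustrative): with
# shape=(3, 2) the stride for the first level is 2, so a label pair (i, j)
# maps to i * 2 + j, e.g. (1, 0) becomes 2 and (2, 1) becomes 5, giving
# every level combination a distinct int64 key.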


def _should_fill(lname, rname) -> bool:
    if not isinstance(lname, str) or not isinstance(rname, str):
        return True
    return lname == rname


def _any(x) -> bool:
    return x is not None and com.any_not_none(*x)


def _validate_operand(obj: DataFrame | Series) -> DataFrame:
    if isinstance(obj, ABCDataFrame):
        return obj
    elif isinstance(obj, ABCSeries):
        if obj.name is None:
            raise ValueError("Cannot merge a Series without a name")
        return obj.to_frame()
    else:
        raise TypeError(
            f"Can only merge Series or DataFrame objects, a {type(obj)} was passed"
        )


def _items_overlap_with_suffix(
    left: Index, right: Index, suffixes: Suffixes
) -> tuple[Index, Index]:
    """
    Suffixes type validation.

    If two indices overlap, add suffixes to overlapping entries.

    If corresponding suffix is empty, the entry is simply converted to string.
    """
    if not is_list_like(suffixes, allow_sets=False) or isinstance(suffixes, dict):
        raise TypeError(
            f"Passing 'suffixes' as a {type(suffixes)}, is not supported. "
            "Provide 'suffixes' as a tuple instead."
        )

    to_rename = left.intersection(right)
    if len(to_rename) == 0:
        return left, right

    lsuffix, rsuffix = suffixes

    if not lsuffix and not rsuffix:
        raise ValueError(f"columns overlap but no suffix specified: {to_rename}")

    def renamer(x, suffix):
        """
        Rename the left and right indices.

        If there is overlap, and suffix is not None, add
        suffix, otherwise, leave it as-is.

        Parameters
        ----------
        x : original column name
        suffix : str or None

        Returns
        -------
        x : renamed column name
        """
        if x in to_rename and suffix is not None:
            return f"{x}{suffix}"
        return x

    lrenamer = partial(renamer, suffix=lsuffix)
    rrenamer = partial(renamer, suffix=rsuffix)

    llabels = left._transform_index(lrenamer)
    rlabels = right._transform_index(rrenamer)

    dups = []
    if not llabels.is_unique:
        # Only raise when the duplicates are caused by the suffixes; columns
        # that were already duplicated in the input should not raise
        dups = llabels[(llabels.duplicated()) & (~left.duplicated())].tolist()
    if not rlabels.is_unique:
        dups.extend(rlabels[(rlabels.duplicated()) & (~right.duplicated())].tolist())
    if dups:
        raise MergeError(
            f"Passing 'suffixes' which cause duplicate columns {set(dups)} is "
            f"not allowed.",
        )

    return llabels, rlabels
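
# Usage sketch (illustrative): frames sharing a non-key column "a" merged
# with suffixes=("_x", "_y") come back with "a_x"/"a_y"; suppressing both
# suffixes on an overlap raises instead:
# >>> pd.merge(left, right, on="key", suffixes=("", ""))  # ValueError if
# ...                                                     # "a" overlaps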