# multi.py
from __future__ import annotations

from functools import wraps
from sys import getsizeof
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Collection,
    Generator,
    Hashable,
    Iterable,
    List,
    Literal,
    Sequence,
    Tuple,
    cast,
)
import warnings

import numpy as np

from pandas._config import get_option

from pandas._libs import (
    algos as libalgos,
    index as libindex,
    lib,
)
from pandas._libs.hashtable import duplicated
from pandas._typing import (
    AnyAll,
    AnyArrayLike,
    Axis,
    DropKeep,
    DtypeObj,
    F,
    IgnoreRaise,
    IndexLabel,
    Scalar,
    Shape,
    npt,
)
from pandas.compat.numpy import function as nv
from pandas.errors import (
    InvalidIndexError,
    PerformanceWarning,
    UnsortedIndexError,
)
from pandas.util._decorators import (
    Appender,
    cache_readonly,
    doc,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.cast import coerce_indexer_dtype
from pandas.core.dtypes.common import (
    ensure_int64,
    ensure_platform_int,
    is_categorical_dtype,
    is_extension_array_dtype,
    is_hashable,
    is_integer,
    is_iterator,
    is_list_like,
    is_object_dtype,
    is_scalar,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.generic import (
    ABCDataFrame,
    ABCDatetimeIndex,
    ABCTimedeltaIndex,
)
from pandas.core.dtypes.missing import (
    array_equivalent,
    isna,
)

import pandas.core.algorithms as algos
from pandas.core.array_algos.putmask import validate_putmask
from pandas.core.arrays import Categorical
from pandas.core.arrays.categorical import factorize_from_iterables
import pandas.core.common as com
import pandas.core.indexes.base as ibase
from pandas.core.indexes.base import (
    Index,
    _index_shared_docs,
    ensure_index,
    get_unanimous_names,
)
from pandas.core.indexes.frozen import FrozenList
from pandas.core.ops.invalid import make_invalid_op
from pandas.core.sorting import (
    get_group_index,
    indexer_from_factorized,
    lexsort_indexer,
)
from pandas.io.formats.printing import pprint_thing

if TYPE_CHECKING:
    from pandas import (
        CategoricalIndex,
        DataFrame,
        Series,
    )

_index_doc_kwargs = dict(ibase._index_doc_kwargs)
_index_doc_kwargs.update(
    {"klass": "MultiIndex", "target_klass": "MultiIndex or list of tuples"}
)


class MultiIndexUIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.UInt64Engine):
    """
    This class manages a MultiIndex by mapping label combinations to positive
    integers.
    """

    _base = libindex.UInt64Engine

    def _codes_to_ints(self, codes):
        """
        Transform combination(s) of uint64 into one uint64 (each), in a strictly
        monotonic way (i.e. respecting the lexicographic order of integer
        combinations): see BaseMultiIndexCodesEngine documentation.

        Parameters
        ----------
        codes : 1- or 2-dimensional array of dtype uint64
            Combinations of integers (one per row)

        Returns
        -------
        scalar or 1-dimensional array, of dtype uint64
            Integer(s) representing one combination (each).
        """
        # Shift the representation of each level by the pre-calculated number
        # of bits:
        codes <<= self.offsets

        # Now sum and OR are in fact interchangeable. This is a simple
        # composition of the (disjunct) significant bits of each level (i.e.
        # each column in "codes") in a single positive integer:
        if codes.ndim == 1:
            # Single key
            return np.bitwise_or.reduce(codes)

        # Multiple keys
        return np.bitwise_or.reduce(codes, axis=1)
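
    # Illustrative sketch, not from the original module: assuming a hypothetical
    # two-level engine whose pre-computed ``offsets`` are ``[1, 0]``, the single
    # key ``np.array([2, 1], dtype="uint64")`` is packed by ``_codes_to_ints``
    # as ``(2 << 1) | (1 << 0) == 5``. Each level occupies its own disjoint bit
    # range, so comparing the packed integers is equivalent to comparing the
    # code combinations lexicographically.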


class MultiIndexPyIntEngine(libindex.BaseMultiIndexCodesEngine, libindex.ObjectEngine):
    """
    This class manages those (extreme) cases in which the number of possible
    label combinations overflows the 64 bits integers, and uses an ObjectEngine
    containing Python integers.
    """

    _base = libindex.ObjectEngine

    def _codes_to_ints(self, codes):
        """
        Transform combination(s) of uint64 into one Python integer (each), in a
        strictly monotonic way (i.e. respecting the lexicographic order of
        integer combinations): see BaseMultiIndexCodesEngine documentation.

        Parameters
        ----------
        codes : 1- or 2-dimensional array of dtype uint64
            Combinations of integers (one per row)

        Returns
        -------
        int, or 1-dimensional array of dtype object
            Integer(s) representing one combination (each).
        """
        # Shift the representation of each level by the pre-calculated number
        # of bits. Since this can overflow uint64, first make sure we are
        # working with Python integers:
        codes = codes.astype("object") << self.offsets

        # Now sum and OR are in fact interchangeable. This is a simple
        # composition of the (disjunct) significant bits of each level (i.e.
        # each column in "codes") in a single positive integer (per row):
        if codes.ndim == 1:
            # Single key
            return np.bitwise_or.reduce(codes)

        # Multiple keys
        return np.bitwise_or.reduce(codes, axis=1)


def names_compat(meth: F) -> F:
    """
    A decorator to allow either `name` or `names` keyword but not both.

    This makes it easier to share code with base class.
    """

    @wraps(meth)
    def new_meth(self_or_cls, *args, **kwargs):
        if "name" in kwargs and "names" in kwargs:
            raise TypeError("Can only provide one of `names` and `name`")
        if "name" in kwargs:
            kwargs["names"] = kwargs.pop("name")

        return meth(self_or_cls, *args, **kwargs)

    return cast(F, new_meth)
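
# Illustrative sketch, not from the original module: ``names_compat`` lets the
# decorated constructors accept ``name=`` as an alias for ``names=``. For
# example, ``pd.MultiIndex.from_tuples(tuples, name=["a", "b"])`` behaves like
# ``pd.MultiIndex.from_tuples(tuples, names=["a", "b"])``, while passing both
# keywords raises ``TypeError("Can only provide one of `names` and `name`")``.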


class MultiIndex(Index):
    """
    A multi-level, or hierarchical, index object for pandas objects.

    Parameters
    ----------
    levels : sequence of arrays
        The unique labels for each level.
    codes : sequence of arrays
        Integers for each level designating which label at each location.
    sortorder : optional int
        Level of sortedness (must be lexicographically sorted by that
        level).
    names : optional sequence of objects
        Names for each of the index levels. (name is accepted for compat).
    copy : bool, default False
        Copy the meta-data.
    verify_integrity : bool, default True
        Check that the levels/codes are consistent and valid.

    Attributes
    ----------
    names
    levels
    codes
    nlevels
    levshape
    dtypes

    Methods
    -------
    from_arrays
    from_tuples
    from_product
    from_frame
    set_levels
    set_codes
    to_frame
    to_flat_index
    sortlevel
    droplevel
    swaplevel
    reorder_levels
    remove_unused_levels
    get_level_values
    get_indexer
    get_loc
    get_locs
    get_loc_level
    drop

    See Also
    --------
    MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
    MultiIndex.from_product : Create a MultiIndex from the cartesian product
        of iterables.
    MultiIndex.from_tuples : Convert list of tuples to a MultiIndex.
    MultiIndex.from_frame : Make a MultiIndex from a DataFrame.
    Index : The base pandas Index type.

    Notes
    -----
    See the `user guide
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html>`__
    for more.

    Examples
    --------
    A new ``MultiIndex`` is typically constructed using one of the helper
    methods :meth:`MultiIndex.from_arrays`, :meth:`MultiIndex.from_product`
    and :meth:`MultiIndex.from_tuples`. For example (using ``.from_arrays``):

    >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
    >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
    MultiIndex([(1, 'red'),
                (1, 'blue'),
                (2, 'red'),
                (2, 'blue')],
               names=['number', 'color'])

    See further examples for how to construct a MultiIndex in the doc strings
    of the mentioned helper methods.
    """

    _hidden_attrs = Index._hidden_attrs | frozenset()

    # initialize to zero-length tuples to make everything work
    _typ = "multiindex"
    _names: list[Hashable | None] = []
    _levels = FrozenList()
    _codes = FrozenList()
    _comparables = ["names"]

    sortorder: int | None

    # --------------------------------------------------------------------
    # Constructors

    def __new__(
        cls,
        levels=None,
        codes=None,
        sortorder=None,
        names=None,
        dtype=None,
        copy: bool = False,
        name=None,
        verify_integrity: bool = True,
    ) -> MultiIndex:
        # compat with Index
        if name is not None:
            names = name

        if levels is None or codes is None:
            raise TypeError("Must pass both levels and codes")
        if len(levels) != len(codes):
            raise ValueError("Length of levels and codes must be the same.")
        if len(levels) == 0:
            raise ValueError("Must pass non-zero number of levels/codes")

        result = object.__new__(cls)
        result._cache = {}

        # we've already validated levels and codes, so shortcut here
        result._set_levels(levels, copy=copy, validate=False)
        result._set_codes(codes, copy=copy, validate=False)

        result._names = [None] * len(levels)
        if names is not None:
            # handles name validation
            result._set_names(names)

        if sortorder is not None:
            result.sortorder = int(sortorder)
        else:
            result.sortorder = sortorder

        if verify_integrity:
            new_codes = result._verify_integrity()
            result._codes = new_codes

        result._reset_identity()
        result._references = None

        return result

    def _validate_codes(self, level: list, code: list):
        """
        Reassign code values as -1 if their corresponding levels are NaN.

        Parameters
        ----------
        code : list
            Code to reassign.
        level : list
            Level to check for missing values (NaN, NaT, None).

        Returns
        -------
        new code where code value = -1 if it corresponds
        to a level with missing values (NaN, NaT, None).
        """
        null_mask = isna(level)
        if np.any(null_mask):
            # error: Incompatible types in assignment
            # (expression has type "ndarray[Any, dtype[Any]]",
            # variable has type "List[Any]")
            code = np.where(null_mask[code], -1, code)  # type: ignore[assignment]
        return code

    def _verify_integrity(self, codes: list | None = None, levels: list | None = None):
        """
        Parameters
        ----------
        codes : optional list
            Codes to check for validity. Defaults to current codes.
        levels : optional list
            Levels to check for validity. Defaults to current levels.

        Raises
        ------
        ValueError
            If length of levels and codes don't match, if the codes for any
            level would exceed level bounds, or there are any duplicate levels.

        Returns
        -------
        new codes where code value = -1 if it corresponds to a
        NaN level.
        """
        # NOTE: Currently does not check, among other things, that cached
        # nlevels matches nor that sortorder matches the actual sortorder.
        codes = codes or self.codes
        levels = levels or self.levels

        if len(levels) != len(codes):
            raise ValueError(
                "Length of levels and codes must match. NOTE: "
                "this index is in an inconsistent state."
            )
        codes_length = len(codes[0])
        for i, (level, level_codes) in enumerate(zip(levels, codes)):
            if len(level_codes) != codes_length:
                raise ValueError(
                    f"Unequal code lengths: {[len(code_) for code_ in codes]}"
                )
            if len(level_codes) and level_codes.max() >= len(level):
                raise ValueError(
                    f"On level {i}, code max ({level_codes.max()}) >= length of "
                    f"level ({len(level)}). NOTE: this index is in an "
                    "inconsistent state"
                )
            if len(level_codes) and level_codes.min() < -1:
                raise ValueError(f"On level {i}, code value ({level_codes.min()}) < -1")
            if not level.is_unique:
                raise ValueError(
                    f"Level values must be unique: {list(level)} on level {i}"
                )
        if self.sortorder is not None:
            if self.sortorder > _lexsort_depth(self.codes, self.nlevels):
                raise ValueError(
                    "Value for sortorder must be inferior or equal to actual "
                    f"lexsort_depth: sortorder {self.sortorder} "
                    f"with lexsort_depth {_lexsort_depth(self.codes, self.nlevels)}"
                )

        codes = [
            self._validate_codes(level, code) for level, code in zip(levels, codes)
        ]
        new_codes = FrozenList(codes)
        return new_codes

    @classmethod
    def from_arrays(
        cls,
        arrays,
        sortorder=None,
        names: Sequence[Hashable] | Hashable | lib.NoDefault = lib.no_default,
    ) -> MultiIndex:
        """
        Convert arrays to MultiIndex.

        Parameters
        ----------
        arrays : list / sequence of array-likes
            Each array-like gives one level's value for each data point.
            len(arrays) is the number of levels.
        sortorder : int or None
            Level of sortedness (must be lexicographically sorted by that
            level).
        names : list / sequence of str, optional
            Names for the levels in the index.

        Returns
        -------
        MultiIndex

        See Also
        --------
        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
        MultiIndex.from_product : Make a MultiIndex from cartesian product
            of iterables.
        MultiIndex.from_frame : Make a MultiIndex from a DataFrame.

        Examples
        --------
        >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']]
        >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color'))
        MultiIndex([(1, 'red'),
                    (1, 'blue'),
                    (2, 'red'),
                    (2, 'blue')],
                   names=['number', 'color'])
        """
        error_msg = "Input must be a list / sequence of array-likes."
        if not is_list_like(arrays):
            raise TypeError(error_msg)
        if is_iterator(arrays):
            arrays = list(arrays)

        # Check if elements of array are list-like
        for array in arrays:
            if not is_list_like(array):
                raise TypeError(error_msg)

        # Check if lengths of all arrays are equal or not,
        # raise ValueError, if not
        for i in range(1, len(arrays)):
            if len(arrays[i]) != len(arrays[i - 1]):
                raise ValueError("all arrays must be same length")

        codes, levels = factorize_from_iterables(arrays)
        if names is lib.no_default:
            names = [getattr(arr, "name", None) for arr in arrays]

        return cls(
            levels=levels,
            codes=codes,
            sortorder=sortorder,
            names=names,
            verify_integrity=False,
        )

    @classmethod
    @names_compat
    def from_tuples(
        cls,
        tuples: Iterable[tuple[Hashable, ...]],
        sortorder: int | None = None,
        names: Sequence[Hashable] | Hashable = None,
    ) -> MultiIndex:
        """
        Convert list of tuples to MultiIndex.

        Parameters
        ----------
        tuples : list / sequence of tuple-likes
            Each tuple is the index of one row/column.
        sortorder : int or None
            Level of sortedness (must be lexicographically sorted by that
            level).
        names : list / sequence of str, optional
            Names for the levels in the index.

        Returns
        -------
        MultiIndex

        See Also
        --------
        MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
        MultiIndex.from_product : Make a MultiIndex from cartesian product
            of iterables.
        MultiIndex.from_frame : Make a MultiIndex from a DataFrame.

        Examples
        --------
        >>> tuples = [(1, 'red'), (1, 'blue'),
        ...           (2, 'red'), (2, 'blue')]
        >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color'))
        MultiIndex([(1, 'red'),
                    (1, 'blue'),
                    (2, 'red'),
                    (2, 'blue')],
                   names=['number', 'color'])
        """
        if not is_list_like(tuples):
            raise TypeError("Input must be a list / sequence of tuple-likes.")
        if is_iterator(tuples):
            tuples = list(tuples)
        tuples = cast(Collection[Tuple[Hashable, ...]], tuples)

        # handling the empty tuple cases
        if len(tuples) and all(isinstance(e, tuple) and not e for e in tuples):
            codes = [np.zeros(len(tuples))]
            levels = [Index(com.asarray_tuplesafe(tuples, dtype=np.dtype("object")))]
            return cls(
                levels=levels,
                codes=codes,
                sortorder=sortorder,
                names=names,
                verify_integrity=False,
            )

        arrays: list[Sequence[Hashable]]
        if len(tuples) == 0:
            if names is None:
                raise TypeError("Cannot infer number of levels from empty list")
            # error: Argument 1 to "len" has incompatible type "Hashable";
            # expected "Sized"
            arrays = [[]] * len(names)  # type: ignore[arg-type]
        elif isinstance(tuples, (np.ndarray, Index)):
            if isinstance(tuples, Index):
                tuples = np.asarray(tuples._values)

            arrays = list(lib.tuples_to_object_array(tuples).T)
        elif isinstance(tuples, list):
            arrays = list(lib.to_object_array_tuples(tuples).T)
        else:
            arrs = zip(*tuples)
            arrays = cast(List[Sequence[Hashable]], arrs)

        return cls.from_arrays(arrays, sortorder=sortorder, names=names)

    @classmethod
    def from_product(
        cls,
        iterables: Sequence[Iterable[Hashable]],
        sortorder: int | None = None,
        names: Sequence[Hashable] | Hashable | lib.NoDefault = lib.no_default,
    ) -> MultiIndex:
        """
        Make a MultiIndex from the cartesian product of multiple iterables.

        Parameters
        ----------
        iterables : list / sequence of iterables
            Each iterable has unique labels for each level of the index.
        sortorder : int or None
            Level of sortedness (must be lexicographically sorted by that
            level).
        names : list / sequence of str, optional
            Names for the levels in the index.
            If not explicitly provided, names will be inferred from the
            elements of iterables if an element has a name attribute.

        Returns
        -------
        MultiIndex

        See Also
        --------
        MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
        MultiIndex.from_frame : Make a MultiIndex from a DataFrame.

        Examples
        --------
        >>> numbers = [0, 1, 2]
        >>> colors = ['green', 'purple']
        >>> pd.MultiIndex.from_product([numbers, colors],
        ...                            names=['number', 'color'])
        MultiIndex([(0, 'green'),
                    (0, 'purple'),
                    (1, 'green'),
                    (1, 'purple'),
                    (2, 'green'),
                    (2, 'purple')],
                   names=['number', 'color'])
        """
        from pandas.core.reshape.util import cartesian_product

        if not is_list_like(iterables):
            raise TypeError("Input must be a list / sequence of iterables.")
        if is_iterator(iterables):
            iterables = list(iterables)

        codes, levels = factorize_from_iterables(iterables)
        if names is lib.no_default:
            names = [getattr(it, "name", None) for it in iterables]

        # codes are all ndarrays, so cartesian_product is lossless
        codes = cartesian_product(codes)
        return cls(levels, codes, sortorder=sortorder, names=names)

    @classmethod
    def from_frame(cls, df: DataFrame, sortorder=None, names=None) -> MultiIndex:
        """
        Make a MultiIndex from a DataFrame.

        Parameters
        ----------
        df : DataFrame
            DataFrame to be converted to MultiIndex.
        sortorder : int, optional
            Level of sortedness (must be lexicographically sorted by that
            level).
        names : list-like, optional
            If no names are provided, use the column names, or tuple of column
            names if the columns is a MultiIndex. If a sequence, overwrite
            names with the given sequence.

        Returns
        -------
        MultiIndex
            The MultiIndex representation of the given DataFrame.

        See Also
        --------
        MultiIndex.from_arrays : Convert list of arrays to MultiIndex.
        MultiIndex.from_tuples : Convert list of tuples to MultiIndex.
        MultiIndex.from_product : Make a MultiIndex from cartesian product
            of iterables.

        Examples
        --------
        >>> df = pd.DataFrame([['HI', 'Temp'], ['HI', 'Precip'],
        ...                    ['NJ', 'Temp'], ['NJ', 'Precip']],
        ...                   columns=['a', 'b'])
        >>> df
              a       b
        0    HI    Temp
        1    HI  Precip
        2    NJ    Temp
        3    NJ  Precip

        >>> pd.MultiIndex.from_frame(df)
        MultiIndex([('HI', 'Temp'),
                    ('HI', 'Precip'),
                    ('NJ', 'Temp'),
                    ('NJ', 'Precip')],
                   names=['a', 'b'])

        Using explicit names, instead of the column names

        >>> pd.MultiIndex.from_frame(df, names=['state', 'observation'])
        MultiIndex([('HI', 'Temp'),
                    ('HI', 'Precip'),
                    ('NJ', 'Temp'),
                    ('NJ', 'Precip')],
                   names=['state', 'observation'])
        """
        if not isinstance(df, ABCDataFrame):
            raise TypeError("Input must be a DataFrame")

        column_names, columns = zip(*df.items())
        names = column_names if names is None else names
        return cls.from_arrays(columns, sortorder=sortorder, names=names)

    # --------------------------------------------------------------------

    @cache_readonly
    def _values(self) -> np.ndarray:
        # We override here, since our parent uses _data, which we don't use.
        values = []

        for i in range(self.nlevels):
            index = self.levels[i]
            codes = self.codes[i]

            vals = index
            if is_categorical_dtype(vals.dtype):
                vals = cast("CategoricalIndex", vals)
                vals = vals._data._internal_get_values()
            if isinstance(vals.dtype, ExtensionDtype) or isinstance(
                vals, (ABCDatetimeIndex, ABCTimedeltaIndex)
            ):
                vals = vals.astype(object)

            vals = np.array(vals, copy=False)
            vals = algos.take_nd(vals, codes, fill_value=index._na_value)
            values.append(vals)

        arr = lib.fast_zip(values)
        return arr

    @property
    def values(self) -> np.ndarray:
        return self._values

    @property
    def array(self):
        """
        Raises a ValueError for `MultiIndex` because there's no single
        array backing a MultiIndex.

        Raises
        ------
        ValueError
        """
        raise ValueError(
            "MultiIndex has no single backing array. Use "
            "'MultiIndex.to_numpy()' to get a NumPy array of tuples."
        )

    @cache_readonly
    def dtypes(self) -> Series:
        """
        Return the dtypes as a Series for the underlying MultiIndex.
        """
        from pandas import Series

        names = com.fill_missing_names([level.name for level in self.levels])
        return Series([level.dtype for level in self.levels], index=Index(names))

    def __len__(self) -> int:
        return len(self.codes[0])

    @property
    def size(self) -> int:
        """
        Return the number of elements in the underlying data.
        """
        # override Index.size to avoid materializing _values
        return len(self)

    # --------------------------------------------------------------------
    # Levels Methods

    @cache_readonly
    def levels(self) -> FrozenList:
        # Use cache_readonly to ensure that self.get_locs doesn't repeatedly
        # create new IndexEngine
        # https://github.com/pandas-dev/pandas/issues/31648
        result = [x._rename(name=name) for x, name in zip(self._levels, self._names)]
        for level in result:
            # disallow midx.levels[0].name = "foo"
            level._no_setting_name = True
        return FrozenList(result)

    def _set_levels(
        self,
        levels,
        *,
        level=None,
        copy: bool = False,
        validate: bool = True,
        verify_integrity: bool = False,
    ) -> None:
        # This is NOT part of the levels property because it should be
        # externally not allowed to set levels. User beware if you change
        # _levels directly
        if validate:
            if len(levels) == 0:
                raise ValueError("Must set non-zero number of levels.")
            if level is None and len(levels) != self.nlevels:
                raise ValueError("Length of levels must match number of levels.")
            if level is not None and len(levels) != len(level):
                raise ValueError("Length of levels must match length of level.")

        if level is None:
            new_levels = FrozenList(
                ensure_index(lev, copy=copy)._view() for lev in levels
            )
        else:
            level_numbers = [self._get_level_number(lev) for lev in level]
            new_levels_list = list(self._levels)
            for lev_num, lev in zip(level_numbers, levels):
                new_levels_list[lev_num] = ensure_index(lev, copy=copy)._view()
            new_levels = FrozenList(new_levels_list)

        if verify_integrity:
            new_codes = self._verify_integrity(levels=new_levels)
            self._codes = new_codes

        names = self.names
        self._levels = new_levels
        if any(names):
            self._set_names(names)

        self._reset_cache()

    def set_levels(
        self, levels, *, level=None, verify_integrity: bool = True
    ) -> MultiIndex:
        """
        Set new levels on MultiIndex. Defaults to returning new index.

        Parameters
        ----------
        levels : sequence or list of sequence
            New level(s) to apply.
        level : int, level name, or sequence of int/level names (default None)
            Level(s) to set (None for all levels).
        verify_integrity : bool, default True
            If True, checks that levels and codes are compatible.

        Returns
        -------
        MultiIndex

        Examples
        --------
        >>> idx = pd.MultiIndex.from_tuples(
        ...     [
        ...         (1, "one"),
        ...         (1, "two"),
        ...         (2, "one"),
        ...         (2, "two"),
        ...         (3, "one"),
        ...         (3, "two")
        ...     ],
        ...     names=["foo", "bar"]
        ... )
        >>> idx
        MultiIndex([(1, 'one'),
                    (1, 'two'),
                    (2, 'one'),
                    (2, 'two'),
                    (3, 'one'),
                    (3, 'two')],
                   names=['foo', 'bar'])

        >>> idx.set_levels([['a', 'b', 'c'], [1, 2]])
        MultiIndex([('a', 1),
                    ('a', 2),
                    ('b', 1),
                    ('b', 2),
                    ('c', 1),
                    ('c', 2)],
                   names=['foo', 'bar'])
        >>> idx.set_levels(['a', 'b', 'c'], level=0)
        MultiIndex([('a', 'one'),
                    ('a', 'two'),
                    ('b', 'one'),
                    ('b', 'two'),
                    ('c', 'one'),
                    ('c', 'two')],
                   names=['foo', 'bar'])
        >>> idx.set_levels(['a', 'b'], level='bar')
        MultiIndex([(1, 'a'),
                    (1, 'b'),
                    (2, 'a'),
                    (2, 'b'),
                    (3, 'a'),
                    (3, 'b')],
                   names=['foo', 'bar'])

        If any of the levels passed to ``set_levels()`` exceeds the
        existing length, all of the values from that argument will
        be stored in the MultiIndex levels, though the values will
        be truncated in the MultiIndex output.

        >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1])
        MultiIndex([('a', 1),
                    ('a', 2),
                    ('b', 1),
                    ('b', 2),
                    ('c', 1),
                    ('c', 2)],
                   names=['foo', 'bar'])
        >>> idx.set_levels([['a', 'b', 'c'], [1, 2, 3, 4]], level=[0, 1]).levels
        FrozenList([['a', 'b', 'c'], [1, 2, 3, 4]])
        """
        if is_list_like(levels) and not isinstance(levels, Index):
            levels = list(levels)

        level, levels = _require_listlike(level, levels, "Levels")
        idx = self._view()
        idx._reset_identity()
        idx._set_levels(
            levels, level=level, validate=True, verify_integrity=verify_integrity
        )
        return idx

    @property
    def nlevels(self) -> int:
        """
        Integer number of levels in this MultiIndex.

        Examples
        --------
        >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']])
        >>> mi
        MultiIndex([('a', 'b', 'c')],
                   )
        >>> mi.nlevels
        3
        """
        return len(self._levels)

    @property
    def levshape(self) -> Shape:
        """
        A tuple with the length of each level.

        Examples
        --------
        >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']])
        >>> mi
        MultiIndex([('a', 'b', 'c')],
                   )
        >>> mi.levshape
        (1, 1, 1)
        """
        return tuple(len(x) for x in self.levels)

    # --------------------------------------------------------------------
    # Codes Methods

    @property
    def codes(self):
        return self._codes

    def _set_codes(
        self,
        codes,
        *,
        level=None,
        copy: bool = False,
        validate: bool = True,
        verify_integrity: bool = False,
    ) -> None:
        if validate:
            if level is None and len(codes) != self.nlevels:
                raise ValueError("Length of codes must match number of levels")
            if level is not None and len(codes) != len(level):
                raise ValueError("Length of codes must match length of levels.")

        if level is None:
            new_codes = FrozenList(
                _coerce_indexer_frozen(level_codes, lev, copy=copy).view()
                for lev, level_codes in zip(self._levels, codes)
            )
        else:
            level_numbers = [self._get_level_number(lev) for lev in level]
            new_codes_list = list(self._codes)
            for lev_num, level_codes in zip(level_numbers, codes):
                lev = self.levels[lev_num]
                new_codes_list[lev_num] = _coerce_indexer_frozen(
                    level_codes, lev, copy=copy
                )
            new_codes = FrozenList(new_codes_list)

        if verify_integrity:
            new_codes = self._verify_integrity(codes=new_codes)

        self._codes = new_codes

        self._reset_cache()

    def set_codes(self, codes, *, level=None, verify_integrity: bool = True):
        """
        Set new codes on MultiIndex. Defaults to returning new index.

        Parameters
        ----------
        codes : sequence or list of sequence
            New codes to apply.
        level : int, level name, or sequence of int/level names (default None)
            Level(s) to set (None for all levels).
        verify_integrity : bool, default True
            If True, checks that levels and codes are compatible.

        Returns
        -------
        MultiIndex

        Examples
        --------
        >>> idx = pd.MultiIndex.from_tuples(
        ...     [(1, "one"), (1, "two"), (2, "one"), (2, "two")], names=["foo", "bar"]
        ... )
        >>> idx
        MultiIndex([(1, 'one'),
                    (1, 'two'),
                    (2, 'one'),
                    (2, 'two')],
                   names=['foo', 'bar'])

        >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]])
        MultiIndex([(2, 'one'),
                    (1, 'one'),
                    (2, 'two'),
                    (1, 'two')],
                   names=['foo', 'bar'])
        >>> idx.set_codes([1, 0, 1, 0], level=0)
        MultiIndex([(2, 'one'),
                    (1, 'two'),
                    (2, 'one'),
                    (1, 'two')],
                   names=['foo', 'bar'])
        >>> idx.set_codes([0, 0, 1, 1], level='bar')
        MultiIndex([(1, 'one'),
                    (1, 'one'),
                    (2, 'two'),
                    (2, 'two')],
                   names=['foo', 'bar'])
        >>> idx.set_codes([[1, 0, 1, 0], [0, 0, 1, 1]], level=[0, 1])
        MultiIndex([(2, 'one'),
                    (1, 'one'),
                    (2, 'two'),
                    (1, 'two')],
                   names=['foo', 'bar'])
        """
        level, codes = _require_listlike(level, codes, "Codes")
        idx = self._view()
        idx._reset_identity()
        idx._set_codes(codes, level=level, verify_integrity=verify_integrity)
        return idx

    # --------------------------------------------------------------------
    # Index Internals

    @cache_readonly
    def _engine(self):
        # Calculate the number of bits needed to represent labels in each
        # level, as log2 of their sizes:
        # NaN values are shifted to 1 and missing values in other while
        # calculating the indexer are shifted to 0
        sizes = np.ceil(
            np.log2(
                [
                    len(level)
                    + libindex.multiindex_nulls_shift  # type: ignore[attr-defined]
                    for level in self.levels
                ]
            )
        )

        # Sum bit counts, starting from the _right_....
        lev_bits = np.cumsum(sizes[::-1])[::-1]

        # ... in order to obtain offsets such that sorting the combination of
        # shifted codes (one for each level, resulting in a unique integer) is
        # equivalent to sorting lexicographically the codes themselves. Notice
        # that each level needs to be shifted by the number of bits needed to
        # represent the _previous_ ones:
        offsets = np.concatenate([lev_bits[1:], [0]]).astype("uint64")

        # Check the total number of bits needed for our representation:
        if lev_bits[0] > 64:
            # The levels would overflow a 64 bit uint - use Python integers:
            return MultiIndexPyIntEngine(self.levels, self.codes, offsets)
        return MultiIndexUIntEngine(self.levels, self.codes, offsets)
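
    # Illustrative sketch, not from the original module: assuming the per-level
    # bit widths above came out as ``sizes == [3., 2.]``, then
    # ``lev_bits == [5., 2.]`` and ``offsets == [2, 0]``: level-0 codes are
    # shifted past the 2 bits reserved for level 1, and the 5-bit total fits in
    # a uint64, so MultiIndexUIntEngine would be selected.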

    # Return type "Callable[..., MultiIndex]" of "_constructor" incompatible with return
    # type "Type[MultiIndex]" in supertype "Index"
    @property
    def _constructor(self) -> Callable[..., MultiIndex]:  # type: ignore[override]
        return type(self).from_tuples

    @doc(Index._shallow_copy)
    def _shallow_copy(self, values: np.ndarray, name=lib.no_default) -> MultiIndex:
        names = name if name is not lib.no_default else self.names

        return type(self).from_tuples(values, sortorder=None, names=names)

    def _view(self) -> MultiIndex:
        result = type(self)(
            levels=self.levels,
            codes=self.codes,
            sortorder=self.sortorder,
            names=self.names,
            verify_integrity=False,
        )
        result._cache = self._cache.copy()
        result._cache.pop("levels", None)  # GH32669
        return result

    # --------------------------------------------------------------------

    # error: Signature of "copy" incompatible with supertype "Index"
    def copy(  # type: ignore[override]
        self,
        names=None,
        deep: bool = False,
        name=None,
    ):
        """
        Make a copy of this object.

        Names, dtype, levels and codes can be passed and will be set on new copy.

        Parameters
        ----------
        names : sequence, optional
        deep : bool, default False
        name : Label
            Kept for compatibility with 1-dimensional Index. Should not be used.

        Returns
        -------
        MultiIndex

        Notes
        -----
        In most cases, there should be no functional difference from using
        ``deep``, but if ``deep`` is passed it will attempt to deepcopy.
        This could be potentially expensive on large MultiIndex objects.

        Examples
        --------
        >>> mi = pd.MultiIndex.from_arrays([['a'], ['b'], ['c']])
        >>> mi
        MultiIndex([('a', 'b', 'c')],
                   )
        >>> mi.copy()
        MultiIndex([('a', 'b', 'c')],
                   )
        """
        names = self._validate_names(name=name, names=names, deep=deep)
        keep_id = not deep
        levels, codes = None, None

        if deep:
            from copy import deepcopy

            levels = deepcopy(self.levels)
            codes = deepcopy(self.codes)

        levels = levels if levels is not None else self.levels
        codes = codes if codes is not None else self.codes

        new_index = type(self)(
            levels=levels,
            codes=codes,
            sortorder=self.sortorder,
            names=names,
            verify_integrity=False,
        )
        new_index._cache = self._cache.copy()
        new_index._cache.pop("levels", None)  # GH32669
        if keep_id:
            new_index._id = self._id
        return new_index

    def __array__(self, dtype=None) -> np.ndarray:
        """the array interface, return my values"""
        return self.values

    def view(self, cls=None):
        """this is defined as a copy with the same identity"""
        result = self.copy()
        result._id = self._id
        return result

    @doc(Index.__contains__)
    def __contains__(self, key: Any) -> bool:
        hash(key)
        try:
            self.get_loc(key)
            return True
        except (LookupError, TypeError, ValueError):
            return False

    @cache_readonly
    def dtype(self) -> np.dtype:
        return np.dtype("O")

    def _is_memory_usage_qualified(self) -> bool:
        """return a boolean if we need a qualified .info display"""

        def f(level) -> bool:
            return "mixed" in level or "string" in level or "unicode" in level

        return any(f(level) for level in self._inferred_type_levels)

    # Cannot determine type of "memory_usage"
    @doc(Index.memory_usage)  # type: ignore[has-type]
    def memory_usage(self, deep: bool = False) -> int:
        # we are overwriting our base class to avoid
        # computing .values here which could materialize
        # a tuple representation unnecessarily
        return self._nbytes(deep)

    @cache_readonly
    def nbytes(self) -> int:
        """return the number of bytes in the underlying data"""
        return self._nbytes(False)

    def _nbytes(self, deep: bool = False) -> int:
        """
        return the number of bytes in the underlying data
        deeply introspect the level data if deep=True

        include the engine hashtable

        *this is an internal routine*
  1083. """
  1084. # for implementations with no useful getsizeof (PyPy)
  1085. objsize = 24
  1086. level_nbytes = sum(i.memory_usage(deep=deep) for i in self.levels)
  1087. label_nbytes = sum(i.nbytes for i in self.codes)
  1088. names_nbytes = sum(getsizeof(i, objsize) for i in self.names)
  1089. result = level_nbytes + label_nbytes + names_nbytes
  1090. # include our engine hashtable
  1091. result += self._engine.sizeof(deep=deep)
  1092. return result
  1093. # --------------------------------------------------------------------
  1094. # Rendering Methods
  1095. def _formatter_func(self, tup):
  1096. """
  1097. Formats each item in tup according to its level's formatter function.
  1098. """
  1099. formatter_funcs = [level._formatter_func for level in self.levels]
  1100. return tuple(func(val) for func, val in zip(formatter_funcs, tup))
  1101. def _format_native_types(
  1102. self, *, na_rep: str = "nan", **kwargs
  1103. ) -> npt.NDArray[np.object_]:
  1104. new_levels = []
  1105. new_codes = []
  1106. # go through the levels and format them
  1107. for level, level_codes in zip(self.levels, self.codes):
  1108. level_strs = level._format_native_types(na_rep=na_rep, **kwargs)
  1109. # add nan values, if there are any
  1110. mask = level_codes == -1
  1111. if mask.any():
  1112. nan_index = len(level_strs)
  1113. # numpy 1.21 deprecated implicit string casting
  1114. level_strs = level_strs.astype(str)
  1115. level_strs = np.append(level_strs, na_rep)
  1116. assert not level_codes.flags.writeable # i.e. copy is needed
  1117. level_codes = level_codes.copy() # make writeable
  1118. level_codes[mask] = nan_index
  1119. new_levels.append(level_strs)
  1120. new_codes.append(level_codes)
  1121. if len(new_levels) == 1:
  1122. # a single-level multi-index
  1123. return Index(new_levels[0].take(new_codes[0]))._format_native_types()
  1124. else:
  1125. # reconstruct the multi-index
  1126. mi = MultiIndex(
  1127. levels=new_levels,
  1128. codes=new_codes,
  1129. names=self.names,
  1130. sortorder=self.sortorder,
  1131. verify_integrity=False,
  1132. )
  1133. return mi._values
  1134. def format(
  1135. self,
  1136. name: bool | None = None,
  1137. formatter: Callable | None = None,
  1138. na_rep: str | None = None,
  1139. names: bool = False,
  1140. space: int = 2,
  1141. sparsify=None,
  1142. adjoin: bool = True,
  1143. ) -> list:
  1144. if name is not None:
  1145. names = name
  1146. if len(self) == 0:
  1147. return []
  1148. stringified_levels = []
  1149. for lev, level_codes in zip(self.levels, self.codes):
  1150. na = na_rep if na_rep is not None else _get_na_rep(lev.dtype)
  1151. if len(lev) > 0:
  1152. formatted = lev.take(level_codes).format(formatter=formatter)
  1153. # we have some NA
  1154. mask = level_codes == -1
  1155. if mask.any():
  1156. formatted = np.array(formatted, dtype=object)
  1157. formatted[mask] = na
  1158. formatted = formatted.tolist()
  1159. else:
  1160. # weird all NA case
  1161. formatted = [
  1162. pprint_thing(na if isna(x) else x, escape_chars=("\t", "\r", "\n"))
  1163. for x in algos.take_nd(lev._values, level_codes)
  1164. ]
  1165. stringified_levels.append(formatted)
  1166. result_levels = []
  1167. for lev, lev_name in zip(stringified_levels, self.names):
  1168. level = []
  1169. if names:
  1170. level.append(
  1171. pprint_thing(lev_name, escape_chars=("\t", "\r", "\n"))
  1172. if lev_name is not None
  1173. else ""
  1174. )
  1175. level.extend(np.array(lev, dtype=object))
  1176. result_levels.append(level)
  1177. if sparsify is None:
  1178. sparsify = get_option("display.multi_sparse")
  1179. if sparsify:
  1180. sentinel: Literal[""] | bool | lib.NoDefault = ""
  1181. # GH3547 use value of sparsify as sentinel if it's "Falsey"
  1182. assert isinstance(sparsify, bool) or sparsify is lib.no_default
  1183. if sparsify in [False, lib.no_default]:
  1184. sentinel = sparsify
  1185. # little bit of a kludge job for #1217
  1186. result_levels = sparsify_labels(
  1187. result_levels, start=int(names), sentinel=sentinel
  1188. )
  1189. if adjoin:
  1190. from pandas.io.formats.format import get_adjustment
  1191. adj = get_adjustment()
  1192. return adj.adjoin(space, *result_levels).split("\n")
  1193. else:
  1194. return result_levels
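    # Sparsification sketch: with the display.multi_sparse option enabled,
    # repeated labels in an outer level are replaced by the sentinel "", so
    # [("a", 1), ("a", 2)] renders the second "a" as a blank in console output.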
  1195. # --------------------------------------------------------------------
  1196. # Names Methods
  1197. def _get_names(self) -> FrozenList:
  1198. return FrozenList(self._names)
  1199. def _set_names(self, names, *, level=None, validate: bool = True):
  1200. """
  1201. Set new names on index. Each name has to be a hashable type.
  1202. Parameters
  1203. ----------
names : str or sequence
  1205. name(s) to set
  1206. level : int, level name, or sequence of int/level names (default None)
  1207. If the index is a MultiIndex (hierarchical), level(s) to set (None
  1208. for all levels). Otherwise level must be None
  1209. validate : bool, default True
  1210. validate that the names match level lengths
  1211. Raises
  1212. ------
  1213. TypeError if each name is not hashable.
  1214. Notes
  1215. -----
  1216. sets names on levels. WARNING: mutates!
  1217. Note that you generally want to set this *after* changing levels, so
  1218. that it only acts on copies
  1219. """
  1220. # GH 15110
  1221. # Don't allow a single string for names in a MultiIndex
  1222. if names is not None and not is_list_like(names):
  1223. raise ValueError("Names should be list-like for a MultiIndex")
  1224. names = list(names)
  1225. if validate:
  1226. if level is not None and len(names) != len(level):
  1227. raise ValueError("Length of names must match length of level.")
  1228. if level is None and len(names) != self.nlevels:
  1229. raise ValueError(
  1230. "Length of names must match number of levels in MultiIndex."
  1231. )
  1232. if level is None:
  1233. level = range(self.nlevels)
  1234. else:
  1235. level = [self._get_level_number(lev) for lev in level]
  1236. # set the name
  1237. for lev, name in zip(level, names):
  1238. if name is not None:
  1239. # GH 20527
  1240. # All items in 'names' need to be hashable:
  1241. if not is_hashable(name):
  1242. raise TypeError(
  1243. f"{type(self).__name__}.name must be a hashable type"
  1244. )
  1245. self._names[lev] = name
  1246. # If .levels has been accessed, the names in our cache will be stale.
  1247. self._reset_cache()
  1248. names = property(
  1249. fset=_set_names,
  1250. fget=_get_names,
  1251. doc="""
  1252. Names of levels in MultiIndex.
  1253. Examples
  1254. --------
  1255. >>> mi = pd.MultiIndex.from_arrays(
  1256. ... [[1, 2], [3, 4], [5, 6]], names=['x', 'y', 'z'])
  1257. >>> mi
  1258. MultiIndex([(1, 3, 5),
  1259. (2, 4, 6)],
  1260. names=['x', 'y', 'z'])
  1261. >>> mi.names
  1262. FrozenList(['x', 'y', 'z'])
  1263. """,
  1264. )
    # --------------------------------------------------------------------

    @cache_readonly
    def inferred_type(self) -> str:
        return "mixed"

    def _get_level_number(self, level) -> int:
        count = self.names.count(level)
        if (count > 1) and not is_integer(level):
            raise ValueError(
                f"The name {level} occurs multiple times, use a level number"
            )
        try:
            level = self.names.index(level)
        except ValueError as err:
            if not is_integer(level):
                raise KeyError(f"Level {level} not found") from err
            if level < 0:
                level += self.nlevels
                if level < 0:
                    orig_level = level - self.nlevels
                    raise IndexError(
                        f"Too many levels: Index has only {self.nlevels} levels, "
                        f"{orig_level} is not a valid level number"
                    ) from err
            # Note: levels are zero-based
            elif level >= self.nlevels:
                raise IndexError(
                    f"Too many levels: Index has only {self.nlevels} levels, "
                    f"not {level + 1}"
                ) from err
        return level
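    # For example, with names ["x", "y"], _get_level_number("y") and
    # _get_level_number(-1) both return 1, an unknown name raises KeyError,
    # and an out-of-range integer such as 2 or -3 raises IndexError.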
  1295. @cache_readonly
  1296. def is_monotonic_increasing(self) -> bool:
  1297. """
  1298. Return a boolean if the values are equal or increasing.
  1299. """
  1300. if any(-1 in code for code in self.codes):
  1301. return False
  1302. if all(level.is_monotonic_increasing for level in self.levels):
  1303. # If each level is sorted, we can operate on the codes directly. GH27495
  1304. return libalgos.is_lexsorted(
  1305. [x.astype("int64", copy=False) for x in self.codes]
  1306. )
  1307. # reversed() because lexsort() wants the most significant key last.
  1308. values = [
  1309. self._get_level_values(i)._values for i in reversed(range(len(self.levels)))
  1310. ]
  1311. try:
  1312. # error: Argument 1 to "lexsort" has incompatible type
  1313. # "List[Union[ExtensionArray, ndarray[Any, Any]]]";
  1314. # expected "Union[_SupportsArray[dtype[Any]],
  1315. # _NestedSequence[_SupportsArray[dtype[Any]]], bool,
  1316. # int, float, complex, str, bytes, _NestedSequence[Union
  1317. # [bool, int, float, complex, str, bytes]]]"
  1318. sort_order = np.lexsort(values) # type: ignore[arg-type]
  1319. return Index(sort_order).is_monotonic_increasing
  1320. except TypeError:
  1321. # we have mixed types and np.lexsort is not happy
  1322. return Index(self._values).is_monotonic_increasing
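    # Example: MultiIndex.from_arrays([[1, 1, 2], [1, 2, 0]]) is monotonic
    # increasing because ties in the first level are broken by an increasing
    # second level, while any -1 code (a missing value) short-circuits to False.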
    @cache_readonly
    def is_monotonic_decreasing(self) -> bool:
        """
        Return a boolean if the values are equal or decreasing.
        """
        # monotonic decreasing if and only if reverse is monotonic increasing
        return self[::-1].is_monotonic_increasing

    @cache_readonly
    def _inferred_type_levels(self) -> list[str]:
        """return a list of the inferred types, one for each level"""
        return [i.inferred_type for i in self.levels]

    @doc(Index.duplicated)
    def duplicated(self, keep: DropKeep = "first") -> npt.NDArray[np.bool_]:
        shape = tuple(len(lev) for lev in self.levels)
        ids = get_group_index(self.codes, shape, sort=False, xnull=False)
        return duplicated(ids, keep)

    # error: Cannot override final attribute "_duplicated"
    # (previously declared in base class "IndexOpsMixin")
    _duplicated = duplicated  # type: ignore[misc]
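    # get_group_index collapses each row's codes into a single integer id, so
    # duplicate detection reduces to spotting repeated ids; e.g. codes
    # [[0, 0, 1], [1, 1, 0]] give the first two rows the same id, and
    # keep="first" then flags only the second row as a duplicate.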
    def fillna(self, value=None, downcast=None):
        """
        fillna is not implemented for MultiIndex.
        """
        raise NotImplementedError("fillna is not implemented for MultiIndex")
    @doc(Index.dropna)
    def dropna(self, how: AnyAll = "any") -> MultiIndex:
        nans = [level_codes == -1 for level_codes in self.codes]
        if how == "any":
            indexer = np.any(nans, axis=0)
        elif how == "all":
            indexer = np.all(nans, axis=0)
        else:
            raise ValueError(f"invalid how option: {how}")

        new_codes = [level_codes[~indexer] for level_codes in self.codes]
        return self.set_codes(codes=new_codes)
    def _get_level_values(self, level: int, unique: bool = False) -> Index:
        """
        Return vector of label values for requested level,
        equal to the length of the index

        **this is an internal method**

        Parameters
        ----------
        level : int
        unique : bool, default False
            if True, drop duplicated values

        Returns
        -------
        Index
        """
        lev = self.levels[level]
        level_codes = self.codes[level]
        name = self._names[level]
        if unique:
            level_codes = algos.unique(level_codes)
        filled = algos.take_nd(lev._values, level_codes, fill_value=lev._na_value)
        return lev._shallow_copy(filled, name=name)
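    # Example (sketch): for a level ["a", "b"] with codes [0, -1, 1], take_nd
    # fills the -1 position with the level's NA value, giving an Index holding
    # ["a", NaN, "b"]; with unique=True the codes are deduplicated first.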
  1379. def get_level_values(self, level):
  1380. """
  1381. Return vector of label values for requested level.
  1382. Length of returned vector is equal to the length of the index.
  1383. Parameters
  1384. ----------
  1385. level : int or str
  1386. ``level`` is either the integer position of the level in the
  1387. MultiIndex, or the name of the level.
  1388. Returns
  1389. -------
  1390. Index
  1391. Values is a level of this MultiIndex converted to
  1392. a single :class:`Index` (or subclass thereof).
  1393. Notes
  1394. -----
  1395. If the level contains missing values, the result may be casted to
  1396. ``float`` with missing values specified as ``NaN``. This is because
  1397. the level is converted to a regular ``Index``.
  1398. Examples
  1399. --------
  1400. Create a MultiIndex:
  1401. >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def')))
  1402. >>> mi.names = ['level_1', 'level_2']
  1403. Get level values by supplying level as either integer or name:
  1404. >>> mi.get_level_values(0)
  1405. Index(['a', 'b', 'c'], dtype='object', name='level_1')
  1406. >>> mi.get_level_values('level_2')
  1407. Index(['d', 'e', 'f'], dtype='object', name='level_2')
  1408. If a level contains missing values, the return type of the level
  1409. may be cast to ``float``.
  1410. >>> pd.MultiIndex.from_arrays([[1, None, 2], [3, 4, 5]]).dtypes
  1411. level_0 int64
  1412. level_1 int64
  1413. dtype: object
  1414. >>> pd.MultiIndex.from_arrays([[1, None, 2], [3, 4, 5]]).get_level_values(0)
  1415. Index([1.0, nan, 2.0], dtype='float64')
  1416. """
        level = self._get_level_number(level)
        values = self._get_level_values(level)
        return values

    @doc(Index.unique)
    def unique(self, level=None):
        if level is None:
            return self.drop_duplicates()
        else:
            level = self._get_level_number(level)
            return self._get_level_values(level=level, unique=True)
  1427. def to_frame(
  1428. self,
  1429. index: bool = True,
  1430. name=lib.no_default,
  1431. allow_duplicates: bool = False,
  1432. ) -> DataFrame:
  1433. """
  1434. Create a DataFrame with the levels of the MultiIndex as columns.
  1435. Column ordering is determined by the DataFrame constructor with data as
  1436. a dict.
  1437. Parameters
  1438. ----------
  1439. index : bool, default True
  1440. Set the index of the returned DataFrame as the original MultiIndex.
  1441. name : list / sequence of str, optional
  1442. The passed names should substitute index level names.
allow_duplicates : bool, optional, default False
  1444. Allow duplicate column labels to be created.
  1445. .. versionadded:: 1.5.0
  1446. Returns
  1447. -------
  1448. DataFrame
  1449. See Also
  1450. --------
  1451. DataFrame : Two-dimensional, size-mutable, potentially heterogeneous
  1452. tabular data.
  1453. Examples
  1454. --------
  1455. >>> mi = pd.MultiIndex.from_arrays([['a', 'b'], ['c', 'd']])
  1456. >>> mi
  1457. MultiIndex([('a', 'c'),
  1458. ('b', 'd')],
  1459. )
  1460. >>> df = mi.to_frame()
  1461. >>> df
  1462. 0 1
  1463. a c a c
  1464. b d b d
  1465. >>> df = mi.to_frame(index=False)
  1466. >>> df
  1467. 0 1
  1468. 0 a c
  1469. 1 b d
  1470. >>> df = mi.to_frame(name=['x', 'y'])
  1471. >>> df
  1472. x y
  1473. a c a c
  1474. b d b d
  1475. """
  1476. from pandas import DataFrame
  1477. if name is not lib.no_default:
  1478. if not is_list_like(name):
  1479. raise TypeError("'name' must be a list / sequence of column names.")
  1480. if len(name) != len(self.levels):
  1481. raise ValueError(
  1482. "'name' should have same length as number of levels on index."
  1483. )
  1484. idx_names = name
  1485. else:
  1486. idx_names = self._get_level_names()
  1487. if not allow_duplicates and len(set(idx_names)) != len(idx_names):
  1488. raise ValueError(
  1489. "Cannot create duplicate column labels if allow_duplicates is False"
  1490. )
  1491. # Guarantee resulting column order - PY36+ dict maintains insertion order
  1492. result = DataFrame(
  1493. {level: self._get_level_values(level) for level in range(len(self.levels))},
  1494. copy=False,
  1495. )
  1496. result.columns = idx_names
  1497. if index:
  1498. result.index = self
  1499. return result
  1500. # error: Return type "Index" of "to_flat_index" incompatible with return type
  1501. # "MultiIndex" in supertype "Index"
  1502. def to_flat_index(self) -> Index: # type: ignore[override]
  1503. """
  1504. Convert a MultiIndex to an Index of Tuples containing the level values.
  1505. Returns
  1506. -------
  1507. pd.Index
  1508. Index with the MultiIndex data represented in Tuples.
  1509. See Also
  1510. --------
  1511. MultiIndex.from_tuples : Convert flat index back to MultiIndex.
  1512. Notes
  1513. -----
  1514. This method will simply return the caller if called by anything other
  1515. than a MultiIndex.
  1516. Examples
  1517. --------
  1518. >>> index = pd.MultiIndex.from_product(
  1519. ... [['foo', 'bar'], ['baz', 'qux']],
  1520. ... names=['a', 'b'])
  1521. >>> index.to_flat_index()
  1522. Index([('foo', 'baz'), ('foo', 'qux'),
  1523. ('bar', 'baz'), ('bar', 'qux')],
  1524. dtype='object')
  1525. """
  1526. return Index(self._values, tupleize_cols=False)
  1527. def _is_lexsorted(self) -> bool:
  1528. """
  1529. Return True if the codes are lexicographically sorted.
  1530. Returns
  1531. -------
  1532. bool
  1533. Examples
  1534. --------
  1535. In the below examples, the first level of the MultiIndex is sorted because
  1536. a<b<c, so there is no need to look at the next level.
  1537. >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'],
  1538. ... ['d', 'e', 'f']])._is_lexsorted()
  1539. True
  1540. >>> pd.MultiIndex.from_arrays([['a', 'b', 'c'],
  1541. ... ['d', 'f', 'e']])._is_lexsorted()
  1542. True
  1543. In case there is a tie, the lexicographical sorting looks
  1544. at the next level of the MultiIndex.
  1545. >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'b', 'c']])._is_lexsorted()
  1546. True
  1547. >>> pd.MultiIndex.from_arrays([[0, 1, 1], ['a', 'c', 'b']])._is_lexsorted()
  1548. False
  1549. >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'],
  1550. ... ['aa', 'bb', 'aa', 'bb']])._is_lexsorted()
  1551. True
  1552. >>> pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'],
  1553. ... ['bb', 'aa', 'aa', 'bb']])._is_lexsorted()
  1554. False
  1555. """
        return self._lexsort_depth == self.nlevels

    @cache_readonly
    def _lexsort_depth(self) -> int:
        """
        Compute and return the lexsort_depth, the number of levels of the
        MultiIndex that are sorted lexically

        Returns
        -------
        int
        """
        if self.sortorder is not None:
            return self.sortorder
        return _lexsort_depth(self.codes, self.nlevels)
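    # Example: for codes [[0, 0, 1], [1, 0, 0]] the first level's codes are
    # non-decreasing but ties in it are not broken in order by the second, so
    # the lexsort depth is 1; a fully lexsorted MultiIndex has depth nlevels.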
  1569. def _sort_levels_monotonic(self, raise_if_incomparable: bool = False) -> MultiIndex:
  1570. """
  1571. This is an *internal* function.
  1572. Create a new MultiIndex from the current to monotonically sorted
  1573. items IN the levels. This does not actually make the entire MultiIndex
  1574. monotonic, JUST the levels.
  1575. The resulting MultiIndex will have the same outward
  1576. appearance, meaning the same .values and ordering. It will also
  1577. be .equals() to the original.
  1578. Returns
  1579. -------
  1580. MultiIndex
  1581. Examples
  1582. --------
  1583. >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
  1584. ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
  1585. >>> mi
  1586. MultiIndex([('a', 'bb'),
  1587. ('a', 'aa'),
  1588. ('b', 'bb'),
  1589. ('b', 'aa')],
  1590. )
  1591. >>> mi.sort_values()
  1592. MultiIndex([('a', 'aa'),
  1593. ('a', 'bb'),
  1594. ('b', 'aa'),
  1595. ('b', 'bb')],
  1596. )
  1597. """
        if self._is_lexsorted() and self.is_monotonic_increasing:
            return self

        new_levels = []
        new_codes = []

        for lev, level_codes in zip(self.levels, self.codes):
            if not lev.is_monotonic_increasing:
                try:
                    # indexer to reorder the levels
                    indexer = lev.argsort()
                except TypeError:
                    if raise_if_incomparable:
                        raise
                else:
                    lev = lev.take(indexer)

                    # indexer to reorder the level codes
                    indexer = ensure_platform_int(indexer)
                    ri = lib.get_reverse_indexer(indexer, len(indexer))
                    level_codes = algos.take_nd(ri, level_codes)

            new_levels.append(lev)
            new_codes.append(level_codes)

        return MultiIndex(
            new_levels,
            new_codes,
            names=self.names,
            sortorder=self.sortorder,
            verify_integrity=False,
        )
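    # Example (sketch): a level ["b", "a"] with codes [0, 1] is rebuilt as the
    # sorted level ["a", "b"] with codes remapped through get_reverse_indexer
    # to [1, 0], so the tuples, and hence .values, are unchanged.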
  1625. def remove_unused_levels(self) -> MultiIndex:
  1626. """
  1627. Create new MultiIndex from current that removes unused levels.
  1628. Unused level(s) means levels that are not expressed in the
  1629. labels. The resulting MultiIndex will have the same outward
  1630. appearance, meaning the same .values and ordering. It will
  1631. also be .equals() to the original.
  1632. Returns
  1633. -------
  1634. MultiIndex
  1635. Examples
  1636. --------
  1637. >>> mi = pd.MultiIndex.from_product([range(2), list('ab')])
  1638. >>> mi
  1639. MultiIndex([(0, 'a'),
  1640. (0, 'b'),
  1641. (1, 'a'),
  1642. (1, 'b')],
  1643. )
  1644. >>> mi[2:]
  1645. MultiIndex([(1, 'a'),
  1646. (1, 'b')],
  1647. )
  1648. The 0 from the first level is not represented
  1649. and can be removed
  1650. >>> mi2 = mi[2:].remove_unused_levels()
  1651. >>> mi2.levels
  1652. FrozenList([[1], ['a', 'b']])
  1653. """
        new_levels = []
        new_codes = []
        changed = False
        for lev, level_codes in zip(self.levels, self.codes):
            # Since few levels are typically unused, bincount() is more
            # efficient than unique() - however it only accepts positive values
            # (and drops order):
            uniques = np.where(np.bincount(level_codes + 1) > 0)[0] - 1
            has_na = int(len(uniques) and (uniques[0] == -1))

            if len(uniques) != len(lev) + has_na:
                if lev.isna().any() and len(uniques) == len(lev):
                    break
                # We have unused levels
                changed = True

                # Recalculate uniques, now preserving order.
                # Can easily be cythonized by exploiting the already existing
                # "uniques" and stop parsing "level_codes" when all items
                # are found:
                uniques = algos.unique(level_codes)
                if has_na:
                    na_idx = np.where(uniques == -1)[0]
                    # Just ensure that -1 is in first position:
                    uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]]

                # codes get mapped from uniques to 0:len(uniques)
                # -1 (if present) is mapped to last position
                code_mapping = np.zeros(len(lev) + has_na)
                # ... and reassigned value -1:
                code_mapping[uniques] = np.arange(len(uniques)) - has_na

                level_codes = code_mapping[level_codes]

                # new levels are simple
                lev = lev.take(uniques[has_na:])

            new_levels.append(lev)
            new_codes.append(level_codes)

        result = self.view()

        if changed:
            result._reset_identity()
            result._set_levels(new_levels, validate=False)
            result._set_codes(new_codes, validate=False)

        return result
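    # The bincount trick above shifts codes by +1 so that -1 (missing) lands in
    # bin 0; e.g. codes [2, 2, 3] over a four-element level yield uniques
    # [2, 3], which are remapped to [0, 1] while the level is shrunk to the two
    # values actually used.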
    # --------------------------------------------------------------------
    # Pickling Methods

    def __reduce__(self):
        """Necessary for making this object picklable"""
        d = {
            "levels": list(self.levels),
            "codes": list(self.codes),
            "sortorder": self.sortorder,
            "names": list(self.names),
        }
        return ibase._new_Index, (type(self), d), None
  1704. # --------------------------------------------------------------------
  1705. def __getitem__(self, key):
  1706. if is_scalar(key):
  1707. key = com.cast_scalar_indexer(key)
  1708. retval = []
  1709. for lev, level_codes in zip(self.levels, self.codes):
  1710. if level_codes[key] == -1:
  1711. retval.append(np.nan)
  1712. else:
  1713. retval.append(lev[level_codes[key]])
  1714. return tuple(retval)
  1715. else:
  1716. # in general cannot be sure whether the result will be sorted
  1717. sortorder = None
  1718. if com.is_bool_indexer(key):
  1719. key = np.asarray(key, dtype=bool)
  1720. sortorder = self.sortorder
  1721. elif isinstance(key, slice):
  1722. if key.step is None or key.step > 0:
  1723. sortorder = self.sortorder
  1724. elif isinstance(key, Index):
  1725. key = np.asarray(key)
  1726. new_codes = [level_codes[key] for level_codes in self.codes]
  1727. return MultiIndex(
  1728. levels=self.levels,
  1729. codes=new_codes,
  1730. names=self.names,
  1731. sortorder=sortorder,
  1732. verify_integrity=False,
  1733. )
    def _getitem_slice(self: MultiIndex, slobj: slice) -> MultiIndex:
        """
        Fastpath for __getitem__ when we know we have a slice.
        """
        sortorder = None
        if slobj.step is None or slobj.step > 0:
            sortorder = self.sortorder

        new_codes = [level_codes[slobj] for level_codes in self.codes]
        return type(self)(
            levels=self.levels,
            codes=new_codes,
            names=self._names,
            sortorder=sortorder,
            verify_integrity=False,
        )
    @Appender(_index_shared_docs["take"] % _index_doc_kwargs)
    def take(
        self: MultiIndex,
        indices,
        axis: Axis = 0,
        allow_fill: bool = True,
        fill_value=None,
        **kwargs,
    ) -> MultiIndex:
        nv.validate_take((), kwargs)
        indices = ensure_platform_int(indices)

        # only fill if we are passing a non-None fill_value
        allow_fill = self._maybe_disallow_fill(allow_fill, fill_value, indices)

        na_value = -1
        taken = [lab.take(indices) for lab in self.codes]
        if allow_fill:
            mask = indices == -1
            if mask.any():
                masked = []
                for new_label in taken:
                    label_values = new_label
                    label_values[mask] = na_value
                    masked.append(np.asarray(label_values))
                taken = masked

        return MultiIndex(
            levels=self.levels, codes=taken, names=self.names, verify_integrity=False
        )
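    # Example (sketch): mi.take([1, -1], allow_fill=True, fill_value=np.nan)
    # keeps position 1 and appends an all-missing row, because each level's
    # taken codes are set to -1 wherever the indices array equals -1.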
  1776. def append(self, other):
  1777. """
  1778. Append a collection of Index options together.
  1779. Parameters
  1780. ----------
  1781. other : Index or list/tuple of indices
  1782. Returns
  1783. -------
  1784. Index
  1785. The combined index.
  1786. Examples
  1787. --------
  1788. >>> mi = pd.MultiIndex.from_arrays([['a'], ['b']])
  1789. >>> mi
  1790. MultiIndex([('a', 'b')],
  1791. )
  1792. >>> mi.append(mi)
  1793. MultiIndex([('a', 'b'), ('a', 'b')],
  1794. )
  1795. """
  1796. if not isinstance(other, (list, tuple)):
  1797. other = [other]
  1798. if all(
  1799. (isinstance(o, MultiIndex) and o.nlevels >= self.nlevels) for o in other
  1800. ):
  1801. arrays, names = [], []
  1802. for i in range(self.nlevels):
  1803. label = self._get_level_values(i)
  1804. appended = [o._get_level_values(i) for o in other]
  1805. arrays.append(label.append(appended))
  1806. single_label_name = all(label.name == x.name for x in appended)
  1807. names.append(label.name if single_label_name else None)
  1808. return MultiIndex.from_arrays(arrays, names=names)
  1809. to_concat = (self._values,) + tuple(k._values for k in other)
  1810. new_tuples = np.concatenate(to_concat)
  1811. # if all(isinstance(x, MultiIndex) for x in other):
  1812. try:
  1813. # We only get here if other contains at least one index with tuples,
  1814. # setting names to None automatically
  1815. return MultiIndex.from_tuples(new_tuples)
  1816. except (TypeError, IndexError):
  1817. return Index(new_tuples)
  1818. def argsort(self, *args, **kwargs) -> npt.NDArray[np.intp]:
  1819. if len(args) == 0 and len(kwargs) == 0:
  1820. # lexsort is significantly faster than self._values.argsort()
  1821. target = self._sort_levels_monotonic(raise_if_incomparable=True)
  1822. return lexsort_indexer(target._get_codes_for_sorting())
  1823. return self._values.argsort(*args, **kwargs)
  1824. @Appender(_index_shared_docs["repeat"] % _index_doc_kwargs)
  1825. def repeat(self, repeats: int, axis=None) -> MultiIndex:
  1826. nv.validate_repeat((), {"axis": axis})
  1827. # error: Incompatible types in assignment (expression has type "ndarray",
  1828. # variable has type "int")
  1829. repeats = ensure_platform_int(repeats) # type: ignore[assignment]
  1830. return MultiIndex(
  1831. levels=self.levels,
  1832. codes=[
  1833. level_codes.view(np.ndarray).astype(np.intp, copy=False).repeat(repeats)
  1834. for level_codes in self.codes
  1835. ],
  1836. names=self.names,
  1837. sortorder=self.sortorder,
  1838. verify_integrity=False,
  1839. )
  1840. # error: Signature of "drop" incompatible with supertype "Index"
  1841. def drop( # type: ignore[override]
  1842. self,
  1843. codes,
  1844. level: Index | np.ndarray | Iterable[Hashable] | None = None,
  1845. errors: IgnoreRaise = "raise",
  1846. ) -> MultiIndex:
  1847. """
  1848. Make new MultiIndex with passed list of codes deleted.
  1849. Parameters
  1850. ----------
  1851. codes : array-like
  1852. Must be a list of tuples when level is not specified.
  1853. level : int or level name, default None
  1854. errors : str, default 'raise'
  1855. Returns
  1856. -------
  1857. MultiIndex
  1858. """
  1859. if level is not None:
  1860. return self._drop_from_level(codes, level, errors)
  1861. if not isinstance(codes, (np.ndarray, Index)):
  1862. try:
  1863. codes = com.index_labels_to_array(codes, dtype=np.dtype("object"))
  1864. except ValueError:
  1865. pass
  1866. inds = []
  1867. for level_codes in codes:
  1868. try:
  1869. loc = self.get_loc(level_codes)
  1870. # get_loc returns either an integer, a slice, or a boolean
  1871. # mask
  1872. if isinstance(loc, int):
  1873. inds.append(loc)
  1874. elif isinstance(loc, slice):
  1875. step = loc.step if loc.step is not None else 1
  1876. inds.extend(range(loc.start, loc.stop, step))
  1877. elif com.is_bool_indexer(loc):
  1878. if self._lexsort_depth == 0:
  1879. warnings.warn(
  1880. "dropping on a non-lexsorted multi-index "
  1881. "without a level parameter may impact performance.",
  1882. PerformanceWarning,
  1883. stacklevel=find_stack_level(),
  1884. )
  1885. loc = loc.nonzero()[0]
  1886. inds.extend(loc)
  1887. else:
  1888. msg = f"unsupported indexer of type {type(loc)}"
  1889. raise AssertionError(msg)
  1890. except KeyError:
  1891. if errors != "ignore":
  1892. raise
  1893. return self.delete(inds)
  1894. def _drop_from_level(
  1895. self, codes, level, errors: IgnoreRaise = "raise"
  1896. ) -> MultiIndex:
  1897. codes = com.index_labels_to_array(codes)
  1898. i = self._get_level_number(level)
  1899. index = self.levels[i]
  1900. values = index.get_indexer(codes)
  1901. # If nan should be dropped it will equal -1 here. We have to check which values
  1902. # are not nan and equal -1, this means they are missing in the index
  1903. nan_codes = isna(codes)
  1904. values[(np.equal(nan_codes, False)) & (values == -1)] = -2
  1905. if index.shape[0] == self.shape[0]:
  1906. values[np.equal(nan_codes, True)] = -2
  1907. not_found = codes[values == -2]
  1908. if len(not_found) != 0 and errors != "ignore":
  1909. raise KeyError(f"labels {not_found} not found in level")
  1910. mask = ~algos.isin(self.codes[i], values)
  1911. return self[mask]
  1912. def swaplevel(self, i=-2, j=-1) -> MultiIndex:
  1913. """
  1914. Swap level i with level j.
  1915. Calling this method does not change the ordering of the values.
  1916. Parameters
  1917. ----------
  1918. i : int, str, default -2
  1919. First level of index to be swapped. Can pass level name as string.
  1920. Type of parameters can be mixed.
  1921. j : int, str, default -1
  1922. Second level of index to be swapped. Can pass level name as string.
  1923. Type of parameters can be mixed.
  1924. Returns
  1925. -------
  1926. MultiIndex
  1927. A new MultiIndex.
  1928. See Also
  1929. --------
  1930. Series.swaplevel : Swap levels i and j in a MultiIndex.
  1931. DataFrame.swaplevel : Swap levels i and j in a MultiIndex on a
  1932. particular axis.
  1933. Examples
  1934. --------
  1935. >>> mi = pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
  1936. ... codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
  1937. >>> mi
  1938. MultiIndex([('a', 'bb'),
  1939. ('a', 'aa'),
  1940. ('b', 'bb'),
  1941. ('b', 'aa')],
  1942. )
  1943. >>> mi.swaplevel(0, 1)
  1944. MultiIndex([('bb', 'a'),
  1945. ('aa', 'a'),
  1946. ('bb', 'b'),
  1947. ('aa', 'b')],
  1948. )
  1949. """
        new_levels = list(self.levels)
        new_codes = list(self.codes)
        new_names = list(self.names)

        i = self._get_level_number(i)
        j = self._get_level_number(j)

        new_levels[i], new_levels[j] = new_levels[j], new_levels[i]
        new_codes[i], new_codes[j] = new_codes[j], new_codes[i]
        new_names[i], new_names[j] = new_names[j], new_names[i]
        return MultiIndex(
            levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
        )
  1961. def reorder_levels(self, order) -> MultiIndex:
  1962. """
  1963. Rearrange levels using input order. May not drop or duplicate levels.
  1964. Parameters
  1965. ----------
  1966. order : list of int or list of str
  1967. List representing new level order. Reference level by number
  1968. (position) or by key (label).
  1969. Returns
  1970. -------
  1971. MultiIndex
  1972. Examples
  1973. --------
  1974. >>> mi = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['x', 'y'])
  1975. >>> mi
  1976. MultiIndex([(1, 3),
  1977. (2, 4)],
  1978. names=['x', 'y'])
  1979. >>> mi.reorder_levels(order=[1, 0])
  1980. MultiIndex([(3, 1),
  1981. (4, 2)],
  1982. names=['y', 'x'])
  1983. >>> mi.reorder_levels(order=['y', 'x'])
  1984. MultiIndex([(3, 1),
  1985. (4, 2)],
  1986. names=['y', 'x'])
  1987. """
  1988. order = [self._get_level_number(i) for i in order]
  1989. if len(order) != self.nlevels:
  1990. raise AssertionError(
  1991. f"Length of order must be same as number of levels ({self.nlevels}), "
  1992. f"got {len(order)}"
  1993. )
  1994. new_levels = [self.levels[i] for i in order]
  1995. new_codes = [self.codes[i] for i in order]
  1996. new_names = [self.names[i] for i in order]
  1997. return MultiIndex(
  1998. levels=new_levels, codes=new_codes, names=new_names, verify_integrity=False
  1999. )
    def _get_codes_for_sorting(self) -> list[Categorical]:
        """
        we are categorizing our codes by using the
        available categories (all, not just observed)
        excluding any missing ones (-1); this is in preparation
        for sorting, where we need to disambiguate that -1 is not
        a valid value
        """

        def cats(level_codes):
            return np.arange(
                np.array(level_codes).max() + 1 if len(level_codes) else 0,
                dtype=level_codes.dtype,
            )

        return [
            Categorical.from_codes(level_codes, cats(level_codes), ordered=True)
            for level_codes in self.codes
        ]
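    # Example (sketch): codes [0, 2, -1] with a maximum code of 2 become an
    # ordered Categorical over categories [0, 1, 2]; the -1 stays missing
    # rather than being treated as a real (and smallest) category.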
  2017. def sortlevel(
  2018. self,
  2019. level: IndexLabel = 0,
  2020. ascending: bool | list[bool] = True,
  2021. sort_remaining: bool = True,
  2022. ) -> tuple[MultiIndex, npt.NDArray[np.intp]]:
  2023. """
  2024. Sort MultiIndex at the requested level.
  2025. The result will respect the original ordering of the associated
  2026. factor at that level.
  2027. Parameters
  2028. ----------
  2029. level : list-like, int or str, default 0
  2030. If a string is given, must be a name of the level.
  2031. If list-like must be names or ints of levels.
  2032. ascending : bool, default True
  2033. False to sort in descending order.
  2034. Can also be a list to specify a directed ordering.
sort_remaining : bool, default True
    Sort by the remaining levels after level.
  2036. Returns
  2037. -------
  2038. sorted_index : pd.MultiIndex
  2039. Resulting index.
  2040. indexer : np.ndarray[np.intp]
  2041. Indices of output values in original index.
  2042. Examples
  2043. --------
  2044. >>> mi = pd.MultiIndex.from_arrays([[0, 0], [2, 1]])
  2045. >>> mi
  2046. MultiIndex([(0, 2),
  2047. (0, 1)],
  2048. )
  2049. >>> mi.sortlevel()
  2050. (MultiIndex([(0, 1),
  2051. (0, 2)],
  2052. ), array([1, 0]))
  2053. >>> mi.sortlevel(sort_remaining=False)
  2054. (MultiIndex([(0, 2),
  2055. (0, 1)],
  2056. ), array([0, 1]))
  2057. >>> mi.sortlevel(1)
  2058. (MultiIndex([(0, 1),
  2059. (0, 2)],
  2060. ), array([1, 0]))
  2061. >>> mi.sortlevel(1, ascending=False)
  2062. (MultiIndex([(0, 2),
  2063. (0, 1)],
  2064. ), array([0, 1]))
  2065. """
  2066. if not is_list_like(level):
  2067. level = [level]
  2068. # error: Item "Hashable" of "Union[Hashable, Sequence[Hashable]]" has
  2069. # no attribute "__iter__" (not iterable)
  2070. level = [
  2071. self._get_level_number(lev) for lev in level # type: ignore[union-attr]
  2072. ]
  2073. sortorder = None
  2074. # we have a directed ordering via ascending
  2075. if isinstance(ascending, list):
  2076. if not len(level) == len(ascending):
  2077. raise ValueError("level must have same length as ascending")
  2078. indexer = lexsort_indexer(
  2079. [self.codes[lev] for lev in level], orders=ascending
  2080. )
  2081. # level ordering
  2082. else:
  2083. codes = list(self.codes)
  2084. shape = list(self.levshape)
  2085. # partition codes and shape
  2086. primary = tuple(codes[lev] for lev in level)
  2087. primshp = tuple(shape[lev] for lev in level)
  2088. # Reverse sorted to retain the order of
  2089. # smaller indices that needs to be removed
  2090. for lev in sorted(level, reverse=True):
  2091. codes.pop(lev)
  2092. shape.pop(lev)
  2093. if sort_remaining:
  2094. primary += primary + tuple(codes)
  2095. primshp += primshp + tuple(shape)
  2096. else:
  2097. sortorder = level[0]
  2098. indexer = indexer_from_factorized(primary, primshp, compress=False)
  2099. if not ascending:
  2100. indexer = indexer[::-1]
  2101. indexer = ensure_platform_int(indexer)
  2102. new_codes = [level_codes.take(indexer) for level_codes in self.codes]
  2103. new_index = MultiIndex(
  2104. codes=new_codes,
  2105. levels=self.levels,
  2106. names=self.names,
  2107. sortorder=sortorder,
  2108. verify_integrity=False,
  2109. )
  2110. return new_index, indexer
  2111. def _wrap_reindex_result(self, target, indexer, preserve_names: bool):
  2112. if not isinstance(target, MultiIndex):
  2113. if indexer is None:
  2114. target = self
  2115. elif (indexer >= 0).all():
  2116. target = self.take(indexer)
  2117. else:
  2118. try:
  2119. target = MultiIndex.from_tuples(target)
  2120. except TypeError:
  2121. # not all tuples, see test_constructor_dict_multiindex_reindex_flat
  2122. return target
  2123. target = self._maybe_preserve_names(target, preserve_names)
  2124. return target
  2125. def _maybe_preserve_names(self, target: Index, preserve_names: bool) -> Index:
  2126. if (
  2127. preserve_names
  2128. and target.nlevels == self.nlevels
  2129. and target.names != self.names
  2130. ):
  2131. target = target.copy(deep=False)
  2132. target.names = self.names
  2133. return target
    # --------------------------------------------------------------------
    # Indexing Methods

    def _check_indexing_error(self, key) -> None:
        if not is_hashable(key) or is_iterator(key):
            # We allow tuples if they are hashable, whereas other Index
            # subclasses require scalar.
            # We have to explicitly exclude generators, as these are hashable.
            raise InvalidIndexError(key)

    @cache_readonly
    def _should_fallback_to_positional(self) -> bool:
        """
        Should integer key(s) be treated as positional?
        """
        # GH#33355
        return self.levels[0]._should_fallback_to_positional
  2149. def _get_indexer_strict(
  2150. self, key, axis_name: str
  2151. ) -> tuple[Index, npt.NDArray[np.intp]]:
  2152. keyarr = key
  2153. if not isinstance(keyarr, Index):
  2154. keyarr = com.asarray_tuplesafe(keyarr)
  2155. if len(keyarr) and not isinstance(keyarr[0], tuple):
  2156. indexer = self._get_indexer_level_0(keyarr)
  2157. self._raise_if_missing(key, indexer, axis_name)
  2158. return self[indexer], indexer
  2159. return super()._get_indexer_strict(key, axis_name)
  2160. def _raise_if_missing(self, key, indexer, axis_name: str) -> None:
  2161. keyarr = key
  2162. if not isinstance(key, Index):
  2163. keyarr = com.asarray_tuplesafe(key)
  2164. if len(keyarr) and not isinstance(keyarr[0], tuple):
  2165. # i.e. same condition for special case in MultiIndex._get_indexer_strict
  2166. mask = indexer == -1
  2167. if mask.any():
  2168. check = self.levels[0].get_indexer(keyarr)
  2169. cmask = check == -1
  2170. if cmask.any():
  2171. raise KeyError(f"{keyarr[cmask]} not in index")
  2172. # We get here when levels still contain values which are not
  2173. # actually in Index anymore
  2174. raise KeyError(f"{keyarr} not in index")
  2175. else:
  2176. return super()._raise_if_missing(key, indexer, axis_name)
    def _get_indexer_level_0(self, target) -> npt.NDArray[np.intp]:
        """
        Optimized equivalent to `self.get_level_values(0).get_indexer_for(target)`.
        """
        lev = self.levels[0]
        codes = self._codes[0]
        cat = Categorical.from_codes(codes=codes, categories=lev)
        ci = Index(cat)
        return ci.get_indexer_for(target)
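    # Wrapping the level-0 codes in a Categorical avoids materializing the full
    # get_level_values(0) array; get_indexer_for then works off the existing
    # codes and the (typically much smaller) level values.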
  2186. def get_slice_bound(
  2187. self,
  2188. label: Hashable | Sequence[Hashable],
  2189. side: Literal["left", "right"],
  2190. ) -> int:
  2191. """
  2192. For an ordered MultiIndex, compute slice bound
  2193. that corresponds to given label.
Returns leftmost (one-past-the-rightmost if ``side=='right'``) position
  2195. of given label.
  2196. Parameters
  2197. ----------
  2198. label : object or tuple of objects
  2199. side : {'left', 'right'}
  2200. Returns
  2201. -------
  2202. int
  2203. Index of label.
  2204. Notes
  2205. -----
  2206. This method only works if level 0 index of the MultiIndex is lexsorted.
  2207. Examples
  2208. --------
  2209. >>> mi = pd.MultiIndex.from_arrays([list('abbc'), list('gefd')])
  2210. Get the locations from the leftmost 'b' in the first level
  2211. until the end of the multiindex:
  2212. >>> mi.get_slice_bound('b', side="left")
  2213. 1
  2214. Like above, but if you get the locations from the rightmost
  2215. 'b' in the first level and 'f' in the second level:
  2216. >>> mi.get_slice_bound(('b','f'), side="right")
  2217. 3
  2218. See Also
  2219. --------
  2220. MultiIndex.get_loc : Get location for a label or a tuple of labels.
  2221. MultiIndex.get_locs : Get location for a label/slice/list/mask or a
  2222. sequence of such.
  2223. """
  2224. if not isinstance(label, tuple):
  2225. label = (label,)
  2226. return self._partial_tup_index(label, side=side)
  2227. # pylint: disable-next=useless-parent-delegation
  2228. def slice_locs(self, start=None, end=None, step=None) -> tuple[int, int]:
  2229. """
  2230. For an ordered MultiIndex, compute the slice locations for input
  2231. labels.
  2232. The input labels can be tuples representing partial levels, e.g. for a
  2233. MultiIndex with 3 levels, you can pass a single value (corresponding to
  2234. the first level), or a 1-, 2-, or 3-tuple.
  2235. Parameters
  2236. ----------
  2237. start : label or tuple, default None
  2238. If None, defaults to the beginning
  2239. end : label or tuple
  2240. If None, defaults to the end
  2241. step : int or None
  2242. Slice step
  2243. Returns
  2244. -------
  2245. (start, end) : (int, int)
  2246. Notes
  2247. -----
  2248. This method only works if the MultiIndex is properly lexsorted. So,
  2249. if only the first 2 levels of a 3-level MultiIndex are lexsorted,
  2250. you can only pass two levels to ``.slice_locs``.
  2251. Examples
  2252. --------
  2253. >>> mi = pd.MultiIndex.from_arrays([list('abbd'), list('deff')],
  2254. ... names=['A', 'B'])
  2255. Get the slice locations from the beginning of 'b' in the first level
  2256. until the end of the multiindex:
  2257. >>> mi.slice_locs(start='b')
  2258. (1, 4)
  2259. Like above, but stop at the end of 'b' in the first level and 'f' in
  2260. the second level:
  2261. >>> mi.slice_locs(start='b', end=('b', 'f'))
  2262. (1, 3)
  2263. See Also
  2264. --------
  2265. MultiIndex.get_loc : Get location for a label or a tuple of labels.
  2266. MultiIndex.get_locs : Get location for a label/slice/list/mask or a
  2267. sequence of such.
  2268. """
  2269. # This function adds nothing to its parent implementation (the magic
  2270. # happens in get_slice_bound method), but it adds meaningful doc.
  2271. return super().slice_locs(start, end, step)
  2272. def _partial_tup_index(self, tup: tuple, side: Literal["left", "right"] = "left"):
  2273. if len(tup) > self._lexsort_depth:
  2274. raise UnsortedIndexError(
  2275. f"Key length ({len(tup)}) was greater than MultiIndex lexsort depth "
  2276. f"({self._lexsort_depth})"
  2277. )
  2278. n = len(tup)
  2279. start, end = 0, len(self)
  2280. zipped = zip(tup, self.levels, self.codes)
  2281. for k, (lab, lev, level_codes) in enumerate(zipped):
  2282. section = level_codes[start:end]
  2283. if lab not in lev and not isna(lab):
  2284. # short circuit
  2285. try:
  2286. loc = algos.searchsorted(lev, lab, side=side)
  2287. except TypeError as err:
  2288. # non-comparable e.g. test_slice_locs_with_type_mismatch
  2289. raise TypeError(f"Level type mismatch: {lab}") from err
  2290. if not is_integer(loc):
  2291. # non-comparable level, e.g. test_groupby_example
  2292. raise TypeError(f"Level type mismatch: {lab}")
  2293. if side == "right" and loc >= 0:
  2294. loc -= 1
  2295. return start + algos.searchsorted(section, loc, side=side)
  2296. idx = self._get_loc_single_level_index(lev, lab)
  2297. if isinstance(idx, slice) and k < n - 1:
  2298. # Get start and end value from slice, necessary when a non-integer
  2299. # interval is given as input GH#37707
  2300. start = idx.start
  2301. end = idx.stop
  2302. elif k < n - 1:
  2303. # error: Incompatible types in assignment (expression has type
  2304. # "Union[ndarray[Any, dtype[signedinteger[Any]]]
  2305. end = start + algos.searchsorted( # type: ignore[assignment]
  2306. section, idx, side="right"
  2307. )
  2308. # error: Incompatible types in assignment (expression has type
  2309. # "Union[ndarray[Any, dtype[signedinteger[Any]]]
  2310. start = start + algos.searchsorted( # type: ignore[assignment]
  2311. section, idx, side="left"
  2312. )
  2313. elif isinstance(idx, slice):
  2314. idx = idx.start
  2315. return start + algos.searchsorted(section, idx, side=side)
  2316. else:
  2317. return start + algos.searchsorted(section, idx, side=side)
  2318. def _get_loc_single_level_index(self, level_index: Index, key: Hashable) -> int:
  2319. """
  2320. If key is NA value, location of index unify as -1.
  2321. Parameters
  2322. ----------
  2323. level_index: Index
  2324. key : label
  2325. Returns
  2326. -------
  2327. loc : int
  2328. If key is NA value, loc is -1
  2329. Else, location of key in index.
  2330. See Also
  2331. --------
  2332. Index.get_loc : The get_loc method for (single-level) index.
  2333. """
  2334. if is_scalar(key) and isna(key):
  2335. # TODO: need is_valid_na_for_dtype(key, level_index.dtype)
  2336. return -1
  2337. else:
  2338. return level_index.get_loc(key)
  2339. def get_loc(self, key):
  2340. """
  2341. Get location for a label or a tuple of labels.
  2342. The location is returned as an integer/slice or boolean
  2343. mask.
  2344. Parameters
  2345. ----------
  2346. key : label or tuple of labels (one for each level)
  2347. Returns
  2348. -------
  2349. int, slice object or boolean mask
  2350. If the key is past the lexsort depth, the return may be a
  2351. boolean mask array, otherwise it is always a slice or int.
  2352. See Also
  2353. --------
  2354. Index.get_loc : The get_loc method for (single-level) index.
  2355. MultiIndex.slice_locs : Get slice location given start label(s) and
  2356. end label(s).
  2357. MultiIndex.get_locs : Get location for a label/slice/list/mask or a
  2358. sequence of such.
  2359. Notes
  2360. -----
  2361. The key cannot be a slice, list of same-level labels, a boolean mask,
  2362. or a sequence of such. If you want to use those, use
  2363. :meth:`MultiIndex.get_locs` instead.
  2364. Examples
  2365. --------
  2366. >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])
  2367. >>> mi.get_loc('b')
  2368. slice(1, 3, None)
  2369. >>> mi.get_loc(('b', 'e'))
  2370. 1
  2371. """
  2372. self._check_indexing_error(key)
  2373. def _maybe_to_slice(loc):
  2374. """convert integer indexer to boolean mask or slice if possible"""
  2375. if not isinstance(loc, np.ndarray) or loc.dtype != np.intp:
  2376. return loc
  2377. loc = lib.maybe_indices_to_slice(loc, len(self))
  2378. if isinstance(loc, slice):
  2379. return loc
  2380. mask = np.empty(len(self), dtype="bool")
  2381. mask.fill(False)
  2382. mask[loc] = True
  2383. return mask
  2384. if not isinstance(key, tuple):
  2385. loc = self._get_level_indexer(key, level=0)
  2386. return _maybe_to_slice(loc)
  2387. keylen = len(key)
  2388. if self.nlevels < keylen:
  2389. raise KeyError(
  2390. f"Key length ({keylen}) exceeds index depth ({self.nlevels})"
  2391. )
  2392. if keylen == self.nlevels and self.is_unique:
  2393. # TODO: what if we have an IntervalIndex level?
  2394. # i.e. do we need _index_as_unique on that level?
  2395. try:
  2396. return self._engine.get_loc(key)
  2397. except TypeError:
  2398. # e.g. test_partial_slicing_with_multiindex partial string slicing
  2399. loc, _ = self.get_loc_level(key, list(range(self.nlevels)))
  2400. return loc
  2401. # -- partial selection or non-unique index
  2402. # break the key into 2 parts based on the lexsort_depth of the index;
  2403. # the first part returns a continuous slice of the index; the 2nd part
  2404. # needs linear search within the slice
  2405. i = self._lexsort_depth
  2406. lead_key, follow_key = key[:i], key[i:]
  2407. if not lead_key:
  2408. start = 0
  2409. stop = len(self)
  2410. else:
  2411. try:
  2412. start, stop = self.slice_locs(lead_key, lead_key)
  2413. except TypeError as err:
  2414. # e.g. test_groupby_example key = ((0, 0, 1, 2), "new_col")
  2415. # when self has 5 integer levels
  2416. raise KeyError(key) from err
  2417. if start == stop:
  2418. raise KeyError(key)
  2419. if not follow_key:
  2420. return slice(start, stop)
  2421. warnings.warn(
  2422. "indexing past lexsort depth may impact performance.",
  2423. PerformanceWarning,
  2424. stacklevel=find_stack_level(),
  2425. )
  2426. loc = np.arange(start, stop, dtype=np.intp)
  2427. for i, k in enumerate(follow_key, len(lead_key)):
  2428. mask = self.codes[i][loc] == self._get_loc_single_level_index(
  2429. self.levels[i], k
  2430. )
  2431. if not mask.all():
  2432. loc = loc[mask]
  2433. if not len(loc):
  2434. raise KeyError(key)
  2435. return _maybe_to_slice(loc) if len(loc) != stop - start else slice(start, stop)
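    # Example (sketch): on a non-unique index with lexsort depth 1, get_loc of
    # ("b", "e") first narrows to the contiguous slice of rows whose first
    # label is "b" via slice_locs, then linearly filters that slice on the
    # second label, returning either a slice or a boolean mask.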
  2436. def get_loc_level(self, key, level: IndexLabel = 0, drop_level: bool = True):
  2437. """
  2438. Get location and sliced index for requested label(s)/level(s).
  2439. Parameters
  2440. ----------
  2441. key : label or sequence of labels
  2442. level : int/level name or list thereof, optional
  2443. drop_level : bool, default True
  2444. If ``False``, the resulting index will not drop any level.
  2445. Returns
  2446. -------
  2447. tuple
  2448. A 2-tuple where the elements :
  2449. Element 0: int, slice object or boolean array.
  2450. Element 1: The resulting sliced multiindex/index. If the key
  2451. contains all levels, this will be ``None``.
  2452. See Also
  2453. --------
  2454. MultiIndex.get_loc : Get location for a label or a tuple of labels.
  2455. MultiIndex.get_locs : Get location for a label/slice/list/mask or a
  2456. sequence of such.
  2457. Examples
  2458. --------
  2459. >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')],
  2460. ... names=['A', 'B'])
  2461. >>> mi.get_loc_level('b')
  2462. (slice(1, 3, None), Index(['e', 'f'], dtype='object', name='B'))
  2463. >>> mi.get_loc_level('e', level='B')
  2464. (array([False, True, False]), Index(['b'], dtype='object', name='A'))
  2465. >>> mi.get_loc_level(['b', 'e'])
  2466. (1, None)
  2467. """
  2468. if not isinstance(level, (list, tuple)):
  2469. level = self._get_level_number(level)
  2470. else:
  2471. level = [self._get_level_number(lev) for lev in level]
  2472. loc, mi = self._get_loc_level(key, level=level)
  2473. if not drop_level:
  2474. if lib.is_integer(loc):
  2475. mi = self[loc : loc + 1]
  2476. else:
  2477. mi = self[loc]
  2478. return loc, mi
  2479. def _get_loc_level(self, key, level: int | list[int] = 0):
  2480. """
  2481. get_loc_level but with `level` known to be positional, not name-based.
  2482. """
  2483. # different name to distinguish from maybe_droplevels
  2484. def maybe_mi_droplevels(indexer, levels):
  2485. """
  2486. If level does not exist or all levels were dropped, the exception
  2487. has to be handled outside.
  2488. """
  2489. new_index = self[indexer]
  2490. for i in sorted(levels, reverse=True):
  2491. new_index = new_index._drop_level_numbers([i])
  2492. return new_index
  2493. if isinstance(level, (tuple, list)):
  2494. if len(key) != len(level):
  2495. raise AssertionError(
  2496. "Key for location must have same length as number of levels"
  2497. )
  2498. result = None
  2499. for lev, k in zip(level, key):
  2500. loc, new_index = self._get_loc_level(k, level=lev)
  2501. if isinstance(loc, slice):
  2502. mask = np.zeros(len(self), dtype=bool)
  2503. mask[loc] = True
  2504. loc = mask
  2505. result = loc if result is None else result & loc
  2506. try:
  2507. # FIXME: we should be only dropping levels on which we are
  2508. # scalar-indexing
  2509. mi = maybe_mi_droplevels(result, level)
  2510. except ValueError:
  2511. # droplevel failed because we tried to drop all levels,
  2512. # i.e. len(level) == self.nlevels
  2513. mi = self[result]
  2514. return result, mi
  2515. # kludge for #1796
  2516. if isinstance(key, list):
  2517. key = tuple(key)
  2518. if isinstance(key, tuple) and level == 0:
  2519. try:
  2520. # Check if this tuple is a single key in our first level
  2521. if key in self.levels[0]:
  2522. indexer = self._get_level_indexer(key, level=level)
  2523. new_index = maybe_mi_droplevels(indexer, [0])
  2524. return indexer, new_index
  2525. except (TypeError, InvalidIndexError):
  2526. pass
  2527. if not any(isinstance(k, slice) for k in key):
  2528. if len(key) == self.nlevels and self.is_unique:
  2529. # Complete key in unique index -> standard get_loc
  2530. try:
  2531. return (self._engine.get_loc(key), None)
  2532. except KeyError as err:
  2533. raise KeyError(key) from err
  2534. except TypeError:
  2535. # e.g. partial string indexing
  2536. # test_partial_string_timestamp_multiindex
  2537. pass
  2538. # partial selection
  2539. indexer = self.get_loc(key)
  2540. ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)]
  2541. if len(ilevels) == self.nlevels:
  2542. if is_integer(indexer):
  2543. # we are dropping all levels
  2544. return indexer, None
  2545. # TODO: in some cases we still need to drop some levels,
  2546. # e.g. test_multiindex_perf_warn
  2547. # test_partial_string_timestamp_multiindex
  2548. ilevels = [
  2549. i
  2550. for i in range(len(key))
  2551. if (
  2552. not isinstance(key[i], str)
  2553. or not self.levels[i]._supports_partial_string_indexing
  2554. )
  2555. and key[i] != slice(None, None)
  2556. ]
  2557. if len(ilevels) == self.nlevels:
  2558. # TODO: why?
  2559. ilevels = []
  2560. return indexer, maybe_mi_droplevels(indexer, ilevels)
  2561. else:
  2562. indexer = None
  2563. for i, k in enumerate(key):
  2564. if not isinstance(k, slice):
  2565. loc_level = self._get_level_indexer(k, level=i)
  2566. if isinstance(loc_level, slice):
  2567. if com.is_null_slice(loc_level) or com.is_full_slice(
  2568. loc_level, len(self)
  2569. ):
  2570. # everything
  2571. continue
  2572. # e.g. test_xs_IndexSlice_argument_not_implemented
  2573. k_index = np.zeros(len(self), dtype=bool)
  2574. k_index[loc_level] = True
  2575. else:
  2576. k_index = loc_level
  2577. elif com.is_null_slice(k):
  2578. # taking everything, does not affect `indexer` below
  2579. continue
  2580. else:
  2581. # FIXME: this message can be inaccurate, e.g.
  2582. # test_series_varied_multiindex_alignment
  2583. raise TypeError(f"Expected label or tuple of labels, got {key}")
  2584. if indexer is None:
  2585. indexer = k_index
  2586. else:
  2587. indexer &= k_index
  2588. if indexer is None:
  2589. indexer = slice(None, None)
  2590. ilevels = [i for i in range(len(key)) if key[i] != slice(None, None)]
  2591. return indexer, maybe_mi_droplevels(indexer, ilevels)
  2592. else:
  2593. indexer = self._get_level_indexer(key, level=level)
  2594. if (
  2595. isinstance(key, str)
  2596. and self.levels[level]._supports_partial_string_indexing
  2597. ):
  2598. # check to see if we did an exact lookup vs sliced
  2599. check = self.levels[level].get_loc(key)
  2600. if not is_integer(check):
  2601. # e.g. test_partial_string_timestamp_multiindex
  2602. return indexer, self[indexer]
  2603. try:
  2604. result_index = maybe_mi_droplevels(indexer, [level])
  2605. except ValueError:
  2606. result_index = self[indexer]
  2607. return indexer, result_index
  2608. def _get_level_indexer(
  2609. self, key, level: int = 0, indexer: npt.NDArray[np.bool_] | None = None
  2610. ):
  2611. # `level` kwarg is _always_ positional, never name
  2612. # return a boolean array or slice showing where the key is
  2613. # in the totality of values
  2614. # if the indexer is provided, then use this
  2615. level_index = self.levels[level]
  2616. level_codes = self.codes[level]
  2617. def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
  2618. # Compute a bool indexer to identify the positions to take.
  2619. # If we have an existing indexer, we only need to examine the
  2620. # subset of positions where the existing indexer is True.
  2621. if indexer is not None:
  2622. # we only need to look at the subset of codes where the
  2623. # existing indexer equals True
  2624. codes = codes[indexer]
  2625. if step is None or step == 1:
  2626. new_indexer = (codes >= start) & (codes < stop)
  2627. else:
  2628. r = np.arange(start, stop, step, dtype=codes.dtype)
  2629. new_indexer = algos.isin(codes, r)
  2630. if indexer is None:
  2631. return new_indexer
  2632. indexer = indexer.copy()
  2633. indexer[indexer] = new_indexer
  2634. return indexer
  2635. if isinstance(key, slice):
  2636. # handle a slice, returning a slice if we can
  2637. # otherwise a boolean indexer
  2638. step = key.step
  2639. is_negative_step = step is not None and step < 0
  2640. try:
  2641. if key.start is not None:
  2642. start = level_index.get_loc(key.start)
  2643. elif is_negative_step:
  2644. start = len(level_index) - 1
  2645. else:
  2646. start = 0
  2647. if key.stop is not None:
  2648. stop = level_index.get_loc(key.stop)
  2649. elif is_negative_step:
  2650. stop = 0
  2651. elif isinstance(start, slice):
  2652. stop = len(level_index)
  2653. else:
  2654. stop = len(level_index) - 1
  2655. except KeyError:
  2656. # we have a partial slice (like looking up a partial date
  2657. # string)
  2658. start = stop = level_index.slice_indexer(key.start, key.stop, key.step)
  2659. step = start.step
  2660. if isinstance(start, slice) or isinstance(stop, slice):
  2661. # we have a slice for start and/or stop
  2662. # a partial date slicer on a DatetimeIndex generates a slice
  2663. # note that the stop ALREADY includes the stopped point (if
  2664. # it was a string sliced)
  2665. start = getattr(start, "start", start)
  2666. stop = getattr(stop, "stop", stop)
  2667. return convert_indexer(start, stop, step)
  2668. elif level > 0 or self._lexsort_depth == 0 or step is not None:
# need right-searching semantics here, like when we are using a
# slice, so adjust the stop by 1 (so that we include stop)
  2672. stop = (stop - 1) if is_negative_step else (stop + 1)
  2673. return convert_indexer(start, stop, step)
            else:
                # sorted, so can return slice object -> view
                i = algos.searchsorted(level_codes, start, side="left")
                j = algos.searchsorted(level_codes, stop, side="right")
                return slice(i, j, step)

        else:
            idx = self._get_loc_single_level_index(level_index, key)

            if level > 0 or self._lexsort_depth == 0:
                # Desired level is not sorted
                if isinstance(idx, slice):
                    # test_get_loc_partial_timestamp_multiindex
                    locs = (level_codes >= idx.start) & (level_codes < idx.stop)
                    return locs

                locs = np.array(level_codes == idx, dtype=bool, copy=False)

                if not locs.any():
                    # The label is present in self.levels[level] but unused:
                    raise KeyError(key)
                return locs

            if isinstance(idx, slice):
                # e.g. test_partial_string_timestamp_multiindex
                start = algos.searchsorted(level_codes, idx.start, side="left")
                # NB: "left" here bc of slice semantics
                end = algos.searchsorted(level_codes, idx.stop, side="left")
            else:
                start = algos.searchsorted(level_codes, idx, side="left")
                end = algos.searchsorted(level_codes, idx, side="right")

            if start == end:
                # The label is present in self.levels[level] but unused:
                raise KeyError(key)
            return slice(start, end)

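    # Worked example (illustrative sketch, values assumed from the logic above):
    # for a lexsorted MultiIndex such as
    #     mi = pd.MultiIndex.from_arrays([list("aabb"), [1, 2, 1, 2]])
    # the level-0 codes are [0, 0, 1, 1], so
    #     mi._get_level_indexer("b", level=0)
    # resolves "b" to code 1 and the searchsorted branch is expected to return
    # slice(2, 4, None), a view-friendly positional slice.  On an unsorted level
    # (or for level > 0) the same call returns a boolean mask over all rows instead.
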
    def get_locs(self, seq):
        """
        Get location for a sequence of labels.

        Parameters
        ----------
        seq : label, slice, list, mask or a sequence of such
            You should use one of the above for each level.
            If a level should not be used, set it to ``slice(None)``.

        Returns
        -------
        numpy.ndarray
            NumPy array of integers suitable for passing to iloc.

        See Also
        --------
        MultiIndex.get_loc : Get location for a label or a tuple of labels.
        MultiIndex.slice_locs : Get slice location given start label(s) and
            end label(s).

        Examples
        --------
        >>> mi = pd.MultiIndex.from_arrays([list('abb'), list('def')])

        >>> mi.get_locs('b')  # doctest: +SKIP
        array([1, 2], dtype=int64)

        >>> mi.get_locs([slice(None), ['e', 'f']])  # doctest: +SKIP
        array([1, 2], dtype=int64)

        >>> mi.get_locs([[True, False, True], slice('e', 'f')])  # doctest: +SKIP
        array([2], dtype=int64)
        """
        # must be lexsorted to at least as many levels
        true_slices = [i for (i, s) in enumerate(com.is_true_slices(seq)) if s]
        if true_slices and true_slices[-1] >= self._lexsort_depth:
            raise UnsortedIndexError(
                "MultiIndex slicing requires the index to be lexsorted: slicing "
                f"on levels {true_slices}, lexsort depth {self._lexsort_depth}"
            )

        if any(x is Ellipsis for x in seq):
            raise NotImplementedError(
                "MultiIndex does not support indexing with Ellipsis"
            )

        n = len(self)

        def _to_bool_indexer(indexer) -> npt.NDArray[np.bool_]:
            if isinstance(indexer, slice):
                new_indexer = np.zeros(n, dtype=np.bool_)
                new_indexer[indexer] = True
                return new_indexer
            return indexer

        # a bool indexer for the positions we want to take
        indexer: npt.NDArray[np.bool_] | None = None

        for i, k in enumerate(seq):
            lvl_indexer: npt.NDArray[np.bool_] | slice | None = None

            if com.is_bool_indexer(k):
                if len(k) != n:
                    raise ValueError(
                        "cannot index with a boolean indexer that "
                        "is not the same length as the index"
                    )
                lvl_indexer = np.asarray(k)

            elif is_list_like(k):
                # a collection of labels to include from this level (these are or'd)
                # GH#27591 check if this is a single tuple key in the level
                try:
                    lvl_indexer = self._get_level_indexer(k, level=i, indexer=indexer)
                except (InvalidIndexError, TypeError, KeyError) as err:
                    # InvalidIndexError e.g. non-hashable, fall back to treating
                    # this as a sequence of labels
                    # KeyError it can be ambiguous if this is a label or sequence
                    # of labels
                    # github.com/pandas-dev/pandas/issues/39424#issuecomment-871626708
                    for x in k:
                        if not is_hashable(x):
                            # e.g. slice
                            raise err
                        # GH 39424: Ignore not founds
                        # GH 42351: No longer ignore not founds & enforced in 2.0
                        # TODO: how to handle IntervalIndex level? (no test cases)
                        item_indexer = self._get_level_indexer(
                            x, level=i, indexer=indexer
                        )
                        if lvl_indexer is None:
                            lvl_indexer = _to_bool_indexer(item_indexer)
                        elif isinstance(item_indexer, slice):
                            lvl_indexer[item_indexer] = True  # type: ignore[index]
                        else:
                            lvl_indexer |= item_indexer

                if lvl_indexer is None:
                    # no matches we are done
                    # test_loc_getitem_duplicates_multiindex_empty_indexer
                    return np.array([], dtype=np.intp)

            elif com.is_null_slice(k):
                # empty slice
                if indexer is None and i == len(seq) - 1:
                    return np.arange(n, dtype=np.intp)
                continue

            else:
                # a slice or a single label
                lvl_indexer = self._get_level_indexer(k, level=i, indexer=indexer)

            # update indexer
            lvl_indexer = _to_bool_indexer(lvl_indexer)
            if indexer is None:
                indexer = lvl_indexer
            else:
                indexer &= lvl_indexer
                if not np.any(indexer) and np.any(lvl_indexer):
                    raise KeyError(seq)

        # empty indexer
        if indexer is None:
            return np.array([], dtype=np.intp)

        pos_indexer = indexer.nonzero()[0]
        return self._reorder_indexer(seq, pos_indexer)

    # --------------------------------------------------------------------

    def _reorder_indexer(
        self,
        seq: tuple[Scalar | Iterable | AnyArrayLike, ...],
        indexer: npt.NDArray[np.intp],
    ) -> npt.NDArray[np.intp]:
        """
        Reorder an indexer of a MultiIndex (self) so that the labels are in the
        same order as given in seq

        Parameters
        ----------
        seq : label/slice/list/mask or a sequence of such
        indexer : a position indexer of self

        Returns
        -------
        indexer : a sorted position indexer of self ordered as seq
        """
        # check if sorting is necessary
        need_sort = False
        for i, k in enumerate(seq):
            if com.is_null_slice(k) or com.is_bool_indexer(k) or is_scalar(k):
                pass
            elif is_list_like(k):
                if len(k) <= 1:  # type: ignore[arg-type]
                    pass
                elif self._is_lexsorted():
                    # If the index is lexsorted and the list-like labels in seq
                    # are sorted, then we do not need to sort
                    k_codes = self.levels[i].get_indexer(k)
                    k_codes = k_codes[k_codes >= 0]  # Filter absent keys
                    # True if the given codes are not ordered
                    need_sort = (k_codes[:-1] > k_codes[1:]).any()
                else:
                    need_sort = True
            elif isinstance(k, slice):
                if self._is_lexsorted():
                    need_sort = k.step is not None and k.step < 0
                else:
                    need_sort = True
            else:
                need_sort = True
            if need_sort:
                break
        if not need_sort:
            return indexer

        n = len(self)
        keys: tuple[np.ndarray, ...] = ()
        # For each level of the sequence in seq, map the level codes to the
        # order in which they appear in a list-like sequence.
        # This mapping is then used to reorder the indexer
        for i, k in enumerate(seq):
            if is_scalar(k):
                # GH#34603 we want to treat a scalar the same as an all equal list
                k = [k]
            if com.is_bool_indexer(k):
                new_order = np.arange(n)[indexer]
            elif is_list_like(k):
                # Generate a map with all level codes as sorted initially
                k = algos.unique(k)
                key_order_map = np.ones(len(self.levels[i]), dtype=np.uint64) * len(
                    self.levels[i]
                )
                # Set order as given in the indexer list
                level_indexer = self.levels[i].get_indexer(k)
                level_indexer = level_indexer[level_indexer >= 0]  # Filter absent keys
                key_order_map[level_indexer] = np.arange(len(level_indexer))

                new_order = key_order_map[self.codes[i][indexer]]
            elif isinstance(k, slice) and k.step is not None and k.step < 0:
                # flip order for negative step
                new_order = np.arange(n)[::-1][indexer]
            elif isinstance(k, slice) and k.start is None and k.stop is None:
                # slice(None) should not determine order GH#31330
                new_order = np.ones((n,), dtype=np.intp)[indexer]
            else:
                # For all other cases, use the same order as the level
                new_order = np.arange(n)[indexer]
            keys = (new_order,) + keys

        # Find the reordering using lexsort on the keys mapping
        ind = np.lexsort(keys)
        return indexer[ind]

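    # Worked example (illustrative sketch): with
    #     mi = pd.MultiIndex.from_arrays([list("abb"), list("def")])
    # a list-like key that is not in level order, e.g.
    #     mi.get_locs([["b", "a"], slice(None)])
    # first matches positions [0, 1, 2] and then _reorder_indexer sorts them so
    # the "b" rows come before the "a" row, giving array([1, 2, 0]) rather than
    # the positional order array([0, 1, 2]).
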
    def truncate(self, before=None, after=None) -> MultiIndex:
        """
        Slice index between two labels / tuples, return new MultiIndex.

        Parameters
        ----------
        before : label or tuple, can be partial. Default None
            None defaults to start.
        after : label or tuple, can be partial. Default None
            None defaults to end.

        Returns
        -------
        MultiIndex
            The truncated MultiIndex.

        Examples
        --------
        >>> mi = pd.MultiIndex.from_arrays([['a', 'b', 'c'], ['x', 'y', 'z']])
        >>> mi
        MultiIndex([('a', 'x'), ('b', 'y'), ('c', 'z')],
                   )
        >>> mi.truncate(before='a', after='b')
        MultiIndex([('a', 'x'), ('b', 'y')],
                   )
        """
        if after and before and after < before:
            raise ValueError("after < before")

        i, j = self.levels[0].slice_locs(before, after)
        left, right = self.slice_locs(before, after)

        new_levels = list(self.levels)
        new_levels[0] = new_levels[0][i:j]

        new_codes = [level_codes[left:right] for level_codes in self.codes]
        new_codes[0] = new_codes[0] - i

        return MultiIndex(
            levels=new_levels,
            codes=new_codes,
            names=self._names,
            verify_integrity=False,
        )

    def equals(self, other: object) -> bool:
        """
        Determines if two MultiIndex objects have the same labeling information
        (the levels themselves do not necessarily have to be the same)

        See Also
        --------
        equal_levels
        """
        if self.is_(other):
            return True

        if not isinstance(other, Index):
            return False

        if len(self) != len(other):
            return False

        if not isinstance(other, MultiIndex):
            # d-level MultiIndex can equal d-tuple Index
            if not self._should_compare(other):
                # object Index or Categorical[object] may contain tuples
                return False
            return array_equivalent(self._values, other._values)

        if self.nlevels != other.nlevels:
            return False

        for i in range(self.nlevels):
            self_codes = self.codes[i]
            other_codes = other.codes[i]
            self_mask = self_codes == -1
            other_mask = other_codes == -1
            if not np.array_equal(self_mask, other_mask):
                return False
            self_codes = self_codes[~self_mask]
            self_values = self.levels[i]._values.take(self_codes)

            other_codes = other_codes[~other_mask]
            other_values = other.levels[i]._values.take(other_codes)
            # since NaT is used for both datetime64 and timedelta64, we can have a
            # situation where a level is typed, say, timedelta64 in self (i.e. it
            # has values other than NaT) but typed datetime64 in other (where it
            # is all NaT); these are still equivalent
            if len(self_values) == 0 and len(other_values) == 0:
                continue

            if not isinstance(self_values, np.ndarray):
                # i.e. ExtensionArray
                if not self_values.equals(other_values):
                    return False
            elif not isinstance(other_values, np.ndarray):
                # i.e. other is ExtensionArray
                if not other_values.equals(self_values):
                    return False
            else:
                if not array_equivalent(self_values, other_values):
                    return False

        return True

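    # Worked example (illustrative sketch): equals() compares labeling, not the
    # container type, so a 2-level MultiIndex can compare equal to a flat Index
    # of 2-tuples, e.g.
    #     mi = pd.MultiIndex.from_tuples([("a", 1), ("b", 2)])
    #     mi.equals(pd.Index([("a", 1), ("b", 2)]))                 # expected: True
    #     mi.equals(pd.MultiIndex.from_tuples([("a", 1), ("b", 3)]))  # expected: False
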
    def equal_levels(self, other: MultiIndex) -> bool:
        """
        Return True if the levels of both MultiIndex objects are the same
        """
        if self.nlevels != other.nlevels:
            return False

        for i in range(self.nlevels):
            if not self.levels[i].equals(other.levels[i]):
                return False
        return True

    # --------------------------------------------------------------------
    # Set Methods

    def _union(self, other, sort) -> MultiIndex:
        other, result_names = self._convert_can_do_setop(other)
        if other.has_duplicates:
            # This is only necessary if other has dupes,
            # otherwise difference is faster
            result = super()._union(other, sort)

            if isinstance(result, MultiIndex):
                return result
            return MultiIndex.from_arrays(
                zip(*result), sortorder=None, names=result_names
            )
        else:
            right_missing = other.difference(self, sort=False)
            if len(right_missing):
                result = self.append(right_missing)
            else:
                result = self._get_reconciled_name_object(other)

            if sort is not False:
                try:
                    result = result.sort_values()
                except TypeError:
                    if sort is True:
                        raise
                    warnings.warn(
                        "The values in the array are unorderable. "
                        "Pass `sort=False` to suppress this warning.",
                        RuntimeWarning,
                        stacklevel=find_stack_level(),
                    )
            return result

    def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
        return is_object_dtype(dtype)

    def _get_reconciled_name_object(self, other) -> MultiIndex:
        """
        If the result of a set operation will be self,
        return self, unless the names change, in which
        case make a shallow copy of self.
        """
        names = self._maybe_match_names(other)
        if self.names != names:
            # error: Cannot determine type of "rename"
            return self.rename(names)  # type: ignore[has-type]
        return self

    def _maybe_match_names(self, other):
        """
        Try to find common names to attach to the result of an operation between
        a and b. Return a consensus list of names if they match at least partly
        or list of None if they have completely different names.
        """
        if len(self.names) != len(other.names):
            return [None] * len(self.names)
        names = []
        for a_name, b_name in zip(self.names, other.names):
            if a_name == b_name:
                names.append(a_name)
            else:
                # TODO: what if they both have np.nan for their names?
                names.append(None)
        return names

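    # Worked example (illustrative sketch): names are kept only where they agree,
    # e.g. for indexes named ("x", "y") and ("x", "z"), _maybe_match_names
    # returns ["x", None]; if the name lengths differ it returns all-None names.
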
    def _wrap_intersection_result(self, other, result) -> MultiIndex:
        _, result_names = self._convert_can_do_setop(other)
        return result.set_names(result_names)

    def _wrap_difference_result(self, other, result: MultiIndex) -> MultiIndex:
        _, result_names = self._convert_can_do_setop(other)

        if len(result) == 0:
            return result.remove_unused_levels().set_names(result_names)
        else:
            return result.set_names(result_names)

    def _convert_can_do_setop(self, other):
        result_names = self.names

        if not isinstance(other, Index):
            if len(other) == 0:
                return self[:0], self.names
            else:
                msg = "other must be a MultiIndex or a list of tuples"
                try:
                    other = MultiIndex.from_tuples(other, names=self.names)
                except (ValueError, TypeError) as err:
                    # ValueError raised by tuples_to_object_array if we
                    # have non-object dtype
                    raise TypeError(msg) from err
        else:
            result_names = get_unanimous_names(self, other)

        return other, result_names

    # --------------------------------------------------------------------

    @doc(Index.astype)
    def astype(self, dtype, copy: bool = True):
        dtype = pandas_dtype(dtype)
        if is_categorical_dtype(dtype):
            msg = "> 1 ndim Categorical are not supported at this time"
            raise NotImplementedError(msg)
        if not is_object_dtype(dtype):
            raise TypeError(
                "Setting a MultiIndex dtype to anything other than object "
                "is not supported"
            )
        if copy is True:
            return self._view()
        return self

    def _validate_fill_value(self, item):
        if isinstance(item, MultiIndex):
            # GH#43212
            if item.nlevels != self.nlevels:
                raise ValueError("Item must have length equal to number of levels.")
            return item._values
        elif not isinstance(item, tuple):
            # Pad the key with empty strings if lower levels of the key
            # aren't specified:
            item = (item,) + ("",) * (self.nlevels - 1)
        elif len(item) != self.nlevels:
            raise ValueError("Item must have length equal to number of levels.")
        return item

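    # Worked example (illustrative sketch): on a 2-level MultiIndex, a scalar
    # fill value such as "x" is padded to the tuple ("x", ""), a 2-tuple is
    # accepted unchanged, and a 3-tuple raises ValueError because its length
    # does not match the number of levels.
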
    def putmask(self, mask, value: MultiIndex) -> MultiIndex:
        """
        Return a new MultiIndex of the values set with the mask.

        Parameters
        ----------
        mask : array-like
        value : MultiIndex
            Must either be the same length as self or length one

        Returns
        -------
        MultiIndex
        """
        mask, noop = validate_putmask(self, mask)
        if noop:
            return self.copy()

        if len(mask) == len(value):
            subset = value[mask].remove_unused_levels()
        else:
            subset = value.remove_unused_levels()

        new_levels = []
        new_codes = []

        for i, (value_level, level, level_codes) in enumerate(
            zip(subset.levels, self.levels, self.codes)
        ):
            new_level = level.union(value_level, sort=False)
            value_codes = new_level.get_indexer_for(subset.get_level_values(i))
            new_code = ensure_int64(level_codes)
            new_code[mask] = value_codes
            new_levels.append(new_level)
            new_codes.append(new_code)

        return MultiIndex(
            levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False
        )

    def insert(self, loc: int, item) -> MultiIndex:
        """
        Make new MultiIndex inserting new item at location

        Parameters
        ----------
        loc : int
        item : tuple
            Must be same length as number of levels in the MultiIndex

        Returns
        -------
        new_index : Index
        """
        item = self._validate_fill_value(item)

        new_levels = []
        new_codes = []
        for k, level, level_codes in zip(item, self.levels, self.codes):
            if k not in level:
                # have to insert into level
                # must insert at end otherwise you have to recompute all the
                # other codes
                lev_loc = len(level)
                level = level.insert(lev_loc, k)
            else:
                lev_loc = level.get_loc(k)

            new_levels.append(level)
            new_codes.append(np.insert(ensure_int64(level_codes), loc, lev_loc))

        return MultiIndex(
            levels=new_levels, codes=new_codes, names=self.names, verify_integrity=False
        )

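    # Worked example (illustrative sketch): with
    #     mi = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]])
    # mi.insert(1, ("c", 3)) appends "c" and 3 to their respective levels and
    # splices the new codes in at position 1, so the result is expected to be
    # MultiIndex([('a', 1), ('c', 3), ('b', 2)]); a scalar item would first be
    # padded by _validate_fill_value as illustrated above.
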
    def delete(self, loc) -> MultiIndex:
        """
        Make new index with passed location deleted

        Returns
        -------
        new_index : MultiIndex
        """
        new_codes = [np.delete(level_codes, loc) for level_codes in self.codes]
        return MultiIndex(
            levels=self.levels,
            codes=new_codes,
            names=self.names,
            verify_integrity=False,
        )

    @doc(Index.isin)
    def isin(self, values, level=None) -> npt.NDArray[np.bool_]:
        if isinstance(values, Generator):
            values = list(values)
        if level is None:
            if len(values) == 0:
                return np.zeros((len(self),), dtype=np.bool_)
            if not isinstance(values, MultiIndex):
                values = MultiIndex.from_tuples(values)
            return values.unique().get_indexer_for(self) != -1
        else:
            num = self._get_level_number(level)
            levs = self.get_level_values(num)

            if levs.size == 0:
                return np.zeros(len(levs), dtype=np.bool_)
            return levs.isin(values)

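    # Worked example (illustrative sketch): with
    #     mi = pd.MultiIndex.from_arrays([["a", "b"], [1, 2]])
    # mi.isin([("a", 1)]) matches whole tuples and is expected to give
    # array([ True, False]), while mi.isin(["a"], level=0) only tests the
    # first level and yields the same mask here via Index.isin on that level.
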
    # error: Incompatible types in assignment (expression has type overloaded function,
    # base class "Index" defined the type as "Callable[[Index, Any, bool], Any]")
    rename = Index.set_names  # type: ignore[assignment]

    # ---------------------------------------------------------------
    # Arithmetic/Numeric Methods - Disabled

    __add__ = make_invalid_op("__add__")
    __radd__ = make_invalid_op("__radd__")
    __iadd__ = make_invalid_op("__iadd__")
    __sub__ = make_invalid_op("__sub__")
    __rsub__ = make_invalid_op("__rsub__")
    __isub__ = make_invalid_op("__isub__")
    __pow__ = make_invalid_op("__pow__")
    __rpow__ = make_invalid_op("__rpow__")
    __mul__ = make_invalid_op("__mul__")
    __rmul__ = make_invalid_op("__rmul__")
    __floordiv__ = make_invalid_op("__floordiv__")
    __rfloordiv__ = make_invalid_op("__rfloordiv__")
    __truediv__ = make_invalid_op("__truediv__")
    __rtruediv__ = make_invalid_op("__rtruediv__")
    __mod__ = make_invalid_op("__mod__")
    __rmod__ = make_invalid_op("__rmod__")
    __divmod__ = make_invalid_op("__divmod__")
    __rdivmod__ = make_invalid_op("__rdivmod__")
    # Unary methods disabled
    __neg__ = make_invalid_op("__neg__")
    __pos__ = make_invalid_op("__pos__")
    __abs__ = make_invalid_op("__abs__")
    __invert__ = make_invalid_op("__invert__")


def _lexsort_depth(codes: list[np.ndarray], nlevels: int) -> int:
    """Count depth (up to a maximum of `nlevels`) with which codes are lexsorted."""
    int64_codes = [ensure_int64(level_codes) for level_codes in codes]
    for k in range(nlevels, 0, -1):
        if libalgos.is_lexsorted(int64_codes[:k]):
            return k
    return 0


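# Worked example (illustrative sketch) for _lexsort_depth: for codes
#     [np.array([0, 0, 1]), np.array([1, 0, 0])]
# the first level is monotone but the pairs (0, 1), (0, 0) are not in
# lexicographic order, so _lexsort_depth(codes, 2) is expected to return 1;
# with the second array replaced by np.array([0, 1, 0]) it would return 2.

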
def sparsify_labels(label_list, start: int = 0, sentinel: object = ""):
    pivoted = list(zip(*label_list))
    k = len(label_list)

    result = pivoted[: start + 1]
    prev = pivoted[start]

    for cur in pivoted[start + 1 :]:
        sparse_cur = []

        for i, (p, t) in enumerate(zip(prev, cur)):
            if i == k - 1:
                sparse_cur.append(t)
                result.append(sparse_cur)
                break

            if p == t:
                sparse_cur.append(sentinel)
            else:
                sparse_cur.extend(cur[i:])
                result.append(sparse_cur)
                break

        prev = cur

    return list(zip(*result))


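# Worked example (illustrative sketch) for sparsify_labels: repeated leading
# labels are replaced by the sentinel so only changes are shown, e.g.
#     sparsify_labels([["a", "a", "b"], [1, 2, 3]])
# is expected to return [("a", "", "b"), (1, 2, 3)], which is how repeated
# outer labels are blanked out when a MultiIndex is rendered with sparsify.

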
def _get_na_rep(dtype) -> str:
    if is_extension_array_dtype(dtype):
        return f"{dtype.na_value}"
    else:
        dtype = dtype.type

    return {np.datetime64: "NaT", np.timedelta64: "NaT"}.get(dtype, "NaN")


def maybe_droplevels(index: Index, key) -> Index:
    """
    Attempt to drop level or levels from the given index.

    Parameters
    ----------
    index : Index
    key : scalar or tuple

    Returns
    -------
    Index
    """
    # drop levels
    original_index = index
    if isinstance(key, tuple):
        # Caller is responsible for ensuring the key is not an entry in the first
        # level of the MultiIndex.
        for _ in key:
            try:
                index = index._drop_level_numbers([0])
            except ValueError:
                # we have dropped too much, so back out
                return original_index
    else:
        try:
            index = index._drop_level_numbers([0])
        except ValueError:
            pass

    return index


def _coerce_indexer_frozen(array_like, categories, copy: bool = False) -> np.ndarray:
    """
    Coerce the array-like indexer to the smallest integer dtype that can encode all
    of the given categories.

    Parameters
    ----------
    array_like : array-like
    categories : array-like
    copy : bool

    Returns
    -------
    np.ndarray
        Non-writeable.
    """
    array_like = coerce_indexer_dtype(array_like, categories)
    if copy:
        array_like = array_like.copy()
    array_like.flags.writeable = False
    return array_like


def _require_listlike(level, arr, arrname: str):
    """
    Ensure that level is either None or listlike, and arr is list-of-listlike.
    """
    if level is not None and not is_list_like(level):
        if not is_list_like(arr):
            raise TypeError(f"{arrname} must be list-like")
        if len(arr) > 0 and is_list_like(arr[0]):
            raise TypeError(f"{arrname} must be list-like")
        level = [level]
        arr = [arr]
    elif level is None or is_list_like(level):
        if not is_list_like(arr) or not is_list_like(arr[0]):
            raise TypeError(f"{arrname} must be list of lists-like")
    return level, arr


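# Worked example (illustrative sketch) for _require_listlike, which normalizes
# a (level, arr) pair into list-of-lists form, e.g.
#     _require_listlike(0, ["a", "b"], "Levels")
# is expected to return ([0], [["a", "b"]]), while a flat arr combined with
# level=None raises TypeError("Levels must be list of lists-like").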