  1. """
  2. DataFrame
  3. ---------
  4. An efficient 2D container for potentially mixed-type time series or other
  5. labeled data series.
  6. Similar to its R counterpart, data.frame, except providing automatic data
  7. alignment and a host of useful data manipulation methods having to do with the
  8. labeling information
  9. """
from __future__ import annotations

import collections
from collections import abc
import datetime
import functools
from io import StringIO
import itertools
import sys
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Iterable,
    Iterator,
    Literal,
    Mapping,
    Sequence,
    cast,
    overload,
)
import warnings

import numpy as np
from numpy import ma

from pandas._config import (
    get_option,
    using_copy_on_write,
)

from pandas._libs import (
    algos as libalgos,
    lib,
    properties,
)
from pandas._libs.hashtable import duplicated
from pandas._libs.lib import (
    NoDefault,
    is_range_indexer,
    no_default,
)
from pandas._typing import (
    AggFuncType,
    AlignJoin,
    AnyAll,
    AnyArrayLike,
    ArrayLike,
    Axes,
    Axis,
    AxisInt,
    ColspaceArgType,
    CompressionOptions,
    CorrelationMethod,
    DropKeep,
    Dtype,
    DtypeObj,
    FilePath,
    FillnaOptions,
    FloatFormatType,
    FormattersType,
    Frequency,
    IgnoreRaise,
    IndexKeyFunc,
    IndexLabel,
    Level,
    MergeHow,
    NaPosition,
    PythonFuncType,
    QuantileInterpolation,
    ReadBuffer,
    Renamer,
    Scalar,
    SortKind,
    StorageOptions,
    Suffixes,
    TimedeltaConvertibleTypes,
    TimestampConvertibleTypes,
    ValueKeyFunc,
    WriteBuffer,
    npt,
)
from pandas.compat import PYPY
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import (
    function as nv,
    np_percentile_argname,
)
from pandas.errors import (
    ChainedAssignmentError,
    InvalidIndexError,
    _chained_assignment_msg,
)
from pandas.util._decorators import (
    Appender,
    Substitution,
    doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
    validate_ascending,
    validate_bool_kwarg,
    validate_percentile,
)

from pandas.core.dtypes.cast import (
    LossySetitemError,
    can_hold_element,
    construct_1d_arraylike_from_scalar,
    construct_2d_arraylike_from_scalar,
    find_common_type,
    infer_dtype_from_scalar,
    invalidate_string_dtypes,
    maybe_box_native,
    maybe_downcast_to_dtype,
)
from pandas.core.dtypes.common import (
    infer_dtype_from_object,
    is_1d_only_ea_dtype,
    is_bool_dtype,
    is_dataclass,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_iterator,
    is_list_like,
    is_scalar,
    is_sequence,
    needs_i8_conversion,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms,
    common as com,
    nanops,
    ops,
)
from pandas.core.accessor import CachedAccessor
from pandas.core.apply import (
    reconstruct_func,
    relabel_result,
)
from pandas.core.array_algos.take import take_2d_multi
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import (
    DatetimeArray,
    ExtensionArray,
    PeriodArray,
    TimedeltaArray,
)
from pandas.core.arrays.arrow import ArrowDtype
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
    sanitize_array,
    sanitize_masked_array,
)
from pandas.core.generic import NDFrame
from pandas.core.indexers import check_key_length
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    PeriodIndex,
    default_index,
    ensure_index,
    ensure_index_from_sequences,
)
from pandas.core.indexes.multi import (
    MultiIndex,
    maybe_droplevels,
)
from pandas.core.indexing import (
    check_bool_indexer,
    check_dict_or_set_indexers,
)
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
)
from pandas.core.internals.construction import (
    arrays_to_mgr,
    dataclasses_to_dicts,
    dict_to_mgr,
    mgr_to_mgr,
    ndarray_to_mgr,
    nested_data_to_arrays,
    rec_array_to_mgr,
    reorder_arrays,
    to_arrays,
    treat_as_nested,
)
from pandas.core.methods import selectn
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import (
    get_group_index,
    lexsort_indexer,
    nargsort,
)

from pandas.io.common import get_handle
from pandas.io.formats import (
    console,
    format as fmt,
)
from pandas.io.formats.info import (
    INFO_DOCSTRING,
    DataFrameInfo,
    frame_sub_kwargs,
)
import pandas.plotting

if TYPE_CHECKING:
    from pandas.core.groupby.generic import DataFrameGroupBy
    from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
    from pandas.core.internals import SingleDataManager
    from pandas.core.resample import Resampler

    from pandas.io.formats.style import Styler

# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = {
    "axes": "index, columns",
    "klass": "DataFrame",
    "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
    "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    "inplace": """
    inplace : bool, default False
        Whether to modify the DataFrame rather than creating a new one.""",
    "optional_by": """
by : str or list of str
    Name or list of names to sort by.

    - if `axis` is 0 or `'index'` then `by` may contain index
      levels and/or column labels.
    - if `axis` is 1 or `'columns'` then `by` may contain column
      levels and/or index labels.""",
    "optional_reindex": """
labels : array-like, optional
    New labels / index to conform the axis specified by 'axis' to.
index : array-like, optional
    New labels for the index. Preferably an Index object to avoid
    duplicating data.
columns : array-like, optional
    New labels for the columns. Preferably an Index object to avoid
    duplicating data.
axis : int or str, optional
    Axis to target. Can be either the axis name ('index', 'columns')
    or number (0, 1).""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}
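# Editor's illustrative sketch (not pandas source): the "axis" template above
# says axis=0/'index' applies a function to each column and axis=1/'columns'
# to each row. For example:
#
# >>> import pandas as pd
# >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
# >>> df.sum(axis=0)  # reduce down the rows: one value per column
# a    3
# b    7
# dtype: int64
# >>> df.sum(axis=1)  # reduce across the columns: one value per row
# 0    4
# 1    6
# dtype: int64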

_numeric_only_doc = """numeric_only : bool, default False
    Include only float, int, boolean data.
"""

_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

A named Series object is treated as a DataFrame with a single named column.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.
When performing a cross merge, no column specifications to merge on are
allowed.

.. warning::

    If both key columns contain rows where the key is a null value, those
    rows will be matched against each other. This is different from usual SQL
    join behaviour and can lead to unexpected results.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
    * cross: creates the cartesian product from both frames, preserves the order
      of the left keys.

      .. versionadded:: 1.2.0

on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : list-like, default is ("_x", "_y")
    A length-2 sequence where each element is optionally a string
    indicating the suffix to add to overlapping column names in
    `left` and `right` respectively. Pass a value of `None` instead
    of a string to indicate that the column name from `left` or
    `right` should be left as-is, with no suffix. At least one of the
    values must not be None.
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row. The column can be given a different
    name by providing a string argument. The column will have a Categorical
    type with the value of "left_only" for observations whose merge key only
    appears in the left DataFrame, "right_only" for observations
    whose merge key only appears in the right DataFrame, and "both"
    if the observation's merge key is found in both DataFrames.
validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.

Notes
-----
  361. Support for specifying index levels as the `on`, `left_on`, and
  362. `right_on` parameters was added in version 0.23.0
  363. Support for merging named Series objects was added in version 0.24.0
  364. Examples
  365. --------
  366. >>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
  367. ... 'value': [1, 2, 3, 5]})
  368. >>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
  369. ... 'value': [5, 6, 7, 8]})
  370. >>> df1
  371. lkey value
  372. 0 foo 1
  373. 1 bar 2
  374. 2 baz 3
  375. 3 foo 5
  376. >>> df2
  377. rkey value
  378. 0 foo 5
  379. 1 bar 6
  380. 2 baz 7
  381. 3 foo 8
  382. Merge df1 and df2 on the lkey and rkey columns. The value columns have
  383. the default suffixes, _x and _y, appended.
  384. >>> df1.merge(df2, left_on='lkey', right_on='rkey')
  385. lkey value_x rkey value_y
  386. 0 foo 1 foo 5
  387. 1 foo 1 foo 8
  388. 2 foo 5 foo 5
  389. 3 foo 5 foo 8
  390. 4 bar 2 bar 6
  391. 5 baz 3 baz 7
  392. Merge DataFrames df1 and df2 with specified left and right suffixes
  393. appended to any overlapping columns.
  394. >>> df1.merge(df2, left_on='lkey', right_on='rkey',
  395. ... suffixes=('_left', '_right'))
  396. lkey value_left rkey value_right
  397. 0 foo 1 foo 5
  398. 1 foo 1 foo 8
  399. 2 foo 5 foo 5
  400. 3 foo 5 foo 8
  401. 4 bar 2 bar 6
  402. 5 baz 3 baz 7
  403. Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
  404. any overlapping columns.
  405. >>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
  406. Traceback (most recent call last):
  407. ...
  408. ValueError: columns overlap but no suffix specified:
  409. Index(['value'], dtype='object')
  410. >>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
  411. >>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
  412. >>> df1
  413. a b
  414. 0 foo 1
  415. 1 bar 2
  416. >>> df2
  417. a c
  418. 0 foo 3
  419. 1 baz 4
  420. >>> df1.merge(df2, how='inner', on='a')
  421. a b c
  422. 0 foo 1 3
  423. >>> df1.merge(df2, how='left', on='a')
  424. a b c
  425. 0 foo 1 3.0
  426. 1 bar 2 NaN
  427. >>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
  428. >>> df2 = pd.DataFrame({'right': [7, 8]})
  429. >>> df1
  430. left
  431. 0 foo
  432. 1 bar
  433. >>> df2
  434. right
  435. 0 7
  436. 1 8
  437. >>> df1.merge(df2, how='cross')
  438. left right
  439. 0 foo 7
  440. 1 foo 8
  441. 2 bar 7
  442. 3 bar 8
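
A merge with ``indicator=True`` adds the ``_merge`` column described
above. A small sketch (redefining the earlier frames, since ``df1`` and
``df2`` were reassigned for the cross example):

>>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
>>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
>>> df1.merge(df2, how='left', on='a', indicator=True)
     a  b    c     _merge
0  foo  1  3.0       both
1  bar  2  NaN  left_only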
  443. """
# -----------------------------------------------------------------------
# DataFrame class


class DataFrame(NDFrame, OpsMixin):
    """
    Two-dimensional, size-mutable, potentially heterogeneous tabular data.

    Data structure also contains labeled axes (rows and columns).
    Arithmetic operations align on both row and column labels. Can be
    thought of as a dict-like container for Series objects. The primary
    pandas data structure.

    Parameters
    ----------
    data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
        Dict can contain Series, arrays, constants, dataclass or list-like objects. If
        data is a dict, column order follows insertion-order. If a dict contains Series
        which have an index defined, it is aligned by its index. This alignment also
        occurs if data is a Series or a DataFrame itself. Alignment is done on
        Series/DataFrame inputs.

        If data is a list of dicts, column order follows insertion-order.
    index : Index or array-like
        Index to use for resulting frame. Will default to RangeIndex if
        no indexing information part of input data and no index provided.
    columns : Index or array-like
        Column labels to use for resulting frame when data does not have them,
        defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
        will perform column selection instead.
    dtype : dtype, default None
        Data type to force. Only a single dtype is allowed. If None, infer.
    copy : bool or None, default None
        Copy data from inputs.
        For dict data, the default of None behaves like ``copy=True``. For DataFrame
        or 2d ndarray input, the default of None behaves like ``copy=False``.
        If data is a dict containing one or more Series (possibly of different dtypes),
        ``copy=False`` will ensure that these inputs are not copied.

        .. versionchanged:: 1.3.0

    See Also
    --------
    DataFrame.from_records : Constructor from tuples, also record arrays.
    DataFrame.from_dict : From dicts of Series, arrays, or dicts.
    read_csv : Read a comma-separated values (csv) file into DataFrame.
    read_table : Read general delimited file into DataFrame.
    read_clipboard : Read text from clipboard into DataFrame.

    Notes
    -----
    Please reference the :ref:`User Guide <basics.dataframe>` for more information.

    Examples
    --------
    Constructing DataFrame from a dictionary.

    >>> d = {'col1': [1, 2], 'col2': [3, 4]}
    >>> df = pd.DataFrame(data=d)
    >>> df
       col1  col2
    0     1     3
    1     2     4

    Notice that the inferred dtype is int64.

    >>> df.dtypes
    col1    int64
    col2    int64
    dtype: object

    To enforce a single dtype:

    >>> df = pd.DataFrame(data=d, dtype=np.int8)
    >>> df.dtypes
    col1    int8
    col2    int8
    dtype: object

    Constructing DataFrame from a dictionary including Series:

    >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
    >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
       col1  col2
    0     0   NaN
    1     1   NaN
    2     2   2.0
    3     3   3.0

    Constructing DataFrame from numpy ndarray:

    >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
    ...                    columns=['a', 'b', 'c'])
    >>> df2
       a  b  c
    0  1  2  3
    1  4  5  6
    2  7  8  9

    Constructing DataFrame from a numpy ndarray that has labeled columns:

    >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
    ...                 dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
    >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
    >>> df3
       c  a
    0  3  1
    1  6  4
    2  9  7

    Constructing DataFrame from dataclass:

    >>> from dataclasses import make_dataclass
    >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
    >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
       x  y
    0  0  0
    1  0  3
    2  2  3

    Constructing DataFrame from Series/DataFrame:

    >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
    >>> df = pd.DataFrame(data=ser, index=["a", "c"])
    >>> df
       0
    a  1
    c  3

    >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
    >>> df2 = pd.DataFrame(data=df1, index=["a", "c"])
    >>> df2
       x
    a  1
    c  3
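
    Constructing DataFrame from a 2d ndarray without copying (a small sketch
    of the ``copy`` behaviour described above; with copy-on-write enabled the
    mutation would not be reflected):

    >>> arr = np.array([[1, 2], [3, 4]])
    >>> df4 = pd.DataFrame(arr, copy=False)
    >>> arr[0, 0] = 100
    >>> df4
         0  1
    0  100  2
    1    3  4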
  555. """
    _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
    _typ = "dataframe"
    _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
    _accessors: set[str] = {"sparse"}
    _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
    _mgr: BlockManager | ArrayManager

    @property
    def _constructor(self) -> Callable[..., DataFrame]:
        return DataFrame

    _constructor_sliced: Callable[..., Series] = Series

    # ----------------------------------------------------------------------
    # Constructors
    def __init__(
        self,
        data=None,
        index: Axes | None = None,
        columns: Axes | None = None,
        dtype: Dtype | None = None,
        copy: bool | None = None,
    ) -> None:
        if dtype is not None:
            dtype = self._validate_dtype(dtype)

        if isinstance(data, DataFrame):
            data = data._mgr
            if not copy:
                # if not copying data, ensure to still return a shallow copy
                # to avoid the result sharing the same Manager
                data = data.copy(deep=False)

        if isinstance(data, (BlockManager, ArrayManager)):
            if using_copy_on_write():
                data = data.copy(deep=False)
            # first check if a Manager is passed without any other arguments
            # -> use fastpath (without checking Manager type)
            if index is None and columns is None and dtype is None and not copy:
                # GH#33357 fastpath
                NDFrame.__init__(self, data)
                return

        manager = get_option("mode.data_manager")

        # GH47215
        if index is not None and isinstance(index, set):
            raise ValueError("index cannot be a set")
        if columns is not None and isinstance(columns, set):
            raise ValueError("columns cannot be a set")

        if copy is None:
            if isinstance(data, dict):
                # retain pre-GH#38939 default behavior
                copy = True
            elif (
                manager == "array"
                and isinstance(data, (np.ndarray, ExtensionArray))
                and data.ndim == 2
            ):
                # INFO(ArrayManager) by default copy the 2D input array to get
                # contiguous 1D arrays
                copy = True
            elif using_copy_on_write() and not isinstance(
                data, (Index, DataFrame, Series)
            ):
                copy = True
            else:
                copy = False

        if data is None:
            index = index if index is not None else default_index(0)
            columns = columns if columns is not None else default_index(0)
            dtype = dtype if dtype is not None else pandas_dtype(object)
            data = []

        if isinstance(data, (BlockManager, ArrayManager)):
            mgr = self._init_mgr(
                data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
            )
        elif isinstance(data, dict):
            # GH#38939 de facto copy defaults to False only in non-dict cases
            mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
        elif isinstance(data, ma.MaskedArray):
            from numpy.ma import mrecords

            # masked recarray
            if isinstance(data, mrecords.MaskedRecords):
                raise TypeError(
                    "MaskedRecords are not supported. Pass "
                    "{name: data[name] for name in data.dtype.names} "
                    "instead"
                )

            # a masked array
            data = sanitize_masked_array(data)
            mgr = ndarray_to_mgr(
                data,
                index,
                columns,
                dtype=dtype,
                copy=copy,
                typ=manager,
            )

        elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
            if data.dtype.names:
                # i.e. numpy structured array
                data = cast(np.ndarray, data)
                mgr = rec_array_to_mgr(
                    data,
                    index,
                    columns,
                    dtype,
                    copy,
                    typ=manager,
                )
            elif getattr(data, "name", None) is not None:
                # i.e. Series/Index with non-None name
                _copy = copy if using_copy_on_write() else True
                mgr = dict_to_mgr(
                    # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
                    # attribute "name"
                    {data.name: data},  # type: ignore[union-attr]
                    index,
                    columns,
                    dtype=dtype,
                    typ=manager,
                    copy=_copy,
                )
            else:
                mgr = ndarray_to_mgr(
                    data,
                    index,
                    columns,
                    dtype=dtype,
                    copy=copy,
                    typ=manager,
                )

        # For data is list-like, or Iterable (will consume into list)
        elif is_list_like(data):
            if not isinstance(data, abc.Sequence):
                if hasattr(data, "__array__"):
                    # GH#44616 big perf improvement for e.g. pytorch tensor
                    data = np.asarray(data)
                else:
                    data = list(data)
            if len(data) > 0:
                if is_dataclass(data[0]):
                    data = dataclasses_to_dicts(data)
                if not isinstance(data, np.ndarray) and treat_as_nested(data):
                    # exclude ndarray as we may have cast it a few lines above
                    if columns is not None:
                        columns = ensure_index(columns)
                    arrays, columns, index = nested_data_to_arrays(
                        # error: Argument 3 to "nested_data_to_arrays" has incompatible
                        # type "Optional[Collection[Any]]"; expected "Optional[Index]"
                        data,
                        columns,
                        index,  # type: ignore[arg-type]
                        dtype,
                    )
                    mgr = arrays_to_mgr(
                        arrays,
                        columns,
                        index,
                        dtype=dtype,
                        typ=manager,
                    )
                else:
                    mgr = ndarray_to_mgr(
                        data,
                        index,
                        columns,
                        dtype=dtype,
                        copy=copy,
                        typ=manager,
                    )
            else:
                mgr = dict_to_mgr(
                    {},
                    index,
                    columns if columns is not None else default_index(0),
                    dtype=dtype,
                    typ=manager,
                )
        # For data is scalar
        else:
            if index is None or columns is None:
                raise ValueError("DataFrame constructor not properly called!")

            index = ensure_index(index)
            columns = ensure_index(columns)

            if not dtype:
                dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)

            # For data is a scalar extension dtype
            if isinstance(dtype, ExtensionDtype):
                # TODO(EA2D): special case not needed with 2D EAs
                values = [
                    construct_1d_arraylike_from_scalar(data, len(index), dtype)
                    for _ in range(len(columns))
                ]
                mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
            else:
                arr2d = construct_2d_arraylike_from_scalar(
                    data,
                    len(index),
                    len(columns),
                    dtype,
                    copy,
                )

                mgr = ndarray_to_mgr(
                    arr2d,
                    index,
                    columns,
                    dtype=arr2d.dtype,
                    copy=False,
                    typ=manager,
                )

        # ensure correct Manager type according to settings
        mgr = mgr_to_mgr(mgr, typ=manager)

        NDFrame.__init__(self, mgr)
    # ----------------------------------------------------------------------

    def __dataframe__(
        self, nan_as_null: bool = False, allow_copy: bool = True
    ) -> DataFrameXchg:
        """
        Return the dataframe interchange object implementing the interchange protocol.

        Parameters
        ----------
        nan_as_null : bool, default False
            Whether to tell the DataFrame to overwrite null values in the data
            with ``NaN`` (or ``NaT``).
        allow_copy : bool, default True
            Whether to allow memory copying when exporting. If set to False
            it would cause non-zero-copy exports to fail.

        Returns
        -------
        DataFrame interchange object
            The object which the consuming library can use to ingress the dataframe.

        Notes
        -----
        Details on the interchange protocol:
        https://data-apis.org/dataframe-protocol/latest/index.html

        `nan_as_null` currently has no effect; once support for nullable extension
        dtypes is added, this value should be propagated to columns.
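
        Examples
        --------
        A minimal sketch of the entry point; ``num_rows`` is one of the
        accessor methods defined by the interchange protocol linked above:

        >>> df = pd.DataFrame({'A': [1, 2], 'B': [3.0, 4.0]})
        >>> interchange_object = df.__dataframe__()
        >>> interchange_object.num_rows()
        2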
  788. """
  789. from pandas.core.interchange.dataframe import PandasDataFrameXchg
  790. return PandasDataFrameXchg(self, nan_as_null, allow_copy)
    # ----------------------------------------------------------------------

    @property
    def axes(self) -> list[Index]:
        """
        Return a list representing the axes of the DataFrame.

        It has the row axis labels and column axis labels as the only members.
        They are returned in that order.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.axes
        [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
        dtype='object')]
        """
        return [self.index, self.columns]

    @property
    def shape(self) -> tuple[int, int]:
        """
        Return a tuple representing the dimensionality of the DataFrame.

        See Also
        --------
        ndarray.shape : Tuple of array dimensions.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
        >>> df.shape
        (2, 2)

        >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
        ...                    'col3': [5, 6]})
        >>> df.shape
        (2, 3)
        """
        return len(self.index), len(self.columns)
    @property
    def _is_homogeneous_type(self) -> bool:
        """
        Whether all the columns in a DataFrame have the same type.

        Returns
        -------
        bool

        See Also
        --------
        Index._is_homogeneous_type : Whether the object has a single
            dtype.
        MultiIndex._is_homogeneous_type : Whether all the levels of a
            MultiIndex have the same dtype.

        Examples
        --------
        >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
        True
        >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
        False

        Items with the same type but different sizes are considered
        different types.

        >>> DataFrame({
        ...     "A": np.array([1, 2], dtype=np.int32),
        ...     "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
        False
        """
        if isinstance(self._mgr, ArrayManager):
            return len({arr.dtype for arr in self._mgr.arrays}) == 1
        if self._mgr.any_extension_types:
            return len({block.dtype for block in self._mgr.blocks}) == 1
        else:
            return not self._is_mixed_type
    @property
    def _can_fast_transpose(self) -> bool:
        """
        Can we transpose this DataFrame without creating any new array objects.
        """
        if isinstance(self._mgr, ArrayManager):
            return False
        blocks = self._mgr.blocks
        if len(blocks) != 1:
            return False

        dtype = blocks[0].dtype
        # TODO(EA2D) special case would be unnecessary with 2D EAs
        return not is_1d_only_ea_dtype(dtype)

    @property
    def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
        """
        Analogue to ._values that may return a 2D ExtensionArray.
        """
        mgr = self._mgr

        if isinstance(mgr, ArrayManager):
            if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
                # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
                # has no attribute "reshape"
                return mgr.arrays[0].reshape(-1, 1)  # type: ignore[union-attr]
            return ensure_wrapped_if_datetimelike(self.values)

        blocks = mgr.blocks
        if len(blocks) != 1:
            return ensure_wrapped_if_datetimelike(self.values)

        arr = blocks[0].values
        if arr.ndim == 1:
            # non-2D ExtensionArray
            return self.values

        # more generally, whatever we allow in NDArrayBackedExtensionBlock
        arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
        return arr.T
    # ----------------------------------------------------------------------
    # Rendering Methods

    def _repr_fits_vertical_(self) -> bool:
        """
        Check length against max_rows.
        """
        max_rows = get_option("display.max_rows")
        return len(self) <= max_rows

    def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
        """
        Check if full repr fits in horizontal boundaries imposed by the display
        options width and max_columns.

        In case of non-interactive session, no boundaries apply.

        `ignore_width` is here so ipynb+HTML output can behave the way
        users expect. display.max_columns remains in effect.
        GH3541, GH3573
        """
        width, height = console.get_console_size()
        max_columns = get_option("display.max_columns")
        nb_columns = len(self.columns)

        # exceed max columns
        if (max_columns and nb_columns > max_columns) or (
            (not ignore_width) and width and nb_columns > (width // 2)
        ):
            return False

        # used by repr_html under IPython notebook or scripts ignore terminal
        # dims
        if ignore_width or width is None or not console.in_interactive_session():
            return True

        if get_option("display.width") is not None or console.in_ipython_frontend():
            # check at least the column row for excessive width
            max_rows = 1
        else:
            max_rows = get_option("display.max_rows")

        # when auto-detecting, so width=None and not in ipython front end
        # check whether repr fits horizontal by actually checking
        # the width of the rendered repr
        buf = StringIO()

        # only care about the stuff we'll actually print out
        # and to_string on entire frame may be expensive
        d = self

        if max_rows is not None:  # with unlimited rows the repr is assumed to fit
            # min of two, where one may be None
            d = d.iloc[: min(max_rows, len(d))]
        else:
            return True

        d.to_string(buf=buf)
        value = buf.getvalue()
        repr_width = max(len(line) for line in value.split("\n"))

        return repr_width < width
    def _info_repr(self) -> bool:
        """
        True if the repr should show the info view.
        """
        info_repr_option = get_option("display.large_repr") == "info"
        return info_repr_option and not (
            self._repr_fits_horizontal_() and self._repr_fits_vertical_()
        )

    def __repr__(self) -> str:
        """
        Return a string representation for a particular DataFrame.
        """
        if self._info_repr():
            buf = StringIO()
            self.info(buf=buf)
            return buf.getvalue()

        repr_params = fmt.get_dataframe_repr_params()
        return self.to_string(**repr_params)

    def _repr_html_(self) -> str | None:
        """
        Return an HTML representation for a particular DataFrame.

        Mainly for IPython notebook.
        """
        if self._info_repr():
            buf = StringIO()
            self.info(buf=buf)
            # need to escape the <class>, should be the first line.
            val = buf.getvalue().replace("<", r"&lt;", 1)
            val = val.replace(">", r"&gt;", 1)
            return f"<pre>{val}</pre>"

        if get_option("display.notebook_repr_html"):
            max_rows = get_option("display.max_rows")
            min_rows = get_option("display.min_rows")
            max_cols = get_option("display.max_columns")
            show_dimensions = get_option("display.show_dimensions")

            formatter = fmt.DataFrameFormatter(
                self,
                columns=None,
                col_space=None,
                na_rep="NaN",
                formatters=None,
                float_format=None,
                sparsify=None,
                justify=None,
                index_names=True,
                header=True,
                index=True,
                bold_rows=True,
                escape=True,
                max_rows=max_rows,
                min_rows=min_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=".",
            )
            return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
        else:
            return None
    @overload
    def to_string(
        self,
        buf: None = ...,
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> str:
        ...

    @overload
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str],
        columns: Sequence[str] | None = ...,
        col_space: int | list[int] | dict[Hashable, int] | None = ...,
        header: bool | Sequence[str] = ...,
        index: bool = ...,
        na_rep: str = ...,
        formatters: fmt.FormattersType | None = ...,
        float_format: fmt.FloatFormatType | None = ...,
        sparsify: bool | None = ...,
        index_names: bool = ...,
        justify: str | None = ...,
        max_rows: int | None = ...,
        max_cols: int | None = ...,
        show_dimensions: bool = ...,
        decimal: str = ...,
        line_width: int | None = ...,
        min_rows: int | None = ...,
        max_colwidth: int | None = ...,
        encoding: str | None = ...,
    ) -> None:
        ...
    @Substitution(
        header_type="bool or sequence of str",
        header="Write out the column names. If a list of strings "
        "is given, it is assumed to be aliases for the "
        "column names",
        col_space_type="int, list or dict of int",
        col_space="The minimum width of each column. If a list of ints is given "
        "every integer corresponds with one column. If a dict is given, the key "
        "references the column, while the value defines the space to use.",
    )
    @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
    def to_string(
        self,
        buf: FilePath | WriteBuffer[str] | None = None,
        columns: Sequence[str] | None = None,
        col_space: int | list[int] | dict[Hashable, int] | None = None,
        header: bool | Sequence[str] = True,
        index: bool = True,
        na_rep: str = "NaN",
        formatters: fmt.FormattersType | None = None,
        float_format: fmt.FloatFormatType | None = None,
        sparsify: bool | None = None,
        index_names: bool = True,
        justify: str | None = None,
        max_rows: int | None = None,
        max_cols: int | None = None,
        show_dimensions: bool = False,
        decimal: str = ".",
        line_width: int | None = None,
        min_rows: int | None = None,
        max_colwidth: int | None = None,
        encoding: str | None = None,
    ) -> str | None:
        """
        Render a DataFrame to a console-friendly tabular output.
        %(shared_params)s
        line_width : int, optional
            Width to wrap a line in characters.
        min_rows : int, optional
            The number of rows to display in the console in a truncated repr
            (when number of rows is above `max_rows`).
        max_colwidth : int, optional
            Max width to truncate each column in characters. By default, no limit.
        encoding : str, default "utf-8"
            Set character encoding.
        %(returns)s
        See Also
        --------
        to_html : Convert DataFrame to HTML.

        Examples
        --------
        >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
        >>> df = pd.DataFrame(d)
        >>> print(df.to_string())
           col1  col2
        0     1     4
        1     2     5
        2     3     6
        """
        from pandas import option_context

        with option_context("display.max_colwidth", max_colwidth):
            formatter = fmt.DataFrameFormatter(
                self,
                columns=columns,
                col_space=col_space,
                na_rep=na_rep,
                formatters=formatters,
                float_format=float_format,
                sparsify=sparsify,
                justify=justify,
                index_names=index_names,
                header=header,
                index=index,
                min_rows=min_rows,
                max_rows=max_rows,
                max_cols=max_cols,
                show_dimensions=show_dimensions,
                decimal=decimal,
            )
            return fmt.DataFrameRenderer(formatter).to_string(
                buf=buf,
                encoding=encoding,
                line_width=line_width,
            )
    # ----------------------------------------------------------------------

    @property
    def style(self) -> Styler:
        """
        Returns a Styler object.

        Contains methods for building a styled HTML representation of the DataFrame.

        See Also
        --------
        io.formats.style.Styler : Helps style a DataFrame or Series according to the
            data with HTML and CSS.
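
        Examples
        --------
        A minimal illustration of the accessor; the returned Styler itself
        renders as HTML in notebook front ends:

        >>> df = pd.DataFrame({'A': [1, 2, 3]})
        >>> df.style  # doctest: +ELLIPSIS
        <pandas.io.formats.style.Styler object at ...>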
  1141. """
  1142. from pandas.io.formats.style import Styler
  1143. return Styler(self)
    _shared_docs[
        "items"
    ] = r"""
    Iterate over (column name, Series) pairs.

    Iterates over the DataFrame columns, returning a tuple with
    the column name and the content as a Series.

    Yields
    ------
    label : object
        The column names for the DataFrame being iterated over.
    content : Series
        The column entries belonging to each label, as a Series.

    See Also
    --------
    DataFrame.iterrows : Iterate over DataFrame rows as
        (index, Series) pairs.
    DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
        of the values.

    Examples
    --------
    >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
    ...                    'population': [1864, 22000, 80000]},
    ...                   index=['panda', 'polar', 'koala'])
    >>> df
            species   population
    panda   bear      1864
    polar   bear      22000
    koala   marsupial 80000
    >>> for label, content in df.items():
    ...     print(f'label: {label}')
    ...     print(f'content: {content}', sep='\n')
    ...
    label: species
    content:
    panda         bear
    polar         bear
    koala    marsupial
    Name: species, dtype: object
    label: population
    content:
    panda     1864
    polar    22000
    koala    80000
    Name: population, dtype: int64
    """
    @Appender(_shared_docs["items"])
    def items(self) -> Iterable[tuple[Hashable, Series]]:
        if self.columns.is_unique and hasattr(self, "_item_cache"):
            for k in self.columns:
                yield k, self._get_item_cache(k)
        else:
            for i, k in enumerate(self.columns):
                yield k, self._ixs(i, axis=1)
    def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
        """
        Iterate over DataFrame rows as (index, Series) pairs.

        Yields
        ------
        index : label or tuple of label
            The index of the row. A tuple for a `MultiIndex`.
        data : Series
            The data of the row as a Series.

        See Also
        --------
        DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        1. Because ``iterrows`` returns a Series for each row,
           it does **not** preserve dtypes across the rows (dtypes are
           preserved across columns for DataFrames). For example,

           >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
           >>> row = next(df.iterrows())[1]
           >>> row
           int      1.0
           float    1.5
           Name: 0, dtype: float64
           >>> print(row['int'].dtype)
           float64
           >>> print(df['int'].dtype)
           int64

           To preserve dtypes while iterating over the rows, it is better
           to use :meth:`itertuples` which returns namedtuples of the values
           and which is generally faster than ``iterrows``.

        2. You should **never modify** something you are iterating over.
           This is not guaranteed to work in all cases. Depending on the
           data types, the iterator returns a copy and not a view, and writing
           to it will have no effect.
        """
        columns = self.columns
        klass = self._constructor_sliced
        using_cow = using_copy_on_write()
        for k, v in zip(self.index, self.values):
            s = klass(v, index=columns, name=k).__finalize__(self)
            if using_cow and self._mgr.is_single_block:
                s._mgr.add_references(self._mgr)  # type: ignore[arg-type]
            yield k, s
    def itertuples(
        self, index: bool = True, name: str | None = "Pandas"
    ) -> Iterable[tuple[Any, ...]]:
        """
        Iterate over DataFrame rows as namedtuples.

        Parameters
        ----------
        index : bool, default True
            If True, return the index as the first element of the tuple.
        name : str or None, default "Pandas"
            The name of the returned namedtuples or None to return regular
            tuples.

        Returns
        -------
        iterator
            An object to iterate over namedtuples for each row in the
            DataFrame with the first field possibly being the index and
            following fields being the column values.

        See Also
        --------
        DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
            pairs.
        DataFrame.items : Iterate over (column name, Series) pairs.

        Notes
        -----
        The column names will be renamed to positional names if they are
        invalid Python identifiers, repeated, or start with an underscore
        (see the renaming example below).

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
        ...                   index=['dog', 'hawk'])
        >>> df
              num_legs  num_wings
        dog          4          0
        hawk         2          2
        >>> for row in df.itertuples():
        ...     print(row)
        ...
        Pandas(Index='dog', num_legs=4, num_wings=0)
        Pandas(Index='hawk', num_legs=2, num_wings=2)

        By setting the `index` parameter to False we can remove the index
        as the first element of the tuple:

        >>> for row in df.itertuples(index=False):
        ...     print(row)
        ...
        Pandas(num_legs=4, num_wings=0)
        Pandas(num_legs=2, num_wings=2)

        With the `name` parameter set we set a custom name for the yielded
        namedtuples:

        >>> for row in df.itertuples(name='Animal'):
        ...     print(row)
        ...
        Animal(Index='dog', num_legs=4, num_wings=0)
        Animal(Index='hawk', num_legs=2, num_wings=2)
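
        Columns that are repeated or are not valid Python identifiers are
        renamed to positional names, as noted above (a small sketch; the
        second duplicate field becomes ``_2``):

        >>> df = pd.DataFrame([[1, 2]], columns=['x', 'x'])
        >>> next(df.itertuples())
        Pandas(Index=0, x=1, _2=2)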
  1295. """
  1296. arrays = []
  1297. fields = list(self.columns)
  1298. if index:
  1299. arrays.append(self.index)
  1300. fields.insert(0, "Index")
  1301. # use integer indexing because of possible duplicate column names
  1302. arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
  1303. if name is not None:
  1304. # https://github.com/python/mypy/issues/9046
  1305. # error: namedtuple() expects a string literal as the first argument
  1306. itertuple = collections.namedtuple( # type: ignore[misc]
  1307. name, fields, rename=True
  1308. )
  1309. return map(itertuple._make, zip(*arrays))
  1310. # fallback to regular tuples
  1311. return zip(*arrays)
    def __len__(self) -> int:
        """
        Returns length of info axis, but here we use the index.
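
        Examples
        --------
        A quick illustration; the length of a DataFrame is its number of rows:

        >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
        >>> len(df)
        3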
  1315. """
  1316. return len(self.index)
    @overload
    def dot(self, other: Series) -> Series:
        ...

    @overload
    def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
        ...

    def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Compute the matrix multiplication between the DataFrame and other.

        This method computes the matrix product between the DataFrame and the
        values of another Series, DataFrame or a numpy array.

        It can also be called using ``self @ other``.

        Parameters
        ----------
        other : Series, DataFrame or array-like
            The other object to compute the matrix product with.

        Returns
        -------
        Series or DataFrame
            If other is a Series, return the matrix product between self and
            other as a Series. If other is a DataFrame or a numpy.array, return
            the matrix product of self and other as a DataFrame.

        See Also
        --------
        Series.dot : Similar method for Series.

        Notes
        -----
        The dimensions of DataFrame and other must be compatible in order to
        compute the matrix multiplication. In addition, the column names of
        DataFrame and the index of other must contain the same values, as they
        will be aligned prior to the multiplication.

        The dot method for Series computes the inner product, instead of the
        matrix product here.

        Examples
        --------
        Here we multiply a DataFrame with a Series.

        >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
        >>> s = pd.Series([1, 1, 2, 1])
        >>> df.dot(s)
        0    -4
        1     5
        dtype: int64

        Here we multiply a DataFrame with another DataFrame.

        >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(other)
           0  1
        0  1  4
        1  2  2

        Note that the dot method gives the same result as @

        >>> df @ other
           0  1
        0  1  4
        1  2  2

        The dot method also works if other is a np.array.

        >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
        >>> df.dot(arr)
           0  1
        0  1  4
        1  2  2

        Note how shuffling of the objects does not change the result.

        >>> s2 = s.reindex([1, 0, 2, 3])
        >>> df.dot(s2)
        0    -4
        1     5
        dtype: int64
        """
        if isinstance(other, (Series, DataFrame)):
            common = self.columns.union(other.index)
            if len(common) > len(self.columns) or len(common) > len(other.index):
                raise ValueError("matrices are not aligned")

            left = self.reindex(columns=common, copy=False)
            right = other.reindex(index=common, copy=False)
            lvals = left.values
            rvals = right._values
        else:
            left = self
            lvals = self.values
            rvals = np.asarray(other)
            if lvals.shape[1] != rvals.shape[0]:
                raise ValueError(
                    f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
                )

        if isinstance(other, DataFrame):
            return self._constructor(
                np.dot(lvals, rvals),
                index=left.index,
                columns=other.columns,
                copy=False,
            )
        elif isinstance(other, Series):
            return self._constructor_sliced(
                np.dot(lvals, rvals), index=left.index, copy=False
            )
        elif isinstance(rvals, (np.ndarray, Index)):
            result = np.dot(lvals, rvals)
            if result.ndim == 2:
                return self._constructor(result, index=left.index, copy=False)
            else:
                return self._constructor_sliced(result, index=left.index, copy=False)
        else:  # pragma: no cover
            raise TypeError(f"unsupported type: {type(other)}")
    @overload
    def __matmul__(self, other: Series) -> Series:
        ...

    @overload
    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        ...

    def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
        """
        Matrix multiplication using the binary `@` operator.
        """
        return self.dot(other)

    def __rmatmul__(self, other) -> DataFrame:
        """
        Matrix multiplication using the binary `@` operator.
        """
        try:
            return self.T.dot(np.transpose(other)).T
        except ValueError as err:
            if "shape mismatch" not in str(err):
                raise
            # GH#21581 give exception message for original shapes
            msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
            raise ValueError(msg) from err
    # ----------------------------------------------------------------------
    # IO methods (to / from other formats)

    @classmethod
    def from_dict(
        cls,
        data: dict,
        orient: str = "columns",
        dtype: Dtype | None = None,
        columns: Axes | None = None,
    ) -> DataFrame:
        """
        Construct DataFrame from dict of array-like or dicts.

        Creates DataFrame object from dictionary by columns or by index
        allowing dtype specification.

        Parameters
        ----------
        data : dict
            Of the form {field : array-like} or {field : dict}.
        orient : {'columns', 'index', 'tight'}, default 'columns'
            The "orientation" of the data. If the keys of the passed dict
            should be the columns of the resulting DataFrame, pass 'columns'
            (default). Otherwise if the keys should be rows, pass 'index'.
            If 'tight', assume a dict with keys ['index', 'columns', 'data',
            'index_names', 'column_names'].

            .. versionadded:: 1.4.0
               'tight' as an allowed value for the ``orient`` argument

        dtype : dtype, default None
            Data type to force after DataFrame construction, otherwise infer.
        columns : list, default None
            Column labels to use when ``orient='index'``. Raises a ValueError
            if used with ``orient='columns'`` or ``orient='tight'``.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.from_records : DataFrame from structured ndarray, sequence
            of tuples or dicts, or DataFrame.
        DataFrame : DataFrame object creation using constructor.
        DataFrame.to_dict : Convert the DataFrame to a dictionary.

        Examples
        --------
        By default the keys of the dict become the DataFrame columns:

        >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data)
           col_1 col_2
        0      3     a
        1      2     b
        2      1     c
        3      0     d

        Specify ``orient='index'`` to create the DataFrame using dictionary
        keys as rows:

        >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
        >>> pd.DataFrame.from_dict(data, orient='index')
               0  1  2  3
        row_1  3  2  1  0
        row_2  a  b  c  d

        When using the 'index' orientation, the column names can be
        specified manually:

        >>> pd.DataFrame.from_dict(data, orient='index',
        ...                        columns=['A', 'B', 'C', 'D'])
               A  B  C  D
        row_1  3  2  1  0
        row_2  a  b  c  d

        Specify ``orient='tight'`` to create the DataFrame using a 'tight'
        format:

        >>> data = {'index': [('a', 'b'), ('a', 'c')],
        ...         'columns': [('x', 1), ('y', 2)],
        ...         'data': [[1, 3], [2, 4]],
        ...         'index_names': ['n1', 'n2'],
        ...         'column_names': ['z1', 'z2']}
        >>> pd.DataFrame.from_dict(data, orient='tight')
        z1     x  y
        z2     1  2
        n1 n2
        a  b   1  3
           c   2  4
        """
        index = None
        orient = orient.lower()
        if orient == "index":
            if len(data) > 0:
                # TODO speed up Series case
                if isinstance(list(data.values())[0], (Series, dict)):
                    data = _from_nested_dict(data)
                else:
                    index = list(data.keys())
                    # error: Incompatible types in assignment (expression has type
                    # "List[Any]", variable has type "Dict[Any, Any]")
                    data = list(data.values())  # type: ignore[assignment]
        elif orient in ("columns", "tight"):
            if columns is not None:
                raise ValueError(f"cannot use columns parameter with orient='{orient}'")
        else:  # pragma: no cover
            raise ValueError(
                f"Expected 'index', 'columns' or 'tight' for orient parameter. "
                f"Got '{orient}' instead"
            )

        if orient != "tight":
            return cls(data, index=index, columns=columns, dtype=dtype)
        else:
            realdata = data["data"]

            def create_index(indexlist, namelist):
                index: Index
                if len(namelist) > 1:
                    index = MultiIndex.from_tuples(indexlist, names=namelist)
                else:
                    index = Index(indexlist, name=namelist[0])
                return index

            index = create_index(data["index"], data["index_names"])
            columns = create_index(data["columns"], data["column_names"])
            return cls(realdata, index=index, columns=columns, dtype=dtype)
    def to_numpy(
        self,
        dtype: npt.DTypeLike | None = None,
        copy: bool = False,
        na_value: object = lib.no_default,
    ) -> np.ndarray:
        """
        Convert the DataFrame to a NumPy array.

        By default, the dtype of the returned array will be the common NumPy
        dtype of all types in the DataFrame. For example, if the dtypes are
        ``float16`` and ``float32``, the result's dtype will be ``float32``.
        This may require copying data and coercing values, which may be
        expensive.

        Parameters
        ----------
        dtype : str or numpy.dtype, optional
            The dtype to pass to :meth:`numpy.asarray`.
        copy : bool, default False
            Whether to ensure that the returned value is not a view on
            another array. Note that ``copy=False`` does not *ensure* that
            ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
            a copy is made, even if not strictly necessary.
        na_value : Any, optional
            The value to use for missing values. The default value depends
            on `dtype` and the dtypes of the DataFrame columns.

            .. versionadded:: 1.1.0

        Returns
        -------
        numpy.ndarray

        See Also
        --------
        Series.to_numpy : Similar method for Series.

        Examples
        --------
        >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
        array([[1, 3],
               [2, 4]])

        With heterogeneous data, the lowest common type will have to
        be used.

        >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
        >>> df.to_numpy()
        array([[1. , 3. ],
               [2. , 4.5]])

        For a mix of numeric and non-numeric types, the output array will
        have object dtype.

        >>> df['C'] = pd.date_range('2000', periods=2)
        >>> df.to_numpy()
        array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
               [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
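
        Missing values can be replaced on the way out via ``na_value`` (a
        small sketch of the parameter described above):

        >>> pd.DataFrame({"A": [1.0, None]}).to_numpy(na_value=0.0)
        array([[1.],
               [0.]])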
  1602. """
  1603. if dtype is not None:
  1604. dtype = np.dtype(dtype)
  1605. result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
  1606. if result.dtype is not dtype:
  1607. result = np.array(result, dtype=dtype, copy=False)
  1608. return result
    def _create_data_for_split_and_tight_to_dict(
        self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
    ) -> list:
        """
        Simple helper method to create the main output data for
        ``to_dict(orient="split")`` and ``to_dict(orient="tight")``.
        """
        if are_all_object_dtype_cols:
            data = [
                list(map(maybe_box_native, t))
                for t in self.itertuples(index=False, name=None)
            ]
        else:
            data = [list(t) for t in self.itertuples(index=False, name=None)]
            if object_dtype_indices:
                # If we have object_dtype_cols, apply maybe_box_native after the
                # list comprehension for perf
                for row in data:
                    for i in object_dtype_indices:
                        row[i] = maybe_box_native(row[i])
        return data
    @overload
    def to_dict(
        self,
        orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
        into: type[dict] = ...,
    ) -> dict:
        ...

    @overload
    def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]:
        ...
    def to_dict(
        self,
        orient: Literal[
            "dict", "list", "series", "split", "tight", "records", "index"
        ] = "dict",
        into: type[dict] = dict,
        index: bool = True,
    ) -> dict | list[dict]:
        """
        Convert the DataFrame to a dictionary.

        The type of the key-value pairs can be customized with the parameters
        (see below).

        Parameters
        ----------
        orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
            Determines the type of the values of the dictionary.

            - 'dict' (default) : dict like {column -> {index -> value}}
            - 'list' : dict like {column -> [values]}
            - 'series' : dict like {column -> Series(values)}
            - 'split' : dict like
              {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
            - 'tight' : dict like
              {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
              'index_names' -> [index.names], 'column_names' -> [column.names]}
            - 'records' : list like
              [{column -> value}, ... , {column -> value}]
            - 'index' : dict like {index -> {column -> value}}

            .. versionadded:: 1.4.0
               'tight' as an allowed value for the ``orient`` argument

        into : class, default dict
            The collections.abc.Mapping subclass used for all Mappings
            in the return value. Can be the actual class or an empty
            instance of the mapping type you want. If you want a
            collections.defaultdict, you must pass it initialized.
        index : bool, default True
            Whether to include the index item (and index_names item if `orient`
            is 'tight') in the returned dictionary. Can only be ``False``
            when `orient` is 'split' or 'tight'.

            .. versionadded:: 2.0.0

        Returns
        -------
        dict, list or collections.abc.Mapping
            Return a collections.abc.Mapping object representing the DataFrame.
            The resulting transformation depends on the `orient` parameter.

        See Also
        --------
        DataFrame.from_dict : Create a DataFrame from a dictionary.
        DataFrame.to_json : Convert a DataFrame to JSON format.

        Examples
        --------
        >>> df = pd.DataFrame({'col1': [1, 2],
        ...                    'col2': [0.5, 0.75]},
        ...                   index=['row1', 'row2'])
        >>> df
              col1  col2
        row1     1  0.50
        row2     2  0.75
        >>> df.to_dict()
        {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}

        You can specify the return orientation.

        >>> df.to_dict('series')
        {'col1': row1    1
                 row2    2
        Name: col1, dtype: int64,
        'col2': row1    0.50
                row2    0.75
        Name: col2, dtype: float64}

        >>> df.to_dict('split')
        {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
         'data': [[1, 0.5], [2, 0.75]]}

        >>> df.to_dict('records')
        [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]

        >>> df.to_dict('index')
        {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}

        >>> df.to_dict('tight')
        {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
         'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}

        You can also specify the mapping type.

        >>> from collections import OrderedDict, defaultdict
        >>> df.to_dict(into=OrderedDict)
        OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
                     ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])

        If you want a `defaultdict`, you need to initialize it:

        >>> dd = defaultdict(list)
        >>> df.to_dict('records', into=dd)
        [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
         defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
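
        With ``index=False`` the index entry is dropped for the 'split' and
        'tight' orientations (a sketch of the ``index`` parameter described
        above):

        >>> df.to_dict('split', index=False)
        {'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]}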
  1727. """
  1728. from pandas.core.methods.to_dict import to_dict
  1729. return to_dict(self, orient, into, index)
  1730. def to_gbq(
  1731. self,
  1732. destination_table: str,
  1733. project_id: str | None = None,
  1734. chunksize: int | None = None,
  1735. reauth: bool = False,
  1736. if_exists: str = "fail",
  1737. auth_local_webserver: bool = True,
  1738. table_schema: list[dict[str, str]] | None = None,
  1739. location: str | None = None,
  1740. progress_bar: bool = True,
  1741. credentials=None,
  1742. ) -> None:
  1743. """
  1744. Write a DataFrame to a Google BigQuery table.
  1745. This function requires the `pandas-gbq package
  1746. <https://pandas-gbq.readthedocs.io>`__.
  1747. See the `How to authenticate with Google BigQuery
  1748. <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
  1749. guide for authentication instructions.
  1750. Parameters
  1751. ----------
  1752. destination_table : str
  1753. Name of table to be written, in the form ``dataset.tablename``.
  1754. project_id : str, optional
  1755. Google BigQuery Account project ID. Optional when available from
  1756. the environment.
  1757. chunksize : int, optional
  1758. Number of rows to be inserted in each chunk from the dataframe.
  1759. Set to ``None`` to load the whole dataframe at once.
  1760. reauth : bool, default False
  1761. Force Google BigQuery to re-authenticate the user. This is useful
  1762. if multiple accounts are used.
  1763. if_exists : str, default 'fail'
  1764. Behavior when the destination table exists. Value can be one of:
  1765. ``'fail'``
  1766. If table exists raise pandas_gbq.gbq.TableCreationError.
  1767. ``'replace'``
  1768. If table exists, drop it, recreate it, and insert data.
  1769. ``'append'``
  1770. If table exists, insert data. Create if does not exist.
  1771. auth_local_webserver : bool, default True
  1772. Use the `local webserver flow`_ instead of the `console flow`_
  1773. when getting user credentials.
  1774. .. _local webserver flow:
  1775. https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
  1776. .. _console flow:
  1777. https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
  1778. *New in version 0.2.0 of pandas-gbq*.
  1779. .. versionchanged:: 1.5.0
  1780. Default value is changed to ``True``. Google has deprecated the
  1781. ``auth_local_webserver = False`` `"out of band" (copy-paste)
  1782. flow
  1783. <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
  1784. table_schema : list of dicts, optional
  1785. List of BigQuery table fields to which according DataFrame
  1786. columns conform to, e.g. ``[{'name': 'col1', 'type':
  1787. 'STRING'},...]``. If schema is not provided, it will be
  1788. generated according to dtypes of DataFrame columns. See
  1789. BigQuery API documentation on available names of a field.
  1790. *New in version 0.3.1 of pandas-gbq*.
  1791. location : str, optional
  1792. Location where the load job should run. See the `BigQuery locations
  1793. documentation
  1794. <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
  1795. list of available locations. The location must match that of the
  1796. target dataset.
  1797. *New in version 0.5.0 of pandas-gbq*.
  1798. progress_bar : bool, default True
  1799. Use the library `tqdm` to show the progress bar for the upload,
  1800. chunk by chunk.
  1801. *New in version 0.5.0 of pandas-gbq*.
  1802. credentials : google.auth.credentials.Credentials, optional
  1803. Credentials for accessing Google APIs. Use this parameter to
  1804. override default credentials, such as to use Compute Engine
  1805. :class:`google.auth.compute_engine.Credentials` or Service
  1806. Account :class:`google.oauth2.service_account.Credentials`
  1807. directly.
  1808. *New in version 0.8.0 of pandas-gbq*.
  1809. See Also
  1810. --------
  1811. pandas_gbq.to_gbq : This function in the pandas-gbq library.
  1812. read_gbq : Read a DataFrame from Google BigQuery.
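Examples
--------
A minimal, illustrative sketch (assumes a valid Google Cloud project
and credentials; the table and project names below are hypothetical,
so the example is not executed):
>>> df = pd.DataFrame({'my_string': ['a', 'b'], 'my_int': [1, 2]})
>>> df.to_gbq('my_dataset.my_table',
...           project_id='my-project')  # doctest: +SKIP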
  1813. """
  1814. from pandas.io import gbq
  1815. gbq.to_gbq(
  1816. self,
  1817. destination_table,
  1818. project_id=project_id,
  1819. chunksize=chunksize,
  1820. reauth=reauth,
  1821. if_exists=if_exists,
  1822. auth_local_webserver=auth_local_webserver,
  1823. table_schema=table_schema,
  1824. location=location,
  1825. progress_bar=progress_bar,
  1826. credentials=credentials,
  1827. )
  1828. @classmethod
  1829. def from_records(
  1830. cls,
  1831. data,
  1832. index=None,
  1833. exclude=None,
  1834. columns=None,
  1835. coerce_float: bool = False,
  1836. nrows: int | None = None,
  1837. ) -> DataFrame:
  1838. """
  1839. Convert structured or record ndarray to DataFrame.
  1840. Creates a DataFrame object from a structured ndarray, sequence of
  1841. tuples or dicts, or DataFrame.
  1842. Parameters
  1843. ----------
  1844. data : structured ndarray, sequence of tuples or dicts, or DataFrame
  1845. Structured input data.
  1846. index : str, list of fields, array-like
  1847. Field of array to use as the index, alternately a specific set of
  1848. input labels to use.
  1849. exclude : sequence, default None
  1850. Columns or fields to exclude.
  1851. columns : sequence, default None
  1852. Column names to use. If the passed data do not have names
  1853. associated with them, this argument provides names for the
  1854. columns. Otherwise this argument indicates the order of the columns
  1855. in the result (any names not found in the data will become all-NA
  1856. columns).
  1857. coerce_float : bool, default False
  1858. Attempt to convert values of non-string, non-numeric objects (like
  1859. decimal.Decimal) to floating point, useful for SQL result sets.
  1860. nrows : int, default None
  1861. Number of rows to read if data is an iterator.
  1862. Returns
  1863. -------
  1864. DataFrame
  1865. See Also
  1866. --------
  1867. DataFrame.from_dict : DataFrame from dict of array-like or dicts.
  1868. DataFrame : DataFrame object creation using constructor.
  1869. Examples
  1870. --------
  1871. Data can be provided as a structured ndarray:
  1872. >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
  1873. ... dtype=[('col_1', 'i4'), ('col_2', 'U1')])
  1874. >>> pd.DataFrame.from_records(data)
  1875. col_1 col_2
  1876. 0 3 a
  1877. 1 2 b
  1878. 2 1 c
  1879. 3 0 d
  1880. Data can be provided as a list of dicts:
  1881. >>> data = [{'col_1': 3, 'col_2': 'a'},
  1882. ... {'col_1': 2, 'col_2': 'b'},
  1883. ... {'col_1': 1, 'col_2': 'c'},
  1884. ... {'col_1': 0, 'col_2': 'd'}]
  1885. >>> pd.DataFrame.from_records(data)
  1886. col_1 col_2
  1887. 0 3 a
  1888. 1 2 b
  1889. 2 1 c
  1890. 3 0 d
  1891. Data can be provided as a list of tuples with corresponding columns:
  1892. >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
  1893. >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
  1894. col_1 col_2
  1895. 0 3 a
  1896. 1 2 b
  1897. 2 1 c
  1898. 3 0 d
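A field of a structured array can also be used as the index; an
indicative sketch (skipped, exact rendering may differ):
>>> data = np.array([(3, 'a'), (2, 'b')],
...                 dtype=[('col_1', 'i4'), ('col_2', 'U1')])
>>> pd.DataFrame.from_records(data, index='col_1')  # doctest: +SKIP
      col_2
col_1
3         a
2         b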
  1899. """
  1900. if isinstance(data, DataFrame):
  1901. if columns is not None:
  1902. if is_scalar(columns):
  1903. columns = [columns]
  1904. data = data[columns]
  1905. if index is not None:
  1906. data = data.set_index(index)
  1907. if exclude is not None:
  1908. data = data.drop(columns=exclude)
  1909. return data.copy(deep=False)
  1910. result_index = None
  1911. # Make a copy of the input columns so we can modify it
  1912. if columns is not None:
  1913. columns = ensure_index(columns)
  1914. def maybe_reorder(
  1915. arrays: list[ArrayLike], arr_columns: Index, columns: Index, index
  1916. ) -> tuple[list[ArrayLike], Index, Index | None]:
  1917. """
  1918. If our desired 'columns' do not match the data's pre-existing 'arr_columns',
  1919. we re-order our arrays. This is like a pre-emptive (cheap) reindex.
  1920. """
  1921. if len(arrays):
  1922. length = len(arrays[0])
  1923. else:
  1924. length = 0
  1925. result_index = None
  1926. if len(arrays) == 0 and index is None and length == 0:
  1927. result_index = default_index(0)
  1928. arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
  1929. return arrays, arr_columns, result_index
  1930. if is_iterator(data):
  1931. if nrows == 0:
  1932. return cls()
  1933. try:
  1934. first_row = next(data)
  1935. except StopIteration:
  1936. return cls(index=index, columns=columns)
  1937. dtype = None
  1938. if hasattr(first_row, "dtype") and first_row.dtype.names:
  1939. dtype = first_row.dtype
  1940. values = [first_row]
  1941. if nrows is None:
  1942. values += data
  1943. else:
  1944. values.extend(itertools.islice(data, nrows - 1))
  1945. if dtype is not None:
  1946. data = np.array(values, dtype=dtype)
  1947. else:
  1948. data = values
  1949. if isinstance(data, dict):
  1950. if columns is None:
  1951. columns = arr_columns = ensure_index(sorted(data))
  1952. arrays = [data[k] for k in columns]
  1953. else:
  1954. arrays = []
  1955. arr_columns_list = []
  1956. for k, v in data.items():
  1957. if k in columns:
  1958. arr_columns_list.append(k)
  1959. arrays.append(v)
  1960. arr_columns = Index(arr_columns_list)
  1961. arrays, arr_columns, result_index = maybe_reorder(
  1962. arrays, arr_columns, columns, index
  1963. )
  1964. elif isinstance(data, (np.ndarray, DataFrame)):
  1965. arrays, columns = to_arrays(data, columns)
  1966. arr_columns = columns
  1967. else:
  1968. arrays, arr_columns = to_arrays(data, columns)
  1969. if coerce_float:
  1970. for i, arr in enumerate(arrays):
  1971. if arr.dtype == object:
  1972. # error: Argument 1 to "maybe_convert_objects" has
  1973. # incompatible type "Union[ExtensionArray, ndarray]";
  1974. # expected "ndarray"
  1975. arrays[i] = lib.maybe_convert_objects(
  1976. arr, # type: ignore[arg-type]
  1977. try_float=True,
  1978. )
  1979. arr_columns = ensure_index(arr_columns)
  1980. if columns is None:
  1981. columns = arr_columns
  1982. else:
  1983. arrays, arr_columns, result_index = maybe_reorder(
  1984. arrays, arr_columns, columns, index
  1985. )
  1986. if exclude is None:
  1987. exclude = set()
  1988. else:
  1989. exclude = set(exclude)
  1990. if index is not None:
  1991. if isinstance(index, str) or not hasattr(index, "__iter__"):
  1992. i = columns.get_loc(index)
  1993. exclude.add(index)
  1994. if len(arrays) > 0:
  1995. result_index = Index(arrays[i], name=index)
  1996. else:
  1997. result_index = Index([], name=index)
  1998. else:
  1999. try:
  2000. index_data = [arrays[arr_columns.get_loc(field)] for field in index]
  2001. except (KeyError, TypeError):
  2002. # raised by get_loc, see GH#29258
  2003. result_index = index
  2004. else:
  2005. result_index = ensure_index_from_sequences(index_data, names=index)
  2006. exclude.update(index)
  2007. if any(exclude):
  2008. arr_exclude = [x for x in exclude if x in arr_columns]
  2009. to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
  2010. arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
  2011. columns = columns.drop(exclude)
  2012. manager = get_option("mode.data_manager")
  2013. mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)
  2014. return cls(mgr)
  2015. def to_records(
  2016. self, index: bool = True, column_dtypes=None, index_dtypes=None
  2017. ) -> np.recarray:
  2018. """
  2019. Convert DataFrame to a NumPy record array.
  2020. Index will be included as the first field of the record array if
  2021. requested.
  2022. Parameters
  2023. ----------
  2024. index : bool, default True
  2025. Include index in resulting record array, stored in 'index'
  2026. field or using the index label, if set.
  2027. column_dtypes : str, type, dict, default None
  2028. If a string or type, the data type to store all columns. If
  2029. a dictionary, a mapping of column names and indices (zero-indexed)
  2030. to specific data types.
  2031. index_dtypes : str, type, dict, default None
  2032. If a string or type, the data type to store all index levels. If
  2033. a dictionary, a mapping of index level names and indices
  2034. (zero-indexed) to specific data types.
  2035. This mapping is applied only if `index=True`.
  2036. Returns
  2037. -------
  2038. numpy.recarray
  2039. NumPy ndarray with the DataFrame labels as fields and each row
  2040. of the DataFrame as entries.
  2041. See Also
  2042. --------
  2043. DataFrame.from_records: Convert structured or record ndarray
  2044. to DataFrame.
  2045. numpy.recarray: An ndarray that allows field access using
  2046. attributes, analogous to typed columns in a
  2047. spreadsheet.
  2048. Examples
  2049. --------
  2050. >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
  2051. ... index=['a', 'b'])
  2052. >>> df
  2053. A B
  2054. a 1 0.50
  2055. b 2 0.75
  2056. >>> df.to_records()
  2057. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  2058. dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
  2059. If the DataFrame index has no label then the recarray field name
  2060. is set to 'index'. If the index has a label then this is used as the
  2061. field name:
  2062. >>> df.index = df.index.rename("I")
  2063. >>> df.to_records()
  2064. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  2065. dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
  2066. The index can be excluded from the record array:
  2067. >>> df.to_records(index=False)
  2068. rec.array([(1, 0.5 ), (2, 0.75)],
  2069. dtype=[('A', '<i8'), ('B', '<f8')])
  2070. Data types can be specified for the columns:
  2071. >>> df.to_records(column_dtypes={"A": "int32"})
  2072. rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
  2073. dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
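Positions (zero-indexed) can be used in the dtype mappings instead of
names; an indicative sketch (skipped, rendering may differ):
>>> df.to_records(column_dtypes={0: "int32"})  # doctest: +SKIP
rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
          dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])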
  2074. As well as for the index:
  2075. >>> df.to_records(index_dtypes="<S2")
  2076. rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
  2077. dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
  2078. >>> index_dtypes = f"<S{df.index.str.len().max()}"
  2079. >>> df.to_records(index_dtypes=index_dtypes)
  2080. rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
  2081. dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
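The record array can be round-tripped through ``from_records``; an
indicative sketch (skipped, display details may vary):
>>> pd.DataFrame.from_records(df.to_records(), index='I')  # doctest: +SKIP
   A     B
I
a  1  0.50
b  2  0.75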
  2082. """
  2083. if index:
  2084. ix_vals = [
  2085. np.asarray(self.index.get_level_values(i))
  2086. for i in range(self.index.nlevels)
  2087. ]
  2088. arrays = ix_vals + [
  2089. np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
  2090. ]
  2091. index_names = list(self.index.names)
  2092. if isinstance(self.index, MultiIndex):
  2093. index_names = com.fill_missing_names(index_names)
  2094. elif index_names[0] is None:
  2095. index_names = ["index"]
  2096. names = [str(name) for name in itertools.chain(index_names, self.columns)]
  2097. else:
  2098. arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
  2099. names = [str(c) for c in self.columns]
  2100. index_names = []
  2101. index_len = len(index_names)
  2102. formats = []
  2103. for i, v in enumerate(arrays):
  2104. index_int = i
  2105. # When the names and arrays are collected, we
  2106. # first collect those in the DataFrame's index,
  2107. # followed by those in its columns.
  2108. #
  2109. # Thus, the total length of the array is:
  2110. # len(index_names) + len(DataFrame.columns).
  2111. #
  2112. # This check allows us to see whether we are
  2113. # handling a name / array in the index or column.
  2114. if index_int < index_len:
  2115. dtype_mapping = index_dtypes
  2116. name = index_names[index_int]
  2117. else:
  2118. index_int -= index_len
  2119. dtype_mapping = column_dtypes
  2120. name = self.columns[index_int]
  2121. # We have a dictionary, so we get the data type
  2122. # associated with the index or column (which can
  2123. # be denoted by its name in the DataFrame or its
  2124. # position in DataFrame's array of indices or
2125. # columns, whichever is applicable).
  2126. if is_dict_like(dtype_mapping):
  2127. if name in dtype_mapping:
  2128. dtype_mapping = dtype_mapping[name]
  2129. elif index_int in dtype_mapping:
  2130. dtype_mapping = dtype_mapping[index_int]
  2131. else:
  2132. dtype_mapping = None
  2133. # If no mapping can be found, use the array's
  2134. # dtype attribute for formatting.
  2135. #
  2136. # A valid dtype must either be a type or
  2137. # string naming a type.
  2138. if dtype_mapping is None:
  2139. formats.append(v.dtype)
  2140. elif isinstance(dtype_mapping, (type, np.dtype, str)):
  2141. # error: Argument 1 to "append" of "list" has incompatible
  2142. # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
  2143. formats.append(dtype_mapping) # type: ignore[arg-type]
  2144. else:
  2145. element = "row" if i < index_len else "column"
  2146. msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
  2147. raise ValueError(msg)
  2148. return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
  2149. @classmethod
  2150. def _from_arrays(
  2151. cls,
  2152. arrays,
  2153. columns,
  2154. index,
  2155. dtype: Dtype | None = None,
  2156. verify_integrity: bool = True,
  2157. ) -> DataFrame:
  2158. """
  2159. Create DataFrame from a list of arrays corresponding to the columns.
  2160. Parameters
  2161. ----------
  2162. arrays : list-like of arrays
  2163. Each array in the list corresponds to one column, in order.
  2164. columns : list-like, Index
  2165. The column names for the resulting DataFrame.
  2166. index : list-like, Index
2167. The row labels for the resulting DataFrame.
  2168. dtype : dtype, optional
  2169. Optional dtype to enforce for all arrays.
  2170. verify_integrity : bool, default True
  2171. Validate and homogenize all input. If set to False, it is assumed
2172. that all elements of `arrays` are actual arrays, as they will be
2173. stored in a block (numpy ndarray or ExtensionArray), that they have
2174. the same length as, and are aligned with, the index, and that
2175. `columns` and `index` are already ``Index`` objects.
  2176. Returns
  2177. -------
  2178. DataFrame
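Examples
--------
This is a private constructor; a minimal sketch for orientation only
(the arrays and labels are made up, so the example is not executed):
>>> arrays = [np.array([1, 2]), np.array(['a', 'b'])]
>>> pd.DataFrame._from_arrays(arrays, columns=['x', 'y'],
...                           index=[0, 1])  # doctest: +SKIP
   x  y
0  1  a
1  2  b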
  2179. """
  2180. if dtype is not None:
  2181. dtype = pandas_dtype(dtype)
  2182. manager = get_option("mode.data_manager")
  2183. columns = ensure_index(columns)
  2184. if len(columns) != len(arrays):
  2185. raise ValueError("len(columns) must match len(arrays)")
  2186. mgr = arrays_to_mgr(
  2187. arrays,
  2188. columns,
  2189. index,
  2190. dtype=dtype,
  2191. verify_integrity=verify_integrity,
  2192. typ=manager,
  2193. )
  2194. return cls(mgr)
  2195. @doc(
  2196. storage_options=_shared_docs["storage_options"],
  2197. compression_options=_shared_docs["compression_options"] % "path",
  2198. )
  2199. def to_stata(
  2200. self,
  2201. path: FilePath | WriteBuffer[bytes],
  2202. *,
  2203. convert_dates: dict[Hashable, str] | None = None,
  2204. write_index: bool = True,
  2205. byteorder: str | None = None,
  2206. time_stamp: datetime.datetime | None = None,
  2207. data_label: str | None = None,
  2208. variable_labels: dict[Hashable, str] | None = None,
  2209. version: int | None = 114,
  2210. convert_strl: Sequence[Hashable] | None = None,
  2211. compression: CompressionOptions = "infer",
  2212. storage_options: StorageOptions = None,
  2213. value_labels: dict[Hashable, dict[float, str]] | None = None,
  2214. ) -> None:
  2215. """
  2216. Export DataFrame object to Stata dta format.
  2217. Writes the DataFrame to a Stata dataset file.
  2218. "dta" files contain a Stata dataset.
  2219. Parameters
  2220. ----------
  2221. path : str, path object, or buffer
  2222. String, path object (implementing ``os.PathLike[str]``), or file-like
  2223. object implementing a binary ``write()`` function.
  2224. convert_dates : dict
2225. Dictionary mapping columns containing datetime types to the Stata
2226. internal format to use when writing the dates. Options are 'tc',
  2227. 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
  2228. or a name. Datetime columns that do not have a conversion type
  2229. specified will be converted to 'tc'. Raises NotImplementedError if
  2230. a datetime column has timezone information.
  2231. write_index : bool
  2232. Write the index to Stata dataset.
  2233. byteorder : str
2234. Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
  2235. time_stamp : datetime
  2236. A datetime to use as file creation date. Default is the current
  2237. time.
  2238. data_label : str, optional
  2239. A label for the data set. Must be 80 characters or smaller.
  2240. variable_labels : dict
  2241. Dictionary containing columns as keys and variable labels as
  2242. values. Each label must be 80 characters or smaller.
  2243. version : {{114, 117, 118, 119, None}}, default 114
  2244. Version to use in the output dta file. Set to None to let pandas
  2245. decide between 118 or 119 formats depending on the number of
  2246. columns in the frame. Version 114 can be read by Stata 10 and
  2247. later. Version 117 can be read by Stata 13 or later. Version 118
  2248. is supported in Stata 14 and later. Version 119 is supported in
  2249. Stata 15 and later. Version 114 limits string variables to 244
  2250. characters or fewer while versions 117 and later allow strings
  2251. with lengths up to 2,000,000 characters. Versions 118 and 119
  2252. support Unicode characters, and version 119 supports more than
  2253. 32,767 variables.
  2254. Version 119 should usually only be used when the number of
  2255. variables exceeds the capacity of dta format 118. Exporting
  2256. smaller datasets in format 119 may have unintended consequences,
  2257. and, as of November 2020, Stata SE cannot read version 119 files.
  2258. convert_strl : list, optional
2259. List of column names to convert to the Stata StrL
2260. format. Only available if version is 117 or later. Storing strings in the
  2261. StrL format can produce smaller dta files if strings have more than
  2262. 8 characters and values are repeated.
  2263. {compression_options}
  2264. .. versionadded:: 1.1.0
  2265. .. versionchanged:: 1.4.0 Zstandard support.
  2266. {storage_options}
  2267. .. versionadded:: 1.2.0
  2268. value_labels : dict of dicts
  2269. Dictionary containing columns as keys and dictionaries of column value
  2270. to labels as values. Labels for a single variable must be 32,000
  2271. characters or smaller.
  2272. .. versionadded:: 1.4.0
  2273. Raises
  2274. ------
  2275. NotImplementedError
  2276. * If datetimes contain timezone information
  2277. * Column dtype is not representable in Stata
  2278. ValueError
  2279. * Columns listed in convert_dates are neither datetime64[ns]
2280. nor datetime.datetime
2281. * A column listed in convert_dates is not in the DataFrame
  2282. * Categorical label contains more than 32,000 characters
  2283. See Also
  2284. --------
  2285. read_stata : Import Stata data files.
  2286. io.stata.StataWriter : Low-level writer for Stata data files.
  2287. io.stata.StataWriter117 : Low-level writer for version 117 files.
  2288. Examples
  2289. --------
  2290. >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
  2291. ... 'parrot'],
  2292. ... 'speed': [350, 18, 361, 15]}})
  2293. >>> df.to_stata('animals.dta') # doctest: +SKIP
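The file can then be read back with ``read_stata``; an indicative
sketch (not executed here):
>>> pd.read_stata('animals.dta')  # doctest: +SKIP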
  2294. """
  2295. if version not in (114, 117, 118, 119, None):
  2296. raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
  2297. if version == 114:
  2298. if convert_strl is not None:
  2299. raise ValueError("strl is not supported in format 114")
  2300. from pandas.io.stata import StataWriter as statawriter
  2301. elif version == 117:
  2302. # Incompatible import of "statawriter" (imported name has type
  2303. # "Type[StataWriter117]", local name has type "Type[StataWriter]")
  2304. from pandas.io.stata import ( # type: ignore[assignment]
  2305. StataWriter117 as statawriter,
  2306. )
  2307. else: # versions 118 and 119
  2308. # Incompatible import of "statawriter" (imported name has type
  2309. # "Type[StataWriter117]", local name has type "Type[StataWriter]")
  2310. from pandas.io.stata import ( # type: ignore[assignment]
  2311. StataWriterUTF8 as statawriter,
  2312. )
  2313. kwargs: dict[str, Any] = {}
  2314. if version is None or version >= 117:
  2315. # strl conversion is only supported >= 117
  2316. kwargs["convert_strl"] = convert_strl
  2317. if version is None or version >= 118:
  2318. # Specifying the version is only supported for UTF8 (118 or 119)
  2319. kwargs["version"] = version
  2320. writer = statawriter(
  2321. path,
  2322. self,
  2323. convert_dates=convert_dates,
  2324. byteorder=byteorder,
  2325. time_stamp=time_stamp,
  2326. data_label=data_label,
  2327. write_index=write_index,
  2328. variable_labels=variable_labels,
  2329. compression=compression,
  2330. storage_options=storage_options,
  2331. value_labels=value_labels,
  2332. **kwargs,
  2333. )
  2334. writer.write_file()
  2335. def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
  2336. """
  2337. Write a DataFrame to the binary Feather format.
  2338. Parameters
  2339. ----------
  2340. path : str, path object, file-like object
  2341. String, path object (implementing ``os.PathLike[str]``), or file-like
2342. object implementing a binary ``write()`` function.
  2344. **kwargs :
  2345. Additional keywords passed to :func:`pyarrow.feather.write_feather`.
  2346. Starting with pyarrow 0.17, this includes the `compression`,
  2347. `compression_level`, `chunksize` and `version` keywords.
  2348. .. versionadded:: 1.1.0
  2349. Notes
  2350. -----
  2351. This function writes the dataframe as a `feather file
  2352. <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
2353. index. To save a DataFrame with a custom index, use a method that
2354. supports custom indices, e.g. `to_parquet`.
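Examples
--------
A minimal, illustrative round trip (the file name is arbitrary, the
example is not executed, and pyarrow must be installed):
>>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
>>> df.to_feather('out.feather')  # doctest: +SKIP
>>> pd.read_feather('out.feather')  # doctest: +SKIP
   col1  col2
0     1     3
1     2     4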
  2355. """
  2356. from pandas.io.feather_format import to_feather
  2357. to_feather(self, path, **kwargs)
  2358. @doc(
  2359. Series.to_markdown,
  2360. klass=_shared_doc_kwargs["klass"],
  2361. storage_options=_shared_docs["storage_options"],
  2362. examples="""Examples
  2363. --------
  2364. >>> df = pd.DataFrame(
  2365. ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
  2366. ... )
  2367. >>> print(df.to_markdown())
  2368. | | animal_1 | animal_2 |
  2369. |---:|:-----------|:-----------|
  2370. | 0 | elk | dog |
  2371. | 1 | pig | quetzal |
  2372. Output markdown with a tabulate option.
  2373. >>> print(df.to_markdown(tablefmt="grid"))
  2374. +----+------------+------------+
  2375. | | animal_1 | animal_2 |
  2376. +====+============+============+
  2377. | 0 | elk | dog |
  2378. +----+------------+------------+
  2379. | 1 | pig | quetzal |
  2380. +----+------------+------------+""",
  2381. )
  2382. def to_markdown(
  2383. self,
  2384. buf: FilePath | WriteBuffer[str] | None = None,
  2385. mode: str = "wt",
  2386. index: bool = True,
  2387. storage_options: StorageOptions = None,
  2388. **kwargs,
  2389. ) -> str | None:
  2390. if "showindex" in kwargs:
  2391. raise ValueError("Pass 'index' instead of 'showindex")
  2392. kwargs.setdefault("headers", "keys")
  2393. kwargs.setdefault("tablefmt", "pipe")
  2394. kwargs.setdefault("showindex", index)
  2395. tabulate = import_optional_dependency("tabulate")
  2396. result = tabulate.tabulate(self, **kwargs)
  2397. if buf is None:
  2398. return result
  2399. with get_handle(buf, mode, storage_options=storage_options) as handles:
  2400. handles.handle.write(result)
  2401. return None
  2402. @overload
  2403. def to_parquet(
  2404. self,
  2405. path: None = ...,
  2406. engine: str = ...,
  2407. compression: str | None = ...,
  2408. index: bool | None = ...,
  2409. partition_cols: list[str] | None = ...,
  2410. storage_options: StorageOptions = ...,
  2411. **kwargs,
  2412. ) -> bytes:
  2413. ...
  2414. @overload
  2415. def to_parquet(
  2416. self,
  2417. path: FilePath | WriteBuffer[bytes],
  2418. engine: str = ...,
  2419. compression: str | None = ...,
  2420. index: bool | None = ...,
  2421. partition_cols: list[str] | None = ...,
  2422. storage_options: StorageOptions = ...,
  2423. **kwargs,
  2424. ) -> None:
  2425. ...
  2426. @doc(storage_options=_shared_docs["storage_options"])
  2427. def to_parquet(
  2428. self,
  2429. path: FilePath | WriteBuffer[bytes] | None = None,
  2430. engine: str = "auto",
  2431. compression: str | None = "snappy",
  2432. index: bool | None = None,
  2433. partition_cols: list[str] | None = None,
  2434. storage_options: StorageOptions = None,
  2435. **kwargs,
  2436. ) -> bytes | None:
  2437. """
  2438. Write a DataFrame to the binary parquet format.
  2439. This function writes the dataframe as a `parquet file
  2440. <https://parquet.apache.org/>`_. You can choose different parquet
  2441. backends, and have the option of compression. See
  2442. :ref:`the user guide <io.parquet>` for more details.
  2443. Parameters
  2444. ----------
  2445. path : str, path object, file-like object, or None, default None
  2446. String, path object (implementing ``os.PathLike[str]``), or file-like
  2447. object implementing a binary ``write()`` function. If None, the result is
  2448. returned as bytes. If a string or path, it will be used as Root Directory
  2449. path when writing a partitioned dataset.
  2450. .. versionchanged:: 1.2.0
  2451. Previously this was "fname"
  2452. engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
  2453. Parquet library to use. If 'auto', then the option
  2454. ``io.parquet.engine`` is used. The default ``io.parquet.engine``
  2455. behavior is to try 'pyarrow', falling back to 'fastparquet' if
  2456. 'pyarrow' is unavailable.
  2457. compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy'
  2458. Name of the compression to use. Use ``None`` for no compression.
  2459. index : bool, default None
  2460. If ``True``, include the dataframe's index(es) in the file output.
  2461. If ``False``, they will not be written to the file.
2462. If ``None``, similar to ``True``, the dataframe's index(es)
  2463. will be saved. However, instead of being saved as values,
  2464. the RangeIndex will be stored as a range in the metadata so it
  2465. doesn't require much space and is faster. Other indexes will
  2466. be included as columns in the file output.
  2467. partition_cols : list, optional, default None
  2468. Column names by which to partition the dataset.
  2469. Columns are partitioned in the order they are given.
  2470. Must be None if path is not a string.
  2471. {storage_options}
  2472. .. versionadded:: 1.2.0
  2473. **kwargs
  2474. Additional arguments passed to the parquet library. See
  2475. :ref:`pandas io <io.parquet>` for more details.
  2476. Returns
  2477. -------
  2478. bytes if no path argument is provided else None
  2479. See Also
  2480. --------
  2481. read_parquet : Read a parquet file.
  2482. DataFrame.to_orc : Write an orc file.
  2483. DataFrame.to_csv : Write a csv file.
  2484. DataFrame.to_sql : Write to a sql table.
  2485. DataFrame.to_hdf : Write to hdf.
  2486. Notes
  2487. -----
  2488. This function requires either the `fastparquet
  2489. <https://pypi.org/project/fastparquet>`_ or `pyarrow
  2490. <https://arrow.apache.org/docs/python/>`_ library.
  2491. Examples
  2492. --------
  2493. >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
  2494. >>> df.to_parquet('df.parquet.gzip',
  2495. ... compression='gzip') # doctest: +SKIP
  2496. >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
  2497. col1 col2
  2498. 0 1 3
  2499. 1 2 4
2500. If you want to get a buffer to the parquet content you can use an io.BytesIO
  2501. object, as long as you don't use partition_cols, which creates multiple files.
  2502. >>> import io
  2503. >>> f = io.BytesIO()
  2504. >>> df.to_parquet(f)
  2505. >>> f.seek(0)
  2506. 0
  2507. >>> content = f.read()
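Partitioning writes one directory per distinct value of the partition
columns; a brief sketch (the path is illustrative, not executed):
>>> df.to_parquet('dataset_root',
...               partition_cols=['col1'])  # doctest: +SKIP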
  2508. """
  2509. from pandas.io.parquet import to_parquet
  2510. return to_parquet(
  2511. self,
  2512. path,
  2513. engine,
  2514. compression=compression,
  2515. index=index,
  2516. partition_cols=partition_cols,
  2517. storage_options=storage_options,
  2518. **kwargs,
  2519. )
  2520. def to_orc(
  2521. self,
  2522. path: FilePath | WriteBuffer[bytes] | None = None,
  2523. *,
  2524. engine: Literal["pyarrow"] = "pyarrow",
  2525. index: bool | None = None,
  2526. engine_kwargs: dict[str, Any] | None = None,
  2527. ) -> bytes | None:
  2528. """
  2529. Write a DataFrame to the ORC format.
  2530. .. versionadded:: 1.5.0
  2531. Parameters
  2532. ----------
  2533. path : str, file-like object or None, default None
  2534. If a string, it will be used as Root Directory path
  2535. when writing a partitioned dataset. By file-like object,
  2536. we refer to objects with a write() method, such as a file handle
  2537. (e.g. via builtin open function). If path is None,
  2538. a bytes object is returned.
  2539. engine : str, default 'pyarrow'
  2540. ORC library to use. Pyarrow must be >= 7.0.0.
  2541. index : bool, optional
  2542. If ``True``, include the dataframe's index(es) in the file output.
  2543. If ``False``, they will not be written to the file.
2544. If ``None``, similar to ``infer``, the dataframe's index(es)
  2545. will be saved. However, instead of being saved as values,
  2546. the RangeIndex will be stored as a range in the metadata so it
  2547. doesn't require much space and is faster. Other indexes will
  2548. be included as columns in the file output.
  2549. engine_kwargs : dict[str, Any] or None, default None
  2550. Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
  2551. Returns
  2552. -------
  2553. bytes if no path argument is provided else None
  2554. Raises
  2555. ------
  2556. NotImplementedError
  2557. Dtype of one or more columns is category, unsigned integers, interval,
  2558. period or sparse.
  2559. ValueError
  2560. engine is not pyarrow.
  2561. See Also
  2562. --------
2563. read_orc : Read an ORC file.
  2564. DataFrame.to_parquet : Write a parquet file.
  2565. DataFrame.to_csv : Write a csv file.
  2566. DataFrame.to_sql : Write to a sql table.
  2567. DataFrame.to_hdf : Write to hdf.
  2568. Notes
  2569. -----
  2570. * Before using this function you should read the :ref:`user guide about
  2571. ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
2572. * This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_
2573. library.
  2574. * For supported dtypes please refer to `supported ORC features in Arrow
  2575. <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
  2576. * Currently timezones in datetime columns are not preserved when a
  2577. dataframe is converted into ORC files.
  2578. Examples
  2579. --------
  2580. >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
  2581. >>> df.to_orc('df.orc') # doctest: +SKIP
  2582. >>> pd.read_orc('df.orc') # doctest: +SKIP
  2583. col1 col2
  2584. 0 1 4
  2585. 1 2 3
2586. If you want to get a buffer to the ORC content, you can write it to io.BytesIO:
  2587. >>> import io
  2588. >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
  2589. >>> b.seek(0) # doctest: +SKIP
  2590. 0
  2591. >>> content = b.read() # doctest: +SKIP
  2592. """
  2593. from pandas.io.orc import to_orc
  2594. return to_orc(
  2595. self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
  2596. )
  2597. @overload
  2598. def to_html(
  2599. self,
  2600. buf: FilePath | WriteBuffer[str],
  2601. columns: Sequence[Level] | None = ...,
  2602. col_space: ColspaceArgType | None = ...,
  2603. header: bool | Sequence[str] = ...,
  2604. index: bool = ...,
  2605. na_rep: str = ...,
  2606. formatters: FormattersType | None = ...,
  2607. float_format: FloatFormatType | None = ...,
  2608. sparsify: bool | None = ...,
  2609. index_names: bool = ...,
  2610. justify: str | None = ...,
  2611. max_rows: int | None = ...,
  2612. max_cols: int | None = ...,
  2613. show_dimensions: bool | str = ...,
  2614. decimal: str = ...,
  2615. bold_rows: bool = ...,
  2616. classes: str | list | tuple | None = ...,
  2617. escape: bool = ...,
  2618. notebook: bool = ...,
  2619. border: int | bool | None = ...,
  2620. table_id: str | None = ...,
  2621. render_links: bool = ...,
  2622. encoding: str | None = ...,
  2623. ) -> None:
  2624. ...
  2625. @overload
  2626. def to_html(
  2627. self,
  2628. buf: None = ...,
  2629. columns: Sequence[Level] | None = ...,
  2630. col_space: ColspaceArgType | None = ...,
  2631. header: bool | Sequence[str] = ...,
  2632. index: bool = ...,
  2633. na_rep: str = ...,
  2634. formatters: FormattersType | None = ...,
  2635. float_format: FloatFormatType | None = ...,
  2636. sparsify: bool | None = ...,
  2637. index_names: bool = ...,
  2638. justify: str | None = ...,
  2639. max_rows: int | None = ...,
  2640. max_cols: int | None = ...,
  2641. show_dimensions: bool | str = ...,
  2642. decimal: str = ...,
  2643. bold_rows: bool = ...,
  2644. classes: str | list | tuple | None = ...,
  2645. escape: bool = ...,
  2646. notebook: bool = ...,
  2647. border: int | bool | None = ...,
  2648. table_id: str | None = ...,
  2649. render_links: bool = ...,
  2650. encoding: str | None = ...,
  2651. ) -> str:
  2652. ...
  2653. @Substitution(
  2654. header_type="bool",
  2655. header="Whether to print column labels, default True",
  2656. col_space_type="str or int, list or dict of int or str",
  2657. col_space="The minimum width of each column in CSS length "
  2658. "units. An int is assumed to be px units.",
  2659. )
  2660. @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
  2661. def to_html(
  2662. self,
  2663. buf: FilePath | WriteBuffer[str] | None = None,
  2664. columns: Sequence[Level] | None = None,
  2665. col_space: ColspaceArgType | None = None,
  2666. header: bool | Sequence[str] = True,
  2667. index: bool = True,
  2668. na_rep: str = "NaN",
  2669. formatters: FormattersType | None = None,
  2670. float_format: FloatFormatType | None = None,
  2671. sparsify: bool | None = None,
  2672. index_names: bool = True,
  2673. justify: str | None = None,
  2674. max_rows: int | None = None,
  2675. max_cols: int | None = None,
  2676. show_dimensions: bool | str = False,
  2677. decimal: str = ".",
  2678. bold_rows: bool = True,
  2679. classes: str | list | tuple | None = None,
  2680. escape: bool = True,
  2681. notebook: bool = False,
  2682. border: int | bool | None = None,
  2683. table_id: str | None = None,
  2684. render_links: bool = False,
  2685. encoding: str | None = None,
  2686. ) -> str | None:
  2687. """
  2688. Render a DataFrame as an HTML table.
  2689. %(shared_params)s
  2690. bold_rows : bool, default True
  2691. Make the row labels bold in the output.
  2692. classes : str or list or tuple, default None
  2693. CSS class(es) to apply to the resulting html table.
  2694. escape : bool, default True
  2695. Convert the characters <, >, and & to HTML-safe sequences.
  2696. notebook : {True, False}, default False
  2697. Whether the generated HTML is for IPython Notebook.
  2698. border : int
  2699. A ``border=border`` attribute is included in the opening
  2700. `<table>` tag. Default ``pd.options.display.html.border``.
  2701. table_id : str, optional
  2702. A css id is included in the opening `<table>` tag if specified.
  2703. render_links : bool, default False
  2704. Convert URLs to HTML links.
  2705. encoding : str, default "utf-8"
  2706. Set character encoding.
  2707. .. versionadded:: 1.0
  2708. %(returns)s
  2709. See Also
  2710. --------
  2711. to_string : Convert DataFrame to a string.
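Examples
--------
A short, indicative example (the rendered HTML is abbreviated below,
so the doctest is skipped):
>>> df = pd.DataFrame({'col1': [1, 2]})
>>> print(df.to_html())  # doctest: +SKIP
<table border="1" class="dataframe">
...
</table>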
  2712. """
  2713. if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS:
  2714. raise ValueError("Invalid value for justify parameter")
  2715. formatter = fmt.DataFrameFormatter(
  2716. self,
  2717. columns=columns,
  2718. col_space=col_space,
  2719. na_rep=na_rep,
  2720. header=header,
  2721. index=index,
  2722. formatters=formatters,
  2723. float_format=float_format,
  2724. bold_rows=bold_rows,
  2725. sparsify=sparsify,
  2726. justify=justify,
  2727. index_names=index_names,
  2728. escape=escape,
  2729. decimal=decimal,
  2730. max_rows=max_rows,
  2731. max_cols=max_cols,
  2732. show_dimensions=show_dimensions,
  2733. )
2734. # TODO: a generic formatter would be in DataFrameFormatter
  2735. return fmt.DataFrameRenderer(formatter).to_html(
  2736. buf=buf,
  2737. classes=classes,
  2738. notebook=notebook,
  2739. border=border,
  2740. encoding=encoding,
  2741. table_id=table_id,
  2742. render_links=render_links,
  2743. )
  2744. @doc(
  2745. storage_options=_shared_docs["storage_options"],
  2746. compression_options=_shared_docs["compression_options"] % "path_or_buffer",
  2747. )
  2748. def to_xml(
  2749. self,
  2750. path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
  2751. index: bool = True,
  2752. root_name: str | None = "data",
  2753. row_name: str | None = "row",
  2754. na_rep: str | None = None,
  2755. attr_cols: list[str] | None = None,
  2756. elem_cols: list[str] | None = None,
  2757. namespaces: dict[str | None, str] | None = None,
  2758. prefix: str | None = None,
  2759. encoding: str = "utf-8",
  2760. xml_declaration: bool | None = True,
  2761. pretty_print: bool | None = True,
  2762. parser: str | None = "lxml",
  2763. stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
  2764. compression: CompressionOptions = "infer",
  2765. storage_options: StorageOptions = None,
  2766. ) -> str | None:
  2767. """
  2768. Render a DataFrame to an XML document.
  2769. .. versionadded:: 1.3.0
  2770. Parameters
  2771. ----------
  2772. path_or_buffer : str, path object, file-like object, or None, default None
  2773. String, path object (implementing ``os.PathLike[str]``), or file-like
  2774. object implementing a ``write()`` function. If None, the result is returned
  2775. as a string.
  2776. index : bool, default True
  2777. Whether to include index in XML document.
  2778. root_name : str, default 'data'
  2779. The name of root element in XML document.
  2780. row_name : str, default 'row'
  2781. The name of row element in XML document.
  2782. na_rep : str, optional
  2783. Missing data representation.
  2784. attr_cols : list-like, optional
  2785. List of columns to write as attributes in row element.
  2786. Hierarchical columns will be flattened with underscore
  2787. delimiting the different levels.
  2788. elem_cols : list-like, optional
2789. List of columns to write as children in row element. By default,
2790. all columns are output as children of the row element. Hierarchical
  2791. columns will be flattened with underscore delimiting the
  2792. different levels.
  2793. namespaces : dict, optional
2794. All namespaces to be defined in the root element. Keys of the dict
2795. should be the prefix names and values the corresponding URIs.
2796. Default namespaces should be given an empty string key. For
  2797. example, ::
  2798. namespaces = {{"": "https://example.com"}}
  2799. prefix : str, optional
  2800. Namespace prefix to be used for every element and/or attribute
  2801. in document. This should be one of the keys in ``namespaces``
  2802. dict.
  2803. encoding : str, default 'utf-8'
  2804. Encoding of the resulting document.
  2805. xml_declaration : bool, default True
  2806. Whether to include the XML declaration at start of document.
  2807. pretty_print : bool, default True
  2808. Whether output should be pretty printed with indentation and
  2809. line breaks.
  2810. parser : {{'lxml','etree'}}, default 'lxml'
  2811. Parser module to use for building of tree. Only 'lxml' and
  2812. 'etree' are supported. With 'lxml', the ability to use XSLT
  2813. stylesheet is supported.
  2814. stylesheet : str, path object or file-like object, optional
  2815. A URL, file-like object, or a raw string containing an XSLT
  2816. script used to transform the raw XML output. Script should use
  2817. layout of elements and attributes from original output. This
  2818. argument requires ``lxml`` to be installed. Only XSLT 1.0
2819. scripts, and not later versions, are currently supported.
  2820. {compression_options}
  2821. .. versionchanged:: 1.4.0 Zstandard support.
  2822. {storage_options}
  2823. Returns
  2824. -------
  2825. None or str
2826. If ``path_or_buffer`` is None, returns the resulting XML format as a
  2827. string. Otherwise returns None.
  2828. See Also
  2829. --------
  2830. to_json : Convert the pandas object to a JSON string.
2831. to_html : Convert DataFrame to HTML.
  2832. Examples
  2833. --------
  2834. >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
  2835. ... 'degrees': [360, 360, 180],
  2836. ... 'sides': [4, np.nan, 3]}})
  2837. >>> df.to_xml() # doctest: +SKIP
  2838. <?xml version='1.0' encoding='utf-8'?>
  2839. <data>
  2840. <row>
  2841. <index>0</index>
  2842. <shape>square</shape>
  2843. <degrees>360</degrees>
  2844. <sides>4.0</sides>
  2845. </row>
  2846. <row>
  2847. <index>1</index>
  2848. <shape>circle</shape>
  2849. <degrees>360</degrees>
  2850. <sides/>
  2851. </row>
  2852. <row>
  2853. <index>2</index>
  2854. <shape>triangle</shape>
  2855. <degrees>180</degrees>
  2856. <sides>3.0</sides>
  2857. </row>
  2858. </data>
  2859. >>> df.to_xml(attr_cols=[
  2860. ... 'index', 'shape', 'degrees', 'sides'
  2861. ... ]) # doctest: +SKIP
  2862. <?xml version='1.0' encoding='utf-8'?>
  2863. <data>
  2864. <row index="0" shape="square" degrees="360" sides="4.0"/>
  2865. <row index="1" shape="circle" degrees="360"/>
  2866. <row index="2" shape="triangle" degrees="180" sides="3.0"/>
  2867. </data>
  2868. >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
  2869. ... prefix="doc") # doctest: +SKIP
  2870. <?xml version='1.0' encoding='utf-8'?>
  2871. <doc:data xmlns:doc="https://example.com">
  2872. <doc:row>
  2873. <doc:index>0</doc:index>
  2874. <doc:shape>square</doc:shape>
  2875. <doc:degrees>360</doc:degrees>
  2876. <doc:sides>4.0</doc:sides>
  2877. </doc:row>
  2878. <doc:row>
  2879. <doc:index>1</doc:index>
  2880. <doc:shape>circle</doc:shape>
  2881. <doc:degrees>360</doc:degrees>
  2882. <doc:sides/>
  2883. </doc:row>
  2884. <doc:row>
  2885. <doc:index>2</doc:index>
  2886. <doc:shape>triangle</doc:shape>
  2887. <doc:degrees>180</doc:degrees>
  2888. <doc:sides>3.0</doc:sides>
  2889. </doc:row>
  2890. </doc:data>
  2891. """
  2892. from pandas.io.formats.xml import (
  2893. EtreeXMLFormatter,
  2894. LxmlXMLFormatter,
  2895. )
  2896. lxml = import_optional_dependency("lxml.etree", errors="ignore")
  2897. TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter]
  2898. if parser == "lxml":
  2899. if lxml is not None:
  2900. TreeBuilder = LxmlXMLFormatter
  2901. else:
  2902. raise ImportError(
  2903. "lxml not found, please install or use the etree parser."
  2904. )
  2905. elif parser == "etree":
  2906. TreeBuilder = EtreeXMLFormatter
  2907. else:
  2908. raise ValueError("Values for parser can only be lxml or etree.")
  2909. xml_formatter = TreeBuilder(
  2910. self,
  2911. path_or_buffer=path_or_buffer,
  2912. index=index,
  2913. root_name=root_name,
  2914. row_name=row_name,
  2915. na_rep=na_rep,
  2916. attr_cols=attr_cols,
  2917. elem_cols=elem_cols,
  2918. namespaces=namespaces,
  2919. prefix=prefix,
  2920. encoding=encoding,
  2921. xml_declaration=xml_declaration,
  2922. pretty_print=pretty_print,
  2923. stylesheet=stylesheet,
  2924. compression=compression,
  2925. storage_options=storage_options,
  2926. )
  2927. return xml_formatter.write_output()
  2928. # ----------------------------------------------------------------------
  2929. @doc(INFO_DOCSTRING, **frame_sub_kwargs)
  2930. def info(
  2931. self,
  2932. verbose: bool | None = None,
  2933. buf: WriteBuffer[str] | None = None,
  2934. max_cols: int | None = None,
  2935. memory_usage: bool | str | None = None,
  2936. show_counts: bool | None = None,
  2937. ) -> None:
  2938. info = DataFrameInfo(
  2939. data=self,
  2940. memory_usage=memory_usage,
  2941. )
  2942. info.render(
  2943. buf=buf,
  2944. max_cols=max_cols,
  2945. verbose=verbose,
  2946. show_counts=show_counts,
  2947. )
  2948. def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
  2949. """
  2950. Return the memory usage of each column in bytes.
  2951. The memory usage can optionally include the contribution of
  2952. the index and elements of `object` dtype.
  2953. This value is displayed in `DataFrame.info` by default. This can be
  2954. suppressed by setting ``pandas.options.display.memory_usage`` to False.
  2955. Parameters
  2956. ----------
  2957. index : bool, default True
  2958. Specifies whether to include the memory usage of the DataFrame's
  2959. index in returned Series. If ``index=True``, the memory usage of
  2960. the index is the first item in the output.
  2961. deep : bool, default False
  2962. If True, introspect the data deeply by interrogating
  2963. `object` dtypes for system-level memory consumption, and include
  2964. it in the returned values.
  2965. Returns
  2966. -------
  2967. Series
  2968. A Series whose index is the original column names and whose values
2969. are the memory usage of each column in bytes.
  2970. See Also
  2971. --------
  2972. numpy.ndarray.nbytes : Total bytes consumed by the elements of an
  2973. ndarray.
  2974. Series.memory_usage : Bytes consumed by a Series.
  2975. Categorical : Memory-efficient array for string values with
  2976. many repeated values.
  2977. DataFrame.info : Concise summary of a DataFrame.
  2978. Notes
  2979. -----
  2980. See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
  2981. details.
  2982. Examples
  2983. --------
  2984. >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
  2985. >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
  2986. ... for t in dtypes])
  2987. >>> df = pd.DataFrame(data)
  2988. >>> df.head()
  2989. int64 float64 complex128 object bool
  2990. 0 1 1.0 1.0+0.0j 1 True
  2991. 1 1 1.0 1.0+0.0j 1 True
  2992. 2 1 1.0 1.0+0.0j 1 True
  2993. 3 1 1.0 1.0+0.0j 1 True
  2994. 4 1 1.0 1.0+0.0j 1 True
  2995. >>> df.memory_usage()
  2996. Index 128
  2997. int64 40000
  2998. float64 40000
  2999. complex128 80000
  3000. object 40000
  3001. bool 5000
  3002. dtype: int64
  3003. >>> df.memory_usage(index=False)
  3004. int64 40000
  3005. float64 40000
  3006. complex128 80000
  3007. object 40000
  3008. bool 5000
  3009. dtype: int64
  3010. The memory footprint of `object` dtype columns is ignored by default:
  3011. >>> df.memory_usage(deep=True)
  3012. Index 128
  3013. int64 40000
  3014. float64 40000
  3015. complex128 80000
  3016. object 180000
  3017. bool 5000
  3018. dtype: int64
  3019. Use a Categorical for efficient storage of an object-dtype column with
  3020. many repeated values.
  3021. >>> df['object'].astype('category').memory_usage(deep=True)
  3022. 5244
  3023. """
  3024. result = self._constructor_sliced(
3025. [c.memory_usage(index=False, deep=deep) for _, c in self.items()],
  3026. index=self.columns,
  3027. dtype=np.intp,
  3028. )
  3029. if index:
  3030. index_memory_usage = self._constructor_sliced(
  3031. self.index.memory_usage(deep=deep), index=["Index"]
  3032. )
  3033. result = index_memory_usage._append(result)
  3034. return result
  3035. def transpose(self, *args, copy: bool = False) -> DataFrame:
  3036. """
  3037. Transpose index and columns.
  3038. Reflect the DataFrame over its main diagonal by writing rows as columns
  3039. and vice-versa. The property :attr:`.T` is an accessor to the method
  3040. :meth:`transpose`.
  3041. Parameters
  3042. ----------
  3043. *args : tuple, optional
  3044. Accepted for compatibility with NumPy.
  3045. copy : bool, default False
  3046. Whether to copy the data after transposing, even for DataFrames
  3047. with a single dtype.
  3048. Note that a copy is always required for mixed dtype DataFrames,
  3049. or for DataFrames with any extension types.
  3050. Returns
  3051. -------
  3052. DataFrame
  3053. The transposed DataFrame.
  3054. See Also
  3055. --------
  3056. numpy.transpose : Permute the dimensions of a given array.
  3057. Notes
  3058. -----
  3059. Transposing a DataFrame with mixed dtypes will result in a homogeneous
  3060. DataFrame with the `object` dtype. In such a case, a copy of the data
  3061. is always made.
  3062. Examples
  3063. --------
  3064. **Square DataFrame with homogeneous dtype**
  3065. >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
  3066. >>> df1 = pd.DataFrame(data=d1)
  3067. >>> df1
  3068. col1 col2
  3069. 0 1 3
  3070. 1 2 4
  3071. >>> df1_transposed = df1.T # or df1.transpose()
  3072. >>> df1_transposed
  3073. 0 1
  3074. col1 1 2
  3075. col2 3 4
  3076. When the dtype is homogeneous in the original DataFrame, we get a
  3077. transposed DataFrame with the same dtype:
  3078. >>> df1.dtypes
  3079. col1 int64
  3080. col2 int64
  3081. dtype: object
  3082. >>> df1_transposed.dtypes
  3083. 0 int64
  3084. 1 int64
  3085. dtype: object
  3086. **Non-square DataFrame with mixed dtypes**
  3087. >>> d2 = {'name': ['Alice', 'Bob'],
  3088. ... 'score': [9.5, 8],
  3089. ... 'employed': [False, True],
  3090. ... 'kids': [0, 0]}
  3091. >>> df2 = pd.DataFrame(data=d2)
  3092. >>> df2
  3093. name score employed kids
  3094. 0 Alice 9.5 False 0
  3095. 1 Bob 8.0 True 0
  3096. >>> df2_transposed = df2.T # or df2.transpose()
  3097. >>> df2_transposed
  3098. 0 1
  3099. name Alice Bob
  3100. score 9.5 8.0
  3101. employed False True
  3102. kids 0 0
  3103. When the DataFrame has mixed dtypes, we get a transposed DataFrame with
  3104. the `object` dtype:
  3105. >>> df2.dtypes
  3106. name object
  3107. score float64
  3108. employed bool
  3109. kids int64
  3110. dtype: object
  3111. >>> df2_transposed.dtypes
  3112. 0 object
  3113. 1 object
  3114. dtype: object
  3115. """
  3116. nv.validate_transpose(args, {})
  3117. # construct the args
  3118. dtypes = list(self.dtypes)
  3119. if self._can_fast_transpose:
  3120. # Note: tests pass without this, but this improves perf quite a bit.
  3121. new_vals = self._values.T
  3122. if copy and not using_copy_on_write():
  3123. new_vals = new_vals.copy()
  3124. result = self._constructor(
  3125. new_vals, index=self.columns, columns=self.index, copy=False
  3126. )
  3127. if using_copy_on_write() and len(self) > 0:
  3128. result._mgr.add_references(self._mgr) # type: ignore[arg-type]
  3129. elif (
  3130. self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0])
  3131. ):
  3132. # We have EAs with the same dtype. We can preserve that dtype in transpose.
  3133. dtype = dtypes[0]
  3134. arr_type = dtype.construct_array_type()
  3135. values = self.values
  3136. new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
  3137. result = type(self)._from_arrays(
  3138. new_values, index=self.columns, columns=self.index
  3139. )
  3140. else:
  3141. new_arr = self.values.T
  3142. if copy and not using_copy_on_write():
  3143. new_arr = new_arr.copy()
  3144. result = self._constructor(
  3145. new_arr,
  3146. index=self.columns,
  3147. columns=self.index,
  3148. # We already made a copy (more than one block)
  3149. copy=False,
  3150. )
  3151. return result.__finalize__(self, method="transpose")
  3152. @property
  3153. def T(self) -> DataFrame:
  3154. """
  3155. The transpose of the DataFrame.
  3156. Returns
  3157. -------
  3158. DataFrame
  3159. The transposed DataFrame.
  3160. See Also
  3161. --------
  3162. DataFrame.transpose : Transpose index and columns.
  3163. Examples
  3164. --------
  3165. >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
  3166. >>> df
  3167. col1 col2
  3168. 0 1 3
  3169. 1 2 4
  3170. >>> df.T
  3171. 0 1
  3172. col1 1 2
  3173. col2 3 4
  3174. """
  3175. return self.transpose()
  3176. # ----------------------------------------------------------------------
  3177. # Indexing Methods
  3178. def _ixs(self, i: int, axis: AxisInt = 0) -> Series:
  3179. """
  3180. Parameters
  3181. ----------
  3182. i : int
  3183. axis : int
  3184. Returns
  3185. -------
  3186. Series
  3187. """
  3188. # irow
  3189. if axis == 0:
  3190. new_mgr = self._mgr.fast_xs(i)
  3191. # if we are a copy, mark as such
  3192. copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
  3193. result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__(
  3194. self
  3195. )
  3196. result._set_is_copy(self, copy=copy)
  3197. return result
  3198. # icol
  3199. else:
  3200. label = self.columns[i]
  3201. col_mgr = self._mgr.iget(i)
  3202. result = self._box_col_values(col_mgr, i)
  3203. # this is a cached value, mark it so
  3204. result._set_as_cached(label, self)
  3205. return result
  3206. def _get_column_array(self, i: int) -> ArrayLike:
  3207. """
  3208. Get the values of the i'th column (ndarray or ExtensionArray, as stored
  3209. in the Block)
  3210. Warning! The returned array is a view but doesn't handle Copy-on-Write,
  3211. so this should be used with caution (for read-only purposes).
  3212. """
  3213. return self._mgr.iget_values(i)
  3214. def _iter_column_arrays(self) -> Iterator[ArrayLike]:
  3215. """
  3216. Iterate over the arrays of all columns in order.
  3217. This returns the values as stored in the Block (ndarray or ExtensionArray).
  3218. Warning! The returned array is a view but doesn't handle Copy-on-Write,
  3219. so this should be used with caution (for read-only purposes).
  3220. """
  3221. for i in range(len(self.columns)):
  3222. yield self._get_column_array(i)
  3223. def _getitem_nocopy(self, key: list):
  3224. """
  3225. Behaves like __getitem__, but returns a view in cases where __getitem__
  3226. would make a copy.
  3227. """
  3228. # TODO(CoW): can be removed if/when we are always Copy-on-Write
  3229. indexer = self.columns._get_indexer_strict(key, "columns")[1]
  3230. new_axis = self.columns[indexer]
  3231. new_mgr = self._mgr.reindex_indexer(
  3232. new_axis,
  3233. indexer,
  3234. axis=0,
  3235. allow_dups=True,
  3236. copy=False,
  3237. only_slice=True,
  3238. )
  3239. return self._constructor(new_mgr)
  3240. def __getitem__(self, key):
  3241. check_dict_or_set_indexers(key)
  3242. key = lib.item_from_zerodim(key)
  3243. key = com.apply_if_callable(key, self)
  3244. if is_hashable(key) and not is_iterator(key):
  3245. # is_iterator to exclude generator e.g. test_getitem_listlike
  3246. # shortcut if the key is in columns
  3247. is_mi = isinstance(self.columns, MultiIndex)
  3248. # GH#45316 Return view if key is not duplicated
  3249. # Only use drop_duplicates with duplicates for performance
  3250. if not is_mi and (
  3251. self.columns.is_unique
  3252. and key in self.columns
  3253. or key in self.columns.drop_duplicates(keep=False)
  3254. ):
  3255. return self._get_item_cache(key)
  3256. elif is_mi and self.columns.is_unique and key in self.columns:
  3257. return self._getitem_multilevel(key)
  3258. # Do we have a slicer (on rows)?
  3259. if isinstance(key, slice):
  3260. indexer = self.index._convert_slice_indexer(key, kind="getitem")
  3261. if isinstance(indexer, np.ndarray):
  3262. # reachable with DatetimeIndex
  3263. indexer = lib.maybe_indices_to_slice(
  3264. indexer.astype(np.intp, copy=False), len(self)
  3265. )
  3266. if isinstance(indexer, np.ndarray):
  3267. # GH#43223 If we can not convert, use take
  3268. return self.take(indexer, axis=0)
  3269. return self._slice(indexer, axis=0)
  3270. # Do we have a (boolean) DataFrame?
  3271. if isinstance(key, DataFrame):
  3272. return self.where(key)
  3273. # Do we have a (boolean) 1d indexer?
  3274. if com.is_bool_indexer(key):
  3275. return self._getitem_bool_array(key)
  3276. # We are left with two options: a single key, and a collection of keys,
  3277. # We interpret tuples as collections only for non-MultiIndex
  3278. is_single_key = isinstance(key, tuple) or not is_list_like(key)
  3279. if is_single_key:
  3280. if self.columns.nlevels > 1:
  3281. return self._getitem_multilevel(key)
  3282. indexer = self.columns.get_loc(key)
  3283. if is_integer(indexer):
  3284. indexer = [indexer]
  3285. else:
  3286. if is_iterator(key):
  3287. key = list(key)
  3288. indexer = self.columns._get_indexer_strict(key, "columns")[1]
  3289. # take() does not accept boolean indexers
  3290. if getattr(indexer, "dtype", None) == bool:
  3291. indexer = np.where(indexer)[0]
  3292. data = self._take_with_is_copy(indexer, axis=1)
  3293. if is_single_key:
  3294. # What does looking for a single key in a non-unique index return?
  3295. # The behavior is inconsistent. It returns a Series, except when
  3296. # - the key itself is repeated (test on data.shape, #9519), or
  3297. # - we have a MultiIndex on columns (test on self.columns, #21309)
  3298. if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
  3299. # GH#26490 using data[key] can cause RecursionError
  3300. return data._get_item_cache(key)
  3301. return data
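
    # Illustrative sketch (comment only, not executed): per the comment above,
    # a single key with duplicated columns yields a DataFrame rather than a
    # Series, e.g. for a hypothetical frame:
    #
    #   df = pd.DataFrame([[1, 2]], columns=["a", "a"])
    #   df["a"]  # 2-column DataFrame, not a Series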

    def _getitem_bool_array(self, key):
        # also raises Exception if object array with NA values
        # warning here just in case -- previously __setitem__ was
        # reindexing but __getitem__ was not; it seems more reasonable to
        # go with the __setitem__ behavior since that is more consistent
        # with all other indexing behavior
        if isinstance(key, Series) and not key.index.equals(self.index):
            warnings.warn(
                "Boolean Series key will be reindexed to match DataFrame index.",
                UserWarning,
                stacklevel=find_stack_level(),
            )
        elif len(key) != len(self.index):
            raise ValueError(
                f"Item wrong length {len(key)} instead of {len(self.index)}."
            )

        # check_bool_indexer will throw exception if Series key cannot
        # be reindexed to match DataFrame rows
        key = check_bool_indexer(self.index, key)

        if key.all():
            return self.copy(deep=None)

        indexer = key.nonzero()[0]
        return self._take_with_is_copy(indexer, axis=0)

    def _getitem_multilevel(self, key):
        # self.columns is a MultiIndex
        loc = self.columns.get_loc(key)
        if isinstance(loc, (slice, np.ndarray)):
            new_columns = self.columns[loc]
            result_columns = maybe_droplevels(new_columns, key)
            result = self.iloc[:, loc]
            result.columns = result_columns

            # If there is only one column being returned, and its name is
            # either an empty string, or a tuple with an empty string as its
            # first element, then treat the empty string as a placeholder
            # and return the column as if the user had provided that empty
            # string in the key. If the result is a Series, exclude the
            # implied empty string from its name.
            if len(result.columns) == 1:
                # e.g. test_frame_getitem_multicolumn_empty_level,
                #  test_frame_mixed_depth_get, test_loc_setitem_single_column_slice
                top = result.columns[0]
                if isinstance(top, tuple):
                    top = top[0]
                if top == "":
                    result = result[""]
                    if isinstance(result, Series):
                        result = self._constructor_sliced(
                            result, index=self.index, name=key
                        )

            result._set_is_copy(self)
            return result
        else:
            # loc is neither a slice nor ndarray, so must be an int
            return self._ixs(loc, axis=1)

    def _get_value(self, index, col, takeable: bool = False) -> Scalar:
        """
        Quickly retrieve single value at passed column and index.

        Parameters
        ----------
        index : row label
        col : column label
        takeable : interpret the index/col as indexers, default False

        Returns
        -------
        scalar

        Notes
        -----
        Assumes that both `self.index._index_as_unique` and
        `self.columns._index_as_unique` hold; the caller is responsible for
        checking.
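
        Examples
        --------
        Sketch only (hypothetical frame); with ``takeable=True`` both
        arguments are positions rather than labels:

        >>> df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])
        >>> df._get_value("y", "a")
        2
        >>> df._get_value(1, 0, takeable=True)
        2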
  3371. """
  3372. if takeable:
  3373. series = self._ixs(col, axis=1)
  3374. return series._values[index]
  3375. series = self._get_item_cache(col)
  3376. engine = self.index._engine
  3377. if not isinstance(self.index, MultiIndex):
  3378. # CategoricalIndex: Trying to use the engine fastpath may give incorrect
  3379. # results if our categories are integers that dont match our codes
  3380. # IntervalIndex: IntervalTree has no get_loc
  3381. row = self.index.get_loc(index)
  3382. return series._values[row]
  3383. # For MultiIndex going through engine effectively restricts us to
  3384. # same-length tuples; see test_get_set_value_no_partial_indexing
  3385. loc = engine.get_loc(index)
  3386. return series._values[loc]
  3387. def isetitem(self, loc, value) -> None:
  3388. """
  3389. Set the given value in the column with position `loc`.
  3390. This is a positional analogue to ``__setitem__``.
  3391. Parameters
  3392. ----------
  3393. loc : int or sequence of ints
  3394. Index position for the column.
  3395. value : scalar or arraylike
  3396. Value(s) for the column.
  3397. Notes
  3398. -----
  3399. ``frame.isetitem(loc, value)`` is an in-place method as it will
  3400. modify the DataFrame in place (not returning a new object). In contrast to
  3401. ``frame.iloc[:, i] = value`` which will try to update the existing values in
  3402. place, ``frame.isetitem(loc, value)`` will not update the values of the column
  3403. itself in place, it will instead insert a new array.
  3404. In cases where ``frame.columns`` is unique, this is equivalent to
  3405. ``frame[frame.columns[i]] = value``.
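
        Examples
        --------
        A minimal sketch (hypothetical frame); the call inserts a new array
        at position 0 rather than writing into the existing one:

        >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
        >>> df.isetitem(0, [10, 20])
        >>> df
            A  B
        0  10  3
        1  20  4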
  3406. """
  3407. if isinstance(value, DataFrame):
  3408. if is_scalar(loc):
  3409. loc = [loc]
  3410. for i, idx in enumerate(loc):
  3411. arraylike = self._sanitize_column(value.iloc[:, i])
  3412. self._iset_item_mgr(idx, arraylike, inplace=False)
  3413. return
  3414. arraylike = self._sanitize_column(value)
  3415. self._iset_item_mgr(loc, arraylike, inplace=False)
  3416. def __setitem__(self, key, value):
  3417. if not PYPY and using_copy_on_write():
  3418. if sys.getrefcount(self) <= 3:
  3419. warnings.warn(
  3420. _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
  3421. )
  3422. key = com.apply_if_callable(key, self)
  3423. # see if we can slice the rows
  3424. if isinstance(key, slice):
  3425. slc = self.index._convert_slice_indexer(key, kind="getitem")
  3426. return self._setitem_slice(slc, value)
  3427. if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
  3428. self._setitem_frame(key, value)
  3429. elif isinstance(key, (Series, np.ndarray, list, Index)):
  3430. self._setitem_array(key, value)
  3431. elif isinstance(value, DataFrame):
  3432. self._set_item_frame_value(key, value)
  3433. elif (
  3434. is_list_like(value)
  3435. and not self.columns.is_unique
  3436. and 1 < len(self.columns.get_indexer_for([key])) == len(value)
  3437. ):
  3438. # Column to set is duplicated
  3439. self._setitem_array([key], value)
  3440. else:
  3441. # set column
  3442. self._set_item(key, value)
  3443. def _setitem_slice(self, key: slice, value) -> None:
  3444. # NB: we can't just use self.loc[key] = value because that
  3445. # operates on labels and we need to operate positional for
  3446. # backwards-compat, xref GH#31469
  3447. self._check_setitem_copy()
  3448. self.iloc[key] = value
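
    # Illustrative sketch (comment only, not executed): because row-slice
    # assignment is positional for backwards compatibility (GH#31469), for a
    # hypothetical frame with a string index,
    #
    #   df = pd.DataFrame({"a": [1, 2, 3]}, index=["x", "y", "z"])
    #   df[0:2] = 0  # writes to positions 0 and 1 (rows "x" and "y")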

    def _setitem_array(self, key, value):
        # also raises Exception if object array with NA values
        if com.is_bool_indexer(key):
            # bool indexer is indexing along rows
            if len(key) != len(self.index):
                raise ValueError(
                    f"Item wrong length {len(key)} instead of {len(self.index)}!"
                )
            key = check_bool_indexer(self.index, key)
            indexer = key.nonzero()[0]
            self._check_setitem_copy()
            if isinstance(value, DataFrame):
                # GH#39931 reindex since iloc does not align
                value = value.reindex(self.index.take(indexer))
            self.iloc[indexer] = value

        else:
            # Note: unlike self.iloc[:, indexer] = value, this will
            #  never try to overwrite values inplace

            if isinstance(value, DataFrame):
                check_key_length(self.columns, key, value)
                for k1, k2 in zip(key, value.columns):
                    self[k1] = value[k2]

            elif not is_list_like(value):
                for col in key:
                    self[col] = value

            elif isinstance(value, np.ndarray) and value.ndim == 2:
                self._iset_not_inplace(key, value)

            elif np.ndim(value) > 1:
                # list of lists
                value = DataFrame(value).values
                return self._setitem_array(key, value)

            else:
                self._iset_not_inplace(key, value)
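
    # Illustrative sketch (comment only, not executed): a boolean key assigns
    # along rows, while a list of labels assigns column by column, matching
    # the two branches above.
    #
    #   df[df["a"] > 0] = 0      # boolean indexer -> row-wise assignment
    #   df[["a", "b"]] = other   # list of labels  -> column-wise assignment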

    def _iset_not_inplace(self, key, value):
        # GH#39510 when setting with df[key] = obj with a list-like key and
        #  list-like value, we iterate over those listlikes and set columns
        #  one at a time.  This is different from dispatching to
        #  `self.loc[:, key] = value` because loc.__setitem__ may overwrite
        #  data inplace, whereas this will insert new arrays.

        def igetitem(obj, i: int):
            # Note: we catch DataFrame obj before getting here, but
            #  hypothetically would return obj.iloc[:, i]
            if isinstance(obj, np.ndarray):
                return obj[..., i]
            else:
                return obj[i]

        if self.columns.is_unique:
            if np.shape(value)[-1] != len(key):
                raise ValueError("Columns must be same length as key")

            for i, col in enumerate(key):
                self[col] = igetitem(value, i)

        else:
            ilocs = self.columns.get_indexer_non_unique(key)[0]
            if (ilocs < 0).any():
                # key entries not in self.columns
                raise NotImplementedError

            if np.shape(value)[-1] != len(ilocs):
                raise ValueError("Columns must be same length as key")

            assert np.ndim(value) <= 2

            orig_columns = self.columns

            # Using self.iloc[:, i] = ... may set values inplace, which
            #  by convention we do not do in __setitem__
            try:
                self.columns = Index(range(len(self.columns)))
                for i, iloc in enumerate(ilocs):
                    self[iloc] = igetitem(value, i)
            finally:
                self.columns = orig_columns

    def _setitem_frame(self, key, value):
        # support boolean setting with DataFrame input, e.g.
        # df[df > df2] = 0
        if isinstance(key, np.ndarray):
            if key.shape != self.shape:
                raise ValueError("Array conditional must be same shape as self")
            key = self._constructor(key, **self._construct_axes_dict(), copy=False)

        if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes):
            raise TypeError(
                "Must pass DataFrame or 2-d ndarray with boolean values only"
            )

        self._check_inplace_setting(value)
        self._check_setitem_copy()
        self._where(-key, value, inplace=True)

    def _set_item_frame_value(self, key, value: DataFrame) -> None:
        self._ensure_valid_index(value)

        # align columns
        if key in self.columns:
            loc = self.columns.get_loc(key)
            cols = self.columns[loc]
            len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols)
            if len_cols != len(value.columns):
                raise ValueError("Columns must be same length as key")

            # align right-hand-side columns if self.columns
            # is multi-index and self[key] is a sub-frame
            if isinstance(self.columns, MultiIndex) and isinstance(
                loc, (slice, Series, np.ndarray, Index)
            ):
                cols_droplevel = maybe_droplevels(cols, key)
                if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
                    value = value.reindex(cols_droplevel, axis=1)

                for col, col_droplevel in zip(cols, cols_droplevel):
                    self[col] = value[col_droplevel]
                return

            if is_scalar(cols):
                self[cols] = value[value.columns[0]]
                return

            # now align rows
            arraylike = _reindex_for_setitem(value, self.index)
            self._set_item_mgr(key, arraylike)
            return

        if len(value.columns) != 1:
            raise ValueError(
                "Cannot set a DataFrame with multiple columns to the single "
                f"column {key}"
            )

        self[key] = value[value.columns[0]]

    def _iset_item_mgr(
        self, loc: int | slice | np.ndarray, value, inplace: bool = False
    ) -> None:
        # when called from _set_item_mgr loc can be anything returned from get_loc
        self._mgr.iset(loc, value, inplace=inplace)
        self._clear_item_cache()

    def _set_item_mgr(self, key, value: ArrayLike) -> None:
        try:
            loc = self._info_axis.get_loc(key)
        except KeyError:
            # This item wasn't present, just insert at end
            self._mgr.insert(len(self._info_axis), key, value)
        else:
            self._iset_item_mgr(loc, value)

        # check if we are modifying a copy
        # try to set first as we want an invalid
        # value exception to occur first
        if len(self):
            self._check_setitem_copy()

    def _iset_item(self, loc: int, value) -> None:
        arraylike = self._sanitize_column(value)
        self._iset_item_mgr(loc, arraylike, inplace=True)

        # check if we are modifying a copy
        # try to set first as we want an invalid
        # value exception to occur first
        if len(self):
            self._check_setitem_copy()

    def _set_item(self, key, value) -> None:
        """
        Add series to DataFrame in specified column.

        If series is a numpy-array (not a Series/TimeSeries), it must be the
        same length as the DataFrame's index or an error will be thrown.

        Series/TimeSeries will be conformed to the DataFrame's index to
        ensure homogeneity.
        """
        value = self._sanitize_column(value)

        if (
            key in self.columns
            and value.ndim == 1
            and not is_extension_array_dtype(value)
        ):
            # broadcast across multiple columns if necessary
            if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
                existing_piece = self[key]
                if isinstance(existing_piece, DataFrame):
                    value = np.tile(value, (len(existing_piece.columns), 1)).T

        self._set_item_mgr(key, value)

    def _set_value(
        self, index: IndexLabel, col, value: Scalar, takeable: bool = False
    ) -> None:
        """
        Put single value at passed column and index.

        Parameters
        ----------
        index : Label
            row label
        col : Label
            column label
        value : scalar
        takeable : bool, default False
            Sets whether or not index/col interpreted as indexers
        """
        try:
            if takeable:
                icol = col
                iindex = cast(int, index)
            else:
                icol = self.columns.get_loc(col)
                iindex = self.index.get_loc(index)
            self._mgr.column_setitem(icol, iindex, value, inplace_only=True)
            self._clear_item_cache()

        except (KeyError, TypeError, ValueError, LossySetitemError):
            # get_loc might raise a KeyError for missing labels (falling back
            #  to (i)loc will do expansion of the index)
            # column_setitem will do validation that may raise TypeError,
            #  ValueError, or LossySetitemError
            # set using a non-recursive method & reset the cache
            if takeable:
                self.iloc[index, col] = value
            else:
                self.loc[index, col] = value
            self._item_cache.pop(col, None)

        except InvalidIndexError as ii_err:
            # GH48729: Seems like you are trying to assign a value to a
            #  row when only scalar options are permitted
            raise InvalidIndexError(
                f"You can only assign a scalar value not a {type(value)}"
            ) from ii_err

    def _ensure_valid_index(self, value) -> None:
        """
        Ensure that if we don't have an index, that we can create one from the
        passed value.
        """
        # GH5632, make sure that we are a Series convertible
        if not len(self.index) and is_list_like(value) and len(value):
            if not isinstance(value, DataFrame):
                try:
                    value = Series(value)
                except (ValueError, NotImplementedError, TypeError) as err:
                    raise ValueError(
                        "Cannot set a frame with no defined index "
                        "and a value that cannot be converted to a Series"
                    ) from err

            # GH31368 preserve name of index
            index_copy = value.index.copy()
            if self.index.name is not None:
                index_copy.name = self.index.name

            self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)

    def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
        """
        Provide boxed values for a column.
        """
        # Lookup in columns so that if e.g. a str datetime was passed
        #  we attach the Timestamp object as the name.
        name = self.columns[loc]
        klass = self._constructor_sliced
        # We get index=self.index bc values is a SingleDataManager
        return klass(values, name=name, fastpath=True).__finalize__(self)

    # ----------------------------------------------------------------------
    # Lookup Caching

    def _clear_item_cache(self) -> None:
        self._item_cache.clear()

    def _get_item_cache(self, item: Hashable) -> Series:
        """Return the cached item, item represents a label indexer."""
        if using_copy_on_write():
            loc = self.columns.get_loc(item)
            return self._ixs(loc, axis=1)

        cache = self._item_cache
        res = cache.get(item)
        if res is None:
            # All places that call _get_item_cache have unique columns,
            #  pending resolution of GH#33047
            loc = self.columns.get_loc(item)
            res = self._ixs(loc, axis=1)

            cache[item] = res

            # for a chain
            res._is_copy = self._is_copy
        return res

    def _reset_cacher(self) -> None:
        # no-op for DataFrame
        pass

    def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
        """
        The object has called back to us saying maybe it has changed.
        """
        loc = self._info_axis.get_loc(item)
        arraylike = value._values

        old = self._ixs(loc, axis=1)
        if old._values is value._values and inplace:
            # GH#46149 avoid making unnecessary copies/block-splitting
            return

        self._mgr.iset(loc, arraylike, inplace=inplace)

    # ----------------------------------------------------------------------
    # Unsorted

    @overload
    def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame:
        ...

    @overload
    def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
        ...

    @overload
    def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None:
        ...

    def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None:
        """
        Query the columns of a DataFrame with a boolean expression.

        Parameters
        ----------
        expr : str
            The query string to evaluate.

            You can refer to variables
            in the environment by prefixing them with an '@' character like
            ``@a + b``.

            You can refer to column names that are not valid Python variable names
            by surrounding them in backticks. Thus, column names containing spaces
            or punctuation (besides underscores) or starting with digits must be
            surrounded by backticks. (For example, a column named "Area (cm^2)" would
            be referenced as ```Area (cm^2)```). Column names which are Python keywords
            (like "list", "for", "import", etc) cannot be used.

            For example, if one of your columns is called ``a a`` and you want
            to sum it with ``b``, your query should be ```a a` + b``.

        inplace : bool
            Whether to modify the DataFrame rather than creating a new one.
        **kwargs
            See the documentation for :func:`eval` for complete details
            on the keyword arguments accepted by :meth:`DataFrame.query`.

        Returns
        -------
        DataFrame or None
            DataFrame resulting from the provided query expression or
            None if ``inplace=True``.

        See Also
        --------
        eval : Evaluate a string describing operations on
            DataFrame columns.
        DataFrame.eval : Evaluate a string describing operations on
            DataFrame columns.

        Notes
        -----
        The result of the evaluation of this expression is first passed to
        :attr:`DataFrame.loc` and if that fails because of a
        multidimensional key (e.g., a DataFrame) then the result will be passed
        to :meth:`DataFrame.__getitem__`.

        This method uses the top-level :func:`eval` function to
        evaluate the passed query.

        The :meth:`~pandas.DataFrame.query` method uses a slightly
        modified Python syntax by default. For example, the ``&`` and ``|``
        (bitwise) operators have the precedence of their boolean cousins,
        :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
        however the semantics are different.

        You can change the semantics of the expression by passing the keyword
        argument ``parser='python'``. This enforces the same semantics as
        evaluation in Python space. Likewise, you can pass ``engine='python'``
        to evaluate an expression using Python itself as a backend. This is not
        recommended as it is inefficient compared to using ``numexpr`` as the
        engine.

        The :attr:`DataFrame.index` and
        :attr:`DataFrame.columns` attributes of the
        :class:`~pandas.DataFrame` instance are placed in the query namespace
        by default, which allows you to treat both the index and columns of the
        frame as a column in the frame.
        The identifier ``index`` is used for the frame index; you can also
        use the name of the index to identify it in a query. Please note that
        Python keywords may not be used as identifiers.

        For further details and examples see the ``query`` documentation in
        :ref:`indexing <indexing.query>`.

        *Backtick quoted variables*

        Backtick quoted variables are parsed as literal Python code and
        are converted internally to a Python valid identifier.
        This can lead to the following problems.

        During parsing a number of disallowed characters inside the backtick
        quoted string are replaced by strings that are allowed as a Python identifier.
        These characters include all operators in Python, the space character, the
        question mark, the exclamation mark, the dollar sign, and the euro sign.
        For other characters that fall outside the ASCII range (U+0001..U+007F)
        and those that are not further specified in PEP 3131,
        the query parser will raise an error.
        This excludes whitespace other than the space character,
        but also the hashtag (as it is used for comments) and the backtick
        itself (the backtick also cannot be escaped).

        In a special case, quotes that make a pair around a backtick can
        confuse the parser.
        For example, ```it's` > `that's``` will raise an error,
        as it forms a quoted string (``'s > `that'``) with a backtick inside.

        See also the Python documentation about lexical analysis
        (https://docs.python.org/3/reference/lexical_analysis.html)
        in combination with the source code in :mod:`pandas.core.computation.parsing`.

        Examples
        --------
        >>> df = pd.DataFrame({'A': range(1, 6),
        ...                    'B': range(10, 0, -2),
        ...                    'C C': range(10, 5, -1)})
        >>> df
           A   B  C C
        0  1  10   10
        1  2   8    9
        2  3   6    8
        3  4   4    7
        4  5   2    6
        >>> df.query('A > B')
           A  B  C C
        4  5  2    6

        The previous expression is equivalent to

        >>> df[df.A > df.B]
           A  B  C C
        4  5  2    6

        For columns with spaces in their name, you can use backtick quoting.

        >>> df.query('B == `C C`')
           A   B  C C
        0  1  10   10

        The previous expression is equivalent to

        >>> df[df.B == df['C C']]
           A   B  C C
        0  1  10   10
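
        As noted above, the default parser also accepts ``and``/``or`` with
        the precedence of ``&``/``|`` (illustrative, same frame as above):

        >>> df.query('A > B or B == 10')
           A   B  C C
        0  1  10   10
        4  5   2    6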
  3838. """
  3839. inplace = validate_bool_kwarg(inplace, "inplace")
  3840. if not isinstance(expr, str):
  3841. msg = f"expr must be a string to be evaluated, {type(expr)} given"
  3842. raise ValueError(msg)
  3843. kwargs["level"] = kwargs.pop("level", 0) + 1
  3844. kwargs["target"] = None
  3845. res = self.eval(expr, **kwargs)
  3846. try:
  3847. result = self.loc[res]
  3848. except ValueError:
  3849. # when res is multi-dimensional loc raises, but this is sometimes a
  3850. # valid query
  3851. result = self[res]
  3852. if inplace:
  3853. self._update_inplace(result)
  3854. return None
  3855. else:
  3856. return result
  3857. @overload
  3858. def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any:
  3859. ...
  3860. @overload
  3861. def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
  3862. ...
  3863. def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
  3864. """
  3865. Evaluate a string describing operations on DataFrame columns.
  3866. Operates on columns only, not specific rows or elements. This allows
  3867. `eval` to run arbitrary code, which can make you vulnerable to code
  3868. injection if you pass user input to this function.
  3869. Parameters
  3870. ----------
  3871. expr : str
  3872. The expression string to evaluate.
  3873. inplace : bool, default False
  3874. If the expression contains an assignment, whether to perform the
  3875. operation inplace and mutate the existing DataFrame. Otherwise,
  3876. a new DataFrame is returned.
  3877. **kwargs
  3878. See the documentation for :func:`eval` for complete details
  3879. on the keyword arguments accepted by
  3880. :meth:`~pandas.DataFrame.query`.
  3881. Returns
  3882. -------
  3883. ndarray, scalar, pandas object, or None
  3884. The result of the evaluation or None if ``inplace=True``.
  3885. See Also
  3886. --------
  3887. DataFrame.query : Evaluates a boolean expression to query the columns
  3888. of a frame.
  3889. DataFrame.assign : Can evaluate an expression or function to create new
  3890. values for a column.
  3891. eval : Evaluate a Python expression as a string using various
  3892. backends.
  3893. Notes
  3894. -----
  3895. For more details see the API documentation for :func:`~eval`.
  3896. For detailed examples see :ref:`enhancing performance with eval
  3897. <enhancingperf.eval>`.
  3898. Examples
  3899. --------
  3900. >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
  3901. >>> df
  3902. A B
  3903. 0 1 10
  3904. 1 2 8
  3905. 2 3 6
  3906. 3 4 4
  3907. 4 5 2
  3908. >>> df.eval('A + B')
  3909. 0 11
  3910. 1 10
  3911. 2 9
  3912. 3 8
  3913. 4 7
  3914. dtype: int64
  3915. Assignment is allowed though by default the original DataFrame is not
  3916. modified.
  3917. >>> df.eval('C = A + B')
  3918. A B C
  3919. 0 1 10 11
  3920. 1 2 8 10
  3921. 2 3 6 9
  3922. 3 4 4 8
  3923. 4 5 2 7
  3924. >>> df
  3925. A B
  3926. 0 1 10
  3927. 1 2 8
  3928. 2 3 6
  3929. 3 4 4
  3930. 4 5 2
  3931. Multiple columns can be assigned to using multi-line expressions:
  3932. >>> df.eval(
  3933. ... '''
  3934. ... C = A + B
  3935. ... D = A - B
  3936. ... '''
  3937. ... )
  3938. A B C D
  3939. 0 1 10 11 -9
  3940. 1 2 8 10 -6
  3941. 2 3 6 9 -3
  3942. 3 4 4 8 0
  3943. 4 5 2 7 3
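
        With ``inplace=True`` an assignment mutates ``df`` itself (a small
        addition following the ``inplace`` description above; same frame):

        >>> df.eval('C = A + B', inplace=True)
        >>> df
           A   B   C
        0  1  10  11
        1  2   8  10
        2  3   6   9
        3  4   4   8
        4  5   2   7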
  3944. """
  3945. from pandas.core.computation.eval import eval as _eval
  3946. inplace = validate_bool_kwarg(inplace, "inplace")
  3947. kwargs["level"] = kwargs.pop("level", 0) + 1
  3948. index_resolvers = self._get_index_resolvers()
  3949. column_resolvers = self._get_cleaned_column_resolvers()
  3950. resolvers = column_resolvers, index_resolvers
  3951. if "target" not in kwargs:
  3952. kwargs["target"] = self
  3953. kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
  3954. return _eval(expr, inplace=inplace, **kwargs)
  3955. def select_dtypes(self, include=None, exclude=None) -> DataFrame:
  3956. """
  3957. Return a subset of the DataFrame's columns based on the column dtypes.
  3958. Parameters
  3959. ----------
  3960. include, exclude : scalar or list-like
  3961. A selection of dtypes or strings to be included/excluded. At least
  3962. one of these parameters must be supplied.
  3963. Returns
  3964. -------
  3965. DataFrame
  3966. The subset of the frame including the dtypes in ``include`` and
  3967. excluding the dtypes in ``exclude``.
  3968. Raises
  3969. ------
  3970. ValueError
  3971. * If both of ``include`` and ``exclude`` are empty
  3972. * If ``include`` and ``exclude`` have overlapping elements
  3973. * If any kind of string dtype is passed in.
  3974. See Also
  3975. --------
  3976. DataFrame.dtypes: Return Series with the data type of each column.
  3977. Notes
  3978. -----
  3979. * To select all *numeric* types, use ``np.number`` or ``'number'``
  3980. * To select strings you must use the ``object`` dtype, but note that
  3981. this will return *all* object dtype columns
  3982. * See the `numpy dtype hierarchy
  3983. <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
  3984. * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
  3985. ``'datetime64'``
  3986. * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
  3987. ``'timedelta64'``
  3988. * To select Pandas categorical dtypes, use ``'category'``
  3989. * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
  3990. 0.20.0) or ``'datetime64[ns, tz]'``
  3991. Examples
  3992. --------
  3993. >>> df = pd.DataFrame({'a': [1, 2] * 3,
  3994. ... 'b': [True, False] * 3,
  3995. ... 'c': [1.0, 2.0] * 3})
  3996. >>> df
  3997. a b c
  3998. 0 1 True 1.0
  3999. 1 2 False 2.0
  4000. 2 1 True 1.0
  4001. 3 2 False 2.0
  4002. 4 1 True 1.0
  4003. 5 2 False 2.0
  4004. >>> df.select_dtypes(include='bool')
  4005. b
  4006. 0 True
  4007. 1 False
  4008. 2 True
  4009. 3 False
  4010. 4 True
  4011. 5 False
  4012. >>> df.select_dtypes(include=['float64'])
  4013. c
  4014. 0 1.0
  4015. 1 2.0
  4016. 2 1.0
  4017. 3 2.0
  4018. 4 1.0
  4019. 5 2.0
  4020. >>> df.select_dtypes(exclude=['int64'])
  4021. b c
  4022. 0 True 1.0
  4023. 1 False 2.0
  4024. 2 True 1.0
  4025. 3 False 2.0
  4026. 4 True 1.0
  4027. 5 False 2.0
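
        Per the notes above, ``'number'`` selects all numeric columns
        (illustrative; boolean columns are not treated as numeric here):

        >>> df.select_dtypes(include='number')
           a    c
        0  1  1.0
        1  2  2.0
        2  1  1.0
        3  2  2.0
        4  1  1.0
        5  2  2.0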
  4028. """
  4029. if not is_list_like(include):
  4030. include = (include,) if include is not None else ()
  4031. if not is_list_like(exclude):
  4032. exclude = (exclude,) if exclude is not None else ()
  4033. selection = (frozenset(include), frozenset(exclude))
  4034. if not any(selection):
  4035. raise ValueError("at least one of include or exclude must be nonempty")
  4036. # convert the myriad valid dtypes object to a single representation
  4037. def check_int_infer_dtype(dtypes):
  4038. converted_dtypes: list[type] = []
  4039. for dtype in dtypes:
  4040. # Numpy maps int to different types (int32, in64) on Windows and Linux
  4041. # see https://github.com/numpy/numpy/issues/9464
  4042. if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
  4043. converted_dtypes.append(np.int32)
  4044. converted_dtypes.append(np.int64)
  4045. elif dtype == "float" or dtype is float:
  4046. # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
  4047. converted_dtypes.extend([np.float64, np.float32])
  4048. else:
  4049. converted_dtypes.append(infer_dtype_from_object(dtype))
  4050. return frozenset(converted_dtypes)
  4051. include = check_int_infer_dtype(include)
  4052. exclude = check_int_infer_dtype(exclude)
  4053. for dtypes in (include, exclude):
  4054. invalidate_string_dtypes(dtypes)
  4055. # can't both include AND exclude!
  4056. if not include.isdisjoint(exclude):
  4057. raise ValueError(f"include and exclude overlap on {(include & exclude)}")
  4058. def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
  4059. # GH 46870: BooleanDtype._is_numeric == True but should be excluded
  4060. dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
  4061. return issubclass(dtype.type, tuple(dtypes_set)) or (
  4062. np.number in dtypes_set
  4063. and getattr(dtype, "_is_numeric", False)
  4064. and not is_bool_dtype(dtype)
  4065. )
  4066. def predicate(arr: ArrayLike) -> bool:
  4067. dtype = arr.dtype
  4068. if include:
  4069. if not dtype_predicate(dtype, include):
  4070. return False
  4071. if exclude:
  4072. if dtype_predicate(dtype, exclude):
  4073. return False
  4074. return True
  4075. mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
  4076. return type(self)(mgr).__finalize__(self)
  4077. def insert(
  4078. self,
  4079. loc: int,
  4080. column: Hashable,
  4081. value: Scalar | AnyArrayLike,
  4082. allow_duplicates: bool | lib.NoDefault = lib.no_default,
  4083. ) -> None:
  4084. """
  4085. Insert column into DataFrame at specified location.
  4086. Raises a ValueError if `column` is already contained in the DataFrame,
  4087. unless `allow_duplicates` is set to True.
  4088. Parameters
  4089. ----------
  4090. loc : int
  4091. Insertion index. Must verify 0 <= loc <= len(columns).
  4092. column : str, number, or hashable object
  4093. Label of the inserted column.
  4094. value : Scalar, Series, or array-like
  4095. allow_duplicates : bool, optional, default lib.no_default
  4096. See Also
  4097. --------
  4098. Index.insert : Insert new item by index.
  4099. Examples
  4100. --------
  4101. >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
  4102. >>> df
  4103. col1 col2
  4104. 0 1 3
  4105. 1 2 4
  4106. >>> df.insert(1, "newcol", [99, 99])
  4107. >>> df
  4108. col1 newcol col2
  4109. 0 1 99 3
  4110. 1 2 99 4
  4111. >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
  4112. >>> df
  4113. col1 col1 newcol col2
  4114. 0 100 1 99 3
  4115. 1 100 2 99 4
  4116. Notice that pandas uses index alignment in case of `value` from type `Series`:
  4117. >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
  4118. >>> df
  4119. col0 col1 col1 newcol col2
  4120. 0 NaN 100 1 99 3
  4121. 1 5.0 100 2 99 4
  4122. """
  4123. if allow_duplicates is lib.no_default:
  4124. allow_duplicates = False
  4125. if allow_duplicates and not self.flags.allows_duplicate_labels:
  4126. raise ValueError(
  4127. "Cannot specify 'allow_duplicates=True' when "
  4128. "'self.flags.allows_duplicate_labels' is False."
  4129. )
  4130. if not allow_duplicates and column in self.columns:
  4131. # Should this be a different kind of error??
  4132. raise ValueError(f"cannot insert {column}, already exists")
  4133. if not isinstance(loc, int):
  4134. raise TypeError("loc must be int")
  4135. value = self._sanitize_column(value)
  4136. self._mgr.insert(loc, column, value)

    def assign(self, **kwargs) -> DataFrame:
        r"""
        Assign new columns to a DataFrame.

        Returns a new object with all original columns in addition to new ones.
        Existing columns that are re-assigned will be overwritten.

        Parameters
        ----------
        **kwargs : dict of {str: callable or Series}
            The column names are keywords. If the values are
            callable, they are computed on the DataFrame and
            assigned to the new columns. The callable must not
            change input DataFrame (though pandas doesn't check it).
            If the values are not callable, (e.g. a Series, scalar, or array),
            they are simply assigned.

        Returns
        -------
        DataFrame
            A new DataFrame with the new columns in addition to
            all the existing columns.

        Notes
        -----
        Assigning multiple columns within the same ``assign`` is possible.
        Later items in '\*\*kwargs' may refer to newly created or modified
        columns in 'df'; items are computed and assigned into 'df' in order.

        Examples
        --------
        >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
        ...                   index=['Portland', 'Berkeley'])
        >>> df
                  temp_c
        Portland    17.0
        Berkeley    25.0

        Where the value is a callable, evaluated on `df`:

        >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
                  temp_c  temp_f
        Portland    17.0    62.6
        Berkeley    25.0    77.0

        Alternatively, the same behavior can be achieved by directly
        referencing an existing Series or sequence:

        >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
                  temp_c  temp_f
        Portland    17.0    62.6
        Berkeley    25.0    77.0

        You can create multiple columns within the same assign where one
        of the columns depends on another one defined within the same assign:

        >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
        ...           temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
                  temp_c  temp_f  temp_k
        Portland    17.0    62.6  290.15
        Berkeley    25.0    77.0  298.15
        """
        data = self.copy(deep=None)

        for k, v in kwargs.items():
            data[k] = com.apply_if_callable(v, data)
        return data

    def _sanitize_column(self, value) -> ArrayLike:
        """
        Ensures new columns (which go into the BlockManager as new blocks) are
        always copied and converted into an array.

        Parameters
        ----------
        value : scalar, Series, or array-like

        Returns
        -------
        numpy.ndarray or ExtensionArray
        """
        self._ensure_valid_index(value)

        # We can get there through isetitem with a DataFrame
        # or through loc single_block_path
        if isinstance(value, DataFrame):
            return _reindex_for_setitem(value, self.index)
        elif is_dict_like(value):
            return _reindex_for_setitem(Series(value), self.index)

        if is_list_like(value):
            com.require_length_match(value, self.index)
        return sanitize_array(value, self.index, copy=True, allow_2d=True)

    @property
    def _series(self):
        return {
            item: Series(
                self._mgr.iget(idx), index=self.index, name=item, fastpath=True
            )
            for idx, item in enumerate(self.columns)
        }

    # ----------------------------------------------------------------------
    # Reindexing and alignment

    def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
        frame = self

        columns = axes["columns"]
        if columns is not None:
            frame = frame._reindex_columns(
                columns, method, copy, level, fill_value, limit, tolerance
            )

        index = axes["index"]
        if index is not None:
            frame = frame._reindex_index(
                index, method, copy, level, fill_value, limit, tolerance
            )

        return frame

    def _reindex_index(
        self,
        new_index,
        method,
        copy: bool,
        level: Level,
        fill_value=np.nan,
        limit=None,
        tolerance=None,
    ):
        new_index, indexer = self.index.reindex(
            new_index, method=method, level=level, limit=limit, tolerance=tolerance
        )
        return self._reindex_with_indexers(
            {0: [new_index, indexer]},
            copy=copy,
            fill_value=fill_value,
            allow_dups=False,
        )

    def _reindex_columns(
        self,
        new_columns,
        method,
        copy: bool,
        level: Level,
        fill_value=None,
        limit=None,
        tolerance=None,
    ):
        new_columns, indexer = self.columns.reindex(
            new_columns, method=method, level=level, limit=limit, tolerance=tolerance
        )
        return self._reindex_with_indexers(
            {1: [new_columns, indexer]},
            copy=copy,
            fill_value=fill_value,
            allow_dups=False,
        )

    def _reindex_multi(
        self, axes: dict[str, Index], copy: bool, fill_value
    ) -> DataFrame:
        """
        We are guaranteed non-Nones in the axes.
        """
        new_index, row_indexer = self.index.reindex(axes["index"])
        new_columns, col_indexer = self.columns.reindex(axes["columns"])

        if row_indexer is not None and col_indexer is not None:
            # Fastpath. By doing two 'take's at once we avoid making an
            #  unnecessary copy.
            # We only get here with `not self._is_mixed_type`, which (almost)
            #  ensures that self.values is cheap. It may be worth making this
            #  condition more specific.
            indexer = row_indexer, col_indexer
            new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
            return self._constructor(
                new_values, index=new_index, columns=new_columns, copy=False
            )
        else:
            return self._reindex_with_indexers(
                {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
                copy=copy,
                fill_value=fill_value,
            )

    @doc(NDFrame.align, **_shared_doc_kwargs)
    def align(
        self,
        other: DataFrame,
        join: AlignJoin = "outer",
        axis: Axis | None = None,
        level: Level = None,
        copy: bool | None = None,
        fill_value=None,
        method: FillnaOptions | None = None,
        limit: int | None = None,
        fill_axis: Axis = 0,
        broadcast_axis: Axis | None = None,
    ) -> DataFrame:
        return super().align(
            other,
            join=join,
            axis=axis,
            level=level,
            copy=copy,
            fill_value=fill_value,
            method=method,
            limit=limit,
            fill_axis=fill_axis,
            broadcast_axis=broadcast_axis,
        )

    @Appender(
        """
        Examples
        --------
        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})

        Change the row labels.

        >>> df.set_axis(['a', 'b', 'c'], axis='index')
           A  B
        a  1  4
        b  2  5
        c  3  6

        Change the column labels.

        >>> df.set_axis(['I', 'II'], axis='columns')
           I  II
        0  1   4
        1  2   5
        2  3   6
        """
    )
    @Substitution(
        **_shared_doc_kwargs,
        extended_summary_sub=" column or",
        axis_description_sub=", and 1 identifies the columns",
        see_also_sub=" or columns",
    )
    @Appender(NDFrame.set_axis.__doc__)
    def set_axis(
        self,
        labels,
        *,
        axis: Axis = 0,
        copy: bool | None = None,
    ) -> DataFrame:
        return super().set_axis(labels, axis=axis, copy=copy)

    @doc(
        NDFrame.reindex,
        klass=_shared_doc_kwargs["klass"],
        optional_reindex=_shared_doc_kwargs["optional_reindex"],
    )
    def reindex(  # type: ignore[override]
        self,
        labels=None,
        *,
        index=None,
        columns=None,
        axis: Axis | None = None,
        method: str | None = None,
        copy: bool | None = None,
        level: Level | None = None,
        fill_value: Scalar | None = np.nan,
        limit: int | None = None,
        tolerance=None,
    ) -> DataFrame:
        return super().reindex(
            labels=labels,
            index=index,
            columns=columns,
            axis=axis,
            method=method,
            copy=copy,
            level=level,
            fill_value=fill_value,
            limit=limit,
            tolerance=tolerance,
        )

    @overload
    def drop(
        self,
        labels: IndexLabel = ...,
        *,
        axis: Axis = ...,
        index: IndexLabel = ...,
        columns: IndexLabel = ...,
        level: Level = ...,
        inplace: Literal[True],
        errors: IgnoreRaise = ...,
    ) -> None:
        ...

    @overload
    def drop(
        self,
        labels: IndexLabel = ...,
        *,
        axis: Axis = ...,
        index: IndexLabel = ...,
        columns: IndexLabel = ...,
        level: Level = ...,
        inplace: Literal[False] = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame:
        ...

    @overload
    def drop(
        self,
        labels: IndexLabel = ...,
        *,
        axis: Axis = ...,
        index: IndexLabel = ...,
        columns: IndexLabel = ...,
        level: Level = ...,
        inplace: bool = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame | None:
        ...

    def drop(
        self,
        labels: IndexLabel = None,
        *,
        axis: Axis = 0,
        index: IndexLabel = None,
        columns: IndexLabel = None,
        level: Level = None,
        inplace: bool = False,
        errors: IgnoreRaise = "raise",
    ) -> DataFrame | None:
        """
        Drop specified labels from rows or columns.

        Remove rows or columns by specifying label names and corresponding
        axis, or by specifying directly index or column names. When using a
        multi-index, labels on different levels can be removed by specifying
        the level. See the :ref:`user guide <advanced.shown_levels>`
        for more information about the now unused levels.

        Parameters
        ----------
        labels : single label or list-like
            Index or column labels to drop. A tuple will be used as a single
            label and not treated as a list-like.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Whether to drop labels from the index (0 or 'index') or
            columns (1 or 'columns').
        index : single label or list-like
            Alternative to specifying axis (``labels, axis=0``
            is equivalent to ``index=labels``).
        columns : single label or list-like
            Alternative to specifying axis (``labels, axis=1``
            is equivalent to ``columns=labels``).
        level : int or level name, optional
            For MultiIndex, level from which the labels will be removed.
        inplace : bool, default False
            If False, return a copy. Otherwise, do operation
            inplace and return None.
        errors : {'ignore', 'raise'}, default 'raise'
            If 'ignore', suppress error and only existing labels are
            dropped.

        Returns
        -------
        DataFrame or None
            DataFrame without the removed index or column labels or
            None if ``inplace=True``.

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis.

        See Also
        --------
        DataFrame.loc : Label-location based indexer for selection by label.
        DataFrame.dropna : Return DataFrame with labels on given axis omitted
            where (all or any) data are missing.
        DataFrame.drop_duplicates : Return DataFrame with duplicate rows
            removed, optionally only considering certain columns.
        Series.drop : Return Series with specified index labels removed.

        Examples
        --------
        >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
        ...                   columns=['A', 'B', 'C', 'D'])
        >>> df
           A  B   C   D
        0  0  1   2   3
        1  4  5   6   7
        2  8  9  10  11

        Drop columns

        >>> df.drop(['B', 'C'], axis=1)
           A   D
        0  0   3
        1  4   7
        2  8  11

        >>> df.drop(columns=['B', 'C'])
           A   D
        0  0   3
        1  4   7
        2  8  11

        Drop a row by index

        >>> df.drop([0, 1])
           A  B   C   D
        2  8  9  10  11

        Drop columns and/or rows of MultiIndex DataFrame

        >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
        ...                              ['speed', 'weight', 'length']],
        ...                      codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
        ...                             [0, 1, 2, 0, 1, 2, 0, 1, 2]])
        >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
        ...                   data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
        ...                         [250, 150], [1.5, 0.8], [320, 250],
        ...                         [1, 0.8], [0.3, 0.2]])
        >>> df
                        big     small
        lama    speed   45.0    30.0
                weight  200.0   100.0
                length  1.5     1.0
        cow     speed   30.0    20.0
                weight  250.0   150.0
                length  1.5     0.8
        falcon  speed   320.0   250.0
                weight  1.0     0.8
                length  0.3     0.2

        Drop a specific index combination from the MultiIndex
        DataFrame, i.e., drop the combination ``'falcon'`` and
        ``'weight'``, which deletes only the corresponding row

        >>> df.drop(index=('falcon', 'weight'))
                        big     small
        lama    speed   45.0    30.0
                weight  200.0   100.0
                length  1.5     1.0
        cow     speed   30.0    20.0
                weight  250.0   150.0
                length  1.5     0.8
        falcon  speed   320.0   250.0
                length  0.3     0.2

        >>> df.drop(index='cow', columns='small')
                        big
        lama    speed   45.0
                weight  200.0
                length  1.5
        falcon  speed   320.0
                weight  1.0
                length  0.3

        >>> df.drop(index='length', level=1)
                        big     small
        lama    speed   45.0    30.0
                weight  200.0   100.0
        cow     speed   30.0    20.0
                weight  250.0   150.0
        falcon  speed   320.0   250.0
                weight  1.0     0.8
        """
        return super().drop(
            labels=labels,
            axis=axis,
            index=index,
            columns=columns,
            level=level,
            inplace=inplace,
            errors=errors,
        )

    @overload
    def rename(
        self,
        mapper: Renamer | None = ...,
        *,
        index: Renamer | None = ...,
        columns: Renamer | None = ...,
        axis: Axis | None = ...,
        copy: bool | None = ...,
        inplace: Literal[True],
        level: Level = ...,
        errors: IgnoreRaise = ...,
    ) -> None:
        ...

    @overload
    def rename(
        self,
        mapper: Renamer | None = ...,
        *,
        index: Renamer | None = ...,
        columns: Renamer | None = ...,
        axis: Axis | None = ...,
        copy: bool | None = ...,
        inplace: Literal[False] = ...,
        level: Level = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame:
        ...

    @overload
    def rename(
        self,
        mapper: Renamer | None = ...,
        *,
        index: Renamer | None = ...,
        columns: Renamer | None = ...,
        axis: Axis | None = ...,
        copy: bool | None = ...,
        inplace: bool = ...,
        level: Level = ...,
        errors: IgnoreRaise = ...,
    ) -> DataFrame | None:
        ...

    def rename(
        self,
        mapper: Renamer | None = None,
        *,
        index: Renamer | None = None,
        columns: Renamer | None = None,
        axis: Axis | None = None,
        copy: bool | None = None,
        inplace: bool = False,
        level: Level = None,
        errors: IgnoreRaise = "ignore",
    ) -> DataFrame | None:
        """
        Rename columns or index labels.

        Function / dict values must be unique (1-to-1). Labels not contained in
        a dict / Series will be left as-is. Extra labels listed don't throw an
        error.

        See the :ref:`user guide <basics.rename>` for more.

        Parameters
        ----------
        mapper : dict-like or function
            Dict-like or function transformations to apply to
            that axis' values. Use either ``mapper`` and ``axis`` to
            specify the axis to target with ``mapper``, or ``index`` and
            ``columns``.
        index : dict-like or function
            Alternative to specifying axis (``mapper, axis=0``
            is equivalent to ``index=mapper``).
        columns : dict-like or function
            Alternative to specifying axis (``mapper, axis=1``
            is equivalent to ``columns=mapper``).
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Axis to target with ``mapper``. Can be either the axis name
            ('index', 'columns') or number (0, 1). The default is 'index'.
        copy : bool, default True
            Also copy underlying data.
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
            If True then value of copy is ignored.
        level : int or level name, default None
            In case of a MultiIndex, only rename labels in the specified
            level.
        errors : {'ignore', 'raise'}, default 'ignore'
            If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
            or `columns` contains labels that are not present in the Index
            being transformed.
            If 'ignore', existing keys will be renamed and extra keys will be
            ignored.

        Returns
        -------
        DataFrame or None
            DataFrame with the renamed axis labels or None if ``inplace=True``.

        Raises
        ------
        KeyError
            If any of the labels is not found in the selected axis and
            "errors='raise'".

        See Also
        --------
        DataFrame.rename_axis : Set the name of the axis.

        Examples
        --------
        ``DataFrame.rename`` supports two calling conventions

        * ``(index=index_mapper, columns=columns_mapper, ...)``
        * ``(mapper, axis={'index', 'columns'}, ...)``

        We *highly* recommend using keyword arguments to clarify your
        intent.

        Rename columns using a mapping:

        >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
        >>> df.rename(columns={"A": "a", "B": "c"})
           a  c
        0  1  4
        1  2  5
        2  3  6

        Rename index using a mapping:

        >>> df.rename(index={0: "x", 1: "y", 2: "z"})
           A  B
        x  1  4
        y  2  5
        z  3  6

        Cast index labels to a different type:

        >>> df.index
        RangeIndex(start=0, stop=3, step=1)
        >>> df.rename(index=str).index
        Index(['0', '1', '2'], dtype='object')

        >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
        Traceback (most recent call last):
        KeyError: ['C'] not found in axis

        Using axis-style parameters:

        >>> df.rename(str.lower, axis='columns')
           a  b
        0  1  4
        1  2  5
        2  3  6

        >>> df.rename({1: 2, 2: 4}, axis='index')
           A  B
        0  1  4
        2  2  5
        4  3  6
        """
        return super()._rename(
            mapper=mapper,
            index=index,
            columns=columns,
            axis=axis,
            copy=copy,
            inplace=inplace,
            level=level,
            errors=errors,
        )
    @overload
    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: Literal[False] = ...,
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> DataFrame:
        ...

    @overload
    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: Literal[True],
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> None:
        ...

    @overload
    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = ...,
        *,
        method: FillnaOptions | None = ...,
        axis: Axis | None = ...,
        inplace: bool = ...,
        limit: int | None = ...,
        downcast: dict | None = ...,
    ) -> DataFrame | None:
        ...

    @doc(NDFrame.fillna, **_shared_doc_kwargs)
    def fillna(
        self,
        value: Hashable | Mapping | Series | DataFrame = None,
        *,
        method: FillnaOptions | None = None,
        axis: Axis | None = None,
        inplace: bool = False,
        limit: int | None = None,
        downcast: dict | None = None,
    ) -> DataFrame | None:
        return super().fillna(
            value=value,
            method=method,
            axis=axis,
            inplace=inplace,
            limit=limit,
            downcast=downcast,
        )
    def pop(self, item: Hashable) -> Series:
        """
        Return item and drop from frame. Raise KeyError if not found.

        Parameters
        ----------
        item : label
            Label of column to be popped.

        Returns
        -------
        Series

        Examples
        --------
        >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
        ...                    ('parrot', 'bird', 24.0),
        ...                    ('lion', 'mammal', 80.5),
        ...                    ('monkey', 'mammal', np.nan)],
        ...                   columns=('name', 'class', 'max_speed'))
        >>> df
             name   class  max_speed
        0  falcon    bird      389.0
        1  parrot    bird       24.0
        2    lion  mammal       80.5
        3  monkey  mammal        NaN

        >>> df.pop('class')
        0      bird
        1      bird
        2    mammal
        3    mammal
        Name: class, dtype: object

        >>> df
             name  max_speed
        0  falcon      389.0
        1  parrot       24.0
        2    lion       80.5
        3  monkey        NaN
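
        Popping a label that is no longer present raises a ``KeyError``:

        >>> df.pop('class')
        Traceback (most recent call last):
            ...
        KeyError: 'class'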
  4811. """
  4812. return super().pop(item=item)
    @overload
    def replace(
        self,
        to_replace=...,
        value=...,
        *,
        inplace: Literal[False] = ...,
        limit: int | None = ...,
        regex: bool = ...,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
    ) -> DataFrame:
        ...

    @overload
    def replace(
        self,
        to_replace=...,
        value=...,
        *,
        inplace: Literal[True],
        limit: int | None = ...,
        regex: bool = ...,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
    ) -> None:
        ...

    @doc(NDFrame.replace, **_shared_doc_kwargs)
    def replace(
        self,
        to_replace=None,
        value=lib.no_default,
        *,
        inplace: bool = False,
        limit: int | None = None,
        regex: bool = False,
        method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
    ) -> DataFrame | None:
        return super().replace(
            to_replace=to_replace,
            value=value,
            inplace=inplace,
            limit=limit,
            regex=regex,
            method=method,
        )
    def _replace_columnwise(
        self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
    ):
        """
        Dispatch to Series.replace column-wise.

        Parameters
        ----------
        mapping : dict
            of the form {col: (target, value)}
        inplace : bool
        regex : bool or same types as `to_replace` in DataFrame.replace

        Returns
        -------
        DataFrame or None
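
        Examples
        --------
        A minimal doctest sketch of the column-wise dispatch (added for
        illustration; this is a private helper, not public API):

        >>> df = pd.DataFrame({"a": [1, 2], "b": [1, 2]})
        >>> df._replace_columnwise({"a": (1, 10)}, inplace=False, regex=False)
            a  b
        0  10  1
        1   2  2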
  4870. """
  4871. # Operate column-wise
  4872. res = self if inplace else self.copy(deep=None)
  4873. ax = self.columns
  4874. for i, ax_value in enumerate(ax):
  4875. if ax_value in mapping:
  4876. ser = self.iloc[:, i]
  4877. target, value = mapping[ax_value]
  4878. newobj = ser.replace(target, value, regex=regex)
  4879. res._iset_item(i, newobj)
  4880. if inplace:
  4881. return
  4882. return res.__finalize__(self)
    @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
    def shift(
        self,
        periods: int = 1,
        freq: Frequency | None = None,
        axis: Axis = 0,
        fill_value: Hashable = lib.no_default,
    ) -> DataFrame:
        axis = self._get_axis_number(axis)

        ncols = len(self.columns)
        if (
            axis == 1
            and periods != 0
            and freq is None
            and fill_value is lib.no_default
            and ncols > 0
        ):
            # We will infer fill_value to match the closest column

            # Use a column that we know is valid for our column's dtype GH#38434
            label = self.columns[0]

            if periods > 0:
                result = self.iloc[:, :-periods]
                for col in range(min(ncols, abs(periods))):
                    # TODO(EA2D): doing this in a loop unnecessary with 2D EAs
                    # Define filler inside loop so we get a copy
                    filler = self.iloc[:, 0].shift(len(self))
                    result.insert(0, label, filler, allow_duplicates=True)
            else:
                result = self.iloc[:, -periods:]
                for col in range(min(ncols, abs(periods))):
                    # Define filler inside loop so we get a copy
                    filler = self.iloc[:, -1].shift(len(self))
                    result.insert(
                        len(result.columns), label, filler, allow_duplicates=True
                    )

            result.columns = self.columns.copy()
            return result
        elif (
            axis == 1
            and periods != 0
            and fill_value is not lib.no_default
            and ncols > 0
        ):
            arrays = self._mgr.arrays
            if len(arrays) > 1 or (
                # If we only have one block and we know that we can't
                # keep the same dtype (i.e. the _can_hold_element check)
                # then we can go through the reindex_indexer path
                # (and avoid casting logic in the Block method).
                not can_hold_element(arrays[0], fill_value)
            ):
                # GH#35488 we need to watch out for multi-block cases
                # We only get here with fill_value not-lib.no_default
                nper = abs(periods)
                nper = min(nper, ncols)
                if periods > 0:
                    indexer = np.array(
                        [-1] * nper + list(range(ncols - periods)), dtype=np.intp
                    )
                else:
                    indexer = np.array(
                        list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
                    )
                mgr = self._mgr.reindex_indexer(
                    self.columns,
                    indexer,
                    axis=0,
                    fill_value=fill_value,
                    allow_dups=True,
                )
                res_df = self._constructor(mgr)
                return res_df.__finalize__(self, method="shift")

        return super().shift(
            periods=periods, freq=freq, axis=axis, fill_value=fill_value
        )
    @overload
    def set_index(
        self,
        keys,
        *,
        drop: bool = ...,
        append: bool = ...,
        inplace: Literal[False] = ...,
        verify_integrity: bool = ...,
    ) -> DataFrame:
        ...

    @overload
    def set_index(
        self,
        keys,
        *,
        drop: bool = ...,
        append: bool = ...,
        inplace: Literal[True],
        verify_integrity: bool = ...,
    ) -> None:
        ...

    def set_index(
        self,
        keys,
        *,
        drop: bool = True,
        append: bool = False,
        inplace: bool = False,
        verify_integrity: bool = False,
    ) -> DataFrame | None:
        """
        Set the DataFrame index using existing columns.

        Set the DataFrame index (row labels) using one or more existing
        columns or arrays (of the correct length). The index can replace the
        existing index or expand on it.

        Parameters
        ----------
        keys : label or array-like or list of labels/arrays
            This parameter can be either a single column key, a single array of
            the same length as the calling DataFrame, or a list containing an
            arbitrary combination of column keys and arrays. Here, "array"
            encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
            instances of :class:`~collections.abc.Iterator`.
        drop : bool, default True
            Delete columns to be used as the new index.
        append : bool, default False
            Whether to append columns to existing index.
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
        verify_integrity : bool, default False
            Check the new index for duplicates. Otherwise defer the check until
            necessary. Setting to False will improve the performance of this
            method.

        Returns
        -------
        DataFrame or None
            Changed row labels or None if ``inplace=True``.

        See Also
        --------
        DataFrame.reset_index : Opposite of set_index.
        DataFrame.reindex : Change to new indices or expand indices.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
        ...                    'year': [2012, 2014, 2013, 2014],
        ...                    'sale': [55, 40, 84, 31]})
        >>> df
           month  year  sale
        0      1  2012    55
        1      4  2014    40
        2      7  2013    84
        3     10  2014    31

        Set the index to become the 'month' column:

        >>> df.set_index('month')
               year  sale
        month
        1      2012    55
        4      2014    40
        7      2013    84
        10     2014    31

        Create a MultiIndex using columns 'year' and 'month':

        >>> df.set_index(['year', 'month'])
                    sale
        year month
        2012 1        55
        2014 4        40
        2013 7        84
        2014 10       31

        Create a MultiIndex using an Index and a column:

        >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
                 month  sale
           year
        1  2012      1    55
        2  2014      4    40
        3  2013      7    84
        4  2014     10    31

        Create a MultiIndex using two Series:

        >>> s = pd.Series([1, 2, 3, 4])
        >>> df.set_index([s, s**2])
              month  year  sale
        1 1       1  2012    55
        2 4       4  2014    40
        3 9       7  2013    84
        4 16     10  2014    31
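
        With ``verify_integrity=True``, duplicate keys in the new index are
        rejected up front; since 2014 appears twice in 'year', this raises
        (the message follows from the check in the implementation below):

        >>> df.set_index('year', verify_integrity=True)
        Traceback (most recent call last):
            ...
        ValueError: Index has duplicate keys: Index([2014], dtype='int64')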
  5064. """
  5065. inplace = validate_bool_kwarg(inplace, "inplace")
  5066. self._check_inplace_and_allows_duplicate_labels(inplace)
  5067. if not isinstance(keys, list):
  5068. keys = [keys]
  5069. err_msg = (
  5070. 'The parameter "keys" may be a column key, one-dimensional '
  5071. "array, or a list containing only valid column keys and "
  5072. "one-dimensional arrays."
  5073. )
  5074. missing: list[Hashable] = []
  5075. for col in keys:
  5076. if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
  5077. # arrays are fine as long as they are one-dimensional
  5078. # iterators get converted to list below
  5079. if getattr(col, "ndim", 1) != 1:
  5080. raise ValueError(err_msg)
  5081. else:
  5082. # everything else gets tried as a key; see GH 24969
  5083. try:
  5084. found = col in self.columns
  5085. except TypeError as err:
  5086. raise TypeError(
  5087. f"{err_msg}. Received column of type {type(col)}"
  5088. ) from err
  5089. else:
  5090. if not found:
  5091. missing.append(col)
  5092. if missing:
  5093. raise KeyError(f"None of {missing} are in the columns")
  5094. if inplace:
  5095. frame = self
  5096. else:
  5097. # GH 49473 Use "lazy copy" with Copy-on-Write
  5098. frame = self.copy(deep=None)
  5099. arrays = []
  5100. names: list[Hashable] = []
  5101. if append:
  5102. names = list(self.index.names)
  5103. if isinstance(self.index, MultiIndex):
  5104. for i in range(self.index.nlevels):
  5105. arrays.append(self.index._get_level_values(i))
  5106. else:
  5107. arrays.append(self.index)
  5108. to_remove: list[Hashable] = []
  5109. for col in keys:
  5110. if isinstance(col, MultiIndex):
  5111. for n in range(col.nlevels):
  5112. arrays.append(col._get_level_values(n))
  5113. names.extend(col.names)
  5114. elif isinstance(col, (Index, Series)):
  5115. # if Index then not MultiIndex (treated above)
  5116. # error: Argument 1 to "append" of "list" has incompatible type
  5117. # "Union[Index, Series]"; expected "Index"
  5118. arrays.append(col) # type:ignore[arg-type]
  5119. names.append(col.name)
  5120. elif isinstance(col, (list, np.ndarray)):
  5121. # error: Argument 1 to "append" of "list" has incompatible type
  5122. # "Union[List[Any], ndarray]"; expected "Index"
  5123. arrays.append(col) # type: ignore[arg-type]
  5124. names.append(None)
  5125. elif isinstance(col, abc.Iterator):
  5126. # error: Argument 1 to "append" of "list" has incompatible type
  5127. # "List[Any]"; expected "Index"
  5128. arrays.append(list(col)) # type: ignore[arg-type]
  5129. names.append(None)
  5130. # from here, col can only be a column label
  5131. else:
  5132. arrays.append(frame[col])
  5133. names.append(col)
  5134. if drop:
  5135. to_remove.append(col)
  5136. if len(arrays[-1]) != len(self):
  5137. # check newest element against length of calling frame, since
  5138. # ensure_index_from_sequences would not raise for append=False.
  5139. raise ValueError(
  5140. f"Length mismatch: Expected {len(self)} rows, "
  5141. f"received array of length {len(arrays[-1])}"
  5142. )
  5143. index = ensure_index_from_sequences(arrays, names)
  5144. if verify_integrity and not index.is_unique:
  5145. duplicates = index[index.duplicated()].unique()
  5146. raise ValueError(f"Index has duplicate keys: {duplicates}")
  5147. # use set to handle duplicate column names gracefully in case of drop
  5148. for c in set(to_remove):
  5149. del frame[c]
  5150. # clear up memory usage
  5151. index._cleanup()
  5152. frame.index = index
  5153. if not inplace:
  5154. return frame
  5155. return None
    @overload
    def reset_index(
        self,
        level: IndexLabel = ...,
        *,
        drop: bool = ...,
        inplace: Literal[False] = ...,
        col_level: Hashable = ...,
        col_fill: Hashable = ...,
        allow_duplicates: bool | lib.NoDefault = ...,
        names: Hashable | Sequence[Hashable] = None,
    ) -> DataFrame:
        ...

    @overload
    def reset_index(
        self,
        level: IndexLabel = ...,
        *,
        drop: bool = ...,
        inplace: Literal[True],
        col_level: Hashable = ...,
        col_fill: Hashable = ...,
        allow_duplicates: bool | lib.NoDefault = ...,
        names: Hashable | Sequence[Hashable] = None,
    ) -> None:
        ...

    @overload
    def reset_index(
        self,
        level: IndexLabel = ...,
        *,
        drop: bool = ...,
        inplace: bool = ...,
        col_level: Hashable = ...,
        col_fill: Hashable = ...,
        allow_duplicates: bool | lib.NoDefault = ...,
        names: Hashable | Sequence[Hashable] = None,
    ) -> DataFrame | None:
        ...

    def reset_index(
        self,
        level: IndexLabel = None,
        *,
        drop: bool = False,
        inplace: bool = False,
        col_level: Hashable = 0,
        col_fill: Hashable = "",
        allow_duplicates: bool | lib.NoDefault = lib.no_default,
        names: Hashable | Sequence[Hashable] = None,
    ) -> DataFrame | None:
        """
        Reset the index, or a level of it.

        Reset the index of the DataFrame, and use the default one instead.
        If the DataFrame has a MultiIndex, this method can remove one or more
        levels.

        Parameters
        ----------
        level : int, str, tuple, or list, default None
            Only remove the given levels from the index. Removes all levels by
            default.
        drop : bool, default False
            Do not try to insert index into dataframe columns. This resets
            the index to the default integer index.
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
        col_level : int or str, default 0
            If the columns have multiple levels, determines which level the
            labels are inserted into. By default it is inserted into the first
            level.
        col_fill : object, default ''
            If the columns have multiple levels, determines how the other
            levels are named. If None then the index name is repeated.
        allow_duplicates : bool, optional, default lib.no_default
            Allow duplicate column labels to be created.

            .. versionadded:: 1.5.0
        names : int, str or 1-dimensional list, default None
            Using the given string, rename the DataFrame column which contains the
            index data. If the DataFrame has a MultiIndex, this has to be a list or
            tuple with length equal to the number of levels.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame or None
            DataFrame with the new index or None if ``inplace=True``.

        See Also
        --------
        DataFrame.set_index : Opposite of reset_index.
        DataFrame.reindex : Change to new indices or expand indices.
        DataFrame.reindex_like : Change to same indices as other DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame([('bird', 389.0),
        ...                    ('bird', 24.0),
        ...                    ('mammal', 80.5),
        ...                    ('mammal', np.nan)],
        ...                   index=['falcon', 'parrot', 'lion', 'monkey'],
        ...                   columns=('class', 'max_speed'))
        >>> df
                 class  max_speed
        falcon    bird      389.0
        parrot    bird       24.0
        lion    mammal       80.5
        monkey  mammal        NaN

        When we reset the index, the old index is added as a column, and a
        new sequential index is used:

        >>> df.reset_index()
            index   class  max_speed
        0  falcon    bird      389.0
        1  parrot    bird       24.0
        2    lion  mammal       80.5
        3  monkey  mammal        NaN

        We can use the `drop` parameter to avoid the old index being added as
        a column:

        >>> df.reset_index(drop=True)
            class  max_speed
        0    bird      389.0
        1    bird       24.0
        2  mammal       80.5
        3  mammal        NaN

        You can also use `reset_index` with `MultiIndex`.

        >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
        ...                                    ('bird', 'parrot'),
        ...                                    ('mammal', 'lion'),
        ...                                    ('mammal', 'monkey')],
        ...                                   names=['class', 'name'])
        >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
        ...                                      ('species', 'type')])
        >>> df = pd.DataFrame([(389.0, 'fly'),
        ...                    (24.0, 'fly'),
        ...                    (80.5, 'run'),
        ...                    (np.nan, 'jump')],
        ...                   index=index,
        ...                   columns=columns)
        >>> df
                       speed species
                         max    type
        class  name
        bird   falcon  389.0     fly
               parrot   24.0     fly
        mammal lion     80.5     run
               monkey     NaN    jump

        Using the `names` parameter, choose a name for the index column:

        >>> df.reset_index(names=['classes', 'names'])
          classes   names  speed species
                             max    type
        0    bird  falcon  389.0     fly
        1    bird  parrot   24.0     fly
        2  mammal    lion   80.5     run
        3  mammal  monkey    NaN    jump

        If the index has multiple levels, we can reset a subset of them:

        >>> df.reset_index(level='class')
                 class  speed species
                          max    type
        name
        falcon    bird  389.0     fly
        parrot    bird   24.0     fly
        lion    mammal   80.5     run
        monkey  mammal    NaN    jump

        If we are not dropping the index, by default, it is placed in the top
        level. We can place it in another level:

        >>> df.reset_index(level='class', col_level=1)
                        speed species
                 class    max    type
        name
        falcon    bird  389.0     fly
        parrot    bird   24.0     fly
        lion    mammal   80.5     run
        monkey  mammal    NaN    jump

        When the index is inserted under another level, we can specify under
        which one with the parameter `col_fill`:

        >>> df.reset_index(level='class', col_level=1, col_fill='species')
                      species  speed species
                        class    max    type
        name
        falcon           bird  389.0     fly
        parrot           bird   24.0     fly
        lion           mammal   80.5     run
        monkey         mammal    NaN    jump

        If we specify a nonexistent level for `col_fill`, it is created:

        >>> df.reset_index(level='class', col_level=1, col_fill='genus')
                        genus  speed species
                        class    max    type
        name
        falcon           bird  389.0     fly
        parrot           bird   24.0     fly
        lion           mammal   80.5     run
        monkey         mammal    NaN    jump
        """
        inplace = validate_bool_kwarg(inplace, "inplace")
        self._check_inplace_and_allows_duplicate_labels(inplace)
        if inplace:
            new_obj = self
        else:
            new_obj = self.copy(deep=None)
        if allow_duplicates is not lib.no_default:
            allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")

        new_index = default_index(len(new_obj))
        if level is not None:
            if not isinstance(level, (tuple, list)):
                level = [level]
            level = [self.index._get_level_number(lev) for lev in level]
            if len(level) < self.index.nlevels:
                new_index = self.index.droplevel(level)

        if not drop:
            to_insert: Iterable[tuple[Any, Any | None]]

            default = "index" if "index" not in self else "level_0"
            names = self.index._get_default_index_names(names, default)

            if isinstance(self.index, MultiIndex):
                to_insert = zip(self.index.levels, self.index.codes)
            else:
                to_insert = ((self.index, None),)

            multi_col = isinstance(self.columns, MultiIndex)
            for i, (lev, lab) in reversed(list(enumerate(to_insert))):
                if level is not None and i not in level:
                    continue
                name = names[i]
                if multi_col:
                    col_name = list(name) if isinstance(name, tuple) else [name]
                    if col_fill is None:
                        if len(col_name) not in (1, self.columns.nlevels):
                            raise ValueError(
                                "col_fill=None is incompatible "
                                f"with incomplete column name {name}"
                            )
                        col_fill = col_name[0]

                    lev_num = self.columns._get_level_number(col_level)
                    name_lst = [col_fill] * lev_num + col_name
                    missing = self.columns.nlevels - len(name_lst)
                    name_lst += [col_fill] * missing
                    name = tuple(name_lst)

                # to ndarray and maybe infer different dtype
                level_values = lev._values
                if level_values.dtype == np.object_:
                    level_values = lib.maybe_convert_objects(level_values)

                if lab is not None:
                    # if we have the codes, extract the values with a mask
                    level_values = algorithms.take(
                        level_values, lab, allow_fill=True, fill_value=lev._na_value
                    )

                new_obj.insert(
                    0,
                    name,
                    level_values,
                    allow_duplicates=allow_duplicates,
                )

        new_obj.index = new_index
        if not inplace:
            return new_obj
        return None
    # ----------------------------------------------------------------------
    # Reindex-based selection methods

    @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
    def isna(self) -> DataFrame:
        result = self._constructor(self._mgr.isna(func=isna))
        return result.__finalize__(self, method="isna")

    @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
    def isnull(self) -> DataFrame:
        """
        DataFrame.isnull is an alias for DataFrame.isna.
        """
        return self.isna()

    @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
    def notna(self) -> DataFrame:
        return ~self.isna()

    @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
    def notnull(self) -> DataFrame:
        """
        DataFrame.notnull is an alias for DataFrame.notna.
        """
        return ~self.isna()
    @overload
    def dropna(
        self,
        *,
        axis: Axis = ...,
        how: AnyAll | NoDefault = ...,
        thresh: int | NoDefault = ...,
        subset: IndexLabel = ...,
        inplace: Literal[False] = ...,
        ignore_index: bool = ...,
    ) -> DataFrame:
        ...

    @overload
    def dropna(
        self,
        *,
        axis: Axis = ...,
        how: AnyAll | NoDefault = ...,
        thresh: int | NoDefault = ...,
        subset: IndexLabel = ...,
        inplace: Literal[True],
        ignore_index: bool = ...,
    ) -> None:
        ...

    def dropna(
        self,
        *,
        axis: Axis = 0,
        how: AnyAll | NoDefault = no_default,
        thresh: int | NoDefault = no_default,
        subset: IndexLabel = None,
        inplace: bool = False,
        ignore_index: bool = False,
    ) -> DataFrame | None:
        """
        Remove missing values.

        See the :ref:`User Guide <missing_data>` for more on which values are
        considered missing, and how to work with missing data.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Determine if rows or columns which contain missing values are
            removed.

            * 0, or 'index' : Drop rows which contain missing values.
            * 1, or 'columns' : Drop columns which contain missing values.

            Only a single axis is allowed.
        how : {'any', 'all'}, default 'any'
            Determine if row or column is removed from DataFrame, when we have
            at least one NA or all NA.

            * 'any' : If any NA values are present, drop that row or column.
            * 'all' : If all values are NA, drop that row or column.
        thresh : int, optional
            Require that many non-NA values. Cannot be combined with how.
        subset : column label or sequence of labels, optional
            Labels along other axis to consider, e.g. if you are dropping rows
            these would be a list of columns to include.
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
        ignore_index : bool, default ``False``
            If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.

            .. versionadded:: 2.0.0

        Returns
        -------
        DataFrame or None
            DataFrame with NA entries dropped from it or None if ``inplace=True``.

        See Also
        --------
        DataFrame.isna : Indicate missing values.
        DataFrame.notna : Indicate existing (non-missing) values.
        DataFrame.fillna : Replace missing values.
        Series.dropna : Drop missing values.
        Index.dropna : Drop missing indices.

        Examples
        --------
        >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
        ...                    "toy": [np.nan, 'Batmobile', 'Bullwhip'],
        ...                    "born": [pd.NaT, pd.Timestamp("1940-04-25"),
        ...                             pd.NaT]})
        >>> df
               name        toy       born
        0    Alfred        NaN        NaT
        1    Batman  Batmobile 1940-04-25
        2  Catwoman   Bullwhip        NaT

        Drop the rows where at least one element is missing.

        >>> df.dropna()
             name        toy       born
        1  Batman  Batmobile 1940-04-25

        Drop the columns where at least one element is missing.

        >>> df.dropna(axis='columns')
               name
        0    Alfred
        1    Batman
        2  Catwoman

        Drop the rows where all elements are missing.

        >>> df.dropna(how='all')
               name        toy       born
        0    Alfred        NaN        NaT
        1    Batman  Batmobile 1940-04-25
        2  Catwoman   Bullwhip        NaT

        Keep only the rows with at least 2 non-NA values.

        >>> df.dropna(thresh=2)
               name        toy       born
        1    Batman  Batmobile 1940-04-25
        2  Catwoman   Bullwhip        NaT

        Define in which columns to look for missing values.

        >>> df.dropna(subset=['name', 'toy'])
               name        toy       born
        1    Batman  Batmobile 1940-04-25
        2  Catwoman   Bullwhip        NaT
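
        ``how`` and ``thresh`` cannot be combined:

        >>> df.dropna(how='any', thresh=2)
        Traceback (most recent call last):
            ...
        TypeError: You cannot set both the how and thresh arguments at the same time.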
  5536. """
  5537. if (how is not no_default) and (thresh is not no_default):
  5538. raise TypeError(
  5539. "You cannot set both the how and thresh arguments at the same time."
  5540. )
  5541. if how is no_default:
  5542. how = "any"
  5543. inplace = validate_bool_kwarg(inplace, "inplace")
  5544. if isinstance(axis, (tuple, list)):
  5545. # GH20987
  5546. raise TypeError("supplying multiple axes to axis is no longer supported.")
  5547. axis = self._get_axis_number(axis)
  5548. agg_axis = 1 - axis
  5549. agg_obj = self
  5550. if subset is not None:
  5551. # subset needs to be list
  5552. if not is_list_like(subset):
  5553. subset = [subset]
  5554. ax = self._get_axis(agg_axis)
  5555. indices = ax.get_indexer_for(subset)
  5556. check = indices == -1
  5557. if check.any():
  5558. raise KeyError(np.array(subset)[check].tolist())
  5559. agg_obj = self.take(indices, axis=agg_axis)
  5560. if thresh is not no_default:
  5561. count = agg_obj.count(axis=agg_axis)
  5562. mask = count >= thresh
  5563. elif how == "any":
  5564. # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
  5565. mask = notna(agg_obj).all(axis=agg_axis, bool_only=False)
  5566. elif how == "all":
  5567. # faster equivalent to 'agg_obj.count(agg_axis) > 0'
  5568. mask = notna(agg_obj).any(axis=agg_axis, bool_only=False)
  5569. else:
  5570. raise ValueError(f"invalid how option: {how}")
  5571. if np.all(mask):
  5572. result = self.copy(deep=None)
  5573. else:
  5574. result = self.loc(axis=axis)[mask]
  5575. if ignore_index:
  5576. result.index = default_index(len(result))
  5577. if not inplace:
  5578. return result
  5579. self._update_inplace(result)
  5580. return None
    def drop_duplicates(
        self,
        subset: Hashable | Sequence[Hashable] | None = None,
        *,
        keep: DropKeep = "first",
        inplace: bool = False,
        ignore_index: bool = False,
    ) -> DataFrame | None:
        """
        Return DataFrame with duplicate rows removed.

        Considering certain columns is optional. Indexes, including time
        indexes, are ignored.

        Parameters
        ----------
        subset : column label or sequence of labels, optional
            Only consider certain columns for identifying duplicates, by
            default use all of the columns.
        keep : {'first', 'last', ``False``}, default 'first'
            Determines which duplicates (if any) to keep.

            - 'first' : Drop duplicates except for the first occurrence.
            - 'last' : Drop duplicates except for the last occurrence.
            - ``False`` : Drop all duplicates.
        inplace : bool, default ``False``
            Whether to modify the DataFrame rather than creating a new one.
        ignore_index : bool, default ``False``
            If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.

        Returns
        -------
        DataFrame or None
            DataFrame with duplicates removed or None if ``inplace=True``.

        See Also
        --------
        DataFrame.value_counts : Count unique combinations of columns.

        Examples
        --------
        Consider a dataset containing ramen ratings.

        >>> df = pd.DataFrame({
        ...     'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
        ...     'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
        ...     'rating': [4, 4, 3.5, 15, 5]
        ... })
        >>> df
             brand style  rating
        0  Yum Yum   cup     4.0
        1  Yum Yum   cup     4.0
        2  Indomie   cup     3.5
        3  Indomie  pack    15.0
        4  Indomie  pack     5.0

        By default, it removes duplicate rows based on all columns.

        >>> df.drop_duplicates()
             brand style  rating
        0  Yum Yum   cup     4.0
        2  Indomie   cup     3.5
        3  Indomie  pack    15.0
        4  Indomie  pack     5.0

        To remove duplicates on specific column(s), use ``subset``.

        >>> df.drop_duplicates(subset=['brand'])
             brand style  rating
        0  Yum Yum   cup     4.0
        2  Indomie   cup     3.5

        To remove duplicates and keep last occurrences, use ``keep``.

        >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
             brand style  rating
        1  Yum Yum   cup     4.0
        2  Indomie   cup     3.5
        4  Indomie  pack     5.0
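
        To relabel the result 0, 1, …, n - 1, use ``ignore_index``.

        >>> df.drop_duplicates(ignore_index=True)
             brand style  rating
        0  Yum Yum   cup     4.0
        1  Indomie   cup     3.5
        2  Indomie  pack    15.0
        3  Indomie  pack     5.0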
  5647. """
  5648. if self.empty:
  5649. return self.copy(deep=None)
  5650. inplace = validate_bool_kwarg(inplace, "inplace")
  5651. ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
  5652. result = self[-self.duplicated(subset, keep=keep)]
  5653. if ignore_index:
  5654. result.index = default_index(len(result))
  5655. if inplace:
  5656. self._update_inplace(result)
  5657. return None
  5658. else:
  5659. return result
    def duplicated(
        self,
        subset: Hashable | Sequence[Hashable] | None = None,
        keep: DropKeep = "first",
    ) -> Series:
        """
        Return boolean Series denoting duplicate rows.

        Considering certain columns is optional.

        Parameters
        ----------
        subset : column label or sequence of labels, optional
            Only consider certain columns for identifying duplicates, by
            default use all of the columns.
        keep : {'first', 'last', False}, default 'first'
            Determines which duplicates (if any) to mark.

            - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
            - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
            - False : Mark all duplicates as ``True``.

        Returns
        -------
        Series
            Boolean series for each duplicated rows.

        See Also
        --------
        Index.duplicated : Equivalent method on index.
        Series.duplicated : Equivalent method on Series.
        Series.drop_duplicates : Remove duplicate values from Series.
        DataFrame.drop_duplicates : Remove duplicate values from DataFrame.

        Examples
        --------
        Consider a dataset containing ramen ratings.

        >>> df = pd.DataFrame({
        ...     'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
        ...     'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
        ...     'rating': [4, 4, 3.5, 15, 5]
        ... })
        >>> df
             brand style  rating
        0  Yum Yum   cup     4.0
        1  Yum Yum   cup     4.0
        2  Indomie   cup     3.5
        3  Indomie  pack    15.0
        4  Indomie  pack     5.0

        By default, for each set of duplicated values, the first occurrence
        is set to False and all others to True.

        >>> df.duplicated()
        0    False
        1     True
        2    False
        3    False
        4    False
        dtype: bool

        By using 'last', the last occurrence of each set of duplicated values
        is set to False and all others to True.

        >>> df.duplicated(keep='last')
        0     True
        1    False
        2    False
        3    False
        4    False
        dtype: bool

        By setting ``keep`` to ``False``, all duplicates are marked ``True``.

        >>> df.duplicated(keep=False)
        0     True
        1     True
        2    False
        3    False
        4    False
        dtype: bool

        To find duplicates on specific column(s), use ``subset``.

        >>> df.duplicated(subset=['brand'])
        0    False
        1     True
        2    False
        3     True
        4     True
        dtype: bool
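
        ``subset`` may also name a combination of columns; a row is marked
        only when the whole combination repeats.

        >>> df.duplicated(subset=['brand', 'style'])
        0    False
        1     True
        2    False
        3    False
        4     True
        dtype: bool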
  5737. """
  5738. if self.empty:
  5739. return self._constructor_sliced(dtype=bool)
  5740. def f(vals) -> tuple[np.ndarray, int]:
  5741. labels, shape = algorithms.factorize(vals, size_hint=len(self))
  5742. return labels.astype("i8", copy=False), len(shape)
  5743. if subset is None:
  5744. # https://github.com/pandas-dev/pandas/issues/28770
  5745. # Incompatible types in assignment (expression has type "Index", variable
  5746. # has type "Sequence[Any]")
  5747. subset = self.columns # type: ignore[assignment]
  5748. elif (
  5749. not np.iterable(subset)
  5750. or isinstance(subset, str)
  5751. or isinstance(subset, tuple)
  5752. and subset in self.columns
  5753. ):
  5754. subset = (subset,)
  5755. # needed for mypy since can't narrow types using np.iterable
  5756. subset = cast(Sequence, subset)
  5757. # Verify all columns in subset exist in the queried dataframe
  5758. # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
  5759. # key that doesn't exist.
  5760. diff = set(subset) - set(self.columns)
  5761. if diff:
  5762. raise KeyError(Index(diff))
  5763. if len(subset) == 1 and self.columns.is_unique:
  5764. # GH#45236 This is faster than get_group_index below
  5765. result = self[subset[0]].duplicated(keep)
  5766. result.name = None
  5767. else:
  5768. vals = (col.values for name, col in self.items() if name in subset)
  5769. labels, shape = map(list, zip(*map(f, vals)))
  5770. ids = get_group_index(
  5771. labels,
  5772. # error: Argument 1 to "tuple" has incompatible type "List[_T]";
  5773. # expected "Iterable[int]"
  5774. tuple(shape), # type: ignore[arg-type]
  5775. sort=False,
  5776. xnull=False,
  5777. )
  5778. result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
  5779. return result.__finalize__(self, method="duplicated")
    # ----------------------------------------------------------------------
    # Sorting
    # error: Signature of "sort_values" incompatible with supertype "NDFrame"
    @overload  # type: ignore[override]
    def sort_values(
        self,
        by: IndexLabel,
        *,
        axis: Axis = ...,
        ascending=...,
        inplace: Literal[False] = ...,
        kind: str = ...,
        na_position: str = ...,
        ignore_index: bool = ...,
        key: ValueKeyFunc = ...,
    ) -> DataFrame:
        ...

    @overload
    def sort_values(
        self,
        by: IndexLabel,
        *,
        axis: Axis = ...,
        ascending=...,
        inplace: Literal[True],
        kind: str = ...,
        na_position: str = ...,
        ignore_index: bool = ...,
        key: ValueKeyFunc = ...,
    ) -> None:
        ...

    # TODO: Just move the sort_values doc here.
    @Substitution(**_shared_doc_kwargs)
    @Appender(NDFrame.sort_values.__doc__)
    def sort_values(
        self,
        by: IndexLabel,
        *,
        axis: Axis = 0,
        ascending: bool | list[bool] | tuple[bool, ...] = True,
        inplace: bool = False,
        kind: str = "quicksort",
        na_position: str = "last",
        ignore_index: bool = False,
        key: ValueKeyFunc = None,
    ) -> DataFrame | None:
        inplace = validate_bool_kwarg(inplace, "inplace")
        axis = self._get_axis_number(axis)
        ascending = validate_ascending(ascending)
        if not isinstance(by, list):
            by = [by]
        # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
        # expected "Sized"
        if is_sequence(ascending) and (
            len(by) != len(ascending)  # type: ignore[arg-type]
        ):
            # error: Argument 1 to "len" has incompatible type "Union[bool,
            # List[bool]]"; expected "Sized"
            raise ValueError(
                f"Length of ascending ({len(ascending)})"  # type: ignore[arg-type]
                f" != length of by ({len(by)})"
            )
        if len(by) > 1:
            keys = [self._get_label_or_level_values(x, axis=axis) for x in by]

            # need to rewrap columns in Series to apply key function
            if key is not None:
                # error: List comprehension has incompatible type List[Series];
                # expected List[ndarray]
                keys = [
                    Series(k, name=name)  # type: ignore[misc]
                    for (k, name) in zip(keys, by)
                ]

            indexer = lexsort_indexer(
                keys, orders=ascending, na_position=na_position, key=key
            )
        elif len(by):
            # len(by) == 1
            by = by[0]
            k = self._get_label_or_level_values(by, axis=axis)

            # need to rewrap column in Series to apply key function
            if key is not None:
                # error: Incompatible types in assignment (expression has type
                # "Series", variable has type "ndarray")
                k = Series(k, name=by)  # type: ignore[assignment]

            if isinstance(ascending, (tuple, list)):
                ascending = ascending[0]

            indexer = nargsort(
                k, kind=kind, ascending=ascending, na_position=na_position, key=key
            )
        else:
            if inplace:
                return self._update_inplace(self)
            else:
                return self.copy(deep=None)

        if is_range_indexer(indexer, len(indexer)):
            result = self.copy(deep=(not inplace and not using_copy_on_write()))
            if ignore_index:
                result.index = default_index(len(result))

            if inplace:
                return self._update_inplace(result)
            else:
                return result

        new_data = self._mgr.take(
            indexer, axis=self._get_block_manager_axis(axis), verify=False
        )

        if ignore_index:
            new_data.set_axis(
                self._get_block_manager_axis(axis), default_index(len(indexer))
            )

        result = self._constructor(new_data)
        if inplace:
            return self._update_inplace(result)
        else:
            return result.__finalize__(self, method="sort_values")
    @overload
    def sort_index(
        self,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool | Sequence[bool] = ...,
        inplace: Literal[True],
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool = ...,
        ignore_index: bool = ...,
        key: IndexKeyFunc = ...,
    ) -> None:
        ...

    @overload
    def sort_index(
        self,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool | Sequence[bool] = ...,
        inplace: Literal[False] = ...,
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool = ...,
        ignore_index: bool = ...,
        key: IndexKeyFunc = ...,
    ) -> DataFrame:
        ...

    @overload
    def sort_index(
        self,
        *,
        axis: Axis = ...,
        level: IndexLabel = ...,
        ascending: bool | Sequence[bool] = ...,
        inplace: bool = ...,
        kind: SortKind = ...,
        na_position: NaPosition = ...,
        sort_remaining: bool = ...,
        ignore_index: bool = ...,
        key: IndexKeyFunc = ...,
    ) -> DataFrame | None:
        ...

    def sort_index(
        self,
        *,
        axis: Axis = 0,
        level: IndexLabel = None,
        ascending: bool | Sequence[bool] = True,
        inplace: bool = False,
        kind: SortKind = "quicksort",
        na_position: NaPosition = "last",
        sort_remaining: bool = True,
        ignore_index: bool = False,
        key: IndexKeyFunc = None,
    ) -> DataFrame | None:
        """
        Sort object by labels (along an axis).

        Returns a new DataFrame sorted by label if `inplace` argument is
        ``False``, otherwise updates the original DataFrame and returns None.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis along which to sort. The value 0 identifies the rows,
            and 1 identifies the columns.
        level : int or level name or list of ints or list of level names
            If not None, sort on values in specified index level(s).
        ascending : bool or list-like of bools, default True
            Sort ascending vs. descending. When the index is a MultiIndex the
            sort direction can be controlled for each level individually.
        inplace : bool, default False
            Whether to modify the DataFrame rather than creating a new one.
        kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
            Choice of sorting algorithm. See also :func:`numpy.sort` for more
            information. `mergesort` and `stable` are the only stable algorithms. For
            DataFrames, this option is only applied when sorting on a single
            column or label.
        na_position : {'first', 'last'}, default 'last'
            Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
            Not implemented for MultiIndex.
        sort_remaining : bool, default True
            If True and sorting by level and index is multilevel, sort by other
            levels too (in order) after sorting by specified level.
        ignore_index : bool, default False
            If True, the resulting axis will be labeled 0, 1, …, n - 1.
        key : callable, optional
            If not None, apply the key function to the index values
            before sorting. This is similar to the `key` argument in the
            builtin :meth:`sorted` function, with the notable difference that
            this `key` function should be *vectorized*. It should expect an
            ``Index`` and return an ``Index`` of the same shape. For MultiIndex
            inputs, the key is applied *per level*.

            .. versionadded:: 1.1.0

        Returns
        -------
        DataFrame or None
            The original DataFrame sorted by the labels or None if ``inplace=True``.

        See Also
        --------
        Series.sort_index : Sort Series by the index.
        DataFrame.sort_values : Sort DataFrame by the value.
        Series.sort_values : Sort Series by the value.

        Examples
        --------
        >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
        ...                   columns=['A'])
        >>> df.sort_index()
             A
        1    4
        29   2
        100  1
        150  5
        234  3

        By default, it sorts in ascending order; to sort in descending order,
        use ``ascending=False``.

        >>> df.sort_index(ascending=False)
             A
        234  3
        150  5
        100  1
        29   2
        1    4

        A key function can be specified which is applied to the index before
        sorting. For a ``MultiIndex`` this is applied to each level separately.

        >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
        >>> df.sort_index(key=lambda x: x.str.lower())
           a
        A  1
        b  2
        C  3
        d  4
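
        To sort the column labels instead, pass ``axis=1``.

        >>> df = pd.DataFrame({"b": [1, 2], "a": [3, 4]})
        >>> df.sort_index(axis=1)
           a  b
        0  3  1
        1  4  2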
  6027. """
  6028. return super().sort_index(
  6029. axis=axis,
  6030. level=level,
  6031. ascending=ascending,
  6032. inplace=inplace,
  6033. kind=kind,
  6034. na_position=na_position,
  6035. sort_remaining=sort_remaining,
  6036. ignore_index=ignore_index,
  6037. key=key,
  6038. )
    def value_counts(
        self,
        subset: Sequence[Hashable] | None = None,
        normalize: bool = False,
        sort: bool = True,
        ascending: bool = False,
        dropna: bool = True,
    ) -> Series:
        """
        Return a Series containing counts of unique rows in the DataFrame.

        .. versionadded:: 1.1.0

        Parameters
        ----------
        subset : label or list of labels, optional
            Columns to use when counting unique combinations.
        normalize : bool, default False
            Return proportions rather than frequencies.
        sort : bool, default True
            Sort by frequencies.
        ascending : bool, default False
            Sort in ascending order.
        dropna : bool, default True
            Don't include counts of rows that contain NA values.

            .. versionadded:: 1.3.0

        Returns
        -------
        Series

        See Also
        --------
        Series.value_counts : Equivalent method on Series.

        Notes
        -----
        The returned Series will have a MultiIndex with one level per input
        column but an Index (non-multi) for a single label. By default, rows
        that contain any NA values are omitted from the result. By default,
        the resulting Series will be in descending order so that the first
        element is the most frequently-occurring row.

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
        ...                    'num_wings': [2, 0, 0, 0]},
        ...                   index=['falcon', 'dog', 'cat', 'ant'])
        >>> df
                num_legs  num_wings
        falcon         2          2
        dog            4          0
        cat            4          0
        ant            6          0

        >>> df.value_counts()
        num_legs  num_wings
        4         0            2
        2         2            1
        6         0            1
        Name: count, dtype: int64

        >>> df.value_counts(sort=False)
        num_legs  num_wings
        2         2            1
        4         0            2
        6         0            1
        Name: count, dtype: int64

        >>> df.value_counts(ascending=True)
        num_legs  num_wings
        2         2            1
        6         0            1
        4         0            2
        Name: count, dtype: int64

        >>> df.value_counts(normalize=True)
        num_legs  num_wings
        4         0            0.50
        2         2            0.25
        6         0            0.25
        Name: proportion, dtype: float64

        With `dropna` set to `False` we can also count rows with NA values.

        >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
        ...                    'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
        >>> df
          first_name middle_name
        0       John       Smith
        1       Anne        <NA>
        2       John        <NA>
        3       Beth      Louise

        >>> df.value_counts()
        first_name  middle_name
        Beth        Louise         1
        John        Smith          1
        Name: count, dtype: int64

        >>> df.value_counts(dropna=False)
        first_name  middle_name
        Anne        NaN            1
        Beth        Louise         1
        John        Smith          1
                    NaN            1
        Name: count, dtype: int64

        >>> df.value_counts("first_name")
        first_name
        John    2
        Anne    1
        Beth    1
        Name: count, dtype: int64
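
        ``normalize`` can be combined with a single-column ``subset`` in the
        same way; the proportions follow from the counts above:

        >>> df.value_counts("first_name", normalize=True)
        first_name
        John    0.50
        Anne    0.25
        Beth    0.25
        Name: proportion, dtype: float64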
  6138. """
  6139. if subset is None:
  6140. subset = self.columns.tolist()
  6141. name = "proportion" if normalize else "count"
  6142. counts = self.groupby(subset, dropna=dropna).grouper.size()
  6143. counts.name = name
  6144. if sort:
  6145. counts = counts.sort_values(ascending=ascending)
  6146. if normalize:
  6147. counts /= counts.sum()
  6148. # Force MultiIndex for single column
  6149. if is_list_like(subset) and len(subset) == 1:
  6150. counts.index = MultiIndex.from_arrays(
  6151. [counts.index], names=[counts.index.name]
  6152. )
  6153. return counts
    def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
        """
        Return the first `n` rows ordered by `columns` in descending order.

        Return the first `n` rows with the largest values in `columns`, in
        descending order. The columns that are not specified are returned as
        well, but not used for ordering.

        This method is equivalent to
        ``df.sort_values(columns, ascending=False).head(n)``, but more
        performant.

        Parameters
        ----------
        n : int
            Number of rows to return.
        columns : label or list of labels
            Column label(s) to order by.
        keep : {'first', 'last', 'all'}, default 'first'
            Where there are duplicate values:

            - ``first`` : prioritize the first occurrence(s)
            - ``last`` : prioritize the last occurrence(s)
            - ``all`` : do not drop any duplicates, even if it means
              selecting more than `n` items.

        Returns
        -------
        DataFrame
            The first `n` rows ordered by the given columns in descending
            order.

        See Also
        --------
        DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
            ascending order.
        DataFrame.sort_values : Sort DataFrame by the values.
        DataFrame.head : Return the first `n` rows without re-ordering.

        Notes
        -----
        This function cannot be used with all column types. For example, when
        specifying columns with `object` or `category` dtypes, ``TypeError`` is
        raised.

        Examples
        --------
        >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
        ...                                   434000, 434000, 337000, 11300,
        ...                                   11300, 11300],
        ...                    'GDP': [1937894, 2583560, 12011, 4520, 12128,
        ...                            17036, 182, 38, 311],
        ...                    'alpha-2': ["IT", "FR", "MT", "MV", "BN",
        ...                                "IS", "NR", "TV", "AI"]},
        ...                   index=["Italy", "France", "Malta",
        ...                          "Maldives", "Brunei", "Iceland",
        ...                          "Nauru", "Tuvalu", "Anguilla"])
        >>> df
                  population      GDP alpha-2
        Italy       59000000  1937894      IT
        France      65000000  2583560      FR
        Malta         434000    12011      MT
        Maldives      434000     4520      MV
        Brunei        434000    12128      BN
        Iceland       337000    17036      IS
        Nauru          11300      182      NR
        Tuvalu         11300       38      TV
        Anguilla       11300      311      AI

        In the following example, we will use ``nlargest`` to select the three
        rows having the largest values in column "population".

        >>> df.nlargest(3, 'population')
                population      GDP alpha-2
        France    65000000  2583560      FR
        Italy     59000000  1937894      IT
        Malta       434000    12011      MT

        When using ``keep='last'``, ties are resolved in reverse order:

        >>> df.nlargest(3, 'population', keep='last')
                population      GDP alpha-2
        France    65000000  2583560      FR
        Italy     59000000  1937894      IT
        Brunei      434000    12128      BN

        When using ``keep='all'``, all duplicate items are maintained:

        >>> df.nlargest(3, 'population', keep='all')
                  population      GDP alpha-2
        France      65000000  2583560      FR
        Italy       59000000  1937894      IT
        Malta         434000    12011      MT
        Maldives      434000     4520      MV
        Brunei        434000    12128      BN

        To order by the largest values in column "population" and then "GDP",
        we can specify multiple columns like in the next example.

        >>> df.nlargest(3, ['population', 'GDP'])
                population      GDP alpha-2
        France    65000000  2583560      FR
        Italy     59000000  1937894      IT
        Brunei      434000    12128      BN
        """
        return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
    def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
        """
        Return the first `n` rows ordered by `columns` in ascending order.

        Return the first `n` rows with the smallest values in `columns`, in
        ascending order. The columns that are not specified are returned as
        well, but not used for ordering.

        This method is equivalent to
        ``df.sort_values(columns, ascending=True).head(n)``, but more
        performant.

        Parameters
        ----------
        n : int
            Number of items to retrieve.
        columns : list or str
            Column name or names to order by.
        keep : {'first', 'last', 'all'}, default 'first'
            Where there are duplicate values:

            - ``first`` : take the first occurrence.
            - ``last`` : take the last occurrence.
            - ``all`` : do not drop any duplicates, even if it means
              selecting more than `n` items.

        Returns
        -------
        DataFrame

        See Also
        --------
        DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
            descending order.
        DataFrame.sort_values : Sort DataFrame by the values.
        DataFrame.head : Return the first `n` rows without re-ordering.

        Examples
        --------
        >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
        ...                                   434000, 434000, 337000, 337000,
        ...                                   11300, 11300],
        ...                    'GDP': [1937894, 2583560, 12011, 4520, 12128,
        ...                            17036, 182, 38, 311],
        ...                    'alpha-2': ["IT", "FR", "MT", "MV", "BN",
        ...                                "IS", "NR", "TV", "AI"]},
        ...                   index=["Italy", "France", "Malta",
        ...                          "Maldives", "Brunei", "Iceland",
        ...                          "Nauru", "Tuvalu", "Anguilla"])
        >>> df
                  population      GDP alpha-2
        Italy       59000000  1937894      IT
        France      65000000  2583560      FR
        Malta         434000    12011      MT
        Maldives      434000     4520      MV
        Brunei        434000    12128      BN
        Iceland       337000    17036      IS
        Nauru         337000      182      NR
        Tuvalu         11300       38      TV
        Anguilla       11300      311      AI

        In the following example, we will use ``nsmallest`` to select the
        three rows having the smallest values in column "population".

        >>> df.nsmallest(3, 'population')
                  population    GDP alpha-2
        Tuvalu         11300     38      TV
        Anguilla       11300    311      AI
        Iceland       337000  17036      IS

        When using ``keep='last'``, ties are resolved in reverse order:

        >>> df.nsmallest(3, 'population', keep='last')
                  population  GDP alpha-2
        Anguilla       11300  311      AI
        Tuvalu         11300   38      TV
        Nauru         337000  182      NR

        When using ``keep='all'``, all duplicate items are maintained:

        >>> df.nsmallest(3, 'population', keep='all')
                  population    GDP alpha-2
        Tuvalu         11300     38      TV
        Anguilla       11300    311      AI
        Iceland       337000  17036      IS
        Nauru         337000    182      NR

        To order by the smallest values in column "population" and then "GDP", we can
        specify multiple columns like in the next example.

        >>> df.nsmallest(3, ['population', 'GDP'])
                  population  GDP alpha-2
        Tuvalu         11300   38      TV
        Anguilla       11300  311      AI
        Nauru         337000  182      NR
        """
        return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
  6326. @doc(
  6327. Series.swaplevel,
  6328. klass=_shared_doc_kwargs["klass"],
  6329. extra_params=dedent(
  6330. """axis : {0 or 'index', 1 or 'columns'}, default 0
  6331. The axis to swap levels on. 0 or 'index' for row-wise, 1 or
  6332. 'columns' for column-wise."""
  6333. ),
  6334. examples=dedent(
  6335. """\
  6336. Examples
  6337. --------
  6338. >>> df = pd.DataFrame(
  6339. ... {"Grade": ["A", "B", "A", "C"]},
  6340. ... index=[
  6341. ... ["Final exam", "Final exam", "Coursework", "Coursework"],
  6342. ... ["History", "Geography", "History", "Geography"],
  6343. ... ["January", "February", "March", "April"],
  6344. ... ],
  6345. ... )
  6346. >>> df
  6347. Grade
  6348. Final exam History January A
  6349. Geography February B
  6350. Coursework History March A
  6351. Geography April C
6352. In the following example, we will swap the levels of the index.
6353. Here, we will swap the levels row-wise, but levels can be swapped column-wise
6354. in a similar manner. Note that row-wise is the default behaviour.
6355. By not supplying any arguments for i and j, we swap the last and second-to-last
6356. levels.
  6357. >>> df.swaplevel()
  6358. Grade
  6359. Final exam January History A
  6360. February Geography B
  6361. Coursework March History A
  6362. April Geography C
6363. By supplying one argument, we can choose which level to swap the last
6364. level with. We can, for example, swap the first level with the last one as
6365. follows.
  6366. >>> df.swaplevel(0)
  6367. Grade
  6368. January History Final exam A
  6369. February Geography Final exam B
  6370. March History Coursework A
  6371. April Geography Coursework C
6372. We can also define explicitly which levels we want to swap by supplying values
6373. for both i and j. Here, for example, we swap the first and second levels.
  6374. >>> df.swaplevel(0, 1)
  6375. Grade
  6376. History Final exam January A
  6377. Geography Final exam February B
  6378. History Coursework March A
  6379. Geography Coursework April C"""
  6380. ),
  6381. )
  6382. def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
  6383. result = self.copy(deep=None)
  6384. axis = self._get_axis_number(axis)
  6385. if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover
  6386. raise TypeError("Can only swap levels on a hierarchical axis.")
  6387. if axis == 0:
  6388. assert isinstance(result.index, MultiIndex)
  6389. result.index = result.index.swaplevel(i, j)
  6390. else:
  6391. assert isinstance(result.columns, MultiIndex)
  6392. result.columns = result.columns.swaplevel(i, j)
  6393. return result
  6394. def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame:
  6395. """
  6396. Rearrange index levels using input order. May not drop or duplicate levels.
  6397. Parameters
  6398. ----------
  6399. order : list of int or list of str
  6400. List representing new level order. Reference level by number
  6401. (position) or by key (label).
  6402. axis : {0 or 'index', 1 or 'columns'}, default 0
  6403. Where to reorder levels.
  6404. Returns
  6405. -------
  6406. DataFrame
  6407. Examples
  6408. --------
  6409. >>> data = {
  6410. ... "class": ["Mammals", "Mammals", "Reptiles"],
  6411. ... "diet": ["Omnivore", "Carnivore", "Carnivore"],
  6412. ... "species": ["Humans", "Dogs", "Snakes"],
  6413. ... }
  6414. >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
  6415. >>> df = df.set_index(["class", "diet"])
  6416. >>> df
  6417. species
  6418. class diet
  6419. Mammals Omnivore Humans
  6420. Carnivore Dogs
  6421. Reptiles Carnivore Snakes
  6422. Let's reorder the levels of the index:
  6423. >>> df.reorder_levels(["diet", "class"])
  6424. species
  6425. diet class
  6426. Omnivore Mammals Humans
  6427. Carnivore Mammals Dogs
  6428. Reptiles Snakes
  6429. """
  6430. axis = self._get_axis_number(axis)
  6431. if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
  6432. raise TypeError("Can only reorder levels on a hierarchical axis.")
  6433. result = self.copy(deep=None)
  6434. if axis == 0:
  6435. assert isinstance(result.index, MultiIndex)
  6436. result.index = result.index.reorder_levels(order)
  6437. else:
  6438. assert isinstance(result.columns, MultiIndex)
  6439. result.columns = result.columns.reorder_levels(order)
  6440. return result
  6441. # ----------------------------------------------------------------------
  6442. # Arithmetic Methods
  6443. def _cmp_method(self, other, op):
  6444. axis: Literal[1] = 1 # only relevant for Series other case
  6445. self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None)
  6446. # See GH#4537 for discussion of scalar op behavior
  6447. new_data = self._dispatch_frame_op(other, op, axis=axis)
  6448. return self._construct_result(new_data)
  6449. def _arith_method(self, other, op):
  6450. if ops.should_reindex_frame_op(self, other, op, 1, None, None):
  6451. return ops.frame_arith_method_with_reindex(self, other, op)
  6452. axis: Literal[1] = 1 # only relevant for Series other case
  6453. other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
  6454. self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None)
  6455. new_data = self._dispatch_frame_op(other, op, axis=axis)
  6456. return self._construct_result(new_data)
  6457. _logical_method = _arith_method
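# The ``flex`` flag passed to align_method_FRAME above is what separates the
# two families: comparisons (flex=False) require identically-labeled operands,
# while arithmetic (flex=True) aligns them on the union of labels. A minimal
# sketch with hypothetical frames:
#
#   >>> import pandas as pd
#   >>> a = pd.DataFrame({"x": [1]}, index=[0])
#   >>> b = pd.DataFrame({"x": [1]}, index=[1])
#   >>> a + b  # flex alignment: non-overlapping labels become NaN
#        x
#   0  NaN
#   1  NaN
#
# ``a == b``, by contrast, raises ValueError because comparison demands
# identically-labeled DataFrames.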
  6458. def _dispatch_frame_op(self, right, func: Callable, axis: AxisInt | None = None):
  6459. """
  6460. Evaluate the frame operation func(left, right) by evaluating
  6461. column-by-column, dispatching to the Series implementation.
  6462. Parameters
  6463. ----------
  6464. right : scalar, Series, or DataFrame
  6465. func : arithmetic or comparison operator
  6466. axis : {None, 0, 1}
  6467. Returns
  6468. -------
  6469. DataFrame
  6470. """
  6471. # Get the appropriate array-op to apply to each column/block's values.
  6472. array_op = ops.get_array_op(func)
  6473. right = lib.item_from_zerodim(right)
  6474. if not is_list_like(right):
  6475. # i.e. scalar, faster than checking np.ndim(right) == 0
  6476. with np.errstate(all="ignore"):
  6477. bm = self._mgr.apply(array_op, right=right)
  6478. return self._constructor(bm)
  6479. elif isinstance(right, DataFrame):
  6480. assert self.index.equals(right.index)
  6481. assert self.columns.equals(right.columns)
  6482. # TODO: The previous assertion `assert right._indexed_same(self)`
  6483. # fails in cases with empty columns reached via
  6484. # _frame_arith_method_with_reindex
  6485. # TODO operate_blockwise expects a manager of the same type
  6486. with np.errstate(all="ignore"):
  6487. bm = self._mgr.operate_blockwise(
  6488. # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
  6489. # incompatible type "Union[ArrayManager, BlockManager]"; expected
  6490. # "ArrayManager"
  6491. # error: Argument 1 to "operate_blockwise" of "BlockManager" has
  6492. # incompatible type "Union[ArrayManager, BlockManager]"; expected
  6493. # "BlockManager"
  6494. right._mgr, # type: ignore[arg-type]
  6495. array_op,
  6496. )
  6497. return self._constructor(bm)
  6498. elif isinstance(right, Series) and axis == 1:
  6499. # axis=1 means we want to operate row-by-row
  6500. assert right.index.equals(self.columns)
  6501. right = right._values
  6502. # maybe_align_as_frame ensures we do not have an ndarray here
  6503. assert not isinstance(right, np.ndarray)
  6504. with np.errstate(all="ignore"):
  6505. arrays = [
  6506. array_op(_left, _right)
  6507. for _left, _right in zip(self._iter_column_arrays(), right)
  6508. ]
  6509. elif isinstance(right, Series):
  6510. assert right.index.equals(self.index) # Handle other cases later
  6511. right = right._values
  6512. with np.errstate(all="ignore"):
  6513. arrays = [array_op(left, right) for left in self._iter_column_arrays()]
  6514. else:
  6515. # Remaining cases have less-obvious dispatch rules
  6516. raise NotImplementedError(right)
  6517. return type(self)._from_arrays(
  6518. arrays, self.columns, self.index, verify_integrity=False
  6519. )
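# A short illustration (hypothetical frame, via the public flex method
# ``sub``) of the axis=1 branch above: a Series aligned on the columns is
# broadcast row-by-row, with one array_op call per column array.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": [1, 2], "b": [10, 20]})
#   >>> df.sub(pd.Series({"a": 1, "b": 10}), axis=1)
#      a   b
#   0  0   0
#   1  1  10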
  6520. def _combine_frame(self, other: DataFrame, func, fill_value=None):
  6521. # at this point we have `self._indexed_same(other)`
  6522. if fill_value is None:
  6523. # since _arith_op may be called in a loop, avoid function call
  6524. # overhead if possible by doing this check once
  6525. _arith_op = func
  6526. else:
  6527. def _arith_op(left, right):
  6528. # for the mixed_type case where we iterate over columns,
  6529. # _arith_op(left, right) is equivalent to
  6530. # left._binop(right, func, fill_value=fill_value)
  6531. left, right = ops.fill_binop(left, right, fill_value)
  6532. return func(left, right)
  6533. new_data = self._dispatch_frame_op(other, _arith_op)
  6534. return new_data
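# The fill_value path above mirrors the public flex-arithmetic behaviour:
# positions missing on only one side are filled before the op, while
# positions missing on both sides stay NaN. A minimal sketch using the
# public ``add``:
#
#   >>> import numpy as np
#   >>> import pandas as pd
#   >>> a = pd.DataFrame({"x": [1.0, np.nan, np.nan]})
#   >>> b = pd.DataFrame({"x": [10.0, 20.0, np.nan]})
#   >>> a.add(b, fill_value=0)
#         x
#   0  11.0
#   1  20.0
#   2   NaN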
  6535. def _construct_result(self, result) -> DataFrame:
  6536. """
  6537. Wrap the result of an arithmetic, comparison, or logical operation.
  6538. Parameters
  6539. ----------
  6540. result : DataFrame
  6541. Returns
  6542. -------
  6543. DataFrame
  6544. """
  6545. out = self._constructor(result, copy=False).__finalize__(self)
  6546. # Pin columns instead of passing to constructor for compat with
  6547. # non-unique columns case
  6548. out.columns = self.columns
  6549. out.index = self.index
  6550. return out
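# Pinning the axes afterwards matters for duplicated labels: reindexing a
# non-unique column index inside the constructor is not supported, while
# direct assignment is. A quick sketch (hypothetical frame):
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame([[1, 2]], columns=["a", "a"])
#   >>> (df + df).columns
#   Index(['a', 'a'], dtype='object')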
  6551. def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
  6552. # Naive implementation, room for optimization
  6553. div = self // other
  6554. mod = self - div * other
  6555. return div, mod
  6556. def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
  6557. # Naive implementation, room for optimization
  6558. div = other // self
  6559. mod = other - div * self
  6560. return div, mod
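# A minimal sketch (illustrative values) of the divmod identity the two
# methods above rely on: floor-divide, then recover the remainder so that
# ``div * other + mod`` reproduces the original operand.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": [7, -7], "b": [10, 11]})
#   >>> div, mod = divmod(df, 3)
#   >>> bool((div * 3 + mod == df).all().all())
#   True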
  6561. # ----------------------------------------------------------------------
  6562. # Combination-Related
  6563. @doc(
  6564. _shared_docs["compare"],
  6565. """
  6566. Returns
  6567. -------
  6568. DataFrame
  6569. DataFrame that shows the differences stacked side by side.
  6570. The resulting index will be a MultiIndex with 'self' and 'other'
  6571. stacked alternately at the inner level.
  6572. Raises
  6573. ------
  6574. ValueError
  6575. When the two DataFrames don't have identical labels or shape.
  6576. See Also
  6577. --------
  6578. Series.compare : Compare with another Series and show differences.
  6579. DataFrame.equals : Test whether two objects contain the same elements.
  6580. Notes
  6581. -----
  6582. Matching NaNs will not appear as a difference.
6583. Can only compare identically-labeled
6584. (i.e. same shape, identical row and column labels) DataFrames.
  6585. Examples
  6586. --------
  6587. >>> df = pd.DataFrame(
  6588. ... {{
  6589. ... "col1": ["a", "a", "b", "b", "a"],
  6590. ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
  6591. ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
  6592. ... }},
  6593. ... columns=["col1", "col2", "col3"],
  6594. ... )
  6595. >>> df
  6596. col1 col2 col3
  6597. 0 a 1.0 1.0
  6598. 1 a 2.0 2.0
  6599. 2 b 3.0 3.0
  6600. 3 b NaN 4.0
  6601. 4 a 5.0 5.0
  6602. >>> df2 = df.copy()
  6603. >>> df2.loc[0, 'col1'] = 'c'
  6604. >>> df2.loc[2, 'col3'] = 4.0
  6605. >>> df2
  6606. col1 col2 col3
  6607. 0 c 1.0 1.0
  6608. 1 a 2.0 2.0
  6609. 2 b 3.0 4.0
  6610. 3 b NaN 4.0
  6611. 4 a 5.0 5.0
  6612. Align the differences on columns
  6613. >>> df.compare(df2)
  6614. col1 col3
  6615. self other self other
  6616. 0 a c NaN NaN
  6617. 2 NaN NaN 3.0 4.0
  6618. Assign result_names
  6619. >>> df.compare(df2, result_names=("left", "right"))
  6620. col1 col3
  6621. left right left right
  6622. 0 a c NaN NaN
  6623. 2 NaN NaN 3.0 4.0
  6624. Stack the differences on rows
  6625. >>> df.compare(df2, align_axis=0)
  6626. col1 col3
  6627. 0 self a NaN
  6628. other c NaN
  6629. 2 self NaN 3.0
  6630. other NaN 4.0
  6631. Keep the equal values
  6632. >>> df.compare(df2, keep_equal=True)
  6633. col1 col3
  6634. self other self other
  6635. 0 a c 1.0 1.0
  6636. 2 b b 3.0 4.0
  6637. Keep all original rows and columns
  6638. >>> df.compare(df2, keep_shape=True)
  6639. col1 col2 col3
  6640. self other self other self other
  6641. 0 a c NaN NaN NaN NaN
  6642. 1 NaN NaN NaN NaN NaN NaN
  6643. 2 NaN NaN NaN NaN 3.0 4.0
  6644. 3 NaN NaN NaN NaN NaN NaN
  6645. 4 NaN NaN NaN NaN NaN NaN
  6646. Keep all original rows and columns and also all original values
  6647. >>> df.compare(df2, keep_shape=True, keep_equal=True)
  6648. col1 col2 col3
  6649. self other self other self other
  6650. 0 a c 1.0 1.0 1.0 1.0
  6651. 1 a a 2.0 2.0 2.0 2.0
  6652. 2 b b 3.0 3.0 3.0 4.0
  6653. 3 b b NaN NaN 4.0 4.0
  6654. 4 a a 5.0 5.0 5.0 5.0
  6655. """,
  6656. klass=_shared_doc_kwargs["klass"],
  6657. )
  6658. def compare(
  6659. self,
  6660. other: DataFrame,
  6661. align_axis: Axis = 1,
  6662. keep_shape: bool = False,
  6663. keep_equal: bool = False,
  6664. result_names: Suffixes = ("self", "other"),
  6665. ) -> DataFrame:
  6666. return super().compare(
  6667. other=other,
  6668. align_axis=align_axis,
  6669. keep_shape=keep_shape,
  6670. keep_equal=keep_equal,
  6671. result_names=result_names,
  6672. )
  6673. def combine(
  6674. self,
  6675. other: DataFrame,
  6676. func: Callable[[Series, Series], Series | Hashable],
  6677. fill_value=None,
  6678. overwrite: bool = True,
  6679. ) -> DataFrame:
  6680. """
  6681. Perform column-wise combine with another DataFrame.
  6682. Combines a DataFrame with `other` DataFrame using `func`
  6683. to element-wise combine columns. The row and column indexes of the
  6684. resulting DataFrame will be the union of the two.
  6685. Parameters
  6686. ----------
  6687. other : DataFrame
  6688. The DataFrame to merge column-wise.
  6689. func : function
6690. Function that takes two Series as inputs and returns a Series or a
6691. scalar. Used to merge the two DataFrames column by column.
  6692. fill_value : scalar value, default None
  6693. The value to fill NaNs with prior to passing any column to the
  6694. merge func.
  6695. overwrite : bool, default True
  6696. If True, columns in `self` that do not exist in `other` will be
  6697. overwritten with NaNs.
  6698. Returns
  6699. -------
  6700. DataFrame
  6701. Combination of the provided DataFrames.
  6702. See Also
  6703. --------
  6704. DataFrame.combine_first : Combine two DataFrame objects and default to
  6705. non-null values in frame calling the method.
  6706. Examples
  6707. --------
  6708. Combine using a simple function that chooses the smaller column.
  6709. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
  6710. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  6711. >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
  6712. >>> df1.combine(df2, take_smaller)
  6713. A B
  6714. 0 0 3
  6715. 1 0 3
  6716. Example using a true element-wise combine function.
  6717. >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
  6718. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  6719. >>> df1.combine(df2, np.minimum)
  6720. A B
  6721. 0 1 2
  6722. 1 0 3
  6723. Using `fill_value` fills Nones prior to passing the column to the
  6724. merge function.
  6725. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
  6726. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  6727. >>> df1.combine(df2, take_smaller, fill_value=-5)
  6728. A B
  6729. 0 0 -5.0
  6730. 1 0 4.0
  6731. However, if the same element in both dataframes is None, that None
  6732. is preserved
  6733. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
  6734. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
  6735. >>> df1.combine(df2, take_smaller, fill_value=-5)
  6736. A B
  6737. 0 0 -5.0
  6738. 1 0 3.0
6739. Example that demonstrates the use of `overwrite` and behavior when
6740. the axes differ between the DataFrames.
  6741. >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
  6742. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
  6743. >>> df1.combine(df2, take_smaller)
  6744. A B C
  6745. 0 NaN NaN NaN
  6746. 1 NaN 3.0 -10.0
  6747. 2 NaN 3.0 1.0
  6748. >>> df1.combine(df2, take_smaller, overwrite=False)
  6749. A B C
  6750. 0 0.0 NaN NaN
  6751. 1 0.0 3.0 -10.0
  6752. 2 NaN 3.0 1.0
  6753. Demonstrating the preference of the passed in dataframe.
  6754. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
  6755. >>> df2.combine(df1, take_smaller)
  6756. A B C
  6757. 0 0.0 NaN NaN
  6758. 1 0.0 3.0 NaN
  6759. 2 NaN 3.0 NaN
  6760. >>> df2.combine(df1, take_smaller, overwrite=False)
  6761. A B C
  6762. 0 0.0 NaN NaN
  6763. 1 0.0 3.0 1.0
  6764. 2 NaN 3.0 1.0
  6765. """
  6766. other_idxlen = len(other.index) # save for compare
  6767. this, other = self.align(other, copy=False)
  6768. new_index = this.index
  6769. if other.empty and len(new_index) == len(self.index):
  6770. return self.copy()
  6771. if self.empty and len(other) == other_idxlen:
  6772. return other.copy()
  6773. # sorts if possible; otherwise align above ensures that these are set-equal
  6774. new_columns = this.columns.union(other.columns)
  6775. do_fill = fill_value is not None
  6776. result = {}
  6777. for col in new_columns:
  6778. series = this[col]
  6779. other_series = other[col]
  6780. this_dtype = series.dtype
  6781. other_dtype = other_series.dtype
  6782. this_mask = isna(series)
  6783. other_mask = isna(other_series)
  6784. # don't overwrite columns unnecessarily
  6785. # DO propagate if this column is not in the intersection
  6786. if not overwrite and other_mask.all():
  6787. result[col] = this[col].copy()
  6788. continue
  6789. if do_fill:
  6790. series = series.copy()
  6791. other_series = other_series.copy()
  6792. series[this_mask] = fill_value
  6793. other_series[other_mask] = fill_value
  6794. if col not in self.columns:
6795. # If col is only in `other`, the aligned `series` from self is
6796. # all NaN; try to cast it to other's dtype.
  6797. new_dtype = other_dtype
  6798. try:
  6799. series = series.astype(new_dtype, copy=False)
  6800. except ValueError:
  6801. # e.g. new_dtype is integer types
  6802. pass
  6803. else:
  6804. # if we have different dtypes, possibly promote
  6805. new_dtype = find_common_type([this_dtype, other_dtype])
  6806. series = series.astype(new_dtype, copy=False)
  6807. other_series = other_series.astype(new_dtype, copy=False)
  6808. arr = func(series, other_series)
  6809. if isinstance(new_dtype, np.dtype):
6810. # only downcast for numpy dtypes; if new_dtype is an EA dtype, `func`
6811. # is expected to return the correct dtype without any additional casting
  6812. # error: No overload variant of "maybe_downcast_to_dtype" matches
  6813. # argument types "Union[Series, Hashable]", "dtype[Any]"
  6814. arr = maybe_downcast_to_dtype( # type: ignore[call-overload]
  6815. arr, new_dtype
  6816. )
  6817. result[col] = arr
  6818. # convert_objects just in case
  6819. return self._constructor(result, index=new_index, columns=new_columns)
  6820. def combine_first(self, other: DataFrame) -> DataFrame:
  6821. """
  6822. Update null elements with value in the same location in `other`.
  6823. Combine two DataFrame objects by filling null values in one DataFrame
  6824. with non-null values from other DataFrame. The row and column indexes
  6825. of the resulting DataFrame will be the union of the two. The resulting
  6826. dataframe contains the 'first' dataframe values and overrides the
  6827. second one values where both first.loc[index, col] and
  6828. second.loc[index, col] are not missing values, upon calling
  6829. first.combine_first(second).
  6830. Parameters
  6831. ----------
  6832. other : DataFrame
  6833. Provided DataFrame to use to fill null values.
  6834. Returns
  6835. -------
  6836. DataFrame
  6837. The result of combining the provided DataFrame with the other object.
  6838. See Also
  6839. --------
  6840. DataFrame.combine : Perform series-wise operation on two DataFrames
  6841. using a given function.
  6842. Examples
  6843. --------
  6844. >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
  6845. >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
  6846. >>> df1.combine_first(df2)
  6847. A B
  6848. 0 1.0 3.0
  6849. 1 0.0 4.0
  6850. Null values still persist if the location of that null value
  6851. does not exist in `other`
  6852. >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
  6853. >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
  6854. >>> df1.combine_first(df2)
  6855. A B C
  6856. 0 NaN 4.0 NaN
  6857. 1 0.0 3.0 1.0
  6858. 2 NaN 3.0 1.0
  6859. """
  6860. from pandas.core.computation import expressions
  6861. def combiner(x, y):
  6862. mask = extract_array(isna(x))
  6863. x_values = extract_array(x, extract_numpy=True)
  6864. y_values = extract_array(y, extract_numpy=True)
  6865. # If the column y in other DataFrame is not in first DataFrame,
  6866. # just return y_values.
  6867. if y.name not in self.columns:
  6868. return y_values
  6869. return expressions.where(mask, y_values, x_values)
  6870. combined = self.combine(other, combiner, overwrite=False)
  6871. dtypes = {
  6872. col: find_common_type([self.dtypes[col], other.dtypes[col]])
  6873. for col in self.columns.intersection(other.columns)
  6874. if not is_dtype_equal(combined.dtypes[col], self.dtypes[col])
  6875. }
  6876. if dtypes:
  6877. combined = combined.astype(dtypes)
  6878. return combined
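# The dtype reconciliation above lets integer data survive the union-align
# (which temporarily introduces NaNs and upcasts to float). A small sketch
# with hypothetical frames:
#
#   >>> import pandas as pd
#   >>> df1 = pd.DataFrame({"A": [1, 2]}, index=[0, 1])
#   >>> df2 = pd.DataFrame({"A": [3, 4, 5]}, index=[0, 1, 2])
#   >>> df1.combine_first(df2)
#      A
#   0  1
#   1  2
#   2  5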
  6879. def update(
  6880. self,
  6881. other,
  6882. join: str = "left",
  6883. overwrite: bool = True,
  6884. filter_func=None,
  6885. errors: str = "ignore",
  6886. ) -> None:
  6887. """
  6888. Modify in place using non-NA values from another DataFrame.
  6889. Aligns on indices. There is no return value.
  6890. Parameters
  6891. ----------
  6892. other : DataFrame, or object coercible into a DataFrame
  6893. Should have at least one matching index/column label
  6894. with the original DataFrame. If a Series is passed,
  6895. its name attribute must be set, and that will be
  6896. used as the column name to align with the original DataFrame.
  6897. join : {'left'}, default 'left'
  6898. Only left join is implemented, keeping the index and columns of the
  6899. original object.
  6900. overwrite : bool, default True
  6901. How to handle non-NA values for overlapping keys:
  6902. * True: overwrite original DataFrame's values
  6903. with values from `other`.
  6904. * False: only update values that are NA in
  6905. the original DataFrame.
  6906. filter_func : callable(1d-array) -> bool 1d-array, optional
  6907. Can choose to replace values other than NA. Return True for values
  6908. that should be updated.
  6909. errors : {'raise', 'ignore'}, default 'ignore'
  6910. If 'raise', will raise a ValueError if the DataFrame and `other`
  6911. both contain non-NA data in the same place.
  6912. Returns
  6913. -------
  6914. None
6915. This method directly changes the calling object.
  6916. Raises
  6917. ------
  6918. ValueError
  6919. * When `errors='raise'` and there's overlapping non-NA data.
6920. * When `errors` is neither `'ignore'` nor `'raise'`.
  6921. NotImplementedError
  6922. * If `join != 'left'`
  6923. See Also
  6924. --------
  6925. dict.update : Similar method for dictionaries.
  6926. DataFrame.merge : For column(s)-on-column(s) operations.
  6927. Examples
  6928. --------
  6929. >>> df = pd.DataFrame({'A': [1, 2, 3],
  6930. ... 'B': [400, 500, 600]})
  6931. >>> new_df = pd.DataFrame({'B': [4, 5, 6],
  6932. ... 'C': [7, 8, 9]})
  6933. >>> df.update(new_df)
  6934. >>> df
  6935. A B
  6936. 0 1 4
  6937. 1 2 5
  6938. 2 3 6
6939. The DataFrame's length does not increase as a result of the update;
  6940. only values at matching index/column labels are updated.
  6941. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  6942. ... 'B': ['x', 'y', 'z']})
  6943. >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
  6944. >>> df.update(new_df)
  6945. >>> df
  6946. A B
  6947. 0 a d
  6948. 1 b e
  6949. 2 c f
  6950. For Series, its name attribute must be set.
  6951. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  6952. ... 'B': ['x', 'y', 'z']})
  6953. >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
  6954. >>> df.update(new_column)
  6955. >>> df
  6956. A B
  6957. 0 a d
  6958. 1 b y
  6959. 2 c e
  6960. >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
  6961. ... 'B': ['x', 'y', 'z']})
  6962. >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
  6963. >>> df.update(new_df)
  6964. >>> df
  6965. A B
  6966. 0 a x
  6967. 1 b d
  6968. 2 c e
  6969. If `other` contains NaNs the corresponding values are not updated
  6970. in the original dataframe.
  6971. >>> df = pd.DataFrame({'A': [1, 2, 3],
  6972. ... 'B': [400, 500, 600]})
  6973. >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
  6974. >>> df.update(new_df)
  6975. >>> df
  6976. A B
  6977. 0 1 4
  6978. 1 2 500
  6979. 2 3 6
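``filter_func`` restricts which original values may be replaced; only
positions where it returns True are eligible. A small sketch (values
chosen for illustration): update only where the original value is
less than 3.
>>> df = pd.DataFrame({'A': [1, 2, 3]})
>>> new_df = pd.DataFrame({'A': [10, 20, 30]})
>>> df.update(new_df, filter_func=lambda x: x < 3)
>>> df
    A
0  10
1  20
2   3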
  6980. """
  6981. from pandas.core.computation import expressions
  6982. # TODO: Support other joins
  6983. if join != "left": # pragma: no cover
  6984. raise NotImplementedError("Only left join is supported")
  6985. if errors not in ["ignore", "raise"]:
  6986. raise ValueError("The parameter errors must be either 'ignore' or 'raise'")
  6987. if not isinstance(other, DataFrame):
  6988. other = DataFrame(other)
  6989. other = other.reindex(self.index)
  6990. for col in self.columns.intersection(other.columns):
  6991. this = self[col]._values
  6992. that = other[col]._values
  6993. if filter_func is not None:
  6994. with np.errstate(all="ignore"):
  6995. mask = ~filter_func(this) | isna(that)
  6996. else:
  6997. if errors == "raise":
  6998. mask_this = notna(that)
  6999. mask_that = notna(this)
  7000. if any(mask_this & mask_that):
  7001. raise ValueError("Data overlaps.")
  7002. if overwrite:
  7003. mask = isna(that)
  7004. else:
  7005. mask = notna(this)
  7006. # don't overwrite columns unnecessarily
  7007. if mask.all():
  7008. continue
  7009. self.loc[:, col] = expressions.where(mask, this, that)
  7010. # ----------------------------------------------------------------------
  7011. # Data reshaping
  7012. @Appender(
  7013. """
  7014. Examples
  7015. --------
  7016. >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
  7017. ... 'Parrot', 'Parrot'],
  7018. ... 'Max Speed': [380., 370., 24., 26.]})
  7019. >>> df
  7020. Animal Max Speed
  7021. 0 Falcon 380.0
  7022. 1 Falcon 370.0
  7023. 2 Parrot 24.0
  7024. 3 Parrot 26.0
  7025. >>> df.groupby(['Animal']).mean()
  7026. Max Speed
  7027. Animal
  7028. Falcon 375.0
  7029. Parrot 25.0
  7030. **Hierarchical Indexes**
  7031. We can groupby different levels of a hierarchical index
  7032. using the `level` parameter:
  7033. >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
  7034. ... ['Captive', 'Wild', 'Captive', 'Wild']]
  7035. >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
  7036. >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
  7037. ... index=index)
  7038. >>> df
  7039. Max Speed
  7040. Animal Type
  7041. Falcon Captive 390.0
  7042. Wild 350.0
  7043. Parrot Captive 30.0
  7044. Wild 20.0
  7045. >>> df.groupby(level=0).mean()
  7046. Max Speed
  7047. Animal
  7048. Falcon 370.0
  7049. Parrot 25.0
  7050. >>> df.groupby(level="Type").mean()
  7051. Max Speed
  7052. Type
  7053. Captive 210.0
  7054. Wild 185.0
7055. We can also choose to include NA in group keys or not by setting the
7056. `dropna` parameter; the default setting is `True`.
  7057. >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
  7058. >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
  7059. >>> df.groupby(by=["b"]).sum()
  7060. a c
  7061. b
  7062. 1.0 2 3
  7063. 2.0 2 5
  7064. >>> df.groupby(by=["b"], dropna=False).sum()
  7065. a c
  7066. b
  7067. 1.0 2 3
  7068. 2.0 2 5
  7069. NaN 1 4
  7070. >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
  7071. >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
  7072. >>> df.groupby(by="a").sum()
  7073. b c
  7074. a
  7075. a 13.0 13.0
  7076. b 12.3 123.0
  7077. >>> df.groupby(by="a", dropna=False).sum()
  7078. b c
  7079. a
  7080. a 13.0 13.0
  7081. b 12.3 123.0
  7082. NaN 12.3 33.0
  7083. When using ``.apply()``, use ``group_keys`` to include or exclude the group keys.
  7084. The ``group_keys`` argument defaults to ``True`` (include).
  7085. >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
  7086. ... 'Parrot', 'Parrot'],
  7087. ... 'Max Speed': [380., 370., 24., 26.]})
  7088. >>> df.groupby("Animal", group_keys=True).apply(lambda x: x)
  7089. Animal Max Speed
  7090. Animal
  7091. Falcon 0 Falcon 380.0
  7092. 1 Falcon 370.0
  7093. Parrot 2 Parrot 24.0
  7094. 3 Parrot 26.0
  7095. >>> df.groupby("Animal", group_keys=False).apply(lambda x: x)
  7096. Animal Max Speed
  7097. 0 Falcon 380.0
  7098. 1 Falcon 370.0
  7099. 2 Parrot 24.0
  7100. 3 Parrot 26.0
  7101. """
  7102. )
  7103. @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
  7104. def groupby(
  7105. self,
  7106. by=None,
  7107. axis: Axis = 0,
  7108. level: IndexLabel | None = None,
  7109. as_index: bool = True,
  7110. sort: bool = True,
  7111. group_keys: bool = True,
  7112. observed: bool = False,
  7113. dropna: bool = True,
  7114. ) -> DataFrameGroupBy:
  7115. from pandas.core.groupby.generic import DataFrameGroupBy
  7116. if level is None and by is None:
  7117. raise TypeError("You have to supply one of 'by' and 'level'")
  7118. axis = self._get_axis_number(axis)
  7119. return DataFrameGroupBy(
  7120. obj=self,
  7121. keys=by,
  7122. axis=axis,
  7123. level=level,
  7124. as_index=as_index,
  7125. sort=sort,
  7126. group_keys=group_keys,
  7127. observed=observed,
  7128. dropna=dropna,
  7129. )
  7130. _shared_docs[
  7131. "pivot"
  7132. ] = """
  7133. Return reshaped DataFrame organized by given index / column values.
  7134. Reshape data (produce a "pivot" table) based on column values. Uses
  7135. unique values from specified `index` / `columns` to form axes of the
  7136. resulting DataFrame. This function does not support data
7137. aggregation; multiple values will result in a MultiIndex in the
  7138. columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
  7139. Parameters
  7140. ----------%s
  7141. columns : str or object or a list of str
  7142. Column to use to make new frame's columns.
  7143. .. versionchanged:: 1.1.0
  7144. Also accept list of columns names.
  7145. index : str or object or a list of str, optional
  7146. Column to use to make new frame's index. If not given, uses existing index.
  7147. .. versionchanged:: 1.1.0
  7148. Also accept list of index names.
  7149. values : str, object or a list of the previous, optional
  7150. Column(s) to use for populating new frame's values. If not
  7151. specified, all remaining columns will be used and the result will
  7152. have hierarchically indexed columns.
  7153. Returns
  7154. -------
  7155. DataFrame
  7156. Returns reshaped DataFrame.
  7157. Raises
  7158. ------
  7159. ValueError:
  7160. When there are any `index`, `columns` combinations with multiple
7161. values. Use `DataFrame.pivot_table` when you need to aggregate.
  7162. See Also
  7163. --------
  7164. DataFrame.pivot_table : Generalization of pivot that can handle
  7165. duplicate values for one index/column pair.
  7166. DataFrame.unstack : Pivot based on the index values instead of a
  7167. column.
  7168. wide_to_long : Wide panel to long format. Less flexible but more
  7169. user-friendly than melt.
  7170. Notes
  7171. -----
  7172. For finer-tuned control, see hierarchical indexing documentation along
  7173. with the related stack/unstack methods.
  7174. Reference :ref:`the user guide <reshaping.pivot>` for more examples.
  7175. Examples
  7176. --------
  7177. >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
  7178. ... 'two'],
  7179. ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
  7180. ... 'baz': [1, 2, 3, 4, 5, 6],
  7181. ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
  7182. >>> df
  7183. foo bar baz zoo
  7184. 0 one A 1 x
  7185. 1 one B 2 y
  7186. 2 one C 3 z
  7187. 3 two A 4 q
  7188. 4 two B 5 w
  7189. 5 two C 6 t
  7190. >>> df.pivot(index='foo', columns='bar', values='baz')
  7191. bar A B C
  7192. foo
  7193. one 1 2 3
  7194. two 4 5 6
  7195. >>> df.pivot(index='foo', columns='bar')['baz']
  7196. bar A B C
  7197. foo
  7198. one 1 2 3
  7199. two 4 5 6
  7200. >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
  7201. baz zoo
  7202. bar A B C A B C
  7203. foo
  7204. one 1 2 3 x y z
  7205. two 4 5 6 q w t
  7206. You could also assign a list of column names or a list of index names.
  7207. >>> df = pd.DataFrame({
  7208. ... "lev1": [1, 1, 1, 2, 2, 2],
  7209. ... "lev2": [1, 1, 2, 1, 1, 2],
  7210. ... "lev3": [1, 2, 1, 2, 1, 2],
  7211. ... "lev4": [1, 2, 3, 4, 5, 6],
  7212. ... "values": [0, 1, 2, 3, 4, 5]})
  7213. >>> df
  7214. lev1 lev2 lev3 lev4 values
  7215. 0 1 1 1 1 0
  7216. 1 1 1 2 2 1
  7217. 2 1 2 1 3 2
  7218. 3 2 1 2 4 3
  7219. 4 2 1 1 5 4
  7220. 5 2 2 2 6 5
  7221. >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")
  7222. lev2 1 2
  7223. lev3 1 2 1 2
  7224. lev1
  7225. 1 0.0 1.0 2.0 NaN
  7226. 2 4.0 3.0 NaN 5.0
  7227. >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")
  7228. lev3 1 2
  7229. lev1 lev2
  7230. 1 1 0.0 1.0
  7231. 2 2.0 NaN
  7232. 2 1 4.0 3.0
  7233. 2 NaN 5.0
  7234. A ValueError is raised if there are any duplicates.
  7235. >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
  7236. ... "bar": ['A', 'A', 'B', 'C'],
  7237. ... "baz": [1, 2, 3, 4]})
  7238. >>> df
  7239. foo bar baz
  7240. 0 one A 1
  7241. 1 one A 2
  7242. 2 two B 3
  7243. 3 two C 4
  7244. Notice that the first two rows are the same for our `index`
  7245. and `columns` arguments.
  7246. >>> df.pivot(index='foo', columns='bar', values='baz')
  7247. Traceback (most recent call last):
  7248. ...
  7249. ValueError: Index contains duplicate entries, cannot reshape
  7250. """
  7251. @Substitution("")
  7252. @Appender(_shared_docs["pivot"])
  7253. def pivot(self, *, columns, index=lib.NoDefault, values=lib.NoDefault) -> DataFrame:
  7254. from pandas.core.reshape.pivot import pivot
  7255. return pivot(self, index=index, columns=columns, values=values)
  7256. _shared_docs[
  7257. "pivot_table"
  7258. ] = """
  7259. Create a spreadsheet-style pivot table as a DataFrame.
  7260. The levels in the pivot table will be stored in MultiIndex objects
  7261. (hierarchical indexes) on the index and columns of the result DataFrame.
  7262. Parameters
  7263. ----------%s
  7264. values : list-like or scalar, optional
  7265. Column or columns to aggregate.
  7266. index : column, Grouper, array, or list of the previous
  7267. If an array is passed, it must be the same length as the data. The
  7268. list can contain any of the other types (except list).
  7269. Keys to group by on the pivot table index. If an array is passed,
7270. it is used in the same manner as column values.
  7271. columns : column, Grouper, array, or list of the previous
  7272. If an array is passed, it must be the same length as the data. The
  7273. list can contain any of the other types (except list).
  7274. Keys to group by on the pivot table column. If an array is passed,
7275. it is used in the same manner as column values.
7276. aggfunc : function, list of functions, dict, default "mean"
  7277. If list of functions passed, the resulting pivot table will have
  7278. hierarchical columns whose top level are the function names
  7279. (inferred from the function objects themselves)
  7280. If dict is passed, the key is column to aggregate and value
7281. is function or list of functions. If ``margins=True``,
  7282. aggfunc will be used to calculate the partial aggregates.
  7283. fill_value : scalar, default None
  7284. Value to replace missing values with (in the resulting pivot table,
  7285. after aggregation).
  7286. margins : bool, default False
  7287. If ``margins=True``, special ``All`` columns and rows
  7288. will be added with partial group aggregates across the categories
  7289. on the rows and columns.
  7290. dropna : bool, default True
  7291. Do not include columns whose entries are all NaN. If True,
  7292. rows with a NaN value in any column will be omitted before
  7293. computing margins.
  7294. margins_name : str, default 'All'
  7295. Name of the row / column that will contain the totals
  7296. when margins is True.
  7297. observed : bool, default False
  7298. This only applies if any of the groupers are Categoricals.
  7299. If True: only show observed values for categorical groupers.
  7300. If False: show all values for categorical groupers.
  7301. sort : bool, default True
  7302. Specifies if the result should be sorted.
  7303. .. versionadded:: 1.3.0
  7304. Returns
  7305. -------
  7306. DataFrame
  7307. An Excel style pivot table.
  7308. See Also
  7309. --------
  7310. DataFrame.pivot : Pivot without aggregation that can handle
  7311. non-numeric data.
  7312. DataFrame.melt: Unpivot a DataFrame from wide to long format,
  7313. optionally leaving identifiers set.
  7314. wide_to_long : Wide panel to long format. Less flexible but more
  7315. user-friendly than melt.
  7316. Notes
  7317. -----
  7318. Reference :ref:`the user guide <reshaping.pivot>` for more examples.
  7319. Examples
  7320. --------
  7321. >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
  7322. ... "bar", "bar", "bar", "bar"],
  7323. ... "B": ["one", "one", "one", "two", "two",
  7324. ... "one", "one", "two", "two"],
  7325. ... "C": ["small", "large", "large", "small",
  7326. ... "small", "large", "small", "small",
  7327. ... "large"],
  7328. ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
  7329. ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
  7330. >>> df
  7331. A B C D E
  7332. 0 foo one small 1 2
  7333. 1 foo one large 2 4
  7334. 2 foo one large 2 5
  7335. 3 foo two small 3 5
  7336. 4 foo two small 3 6
  7337. 5 bar one large 4 6
  7338. 6 bar one small 5 8
  7339. 7 bar two small 6 9
  7340. 8 bar two large 7 9
  7341. This first example aggregates values by taking the sum.
  7342. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
  7343. ... columns=['C'], aggfunc=np.sum)
  7344. >>> table
  7345. C large small
  7346. A B
  7347. bar one 4.0 5.0
  7348. two 7.0 6.0
  7349. foo one 4.0 1.0
  7350. two NaN 6.0
  7351. We can also fill missing values using the `fill_value` parameter.
  7352. >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
  7353. ... columns=['C'], aggfunc=np.sum, fill_value=0)
  7354. >>> table
  7355. C large small
  7356. A B
  7357. bar one 4 5
  7358. two 7 6
  7359. foo one 4 1
  7360. two 0 6
  7361. The next example aggregates by taking the mean across multiple columns.
  7362. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
  7363. ... aggfunc={'D': np.mean, 'E': np.mean})
  7364. >>> table
  7365. D E
  7366. A C
  7367. bar large 5.500000 7.500000
  7368. small 5.500000 8.500000
  7369. foo large 2.000000 4.500000
  7370. small 2.333333 4.333333
  7371. We can also calculate multiple types of aggregations for any given
  7372. value column.
  7373. >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
  7374. ... aggfunc={'D': np.mean,
  7375. ... 'E': [min, max, np.mean]})
  7376. >>> table
  7377. D E
  7378. mean max mean min
  7379. A C
  7380. bar large 5.500000 9 7.500000 6
  7381. small 5.500000 9 8.500000 8
  7382. foo large 2.000000 5 4.500000 4
  7383. small 2.333333 6 4.333333 2
  7384. """
  7385. @Substitution("")
  7386. @Appender(_shared_docs["pivot_table"])
  7387. def pivot_table(
  7388. self,
  7389. values=None,
  7390. index=None,
  7391. columns=None,
  7392. aggfunc: AggFuncType = "mean",
  7393. fill_value=None,
  7394. margins: bool = False,
  7395. dropna: bool = True,
  7396. margins_name: Level = "All",
  7397. observed: bool = False,
  7398. sort: bool = True,
  7399. ) -> DataFrame:
  7400. from pandas.core.reshape.pivot import pivot_table
  7401. return pivot_table(
  7402. self,
  7403. values=values,
  7404. index=index,
  7405. columns=columns,
  7406. aggfunc=aggfunc,
  7407. fill_value=fill_value,
  7408. margins=margins,
  7409. dropna=dropna,
  7410. margins_name=margins_name,
  7411. observed=observed,
  7412. sort=sort,
  7413. )
  7414. def stack(self, level: Level = -1, dropna: bool = True):
  7415. """
  7416. Stack the prescribed level(s) from columns to index.
  7417. Return a reshaped DataFrame or Series having a multi-level
  7418. index with one or more new inner-most levels compared to the current
  7419. DataFrame. The new inner-most levels are created by pivoting the
  7420. columns of the current dataframe:
  7421. - if the columns have a single level, the output is a Series;
  7422. - if the columns have multiple levels, the new index
  7423. level(s) is (are) taken from the prescribed level(s) and
  7424. the output is a DataFrame.
  7425. Parameters
  7426. ----------
  7427. level : int, str, list, default -1
  7428. Level(s) to stack from the column axis onto the index
  7429. axis, defined as one index or label, or a list of indices
  7430. or labels.
  7431. dropna : bool, default True
  7432. Whether to drop rows in the resulting Frame/Series with
  7433. missing values. Stacking a column level onto the index
  7434. axis can create combinations of index and column values
  7435. that are missing from the original dataframe. See Examples
  7436. section.
  7437. Returns
  7438. -------
  7439. DataFrame or Series
  7440. Stacked dataframe or series.
  7441. See Also
  7442. --------
  7443. DataFrame.unstack : Unstack prescribed level(s) from index axis
  7444. onto column axis.
  7445. DataFrame.pivot : Reshape dataframe from long format to wide
  7446. format.
  7447. DataFrame.pivot_table : Create a spreadsheet-style pivot table
  7448. as a DataFrame.
  7449. Notes
  7450. -----
  7451. The function is named by analogy with a collection of books
  7452. being reorganized from being side by side on a horizontal
  7453. position (the columns of the dataframe) to being stacked
  7454. vertically on top of each other (in the index of the
  7455. dataframe).
  7456. Reference :ref:`the user guide <reshaping.stacking>` for more examples.
  7457. Examples
  7458. --------
  7459. **Single level columns**
  7460. >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
  7461. ... index=['cat', 'dog'],
  7462. ... columns=['weight', 'height'])
  7463. Stacking a dataframe with a single level column axis returns a Series:
  7464. >>> df_single_level_cols
  7465. weight height
  7466. cat 0 1
  7467. dog 2 3
  7468. >>> df_single_level_cols.stack()
  7469. cat weight 0
  7470. height 1
  7471. dog weight 2
  7472. height 3
  7473. dtype: int64
  7474. **Multi level columns: simple case**
  7475. >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
  7476. ... ('weight', 'pounds')])
  7477. >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
  7478. ... index=['cat', 'dog'],
  7479. ... columns=multicol1)
  7480. Stacking a dataframe with a multi-level column axis:
  7481. >>> df_multi_level_cols1
  7482. weight
  7483. kg pounds
  7484. cat 1 2
  7485. dog 2 4
  7486. >>> df_multi_level_cols1.stack()
  7487. weight
  7488. cat kg 1
  7489. pounds 2
  7490. dog kg 2
  7491. pounds 4
  7492. **Missing values**
  7493. >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
  7494. ... ('height', 'm')])
  7495. >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
  7496. ... index=['cat', 'dog'],
  7497. ... columns=multicol2)
  7498. It is common to have missing values when stacking a dataframe
  7499. with multi-level columns, as the stacked dataframe typically
  7500. has more values than the original dataframe. Missing values
  7501. are filled with NaNs:
  7502. >>> df_multi_level_cols2
  7503. weight height
  7504. kg m
  7505. cat 1.0 2.0
  7506. dog 3.0 4.0
  7507. >>> df_multi_level_cols2.stack()
  7508. height weight
  7509. cat kg NaN 1.0
  7510. m 2.0 NaN
  7511. dog kg NaN 3.0
  7512. m 4.0 NaN
  7513. **Prescribing the level(s) to be stacked**
  7514. The first parameter controls which level or levels are stacked:
  7515. >>> df_multi_level_cols2.stack(0)
  7516. kg m
  7517. cat height NaN 2.0
  7518. weight 1.0 NaN
  7519. dog height NaN 4.0
  7520. weight 3.0 NaN
  7521. >>> df_multi_level_cols2.stack([0, 1])
  7522. cat height m 2.0
  7523. weight kg 1.0
  7524. dog height m 4.0
  7525. weight kg 3.0
  7526. dtype: float64
  7527. **Dropping missing values**
  7528. >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
  7529. ... index=['cat', 'dog'],
  7530. ... columns=multicol2)
  7531. Note that rows where all values are missing are dropped by
  7532. default but this behaviour can be controlled via the dropna
  7533. keyword parameter:
  7534. >>> df_multi_level_cols3
  7535. weight height
  7536. kg m
  7537. cat NaN 1.0
  7538. dog 2.0 3.0
  7539. >>> df_multi_level_cols3.stack(dropna=False)
  7540. height weight
  7541. cat kg NaN NaN
  7542. m 1.0 NaN
  7543. dog kg NaN 2.0
  7544. m 3.0 NaN
  7545. >>> df_multi_level_cols3.stack(dropna=True)
  7546. height weight
  7547. cat m 1.0 NaN
  7548. dog kg NaN 2.0
  7549. m 3.0 NaN
  7550. """
  7551. from pandas.core.reshape.reshape import (
  7552. stack,
  7553. stack_multiple,
  7554. )
  7555. if isinstance(level, (tuple, list)):
  7556. result = stack_multiple(self, level, dropna=dropna)
  7557. else:
  7558. result = stack(self, level, dropna=dropna)
  7559. return result.__finalize__(self, method="stack")
  7560. def explode(
  7561. self,
  7562. column: IndexLabel,
  7563. ignore_index: bool = False,
  7564. ) -> DataFrame:
  7565. """
  7566. Transform each element of a list-like to a row, replicating index values.
  7567. Parameters
  7568. ----------
  7569. column : IndexLabel
  7570. Column(s) to explode.
6571. For multiple columns, specify a non-empty list in which each element
6572. is a str or tuple, and the list-like data in all specified columns
6573. must have matching lengths within each row of the frame.
  7574. .. versionadded:: 1.3.0
  7575. Multi-column explode
  7576. ignore_index : bool, default False
  7577. If True, the resulting index will be labeled 0, 1, …, n - 1.
  7578. .. versionadded:: 1.1.0
  7579. Returns
  7580. -------
  7581. DataFrame
  7582. Exploded lists to rows of the subset columns;
  7583. index will be duplicated for these rows.
  7584. Raises
  7585. ------
  7586. ValueError :
  7587. * If columns of the frame are not unique.
6588. * If the specified columns to explode are an empty list.
6589. * If the specified columns to explode do not have matching counts
6590. of elements row-wise in the frame.
  7591. See Also
  7592. --------
  7593. DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
  7594. index labels.
  7595. DataFrame.melt : Unpivot a DataFrame from wide format to long format.
6596. Series.explode : Transform each element of a list-like to a row.
  7597. Notes
  7598. -----
  7599. This routine will explode list-likes including lists, tuples, sets,
  7600. Series, and np.ndarray. The result dtype of the subset rows will
  7601. be object. Scalars will be returned unchanged, and empty list-likes will
  7602. result in a np.nan for that row. In addition, the ordering of rows in the
  7603. output will be non-deterministic when exploding sets.
  7604. Reference :ref:`the user guide <reshaping.explode>` for more examples.
  7605. Examples
  7606. --------
  7607. >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
  7608. ... 'B': 1,
  7609. ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
  7610. >>> df
  7611. A B C
  7612. 0 [0, 1, 2] 1 [a, b, c]
  7613. 1 foo 1 NaN
  7614. 2 [] 1 []
  7615. 3 [3, 4] 1 [d, e]
  7616. Single-column explode.
  7617. >>> df.explode('A')
  7618. A B C
  7619. 0 0 1 [a, b, c]
  7620. 0 1 1 [a, b, c]
  7621. 0 2 1 [a, b, c]
  7622. 1 foo 1 NaN
  7623. 2 NaN 1 []
  7624. 3 3 1 [d, e]
  7625. 3 4 1 [d, e]
  7626. Multi-column explode.
  7627. >>> df.explode(list('AC'))
  7628. A B C
  7629. 0 0 1 a
  7630. 0 1 1 b
  7631. 0 2 1 c
  7632. 1 foo 1 NaN
  7633. 2 NaN 1 NaN
  7634. 3 3 1 d
  7635. 3 4 1 e
  7636. """
  7637. if not self.columns.is_unique:
  7638. duplicate_cols = self.columns[self.columns.duplicated()].tolist()
  7639. raise ValueError(
  7640. f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}"
  7641. )
  7642. columns: list[Hashable]
  7643. if is_scalar(column) or isinstance(column, tuple):
  7644. columns = [column]
  7645. elif isinstance(column, list) and all(
  7646. is_scalar(c) or isinstance(c, tuple) for c in column
  7647. ):
  7648. if not column:
  7649. raise ValueError("column must be nonempty")
  7650. if len(column) > len(set(column)):
  7651. raise ValueError("column must be unique")
  7652. columns = column
  7653. else:
  7654. raise ValueError("column must be a scalar, tuple, or list thereof")
  7655. df = self.reset_index(drop=True)
  7656. if len(columns) == 1:
  7657. result = df[columns[0]].explode()
  7658. else:
  7659. mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1
  7660. counts0 = self[columns[0]].apply(mylen)
  7661. for c in columns[1:]:
  7662. if not all(counts0 == self[c].apply(mylen)):
  7663. raise ValueError("columns must have matching element counts")
  7664. result = DataFrame({c: df[c].explode() for c in columns})
  7665. result = df.drop(columns, axis=1).join(result)
  7666. if ignore_index:
  7667. result.index = default_index(len(result))
  7668. else:
  7669. result.index = self.index.take(result.index)
  7670. result = result.reindex(columns=self.columns, copy=False)
  7671. return result.__finalize__(self, method="explode")
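# A quick illustration (hypothetical data) of the row-wise length check
# above: the list-likes in every exploded column must agree in length on
# each row, otherwise the loop raises.
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"A": [[1, 2], [3]], "B": [["x"], ["y"]]})
#   >>> df.explode(["A", "B"])
#   Traceback (most recent call last):
#   ...
#   ValueError: columns must have matching element counts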
  7672. def unstack(self, level: Level = -1, fill_value=None):
  7673. """
  7674. Pivot a level of the (necessarily hierarchical) index labels.
  7675. Returns a DataFrame having a new level of column labels whose inner-most level
  7676. consists of the pivoted index labels.
  7677. If the index is not a MultiIndex, the output will be a Series
  7678. (the analogue of stack when the columns are not a MultiIndex).
  7679. Parameters
  7680. ----------
  7681. level : int, str, or list of these, default -1 (last level)
  7682. Level(s) of index to unstack, can pass level name.
  7683. fill_value : int, str or dict
  7684. Replace NaN with this value if the unstack produces missing values.
  7685. Returns
  7686. -------
  7687. Series or DataFrame
  7688. See Also
  7689. --------
  7690. DataFrame.pivot : Pivot a table based on column values.
  7691. DataFrame.stack : Pivot a level of the column labels (inverse operation
  7692. from `unstack`).
  7693. Notes
  7694. -----
  7695. Reference :ref:`the user guide <reshaping.stacking>` for more examples.
  7696. Examples
  7697. --------
  7698. >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
  7699. ... ('two', 'a'), ('two', 'b')])
  7700. >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
  7701. >>> s
  7702. one a 1.0
  7703. b 2.0
  7704. two a 3.0
  7705. b 4.0
  7706. dtype: float64
  7707. >>> s.unstack(level=-1)
  7708. a b
  7709. one 1.0 2.0
  7710. two 3.0 4.0
  7711. >>> s.unstack(level=0)
  7712. one two
  7713. a 1.0 3.0
  7714. b 2.0 4.0
  7715. >>> df = s.unstack(level=0)
  7716. >>> df.unstack()
  7717. one a 1.0
  7718. b 2.0
  7719. two a 3.0
  7720. b 4.0
  7721. dtype: float64
  7722. """
  7723. from pandas.core.reshape.reshape import unstack
  7724. result = unstack(self, level, fill_value)
  7725. return result.__finalize__(self, method="unstack")
  7726. @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
  7727. def melt(
  7728. self,
  7729. id_vars=None,
  7730. value_vars=None,
  7731. var_name=None,
  7732. value_name: Hashable = "value",
  7733. col_level: Level = None,
  7734. ignore_index: bool = True,
  7735. ) -> DataFrame:
  7736. return melt(
  7737. self,
  7738. id_vars=id_vars,
  7739. value_vars=value_vars,
  7740. var_name=var_name,
  7741. value_name=value_name,
  7742. col_level=col_level,
  7743. ignore_index=ignore_index,
  7744. ).__finalize__(self, method="melt")
  7745. # ----------------------------------------------------------------------
  7746. # Time series-related
  7747. @doc(
  7748. Series.diff,
  7749. klass="DataFrame",
  7750. extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n "
  7751. "Take difference over rows (0) or columns (1).\n",
  7752. other_klass="Series",
  7753. examples=dedent(
  7754. """
  7755. Difference with previous row
  7756. >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
  7757. ... 'b': [1, 1, 2, 3, 5, 8],
  7758. ... 'c': [1, 4, 9, 16, 25, 36]})
  7759. >>> df
  7760. a b c
  7761. 0 1 1 1
  7762. 1 2 1 4
  7763. 2 3 2 9
  7764. 3 4 3 16
  7765. 4 5 5 25
  7766. 5 6 8 36
  7767. >>> df.diff()
  7768. a b c
  7769. 0 NaN NaN NaN
  7770. 1 1.0 0.0 3.0
  7771. 2 1.0 1.0 5.0
  7772. 3 1.0 1.0 7.0
  7773. 4 1.0 2.0 9.0
  7774. 5 1.0 3.0 11.0
  7775. Difference with previous column
  7776. >>> df.diff(axis=1)
  7777. a b c
  7778. 0 NaN 0 0
  7779. 1 NaN -1 3
  7780. 2 NaN -1 7
  7781. 3 NaN -1 13
  7782. 4 NaN 0 20
  7783. 5 NaN 2 28
  7784. Difference with 3rd previous row
  7785. >>> df.diff(periods=3)
  7786. a b c
  7787. 0 NaN NaN NaN
  7788. 1 NaN NaN NaN
  7789. 2 NaN NaN NaN
  7790. 3 3.0 2.0 15.0
  7791. 4 3.0 4.0 21.0
  7792. 5 3.0 6.0 27.0
  7793. Difference with following row
  7794. >>> df.diff(periods=-1)
  7795. a b c
  7796. 0 -1.0 0.0 -3.0
  7797. 1 -1.0 -1.0 -5.0
  7798. 2 -1.0 -1.0 -7.0
  7799. 3 -1.0 -2.0 -9.0
  7800. 4 -1.0 -3.0 -11.0
  7801. 5 NaN NaN NaN
  7802. Overflow in input dtype
  7803. >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
  7804. >>> df.diff()
  7805. a
  7806. 0 NaN
  7807. 1 255.0"""
  7808. ),
  7809. )
  7810. def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
  7811. if not lib.is_integer(periods):
  7812. if not (
  7813. is_float(periods)
  7814. # error: "int" has no attribute "is_integer"
  7815. and periods.is_integer() # type: ignore[attr-defined]
  7816. ):
  7817. raise ValueError("periods must be an integer")
  7818. periods = int(periods)
  7819. axis = self._get_axis_number(axis)
  7820. if axis == 1:
  7821. if periods != 0:
7822. # in the periods == 0 case, this is equivalent to a diff of 0 periods
  7823. # along axis=0, and the Manager method may be somewhat more
  7824. # performant, so we dispatch in that case.
  7825. return self - self.shift(periods, axis=axis)
  7826. # With periods=0 this is equivalent to a diff with axis=0
  7827. axis = 0
  7828. new_data = self._mgr.diff(n=periods, axis=axis)
  7829. return self._constructor(new_data).__finalize__(self, "diff")
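# The validation above also accepts float inputs that are integral, so a
# hypothetical ``periods=2.0`` behaves exactly like ``periods=2``:
#
#   >>> import pandas as pd
#   >>> df = pd.DataFrame({"a": [1, 3, 6]})
#   >>> df.diff(periods=2.0).equals(df.diff(periods=2))
#   True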
  7830. # ----------------------------------------------------------------------
  7831. # Function application
  7832. def _gotitem(
  7833. self,
  7834. key: IndexLabel,
  7835. ndim: int,
  7836. subset: DataFrame | Series | None = None,
  7837. ) -> DataFrame | Series:
  7838. """
  7839. Sub-classes to define. Return a sliced object.
  7840. Parameters
  7841. ----------
  7842. key : string / list of selections
  7843. ndim : {1, 2}
  7844. requested ndim of result
  7845. subset : object, default None
  7846. subset to act on
  7847. """
  7848. if subset is None:
  7849. subset = self
  7850. elif subset.ndim == 1: # is Series
  7851. return subset
  7852. # TODO: _shallow_copy(subset)?
  7853. return subset[key]
  7854. _agg_summary_and_see_also_doc = dedent(
  7855. """
  7856. The aggregation operations are always performed over an axis, either the
  7857. index (default) or the column axis. This behavior is different from
  7858. `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
  7859. `var`), where the default is to compute the aggregation of the flattened
  7860. array, e.g., ``numpy.mean(arr_2d)`` as opposed to
  7861. ``numpy.mean(arr_2d, axis=0)``.
  7862. `agg` is an alias for `aggregate`. Use the alias.
  7863. See Also
  7864. --------
  7865. DataFrame.apply : Perform any type of operations.
  7866. DataFrame.transform : Perform transformation type operations.
  7867. core.groupby.GroupBy : Perform operations over groups.
  7868. core.resample.Resampler : Perform operations over resampled bins.
  7869. core.window.Rolling : Perform operations over rolling window.
  7870. core.window.Expanding : Perform operations over expanding window.
  7871. core.window.ExponentialMovingWindow : Perform operation over exponential weighted
  7872. window.
  7873. """
  7874. )

    _agg_examples_doc = dedent(
        """
    Examples
    --------
    >>> df = pd.DataFrame([[1, 2, 3],
    ...                    [4, 5, 6],
    ...                    [7, 8, 9],
    ...                    [np.nan, np.nan, np.nan]],
    ...                   columns=['A', 'B', 'C'])

    Aggregate these functions over the rows.

    >>> df.agg(['sum', 'min'])
            A     B     C
    sum  12.0  15.0  18.0
    min   1.0   2.0   3.0

    Different aggregations per column.

    >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
            A    B
    sum  12.0  NaN
    min   1.0  2.0
    max   NaN  8.0

    Aggregate different functions over the columns and rename the index of the resulting
    DataFrame.

    >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean))
         A    B    C
    x  7.0  NaN  NaN
    y  NaN  2.0  NaN
    z  NaN  NaN  6.0

    Aggregate over the columns.

    >>> df.agg("mean", axis="columns")
    0    2.0
    1    5.0
    2    8.0
    3    NaN
    dtype: float64
    """
    )

    @doc(
        _shared_docs["aggregate"],
        klass=_shared_doc_kwargs["klass"],
        axis=_shared_doc_kwargs["axis"],
        see_also=_agg_summary_and_see_also_doc,
        examples=_agg_examples_doc,
    )
    def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
        from pandas.core.apply import frame_apply

        axis = self._get_axis_number(axis)

        relabeling, func, columns, order = reconstruct_func(func, **kwargs)

        op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
        result = op.agg()

        if relabeling:
            # This is to keep the order of columns occurrence unchanged, and
            # also keep the order of new columns occurrence unchanged

            # For the return values of reconstruct_func, if relabeling is
            # False, columns and order will be None.
            assert columns is not None
            assert order is not None

            result_in_dict = relabel_result(result, func, columns, order)
            result = DataFrame(result_in_dict, index=columns)

        return result

    agg = aggregate
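
    # Illustrative sketch (comment only, not upstream source): keyword-only
    # named aggregations take the relabeling branch above, with the new labels
    # becoming the result's index:
    #
    #     >>> df = pd.DataFrame({"A": [1, 2], "B": [3, 4]})
    #     >>> df.agg(total=("A", "sum"))
    #            A
    #     total  3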

    # error: Signature of "any" incompatible with supertype "NDFrame" [override]
    @overload  # type: ignore[override]
    def any(
        self,
        *,
        axis: Axis = ...,
        bool_only: bool | None = ...,
        skipna: bool = ...,
        level: None = ...,
        **kwargs,
    ) -> Series:
        ...

    @overload
    def any(
        self,
        *,
        axis: Axis = ...,
        bool_only: bool | None = ...,
        skipna: bool = ...,
        level: Level,
        **kwargs,
    ) -> DataFrame | Series:
        ...

    # error: Missing return statement
    @doc(NDFrame.any, **_shared_doc_kwargs)
    def any(  # type: ignore[empty-body]
        self,
        axis: Axis = 0,
        bool_only: bool | None = None,
        skipna: bool = True,
        level: Level = None,
        **kwargs,
    ) -> DataFrame | Series:
        ...

    @doc(
        _shared_docs["transform"],
        klass=_shared_doc_kwargs["klass"],
        axis=_shared_doc_kwargs["axis"],
    )
    def transform(
        self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
    ) -> DataFrame:
        from pandas.core.apply import frame_apply

        op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
        result = op.transform()
        assert isinstance(result, DataFrame)
        return result
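
    # Illustrative sketch (comment only, not upstream source): ``transform``
    # must produce a same-length DataFrame, which is what the assertion above
    # enforces:
    #
    #     >>> df = pd.DataFrame({"x": [1, 4]})
    #     >>> df.transform(lambda col: col + 1)
    #        x
    #     0  2
    #     1  5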

    def apply(
        self,
        func: AggFuncType,
        axis: Axis = 0,
        raw: bool = False,
        result_type: Literal["expand", "reduce", "broadcast"] | None = None,
        args=(),
        **kwargs,
    ):
        """
        Apply a function along an axis of the DataFrame.

        Objects passed to the function are Series objects whose index is
        either the DataFrame's index (``axis=0``) or the DataFrame's columns
        (``axis=1``). By default (``result_type=None``), the final return type
        is inferred from the return type of the applied function. Otherwise,
        it depends on the `result_type` argument.

        Parameters
        ----------
        func : function
            Function to apply to each column or row.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            Axis along which the function is applied:

            * 0 or 'index': apply function to each column.
            * 1 or 'columns': apply function to each row.

        raw : bool, default False
            Determines if row or column is passed as a Series or ndarray object:

            * ``False`` : passes each row or column as a Series to the
              function.
            * ``True`` : the passed function will receive ndarray objects
              instead.
              If you are just applying a NumPy reduction function this will
              achieve much better performance.

        result_type : {'expand', 'reduce', 'broadcast', None}, default None
            These only act when ``axis=1`` (columns):

            * 'expand' : list-like results will be turned into columns.
            * 'reduce' : returns a Series if possible rather than expanding
              list-like results. This is the opposite of 'expand'.
            * 'broadcast' : results will be broadcast to the original shape
              of the DataFrame, the original index and columns will be
              retained.

            The default behaviour (None) depends on the return value of the
            applied function: list-like results will be returned as a Series
            of those. However if the apply function returns a Series these
            are expanded to columns.
        args : tuple
            Positional arguments to pass to `func` in addition to the
            array/series.
        **kwargs
            Additional keyword arguments to pass as keywords arguments to
            `func`.

        Returns
        -------
        Series or DataFrame
            Result of applying ``func`` along the given axis of the
            DataFrame.

        See Also
        --------
        DataFrame.applymap: For elementwise operations.
        DataFrame.aggregate: Only perform aggregating type operations.
        DataFrame.transform: Only perform transforming type operations.

        Notes
        -----
        Functions that mutate the passed object can produce unexpected
        behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
        for more details.

        Examples
        --------
        >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
        >>> df
           A  B
        0  4  9
        1  4  9
        2  4  9

        Using a numpy universal function (in this case the same as
        ``np.sqrt(df)``):

        >>> df.apply(np.sqrt)
             A    B
        0  2.0  3.0
        1  2.0  3.0
        2  2.0  3.0

        Using a reducing function on either axis

        >>> df.apply(np.sum, axis=0)
        A    12
        B    27
        dtype: int64

        >>> df.apply(np.sum, axis=1)
        0    13
        1    13
        2    13
        dtype: int64

        Returning a list-like will result in a Series

        >>> df.apply(lambda x: [1, 2], axis=1)
        0    [1, 2]
        1    [1, 2]
        2    [1, 2]
        dtype: object

        Passing ``result_type='expand'`` will expand list-like results
        to columns of a Dataframe

        >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
           0  1
        0  1  2
        1  1  2
        2  1  2

        Returning a Series inside the function is similar to passing
        ``result_type='expand'``. The resulting column names
        will be the Series index.

        >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
           foo  bar
        0    1    2
        1    1    2
        2    1    2

        Passing ``result_type='broadcast'`` will ensure the same shape
        result, whether list-like or scalar is returned by the function,
        and broadcast it along the axis. The resulting column names will
        be the originals.

        >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
           A  B
        0  1  2
        1  1  2
        2  1  2
        """
        from pandas.core.apply import frame_apply

        op = frame_apply(
            self,
            func=func,
            axis=axis,
            raw=raw,
            result_type=result_type,
            args=args,
            kwargs=kwargs,
        )
        return op.apply().__finalize__(self, method="apply")
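
    # Illustrative sketch (comment only, not upstream source): with
    # ``raw=True`` the function receives a plain ndarray rather than a Series,
    # which is faster for NumPy-style reductions:
    #
    #     >>> df = pd.DataFrame([[4, 9]] * 3, columns=["A", "B"])
    #     >>> df.apply(lambda arr: arr.sum(), raw=True)
    #     A    12
    #     B    27
    #     dtype: int64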

    def applymap(
        self, func: PythonFuncType, na_action: str | None = None, **kwargs
    ) -> DataFrame:
        """
        Apply a function to a Dataframe elementwise.

        This method applies a function that accepts and returns a scalar
        to every element of a DataFrame.

        Parameters
        ----------
        func : callable
            Python function, returns a single value from a single value.
        na_action : {None, 'ignore'}, default None
            If 'ignore', propagate NaN values, without passing them to func.

            .. versionadded:: 1.2

        **kwargs
            Additional keyword arguments to pass as keywords arguments to
            `func`.

            .. versionadded:: 1.3.0

        Returns
        -------
        DataFrame
            Transformed DataFrame.

        See Also
        --------
        DataFrame.apply : Apply a function along input axis of DataFrame.

        Examples
        --------
        >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
        >>> df
               0      1
        0  1.000  2.120
        1  3.356  4.567

        >>> df.applymap(lambda x: len(str(x)))
           0  1
        0  3  4
        1  5  5

        Like Series.map, NA values can be ignored:

        >>> df_copy = df.copy()
        >>> df_copy.iloc[0, 0] = pd.NA
        >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore')
             0  1
        0  NaN  4
        1  5.0  5

        Note that a vectorized version of `func` often exists, which will
        be much faster. You could square each number elementwise.

        >>> df.applymap(lambda x: x**2)
                   0          1
        0   1.000000   4.494400
        1  11.262736  20.857489

        But it's better to avoid applymap in that case.

        >>> df ** 2
                   0          1
        0   1.000000   4.494400
        1  11.262736  20.857489
        """
        if na_action not in {"ignore", None}:
            raise ValueError(
                f"na_action must be 'ignore' or None. Got {repr(na_action)}"
            )
        ignore_na = na_action == "ignore"
        func = functools.partial(func, **kwargs)

        # if we have a dtype == 'M8[ns]', provide boxed values
        def infer(x):
            if x.empty:
                return lib.map_infer(x, func, ignore_na=ignore_na)
            return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na)

        return self.apply(infer).__finalize__(self, "applymap")

    # ----------------------------------------------------------------------
    # Merging / joining methods

    def _append(
        self,
        other,
        ignore_index: bool = False,
        verify_integrity: bool = False,
        sort: bool = False,
    ) -> DataFrame:
        if isinstance(other, (Series, dict)):
            if isinstance(other, dict):
                if not ignore_index:
                    raise TypeError("Can only append a dict if ignore_index=True")
                other = Series(other)
            if other.name is None and not ignore_index:
                raise TypeError(
                    "Can only append a Series if ignore_index=True "
                    "or if the Series has a name"
                )

            index = Index(
                [other.name],
                name=self.index.names
                if isinstance(self.index, MultiIndex)
                else self.index.name,
            )
            row_df = other.to_frame().T
            # infer_objects is needed for
            # test_append_empty_frame_to_series_with_dateutil_tz
            other = row_df.infer_objects(copy=False).rename_axis(
                index.names, copy=False
            )
        elif isinstance(other, list):
            if not other:
                pass
            elif not isinstance(other[0], DataFrame):
                other = DataFrame(other)
                if self.index.name is not None and not ignore_index:
                    other.index.name = self.index.name

        from pandas.core.reshape.concat import concat

        if isinstance(other, (list, tuple)):
            to_concat = [self, *other]
        else:
            to_concat = [self, other]

        result = concat(
            to_concat,
            ignore_index=ignore_index,
            verify_integrity=verify_integrity,
            sort=sort,
        )
        return result.__finalize__(self, method="append")
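
    # Illustrative sketch (comment only, not upstream source) of this internal
    # helper: a dict takes the ``ignore_index=True`` branch above, a named
    # Series supplies the new row label instead:
    #
    #     >>> df = pd.DataFrame({"a": [1]})
    #     >>> df._append({"a": 2}, ignore_index=True)
    #        a
    #     0  1
    #     1  2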

    def join(
        self,
        other: DataFrame | Series | Iterable[DataFrame | Series],
        on: IndexLabel | None = None,
        how: MergeHow = "left",
        lsuffix: str = "",
        rsuffix: str = "",
        sort: bool = False,
        validate: str | None = None,
    ) -> DataFrame:
        """
        Join columns of another DataFrame.

        Join columns with `other` DataFrame either on index or on a key
        column. Efficiently join multiple DataFrame objects by index at once by
        passing a list.

        Parameters
        ----------
        other : DataFrame, Series, or a list containing any combination of them
            Index should be similar to one of the columns in this one. If a
            Series is passed, its name attribute must be set, and that will be
            used as the column name in the resulting joined DataFrame.
        on : str, list of str, or array-like, optional
            Column or index level name(s) in the caller to join on the index
            in `other`, otherwise joins index-on-index. If multiple
            values given, the `other` DataFrame must have a MultiIndex. Can
            pass an array as the join key if it is not already contained in
            the calling DataFrame. Like an Excel VLOOKUP operation.
        how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
            How to handle the operation of the two objects.

            * left: use calling frame's index (or column if on is specified)
            * right: use `other`'s index.
            * outer: form union of calling frame's index (or column if on is
              specified) with `other`'s index, and sort it lexicographically.
            * inner: form intersection of calling frame's index (or column if
              on is specified) with `other`'s index, preserving the order
              of the calling's one.
            * cross: creates the cartesian product from both frames, preserves
              the order of the left keys.

              .. versionadded:: 1.2.0

        lsuffix : str, default ''
            Suffix to use from left frame's overlapping columns.
        rsuffix : str, default ''
            Suffix to use from right frame's overlapping columns.
        sort : bool, default False
            Order result DataFrame lexicographically by the join key. If False,
            the order of the join key depends on the join type (how keyword).
        validate : str, optional
            If specified, checks if join is of specified type.

            * "one_to_one" or "1:1": check if join keys are unique in both left
              and right datasets.
            * "one_to_many" or "1:m": check if join keys are unique in left dataset.
            * "many_to_one" or "m:1": check if join keys are unique in right dataset.
            * "many_to_many" or "m:m": allowed, but does not result in checks.

            .. versionadded:: 1.5.0

        Returns
        -------
        DataFrame
            A dataframe containing columns from both the caller and `other`.

        See Also
        --------
        DataFrame.merge : For column(s)-on-column(s) operations.

        Notes
        -----
        Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
        passing a list of `DataFrame` objects.

        Support for specifying index levels as the `on` parameter was added
        in version 0.23.0.

        Examples
        --------
        >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
        ...                    'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
        >>> df
          key   A
        0  K0  A0
        1  K1  A1
        2  K2  A2
        3  K3  A3
        4  K4  A4
        5  K5  A5

        >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
        ...                       'B': ['B0', 'B1', 'B2']})
        >>> other
          key   B
        0  K0  B0
        1  K1  B1
        2  K2  B2

        Join DataFrames using their indexes.

        >>> df.join(other, lsuffix='_caller', rsuffix='_other')
          key_caller   A key_other    B
        0         K0  A0        K0   B0
        1         K1  A1        K1   B1
        2         K2  A2        K2   B2
        3         K3  A3       NaN  NaN
        4         K4  A4       NaN  NaN
        5         K5  A5       NaN  NaN

        If we want to join using the key columns, we need to set key to be
        the index in both `df` and `other`. The joined DataFrame will have
        key as its index.

        >>> df.set_index('key').join(other.set_index('key'))
              A    B
        key
        K0   A0   B0
        K1   A1   B1
        K2   A2   B2
        K3   A3  NaN
        K4   A4  NaN
        K5   A5  NaN

        Another option to join using the key columns is to use the `on`
        parameter. DataFrame.join always uses `other`'s index but we can use
        any column in `df`. This method preserves the original DataFrame's
        index in the result.

        >>> df.join(other.set_index('key'), on='key')
          key   A    B
        0  K0  A0   B0
        1  K1  A1   B1
        2  K2  A2   B2
        3  K3  A3  NaN
        4  K4  A4  NaN
        5  K5  A5  NaN

        Using non-unique key values shows how they are matched.

        >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'],
        ...                    'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
        >>> df
          key   A
        0  K0  A0
        1  K1  A1
        2  K1  A2
        3  K3  A3
        4  K0  A4
        5  K1  A5

        >>> df.join(other.set_index('key'), on='key', validate='m:1')
          key   A    B
        0  K0  A0   B0
        1  K1  A1   B1
        2  K1  A2   B1
        3  K3  A3  NaN
        4  K0  A4   B0
        5  K1  A5   B1
        """
        return self._join_compat(
            other,
            on=on,
            how=how,
            lsuffix=lsuffix,
            rsuffix=rsuffix,
            sort=sort,
            validate=validate,
        )

    def _join_compat(
        self,
        other: DataFrame | Series | Iterable[DataFrame | Series],
        on: IndexLabel | None = None,
        how: MergeHow = "left",
        lsuffix: str = "",
        rsuffix: str = "",
        sort: bool = False,
        validate: str | None = None,
    ):
        from pandas.core.reshape.concat import concat
        from pandas.core.reshape.merge import merge

        if isinstance(other, Series):
            if other.name is None:
                raise ValueError("Other Series must have a name")
            other = DataFrame({other.name: other})

        if isinstance(other, DataFrame):
            if how == "cross":
                return merge(
                    self,
                    other,
                    how=how,
                    on=on,
                    suffixes=(lsuffix, rsuffix),
                    sort=sort,
                    validate=validate,
                )
            return merge(
                self,
                other,
                left_on=on,
                how=how,
                left_index=on is None,
                right_index=True,
                suffixes=(lsuffix, rsuffix),
                sort=sort,
                validate=validate,
            )
        else:
            if on is not None:
                raise ValueError(
                    "Joining multiple DataFrames only supported for joining on index"
                )

            if rsuffix or lsuffix:
                raise ValueError(
                    "Suffixes not supported when joining multiple DataFrames"
                )

            # Mypy thinks the RHS is a
            # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
            # the LHS is an "Iterable[DataFrame]", but in reality both types are
            # "Iterable[Union[DataFrame, Series]]" due to the if statements
            frames = [cast("DataFrame | Series", self)] + list(other)

            can_concat = all(df.index.is_unique for df in frames)

            # join indexes only using concat
            if can_concat:
                if how == "left":
                    res = concat(
                        frames, axis=1, join="outer", verify_integrity=True, sort=sort
                    )
                    return res.reindex(self.index, copy=False)
                else:
                    return concat(
                        frames, axis=1, join=how, verify_integrity=True, sort=sort
                    )

            joined = frames[0]

            for frame in frames[1:]:
                joined = merge(
                    joined,
                    frame,
                    how=how,
                    left_index=True,
                    right_index=True,
                    validate=validate,
                )

            return joined
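
    # Illustrative sketch (comment only, not upstream source): joining a list
    # of frames takes the concat fast path above whenever every index is
    # unique:
    #
    #     >>> left = pd.DataFrame({"a": [1]}, index=["x"])
    #     >>> right = pd.DataFrame({"b": [2]}, index=["x"])
    #     >>> left.join([right])
    #        a  b
    #     x  1  2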

    @Substitution("")
    @Appender(_merge_doc, indents=2)
    def merge(
        self,
        right: DataFrame | Series,
        how: MergeHow = "inner",
        on: IndexLabel | None = None,
        left_on: IndexLabel | None = None,
        right_on: IndexLabel | None = None,
        left_index: bool = False,
        right_index: bool = False,
        sort: bool = False,
        suffixes: Suffixes = ("_x", "_y"),
        copy: bool | None = None,
        indicator: str | bool = False,
        validate: str | None = None,
    ) -> DataFrame:
        from pandas.core.reshape.merge import merge

        return merge(
            self,
            right,
            how=how,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            sort=sort,
            suffixes=suffixes,
            copy=copy,
            indicator=indicator,
            validate=validate,
        )

    def round(
        self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
    ) -> DataFrame:
        """
        Round a DataFrame to a variable number of decimal places.

        Parameters
        ----------
        decimals : int, dict, Series
            Number of decimal places to round each column to. If an int is
            given, round each column to the same number of places.
            Otherwise dict and Series round to variable numbers of places.
            Column names should be in the keys if `decimals` is a
            dict-like, or in the index if `decimals` is a Series. Any
            columns not included in `decimals` will be left as is. Elements
            of `decimals` which are not columns of the input will be
            ignored.
        *args
            Additional keywords have no effect but might be accepted for
            compatibility with numpy.
        **kwargs
            Additional keywords have no effect but might be accepted for
            compatibility with numpy.

        Returns
        -------
        DataFrame
            A DataFrame with the affected columns rounded to the specified
            number of decimal places.

        See Also
        --------
        numpy.around : Round a numpy array to the given number of decimals.
        Series.round : Round a Series to the given number of decimals.

        Examples
        --------
        >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
        ...                   columns=['dogs', 'cats'])
        >>> df
           dogs  cats
        0  0.21  0.32
        1  0.01  0.67
        2  0.66  0.03
        3  0.21  0.18

        By providing an integer each column is rounded to the same number
        of decimal places

        >>> df.round(1)
           dogs  cats
        0   0.2   0.3
        1   0.0   0.7
        2   0.7   0.0
        3   0.2   0.2

        With a dict, the number of places for specific columns can be
        specified with the column names as key and the number of decimal
        places as value

        >>> df.round({'dogs': 1, 'cats': 0})
           dogs  cats
        0   0.2   0.0
        1   0.0   1.0
        2   0.7   0.0
        3   0.2   0.0

        Using a Series, the number of places for specific columns can be
        specified with the column names as index and the number of
        decimal places as value

        >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
        >>> df.round(decimals)
           dogs  cats
        0   0.2   0.0
        1   0.0   1.0
        2   0.7   0.0
        3   0.2   0.0
        """
        from pandas.core.reshape.concat import concat

        def _dict_round(df: DataFrame, decimals):
            for col, vals in df.items():
                try:
                    yield _series_round(vals, decimals[col])
                except KeyError:
                    yield vals

        def _series_round(ser: Series, decimals: int) -> Series:
            if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
                return ser.round(decimals)
            return ser

        nv.validate_round(args, kwargs)

        if isinstance(decimals, (dict, Series)):
            if isinstance(decimals, Series) and not decimals.index.is_unique:
                raise ValueError("Index of decimals must be unique")
            if is_dict_like(decimals) and not all(
                is_integer(value) for _, value in decimals.items()
            ):
                raise TypeError("Values in decimals must be integers")
            new_cols = list(_dict_round(self, decimals))
        elif is_integer(decimals):
            # Dispatch to Block.round
            return self._constructor(
                self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()),
            ).__finalize__(self, method="round")
        else:
            raise TypeError("decimals must be an integer, a dict-like or a Series")

        if new_cols is not None and len(new_cols) > 0:
            return self._constructor(
                concat(new_cols, axis=1), index=self.index, columns=self.columns
            ).__finalize__(self, method="round")
        else:
            return self.copy(deep=False)
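
    # Illustrative sketch (comment only, not upstream source): non-numeric
    # columns named in ``decimals`` are passed through ``_series_round``
    # unchanged:
    #
    #     >>> df = pd.DataFrame({"x": [1.234], "s": ["abc"]})
    #     >>> df.round({"x": 1, "s": 2})
    #          x    s
    #     0  1.2  abc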

    # ----------------------------------------------------------------------
    # Statistical methods, etc.

    def corr(
        self,
        method: CorrelationMethod = "pearson",
        min_periods: int = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Compute pairwise correlation of columns, excluding NA/null values.

        Parameters
        ----------
        method : {'pearson', 'kendall', 'spearman'} or callable
            Method of correlation:

            * pearson : standard correlation coefficient
            * kendall : Kendall Tau correlation coefficient
            * spearman : Spearman rank correlation
            * callable: callable with input two 1d ndarrays
                and returning a float. Note that the returned matrix from corr
                will have 1 along the diagonals and will be symmetric
                regardless of the callable's behavior.
        min_periods : int, optional
            Minimum number of observations required per pair of columns
            to have a valid result. Currently only available for Pearson
            and Spearman correlation.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

            .. versionchanged:: 2.0.0
                The default value of ``numeric_only`` is now ``False``.

        Returns
        -------
        DataFrame
            Correlation matrix.

        See Also
        --------
        DataFrame.corrwith : Compute pairwise correlation with another
            DataFrame or Series.
        Series.corr : Compute the correlation between two Series.

        Notes
        -----
        Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.

        * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
        * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
        * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_

        Examples
        --------
        >>> def histogram_intersection(a, b):
        ...     v = np.minimum(a, b).sum().round(decimals=1)
        ...     return v
        >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.corr(method=histogram_intersection)
              dogs  cats
        dogs   1.0   0.3
        cats   0.3   1.0

        >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)],
        ...                   columns=['dogs', 'cats'])
        >>> df.corr(min_periods=3)
              dogs  cats
        dogs   1.0   NaN
        cats   NaN   1.0
        """  # noqa:E501
        data = self._get_numeric_data() if numeric_only else self
        cols = data.columns
        idx = cols.copy()
        mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

        if method == "pearson":
            correl = libalgos.nancorr(mat, minp=min_periods)
        elif method == "spearman":
            correl = libalgos.nancorr_spearman(mat, minp=min_periods)
        elif method == "kendall" or callable(method):
            if min_periods is None:
                min_periods = 1
            mat = mat.T
            corrf = nanops.get_corr_func(method)
            K = len(cols)
            correl = np.empty((K, K), dtype=float)
            mask = np.isfinite(mat)
            for i, ac in enumerate(mat):
                for j, bc in enumerate(mat):
                    if i > j:
                        continue

                    valid = mask[i] & mask[j]
                    if valid.sum() < min_periods:
                        c = np.nan
                    elif i == j:
                        c = 1.0
                    elif not valid.all():
                        c = corrf(ac[valid], bc[valid])
                    else:
                        c = corrf(ac, bc)
                    correl[i, j] = c
                    correl[j, i] = c
        else:
            raise ValueError(
                "method must be either 'pearson', "
                "'spearman', 'kendall', or a callable, "
                f"'{method}' was supplied"
            )

        result = self._constructor(correl, index=idx, columns=cols, copy=False)
        return result.__finalize__(self, method="corr")

    def cov(
        self,
        min_periods: int | None = None,
        ddof: int | None = 1,
        numeric_only: bool = False,
    ) -> DataFrame:
        """
        Compute pairwise covariance of columns, excluding NA/null values.

        Compute the pairwise covariance among the series of a DataFrame.
        The returned data frame is the `covariance matrix
        <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
        of the DataFrame.

        Both NA and null values are automatically excluded from the
        calculation. (See the note below about bias from missing values.)
        A threshold can be set for the minimum number of
        observations for each value created. Comparisons with observations
        below this threshold will be returned as ``NaN``.

        This method is generally used for the analysis of time series data to
        understand the relationship between different measures
        across time.

        Parameters
        ----------
        min_periods : int, optional
            Minimum number of observations required per pair of columns
            to have a valid result.
        ddof : int, default 1
            Delta degrees of freedom. The divisor used in calculations
            is ``N - ddof``, where ``N`` represents the number of elements.

            .. versionadded:: 1.1.0

        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

            .. versionchanged:: 2.0.0
                The default value of ``numeric_only`` is now ``False``.

        Returns
        -------
        DataFrame
            The covariance matrix of the series of the DataFrame.

        See Also
        --------
        Series.cov : Compute covariance with another Series.
        core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
            covariance.
        core.window.expanding.Expanding.cov : Expanding sample covariance.
        core.window.rolling.Rolling.cov : Rolling sample covariance.

        Notes
        -----
        Returns the covariance matrix of the DataFrame's time series.
        The covariance is normalized by N-ddof.

        For DataFrames that have Series that are missing data (assuming that
        data is `missing at random
        <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
        the returned covariance matrix will be an unbiased estimate
        of the variance and covariance between the member Series.

        However, for many applications this estimate may not be acceptable
        because the estimated covariance matrix is not guaranteed to be
        positive semi-definite. This could lead to estimated correlations
        having absolute values which are greater than one, and/or a
        non-invertible covariance matrix. See `Estimation of covariance
        matrices
        <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_
        matrices>`__ for more details.

        Examples
        --------
        >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
        ...                   columns=['dogs', 'cats'])
        >>> df.cov()
                  dogs      cats
        dogs  0.666667 -1.000000
        cats -1.000000  1.666667

        >>> np.random.seed(42)
        >>> df = pd.DataFrame(np.random.randn(1000, 5),
        ...                   columns=['a', 'b', 'c', 'd', 'e'])
        >>> df.cov()
                  a         b         c         d         e
        a  0.998438 -0.020161  0.059277 -0.008943  0.014144
        b -0.020161  1.059352 -0.008543 -0.024738  0.009826
        c  0.059277 -0.008543  1.010670 -0.001486 -0.000271
        d -0.008943 -0.024738 -0.001486  0.921297 -0.013692
        e  0.014144  0.009826 -0.000271 -0.013692  0.977795

        **Minimum number of periods**

        This method also supports an optional ``min_periods`` keyword
        that specifies the required minimum number of non-NA observations for
        each column pair in order to have a valid result:

        >>> np.random.seed(42)
        >>> df = pd.DataFrame(np.random.randn(20, 3),
        ...                   columns=['a', 'b', 'c'])
        >>> df.loc[df.index[:5], 'a'] = np.nan
        >>> df.loc[df.index[5:10], 'b'] = np.nan
        >>> df.cov(min_periods=12)
                  a         b         c
        a  0.316741       NaN -0.150812
        b       NaN  1.248003  0.191417
        c -0.150812  0.191417  0.895202
        """
        data = self._get_numeric_data() if numeric_only else self
        cols = data.columns
        idx = cols.copy()
        mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)

        if notna(mat).all():
            if min_periods is not None and min_periods > len(mat):
                base_cov = np.empty((mat.shape[1], mat.shape[1]))
                base_cov.fill(np.nan)
            else:
                base_cov = np.cov(mat.T, ddof=ddof)
            base_cov = base_cov.reshape((len(cols), len(cols)))
        else:
            base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)

        result = self._constructor(base_cov, index=idx, columns=cols, copy=False)
        return result.__finalize__(self, method="cov")
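
    # Illustrative sketch (comment only, not upstream source): with no missing
    # values the fast path above delegates straight to ``np.cov``:
    #
    #     >>> df = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [2.0, 4.0, 6.0]})
    #     >>> expected = pd.DataFrame(np.cov(df.to_numpy().T, ddof=1),
    #     ...                         index=df.columns, columns=df.columns)
    #     >>> df.cov().equals(expected)
    #     True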

    def corrwith(
        self,
        other: DataFrame | Series,
        axis: Axis = 0,
        drop: bool = False,
        method: CorrelationMethod = "pearson",
        numeric_only: bool = False,
    ) -> Series:
        """
        Compute pairwise correlation.

        Pairwise correlation is computed between rows or columns of
        DataFrame with rows or columns of Series or DataFrame. DataFrames
        are first aligned along both axes before computing the
        correlations.

        Parameters
        ----------
        other : DataFrame, Series
            Object with which to compute correlations.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
            column-wise.
        drop : bool, default False
            Drop missing indices from result.
        method : {'pearson', 'kendall', 'spearman'} or callable
            Method of correlation:

            * pearson : standard correlation coefficient
            * kendall : Kendall Tau correlation coefficient
            * spearman : Spearman rank correlation
            * callable: callable with input two 1d ndarrays
                and returning a float.

        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

            .. versionadded:: 1.5.0

            .. versionchanged:: 2.0.0
                The default value of ``numeric_only`` is now ``False``.

        Returns
        -------
        Series
            Pairwise correlations.

        See Also
        --------
        DataFrame.corr : Compute pairwise correlation of columns.

        Examples
        --------
        >>> index = ["a", "b", "c", "d", "e"]
        >>> columns = ["one", "two", "three", "four"]
        >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)
        >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns)
        >>> df1.corrwith(df2)
        one      1.0
        two      1.0
        three    1.0
        four     1.0
        dtype: float64

        >>> df2.corrwith(df1, axis=1)
        a    1.0
        b    1.0
        c    1.0
        d    1.0
        e    NaN
        dtype: float64
        """  # noqa:E501
        axis = self._get_axis_number(axis)
        this = self._get_numeric_data() if numeric_only else self

        if isinstance(other, Series):
            return this.apply(lambda x: other.corr(x, method=method), axis=axis)

        if numeric_only:
            other = other._get_numeric_data()
        left, right = this.align(other, join="inner", copy=False)

        if axis == 1:
            left = left.T
            right = right.T

        if method == "pearson":
            # mask missing values
            left = left + right * 0
            right = right + left * 0

            # demeaned data
            ldem = left - left.mean(numeric_only=numeric_only)
            rdem = right - right.mean(numeric_only=numeric_only)

            num = (ldem * rdem).sum()
            dom = (
                (left.count() - 1)
                * left.std(numeric_only=numeric_only)
                * right.std(numeric_only=numeric_only)
            )

            correl = num / dom

        elif method in ["kendall", "spearman"] or callable(method):

            def c(x):
                return nanops.nancorr(x[0], x[1], method=method)

            correl = self._constructor_sliced(
                map(c, zip(left.values.T, right.values.T)),
                index=left.columns,
                copy=False,
            )

        else:
            raise ValueError(
                f"Invalid method {method} was passed, "
                "valid methods are: 'pearson', 'kendall', "
                "'spearman', or callable"
            )

        if not drop:
            # Find non-matching labels along the given axis
            # and append missing correlations (GH 22375)
            raxis: AxisInt = 1 if axis == 0 else 0
            result_index = this._get_axis(raxis).union(other._get_axis(raxis))
            idx_diff = result_index.difference(correl.index)

            if len(idx_diff) > 0:
                correl = correl._append(
                    Series([np.nan] * len(idx_diff), index=idx_diff)
                )

        return correl
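
    # Illustrative sketch (comment only, not upstream source): passing a
    # Series short-circuits to a column-wise ``Series.corr`` via the early
    # return above:
    #
    #     >>> df = pd.DataFrame({"a": [1, 2, 3], "b": [3, 2, 1]})
    #     >>> df.corrwith(pd.Series([1, 2, 3]))
    #     a    1.0
    #     b   -1.0
    #     dtype: float64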

    # ----------------------------------------------------------------------
    # ndarray-like stats methods

    def count(self, axis: Axis = 0, numeric_only: bool = False):
        """
        Count non-NA cells for each column or row.

        The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
        on `pandas.options.mode.use_inf_as_na`) are considered NA.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            If 0 or 'index' counts are generated for each column.
            If 1 or 'columns' counts are generated for each row.
        numeric_only : bool, default False
            Include only `float`, `int` or `boolean` data.

        Returns
        -------
        Series or DataFrame
            For each column/row the number of non-NA/null entries.
            If `level` is specified returns a `DataFrame`.

        See Also
        --------
        Series.count: Number of non-NA elements in a Series.
        DataFrame.value_counts: Count unique combinations of columns.
        DataFrame.shape: Number of DataFrame rows and columns (including NA
            elements).
        DataFrame.isna: Boolean same-sized DataFrame showing places of NA
            elements.

        Examples
        --------
        Constructing DataFrame from a dictionary:

        >>> df = pd.DataFrame({"Person":
        ...                    ["John", "Myla", "Lewis", "John", "Myla"],
        ...                    "Age": [24., np.nan, 21., 33, 26],
        ...                    "Single": [False, True, True, True, False]})
        >>> df
           Person   Age  Single
        0    John  24.0   False
        1    Myla   NaN    True
        2   Lewis  21.0    True
        3    John  33.0    True
        4    Myla  26.0   False

        Notice the uncounted NA values:

        >>> df.count()
        Person    5
        Age       4
        Single    5
        dtype: int64

        Counts for each **row**:

        >>> df.count(axis='columns')
        0    3
        1    2
        2    3
        3    3
        4    3
        dtype: int64
        """
        axis = self._get_axis_number(axis)

        if numeric_only:
            frame = self._get_numeric_data()
        else:
            frame = self

        # GH #423
        if len(frame._get_axis(axis)) == 0:
            result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
        else:
            if frame._is_mixed_type or frame._mgr.any_extension_types:
                # the or any_extension_types is really only hit for single-
                # column frames with an extension array
                result = notna(frame).sum(axis=axis)
            else:
                # GH13407
                series_counts = notna(frame).sum(axis=axis)
                counts = series_counts._values
                result = self._constructor_sliced(
                    counts, index=frame._get_agg_axis(axis), copy=False
                )

        return result.astype("int64").__finalize__(self, method="count")

    def _reduce(
        self,
        op,
        name: str,
        *,
        axis: Axis = 0,
        skipna: bool = True,
        numeric_only: bool = False,
        filter_type=None,
        **kwds,
    ):
        assert filter_type is None or filter_type == "bool", filter_type
        out_dtype = "bool" if filter_type == "bool" else None

        if axis is not None:
            axis = self._get_axis_number(axis)

        def func(values: np.ndarray):
            # We only use this in the case that operates on self.values
            return op(values, axis=axis, skipna=skipna, **kwds)

        def blk_func(values, axis: Axis = 1):
            if isinstance(values, ExtensionArray):
                if not is_1d_only_ea_dtype(values.dtype) and not isinstance(
                    self._mgr, ArrayManager
                ):
                    return values._reduce(name, axis=1, skipna=skipna, **kwds)
                return values._reduce(name, skipna=skipna, **kwds)
            else:
                return op(values, axis=axis, skipna=skipna, **kwds)

        def _get_data() -> DataFrame:
            if filter_type is None:
                data = self._get_numeric_data()
            else:
                # GH#25101, GH#24434
                assert filter_type == "bool"
                data = self._get_bool_data()
            return data

        # Case with EAs see GH#35881
        df = self
        if numeric_only:
            df = _get_data()
        if axis is None:
            return func(df.values)
        elif axis == 1:
            if len(df.index) == 0:
                # Taking a transpose would result in no columns, losing the dtype.
                # In the empty case, reducing along axis 0 or 1 gives the same
                # result dtype, so reduce with axis=0 and ignore values
                result = df._reduce(
                    op,
                    name,
                    axis=0,
                    skipna=skipna,
                    numeric_only=False,
                    filter_type=filter_type,
                    **kwds,
                ).iloc[:0]
                result.index = df.index
                return result
            df = df.T

        # After possibly _get_data and transposing, we are now in the
        # simple case where we can use BlockManager.reduce
        res = df._mgr.reduce(blk_func)
        out = df._constructor(res).iloc[0]
        if out_dtype is not None:
            out = out.astype(out_dtype)
        elif (df._mgr.get_dtypes() == object).any():
            out = out.astype(object)
        elif len(self) == 0 and name in ("sum", "prod"):
            # Even if we are object dtype, follow numpy and return
            # float64, see test_apply_funcs_over_empty
            out = out.astype(np.float64)

        return out

    def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
        """
        Special case for _reduce to try to avoid a potentially-expensive transpose.

        Apply the reduction block-wise along axis=1 and then reduce the resulting
        1D arrays.
        """
        if name == "all":
            result = np.ones(len(self), dtype=bool)
            ufunc = np.logical_and
        elif name == "any":
            result = np.zeros(len(self), dtype=bool)
            # error: Incompatible types in assignment
            # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'],
            # Literal[20], Literal[False]]", variable has type
            # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20],
            # Literal[True]]")
            ufunc = np.logical_or  # type: ignore[assignment]
        else:
            raise NotImplementedError(name)

        for arr in self._mgr.arrays:
            middle = func(arr, axis=0, skipna=skipna)
            result = ufunc(result, middle)

        res_ser = self._constructor_sliced(result, index=self.index, copy=False)
        return res_ser
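
    # Illustrative sketch (comment only, not upstream source): the block-wise
    # path above reproduces the naive row-wise reduction without a transpose:
    #
    #     >>> df = pd.DataFrame({"a": [True, False], "b": [False, False]})
    #     >>> df.any(axis=1).tolist()
    #     [True, False]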

    def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
        """
        Count number of distinct elements in specified axis.

        Return Series with number of distinct elements. Can ignore NaN
        values.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
            column-wise.
        dropna : bool, default True
            Don't include NaN in the counts.

        Returns
        -------
        Series

        See Also
        --------
        Series.nunique: Method nunique for Series.
        DataFrame.count: Count non-NA cells for each column or row.

        Examples
        --------
        >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
        >>> df.nunique()
        A    3
        B    2
        dtype: int64

        >>> df.nunique(axis=1)
        0    1
        1    2
        2    2
        dtype: int64
        """
        return self.apply(Series.nunique, axis=axis, dropna=dropna)

    @doc(_shared_docs["idxmin"], numeric_only_default="False")
    def idxmin(
        self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
    ) -> Series:
        axis = self._get_axis_number(axis)
        if numeric_only:
            data = self._get_numeric_data()
        else:
            data = self

        res = data._reduce(
            nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
        )
        indices = res._values

        # indices will always be np.ndarray since axis is not None and
        # values is a 2d array for DataFrame
        # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
        assert isinstance(indices, np.ndarray)  # for mypy

        index = data._get_axis(axis)
        result = [index[i] if i >= 0 else np.nan for i in indices]
        final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
        return final_result.__finalize__(self, method="idxmin")

    @doc(_shared_docs["idxmax"], numeric_only_default="False")
    def idxmax(
        self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
    ) -> Series:
        axis = self._get_axis_number(axis)
        if numeric_only:
            data = self._get_numeric_data()
        else:
            data = self

        res = data._reduce(
            nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
        )
        indices = res._values

        # indices will always be np.ndarray since axis is not None and
        # values is a 2d array for DataFrame
        # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
        assert isinstance(indices, np.ndarray)  # for mypy

        index = data._get_axis(axis)
        result = [index[i] if i >= 0 else np.nan for i in indices]
        final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
        return final_result.__finalize__(self, method="idxmax")
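
    # Illustrative sketch (comment only, not upstream source): ``idxmin`` and
    # ``idxmax`` translate positional ``argmin``/``argmax`` results back into
    # axis labels, as in the list comprehension above:
    #
    #     >>> df = pd.DataFrame({"a": [2, 1], "b": [3, 4]})
    #     >>> df.idxmin().tolist()
    #     [1, 0]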

    def _get_agg_axis(self, axis_num: int) -> Index:
        """
        Let's be explicit about this.
        """
        if axis_num == 0:
            return self.columns
        elif axis_num == 1:
            return self.index
        else:
            raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")

    def mode(
        self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
    ) -> DataFrame:
        """
        Get the mode(s) of each element along the selected axis.

        The mode of a set of values is the value that appears most often.
        It can be multiple values.

        Parameters
        ----------
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to iterate over while searching for the mode:

            * 0 or 'index' : get mode of each column
            * 1 or 'columns' : get mode of each row.

        numeric_only : bool, default False
            If True, only apply to numeric columns.
        dropna : bool, default True
            Don't consider counts of NaN/NaT.

        Returns
        -------
        DataFrame
            The modes of each column or row.

        See Also
        --------
        Series.mode : Return the highest frequency value in a Series.
        Series.value_counts : Return the counts of values in a Series.

        Examples
        --------
        >>> df = pd.DataFrame([('bird', 2, 2),
        ...                    ('mammal', 4, np.nan),
        ...                    ('arthropod', 8, 0),
        ...                    ('bird', 2, np.nan)],
        ...                   index=('falcon', 'horse', 'spider', 'ostrich'),
        ...                   columns=('species', 'legs', 'wings'))
        >>> df
                   species  legs  wings
        falcon        bird     2    2.0
        horse       mammal     4    NaN
        spider   arthropod     8    0.0
        ostrich       bird     2    NaN

        By default, missing values are not considered, and the modes of wings
        are both 0 and 2. Because the resulting DataFrame has two rows,
        the second row of ``species`` and ``legs`` contains ``NaN``.

        >>> df.mode()
          species  legs  wings
        0    bird   2.0    0.0
        1     NaN   NaN    2.0

        Setting ``dropna=False``, ``NaN`` values are considered and they can be
        the mode (like for wings).

        >>> df.mode(dropna=False)
          species  legs  wings
        0    bird     2    NaN

        Setting ``numeric_only=True``, only the mode of numeric columns is
        computed, and columns of other types are ignored.

        >>> df.mode(numeric_only=True)
           legs  wings
        0   2.0    0.0
        1   NaN    2.0

        To compute the mode over columns and not rows, use the axis parameter:

        >>> df.mode(axis='columns', numeric_only=True)
                   0    1
        falcon   2.0  NaN
        horse    4.0  NaN
        spider   0.0  8.0
        ostrich  2.0  NaN
        """
        data = self if not numeric_only else self._get_numeric_data()

        def f(s):
            return s.mode(dropna=dropna)

        data = data.apply(f, axis=axis)
        # Ensure index is type stable (should always use int index)
        if data.empty:
            data.index = default_index(0)

        return data
  9242. @overload
  9243. def quantile(
  9244. self,
  9245. q: float = ...,
  9246. axis: Axis = ...,
  9247. numeric_only: bool = ...,
  9248. interpolation: QuantileInterpolation = ...,
  9249. ) -> Series:
  9250. ...
  9251. @overload
  9252. def quantile(
  9253. self,
  9254. q: AnyArrayLike | Sequence[float],
  9255. axis: Axis = ...,
  9256. numeric_only: bool = ...,
  9257. interpolation: QuantileInterpolation = ...,
  9258. ) -> Series | DataFrame:
  9259. ...
  9260. @overload
  9261. def quantile(
  9262. self,
  9263. q: float | AnyArrayLike | Sequence[float] = ...,
  9264. axis: Axis = ...,
  9265. numeric_only: bool = ...,
  9266. interpolation: QuantileInterpolation = ...,
  9267. ) -> Series | DataFrame:
  9268. ...
  9269. def quantile(
  9270. self,
  9271. q: float | AnyArrayLike | Sequence[float] = 0.5,
  9272. axis: Axis = 0,
  9273. numeric_only: bool = False,
  9274. interpolation: QuantileInterpolation = "linear",
  9275. method: Literal["single", "table"] = "single",
  9276. ) -> Series | DataFrame:
  9277. """
  9278. Return values at the given quantile over requested axis.
  9279. Parameters
  9280. ----------
  9281. q : float or array-like, default 0.5 (50% quantile)
  9282. Value between 0 <= q <= 1, the quantile(s) to compute.
  9283. axis : {0 or 'index', 1 or 'columns'}, default 0
  9284. Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
  9285. numeric_only : bool, default False
  9286. Include only `float`, `int` or `boolean` data.
  9287. .. versionchanged:: 2.0.0
  9288. The default value of ``numeric_only`` is now ``False``.
  9289. interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
  9290. This optional parameter specifies the interpolation method to use,
  9291. when the desired quantile lies between two data points `i` and `j`:
  9292. * linear: `i + (j - i) * fraction`, where `fraction` is the
  9293. fractional part of the index surrounded by `i` and `j`.
  9294. * lower: `i`.
  9295. * higher: `j`.
  9296. * nearest: `i` or `j` whichever is nearest.
  9297. * midpoint: (`i` + `j`) / 2.
  9298. method : {'single', 'table'}, default 'single'
  9299. Whether to compute quantiles per-column ('single') or over all columns
  9300. ('table'). When 'table', the only allowed interpolation methods are
  9301. 'nearest', 'lower', and 'higher'.
  9302. Returns
  9303. -------
  9304. Series or DataFrame
  9305. If ``q`` is an array, a DataFrame will be returned where the
  9306. index is ``q``, the columns are the columns of self, and the
  9307. values are the quantiles.
  9308. If ``q`` is a float, a Series will be returned where the
  9309. index is the columns of self and the values are the quantiles.
  9310. See Also
  9311. --------
  9312. core.window.rolling.Rolling.quantile: Rolling quantile.
  9313. numpy.percentile: Numpy function to compute the percentile.
  9314. Examples
  9315. --------
  9316. >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
  9317. ... columns=['a', 'b'])
  9318. >>> df.quantile(.1)
  9319. a 1.3
  9320. b 3.7
  9321. Name: 0.1, dtype: float64
  9322. >>> df.quantile([.1, .5])
  9323. a b
  9324. 0.1 1.3 3.7
  9325. 0.5 2.5 55.0
  9326. Specifying `method='table'` will compute the quantile over all columns.
  9327. >>> df.quantile(.1, method="table", interpolation="nearest")
  9328. a 1
  9329. b 1
  9330. Name: 0.1, dtype: int64
  9331. >>> df.quantile([.1, .5], method="table", interpolation="nearest")
  9332. a b
  9333. 0.1 1 1
  9334. 0.5 3 100
  9335. Specifying `numeric_only=False` will also compute the quantile of
  9336. datetime and timedelta data.
  9337. >>> df = pd.DataFrame({'A': [1, 2],
  9338. ... 'B': [pd.Timestamp('2010'),
  9339. ... pd.Timestamp('2011')],
  9340. ... 'C': [pd.Timedelta('1 days'),
  9341. ... pd.Timedelta('2 days')]})
  9342. >>> df.quantile(0.5, numeric_only=False)
  9343. A 1.5
  9344. B 2010-07-02 12:00:00
  9345. C 1 days 12:00:00
  9346. Name: 0.5, dtype: object
  9347. """
        validate_percentile(q)
        axis = self._get_axis_number(axis)

        if not is_list_like(q):
            # BlockManager.quantile expects listlike, so we wrap and unwrap here
            # error: List item 0 has incompatible type "Union[float, Union[Union[
            # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]";
            # expected "float"
            res_df = self.quantile(  # type: ignore[call-overload]
                [q],
                axis=axis,
                numeric_only=numeric_only,
                interpolation=interpolation,
                method=method,
            )
            if method == "single":
                res = res_df.iloc[0]
            else:
                # cannot directly iloc over sparse arrays
                res = res_df.T.iloc[:, 0]

            if axis == 1 and len(self) == 0:
                # GH#41544 try to get an appropriate dtype
                dtype = find_common_type(list(self.dtypes))
                if needs_i8_conversion(dtype):
                    return res.astype(dtype)
            return res

        q = Index(q, dtype=np.float64)
        data = self._get_numeric_data() if numeric_only else self

        if axis == 1:
            data = data.T

        if len(data.columns) == 0:
            # GH#23925 _get_numeric_data may have dropped all columns
            cols = Index([], name=self.columns.name)

            dtype = np.float64
            if axis == 1:
                # GH#41544 try to get an appropriate dtype
                cdtype = find_common_type(list(self.dtypes))
                if needs_i8_conversion(cdtype):
                    dtype = cdtype

            res = self._constructor([], index=q, columns=cols, dtype=dtype)
            return res.__finalize__(self, method="quantile")

        valid_method = {"single", "table"}
        if method not in valid_method:
            raise ValueError(
                f"Invalid method: {method}. Method must be in {valid_method}."
            )
        if method == "single":
            res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation)
        elif method == "table":
            valid_interpolation = {"nearest", "lower", "higher"}
            if interpolation not in valid_interpolation:
                raise ValueError(
                    f"Invalid interpolation: {interpolation}. "
                    f"Interpolation must be in {valid_interpolation}"
                )
            # handle degenerate case
            if len(data) == 0:
                if data.ndim == 2:
                    dtype = find_common_type(list(self.dtypes))
                else:
                    dtype = self.dtype
                return self._constructor([], index=q, columns=data.columns, dtype=dtype)

            q_idx = np.quantile(  # type: ignore[call-overload]
                np.arange(len(data)), q, **{np_percentile_argname: interpolation}
            )

            by = data.columns
            if len(by) > 1:
                keys = [data._get_label_or_level_values(x) for x in by]
                indexer = lexsort_indexer(keys)
            else:
                by = by[0]
                k = data._get_label_or_level_values(by)  # type: ignore[arg-type]
                indexer = nargsort(k)

            res = data._mgr.take(indexer[q_idx], verify=False)
            res.axes[1] = q

        result = self._constructor(res)
        return result.__finalize__(self, method="quantile")
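
    # NOTE (editor's illustrative sketch, not pandas source): the difference
    # between the default per-column quantile and method="table", which sorts
    # whole rows and returns values taken from a single row. Mirrors the
    # documented example above; uses only public API:
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 10, 100, 100]})
    #     df.quantile(0.5)                      # per column: a 2.5, b 55.0
    #     df.quantile(0.5, method="table",
    #                 interpolation="nearest")  # whole row:  a 3,   b 100
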
    @doc(NDFrame.asfreq, **_shared_doc_kwargs)
    def asfreq(
        self,
        freq: Frequency,
        method: FillnaOptions | None = None,
        how: str | None = None,
        normalize: bool = False,
        fill_value: Hashable = None,
    ) -> DataFrame:
        return super().asfreq(
            freq=freq,
            method=method,
            how=how,
            normalize=normalize,
            fill_value=fill_value,
        )
    @doc(NDFrame.resample, **_shared_doc_kwargs)
    def resample(
        self,
        rule,
        axis: Axis = 0,
        closed: str | None = None,
        label: str | None = None,
        convention: str = "start",
        kind: str | None = None,
        on: Level = None,
        level: Level = None,
        origin: str | TimestampConvertibleTypes = "start_day",
        offset: TimedeltaConvertibleTypes | None = None,
        group_keys: bool = False,
    ) -> Resampler:
        return super().resample(
            rule=rule,
            axis=axis,
            closed=closed,
            label=label,
            convention=convention,
            kind=kind,
            on=on,
            level=level,
            origin=origin,
            offset=offset,
            group_keys=group_keys,
        )
    def to_timestamp(
        self,
        freq: Frequency | None = None,
        how: str = "start",
        axis: Axis = 0,
        copy: bool | None = None,
    ) -> DataFrame:
        """
        Cast to DatetimeIndex of timestamps, at *beginning* of period.

        Parameters
        ----------
        freq : str, default frequency of PeriodIndex
            Desired frequency.
        how : {'s', 'e', 'start', 'end'}
            Convention for converting period to timestamp; start of period
            vs. end.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to convert (the index by default).
        copy : bool, default True
            If False then underlying input data is not copied.

        Returns
        -------
        DataFrame
            The DataFrame has a DatetimeIndex.

        Examples
        --------
        >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y')
        >>> d = {'col1': [1, 2], 'col2': [3, 4]}
        >>> df1 = pd.DataFrame(data=d, index=idx)
        >>> df1
              col1  col2
        2023     1     3
        2024     2     4

        The resulting timestamps will be at the beginning of the year in this
        case.

        >>> df1 = df1.to_timestamp()
        >>> df1
                    col1  col2
        2023-01-01     1     3
        2024-01-01     2     4
        >>> df1.index
        DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None)

        Using `freq`, which is the offset that the resulting Timestamps will
        have:

        >>> df2 = pd.DataFrame(data=d, index=idx)
        >>> df2 = df2.to_timestamp(freq='M')
        >>> df2
                    col1  col2
        2023-01-31     1     3
        2024-01-31     2     4
        >>> df2.index
        DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None)
        """
        new_obj = self.copy(deep=copy and not using_copy_on_write())

        axis_name = self._get_axis_name(axis)
        old_ax = getattr(self, axis_name)
        if not isinstance(old_ax, PeriodIndex):
            raise TypeError(f"unsupported Type {type(old_ax).__name__}")

        new_ax = old_ax.to_timestamp(freq=freq, how=how)

        setattr(new_obj, axis_name, new_ax)
        return new_obj
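
    # NOTE (editor's illustrative sketch, not pandas source): `how` picks the
    # start or the end of each period; the docstring above only shows `freq`:
    #
    #     import pandas as pd
    #
    #     idx = pd.PeriodIndex(['2023', '2024'], freq='Y')
    #     df = pd.DataFrame({'col1': [1, 2]}, index=idx)
    #     df.to_timestamp(how='end').index
    #     # DatetimeIndex(['2023-12-31 23:59:59.999999999',
    #     #                '2024-12-31 23:59:59.999999999'], ...)
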
    def to_period(
        self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None
    ) -> DataFrame:
        """
        Convert DataFrame from DatetimeIndex to PeriodIndex.

        Convert DataFrame from DatetimeIndex to PeriodIndex with desired
        frequency (inferred from index if not passed).

        Parameters
        ----------
        freq : str, optional
            Frequency of the PeriodIndex; inferred from the index if not
            passed.
        axis : {0 or 'index', 1 or 'columns'}, default 0
            The axis to convert (the index by default).
        copy : bool, default True
            If False then underlying input data is not copied.

        Returns
        -------
        DataFrame
            The DataFrame has a PeriodIndex.

        Examples
        --------
        >>> idx = pd.to_datetime(
        ...     [
        ...         "2001-03-31 00:00:00",
        ...         "2002-05-31 00:00:00",
        ...         "2003-08-31 00:00:00",
        ...     ]
        ... )

        >>> idx
        DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
        dtype='datetime64[ns]', freq=None)

        >>> idx.to_period("M")
        PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')

        For the yearly frequency:

        >>> idx.to_period("Y")
        PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]')
        """
        new_obj = self.copy(deep=copy and not using_copy_on_write())

        axis_name = self._get_axis_name(axis)
        old_ax = getattr(self, axis_name)
        if not isinstance(old_ax, DatetimeIndex):
            raise TypeError(f"unsupported Type {type(old_ax).__name__}")

        new_ax = old_ax.to_period(freq=freq)

        setattr(new_obj, axis_name, new_ax)
        return new_obj
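
    # NOTE (editor's illustrative sketch, not pandas source): the docstring
    # above demonstrates Index.to_period; the frame-level call applies the
    # same conversion to the row index:
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"x": [1, 2]},
    #                       index=pd.to_datetime(["2001-03-31", "2002-05-31"]))
    #     df.to_period("M").index
    #     # PeriodIndex(['2001-03', '2002-05'], dtype='period[M]')
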
    def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
        """
        Whether each element in the DataFrame is contained in values.

        Parameters
        ----------
        values : iterable, Series, DataFrame or dict
            The result will only be true at a location if all the
            labels match. If `values` is a Series, that's the index. If
            `values` is a dict, the keys must be the column names,
            which must match. If `values` is a DataFrame,
            then both the index and column labels must match.

        Returns
        -------
        DataFrame
            DataFrame of booleans showing whether each element in the DataFrame
            is contained in values.

        See Also
        --------
        DataFrame.eq: Equality test for DataFrame.
        Series.isin: Equivalent method on Series.
        Series.str.contains: Test if pattern or regex is contained within a
            string of a Series or Index.

        Examples
        --------
        >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
        ...                   index=['falcon', 'dog'])
        >>> df
                num_legs  num_wings
        falcon         2          2
        dog            4          0

        When ``values`` is a list, check whether every value in the DataFrame
        is present in the list (which animals have 0 or 2 legs or wings).

        >>> df.isin([0, 2])
                num_legs  num_wings
        falcon      True       True
        dog        False       True

        To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:

        >>> ~df.isin([0, 2])
                num_legs  num_wings
        falcon     False      False
        dog         True      False

        When ``values`` is a dict, we can pass values to check for each
        column separately:

        >>> df.isin({'num_wings': [0, 3]})
                num_legs  num_wings
        falcon     False      False
        dog        False       True

        When ``values`` is a Series or DataFrame the index and column must
        match. Note that 'falcon' does not match based on the number of legs
        in other.

        >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]},
        ...                      index=['spider', 'falcon'])
        >>> df.isin(other)
                num_legs  num_wings
        falcon     False       True
        dog        False      False
        """
        if isinstance(values, dict):
            from pandas.core.reshape.concat import concat

            values = collections.defaultdict(list, values)
            result = concat(
                (
                    self.iloc[:, [i]].isin(values[col])
                    for i, col in enumerate(self.columns)
                ),
                axis=1,
            )
        elif isinstance(values, Series):
            if not values.index.is_unique:
                raise ValueError("cannot compute isin with a duplicate axis.")
            result = self.eq(values.reindex_like(self), axis="index")
        elif isinstance(values, DataFrame):
            if not (values.columns.is_unique and values.index.is_unique):
                raise ValueError("cannot compute isin with a duplicate axis.")
            result = self.eq(values.reindex_like(self))
        else:
            if not is_list_like(values):
                raise TypeError(
                    "only list-like or dict-like objects are allowed "
                    "to be passed to DataFrame.isin(), "
                    f"you passed a '{type(values).__name__}'"
                )
            # error: Argument 2 to "isin" has incompatible type "Union[Sequence[Any],
            # Mapping[Any, Any]]"; expected "Union[Union[ExtensionArray,
            # ndarray[Any, Any]], Index, Series]"
            result = self._constructor(
                algorithms.isin(
                    self.values.ravel(), values  # type: ignore[arg-type]
                ).reshape(self.shape),
                self.index,
                self.columns,
                copy=False,
            )
        return result.__finalize__(self, method="isin")
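
    # NOTE (editor's illustrative sketch, not pandas source): the
    # defaultdict(list) in the dict branch means columns absent from `values`
    # compare against an empty list and come back all-False:
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
    #     df.isin({"b": [3]})
    #     #        a      b
    #     # 0  False   True
    #     # 1  False  False
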
    # ----------------------------------------------------------------------
    # Add index and columns
    _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"]
    _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
        **NDFrame._AXIS_TO_AXIS_NUMBER,
        1: 1,
        "columns": 1,
    }
    _AXIS_LEN = len(_AXIS_ORDERS)
    _info_axis_number: Literal[1] = 1
    _info_axis_name: Literal["columns"] = "columns"

    index = properties.AxisProperty(
        axis=1, doc="The index (row labels) of the DataFrame."
    )
    columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.")

    # ----------------------------------------------------------------------
    # Add plotting methods to DataFrame
    plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
    hist = pandas.plotting.hist_frame
    boxplot = pandas.plotting.boxplot_frame
    sparse = CachedAccessor("sparse", SparseFrameAccessor)

    # ----------------------------------------------------------------------
    # Internal Interface Methods
    def _to_dict_of_blocks(self, copy: bool = True):
        """
        Return a dict of dtype -> Constructor Types, each of which is a
        homogeneous dtype.

        Internal ONLY - only works for BlockManager
        """
        mgr = self._mgr
        # convert to BlockManager if needed -> this way support ArrayManager as well
        mgr = mgr_to_mgr(mgr, "block")
        mgr = cast(BlockManager, mgr)
        return {
            k: self._constructor(v).__finalize__(self)
            for k, v in mgr.to_dict(copy=copy).items()
        }
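
    # NOTE (editor's illustrative sketch, not pandas source): rough shape of
    # the mapping returned above -- one homogeneous frame per block dtype,
    # keyed by str(dtype). Internal API; not stable across versions:
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"i": [1, 2], "f": [1.0, 2.0]})
    #     df._to_dict_of_blocks()
    #     # e.g. {'int64': <DataFrame with column 'i'>,
    #     #       'float64': <DataFrame with column 'f'>}
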
    @property
    def values(self) -> np.ndarray:
        """
        Return a Numpy representation of the DataFrame.

        .. warning::

           We recommend using :meth:`DataFrame.to_numpy` instead.

        Only the values in the DataFrame will be returned, the axes labels
        will be removed.

        Returns
        -------
        numpy.ndarray
            The values of the DataFrame.

        See Also
        --------
        DataFrame.to_numpy : Recommended alternative to this method.
        DataFrame.index : Retrieve the index labels.
        DataFrame.columns : Retrieving the column names.

        Notes
        -----
        The dtype will be a lower-common-denominator dtype (implicit
        upcasting); that is to say if the dtypes (even of numeric types)
        are mixed, the one that accommodates all will be chosen. Use this
        with care if you are not dealing with the blocks.

        e.g. If the dtypes are float16 and float32, dtype will be upcast to
        float32. If dtypes are int32 and uint8, dtype will be upcast to
        int32. By :func:`numpy.find_common_type` convention, mixing int64
        and uint64 will result in a float64 dtype.

        Examples
        --------
        A DataFrame where all columns are the same type (e.g., int64) results
        in an array of the same type.

        >>> df = pd.DataFrame({'age':    [ 3,  29],
        ...                    'height': [94, 170],
        ...                    'weight': [31, 115]})
        >>> df
           age  height  weight
        0    3      94      31
        1   29     170     115
        >>> df.dtypes
        age       int64
        height    int64
        weight    int64
        dtype: object
        >>> df.values
        array([[  3,  94,  31],
               [ 29, 170, 115]])

        A DataFrame with mixed type columns (e.g., str/object, int64, float32)
        results in an ndarray of the broadest type that accommodates these
        mixed types (e.g., object).

        >>> df2 = pd.DataFrame([('parrot',   24.0, 'second'),
        ...                     ('lion',     80.5, 1),
        ...                     ('monkey', np.nan, None)],
        ...                    columns=('name', 'max_speed', 'rank'))
        >>> df2.dtypes
        name          object
        max_speed    float64
        rank          object
        dtype: object
        >>> df2.values
        array([['parrot', 24.0, 'second'],
               ['lion', 80.5, 1],
               ['monkey', nan, None]], dtype=object)
        """
        return self._mgr.as_array()
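
    # NOTE (editor's illustrative sketch, not pandas source): the int64/uint64
    # -> float64 upcast mentioned in the Notes section, shown concretely:
    #
    #     import numpy as np
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"i": np.array([1], dtype="int64"),
    #                        "u": np.array([1], dtype="uint64")})
    #     df.values.dtype
    #     # dtype('float64')
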
    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> DataFrame:
        ...

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> None:
        ...

    @overload
    def ffill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: bool = ...,
        limit: None | int = ...,
        downcast: dict | None = ...,
    ) -> DataFrame | None:
        ...

    def ffill(
        self,
        *,
        axis: None | Axis = None,
        inplace: bool = False,
        limit: None | int = None,
        downcast: dict | None = None,
    ) -> DataFrame | None:
        return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[False] = ...,
        limit: None | int = ...,
        downcast=...,
    ) -> DataFrame:
        ...

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: Literal[True],
        limit: None | int = ...,
        downcast=...,
    ) -> None:
        ...

    @overload
    def bfill(
        self,
        *,
        axis: None | Axis = ...,
        inplace: bool = ...,
        limit: None | int = ...,
        downcast=...,
    ) -> DataFrame | None:
        ...

    def bfill(
        self,
        *,
        axis: None | Axis = None,
        inplace: bool = False,
        limit: None | int = None,
        downcast=None,
    ) -> DataFrame | None:
        return super().bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
    def clip(
        self: DataFrame,
        lower: float | None = None,
        upper: float | None = None,
        *,
        axis: Axis | None = None,
        inplace: bool = False,
        **kwargs,
    ) -> DataFrame | None:
        return super().clip(lower, upper, axis=axis, inplace=inplace, **kwargs)
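
    # NOTE (editor's illustrative sketch, not pandas source): clip delegates
    # to NDFrame.clip; bounds may be scalars or per-axis sequences:
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"a": [-1, 5, 10]})
    #     df.clip(lower=0, upper=6)   # a -> [0, 5, 6]
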
    def interpolate(
        self: DataFrame,
        method: str = "linear",
        *,
        axis: Axis = 0,
        limit: int | None = None,
        inplace: bool = False,
        limit_direction: str | None = None,
        limit_area: str | None = None,
        downcast: str | None = None,
        **kwargs,
    ) -> DataFrame | None:
        return super().interpolate(
            method=method,
            axis=axis,
            limit=limit,
            inplace=inplace,
            limit_direction=limit_direction,
            limit_area=limit_area,
            downcast=downcast,
            **kwargs,
        )
    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame:
        ...

    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> None:
        ...

    @overload
    def where(
        self,
        cond,
        other=...,
        *,
        inplace: bool = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame | None:
        ...

    def where(
        self,
        cond,
        other=lib.no_default,
        *,
        inplace: bool = False,
        axis: Axis | None = None,
        level: Level = None,
    ) -> DataFrame | None:
        return super().where(
            cond,
            other,
            inplace=inplace,
            axis=axis,
            level=level,
        )
    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[False] = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame:
        ...

    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: Literal[True],
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> None:
        ...

    @overload
    def mask(
        self,
        cond,
        other=...,
        *,
        inplace: bool = ...,
        axis: Axis | None = ...,
        level: Level = ...,
    ) -> DataFrame | None:
        ...

    def mask(
        self,
        cond,
        other=lib.no_default,
        *,
        inplace: bool = False,
        axis: Axis | None = None,
        level: Level = None,
    ) -> DataFrame | None:
        return super().mask(
            cond,
            other,
            inplace=inplace,
            axis=axis,
            level=level,
        )
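
    # NOTE (editor's illustrative sketch, not pandas source): mask is the
    # complement of where -- it replaces entries where the condition is True:
    #
    #     import pandas as pd
    #
    #     df = pd.DataFrame({"a": [1, 2, 3]})
    #     df.where(df > 1, 0)   # keep entries where cond is True    -> [0, 2, 3]
    #     df.mask(df > 1, 0)    # replace entries where cond is True -> [1, 0, 0]
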

DataFrame._add_numeric_operations()

ops.add_flex_arithmetic_methods(DataFrame)


def _from_nested_dict(data) -> collections.defaultdict:
    new_data: collections.defaultdict = collections.defaultdict(dict)
    for index, s in data.items():
        for col, v in s.items():
            new_data[col][index] = v
    return new_data
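
# NOTE (editor's illustrative sketch, not pandas source): _from_nested_dict
# transposes a {row -> {col -> value}} mapping into {col -> {row -> value}}:
#
#     _from_nested_dict({"r1": {"a": 1, "b": 2}, "r2": {"a": 3}})
#     # defaultdict(<class 'dict'>, {'a': {'r1': 1, 'r2': 3}, 'b': {'r1': 2}})
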
def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
    # reindex if necessary
    if value.index.equals(index) or not len(index):
        return value._values.copy()

    # GH#4107
    try:
        reindexed_value = value.reindex(index)._values
    except ValueError as err:
        # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
        if not value.index.is_unique:
            # duplicate axis
            raise err
        raise TypeError(
            "incompatible index of inserted column with frame index"
        ) from err
    return reindexed_value
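
# NOTE (editor's illustrative sketch, not pandas source): _reindex_for_setitem
# backs column assignment with a Series, aligning on the frame's index rather
# than by position:
#
#     import pandas as pd
#
#     df = pd.DataFrame({"a": [1, 2]}, index=["x", "y"])
#     df["b"] = pd.Series([10, 20], index=["y", "x"])  # label-aligned
#     # df["b"]: x -> 20, y -> 10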