- """
- DataFrame
- ---------
- An efficient 2D container for potentially mixed-type time series or other
- labeled data series.
- Similar to its R counterpart, data.frame, except providing automatic data
- alignment and a host of useful data manipulation methods having to do with the
- labeling information
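
For example, arithmetic between two DataFrames aligns on both row and column
labels, filling non-overlapping labels with NaN (a small illustrative
sketch):

>>> import pandas as pd
>>> a = pd.DataFrame({"x": [1, 2]}, index=["r1", "r2"])
>>> b = pd.DataFrame({"x": [10, 20]}, index=["r2", "r3"])
>>> a + b
       x
r1   NaN
r2  12.0
r3   NaN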
- """
from __future__ import annotations

import collections
from collections import abc
import datetime
import functools
from io import StringIO
import itertools
import sys
from textwrap import dedent
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Iterable,
    Iterator,
    Literal,
    Mapping,
    Sequence,
    cast,
    overload,
)
import warnings

import numpy as np
from numpy import ma

from pandas._config import (
    get_option,
    using_copy_on_write,
)

from pandas._libs import (
    algos as libalgos,
    lib,
    properties,
)
from pandas._libs.hashtable import duplicated
from pandas._libs.lib import (
    NoDefault,
    is_range_indexer,
    no_default,
)
from pandas._typing import (
    AggFuncType,
    AlignJoin,
    AnyAll,
    AnyArrayLike,
    ArrayLike,
    Axes,
    Axis,
    AxisInt,
    ColspaceArgType,
    CompressionOptions,
    CorrelationMethod,
    DropKeep,
    Dtype,
    DtypeObj,
    FilePath,
    FillnaOptions,
    FloatFormatType,
    FormattersType,
    Frequency,
    IgnoreRaise,
    IndexKeyFunc,
    IndexLabel,
    Level,
    MergeHow,
    NaPosition,
    PythonFuncType,
    QuantileInterpolation,
    ReadBuffer,
    Renamer,
    Scalar,
    SortKind,
    StorageOptions,
    Suffixes,
    TimedeltaConvertibleTypes,
    TimestampConvertibleTypes,
    ValueKeyFunc,
    WriteBuffer,
    npt,
)
from pandas.compat import PYPY
from pandas.compat._optional import import_optional_dependency
from pandas.compat.numpy import (
    function as nv,
    np_percentile_argname,
)
from pandas.errors import (
    ChainedAssignmentError,
    InvalidIndexError,
    _chained_assignment_msg,
)
from pandas.util._decorators import (
    Appender,
    Substitution,
    doc,
)
from pandas.util._exceptions import find_stack_level
from pandas.util._validators import (
    validate_ascending,
    validate_bool_kwarg,
    validate_percentile,
)

from pandas.core.dtypes.cast import (
    LossySetitemError,
    can_hold_element,
    construct_1d_arraylike_from_scalar,
    construct_2d_arraylike_from_scalar,
    find_common_type,
    infer_dtype_from_scalar,
    invalidate_string_dtypes,
    maybe_box_native,
    maybe_downcast_to_dtype,
)
from pandas.core.dtypes.common import (
    infer_dtype_from_object,
    is_1d_only_ea_dtype,
    is_bool_dtype,
    is_dataclass,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float,
    is_float_dtype,
    is_hashable,
    is_integer,
    is_integer_dtype,
    is_iterator,
    is_list_like,
    is_scalar,
    is_sequence,
    needs_i8_conversion,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.missing import (
    isna,
    notna,
)

from pandas.core import (
    algorithms,
    common as com,
    nanops,
    ops,
)
from pandas.core.accessor import CachedAccessor
from pandas.core.apply import (
    reconstruct_func,
    relabel_result,
)
from pandas.core.array_algos.take import take_2d_multi
from pandas.core.arraylike import OpsMixin
from pandas.core.arrays import (
    DatetimeArray,
    ExtensionArray,
    PeriodArray,
    TimedeltaArray,
)
from pandas.core.arrays.arrow import ArrowDtype
from pandas.core.arrays.sparse import SparseFrameAccessor
from pandas.core.construction import (
    ensure_wrapped_if_datetimelike,
    extract_array,
    sanitize_array,
    sanitize_masked_array,
)
from pandas.core.generic import NDFrame
from pandas.core.indexers import check_key_length
from pandas.core.indexes.api import (
    DatetimeIndex,
    Index,
    PeriodIndex,
    default_index,
    ensure_index,
    ensure_index_from_sequences,
)
from pandas.core.indexes.multi import (
    MultiIndex,
    maybe_droplevels,
)
from pandas.core.indexing import (
    check_bool_indexer,
    check_dict_or_set_indexers,
)
from pandas.core.internals import (
    ArrayManager,
    BlockManager,
)
from pandas.core.internals.construction import (
    arrays_to_mgr,
    dataclasses_to_dicts,
    dict_to_mgr,
    mgr_to_mgr,
    ndarray_to_mgr,
    nested_data_to_arrays,
    rec_array_to_mgr,
    reorder_arrays,
    to_arrays,
    treat_as_nested,
)
from pandas.core.methods import selectn
from pandas.core.reshape.melt import melt
from pandas.core.series import Series
from pandas.core.shared_docs import _shared_docs
from pandas.core.sorting import (
    get_group_index,
    lexsort_indexer,
    nargsort,
)

from pandas.io.common import get_handle
from pandas.io.formats import (
    console,
    format as fmt,
)
from pandas.io.formats.info import (
    INFO_DOCSTRING,
    DataFrameInfo,
    frame_sub_kwargs,
)
import pandas.plotting

if TYPE_CHECKING:
    from pandas.core.groupby.generic import DataFrameGroupBy
    from pandas.core.interchange.dataframe_protocol import DataFrame as DataFrameXchg
    from pandas.core.internals import SingleDataManager
    from pandas.core.resample import Resampler

    from pandas.io.formats.style import Styler

# ---------------------------------------------------------------------
# Docstring templates

_shared_doc_kwargs = {
    "axes": "index, columns",
    "klass": "DataFrame",
    "axes_single_arg": "{0 or 'index', 1 or 'columns'}",
    "axis": """axis : {0 or 'index', 1 or 'columns'}, default 0
        If 0 or 'index': apply function to each column.
        If 1 or 'columns': apply function to each row.""",
    "inplace": """
    inplace : bool, default False
        Whether to modify the DataFrame rather than creating a new one.""",
    "optional_by": """
by : str or list of str
    Name or list of names to sort by.

    - if `axis` is 0 or `'index'` then `by` may contain index
      levels and/or column labels.
    - if `axis` is 1 or `'columns'` then `by` may contain column
      levels and/or index labels.""",
    "optional_reindex": """
labels : array-like, optional
    New labels / index to conform the axis specified by 'axis' to.
index : array-like, optional
    New labels for the index. Preferably an Index object to avoid
    duplicating data.
columns : array-like, optional
    New labels for the columns. Preferably an Index object to avoid
    duplicating data.
axis : int or str, optional
    Axis to target. Can be either the axis name ('index', 'columns')
    or number (0, 1).""",
    "replace_iloc": """
    This differs from updating with ``.loc`` or ``.iloc``, which require
    you to specify a location to update with some value.""",
}
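
# A minimal illustration of the ``axis`` fragment above (a hedged sketch with
# made-up data; nothing here is part of the pandas API surface):
#
#   >>> df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
#   >>> df.sum(axis=0)   # 0 or 'index': reduce each column
#   a    3
#   b    7
#   dtype: int64
#   >>> df.sum(axis=1)   # 1 or 'columns': reduce each row
#   0    4
#   1    6
#   dtype: int64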

_numeric_only_doc = """numeric_only : bool, default False
    Include only float, int, boolean data.
"""

_merge_doc = """
Merge DataFrame or named Series objects with a database-style join.

A named Series object is treated as a DataFrame with a single named column.

The join is done on columns or indexes. If joining columns on
columns, the DataFrame indexes *will be ignored*. Otherwise if joining indexes
on indexes or indexes on a column or columns, the index will be passed on.
When performing a cross merge, no column specifications to merge on are
allowed.

.. warning::

    If both key columns contain rows where the key is a null value, those
    rows will be matched against each other. This is different from usual SQL
    join behaviour and can lead to unexpected results; see the last example
    below.

Parameters
----------%s
right : DataFrame or named Series
    Object to merge with.
how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
    Type of merge to be performed.

    * left: use only keys from left frame, similar to a SQL left outer join;
      preserve key order.
    * right: use only keys from right frame, similar to a SQL right outer join;
      preserve key order.
    * outer: use union of keys from both frames, similar to a SQL full outer
      join; sort keys lexicographically.
    * inner: use intersection of keys from both frames, similar to a SQL inner
      join; preserve the order of the left keys.
    * cross: creates the cartesian product from both frames, preserves the order
      of the left keys.

      .. versionadded:: 1.2.0

on : label or list
    Column or index level names to join on. These must be found in both
    DataFrames. If `on` is None and not merging on indexes then this defaults
    to the intersection of the columns in both DataFrames.
left_on : label or list, or array-like
    Column or index level names to join on in the left DataFrame. Can also
    be an array or list of arrays of the length of the left DataFrame.
    These arrays are treated as if they are columns.
right_on : label or list, or array-like
    Column or index level names to join on in the right DataFrame. Can also
    be an array or list of arrays of the length of the right DataFrame.
    These arrays are treated as if they are columns.
left_index : bool, default False
    Use the index from the left DataFrame as the join key(s). If it is a
    MultiIndex, the number of keys in the other DataFrame (either the index
    or a number of columns) must match the number of levels.
right_index : bool, default False
    Use the index from the right DataFrame as the join key. Same caveats as
    left_index.
sort : bool, default False
    Sort the join keys lexicographically in the result DataFrame. If False,
    the order of the join keys depends on the join type (how keyword).
suffixes : list-like, default is ("_x", "_y")
    A length-2 sequence where each element is optionally a string
    indicating the suffix to add to overlapping column names in
    `left` and `right` respectively. Pass a value of `None` instead
    of a string to indicate that the column name from `left` or
    `right` should be left as-is, with no suffix. At least one of the
    values must not be None.
copy : bool, default True
    If False, avoid copy if possible.
indicator : bool or str, default False
    If True, adds a column to the output DataFrame called "_merge" with
    information on the source of each row. The column can be given a different
    name by providing a string argument. The column will have a Categorical
    type with the value of "left_only" for observations whose merge key only
    appears in the left DataFrame, "right_only" for observations
    whose merge key only appears in the right DataFrame, and "both"
    if the observation's merge key is found in both DataFrames.
validate : str, optional
    If specified, checks if merge is of specified type.

    * "one_to_one" or "1:1": check if merge keys are unique in both
      left and right datasets.
    * "one_to_many" or "1:m": check if merge keys are unique in left
      dataset.
    * "many_to_one" or "m:1": check if merge keys are unique in right
      dataset.
    * "many_to_many" or "m:m": allowed, but does not result in checks.

Returns
-------
DataFrame
    A DataFrame of the two merged objects.

See Also
--------
merge_ordered : Merge with optional filling/interpolation.
merge_asof : Merge on nearest keys.
DataFrame.join : Similar method using indices.
- Notes
- -----
- Support for specifying index levels as the `on`, `left_on`, and
- `right_on` parameters was added in version 0.23.0.
- Support for merging named Series objects was added in version 0.24.0.
- Examples
- --------
- >>> df1 = pd.DataFrame({'lkey': ['foo', 'bar', 'baz', 'foo'],
- ... 'value': [1, 2, 3, 5]})
- >>> df2 = pd.DataFrame({'rkey': ['foo', 'bar', 'baz', 'foo'],
- ... 'value': [5, 6, 7, 8]})
- >>> df1
- lkey value
- 0 foo 1
- 1 bar 2
- 2 baz 3
- 3 foo 5
- >>> df2
- rkey value
- 0 foo 5
- 1 bar 6
- 2 baz 7
- 3 foo 8
- Merge df1 and df2 on the lkey and rkey columns. The value columns have
- the default suffixes, _x and _y, appended.
- >>> df1.merge(df2, left_on='lkey', right_on='rkey')
- lkey value_x rkey value_y
- 0 foo 1 foo 5
- 1 foo 1 foo 8
- 2 foo 5 foo 5
- 3 foo 5 foo 8
- 4 bar 2 bar 6
- 5 baz 3 baz 7
- Merge DataFrames df1 and df2 with specified left and right suffixes
- appended to any overlapping columns.
- >>> df1.merge(df2, left_on='lkey', right_on='rkey',
- ... suffixes=('_left', '_right'))
- lkey value_left rkey value_right
- 0 foo 1 foo 5
- 1 foo 1 foo 8
- 2 foo 5 foo 5
- 3 foo 5 foo 8
- 4 bar 2 bar 6
- 5 baz 3 baz 7
- Merge DataFrames df1 and df2, but raise an exception if the DataFrames have
- any overlapping columns.
- >>> df1.merge(df2, left_on='lkey', right_on='rkey', suffixes=(False, False))
- Traceback (most recent call last):
- ...
- ValueError: columns overlap but no suffix specified:
- Index(['value'], dtype='object')
- >>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
- >>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
- >>> df1
- a b
- 0 foo 1
- 1 bar 2
- >>> df2
- a c
- 0 foo 3
- 1 baz 4
- >>> df1.merge(df2, how='inner', on='a')
- a b c
- 0 foo 1 3
- >>> df1.merge(df2, how='left', on='a')
- a b c
- 0 foo 1 3.0
- 1 bar 2 NaN
- >>> df1 = pd.DataFrame({'left': ['foo', 'bar']})
- >>> df2 = pd.DataFrame({'right': [7, 8]})
- >>> df1
- left
- 0 foo
- 1 bar
- >>> df2
- right
- 0 7
- 1 8
- >>> df1.merge(df2, how='cross')
- left right
- 0 foo 7
- 1 foo 8
- 2 bar 7
- 3 bar 8
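- The ``indicator`` argument adds a ``_merge`` column recording each row's
- source; a small sketch with two illustrative frames:
- >>> df1 = pd.DataFrame({'a': ['foo', 'bar'], 'b': [1, 2]})
- >>> df2 = pd.DataFrame({'a': ['foo', 'baz'], 'c': [3, 4]})
- >>> df1.merge(df2, how='outer', on='a', indicator=True)
- a b c _merge
- 0 bar 2.0 NaN left_only
- 1 baz NaN 4.0 right_only
- 2 foo 1.0 3.0 both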
- """
- # -----------------------------------------------------------------------
- # DataFrame class
- class DataFrame(NDFrame, OpsMixin):
- """
- Two-dimensional, size-mutable, potentially heterogeneous tabular data.
- Data structure also contains labeled axes (rows and columns).
- Arithmetic operations align on both row and column labels. Can be
- thought of as a dict-like container for Series objects. The primary
- pandas data structure.
- Parameters
- ----------
- data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame
- Dict can contain Series, arrays, constants, dataclass or list-like objects. If
- data is a dict, column order follows insertion-order. If a dict contains Series
- which have an index defined, they are aligned by their index. This alignment
- also occurs if data is a Series or a DataFrame itself. Alignment is done on
- Series/DataFrame inputs.
- If data is a list of dicts, column order follows insertion-order.
- index : Index or array-like
- Index to use for resulting frame. Will default to RangeIndex if
- no indexing information part of input data and no index provided.
- columns : Index or array-like
- Column labels to use for resulting frame when data does not have them,
- defaulting to RangeIndex(0, 1, 2, ..., n). If data contains column labels,
- will perform column selection instead.
- dtype : dtype, default None
- Data type to force. Only a single dtype is allowed. If None, infer.
- copy : bool or None, default None
- Copy data from inputs.
- For dict data, the default of None behaves like ``copy=True``. For DataFrame
- or 2d ndarray input, the default of None behaves like ``copy=False``.
- If data is a dict containing one or more Series (possibly of different dtypes),
- ``copy=False`` will ensure that these inputs are not copied.
- .. versionchanged:: 1.3.0
- See Also
- --------
- DataFrame.from_records : Constructor from tuples, also record arrays.
- DataFrame.from_dict : From dicts of Series, arrays, or dicts.
- read_csv : Read a comma-separated values (csv) file into DataFrame.
- read_table : Read general delimited file into DataFrame.
- read_clipboard : Read text from clipboard into DataFrame.
- Notes
- -----
- Please reference the :ref:`User Guide <basics.dataframe>` for more information.
- Examples
- --------
- Constructing DataFrame from a dictionary.
- >>> d = {'col1': [1, 2], 'col2': [3, 4]}
- >>> df = pd.DataFrame(data=d)
- >>> df
- col1 col2
- 0 1 3
- 1 2 4
- Notice that the inferred dtype is int64.
- >>> df.dtypes
- col1 int64
- col2 int64
- dtype: object
- To enforce a single dtype:
- >>> df = pd.DataFrame(data=d, dtype=np.int8)
- >>> df.dtypes
- col1 int8
- col2 int8
- dtype: object
- Constructing DataFrame from a dictionary including Series:
- >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
- >>> pd.DataFrame(data=d, index=[0, 1, 2, 3])
- col1 col2
- 0 0 NaN
- 1 1 NaN
- 2 2 2.0
- 3 3 3.0
- Constructing DataFrame from numpy ndarray:
- >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
- ... columns=['a', 'b', 'c'])
- >>> df2
- a b c
- 0 1 2 3
- 1 4 5 6
- 2 7 8 9
- Constructing DataFrame from a numpy ndarray that has labeled columns:
- >>> data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
- ... dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
- >>> df3 = pd.DataFrame(data, columns=['c', 'a'])
- ...
- >>> df3
- c a
- 0 3 1
- 1 6 4
- 2 9 7
- Constructing DataFrame from dataclass:
- >>> from dataclasses import make_dataclass
- >>> Point = make_dataclass("Point", [("x", int), ("y", int)])
- >>> pd.DataFrame([Point(0, 0), Point(0, 3), Point(2, 3)])
- x y
- 0 0 0
- 1 0 3
- 2 2 3
- Constructing DataFrame from Series/DataFrame:
- >>> ser = pd.Series([1, 2, 3], index=["a", "b", "c"])
- >>> df = pd.DataFrame(data=ser, index=["a", "c"])
- >>> df
- 0
- a 1
- c 3
- >>> df1 = pd.DataFrame([1, 2, 3], index=["a", "b", "c"], columns=["x"])
- >>> df2 = pd.DataFrame(data=df1, index=["a", "c"])
- >>> df2
- x
- a 1
- c 3
- """
- _internal_names_set = {"columns", "index"} | NDFrame._internal_names_set
- _typ = "dataframe"
- _HANDLED_TYPES = (Series, Index, ExtensionArray, np.ndarray)
- _accessors: set[str] = {"sparse"}
- _hidden_attrs: frozenset[str] = NDFrame._hidden_attrs | frozenset([])
- _mgr: BlockManager | ArrayManager
- @property
- def _constructor(self) -> Callable[..., DataFrame]:
- return DataFrame
- _constructor_sliced: Callable[..., Series] = Series
- # ----------------------------------------------------------------------
- # Constructors
- def __init__(
- self,
- data=None,
- index: Axes | None = None,
- columns: Axes | None = None,
- dtype: Dtype | None = None,
- copy: bool | None = None,
- ) -> None:
- if dtype is not None:
- dtype = self._validate_dtype(dtype)
- if isinstance(data, DataFrame):
- data = data._mgr
- if not copy:
- # if not copying data, ensure to still return a shallow copy
- # to avoid the result sharing the same Manager
- data = data.copy(deep=False)
- if isinstance(data, (BlockManager, ArrayManager)):
- if using_copy_on_write():
- data = data.copy(deep=False)
- # first check if a Manager is passed without any other arguments
- # -> use fastpath (without checking Manager type)
- if index is None and columns is None and dtype is None and not copy:
- # GH#33357 fastpath
- NDFrame.__init__(self, data)
- return
- manager = get_option("mode.data_manager")
- # GH47215
- if index is not None and isinstance(index, set):
- raise ValueError("index cannot be a set")
- if columns is not None and isinstance(columns, set):
- raise ValueError("columns cannot be a set")
- if copy is None:
- if isinstance(data, dict):
- # retain pre-GH#38939 default behavior
- copy = True
- elif (
- manager == "array"
- and isinstance(data, (np.ndarray, ExtensionArray))
- and data.ndim == 2
- ):
- # INFO(ArrayManager) by default copy the 2D input array to get
- # contiguous 1D arrays
- copy = True
- elif using_copy_on_write() and not isinstance(
- data, (Index, DataFrame, Series)
- ):
- copy = True
- else:
- copy = False
- if data is None:
- index = index if index is not None else default_index(0)
- columns = columns if columns is not None else default_index(0)
- dtype = dtype if dtype is not None else pandas_dtype(object)
- data = []
- if isinstance(data, (BlockManager, ArrayManager)):
- mgr = self._init_mgr(
- data, axes={"index": index, "columns": columns}, dtype=dtype, copy=copy
- )
- elif isinstance(data, dict):
- # GH#38939 de facto copy defaults to False only in non-dict cases
- mgr = dict_to_mgr(data, index, columns, dtype=dtype, copy=copy, typ=manager)
- elif isinstance(data, ma.MaskedArray):
- from numpy.ma import mrecords
- # masked recarray
- if isinstance(data, mrecords.MaskedRecords):
- raise TypeError(
- "MaskedRecords are not supported. Pass "
- "{name: data[name] for name in data.dtype.names} "
- "instead"
- )
- # a masked array
- data = sanitize_masked_array(data)
- mgr = ndarray_to_mgr(
- data,
- index,
- columns,
- dtype=dtype,
- copy=copy,
- typ=manager,
- )
- elif isinstance(data, (np.ndarray, Series, Index, ExtensionArray)):
- if data.dtype.names:
- # i.e. numpy structured array
- data = cast(np.ndarray, data)
- mgr = rec_array_to_mgr(
- data,
- index,
- columns,
- dtype,
- copy,
- typ=manager,
- )
- elif getattr(data, "name", None) is not None:
- # i.e. Series/Index with non-None name
- _copy = copy if using_copy_on_write() else True
- mgr = dict_to_mgr(
- # error: Item "ndarray" of "Union[ndarray, Series, Index]" has no
- # attribute "name"
- {data.name: data}, # type: ignore[union-attr]
- index,
- columns,
- dtype=dtype,
- typ=manager,
- copy=_copy,
- )
- else:
- mgr = ndarray_to_mgr(
- data,
- index,
- columns,
- dtype=dtype,
- copy=copy,
- typ=manager,
- )
- # For data is list-like, or Iterable (will consume into list)
- elif is_list_like(data):
- if not isinstance(data, abc.Sequence):
- if hasattr(data, "__array__"):
- # GH#44616 big perf improvement for e.g. pytorch tensor
- data = np.asarray(data)
- else:
- data = list(data)
- if len(data) > 0:
- if is_dataclass(data[0]):
- data = dataclasses_to_dicts(data)
- if not isinstance(data, np.ndarray) and treat_as_nested(data):
- # exclude ndarray as we may have cast it a few lines above
- if columns is not None:
- columns = ensure_index(columns)
- arrays, columns, index = nested_data_to_arrays(
- # error: Argument 3 to "nested_data_to_arrays" has incompatible
- # type "Optional[Collection[Any]]"; expected "Optional[Index]"
- data,
- columns,
- index, # type: ignore[arg-type]
- dtype,
- )
- mgr = arrays_to_mgr(
- arrays,
- columns,
- index,
- dtype=dtype,
- typ=manager,
- )
- else:
- mgr = ndarray_to_mgr(
- data,
- index,
- columns,
- dtype=dtype,
- copy=copy,
- typ=manager,
- )
- else:
- mgr = dict_to_mgr(
- {},
- index,
- columns if columns is not None else default_index(0),
- dtype=dtype,
- typ=manager,
- )
- # For data is scalar
- else:
- if index is None or columns is None:
- raise ValueError("DataFrame constructor not properly called!")
- index = ensure_index(index)
- columns = ensure_index(columns)
- if not dtype:
- dtype, _ = infer_dtype_from_scalar(data, pandas_dtype=True)
- # For data is a scalar extension dtype
- if isinstance(dtype, ExtensionDtype):
- # TODO(EA2D): special case not needed with 2D EAs
- values = [
- construct_1d_arraylike_from_scalar(data, len(index), dtype)
- for _ in range(len(columns))
- ]
- mgr = arrays_to_mgr(values, columns, index, dtype=None, typ=manager)
- else:
- arr2d = construct_2d_arraylike_from_scalar(
- data,
- len(index),
- len(columns),
- dtype,
- copy,
- )
- mgr = ndarray_to_mgr(
- arr2d,
- index,
- columns,
- dtype=arr2d.dtype,
- copy=False,
- typ=manager,
- )
- # ensure correct Manager type according to settings
- mgr = mgr_to_mgr(mgr, typ=manager)
- NDFrame.__init__(self, mgr)
- # ----------------------------------------------------------------------
- def __dataframe__(
- self, nan_as_null: bool = False, allow_copy: bool = True
- ) -> DataFrameXchg:
- """
- Return the dataframe interchange object implementing the interchange protocol.
- Parameters
- ----------
- nan_as_null : bool, default False
- Whether to tell the DataFrame to overwrite null values in the data
- with ``NaN`` (or ``NaT``).
- allow_copy : bool, default True
- Whether to allow memory copying when exporting. If set to False
- it would cause non-zero-copy exports to fail.
- Returns
- -------
- DataFrame interchange object
- The object which the consuming library can use to ingest the dataframe.
- Notes
- -----
- Details on the interchange protocol:
- https://data-apis.org/dataframe-protocol/latest/index.html
- `nan_as_null` currently has no effect; once support for nullable extension
- dtypes is added, this value should be propagated to columns.
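- Examples
- --------
- A minimal sketch of a round trip through the protocol, using
- :func:`pandas.api.interchange.from_dataframe` as an illustrative consumer:
- >>> df = pd.DataFrame({'A': [1, 2], 'B': [3, 4]})
- >>> interchange_object = df.__dataframe__()
- >>> pd.api.interchange.from_dataframe(interchange_object)
- A B
- 0 1 3
- 1 2 4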
- """
- from pandas.core.interchange.dataframe import PandasDataFrameXchg
- return PandasDataFrameXchg(self, nan_as_null, allow_copy)
- # ----------------------------------------------------------------------
- @property
- def axes(self) -> list[Index]:
- """
- Return a list representing the axes of the DataFrame.
- It has the row axis labels and column axis labels as the only members.
- They are returned in that order.
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df.axes
- [RangeIndex(start=0, stop=2, step=1), Index(['col1', 'col2'],
- dtype='object')]
- """
- return [self.index, self.columns]
- @property
- def shape(self) -> tuple[int, int]:
- """
- Return a tuple representing the dimensionality of the DataFrame.
- See Also
- --------
- ndarray.shape : Tuple of array dimensions.
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df.shape
- (2, 2)
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4],
- ... 'col3': [5, 6]})
- >>> df.shape
- (2, 3)
- """
- return len(self.index), len(self.columns)
- @property
- def _is_homogeneous_type(self) -> bool:
- """
- Whether all the columns in a DataFrame have the same type.
- Returns
- -------
- bool
- See Also
- --------
- Index._is_homogeneous_type : Whether the object has a single
- dtype.
- MultiIndex._is_homogeneous_type : Whether all the levels of a
- MultiIndex have the same dtype.
- Examples
- --------
- >>> DataFrame({"A": [1, 2], "B": [3, 4]})._is_homogeneous_type
- True
- >>> DataFrame({"A": [1, 2], "B": [3.0, 4.0]})._is_homogeneous_type
- False
- Items with the same type but different sizes are considered
- different types.
- >>> DataFrame({
- ... "A": np.array([1, 2], dtype=np.int32),
- ... "B": np.array([1, 2], dtype=np.int64)})._is_homogeneous_type
- False
- """
- if isinstance(self._mgr, ArrayManager):
- return len({arr.dtype for arr in self._mgr.arrays}) == 1
- if self._mgr.any_extension_types:
- return len({block.dtype for block in self._mgr.blocks}) == 1
- else:
- return not self._is_mixed_type
- @property
- def _can_fast_transpose(self) -> bool:
- """
- Can we transpose this DataFrame without creating any new array objects.
- """
- if isinstance(self._mgr, ArrayManager):
- return False
- blocks = self._mgr.blocks
- if len(blocks) != 1:
- return False
- dtype = blocks[0].dtype
- # TODO(EA2D) special case would be unnecessary with 2D EAs
- return not is_1d_only_ea_dtype(dtype)
- @property
- def _values(self) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray:
- """
- Analogue to ._values that may return a 2D ExtensionArray.
- """
- mgr = self._mgr
- if isinstance(mgr, ArrayManager):
- if len(mgr.arrays) == 1 and not is_1d_only_ea_dtype(mgr.arrays[0].dtype):
- # error: Item "ExtensionArray" of "Union[ndarray, ExtensionArray]"
- # has no attribute "reshape"
- return mgr.arrays[0].reshape(-1, 1) # type: ignore[union-attr]
- return ensure_wrapped_if_datetimelike(self.values)
- blocks = mgr.blocks
- if len(blocks) != 1:
- return ensure_wrapped_if_datetimelike(self.values)
- arr = blocks[0].values
- if arr.ndim == 1:
- # non-2D ExtensionArray
- return self.values
- # more generally, whatever we allow in NDArrayBackedExtensionBlock
- arr = cast("np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray", arr)
- return arr.T
- # ----------------------------------------------------------------------
- # Rendering Methods
- def _repr_fits_vertical_(self) -> bool:
- """
- Check length against max_rows.
- """
- max_rows = get_option("display.max_rows")
- return len(self) <= max_rows
- def _repr_fits_horizontal_(self, ignore_width: bool = False) -> bool:
- """
- Check if full repr fits in horizontal boundaries imposed by the display
- options width and max_columns.
- In a non-interactive session, no boundaries apply.
- `ignore_width` is here so ipynb+HTML output can behave the way
- users expect. display.max_columns remains in effect.
- GH3541, GH3573
- """
- width, height = console.get_console_size()
- max_columns = get_option("display.max_columns")
- nb_columns = len(self.columns)
- # exceed max columns
- if (max_columns and nb_columns > max_columns) or (
- (not ignore_width) and width and nb_columns > (width // 2)
- ):
- return False
- # used by repr_html under IPython notebook; scripts ignore terminal
- # dims
- if ignore_width or width is None or not console.in_interactive_session():
- return True
- if get_option("display.width") is not None or console.in_ipython_frontend():
- # check at least the column row for excessive width
- max_rows = 1
- else:
- max_rows = get_option("display.max_rows")
- # when auto-detecting, so width=None and not in ipython front end
- # check whether repr fits horizontal by actually checking
- # the width of the rendered repr
- buf = StringIO()
- # only care about the stuff we'll actually print out
- # and to_string on entire frame may be expensive
- d = self
- if max_rows is not None:
- # bounded rows: keep only the first min(max_rows, len(d)) rows so
- # to_string renders no more than what will actually be shown
- d = d.iloc[: min(max_rows, len(d))]
- else:
- # unlimited rows requested; assume the repr fits rather than
- # rendering the entire frame just to measure it
- return True
- d.to_string(buf=buf)
- value = buf.getvalue()
- repr_width = max(len(line) for line in value.split("\n"))
- return repr_width < width
- def _info_repr(self) -> bool:
- """
- True if the repr should show the info view.
- """
- info_repr_option = get_option("display.large_repr") == "info"
- return info_repr_option and not (
- self._repr_fits_horizontal_() and self._repr_fits_vertical_()
- )
- def __repr__(self) -> str:
- """
- Return a string representation for a particular DataFrame.
- """
- if self._info_repr():
- buf = StringIO()
- self.info(buf=buf)
- return buf.getvalue()
- repr_params = fmt.get_dataframe_repr_params()
- return self.to_string(**repr_params)
- def _repr_html_(self) -> str | None:
- """
- Return a html representation for a particular DataFrame.
- Mainly for IPython notebook.
- """
- if self._info_repr():
- buf = StringIO()
- self.info(buf=buf)
- # need to escape the <class>, should be the first line.
- val = buf.getvalue().replace("<", r"&lt;", 1)
- val = val.replace(">", r"&gt;", 1)
- return f"<pre>{val}</pre>"
- if get_option("display.notebook_repr_html"):
- max_rows = get_option("display.max_rows")
- min_rows = get_option("display.min_rows")
- max_cols = get_option("display.max_columns")
- show_dimensions = get_option("display.show_dimensions")
- formatter = fmt.DataFrameFormatter(
- self,
- columns=None,
- col_space=None,
- na_rep="NaN",
- formatters=None,
- float_format=None,
- sparsify=None,
- justify=None,
- index_names=True,
- header=True,
- index=True,
- bold_rows=True,
- escape=True,
- max_rows=max_rows,
- min_rows=min_rows,
- max_cols=max_cols,
- show_dimensions=show_dimensions,
- decimal=".",
- )
- return fmt.DataFrameRenderer(formatter).to_html(notebook=True)
- else:
- return None
- @overload
- def to_string(
- self,
- buf: None = ...,
- columns: Sequence[str] | None = ...,
- col_space: int | list[int] | dict[Hashable, int] | None = ...,
- header: bool | Sequence[str] = ...,
- index: bool = ...,
- na_rep: str = ...,
- formatters: fmt.FormattersType | None = ...,
- float_format: fmt.FloatFormatType | None = ...,
- sparsify: bool | None = ...,
- index_names: bool = ...,
- justify: str | None = ...,
- max_rows: int | None = ...,
- max_cols: int | None = ...,
- show_dimensions: bool = ...,
- decimal: str = ...,
- line_width: int | None = ...,
- min_rows: int | None = ...,
- max_colwidth: int | None = ...,
- encoding: str | None = ...,
- ) -> str:
- ...
- @overload
- def to_string(
- self,
- buf: FilePath | WriteBuffer[str],
- columns: Sequence[str] | None = ...,
- col_space: int | list[int] | dict[Hashable, int] | None = ...,
- header: bool | Sequence[str] = ...,
- index: bool = ...,
- na_rep: str = ...,
- formatters: fmt.FormattersType | None = ...,
- float_format: fmt.FloatFormatType | None = ...,
- sparsify: bool | None = ...,
- index_names: bool = ...,
- justify: str | None = ...,
- max_rows: int | None = ...,
- max_cols: int | None = ...,
- show_dimensions: bool = ...,
- decimal: str = ...,
- line_width: int | None = ...,
- min_rows: int | None = ...,
- max_colwidth: int | None = ...,
- encoding: str | None = ...,
- ) -> None:
- ...
- @Substitution(
- header_type="bool or sequence of str",
- header="Write out the column names. If a list of strings "
- "is given, it is assumed to be aliases for the "
- "column names",
- col_space_type="int, list or dict of int",
- col_space="The minimum width of each column. If a list of ints is given "
- "every integers corresponds with one column. If a dict is given, the key "
- "references the column, while the value defines the space to use.",
- )
- @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
- def to_string(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- columns: Sequence[str] | None = None,
- col_space: int | list[int] | dict[Hashable, int] | None = None,
- header: bool | Sequence[str] = True,
- index: bool = True,
- na_rep: str = "NaN",
- formatters: fmt.FormattersType | None = None,
- float_format: fmt.FloatFormatType | None = None,
- sparsify: bool | None = None,
- index_names: bool = True,
- justify: str | None = None,
- max_rows: int | None = None,
- max_cols: int | None = None,
- show_dimensions: bool = False,
- decimal: str = ".",
- line_width: int | None = None,
- min_rows: int | None = None,
- max_colwidth: int | None = None,
- encoding: str | None = None,
- ) -> str | None:
- """
- Render a DataFrame to a console-friendly tabular output.
- %(shared_params)s
- line_width : int, optional
- Width to wrap a line in characters.
- min_rows : int, optional
- The number of rows to display in the console in a truncated repr
- (when number of rows is above `max_rows`).
- max_colwidth : int, optional
- Max width to truncate each column in characters. By default, no limit.
- encoding : str, default "utf-8"
- Set character encoding.
- %(returns)s
- See Also
- --------
- to_html : Convert DataFrame to HTML.
- Examples
- --------
- >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]}
- >>> df = pd.DataFrame(d)
- >>> print(df.to_string())
- col1 col2
- 0 1 4
- 1 2 5
- 2 3 6
- """
- from pandas import option_context
- with option_context("display.max_colwidth", max_colwidth):
- formatter = fmt.DataFrameFormatter(
- self,
- columns=columns,
- col_space=col_space,
- na_rep=na_rep,
- formatters=formatters,
- float_format=float_format,
- sparsify=sparsify,
- justify=justify,
- index_names=index_names,
- header=header,
- index=index,
- min_rows=min_rows,
- max_rows=max_rows,
- max_cols=max_cols,
- show_dimensions=show_dimensions,
- decimal=decimal,
- )
- return fmt.DataFrameRenderer(formatter).to_string(
- buf=buf,
- encoding=encoding,
- line_width=line_width,
- )
- # ----------------------------------------------------------------------
- @property
- def style(self) -> Styler:
- """
- Returns a Styler object.
- Contains methods for building a styled HTML representation of the DataFrame.
- See Also
- --------
- io.formats.style.Styler : Helps style a DataFrame or Series according to the
- data with HTML and CSS.
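- Examples
- --------
- A minimal sketch; rendering the result requires the optional ``jinja2``
- dependency, so the call is skipped here:
- >>> df = pd.DataFrame({'A': [1, 2, 3]})
- >>> df.style.highlight_max(color='yellow')  # doctest: +SKIP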
- """
- from pandas.io.formats.style import Styler
- return Styler(self)
- _shared_docs[
- "items"
- ] = r"""
- Iterate over (column name, Series) pairs.
- Iterates over the DataFrame columns, returning a tuple with
- the column name and the content as a Series.
- Yields
- ------
- label : object
- The column names for the DataFrame being iterated over.
- content : Series
- The column entries belonging to each label, as a Series.
- See Also
- --------
- DataFrame.iterrows : Iterate over DataFrame rows as
- (index, Series) pairs.
- DataFrame.itertuples : Iterate over DataFrame rows as namedtuples
- of the values.
- Examples
- --------
- >>> df = pd.DataFrame({'species': ['bear', 'bear', 'marsupial'],
- ... 'population': [1864, 22000, 80000]},
- ... index=['panda', 'polar', 'koala'])
- >>> df
- species population
- panda bear 1864
- polar bear 22000
- koala marsupial 80000
- >>> for label, content in df.items():
- ... print(f'label: {label}')
- ... print(f'content: {content}', sep='\n')
- ...
- label: species
- content:
- panda bear
- polar bear
- koala marsupial
- Name: species, dtype: object
- label: population
- content:
- panda 1864
- polar 22000
- koala 80000
- Name: population, dtype: int64
- """
- @Appender(_shared_docs["items"])
- def items(self) -> Iterable[tuple[Hashable, Series]]:
- if self.columns.is_unique and hasattr(self, "_item_cache"):
- for k in self.columns:
- yield k, self._get_item_cache(k)
- else:
- for i, k in enumerate(self.columns):
- yield k, self._ixs(i, axis=1)
- def iterrows(self) -> Iterable[tuple[Hashable, Series]]:
- """
- Iterate over DataFrame rows as (index, Series) pairs.
- Yields
- ------
- index : label or tuple of label
- The index of the row. A tuple for a `MultiIndex`.
- data : Series
- The data of the row as a Series.
- See Also
- --------
- DataFrame.itertuples : Iterate over DataFrame rows as namedtuples of the values.
- DataFrame.items : Iterate over (column name, Series) pairs.
- Notes
- -----
- 1. Because ``iterrows`` returns a Series for each row,
- it does **not** preserve dtypes across the rows (dtypes are
- preserved across columns for DataFrames). For example,
- >>> df = pd.DataFrame([[1, 1.5]], columns=['int', 'float'])
- >>> row = next(df.iterrows())[1]
- >>> row
- int 1.0
- float 1.5
- Name: 0, dtype: float64
- >>> print(row['int'].dtype)
- float64
- >>> print(df['int'].dtype)
- int64
- To preserve dtypes while iterating over the rows, it is better
- to use :meth:`itertuples` which returns namedtuples of the values
- and which is generally faster than ``iterrows``.
- 2. You should **never modify** something you are iterating over.
- This is not guaranteed to work in all cases. Depending on the
- data types, the iterator returns a copy and not a view, and writing
- to it will have no effect.
- """
- columns = self.columns
- klass = self._constructor_sliced
- using_cow = using_copy_on_write()
- for k, v in zip(self.index, self.values):
- s = klass(v, index=columns, name=k).__finalize__(self)
- if using_cow and self._mgr.is_single_block:
- s._mgr.add_references(self._mgr) # type: ignore[arg-type]
- yield k, s
- def itertuples(
- self, index: bool = True, name: str | None = "Pandas"
- ) -> Iterable[tuple[Any, ...]]:
- """
- Iterate over DataFrame rows as namedtuples.
- Parameters
- ----------
- index : bool, default True
- If True, return the index as the first element of the tuple.
- name : str or None, default "Pandas"
- The name of the returned namedtuples or None to return regular
- tuples.
- Returns
- -------
- iterator
- An object to iterate over namedtuples for each row in the
- DataFrame with the first field possibly being the index and
- following fields being the column values.
- See Also
- --------
- DataFrame.iterrows : Iterate over DataFrame rows as (index, Series)
- pairs.
- DataFrame.items : Iterate over (column name, Series) pairs.
- Notes
- -----
- The column names will be renamed to positional names if they are
- invalid Python identifiers, repeated, or start with an underscore.
- Examples
- --------
- >>> df = pd.DataFrame({'num_legs': [4, 2], 'num_wings': [0, 2]},
- ... index=['dog', 'hawk'])
- >>> df
- num_legs num_wings
- dog 4 0
- hawk 2 2
- >>> for row in df.itertuples():
- ... print(row)
- ...
- Pandas(Index='dog', num_legs=4, num_wings=0)
- Pandas(Index='hawk', num_legs=2, num_wings=2)
- By setting the `index` parameter to False we can remove the index
- as the first element of the tuple:
- >>> for row in df.itertuples(index=False):
- ... print(row)
- ...
- Pandas(num_legs=4, num_wings=0)
- Pandas(num_legs=2, num_wings=2)
- With the `name` parameter set we set a custom name for the yielded
- namedtuples:
- >>> for row in df.itertuples(name='Animal'):
- ... print(row)
- ...
- Animal(Index='dog', num_legs=4, num_wings=0)
- Animal(Index='hawk', num_legs=2, num_wings=2)
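- As noted above, duplicate column names are replaced with positional
- names; a small sketch:
- >>> df_dup = pd.DataFrame([[1, 2]], columns=['a', 'a'])
- >>> next(df_dup.itertuples(index=False))
- Pandas(a=1, _1=2)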
- """
- arrays = []
- fields = list(self.columns)
- if index:
- arrays.append(self.index)
- fields.insert(0, "Index")
- # use integer indexing because of possible duplicate column names
- arrays.extend(self.iloc[:, k] for k in range(len(self.columns)))
- if name is not None:
- # https://github.com/python/mypy/issues/9046
- # error: namedtuple() expects a string literal as the first argument
- itertuple = collections.namedtuple( # type: ignore[misc]
- name, fields, rename=True
- )
- return map(itertuple._make, zip(*arrays))
- # fallback to regular tuples
- return zip(*arrays)
- def __len__(self) -> int:
- """
- Return the length of the info axis, which for a DataFrame is the index (the number of rows).
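- Examples
- --------
- A small example counting rows:
- >>> df = pd.DataFrame({'A': [1, 2, 3]})
- >>> len(df)
- 3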
- """
- return len(self.index)
- @overload
- def dot(self, other: Series) -> Series:
- ...
- @overload
- def dot(self, other: DataFrame | Index | ArrayLike) -> DataFrame:
- ...
- def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
- """
- Compute the matrix multiplication between the DataFrame and other.
- This method computes the matrix product between the DataFrame and the
- values of another Series, DataFrame or a numpy array.
- It can also be called using ``self @ other`` in Python >= 3.5.
- Parameters
- ----------
- other : Series, DataFrame or array-like
- The other object to compute the matrix product with.
- Returns
- -------
- Series or DataFrame
- If other is a Series, return the matrix product between self and
- other as a Series. If other is a DataFrame or a numpy.array, return
- the matrix product of self and other as a DataFrame or a np.array.
- See Also
- --------
- Series.dot: Similar method for Series.
- Notes
- -----
- The dimensions of DataFrame and other must be compatible in order to
- compute the matrix multiplication. In addition, the column names of
- DataFrame and the index of other must contain the same values, as they
- will be aligned prior to the multiplication.
- The dot method for Series computes the inner product, instead of the
- matrix product here.
- Examples
- --------
- Here we multiply a DataFrame with a Series.
- >>> df = pd.DataFrame([[0, 1, -2, -1], [1, 1, 1, 1]])
- >>> s = pd.Series([1, 1, 2, 1])
- >>> df.dot(s)
- 0 -4
- 1 5
- dtype: int64
- Here we multiply a DataFrame with another DataFrame.
- >>> other = pd.DataFrame([[0, 1], [1, 2], [-1, -1], [2, 0]])
- >>> df.dot(other)
- 0 1
- 0 1 4
- 1 2 2
- Note that the dot method gives the same result as using the @ operator.
- >>> df @ other
- 0 1
- 0 1 4
- 1 2 2
- The dot method also works if other is a np.array.
- >>> arr = np.array([[0, 1], [1, 2], [-1, -1], [2, 0]])
- >>> df.dot(arr)
- 0 1
- 0 1 4
- 1 2 2
- Note how shuffling the other object does not change the result, because the objects are aligned first.
- >>> s2 = s.reindex([1, 0, 2, 3])
- >>> df.dot(s2)
- 0 -4
- 1 5
- dtype: int64
- """
- if isinstance(other, (Series, DataFrame)):
- common = self.columns.union(other.index)
- if len(common) > len(self.columns) or len(common) > len(other.index):
- raise ValueError("matrices are not aligned")
- left = self.reindex(columns=common, copy=False)
- right = other.reindex(index=common, copy=False)
- lvals = left.values
- rvals = right._values
- else:
- left = self
- lvals = self.values
- rvals = np.asarray(other)
- if lvals.shape[1] != rvals.shape[0]:
- raise ValueError(
- f"Dot product shape mismatch, {lvals.shape} vs {rvals.shape}"
- )
- if isinstance(other, DataFrame):
- return self._constructor(
- np.dot(lvals, rvals),
- index=left.index,
- columns=other.columns,
- copy=False,
- )
- elif isinstance(other, Series):
- return self._constructor_sliced(
- np.dot(lvals, rvals), index=left.index, copy=False
- )
- elif isinstance(rvals, (np.ndarray, Index)):
- result = np.dot(lvals, rvals)
- if result.ndim == 2:
- return self._constructor(result, index=left.index, copy=False)
- else:
- return self._constructor_sliced(result, index=left.index, copy=False)
- else: # pragma: no cover
- raise TypeError(f"unsupported type: {type(other)}")
- @overload
- def __matmul__(self, other: Series) -> Series:
- ...
- @overload
- def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
- ...
- def __matmul__(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:
- """
- Matrix multiplication using the binary `@` operator in Python >= 3.5.
- """
- return self.dot(other)
- def __rmatmul__(self, other) -> DataFrame:
- """
- Matrix multiplication using the binary `@` operator in Python >= 3.5.
- """
- try:
- return self.T.dot(np.transpose(other)).T
- except ValueError as err:
- if "shape mismatch" not in str(err):
- raise
- # GH#21581 give exception message for original shapes
- msg = f"shapes {np.shape(other)} and {self.shape} not aligned"
- raise ValueError(msg) from err
- # ----------------------------------------------------------------------
- # IO methods (to / from other formats)
- @classmethod
- def from_dict(
- cls,
- data: dict,
- orient: str = "columns",
- dtype: Dtype | None = None,
- columns: Axes | None = None,
- ) -> DataFrame:
- """
- Construct DataFrame from dict of array-like or dicts.
- Creates DataFrame object from dictionary by columns or by index
- allowing dtype specification.
- Parameters
- ----------
- data : dict
- Of the form {field : array-like} or {field : dict}.
- orient : {'columns', 'index', 'tight'}, default 'columns'
- The "orientation" of the data. If the keys of the passed dict
- should be the columns of the resulting DataFrame, pass 'columns'
- (default). Otherwise if the keys should be rows, pass 'index'.
- If 'tight', assume a dict with keys ['index', 'columns', 'data',
- 'index_names', 'column_names'].
- .. versionadded:: 1.4.0
- 'tight' as an allowed value for the ``orient`` argument
- dtype : dtype, default None
- Data type to force after DataFrame construction, otherwise infer.
- columns : list, default None
- Column labels to use when ``orient='index'``. Raises a ValueError
- if used with ``orient='columns'`` or ``orient='tight'``.
- Returns
- -------
- DataFrame
- See Also
- --------
- DataFrame.from_records : DataFrame from structured ndarray, sequence
- of tuples or dicts, or DataFrame.
- DataFrame : DataFrame object creation using constructor.
- DataFrame.to_dict : Convert the DataFrame to a dictionary.
- Examples
- --------
- By default the keys of the dict become the DataFrame columns:
- >>> data = {'col_1': [3, 2, 1, 0], 'col_2': ['a', 'b', 'c', 'd']}
- >>> pd.DataFrame.from_dict(data)
- col_1 col_2
- 0 3 a
- 1 2 b
- 2 1 c
- 3 0 d
- Specify ``orient='index'`` to create the DataFrame using dictionary
- keys as rows:
- >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
- >>> pd.DataFrame.from_dict(data, orient='index')
- 0 1 2 3
- row_1 3 2 1 0
- row_2 a b c d
- When using the 'index' orientation, the column names can be
- specified manually:
- >>> pd.DataFrame.from_dict(data, orient='index',
- ... columns=['A', 'B', 'C', 'D'])
- A B C D
- row_1 3 2 1 0
- row_2 a b c d
- Specify ``orient='tight'`` to create the DataFrame using a 'tight'
- format:
- >>> data = {'index': [('a', 'b'), ('a', 'c')],
- ... 'columns': [('x', 1), ('y', 2)],
- ... 'data': [[1, 3], [2, 4]],
- ... 'index_names': ['n1', 'n2'],
- ... 'column_names': ['z1', 'z2']}
- >>> pd.DataFrame.from_dict(data, orient='tight')
- z1 x y
- z2 1 2
- n1 n2
- a b 1 3
- c 2 4
- """
- index = None
- orient = orient.lower()
- if orient == "index":
- if len(data) > 0:
- # TODO speed up Series case
- if isinstance(list(data.values())[0], (Series, dict)):
- data = _from_nested_dict(data)
- else:
- index = list(data.keys())
- # error: Incompatible types in assignment (expression has type
- # "List[Any]", variable has type "Dict[Any, Any]")
- data = list(data.values()) # type: ignore[assignment]
- elif orient in ("columns", "tight"):
- if columns is not None:
- raise ValueError(f"cannot use columns parameter with orient='{orient}'")
- else: # pragma: no cover
- raise ValueError(
- f"Expected 'index', 'columns' or 'tight' for orient parameter. "
- f"Got '{orient}' instead"
- )
- if orient != "tight":
- return cls(data, index=index, columns=columns, dtype=dtype)
- else:
- realdata = data["data"]
- def create_index(indexlist, namelist):
- index: Index
- if len(namelist) > 1:
- index = MultiIndex.from_tuples(indexlist, names=namelist)
- else:
- index = Index(indexlist, name=namelist[0])
- return index
- index = create_index(data["index"], data["index_names"])
- columns = create_index(data["columns"], data["column_names"])
- return cls(realdata, index=index, columns=columns, dtype=dtype)
- def to_numpy(
- self,
- dtype: npt.DTypeLike | None = None,
- copy: bool = False,
- na_value: object = lib.no_default,
- ) -> np.ndarray:
- """
- Convert the DataFrame to a NumPy array.
- By default, the dtype of the returned array will be the common NumPy
- dtype of all types in the DataFrame. For example, if the dtypes are
- ``float16`` and ``float32``, the results dtype will be ``float32``.
- This may require copying data and coercing values, which may be
- expensive.
- Parameters
- ----------
- dtype : str or numpy.dtype, optional
- The dtype to pass to :meth:`numpy.asarray`.
- copy : bool, default False
- Whether to ensure that the returned value is not a view on
- another array. Note that ``copy=False`` does not *ensure* that
- ``to_numpy()`` is no-copy. Rather, ``copy=True`` ensures that
- a copy is made, even if not strictly necessary.
- na_value : Any, optional
- The value to use for missing values. The default value depends
- on `dtype` and the dtypes of the DataFrame columns.
- .. versionadded:: 1.1.0
- Returns
- -------
- numpy.ndarray
- See Also
- --------
- Series.to_numpy : Similar method for Series.
- Examples
- --------
- >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()
- array([[1, 3],
- [2, 4]])
- With heterogeneous data, the lowest common type will have to
- be used.
- >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
- >>> df.to_numpy()
- array([[1. , 3. ],
- [2. , 4.5]])
- For a mix of numeric and non-numeric types, the output array will
- have object dtype.
- >>> df['C'] = pd.date_range('2000', periods=2)
- >>> df.to_numpy()
- array([[1, 3.0, Timestamp('2000-01-01 00:00:00')],
- [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object)
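- The ``na_value`` argument controls how missing values appear in the
- result; a small sketch:
- >>> pd.DataFrame({"A": [1.0, None]}).to_numpy(na_value=0)
- array([[1.],
- [0.]])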
- """
- if dtype is not None:
- dtype = np.dtype(dtype)
- result = self._mgr.as_array(dtype=dtype, copy=copy, na_value=na_value)
- if result.dtype is not dtype:
- result = np.array(result, dtype=dtype, copy=False)
- return result
- def _create_data_for_split_and_tight_to_dict(
- self, are_all_object_dtype_cols: bool, object_dtype_indices: list[int]
- ) -> list:
- """
- Simple helper method to create the main output data for
- ``to_dict(orient="split")`` and ``to_dict(orient="tight")``.
- """
- if are_all_object_dtype_cols:
- data = [
- list(map(maybe_box_native, t))
- for t in self.itertuples(index=False, name=None)
- ]
- else:
- data = [list(t) for t in self.itertuples(index=False, name=None)]
- if object_dtype_indices:
- # If we have object_dtype_cols, apply maybe_box_native after list
- # comprehension for perf
- for row in data:
- for i in object_dtype_indices:
- row[i] = maybe_box_native(row[i])
- return data
- @overload
- def to_dict(
- self,
- orient: Literal["dict", "list", "series", "split", "tight", "index"] = ...,
- into: type[dict] = ...,
- ) -> dict:
- ...
- @overload
- def to_dict(self, orient: Literal["records"], into: type[dict] = ...) -> list[dict]:
- ...
- def to_dict(
- self,
- orient: Literal[
- "dict", "list", "series", "split", "tight", "records", "index"
- ] = "dict",
- into: type[dict] = dict,
- index: bool = True,
- ) -> dict | list[dict]:
- """
- Convert the DataFrame to a dictionary.
- The type of the key-value pairs can be customized with the parameters
- (see below).
- Parameters
- ----------
- orient : str {'dict', 'list', 'series', 'split', 'tight', 'records', 'index'}
- Determines the type of the values of the dictionary.
- - 'dict' (default) : dict like {column -> {index -> value}}
- - 'list' : dict like {column -> [values]}
- - 'series' : dict like {column -> Series(values)}
- - 'split' : dict like
- {'index' -> [index], 'columns' -> [columns], 'data' -> [values]}
- - 'tight' : dict like
- {'index' -> [index], 'columns' -> [columns], 'data' -> [values],
- 'index_names' -> [index.names], 'column_names' -> [column.names]}
- - 'records' : list like
- [{column -> value}, ... , {column -> value}]
- - 'index' : dict like {index -> {column -> value}}
- .. versionadded:: 1.4.0
- 'tight' as an allowed value for the ``orient`` argument
- into : class, default dict
- The collections.abc.Mapping subclass used for all Mappings
- in the return value. Can be the actual class or an empty
- instance of the mapping type you want. If you want a
- collections.defaultdict, you must pass it initialized.
- index : bool, default True
- Whether to include the index item (and index_names item if `orient`
- is 'tight') in the returned dictionary. Can only be ``False``
- when `orient` is 'split' or 'tight'.
- .. versionadded:: 2.0.0
- Returns
- -------
- dict, list or collections.abc.Mapping
- Return a collections.abc.Mapping object representing the DataFrame.
- The resulting transformation depends on the `orient` parameter.
- See Also
- --------
- DataFrame.from_dict: Create a DataFrame from a dictionary.
- DataFrame.to_json: Convert a DataFrame to JSON format.
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2],
- ... 'col2': [0.5, 0.75]},
- ... index=['row1', 'row2'])
- >>> df
- col1 col2
- row1 1 0.50
- row2 2 0.75
- >>> df.to_dict()
- {'col1': {'row1': 1, 'row2': 2}, 'col2': {'row1': 0.5, 'row2': 0.75}}
- You can specify the return orientation.
- >>> df.to_dict('series')
- {'col1': row1 1
- row2 2
- Name: col1, dtype: int64,
- 'col2': row1 0.50
- row2 0.75
- Name: col2, dtype: float64}
- >>> df.to_dict('split')
- {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
- 'data': [[1, 0.5], [2, 0.75]]}
- >>> df.to_dict('records')
- [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
- >>> df.to_dict('index')
- {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}}
- >>> df.to_dict('tight')
- {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'],
- 'data': [[1, 0.5], [2, 0.75]], 'index_names': [None], 'column_names': [None]}
- You can also specify the mapping type.
- >>> from collections import OrderedDict, defaultdict
- >>> df.to_dict(into=OrderedDict)
- OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
- ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
- If you want a `defaultdict`, you need to initialize it:
- >>> dd = defaultdict(list)
- >>> df.to_dict('records', into=dd)
- [defaultdict(<class 'list'>, {'col1': 1, 'col2': 0.5}),
- defaultdict(<class 'list'>, {'col1': 2, 'col2': 0.75})]
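- With ``orient='split'`` or ``orient='tight'``, passing ``index=False``
- drops the index from the result; a small sketch:
- >>> df.to_dict('split', index=False)
- {'columns': ['col1', 'col2'], 'data': [[1, 0.5], [2, 0.75]]}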
- """
- from pandas.core.methods.to_dict import to_dict
- return to_dict(self, orient, into, index)
- def to_gbq(
- self,
- destination_table: str,
- project_id: str | None = None,
- chunksize: int | None = None,
- reauth: bool = False,
- if_exists: str = "fail",
- auth_local_webserver: bool = True,
- table_schema: list[dict[str, str]] | None = None,
- location: str | None = None,
- progress_bar: bool = True,
- credentials=None,
- ) -> None:
- """
- Write a DataFrame to a Google BigQuery table.
- This function requires the `pandas-gbq package
- <https://pandas-gbq.readthedocs.io>`__.
- See the `How to authenticate with Google BigQuery
- <https://pandas-gbq.readthedocs.io/en/latest/howto/authentication.html>`__
- guide for authentication instructions.
- Parameters
- ----------
- destination_table : str
- Name of table to be written, in the form ``dataset.tablename``.
- project_id : str, optional
- Google BigQuery Account project ID. Optional when available from
- the environment.
- chunksize : int, optional
- Number of rows to be inserted in each chunk from the dataframe.
- Set to ``None`` to load the whole dataframe at once.
- reauth : bool, default False
- Force Google BigQuery to re-authenticate the user. This is useful
- if multiple accounts are used.
- if_exists : str, default 'fail'
- Behavior when the destination table exists. Value can be one of:
- ``'fail'``
- If table exists raise pandas_gbq.gbq.TableCreationError.
- ``'replace'``
- If table exists, drop it, recreate it, and insert data.
- ``'append'``
- If table exists, insert data. Create if does not exist.
- auth_local_webserver : bool, default True
- Use the `local webserver flow`_ instead of the `console flow`_
- when getting user credentials.
- .. _local webserver flow:
- https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_local_server
- .. _console flow:
- https://google-auth-oauthlib.readthedocs.io/en/latest/reference/google_auth_oauthlib.flow.html#google_auth_oauthlib.flow.InstalledAppFlow.run_console
- *New in version 0.2.0 of pandas-gbq*.
- .. versionchanged:: 1.5.0
- Default value is changed to ``True``. Google has deprecated the
- ``auth_local_webserver = False`` `"out of band" (copy-paste)
- flow
- <https://developers.googleblog.com/2022/02/making-oauth-flows-safer.html?m=1#disallowed-oob>`_.
- table_schema : list of dicts, optional
- List of BigQuery table fields to which the DataFrame columns
- conform, e.g. ``[{'name': 'col1', 'type':
- 'STRING'},...]``. If schema is not provided, it will be
- generated according to dtypes of DataFrame columns. See
- BigQuery API documentation on available names of a field.
- *New in version 0.3.1 of pandas-gbq*.
- location : str, optional
- Location where the load job should run. See the `BigQuery locations
- documentation
- <https://cloud.google.com/bigquery/docs/dataset-locations>`__ for a
- list of available locations. The location must match that of the
- target dataset.
- *New in version 0.5.0 of pandas-gbq*.
- progress_bar : bool, default True
- Use the library `tqdm` to show the progress bar for the upload,
- chunk by chunk.
- *New in version 0.5.0 of pandas-gbq*.
- credentials : google.auth.credentials.Credentials, optional
- Credentials for accessing Google APIs. Use this parameter to
- override default credentials, such as to use Compute Engine
- :class:`google.auth.compute_engine.Credentials` or Service
- Account :class:`google.oauth2.service_account.Credentials`
- directly.
- *New in version 0.8.0 of pandas-gbq*.
- See Also
- --------
- pandas_gbq.to_gbq : This function in the pandas-gbq library.
- read_gbq : Read a DataFrame from Google BigQuery.
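- Examples
- --------
- A hedged sketch; the table name, project id and credentials below are
- placeholders, and the call needs the ``pandas-gbq`` package installed:
- >>> df = pd.DataFrame({'my_col': [1, 2, 3]})
- >>> df.to_gbq('my_dataset.my_table', project_id='my-project')  # doctest: +SKIP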
- """
- from pandas.io import gbq
- gbq.to_gbq(
- self,
- destination_table,
- project_id=project_id,
- chunksize=chunksize,
- reauth=reauth,
- if_exists=if_exists,
- auth_local_webserver=auth_local_webserver,
- table_schema=table_schema,
- location=location,
- progress_bar=progress_bar,
- credentials=credentials,
- )
- @classmethod
- def from_records(
- cls,
- data,
- index=None,
- exclude=None,
- columns=None,
- coerce_float: bool = False,
- nrows: int | None = None,
- ) -> DataFrame:
- """
- Convert structured or record ndarray to DataFrame.
- Creates a DataFrame object from a structured ndarray, sequence of
- tuples or dicts, or DataFrame.
- Parameters
- ----------
- data : structured ndarray, sequence of tuples or dicts, or DataFrame
- Structured input data.
- index : str, list of fields, array-like
- Field of array to use as the index, alternately a specific set of
- input labels to use.
- exclude : sequence, default None
- Columns or fields to exclude.
- columns : sequence, default None
- Column names to use. If the passed data do not have names
- associated with them, this argument provides names for the
- columns. Otherwise this argument indicates the order of the columns
- in the result (any names not found in the data will become all-NA
- columns).
- coerce_float : bool, default False
- Attempt to convert values of non-string, non-numeric objects (like
- decimal.Decimal) to floating point, useful for SQL result sets.
- nrows : int, default None
- Number of rows to read if data is an iterator.
- Returns
- -------
- DataFrame
- See Also
- --------
- DataFrame.from_dict : DataFrame from dict of array-like or dicts.
- DataFrame : DataFrame object creation using constructor.
- Examples
- --------
- Data can be provided as a structured ndarray:
- >>> data = np.array([(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')],
- ... dtype=[('col_1', 'i4'), ('col_2', 'U1')])
- >>> pd.DataFrame.from_records(data)
- col_1 col_2
- 0 3 a
- 1 2 b
- 2 1 c
- 3 0 d
- Data can be provided as a list of dicts:
- >>> data = [{'col_1': 3, 'col_2': 'a'},
- ... {'col_1': 2, 'col_2': 'b'},
- ... {'col_1': 1, 'col_2': 'c'},
- ... {'col_1': 0, 'col_2': 'd'}]
- >>> pd.DataFrame.from_records(data)
- col_1 col_2
- 0 3 a
- 1 2 b
- 2 1 c
- 3 0 d
- Data can be provided as a list of tuples with corresponding columns:
- >>> data = [(3, 'a'), (2, 'b'), (1, 'c'), (0, 'd')]
- >>> pd.DataFrame.from_records(data, columns=['col_1', 'col_2'])
- col_1 col_2
- 0 3 a
- 1 2 b
- 2 1 c
- 3 0 d
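- When ``data`` is an iterator, ``nrows`` limits how many records are
- consumed; a small sketch:
- >>> gen = ((i, chr(ord('a') + i)) for i in range(4))
- >>> pd.DataFrame.from_records(gen, columns=['col_1', 'col_2'], nrows=2)
- col_1 col_2
- 0 0 a
- 1 1 b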
- """
- if isinstance(data, DataFrame):
- if columns is not None:
- if is_scalar(columns):
- columns = [columns]
- data = data[columns]
- if index is not None:
- data = data.set_index(index)
- if exclude is not None:
- data = data.drop(columns=exclude)
- return data.copy(deep=False)
- result_index = None
- # Make a copy of the input columns so we can modify it
- if columns is not None:
- columns = ensure_index(columns)
- def maybe_reorder(
- arrays: list[ArrayLike], arr_columns: Index, columns: Index, index
- ) -> tuple[list[ArrayLike], Index, Index | None]:
- """
- If our desired 'columns' do not match the data's pre-existing 'arr_columns',
- we re-order our arrays. This is like a pre-emptive (cheap) reindex.
- """
- if len(arrays):
- length = len(arrays[0])
- else:
- length = 0
- result_index = None
- if len(arrays) == 0 and index is None and length == 0:
- result_index = default_index(0)
- arrays, arr_columns = reorder_arrays(arrays, arr_columns, columns, length)
- return arrays, arr_columns, result_index
- if is_iterator(data):
- if nrows == 0:
- return cls()
- try:
- first_row = next(data)
- except StopIteration:
- return cls(index=index, columns=columns)
- dtype = None
- if hasattr(first_row, "dtype") and first_row.dtype.names:
- dtype = first_row.dtype
- values = [first_row]
- if nrows is None:
- values += data
- else:
- values.extend(itertools.islice(data, nrows - 1))
- if dtype is not None:
- data = np.array(values, dtype=dtype)
- else:
- data = values
- if isinstance(data, dict):
- if columns is None:
- columns = arr_columns = ensure_index(sorted(data))
- arrays = [data[k] for k in columns]
- else:
- arrays = []
- arr_columns_list = []
- for k, v in data.items():
- if k in columns:
- arr_columns_list.append(k)
- arrays.append(v)
- arr_columns = Index(arr_columns_list)
- arrays, arr_columns, result_index = maybe_reorder(
- arrays, arr_columns, columns, index
- )
- elif isinstance(data, (np.ndarray, DataFrame)):
- arrays, columns = to_arrays(data, columns)
- arr_columns = columns
- else:
- arrays, arr_columns = to_arrays(data, columns)
- if coerce_float:
- for i, arr in enumerate(arrays):
- if arr.dtype == object:
- # error: Argument 1 to "maybe_convert_objects" has
- # incompatible type "Union[ExtensionArray, ndarray]";
- # expected "ndarray"
- arrays[i] = lib.maybe_convert_objects(
- arr, # type: ignore[arg-type]
- try_float=True,
- )
- arr_columns = ensure_index(arr_columns)
- if columns is None:
- columns = arr_columns
- else:
- arrays, arr_columns, result_index = maybe_reorder(
- arrays, arr_columns, columns, index
- )
- if exclude is None:
- exclude = set()
- else:
- exclude = set(exclude)
- if index is not None:
- if isinstance(index, str) or not hasattr(index, "__iter__"):
- i = columns.get_loc(index)
- exclude.add(index)
- if len(arrays) > 0:
- result_index = Index(arrays[i], name=index)
- else:
- result_index = Index([], name=index)
- else:
- try:
- index_data = [arrays[arr_columns.get_loc(field)] for field in index]
- except (KeyError, TypeError):
- # raised by get_loc, see GH#29258
- result_index = index
- else:
- result_index = ensure_index_from_sequences(index_data, names=index)
- exclude.update(index)
- # check for a non-empty set; any() would miss falsy labels such as 0
- if exclude:
- arr_exclude = [x for x in exclude if x in arr_columns]
- to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
- arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
- columns = columns.drop(exclude)
- manager = get_option("mode.data_manager")
- mgr = arrays_to_mgr(arrays, columns, result_index, typ=manager)
- return cls(mgr)
- def to_records(
- self, index: bool = True, column_dtypes=None, index_dtypes=None
- ) -> np.recarray:
- """
- Convert DataFrame to a NumPy record array.
- Index will be included as the first field of the record array if
- requested.
- Parameters
- ----------
- index : bool, default True
- Include index in resulting record array, stored in 'index'
- field or using the index label, if set.
- column_dtypes : str, type, dict, default None
- If a string or type, the data type to store all columns. If
- a dictionary, a mapping of column names and indices (zero-indexed)
- to specific data types.
- index_dtypes : str, type, dict, default None
- If a string or type, the data type to store all index levels. If
- a dictionary, a mapping of index level names and indices
- (zero-indexed) to specific data types.
- This mapping is applied only if `index=True`.
- Returns
- -------
- numpy.recarray
- NumPy ndarray with the DataFrame labels as fields and each row
- of the DataFrame as entries.
- See Also
- --------
- DataFrame.from_records: Convert structured or record ndarray
- to DataFrame.
- numpy.recarray: An ndarray that allows field access using
- attributes, analogous to typed columns in a
- spreadsheet.
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 2], 'B': [0.5, 0.75]},
- ... index=['a', 'b'])
- >>> df
- A B
- a 1 0.50
- b 2 0.75
- >>> df.to_records()
- rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
- dtype=[('index', 'O'), ('A', '<i8'), ('B', '<f8')])
- If the DataFrame index has no label then the recarray field name
- is set to 'index'. If the index has a label then this is used as the
- field name:
- >>> df.index = df.index.rename("I")
- >>> df.to_records()
- rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
- dtype=[('I', 'O'), ('A', '<i8'), ('B', '<f8')])
- The index can be excluded from the record array:
- >>> df.to_records(index=False)
- rec.array([(1, 0.5 ), (2, 0.75)],
- dtype=[('A', '<i8'), ('B', '<f8')])
- Data types can be specified for the columns:
- >>> df.to_records(column_dtypes={"A": "int32"})
- rec.array([('a', 1, 0.5 ), ('b', 2, 0.75)],
- dtype=[('I', 'O'), ('A', '<i4'), ('B', '<f8')])
- As well as for the index:
- >>> df.to_records(index_dtypes="<S2")
- rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
- dtype=[('I', 'S2'), ('A', '<i8'), ('B', '<f8')])
- >>> index_dtypes = f"<S{df.index.str.len().max()}"
- >>> df.to_records(index_dtypes=index_dtypes)
- rec.array([(b'a', 1, 0.5 ), (b'b', 2, 0.75)],
- dtype=[('I', 'S1'), ('A', '<i8'), ('B', '<f8')])
- """
- if index:
- ix_vals = [
- np.asarray(self.index.get_level_values(i))
- for i in range(self.index.nlevels)
- ]
- arrays = ix_vals + [
- np.asarray(self.iloc[:, i]) for i in range(len(self.columns))
- ]
- index_names = list(self.index.names)
- if isinstance(self.index, MultiIndex):
- index_names = com.fill_missing_names(index_names)
- elif index_names[0] is None:
- index_names = ["index"]
- names = [str(name) for name in itertools.chain(index_names, self.columns)]
- else:
- arrays = [np.asarray(self.iloc[:, i]) for i in range(len(self.columns))]
- names = [str(c) for c in self.columns]
- index_names = []
- index_len = len(index_names)
- formats = []
- for i, v in enumerate(arrays):
- index_int = i
- # When the names and arrays are collected, we
- # first collect those in the DataFrame's index,
- # followed by those in its columns.
- #
- # Thus, the total length of the array is:
- # len(index_names) + len(DataFrame.columns).
- #
- # This check allows us to see whether we are
- # handling a name / array in the index or column.
- if index_int < index_len:
- dtype_mapping = index_dtypes
- name = index_names[index_int]
- else:
- index_int -= index_len
- dtype_mapping = column_dtypes
- name = self.columns[index_int]
- # We have a dictionary, so we get the data type
- # associated with the index or column (which can
- # be denoted by its name in the DataFrame or its
- # position in the DataFrame's array of indices or
- # columns), whichever is applicable.
- if is_dict_like(dtype_mapping):
- if name in dtype_mapping:
- dtype_mapping = dtype_mapping[name]
- elif index_int in dtype_mapping:
- dtype_mapping = dtype_mapping[index_int]
- else:
- dtype_mapping = None
- # If no mapping can be found, use the array's
- # dtype attribute for formatting.
- #
- # A valid dtype must either be a type or
- # string naming a type.
- if dtype_mapping is None:
- formats.append(v.dtype)
- elif isinstance(dtype_mapping, (type, np.dtype, str)):
- # error: Argument 1 to "append" of "list" has incompatible
- # type "Union[type, dtype[Any], str]"; expected "dtype[Any]"
- formats.append(dtype_mapping) # type: ignore[arg-type]
- else:
- element = "row" if i < index_len else "column"
- msg = f"Invalid dtype {dtype_mapping} specified for {element} {name}"
- raise ValueError(msg)
- return np.rec.fromarrays(arrays, dtype={"names": names, "formats": formats})
- @classmethod
- def _from_arrays(
- cls,
- arrays,
- columns,
- index,
- dtype: Dtype | None = None,
- verify_integrity: bool = True,
- ) -> DataFrame:
- """
- Create DataFrame from a list of arrays corresponding to the columns.
- Parameters
- ----------
- arrays : list-like of arrays
- Each array in the list corresponds to one column, in order.
- columns : list-like, Index
- The column names for the resulting DataFrame.
- index : list-like, Index
- The row labels for the resulting DataFrame.
- dtype : dtype, optional
- Optional dtype to enforce for all arrays.
- verify_integrity : bool, default True
- Validate and homogenize all input. If set to False, it is assumed
- that all elements of `arrays` are actual arrays as they will be
- stored in a block (numpy ndarray or ExtensionArray), that they have
- the same length as and are aligned with the index, and that
- `columns` and `index` are already Index objects.
- Returns
- -------
- DataFrame
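- Examples
- --------
- A minimal sketch of internal usage; inputs are assumed to be
- pre-validated when ``verify_integrity=False``.
- >>> arrays = [np.array([1, 2]), np.array([3.0, 4.0])]
- >>> pd.DataFrame._from_arrays(arrays, columns=["a", "b"], index=[0, 1])
-    a    b
- 0  1  3.0
- 1  2  4.0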
- """
- if dtype is not None:
- dtype = pandas_dtype(dtype)
- manager = get_option("mode.data_manager")
- columns = ensure_index(columns)
- if len(columns) != len(arrays):
- raise ValueError("len(columns) must match len(arrays)")
- mgr = arrays_to_mgr(
- arrays,
- columns,
- index,
- dtype=dtype,
- verify_integrity=verify_integrity,
- typ=manager,
- )
- return cls(mgr)
- @doc(
- storage_options=_shared_docs["storage_options"],
- compression_options=_shared_docs["compression_options"] % "path",
- )
- def to_stata(
- self,
- path: FilePath | WriteBuffer[bytes],
- *,
- convert_dates: dict[Hashable, str] | None = None,
- write_index: bool = True,
- byteorder: str | None = None,
- time_stamp: datetime.datetime | None = None,
- data_label: str | None = None,
- variable_labels: dict[Hashable, str] | None = None,
- version: int | None = 114,
- convert_strl: Sequence[Hashable] | None = None,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
- value_labels: dict[Hashable, dict[float, str]] | None = None,
- ) -> None:
- """
- Export DataFrame object to Stata dta format.
- Writes the DataFrame to a Stata dataset file.
- "dta" files contain a Stata dataset.
- Parameters
- ----------
- path : str, path object, or buffer
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``write()`` function.
- convert_dates : dict
- Dictionary mapping columns containing datetime types to stata
- internal format to use when writing the dates. Options are 'tc',
- 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
- or a name. Datetime columns that do not have a conversion type
- specified will be converted to 'tc'. Raises NotImplementedError if
- a datetime column has timezone information.
- write_index : bool
- Write the index to Stata dataset.
- byteorder : str
- Can be ">", "<", "little", or "big". default is `sys.byteorder`.
- time_stamp : datetime
- A datetime to use as file creation date. Default is the current
- time.
- data_label : str, optional
- A label for the data set. Must be 80 characters or smaller.
- variable_labels : dict
- Dictionary containing columns as keys and variable labels as
- values. Each label must be 80 characters or smaller.
- version : {{114, 117, 118, 119, None}}, default 114
- Version to use in the output dta file. Set to None to let pandas
- decide between 118 or 119 formats depending on the number of
- columns in the frame. Version 114 can be read by Stata 10 and
- later. Version 117 can be read by Stata 13 or later. Version 118
- is supported in Stata 14 and later. Version 119 is supported in
- Stata 15 and later. Version 114 limits string variables to 244
- characters or fewer while versions 117 and later allow strings
- with lengths up to 2,000,000 characters. Versions 118 and 119
- support Unicode characters, and version 119 supports more than
- 32,767 variables.
- Version 119 should usually only be used when the number of
- variables exceeds the capacity of dta format 118. Exporting
- smaller datasets in format 119 may have unintended consequences,
- and, as of November 2020, Stata SE cannot read version 119 files.
- convert_strl : list, optional
- List of column names to convert to string columns to Stata StrL
- format. Only available if version is 117. Storing strings in the
- StrL format can produce smaller dta files if strings have more than
- 8 characters and values are repeated.
- {compression_options}
- .. versionadded:: 1.1.0
- .. versionchanged:: 1.4.0 Zstandard support.
- {storage_options}
- .. versionadded:: 1.2.0
- value_labels : dict of dicts
- Dictionary containing columns as keys and dictionaries of column value
- to labels as values. Labels for a single variable must be 32,000
- characters or smaller.
- .. versionadded:: 1.4.0
- Raises
- ------
- NotImplementedError
- * If datetimes contain timezone information
- * Column dtype is not representable in Stata
- ValueError
- * Columns listed in convert_dates are neither datetime64[ns]
- nor datetime.datetime
- * Column listed in convert_dates is not in DataFrame
- * Categorical label contains more than 32,000 characters
- See Also
- --------
- read_stata : Import Stata data files.
- io.stata.StataWriter : Low-level writer for Stata data files.
- io.stata.StataWriter117 : Low-level writer for version 117 files.
- Examples
- --------
- >>> df = pd.DataFrame({{'animal': ['falcon', 'parrot', 'falcon',
- ... 'parrot'],
- ... 'speed': [350, 18, 361, 15]}})
- >>> df.to_stata('animals.dta') # doctest: +SKIP
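- A hedged sketch of date conversion (the frame and column names below
- are illustrative; ``'td'`` is one of the documented formats):
- >>> df_dates = pd.DataFrame({{'date': pd.date_range('2020-01-01', periods=2),
- ...                          'value': [1.0, 2.0]}})
- >>> df_dates.to_stata('dates.dta',
- ...                   convert_dates={{'date': 'td'}})  # doctest: +SKIP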
- """
- if version not in (114, 117, 118, 119, None):
- raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
- if version == 114:
- if convert_strl is not None:
- raise ValueError("strl is not supported in format 114")
- from pandas.io.stata import StataWriter as statawriter
- elif version == 117:
- # Incompatible import of "statawriter" (imported name has type
- # "Type[StataWriter117]", local name has type "Type[StataWriter]")
- from pandas.io.stata import ( # type: ignore[assignment]
- StataWriter117 as statawriter,
- )
- else: # versions 118 and 119
- # Incompatible import of "statawriter" (imported name has type
- # "Type[StataWriter117]", local name has type "Type[StataWriter]")
- from pandas.io.stata import ( # type: ignore[assignment]
- StataWriterUTF8 as statawriter,
- )
- kwargs: dict[str, Any] = {}
- if version is None or version >= 117:
- # strl conversion is only supported >= 117
- kwargs["convert_strl"] = convert_strl
- if version is None or version >= 118:
- # Specifying the version is only supported for UTF8 (118 or 119)
- kwargs["version"] = version
- writer = statawriter(
- path,
- self,
- convert_dates=convert_dates,
- byteorder=byteorder,
- time_stamp=time_stamp,
- data_label=data_label,
- write_index=write_index,
- variable_labels=variable_labels,
- compression=compression,
- storage_options=storage_options,
- value_labels=value_labels,
- **kwargs,
- )
- writer.write_file()
- def to_feather(self, path: FilePath | WriteBuffer[bytes], **kwargs) -> None:
- """
- Write a DataFrame to the binary Feather format.
- Parameters
- ----------
- path : str, path object, file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``write()`` function. If a string or a path,
- it will be used as Root Directory path when writing a partitioned dataset.
- **kwargs :
- Additional keywords passed to :func:`pyarrow.feather.write_feather`.
- Starting with pyarrow 0.17, this includes the `compression`,
- `compression_level`, `chunksize` and `version` keywords.
- .. versionadded:: 1.1.0
- Notes
- -----
- This function writes the dataframe as a `feather file
- <https://arrow.apache.org/docs/python/feather.html>`_. Requires a default
- index. To save a DataFrame with a custom index, use a method that
- supports custom indices, e.g. `to_parquet`.
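- Examples
- --------
- A minimal sketch of usage (the ``compression`` keyword is forwarded to
- pyarrow, per the notes above):
- >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
- >>> df.to_feather('out.feather', compression='zstd')  # doctest: +SKIP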
- """
- from pandas.io.feather_format import to_feather
- to_feather(self, path, **kwargs)
- @doc(
- Series.to_markdown,
- klass=_shared_doc_kwargs["klass"],
- storage_options=_shared_docs["storage_options"],
- examples="""Examples
- --------
- >>> df = pd.DataFrame(
- ... data={"animal_1": ["elk", "pig"], "animal_2": ["dog", "quetzal"]}
- ... )
- >>> print(df.to_markdown())
- | | animal_1 | animal_2 |
- |---:|:-----------|:-----------|
- | 0 | elk | dog |
- | 1 | pig | quetzal |
- Output markdown with a tabulate option.
- >>> print(df.to_markdown(tablefmt="grid"))
- +----+------------+------------+
- | | animal_1 | animal_2 |
- +====+============+============+
- | 0 | elk | dog |
- +----+------------+------------+
- | 1 | pig | quetzal |
- +----+------------+------------+""",
- )
- def to_markdown(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- mode: str = "wt",
- index: bool = True,
- storage_options: StorageOptions = None,
- **kwargs,
- ) -> str | None:
- if "showindex" in kwargs:
- raise ValueError("Pass 'index' instead of 'showindex")
- kwargs.setdefault("headers", "keys")
- kwargs.setdefault("tablefmt", "pipe")
- kwargs.setdefault("showindex", index)
- tabulate = import_optional_dependency("tabulate")
- result = tabulate.tabulate(self, **kwargs)
- if buf is None:
- return result
- with get_handle(buf, mode, storage_options=storage_options) as handles:
- handles.handle.write(result)
- return None
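- # Illustrative sketch (not from the upstream docs): passing ``buf`` writes
- # the rendered table instead of returning it, e.g.
- # >>> df.to_markdown(buf="table.md", mode="wt")  # doctest: +SKIP
- # while ``buf=None`` (the default) returns the markdown string.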
- @overload
- def to_parquet(
- self,
- path: None = ...,
- engine: str = ...,
- compression: str | None = ...,
- index: bool | None = ...,
- partition_cols: list[str] | None = ...,
- storage_options: StorageOptions = ...,
- **kwargs,
- ) -> bytes:
- ...
- @overload
- def to_parquet(
- self,
- path: FilePath | WriteBuffer[bytes],
- engine: str = ...,
- compression: str | None = ...,
- index: bool | None = ...,
- partition_cols: list[str] | None = ...,
- storage_options: StorageOptions = ...,
- **kwargs,
- ) -> None:
- ...
- @doc(storage_options=_shared_docs["storage_options"])
- def to_parquet(
- self,
- path: FilePath | WriteBuffer[bytes] | None = None,
- engine: str = "auto",
- compression: str | None = "snappy",
- index: bool | None = None,
- partition_cols: list[str] | None = None,
- storage_options: StorageOptions = None,
- **kwargs,
- ) -> bytes | None:
- """
- Write a DataFrame to the binary parquet format.
- This function writes the dataframe as a `parquet file
- <https://parquet.apache.org/>`_. You can choose different parquet
- backends, and have the option of compression. See
- :ref:`the user guide <io.parquet>` for more details.
- Parameters
- ----------
- path : str, path object, file-like object, or None, default None
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a binary ``write()`` function. If None, the result is
- returned as bytes. If a string or path, it will be used as Root Directory
- path when writing a partitioned dataset.
- .. versionchanged:: 1.2.0
- Previously this was "fname"
- engine : {{'auto', 'pyarrow', 'fastparquet'}}, default 'auto'
- Parquet library to use. If 'auto', then the option
- ``io.parquet.engine`` is used. The default ``io.parquet.engine``
- behavior is to try 'pyarrow', falling back to 'fastparquet' if
- 'pyarrow' is unavailable.
- compression : {{'snappy', 'gzip', 'brotli', None}}, default 'snappy'
- Name of the compression to use. Use ``None`` for no compression.
- index : bool, default None
- If ``True``, include the dataframe's index(es) in the file output.
- If ``False``, they will not be written to the file.
- If ``None``, similar to ``True`` the dataframe's index(es)
- will be saved. However, instead of being saved as values,
- the RangeIndex will be stored as a range in the metadata so it
- doesn't require much space and is faster. Other indexes will
- be included as columns in the file output.
- partition_cols : list, optional, default None
- Column names by which to partition the dataset.
- Columns are partitioned in the order they are given.
- Must be None if path is not a string.
- {storage_options}
- .. versionadded:: 1.2.0
- **kwargs
- Additional arguments passed to the parquet library. See
- :ref:`pandas io <io.parquet>` for more details.
- Returns
- -------
- bytes if no path argument is provided else None
- See Also
- --------
- read_parquet : Read a parquet file.
- DataFrame.to_orc : Write an orc file.
- DataFrame.to_csv : Write a csv file.
- DataFrame.to_sql : Write to a sql table.
- DataFrame.to_hdf : Write to hdf.
- Notes
- -----
- This function requires either the `fastparquet
- <https://pypi.org/project/fastparquet>`_ or `pyarrow
- <https://arrow.apache.org/docs/python/>`_ library.
- Examples
- --------
- >>> df = pd.DataFrame(data={{'col1': [1, 2], 'col2': [3, 4]}})
- >>> df.to_parquet('df.parquet.gzip',
- ... compression='gzip') # doctest: +SKIP
- >>> pd.read_parquet('df.parquet.gzip') # doctest: +SKIP
- col1 col2
- 0 1 3
- 1 2 4
- If you want to get a buffer to the parquet content you can use an io.BytesIO
- object, as long as you don't use partition_cols, which creates multiple files.
- >>> import io
- >>> f = io.BytesIO()
- >>> df.to_parquet(f)
- >>> f.seek(0)
- 0
- >>> content = f.read()
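- The following is a sketch of the documented ``partition_cols`` behavior
- (the directory name is illustrative); it writes a directory tree rather
- than a single file:
- >>> df.to_parquet('dataset_root',
- ...               partition_cols=['col1'])  # doctest: +SKIP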
- """
- from pandas.io.parquet import to_parquet
- return to_parquet(
- self,
- path,
- engine,
- compression=compression,
- index=index,
- partition_cols=partition_cols,
- storage_options=storage_options,
- **kwargs,
- )
- def to_orc(
- self,
- path: FilePath | WriteBuffer[bytes] | None = None,
- *,
- engine: Literal["pyarrow"] = "pyarrow",
- index: bool | None = None,
- engine_kwargs: dict[str, Any] | None = None,
- ) -> bytes | None:
- """
- Write a DataFrame to the ORC format.
- .. versionadded:: 1.5.0
- Parameters
- ----------
- path : str, file-like object or None, default None
- If a string, it will be used as Root Directory path
- when writing a partitioned dataset. By file-like object,
- we refer to objects with a write() method, such as a file handle
- (e.g. via builtin open function). If path is None,
- a bytes object is returned.
- engine : str, default 'pyarrow'
- ORC library to use. Pyarrow must be >= 7.0.0.
- index : bool, optional
- If ``True``, include the dataframe's index(es) in the file output.
- If ``False``, they will not be written to the file.
- If ``None``, similar to ``infer``, the dataframe's index(es)
- will be saved. However, instead of being saved as values,
- the RangeIndex will be stored as a range in the metadata so it
- doesn't require much space and is faster. Other indexes will
- be included as columns in the file output.
- engine_kwargs : dict[str, Any] or None, default None
- Additional keyword arguments passed to :func:`pyarrow.orc.write_table`.
- Returns
- -------
- bytes if no path argument is provided else None
- Raises
- ------
- NotImplementedError
- Dtype of one or more columns is category, unsigned integers, interval,
- period or sparse.
- ValueError
- engine is not pyarrow.
- See Also
- --------
- read_orc : Read an ORC file.
- DataFrame.to_parquet : Write a parquet file.
- DataFrame.to_csv : Write a csv file.
- DataFrame.to_sql : Write to a sql table.
- DataFrame.to_hdf : Write to hdf.
- Notes
- -----
- * Before using this function you should read the :ref:`user guide about
- ORC <io.orc>` and :ref:`install optional dependencies <install.warn_orc>`.
- * This function requires the `pyarrow <https://arrow.apache.org/docs/python/>`_
- library.
- * For supported dtypes please refer to `supported ORC features in Arrow
- <https://arrow.apache.org/docs/cpp/orc.html#data-types>`__.
- * Currently timezones in datetime columns are not preserved when a
- dataframe is converted into ORC files.
- Examples
- --------
- >>> df = pd.DataFrame(data={'col1': [1, 2], 'col2': [4, 3]})
- >>> df.to_orc('df.orc') # doctest: +SKIP
- >>> pd.read_orc('df.orc') # doctest: +SKIP
- col1 col2
- 0 1 4
- 1 2 3
- If you want to get a buffer to the ORC content, you can write it to io.BytesIO
- >>> import io
- >>> b = io.BytesIO(df.to_orc()) # doctest: +SKIP
- >>> b.seek(0) # doctest: +SKIP
- 0
- >>> content = b.read() # doctest: +SKIP
- """
- from pandas.io.orc import to_orc
- return to_orc(
- self, path, engine=engine, index=index, engine_kwargs=engine_kwargs
- )
- @overload
- def to_html(
- self,
- buf: FilePath | WriteBuffer[str],
- columns: Sequence[Level] | None = ...,
- col_space: ColspaceArgType | None = ...,
- header: bool | Sequence[str] = ...,
- index: bool = ...,
- na_rep: str = ...,
- formatters: FormattersType | None = ...,
- float_format: FloatFormatType | None = ...,
- sparsify: bool | None = ...,
- index_names: bool = ...,
- justify: str | None = ...,
- max_rows: int | None = ...,
- max_cols: int | None = ...,
- show_dimensions: bool | str = ...,
- decimal: str = ...,
- bold_rows: bool = ...,
- classes: str | list | tuple | None = ...,
- escape: bool = ...,
- notebook: bool = ...,
- border: int | bool | None = ...,
- table_id: str | None = ...,
- render_links: bool = ...,
- encoding: str | None = ...,
- ) -> None:
- ...
- @overload
- def to_html(
- self,
- buf: None = ...,
- columns: Sequence[Level] | None = ...,
- col_space: ColspaceArgType | None = ...,
- header: bool | Sequence[str] = ...,
- index: bool = ...,
- na_rep: str = ...,
- formatters: FormattersType | None = ...,
- float_format: FloatFormatType | None = ...,
- sparsify: bool | None = ...,
- index_names: bool = ...,
- justify: str | None = ...,
- max_rows: int | None = ...,
- max_cols: int | None = ...,
- show_dimensions: bool | str = ...,
- decimal: str = ...,
- bold_rows: bool = ...,
- classes: str | list | tuple | None = ...,
- escape: bool = ...,
- notebook: bool = ...,
- border: int | bool | None = ...,
- table_id: str | None = ...,
- render_links: bool = ...,
- encoding: str | None = ...,
- ) -> str:
- ...
- @Substitution(
- header_type="bool",
- header="Whether to print column labels, default True",
- col_space_type="str or int, list or dict of int or str",
- col_space="The minimum width of each column in CSS length "
- "units. An int is assumed to be px units.",
- )
- @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring)
- def to_html(
- self,
- buf: FilePath | WriteBuffer[str] | None = None,
- columns: Sequence[Level] | None = None,
- col_space: ColspaceArgType | None = None,
- header: bool | Sequence[str] = True,
- index: bool = True,
- na_rep: str = "NaN",
- formatters: FormattersType | None = None,
- float_format: FloatFormatType | None = None,
- sparsify: bool | None = None,
- index_names: bool = True,
- justify: str | None = None,
- max_rows: int | None = None,
- max_cols: int | None = None,
- show_dimensions: bool | str = False,
- decimal: str = ".",
- bold_rows: bool = True,
- classes: str | list | tuple | None = None,
- escape: bool = True,
- notebook: bool = False,
- border: int | bool | None = None,
- table_id: str | None = None,
- render_links: bool = False,
- encoding: str | None = None,
- ) -> str | None:
- """
- Render a DataFrame as an HTML table.
- %(shared_params)s
- bold_rows : bool, default True
- Make the row labels bold in the output.
- classes : str or list or tuple, default None
- CSS class(es) to apply to the resulting html table.
- escape : bool, default True
- Convert the characters <, >, and & to HTML-safe sequences.
- notebook : {True, False}, default False
- Whether the generated HTML is for IPython Notebook.
- border : int
- A ``border=border`` attribute is included in the opening
- `<table>` tag. Default ``pd.options.display.html.border``.
- table_id : str, optional
- A css id is included in the opening `<table>` tag if specified.
- render_links : bool, default False
- Convert URLs to HTML links.
- encoding : str, default "utf-8"
- Set character encoding.
- .. versionadded:: 1.0
- %(returns)s
- See Also
- --------
- to_string : Convert DataFrame to a string.
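- Examples
- --------
- A minimal sketch of usage (the class names and id below are
- illustrative, not defaults):
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> html = df.to_html(classes='table table-striped', table_id='my-table')
- >>> print(html)  # doctest: +SKIP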
- """
- if justify is not None and justify not in fmt._VALID_JUSTIFY_PARAMETERS:
- raise ValueError("Invalid value for justify parameter")
- formatter = fmt.DataFrameFormatter(
- self,
- columns=columns,
- col_space=col_space,
- na_rep=na_rep,
- header=header,
- index=index,
- formatters=formatters,
- float_format=float_format,
- bold_rows=bold_rows,
- sparsify=sparsify,
- justify=justify,
- index_names=index_names,
- escape=escape,
- decimal=decimal,
- max_rows=max_rows,
- max_cols=max_cols,
- show_dimensions=show_dimensions,
- )
- # TODO: a generic formatter would be in DataFrameFormatter
- return fmt.DataFrameRenderer(formatter).to_html(
- buf=buf,
- classes=classes,
- notebook=notebook,
- border=border,
- encoding=encoding,
- table_id=table_id,
- render_links=render_links,
- )
- @doc(
- storage_options=_shared_docs["storage_options"],
- compression_options=_shared_docs["compression_options"] % "path_or_buffer",
- )
- def to_xml(
- self,
- path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
- index: bool = True,
- root_name: str | None = "data",
- row_name: str | None = "row",
- na_rep: str | None = None,
- attr_cols: list[str] | None = None,
- elem_cols: list[str] | None = None,
- namespaces: dict[str | None, str] | None = None,
- prefix: str | None = None,
- encoding: str = "utf-8",
- xml_declaration: bool | None = True,
- pretty_print: bool | None = True,
- parser: str | None = "lxml",
- stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
- compression: CompressionOptions = "infer",
- storage_options: StorageOptions = None,
- ) -> str | None:
- """
- Render a DataFrame to an XML document.
- .. versionadded:: 1.3.0
- Parameters
- ----------
- path_or_buffer : str, path object, file-like object, or None, default None
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a ``write()`` function. If None, the result is returned
- as a string.
- index : bool, default True
- Whether to include index in XML document.
- root_name : str, default 'data'
- The name of root element in XML document.
- row_name : str, default 'row'
- The name of row element in XML document.
- na_rep : str, optional
- Missing data representation.
- attr_cols : list-like, optional
- List of columns to write as attributes in row element.
- Hierarchical columns will be flattened with underscore
- delimiting the different levels.
- elem_cols : list-like, optional
- List of columns to write as children in row element. By default,
- all columns output as children of row element. Hierarchical
- columns will be flattened with underscore delimiting the
- different levels.
- namespaces : dict, optional
- All namespaces to be defined in root element. Keys of dict
- should be prefix names and values of dict corresponding URIs.
- Default namespaces should be given empty string key. For
- example, ::
- namespaces = {{"": "https://example.com"}}
- prefix : str, optional
- Namespace prefix to be used for every element and/or attribute
- in document. This should be one of the keys in ``namespaces``
- dict.
- encoding : str, default 'utf-8'
- Encoding of the resulting document.
- xml_declaration : bool, default True
- Whether to include the XML declaration at start of document.
- pretty_print : bool, default True
- Whether output should be pretty printed with indentation and
- line breaks.
- parser : {{'lxml','etree'}}, default 'lxml'
- Parser module to use for building of tree. Only 'lxml' and
- 'etree' are supported. With 'lxml', the ability to use XSLT
- stylesheet is supported.
- stylesheet : str, path object or file-like object, optional
- A URL, file-like object, or a raw string containing an XSLT
- script used to transform the raw XML output. Script should use
- layout of elements and attributes from original output. This
- argument requires ``lxml`` to be installed. Only XSLT 1.0
- scripts, not later versions, are currently supported.
- {compression_options}
- .. versionchanged:: 1.4.0 Zstandard support.
- {storage_options}
- Returns
- -------
- None or str
- If ``path_or_buffer`` is None, returns the resulting XML format as a
- string. Otherwise returns None.
- See Also
- --------
- to_json : Convert the pandas object to a JSON string.
- to_html : Convert DataFrame to HTML.
- Examples
- --------
- >>> df = pd.DataFrame({{'shape': ['square', 'circle', 'triangle'],
- ... 'degrees': [360, 360, 180],
- ... 'sides': [4, np.nan, 3]}})
- >>> df.to_xml() # doctest: +SKIP
- <?xml version='1.0' encoding='utf-8'?>
- <data>
- <row>
- <index>0</index>
- <shape>square</shape>
- <degrees>360</degrees>
- <sides>4.0</sides>
- </row>
- <row>
- <index>1</index>
- <shape>circle</shape>
- <degrees>360</degrees>
- <sides/>
- </row>
- <row>
- <index>2</index>
- <shape>triangle</shape>
- <degrees>180</degrees>
- <sides>3.0</sides>
- </row>
- </data>
- >>> df.to_xml(attr_cols=[
- ... 'index', 'shape', 'degrees', 'sides'
- ... ]) # doctest: +SKIP
- <?xml version='1.0' encoding='utf-8'?>
- <data>
- <row index="0" shape="square" degrees="360" sides="4.0"/>
- <row index="1" shape="circle" degrees="360"/>
- <row index="2" shape="triangle" degrees="180" sides="3.0"/>
- </data>
- >>> df.to_xml(namespaces={{"doc": "https://example.com"}},
- ... prefix="doc") # doctest: +SKIP
- <?xml version='1.0' encoding='utf-8'?>
- <doc:data xmlns:doc="https://example.com">
- <doc:row>
- <doc:index>0</doc:index>
- <doc:shape>square</doc:shape>
- <doc:degrees>360</doc:degrees>
- <doc:sides>4.0</doc:sides>
- </doc:row>
- <doc:row>
- <doc:index>1</doc:index>
- <doc:shape>circle</doc:shape>
- <doc:degrees>360</doc:degrees>
- <doc:sides/>
- </doc:row>
- <doc:row>
- <doc:index>2</doc:index>
- <doc:shape>triangle</doc:shape>
- <doc:degrees>180</doc:degrees>
- <doc:sides>3.0</doc:sides>
- </doc:row>
- </doc:data>
- """
- from pandas.io.formats.xml import (
- EtreeXMLFormatter,
- LxmlXMLFormatter,
- )
- lxml = import_optional_dependency("lxml.etree", errors="ignore")
- TreeBuilder: type[EtreeXMLFormatter] | type[LxmlXMLFormatter]
- if parser == "lxml":
- if lxml is not None:
- TreeBuilder = LxmlXMLFormatter
- else:
- raise ImportError(
- "lxml not found, please install or use the etree parser."
- )
- elif parser == "etree":
- TreeBuilder = EtreeXMLFormatter
- else:
- raise ValueError("Values for parser can only be lxml or etree.")
- xml_formatter = TreeBuilder(
- self,
- path_or_buffer=path_or_buffer,
- index=index,
- root_name=root_name,
- row_name=row_name,
- na_rep=na_rep,
- attr_cols=attr_cols,
- elem_cols=elem_cols,
- namespaces=namespaces,
- prefix=prefix,
- encoding=encoding,
- xml_declaration=xml_declaration,
- pretty_print=pretty_print,
- stylesheet=stylesheet,
- compression=compression,
- storage_options=storage_options,
- )
- return xml_formatter.write_output()
- # ----------------------------------------------------------------------
- @doc(INFO_DOCSTRING, **frame_sub_kwargs)
- def info(
- self,
- verbose: bool | None = None,
- buf: WriteBuffer[str] | None = None,
- max_cols: int | None = None,
- memory_usage: bool | str | None = None,
- show_counts: bool | None = None,
- ) -> None:
- info = DataFrameInfo(
- data=self,
- memory_usage=memory_usage,
- )
- info.render(
- buf=buf,
- max_cols=max_cols,
- verbose=verbose,
- show_counts=show_counts,
- )
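- # Illustrative sketch: ``buf`` accepts any text buffer, so the summary can
- # be captured instead of printed, e.g.
- # >>> import io
- # >>> buffer = io.StringIO()
- # >>> df.info(buf=buffer)  # doctest: +SKIP
- # >>> summary = buffer.getvalue()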
- def memory_usage(self, index: bool = True, deep: bool = False) -> Series:
- """
- Return the memory usage of each column in bytes.
- The memory usage can optionally include the contribution of
- the index and elements of `object` dtype.
- This value is displayed in `DataFrame.info` by default. This can be
- suppressed by setting ``pandas.options.display.memory_usage`` to False.
- Parameters
- ----------
- index : bool, default True
- Specifies whether to include the memory usage of the DataFrame's
- index in returned Series. If ``index=True``, the memory usage of
- the index is the first item in the output.
- deep : bool, default False
- If True, introspect the data deeply by interrogating
- `object` dtypes for system-level memory consumption, and include
- it in the returned values.
- Returns
- -------
- Series
- A Series whose index is the original column names and whose values
- are the memory usage of each column in bytes.
- See Also
- --------
- numpy.ndarray.nbytes : Total bytes consumed by the elements of an
- ndarray.
- Series.memory_usage : Bytes consumed by a Series.
- Categorical : Memory-efficient array for string values with
- many repeated values.
- DataFrame.info : Concise summary of a DataFrame.
- Notes
- -----
- See the :ref:`Frequently Asked Questions <df-memory-usage>` for more
- details.
- Examples
- --------
- >>> dtypes = ['int64', 'float64', 'complex128', 'object', 'bool']
- >>> data = dict([(t, np.ones(shape=5000, dtype=int).astype(t))
- ... for t in dtypes])
- >>> df = pd.DataFrame(data)
- >>> df.head()
- int64 float64 complex128 object bool
- 0 1 1.0 1.0+0.0j 1 True
- 1 1 1.0 1.0+0.0j 1 True
- 2 1 1.0 1.0+0.0j 1 True
- 3 1 1.0 1.0+0.0j 1 True
- 4 1 1.0 1.0+0.0j 1 True
- >>> df.memory_usage()
- Index 128
- int64 40000
- float64 40000
- complex128 80000
- object 40000
- bool 5000
- dtype: int64
- >>> df.memory_usage(index=False)
- int64 40000
- float64 40000
- complex128 80000
- object 40000
- bool 5000
- dtype: int64
- The memory footprint of `object` dtype columns is ignored by default:
- >>> df.memory_usage(deep=True)
- Index 128
- int64 40000
- float64 40000
- complex128 80000
- object 180000
- bool 5000
- dtype: int64
- Use a Categorical for efficient storage of an object-dtype column with
- many repeated values.
- >>> df['object'].astype('category').memory_usage(deep=True)
- 5244
- """
- result = self._constructor_sliced(
- [c.memory_usage(index=False, deep=deep) for col, c in self.items()],
- index=self.columns,
- dtype=np.intp,
- )
- if index:
- index_memory_usage = self._constructor_sliced(
- self.index.memory_usage(deep=deep), index=["Index"]
- )
- result = index_memory_usage._append(result)
- return result
- def transpose(self, *args, copy: bool = False) -> DataFrame:
- """
- Transpose index and columns.
- Reflect the DataFrame over its main diagonal by writing rows as columns
- and vice-versa. The property :attr:`.T` is an accessor to the method
- :meth:`transpose`.
- Parameters
- ----------
- *args : tuple, optional
- Accepted for compatibility with NumPy.
- copy : bool, default False
- Whether to copy the data after transposing, even for DataFrames
- with a single dtype.
- Note that a copy is always required for mixed dtype DataFrames,
- or for DataFrames with any extension types.
- Returns
- -------
- DataFrame
- The transposed DataFrame.
- See Also
- --------
- numpy.transpose : Permute the dimensions of a given array.
- Notes
- -----
- Transposing a DataFrame with mixed dtypes will result in a homogeneous
- DataFrame with the `object` dtype. In such a case, a copy of the data
- is always made.
- Examples
- --------
- **Square DataFrame with homogeneous dtype**
- >>> d1 = {'col1': [1, 2], 'col2': [3, 4]}
- >>> df1 = pd.DataFrame(data=d1)
- >>> df1
- col1 col2
- 0 1 3
- 1 2 4
- >>> df1_transposed = df1.T # or df1.transpose()
- >>> df1_transposed
- 0 1
- col1 1 2
- col2 3 4
- When the dtype is homogeneous in the original DataFrame, we get a
- transposed DataFrame with the same dtype:
- >>> df1.dtypes
- col1 int64
- col2 int64
- dtype: object
- >>> df1_transposed.dtypes
- 0 int64
- 1 int64
- dtype: object
- **Non-square DataFrame with mixed dtypes**
- >>> d2 = {'name': ['Alice', 'Bob'],
- ... 'score': [9.5, 8],
- ... 'employed': [False, True],
- ... 'kids': [0, 0]}
- >>> df2 = pd.DataFrame(data=d2)
- >>> df2
- name score employed kids
- 0 Alice 9.5 False 0
- 1 Bob 8.0 True 0
- >>> df2_transposed = df2.T # or df2.transpose()
- >>> df2_transposed
- 0 1
- name Alice Bob
- score 9.5 8.0
- employed False True
- kids 0 0
- When the DataFrame has mixed dtypes, we get a transposed DataFrame with
- the `object` dtype:
- >>> df2.dtypes
- name object
- score float64
- employed bool
- kids int64
- dtype: object
- >>> df2_transposed.dtypes
- 0 object
- 1 object
- dtype: object
- """
- nv.validate_transpose(args, {})
- # construct the args
- dtypes = list(self.dtypes)
- if self._can_fast_transpose:
- # Note: tests pass without this, but this improves perf quite a bit.
- new_vals = self._values.T
- if copy and not using_copy_on_write():
- new_vals = new_vals.copy()
- result = self._constructor(
- new_vals, index=self.columns, columns=self.index, copy=False
- )
- if using_copy_on_write() and len(self) > 0:
- result._mgr.add_references(self._mgr) # type: ignore[arg-type]
- elif (
- self._is_homogeneous_type and dtypes and is_extension_array_dtype(dtypes[0])
- ):
- # We have EAs with the same dtype. We can preserve that dtype in transpose.
- dtype = dtypes[0]
- arr_type = dtype.construct_array_type()
- values = self.values
- new_values = [arr_type._from_sequence(row, dtype=dtype) for row in values]
- result = type(self)._from_arrays(
- new_values, index=self.columns, columns=self.index
- )
- else:
- new_arr = self.values.T
- if copy and not using_copy_on_write():
- new_arr = new_arr.copy()
- result = self._constructor(
- new_arr,
- index=self.columns,
- columns=self.index,
- # We already made a copy (more than one block)
- copy=False,
- )
- return result.__finalize__(self, method="transpose")
- @property
- def T(self) -> DataFrame:
- """
- The transpose of the DataFrame.
- Returns
- -------
- DataFrame
- The transposed DataFrame.
- See Also
- --------
- DataFrame.transpose : Transpose index and columns.
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df
- col1 col2
- 0 1 3
- 1 2 4
- >>> df.T
- 0 1
- col1 1 2
- col2 3 4
- """
- return self.transpose()
- # ----------------------------------------------------------------------
- # Indexing Methods
- def _ixs(self, i: int, axis: AxisInt = 0) -> Series:
- """
- Parameters
- ----------
- i : int
- axis : int
- Returns
- -------
- Series
- """
- # irow
- if axis == 0:
- new_mgr = self._mgr.fast_xs(i)
- # if we are a copy, mark as such
- copy = isinstance(new_mgr.array, np.ndarray) and new_mgr.array.base is None
- result = self._constructor_sliced(new_mgr, name=self.index[i]).__finalize__(
- self
- )
- result._set_is_copy(self, copy=copy)
- return result
- # icol
- else:
- label = self.columns[i]
- col_mgr = self._mgr.iget(i)
- result = self._box_col_values(col_mgr, i)
- # this is a cached value, mark it so
- result._set_as_cached(label, self)
- return result
- def _get_column_array(self, i: int) -> ArrayLike:
- """
- Get the values of the i'th column (ndarray or ExtensionArray, as stored
- in the Block)
- Warning! The returned array is a view but doesn't handle Copy-on-Write,
- so this should be used with caution (for read-only purposes).
- """
- return self._mgr.iget_values(i)
- def _iter_column_arrays(self) -> Iterator[ArrayLike]:
- """
- Iterate over the arrays of all columns in order.
- This returns the values as stored in the Block (ndarray or ExtensionArray).
- Warning! The returned array is a view but doesn't handle Copy-on-Write,
- so this should be used with caution (for read-only purposes).
- """
- for i in range(len(self.columns)):
- yield self._get_column_array(i)
- def _getitem_nocopy(self, key: list):
- """
- Behaves like __getitem__, but returns a view in cases where __getitem__
- would make a copy.
- """
- # TODO(CoW): can be removed if/when we are always Copy-on-Write
- indexer = self.columns._get_indexer_strict(key, "columns")[1]
- new_axis = self.columns[indexer]
- new_mgr = self._mgr.reindex_indexer(
- new_axis,
- indexer,
- axis=0,
- allow_dups=True,
- copy=False,
- only_slice=True,
- )
- return self._constructor(new_mgr)
- def __getitem__(self, key):
- check_dict_or_set_indexers(key)
- key = lib.item_from_zerodim(key)
- key = com.apply_if_callable(key, self)
- if is_hashable(key) and not is_iterator(key):
- # is_iterator to exclude generator e.g. test_getitem_listlike
- # shortcut if the key is in columns
- is_mi = isinstance(self.columns, MultiIndex)
- # GH#45316 Return view if key is not duplicated
- # Only use drop_duplicates with duplicates for performance
- if not is_mi and (
- self.columns.is_unique
- and key in self.columns
- or key in self.columns.drop_duplicates(keep=False)
- ):
- return self._get_item_cache(key)
- elif is_mi and self.columns.is_unique and key in self.columns:
- return self._getitem_multilevel(key)
- # Do we have a slicer (on rows)?
- if isinstance(key, slice):
- indexer = self.index._convert_slice_indexer(key, kind="getitem")
- if isinstance(indexer, np.ndarray):
- # reachable with DatetimeIndex
- indexer = lib.maybe_indices_to_slice(
- indexer.astype(np.intp, copy=False), len(self)
- )
- if isinstance(indexer, np.ndarray):
- # GH#43223 If we can not convert, use take
- return self.take(indexer, axis=0)
- return self._slice(indexer, axis=0)
- # Do we have a (boolean) DataFrame?
- if isinstance(key, DataFrame):
- return self.where(key)
- # Do we have a (boolean) 1d indexer?
- if com.is_bool_indexer(key):
- return self._getitem_bool_array(key)
- # We are left with two options: a single key, and a collection of keys,
- # We interpret tuples as collections only for non-MultiIndex
- is_single_key = isinstance(key, tuple) or not is_list_like(key)
- if is_single_key:
- if self.columns.nlevels > 1:
- return self._getitem_multilevel(key)
- indexer = self.columns.get_loc(key)
- if is_integer(indexer):
- indexer = [indexer]
- else:
- if is_iterator(key):
- key = list(key)
- indexer = self.columns._get_indexer_strict(key, "columns")[1]
- # take() does not accept boolean indexers
- if getattr(indexer, "dtype", None) == bool:
- indexer = np.where(indexer)[0]
- data = self._take_with_is_copy(indexer, axis=1)
- if is_single_key:
- # What does looking for a single key in a non-unique index return?
- # The behavior is inconsistent. It returns a Series, except when
- # - the key itself is repeated (test on data.shape, #9519), or
- # - we have a MultiIndex on columns (test on self.columns, #21309)
- if data.shape[1] == 1 and not isinstance(self.columns, MultiIndex):
- # GH#26490 using data[key] can cause RecursionError
- return data._get_item_cache(key)
- return data
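- # Illustrative sketch of the dispatch above (labels are hypothetical):
- # >>> df["A"]            # single column label -> Series (cached path)
- # >>> df[["A", "B"]]     # list of labels -> DataFrame via take()
- # >>> df[0:2]            # slice -> row slicing
- # >>> df[df["A"] > 0]    # boolean 1d indexer -> row mask
- # >>> df[df > 0]         # boolean DataFrame -> self.where(key)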
- def _getitem_bool_array(self, key):
- # also raises Exception if object array with NA values
- # warning here just in case -- previously __setitem__ was
- # reindexing but __getitem__ was not; it seems more reasonable to
- # go with the __setitem__ behavior since that is more consistent
- # with all other indexing behavior
- if isinstance(key, Series) and not key.index.equals(self.index):
- warnings.warn(
- "Boolean Series key will be reindexed to match DataFrame index.",
- UserWarning,
- stacklevel=find_stack_level(),
- )
- elif len(key) != len(self.index):
- raise ValueError(
- f"Item wrong length {len(key)} instead of {len(self.index)}."
- )
- # check_bool_indexer will throw exception if Series key cannot
- # be reindexed to match DataFrame rows
- key = check_bool_indexer(self.index, key)
- if key.all():
- return self.copy(deep=None)
- indexer = key.nonzero()[0]
- return self._take_with_is_copy(indexer, axis=0)
- def _getitem_multilevel(self, key):
- # self.columns is a MultiIndex
- loc = self.columns.get_loc(key)
- if isinstance(loc, (slice, np.ndarray)):
- new_columns = self.columns[loc]
- result_columns = maybe_droplevels(new_columns, key)
- result = self.iloc[:, loc]
- result.columns = result_columns
- # If there is only one column being returned, and its name is
- # either an empty string, or a tuple with an empty string as its
- # first element, then treat the empty string as a placeholder
- # and return the column as if the user had provided that empty
- # string in the key. If the result is a Series, exclude the
- # implied empty string from its name.
- if len(result.columns) == 1:
- # e.g. test_frame_getitem_multicolumn_empty_level,
- # test_frame_mixed_depth_get, test_loc_setitem_single_column_slice
- top = result.columns[0]
- if isinstance(top, tuple):
- top = top[0]
- if top == "":
- result = result[""]
- if isinstance(result, Series):
- result = self._constructor_sliced(
- result, index=self.index, name=key
- )
- result._set_is_copy(self)
- return result
- else:
- # loc is neither a slice nor ndarray, so must be an int
- return self._ixs(loc, axis=1)
- def _get_value(self, index, col, takeable: bool = False) -> Scalar:
- """
- Quickly retrieve single value at passed column and index.
- Parameters
- ----------
- index : row label
- col : column label
- takeable : interpret the index/col as indexers, default False
- Returns
- -------
- scalar
- Notes
- -----
- Assumes that both `self.index._index_as_unique` and
- `self.columns._index_as_unique` hold; the caller is responsible
- for checking.
- """
- if takeable:
- series = self._ixs(col, axis=1)
- return series._values[index]
- series = self._get_item_cache(col)
- engine = self.index._engine
- if not isinstance(self.index, MultiIndex):
- # CategoricalIndex: Trying to use the engine fastpath may give incorrect
- # results if our categories are integers that dont match our codes
- # IntervalIndex: IntervalTree has no get_loc
- row = self.index.get_loc(index)
- return series._values[row]
- # For MultiIndex going through engine effectively restricts us to
- # same-length tuples; see test_get_set_value_no_partial_indexing
- loc = engine.get_loc(index)
- return series._values[loc]
- def isetitem(self, loc, value) -> None:
- """
- Set the given value in the column with position `loc`.
- This is a positional analogue to ``__setitem__``.
- Parameters
- ----------
- loc : int or sequence of ints
- Index position for the column.
- value : scalar or arraylike
- Value(s) for the column.
- Notes
- -----
- ``frame.isetitem(loc, value)`` is an in-place method as it will
- modify the DataFrame in place (not returning a new object). In contrast to
- ``frame.iloc[:, i] = value`` which will try to update the existing values in
- place, ``frame.isetitem(loc, value)`` will not update the values of the column
- itself in place, it will instead insert a new array.
- In cases where ``frame.columns`` is unique, this is equivalent to
- ``frame[frame.columns[i]] = value``.
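- Examples
- --------
- A minimal sketch: replacing the array backing column position 0.
- >>> df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
- >>> df.isetitem(0, [10, 20])
- >>> df
-     a  b
- 0  10  3
- 1  20  4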
- """
- if isinstance(value, DataFrame):
- if is_scalar(loc):
- loc = [loc]
- for i, idx in enumerate(loc):
- arraylike = self._sanitize_column(value.iloc[:, i])
- self._iset_item_mgr(idx, arraylike, inplace=False)
- return
- arraylike = self._sanitize_column(value)
- self._iset_item_mgr(loc, arraylike, inplace=False)
- def __setitem__(self, key, value):
- if not PYPY and using_copy_on_write():
- if sys.getrefcount(self) <= 3:
- warnings.warn(
- _chained_assignment_msg, ChainedAssignmentError, stacklevel=2
- )
- key = com.apply_if_callable(key, self)
- # see if we can slice the rows
- if isinstance(key, slice):
- slc = self.index._convert_slice_indexer(key, kind="getitem")
- return self._setitem_slice(slc, value)
- if isinstance(key, DataFrame) or getattr(key, "ndim", None) == 2:
- self._setitem_frame(key, value)
- elif isinstance(key, (Series, np.ndarray, list, Index)):
- self._setitem_array(key, value)
- elif isinstance(value, DataFrame):
- self._set_item_frame_value(key, value)
- elif (
- is_list_like(value)
- and not self.columns.is_unique
- and 1 < len(self.columns.get_indexer_for([key])) == len(value)
- ):
- # Column to set is duplicated
- self._setitem_array([key], value)
- else:
- # set column
- self._set_item(key, value)
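- # A parallel sketch for assignment (labels are hypothetical):
- # >>> df["C"] = 1                # scalar -> new or replaced column
- # >>> df[["A", "B"]] = other     # list-like key -> _setitem_array
- # >>> df[df > 0] = 0             # boolean frame -> _setitem_frame
- # >>> df[0:2] = value            # slice -> positional row assignment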
- def _setitem_slice(self, key: slice, value) -> None:
- # NB: we can't just use self.loc[key] = value because that
- # operates on labels and we need to operate positional for
- # backwards-compat, xref GH#31469
- self._check_setitem_copy()
- self.iloc[key] = value
- def _setitem_array(self, key, value):
- # also raises Exception if object array with NA values
- if com.is_bool_indexer(key):
- # bool indexer is indexing along rows
- if len(key) != len(self.index):
- raise ValueError(
- f"Item wrong length {len(key)} instead of {len(self.index)}!"
- )
- key = check_bool_indexer(self.index, key)
- indexer = key.nonzero()[0]
- self._check_setitem_copy()
- if isinstance(value, DataFrame):
- # GH#39931 reindex since iloc does not align
- value = value.reindex(self.index.take(indexer))
- self.iloc[indexer] = value
- else:
- # Note: unlike self.iloc[:, indexer] = value, this will
- # never try to overwrite values inplace
- if isinstance(value, DataFrame):
- check_key_length(self.columns, key, value)
- for k1, k2 in zip(key, value.columns):
- self[k1] = value[k2]
- elif not is_list_like(value):
- for col in key:
- self[col] = value
- elif isinstance(value, np.ndarray) and value.ndim == 2:
- self._iset_not_inplace(key, value)
- elif np.ndim(value) > 1:
- # list of lists
- value = DataFrame(value).values
- return self._setitem_array(key, value)
- else:
- self._iset_not_inplace(key, value)
- def _iset_not_inplace(self, key, value):
- # GH#39510 when setting with df[key] = obj with a list-like key and
- # list-like value, we iterate over those listlikes and set columns
- # one at a time. This is different from dispatching to
- # `self.loc[:, key]= value` because loc.__setitem__ may overwrite
- # data inplace, whereas this will insert new arrays.
- def igetitem(obj, i: int):
- # Note: we catch DataFrame obj before getting here, but
- # hypothetically would return obj.iloc[:, i]
- if isinstance(obj, np.ndarray):
- return obj[..., i]
- else:
- return obj[i]
- if self.columns.is_unique:
- if np.shape(value)[-1] != len(key):
- raise ValueError("Columns must be same length as key")
- for i, col in enumerate(key):
- self[col] = igetitem(value, i)
- else:
- ilocs = self.columns.get_indexer_non_unique(key)[0]
- if (ilocs < 0).any():
- # key entries not in self.columns
- raise NotImplementedError
- if np.shape(value)[-1] != len(ilocs):
- raise ValueError("Columns must be same length as key")
- assert np.ndim(value) <= 2
- orig_columns = self.columns
- # Using self.iloc[:, i] = ... may set values inplace, which
- # by convention we do not do in __setitem__
- try:
- self.columns = Index(range(len(self.columns)))
- for i, iloc in enumerate(ilocs):
- self[iloc] = igetitem(value, i)
- finally:
- self.columns = orig_columns
- def _setitem_frame(self, key, value):
- # support boolean setting with DataFrame input, e.g.
- # df[df > df2] = 0
- if isinstance(key, np.ndarray):
- if key.shape != self.shape:
- raise ValueError("Array conditional must be same shape as self")
- key = self._constructor(key, **self._construct_axes_dict(), copy=False)
- if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes):
- raise TypeError(
- "Must pass DataFrame or 2-d ndarray with boolean values only"
- )
- self._check_inplace_setting(value)
- self._check_setitem_copy()
- self._where(-key, value, inplace=True)
- def _set_item_frame_value(self, key, value: DataFrame) -> None:
- self._ensure_valid_index(value)
- # align columns
- if key in self.columns:
- loc = self.columns.get_loc(key)
- cols = self.columns[loc]
- len_cols = 1 if is_scalar(cols) or isinstance(cols, tuple) else len(cols)
- if len_cols != len(value.columns):
- raise ValueError("Columns must be same length as key")
- # align right-hand-side columns if self.columns
- # is multi-index and self[key] is a sub-frame
- if isinstance(self.columns, MultiIndex) and isinstance(
- loc, (slice, Series, np.ndarray, Index)
- ):
- cols_droplevel = maybe_droplevels(cols, key)
- if len(cols_droplevel) and not cols_droplevel.equals(value.columns):
- value = value.reindex(cols_droplevel, axis=1)
- for col, col_droplevel in zip(cols, cols_droplevel):
- self[col] = value[col_droplevel]
- return
- if is_scalar(cols):
- self[cols] = value[value.columns[0]]
- return
- # now align rows
- arraylike = _reindex_for_setitem(value, self.index)
- self._set_item_mgr(key, arraylike)
- return
- if len(value.columns) != 1:
- raise ValueError(
- "Cannot set a DataFrame with multiple columns to the single "
- f"column {key}"
- )
- self[key] = value[value.columns[0]]
- def _iset_item_mgr(
- self, loc: int | slice | np.ndarray, value, inplace: bool = False
- ) -> None:
- # when called from _set_item_mgr loc can be anything returned from get_loc
- self._mgr.iset(loc, value, inplace=inplace)
- self._clear_item_cache()
- def _set_item_mgr(self, key, value: ArrayLike) -> None:
- try:
- loc = self._info_axis.get_loc(key)
- except KeyError:
- # This item wasn't present, just insert at end
- self._mgr.insert(len(self._info_axis), key, value)
- else:
- self._iset_item_mgr(loc, value)
- # check if we are modifying a copy
- # try to set first as we want an invalid
- # value exception to occur first
- if len(self):
- self._check_setitem_copy()
- def _iset_item(self, loc: int, value) -> None:
- arraylike = self._sanitize_column(value)
- self._iset_item_mgr(loc, arraylike, inplace=True)
- # check if we are modifying a copy
- # try to set first as we want an invalid
- # value exception to occur first
- if len(self):
- self._check_setitem_copy()
- def _set_item(self, key, value) -> None:
- """
- Add series to DataFrame in specified column.
- If series is a numpy array (not a Series/TimeSeries), it must be the
- same length as the DataFrame's index or an error will be thrown.
- Series/TimeSeries will be conformed to the DataFrame's index to
- ensure homogeneity.
- """
- value = self._sanitize_column(value)
- if (
- key in self.columns
- and value.ndim == 1
- and not is_extension_array_dtype(value)
- ):
- # broadcast across multiple columns if necessary
- if not self.columns.is_unique or isinstance(self.columns, MultiIndex):
- existing_piece = self[key]
- if isinstance(existing_piece, DataFrame):
- value = np.tile(value, (len(existing_piece.columns), 1)).T
- self._set_item_mgr(key, value)
- def _set_value(
- self, index: IndexLabel, col, value: Scalar, takeable: bool = False
- ) -> None:
- """
- Put single value at passed column and index.
- Parameters
- ----------
- index : Label
- row label
- col : Label
- column label
- value : scalar
- takeable : bool, default False
- Sets whether or not index/col interpreted as indexers
- """
- try:
- if takeable:
- icol = col
- iindex = cast(int, index)
- else:
- icol = self.columns.get_loc(col)
- iindex = self.index.get_loc(index)
- self._mgr.column_setitem(icol, iindex, value, inplace_only=True)
- self._clear_item_cache()
- except (KeyError, TypeError, ValueError, LossySetitemError):
- # get_loc might raise a KeyError for missing labels (falling back
- # to (i)loc will do expansion of the index)
- # column_setitem will do validation that may raise TypeError,
- # ValueError, or LossySetitemError
- # set using a non-recursive method & reset the cache
- if takeable:
- self.iloc[index, col] = value
- else:
- self.loc[index, col] = value
- self._item_cache.pop(col, None)
- except InvalidIndexError as ii_err:
- # GH48729: Seems like you are trying to assign a value to a
- # row when only scalar options are permitted
- raise InvalidIndexError(
- f"You can only assign a scalar value not a {type(value)}"
- ) from ii_err
- def _ensure_valid_index(self, value) -> None:
- """
- Ensure that if we don't have an index, that we can create one from the
- passed value.
- """
- # GH5632, make sure that we are a Series convertible
- if not len(self.index) and is_list_like(value) and len(value):
- if not isinstance(value, DataFrame):
- try:
- value = Series(value)
- except (ValueError, NotImplementedError, TypeError) as err:
- raise ValueError(
- "Cannot set a frame with no defined index "
- "and a value that cannot be converted to a Series"
- ) from err
- # GH31368 preserve name of index
- index_copy = value.index.copy()
- if self.index.name is not None:
- index_copy.name = self.index.name
- self._mgr = self._mgr.reindex_axis(index_copy, axis=1, fill_value=np.nan)
- def _box_col_values(self, values: SingleDataManager, loc: int) -> Series:
- """
- Provide boxed values for a column.
- """
- # Lookup in columns so that if e.g. a str datetime was passed
- # we attach the Timestamp object as the name.
- name = self.columns[loc]
- klass = self._constructor_sliced
- # We get index=self.index bc values is a SingleDataManager
- return klass(values, name=name, fastpath=True).__finalize__(self)
- # ----------------------------------------------------------------------
- # Lookup Caching
- def _clear_item_cache(self) -> None:
- self._item_cache.clear()
- def _get_item_cache(self, item: Hashable) -> Series:
- """Return the cached item, item represents a label indexer."""
- if using_copy_on_write():
- loc = self.columns.get_loc(item)
- return self._ixs(loc, axis=1)
- cache = self._item_cache
- res = cache.get(item)
- if res is None:
- # All places that call _get_item_cache have unique columns,
- # pending resolution of GH#33047
- loc = self.columns.get_loc(item)
- res = self._ixs(loc, axis=1)
- cache[item] = res
- # for a chain
- res._is_copy = self._is_copy
- return res
- def _reset_cacher(self) -> None:
- # no-op for DataFrame
- pass
- def _maybe_cache_changed(self, item, value: Series, inplace: bool) -> None:
- """
- The object has called back to us saying maybe it has changed.
- """
- loc = self._info_axis.get_loc(item)
- arraylike = value._values
- old = self._ixs(loc, axis=1)
- if old._values is value._values and inplace:
- # GH#46149 avoid making unnecessary copies/block-splitting
- return
- self._mgr.iset(loc, arraylike, inplace=inplace)
- # ----------------------------------------------------------------------
- # Unsorted
- @overload
- def query(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> DataFrame:
- ...
- @overload
- def query(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
- ...
- @overload
- def query(self, expr: str, *, inplace: bool = ..., **kwargs) -> DataFrame | None:
- ...
- def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | None:
- """
- Query the columns of a DataFrame with a boolean expression.
- Parameters
- ----------
- expr : str
- The query string to evaluate.
- You can refer to variables
- in the environment by prefixing them with an '@' character like
- ``@a + b``.
- You can refer to column names that are not valid Python variable names
- by surrounding them in backticks. Thus, column names containing spaces
- or punctuation (besides underscores) or starting with digits must be
- surrounded by backticks. (For example, a column named "Area (cm^2)" would
- be referenced as ```Area (cm^2)```). Column names which are Python keywords
- (like "list", "for", "import", etc) cannot be used.
- For example, if one of your columns is called ``a a`` and you want
- to sum it with ``b``, your query should be ```a a` + b``.
- inplace : bool
- Whether to modify the DataFrame rather than creating a new one.
- **kwargs
- See the documentation for :func:`eval` for complete details
- on the keyword arguments accepted by :meth:`DataFrame.query`.
- Returns
- -------
- DataFrame or None
- DataFrame resulting from the provided query expression or
- None if ``inplace=True``.
- See Also
- --------
- eval : Evaluate a string describing operations on
- DataFrame columns.
- DataFrame.eval : Evaluate a string describing operations on
- DataFrame columns.
- Notes
- -----
- The result of the evaluation of this expression is first passed to
- :attr:`DataFrame.loc` and if that fails because of a
- multidimensional key (e.g., a DataFrame) then the result will be passed
- to :meth:`DataFrame.__getitem__`.
- This method uses the top-level :func:`eval` function to
- evaluate the passed query.
- The :meth:`~pandas.DataFrame.query` method uses a slightly
- modified Python syntax by default. For example, the ``&`` and ``|``
- (bitwise) operators have the precedence of their boolean cousins,
- :keyword:`and` and :keyword:`or`. This *is* syntactically valid Python,
- however the semantics are different.
- You can change the semantics of the expression by passing the keyword
- argument ``parser='python'``. This enforces the same semantics as
- evaluation in Python space. Likewise, you can pass ``engine='python'``
- to evaluate an expression using Python itself as a backend. This is not
- recommended as it is inefficient compared to using ``numexpr`` as the
- engine.
- The :attr:`DataFrame.index` and
- :attr:`DataFrame.columns` attributes of the
- :class:`~pandas.DataFrame` instance are placed in the query namespace
- by default, which allows you to treat both the index and columns of the
- frame as a column in the frame.
- The identifier ``index`` is used for the frame index; you can also
- use the name of the index to identify it in a query. Please note that
- Python keywords may not be used as identifiers.
- For further details and examples see the ``query`` documentation in
- :ref:`indexing <indexing.query>`.
- *Backtick quoted variables*
- Backtick quoted variables are parsed as literal Python code and
- are converted internally to a valid Python identifier.
- This can lead to the following problems.
- During parsing a number of disallowed characters inside the backtick
- quoted string are replaced by strings that are allowed as a Python identifier.
- These characters include all operators in Python, the space character, the
- question mark, the exclamation mark, the dollar sign, and the euro sign.
- For other characters that fall outside the ASCII range (U+0001..U+007F)
- and those that are not further specified in PEP 3131,
- the query parser will raise an error.
- This excludes whitespace other than the space character,
- as well as the hash character (as it is used for comments) and the
- backtick itself (the backtick cannot be escaped).
- In a special case, quotes that make a pair around a backtick can
- confuse the parser.
- For example, ```it's` > `that's``` will raise an error,
- as it forms a quoted string (``'s > `that'``) with a backtick inside.
- See also the Python documentation about lexical analysis
- (https://docs.python.org/3/reference/lexical_analysis.html)
- in combination with the source code in :mod:`pandas.core.computation.parsing`.
- Examples
- --------
- >>> df = pd.DataFrame({'A': range(1, 6),
- ... 'B': range(10, 0, -2),
- ... 'C C': range(10, 5, -1)})
- >>> df
- A B C C
- 0 1 10 10
- 1 2 8 9
- 2 3 6 8
- 3 4 4 7
- 4 5 2 6
- >>> df.query('A > B')
- A B C C
- 4 5 2 6
- The previous expression is equivalent to
- >>> df[df.A > df.B]
- A B C C
- 4 5 2 6
- For columns with spaces in their name, you can use backtick quoting.
- >>> df.query('B == `C C`')
- A B C C
- 0 1 10 10
- The previous expression is equivalent to
- >>> df[df.B == df['C C']]
- A B C C
- 0 1 10 10
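- Variables in the local environment can be referenced by prefixing
- them with ``@`` (an illustrative sketch; ``threshold`` here is a
- hypothetical local variable, not a column):
- >>> threshold = 4
- >>> df.query('A > @threshold')
- A B C C
- 4 5 2 6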
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- if not isinstance(expr, str):
- msg = f"expr must be a string to be evaluated, {type(expr)} given"
- raise ValueError(msg)
- kwargs["level"] = kwargs.pop("level", 0) + 1
- kwargs["target"] = None
- res = self.eval(expr, **kwargs)
- try:
- result = self.loc[res]
- except ValueError:
- # when res is multi-dimensional loc raises, but this is sometimes a
- # valid query
- result = self[res]
- if inplace:
- self._update_inplace(result)
- return None
- else:
- return result
- @overload
- def eval(self, expr: str, *, inplace: Literal[False] = ..., **kwargs) -> Any:
- ...
- @overload
- def eval(self, expr: str, *, inplace: Literal[True], **kwargs) -> None:
- ...
- def eval(self, expr: str, *, inplace: bool = False, **kwargs) -> Any | None:
- """
- Evaluate a string describing operations on DataFrame columns.
- Operates on columns only, not specific rows or elements. This allows
- `eval` to run arbitrary code, which can make you vulnerable to code
- injection if you pass user input to this function.
- Parameters
- ----------
- expr : str
- The expression string to evaluate.
- inplace : bool, default False
- If the expression contains an assignment, whether to perform the
- operation inplace and mutate the existing DataFrame. Otherwise,
- a new DataFrame is returned.
- **kwargs
- See the documentation for :func:`eval` for complete details
- on the keyword arguments accepted by
- :meth:`~pandas.DataFrame.query`.
- Returns
- -------
- ndarray, scalar, pandas object, or None
- The result of the evaluation or None if ``inplace=True``.
- See Also
- --------
- DataFrame.query : Evaluates a boolean expression to query the columns
- of a frame.
- DataFrame.assign : Can evaluate an expression or function to create new
- values for a column.
- eval : Evaluate a Python expression as a string using various
- backends.
- Notes
- -----
- For more details see the API documentation for :func:`~eval`.
- For detailed examples see :ref:`enhancing performance with eval
- <enhancingperf.eval>`.
- Examples
- --------
- >>> df = pd.DataFrame({'A': range(1, 6), 'B': range(10, 0, -2)})
- >>> df
- A B
- 0 1 10
- 1 2 8
- 2 3 6
- 3 4 4
- 4 5 2
- >>> df.eval('A + B')
- 0 11
- 1 10
- 2 9
- 3 8
- 4 7
- dtype: int64
- Assignment is allowed, though by default the original DataFrame is not
- modified.
- >>> df.eval('C = A + B')
- A B C
- 0 1 10 11
- 1 2 8 10
- 2 3 6 9
- 3 4 4 8
- 4 5 2 7
- >>> df
- A B
- 0 1 10
- 1 2 8
- 2 3 6
- 3 4 4
- 4 5 2
- Multiple columns can be assigned to using multi-line expressions:
- >>> df.eval(
- ... '''
- ... C = A + B
- ... D = A - B
- ... '''
- ... )
- A B C D
- 0 1 10 11 -9
- 1 2 8 10 -6
- 2 3 6 9 -3
- 3 4 4 8 0
- 4 5 2 7 3
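- With ``inplace=True`` the assignment mutates the original DataFrame
- and ``None`` is returned (a sketch of the documented ``inplace``
- behavior):
- >>> df.eval('C = A + B', inplace=True)
- >>> df
- A B C
- 0 1 10 11
- 1 2 8 10
- 2 3 6 9
- 3 4 4 8
- 4 5 2 7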
- """
- from pandas.core.computation.eval import eval as _eval
- inplace = validate_bool_kwarg(inplace, "inplace")
- kwargs["level"] = kwargs.pop("level", 0) + 1
- index_resolvers = self._get_index_resolvers()
- column_resolvers = self._get_cleaned_column_resolvers()
- resolvers = column_resolvers, index_resolvers
- if "target" not in kwargs:
- kwargs["target"] = self
- kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
- return _eval(expr, inplace=inplace, **kwargs)
- def select_dtypes(self, include=None, exclude=None) -> DataFrame:
- """
- Return a subset of the DataFrame's columns based on the column dtypes.
- Parameters
- ----------
- include, exclude : scalar or list-like
- A selection of dtypes or strings to be included/excluded. At least
- one of these parameters must be supplied.
- Returns
- -------
- DataFrame
- The subset of the frame including the dtypes in ``include`` and
- excluding the dtypes in ``exclude``.
- Raises
- ------
- ValueError
- * If both of ``include`` and ``exclude`` are empty
- * If ``include`` and ``exclude`` have overlapping elements
- * If any kind of string dtype is passed in.
- See Also
- --------
- DataFrame.dtypes: Return Series with the data type of each column.
- Notes
- -----
- * To select all *numeric* types, use ``np.number`` or ``'number'``
- * To select strings you must use the ``object`` dtype, but note that
- this will return *all* object dtype columns
- * See the `numpy dtype hierarchy
- <https://numpy.org/doc/stable/reference/arrays.scalars.html>`__
- * To select datetimes, use ``np.datetime64``, ``'datetime'`` or
- ``'datetime64'``
- * To select timedeltas, use ``np.timedelta64``, ``'timedelta'`` or
- ``'timedelta64'``
- * To select Pandas categorical dtypes, use ``'category'``
- * To select Pandas datetimetz dtypes, use ``'datetimetz'`` (new in
- 0.20.0) or ``'datetime64[ns, tz]'``
- Examples
- --------
- >>> df = pd.DataFrame({'a': [1, 2] * 3,
- ... 'b': [True, False] * 3,
- ... 'c': [1.0, 2.0] * 3})
- >>> df
- a b c
- 0 1 True 1.0
- 1 2 False 2.0
- 2 1 True 1.0
- 3 2 False 2.0
- 4 1 True 1.0
- 5 2 False 2.0
- >>> df.select_dtypes(include='bool')
- b
- 0 True
- 1 False
- 2 True
- 3 False
- 4 True
- 5 False
- >>> df.select_dtypes(include=['float64'])
- c
- 0 1.0
- 1 2.0
- 2 1.0
- 3 2.0
- 4 1.0
- 5 2.0
- >>> df.select_dtypes(exclude=['int64'])
- b c
- 0 True 1.0
- 1 False 2.0
- 2 True 1.0
- 3 False 2.0
- 4 True 1.0
- 5 False 2.0
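- As noted above, string aliases such as ``'number'`` select whole dtype
- hierarchies (illustrative; here it picks up the integer and float
- columns but not the boolean one):
- >>> df.select_dtypes(include='number')
- a c
- 0 1 1.0
- 1 2 2.0
- 2 1 1.0
- 3 2 2.0
- 4 1 1.0
- 5 2 2.0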
- """
- if not is_list_like(include):
- include = (include,) if include is not None else ()
- if not is_list_like(exclude):
- exclude = (exclude,) if exclude is not None else ()
- selection = (frozenset(include), frozenset(exclude))
- if not any(selection):
- raise ValueError("at least one of include or exclude must be nonempty")
- # convert the myriad valid dtypes object to a single representation
- def check_int_infer_dtype(dtypes):
- converted_dtypes: list[type] = []
- for dtype in dtypes:
- # Numpy maps int to different types (int32, int64) on Windows and Linux
- # see https://github.com/numpy/numpy/issues/9464
- if (isinstance(dtype, str) and dtype == "int") or (dtype is int):
- converted_dtypes.append(np.int32)
- converted_dtypes.append(np.int64)
- elif dtype == "float" or dtype is float:
- # GH#42452 : np.dtype("float") coerces to np.float64 from Numpy 1.20
- converted_dtypes.extend([np.float64, np.float32])
- else:
- converted_dtypes.append(infer_dtype_from_object(dtype))
- return frozenset(converted_dtypes)
- include = check_int_infer_dtype(include)
- exclude = check_int_infer_dtype(exclude)
- for dtypes in (include, exclude):
- invalidate_string_dtypes(dtypes)
- # can't both include AND exclude!
- if not include.isdisjoint(exclude):
- raise ValueError(f"include and exclude overlap on {(include & exclude)}")
- def dtype_predicate(dtype: DtypeObj, dtypes_set) -> bool:
- # GH 46870: BooleanDtype._is_numeric == True but should be excluded
- dtype = dtype if not isinstance(dtype, ArrowDtype) else dtype.numpy_dtype
- return issubclass(dtype.type, tuple(dtypes_set)) or (
- np.number in dtypes_set
- and getattr(dtype, "_is_numeric", False)
- and not is_bool_dtype(dtype)
- )
- def predicate(arr: ArrayLike) -> bool:
- dtype = arr.dtype
- if include:
- if not dtype_predicate(dtype, include):
- return False
- if exclude:
- if dtype_predicate(dtype, exclude):
- return False
- return True
- mgr = self._mgr._get_data_subset(predicate).copy(deep=None)
- return type(self)(mgr).__finalize__(self)
- def insert(
- self,
- loc: int,
- column: Hashable,
- value: Scalar | AnyArrayLike,
- allow_duplicates: bool | lib.NoDefault = lib.no_default,
- ) -> None:
- """
- Insert column into DataFrame at specified location.
- Raises a ValueError if `column` is already contained in the DataFrame,
- unless `allow_duplicates` is set to True.
- Parameters
- ----------
- loc : int
- Insertion index. Must satisfy 0 <= loc <= len(columns).
- column : str, number, or hashable object
- Label of the inserted column.
- value : Scalar, Series, or array-like
- allow_duplicates : bool, optional, default lib.no_default
- See Also
- --------
- Index.insert : Insert new item by index.
- Examples
- --------
- >>> df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
- >>> df
- col1 col2
- 0 1 3
- 1 2 4
- >>> df.insert(1, "newcol", [99, 99])
- >>> df
- col1 newcol col2
- 0 1 99 3
- 1 2 99 4
- >>> df.insert(0, "col1", [100, 100], allow_duplicates=True)
- >>> df
- col1 col1 newcol col2
- 0 100 1 99 3
- 1 100 2 99 4
- Notice that pandas uses index alignment in case of `value` from type `Series`:
- >>> df.insert(0, "col0", pd.Series([5, 6], index=[1, 2]))
- >>> df
- col0 col1 col1 newcol col2
- 0 NaN 100 1 99 3
- 1 5.0 100 2 99 4
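- A scalar ``value`` is broadcast to fill the new column (illustrative):
- >>> df.insert(5, "col3", 0)
- >>> df
- col0 col1 col1 newcol col2 col3
- 0 NaN 100 1 99 3 0
- 1 5.0 100 2 99 4 0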
- """
- if allow_duplicates is lib.no_default:
- allow_duplicates = False
- if allow_duplicates and not self.flags.allows_duplicate_labels:
- raise ValueError(
- "Cannot specify 'allow_duplicates=True' when "
- "'self.flags.allows_duplicate_labels' is False."
- )
- if not allow_duplicates and column in self.columns:
- # Should this be a different kind of error??
- raise ValueError(f"cannot insert {column}, already exists")
- if not isinstance(loc, int):
- raise TypeError("loc must be int")
- value = self._sanitize_column(value)
- self._mgr.insert(loc, column, value)
- def assign(self, **kwargs) -> DataFrame:
- r"""
- Assign new columns to a DataFrame.
- Returns a new object with all original columns in addition to new ones.
- Existing columns that are re-assigned will be overwritten.
- Parameters
- ----------
- **kwargs : dict of {str: callable or Series}
- The column names are keywords. If the values are
- callable, they are computed on the DataFrame and
- assigned to the new columns. The callable must not
- change the input DataFrame (though pandas doesn't check it).
- If the values are not callable (e.g. a Series, scalar, or array),
- they are simply assigned.
- Returns
- -------
- DataFrame
- A new DataFrame with the new columns in addition to
- all the existing columns.
- Notes
- -----
- Assigning multiple columns within the same ``assign`` is possible.
- Later items in '\*\*kwargs' may refer to newly created or modified
- columns in 'df'; items are computed and assigned into 'df' in order.
- Examples
- --------
- >>> df = pd.DataFrame({'temp_c': [17.0, 25.0]},
- ... index=['Portland', 'Berkeley'])
- >>> df
- temp_c
- Portland 17.0
- Berkeley 25.0
- Where the value is a callable, evaluated on `df`:
- >>> df.assign(temp_f=lambda x: x.temp_c * 9 / 5 + 32)
- temp_c temp_f
- Portland 17.0 62.6
- Berkeley 25.0 77.0
- Alternatively, the same behavior can be achieved by directly
- referencing an existing Series or sequence:
- >>> df.assign(temp_f=df['temp_c'] * 9 / 5 + 32)
- temp_c temp_f
- Portland 17.0 62.6
- Berkeley 25.0 77.0
- You can create multiple columns within the same assign where one
- of the columns depends on another one defined within the same assign:
- >>> df.assign(temp_f=lambda x: x['temp_c'] * 9 / 5 + 32,
- ... temp_k=lambda x: (x['temp_f'] + 459.67) * 5 / 9)
- temp_c temp_f temp_k
- Portland 17.0 62.6 290.15
- Berkeley 25.0 77.0 298.15
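- Non-callable values, such as scalars, are simply broadcast
- (illustrative):
- >>> df.assign(country='USA')
- temp_c country
- Portland 17.0 USA
- Berkeley 25.0 USA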
- """
- data = self.copy(deep=None)
- for k, v in kwargs.items():
- data[k] = com.apply_if_callable(v, data)
- return data
- def _sanitize_column(self, value) -> ArrayLike:
- """
- Ensures new columns (which go into the BlockManager as new blocks) are
- always copied and converted into an array.
- Parameters
- ----------
- value : scalar, Series, or array-like
- Returns
- -------
- numpy.ndarray or ExtensionArray
- """
- self._ensure_valid_index(value)
- # We can get there through isetitem with a DataFrame
- # or through loc single_block_path
- if isinstance(value, DataFrame):
- return _reindex_for_setitem(value, self.index)
- elif is_dict_like(value):
- return _reindex_for_setitem(Series(value), self.index)
- if is_list_like(value):
- com.require_length_match(value, self.index)
- return sanitize_array(value, self.index, copy=True, allow_2d=True)
- @property
- def _series(self):
- return {
- item: Series(
- self._mgr.iget(idx), index=self.index, name=item, fastpath=True
- )
- for idx, item in enumerate(self.columns)
- }
- # ----------------------------------------------------------------------
- # Reindexing and alignment
- def _reindex_axes(self, axes, level, limit, tolerance, method, fill_value, copy):
- frame = self
- columns = axes["columns"]
- if columns is not None:
- frame = frame._reindex_columns(
- columns, method, copy, level, fill_value, limit, tolerance
- )
- index = axes["index"]
- if index is not None:
- frame = frame._reindex_index(
- index, method, copy, level, fill_value, limit, tolerance
- )
- return frame
- def _reindex_index(
- self,
- new_index,
- method,
- copy: bool,
- level: Level,
- fill_value=np.nan,
- limit=None,
- tolerance=None,
- ):
- new_index, indexer = self.index.reindex(
- new_index, method=method, level=level, limit=limit, tolerance=tolerance
- )
- return self._reindex_with_indexers(
- {0: [new_index, indexer]},
- copy=copy,
- fill_value=fill_value,
- allow_dups=False,
- )
- def _reindex_columns(
- self,
- new_columns,
- method,
- copy: bool,
- level: Level,
- fill_value=None,
- limit=None,
- tolerance=None,
- ):
- new_columns, indexer = self.columns.reindex(
- new_columns, method=method, level=level, limit=limit, tolerance=tolerance
- )
- return self._reindex_with_indexers(
- {1: [new_columns, indexer]},
- copy=copy,
- fill_value=fill_value,
- allow_dups=False,
- )
- def _reindex_multi(
- self, axes: dict[str, Index], copy: bool, fill_value
- ) -> DataFrame:
- """
- We are guaranteed non-Nones in the axes.
- """
- new_index, row_indexer = self.index.reindex(axes["index"])
- new_columns, col_indexer = self.columns.reindex(axes["columns"])
- if row_indexer is not None and col_indexer is not None:
- # Fastpath. By doing two 'take's at once we avoid making an
- # unnecessary copy.
- # We only get here with `not self._is_mixed_type`, which (almost)
- # ensures that self.values is cheap. It may be worth making this
- # condition more specific.
- indexer = row_indexer, col_indexer
- new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
- return self._constructor(
- new_values, index=new_index, columns=new_columns, copy=False
- )
- else:
- return self._reindex_with_indexers(
- {0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
- copy=copy,
- fill_value=fill_value,
- )
- @doc(NDFrame.align, **_shared_doc_kwargs)
- def align(
- self,
- other: DataFrame,
- join: AlignJoin = "outer",
- axis: Axis | None = None,
- level: Level = None,
- copy: bool | None = None,
- fill_value=None,
- method: FillnaOptions | None = None,
- limit: int | None = None,
- fill_axis: Axis = 0,
- broadcast_axis: Axis | None = None,
- ) -> DataFrame:
- return super().align(
- other,
- join=join,
- axis=axis,
- level=level,
- copy=copy,
- fill_value=fill_value,
- method=method,
- limit=limit,
- fill_axis=fill_axis,
- broadcast_axis=broadcast_axis,
- )
- @Appender(
- """
- Examples
- --------
- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
- Change the row labels.
- >>> df.set_axis(['a', 'b', 'c'], axis='index')
- A B
- a 1 4
- b 2 5
- c 3 6
- Change the column labels.
- >>> df.set_axis(['I', 'II'], axis='columns')
- I II
- 0 1 4
- 1 2 5
- 2 3 6
- """
- )
- @Substitution(
- **_shared_doc_kwargs,
- extended_summary_sub=" column or",
- axis_description_sub=", and 1 identifies the columns",
- see_also_sub=" or columns",
- )
- @Appender(NDFrame.set_axis.__doc__)
- def set_axis(
- self,
- labels,
- *,
- axis: Axis = 0,
- copy: bool | None = None,
- ) -> DataFrame:
- return super().set_axis(labels, axis=axis, copy=copy)
- @doc(
- NDFrame.reindex,
- klass=_shared_doc_kwargs["klass"],
- optional_reindex=_shared_doc_kwargs["optional_reindex"],
- )
- def reindex( # type: ignore[override]
- self,
- labels=None,
- *,
- index=None,
- columns=None,
- axis: Axis | None = None,
- method: str | None = None,
- copy: bool | None = None,
- level: Level | None = None,
- fill_value: Scalar | None = np.nan,
- limit: int | None = None,
- tolerance=None,
- ) -> DataFrame:
- return super().reindex(
- labels=labels,
- index=index,
- columns=columns,
- axis=axis,
- method=method,
- copy=copy,
- level=level,
- fill_value=fill_value,
- limit=limit,
- tolerance=tolerance,
- )
- @overload
- def drop(
- self,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level = ...,
- inplace: Literal[True],
- errors: IgnoreRaise = ...,
- ) -> None:
- ...
- @overload
- def drop(
- self,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level = ...,
- inplace: Literal[False] = ...,
- errors: IgnoreRaise = ...,
- ) -> DataFrame:
- ...
- @overload
- def drop(
- self,
- labels: IndexLabel = ...,
- *,
- axis: Axis = ...,
- index: IndexLabel = ...,
- columns: IndexLabel = ...,
- level: Level = ...,
- inplace: bool = ...,
- errors: IgnoreRaise = ...,
- ) -> DataFrame | None:
- ...
- def drop(
- self,
- labels: IndexLabel = None,
- *,
- axis: Axis = 0,
- index: IndexLabel = None,
- columns: IndexLabel = None,
- level: Level = None,
- inplace: bool = False,
- errors: IgnoreRaise = "raise",
- ) -> DataFrame | None:
- """
- Drop specified labels from rows or columns.
- Remove rows or columns by specifying label names and corresponding
- axis, or by specifying directly index or column names. When using a
- multi-index, labels on different levels can be removed by specifying
- the level. See the :ref:`user guide <advanced.shown_levels>`
- for more information about the now unused levels.
- Parameters
- ----------
- labels : single label or list-like
- Index or column labels to drop. A tuple will be used as a single
- label and not treated as a list-like.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Whether to drop labels from the index (0 or 'index') or
- columns (1 or 'columns').
- index : single label or list-like
- Alternative to specifying axis (``labels, axis=0``
- is equivalent to ``index=labels``).
- columns : single label or list-like
- Alternative to specifying axis (``labels, axis=1``
- is equivalent to ``columns=labels``).
- level : int or level name, optional
- For MultiIndex, level from which the labels will be removed.
- inplace : bool, default False
- If False, return a copy. Otherwise, do operation
- inplace and return None.
- errors : {'ignore', 'raise'}, default 'raise'
- If 'ignore', suppress error and only existing labels are
- dropped.
- Returns
- -------
- DataFrame or None
- DataFrame without the removed index or column labels or
- None if ``inplace=True``.
- Raises
- ------
- KeyError
- If any of the labels is not found in the selected axis.
- See Also
- --------
- DataFrame.loc : Label-location based indexer for selection by label.
- DataFrame.dropna : Return DataFrame with labels on given axis omitted
- where (all or any) data are missing.
- DataFrame.drop_duplicates : Return DataFrame with duplicate rows
- removed, optionally only considering certain columns.
- Series.drop : Return Series with specified index labels removed.
- Examples
- --------
- >>> df = pd.DataFrame(np.arange(12).reshape(3, 4),
- ... columns=['A', 'B', 'C', 'D'])
- >>> df
- A B C D
- 0 0 1 2 3
- 1 4 5 6 7
- 2 8 9 10 11
- Drop columns
- >>> df.drop(['B', 'C'], axis=1)
- A D
- 0 0 3
- 1 4 7
- 2 8 11
- >>> df.drop(columns=['B', 'C'])
- A D
- 0 0 3
- 1 4 7
- 2 8 11
- Drop a row by index
- >>> df.drop([0, 1])
- A B C D
- 2 8 9 10 11
- Drop columns and/or rows of MultiIndex DataFrame
- >>> midx = pd.MultiIndex(levels=[['lama', 'cow', 'falcon'],
- ... ['speed', 'weight', 'length']],
- ... codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2],
- ... [0, 1, 2, 0, 1, 2, 0, 1, 2]])
- >>> df = pd.DataFrame(index=midx, columns=['big', 'small'],
- ... data=[[45, 30], [200, 100], [1.5, 1], [30, 20],
- ... [250, 150], [1.5, 0.8], [320, 250],
- ... [1, 0.8], [0.3, 0.2]])
- >>> df
- big small
- lama speed 45.0 30.0
- weight 200.0 100.0
- length 1.5 1.0
- cow speed 30.0 20.0
- weight 250.0 150.0
- length 1.5 0.8
- falcon speed 320.0 250.0
- weight 1.0 0.8
- length 0.3 0.2
- Drop a specific index combination from the MultiIndex
- DataFrame, i.e., drop the combination ``'falcon'`` and
- ``'weight'``, which deletes only the corresponding row
- >>> df.drop(index=('falcon', 'weight'))
- big small
- lama speed 45.0 30.0
- weight 200.0 100.0
- length 1.5 1.0
- cow speed 30.0 20.0
- weight 250.0 150.0
- length 1.5 0.8
- falcon speed 320.0 250.0
- length 0.3 0.2
- >>> df.drop(index='cow', columns='small')
- big
- lama speed 45.0
- weight 200.0
- length 1.5
- falcon speed 320.0
- weight 1.0
- length 0.3
- >>> df.drop(index='length', level=1)
- big small
- lama speed 45.0 30.0
- weight 200.0 100.0
- cow speed 30.0 20.0
- weight 250.0 150.0
- falcon speed 320.0 250.0
- weight 1.0 0.8
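- With ``errors='ignore'``, labels missing from the axis are skipped
- instead of raising (a sketch; ``'missing'`` is not a column here):
- >>> df.drop(columns=['small', 'missing'], errors='ignore')
- big
- lama speed 45.0
- weight 200.0
- length 1.5
- cow speed 30.0
- weight 250.0
- length 1.5
- falcon speed 320.0
- weight 1.0
- length 0.3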
- """
- return super().drop(
- labels=labels,
- axis=axis,
- index=index,
- columns=columns,
- level=level,
- inplace=inplace,
- errors=errors,
- )
- @overload
- def rename(
- self,
- mapper: Renamer | None = ...,
- *,
- index: Renamer | None = ...,
- columns: Renamer | None = ...,
- axis: Axis | None = ...,
- copy: bool | None = ...,
- inplace: Literal[True],
- level: Level = ...,
- errors: IgnoreRaise = ...,
- ) -> None:
- ...
- @overload
- def rename(
- self,
- mapper: Renamer | None = ...,
- *,
- index: Renamer | None = ...,
- columns: Renamer | None = ...,
- axis: Axis | None = ...,
- copy: bool | None = ...,
- inplace: Literal[False] = ...,
- level: Level = ...,
- errors: IgnoreRaise = ...,
- ) -> DataFrame:
- ...
- @overload
- def rename(
- self,
- mapper: Renamer | None = ...,
- *,
- index: Renamer | None = ...,
- columns: Renamer | None = ...,
- axis: Axis | None = ...,
- copy: bool | None = ...,
- inplace: bool = ...,
- level: Level = ...,
- errors: IgnoreRaise = ...,
- ) -> DataFrame | None:
- ...
- def rename(
- self,
- mapper: Renamer | None = None,
- *,
- index: Renamer | None = None,
- columns: Renamer | None = None,
- axis: Axis | None = None,
- copy: bool | None = None,
- inplace: bool = False,
- level: Level = None,
- errors: IgnoreRaise = "ignore",
- ) -> DataFrame | None:
- """
- Rename columns or index labels.
- Function / dict values must be unique (1-to-1). Labels not contained in
- a dict / Series will be left as-is. Extra labels listed don't throw an
- error.
- See the :ref:`user guide <basics.rename>` for more.
- Parameters
- ----------
- mapper : dict-like or function
- Dict-like or function transformations to apply to
- that axis' values. Use either ``mapper`` and ``axis`` to
- specify the axis to target with ``mapper``, or ``index`` and
- ``columns``.
- index : dict-like or function
- Alternative to specifying axis (``mapper, axis=0``
- is equivalent to ``index=mapper``).
- columns : dict-like or function
- Alternative to specifying axis (``mapper, axis=1``
- is equivalent to ``columns=mapper``).
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Axis to target with ``mapper``. Can be either the axis name
- ('index', 'columns') or number (0, 1). The default is 'index'.
- copy : bool, default True
- Also copy underlying data.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- If True then value of copy is ignored.
- level : int or level name, default None
- In case of a MultiIndex, only rename labels in the specified
- level.
- errors : {'ignore', 'raise'}, default 'ignore'
- If 'raise', raise a `KeyError` when a dict-like `mapper`, `index`,
- or `columns` contains labels that are not present in the Index
- being transformed.
- If 'ignore', existing keys will be renamed and extra keys will be
- ignored.
- Returns
- -------
- DataFrame or None
- DataFrame with the renamed axis labels or None if ``inplace=True``.
- Raises
- ------
- KeyError
- If any of the labels is not found in the selected axis and
- "errors='raise'".
- See Also
- --------
- DataFrame.rename_axis : Set the name of the axis.
- Examples
- --------
- ``DataFrame.rename`` supports two calling conventions
- * ``(index=index_mapper, columns=columns_mapper, ...)``
- * ``(mapper, axis={'index', 'columns'}, ...)``
- We *highly* recommend using keyword arguments to clarify your
- intent.
- Rename columns using a mapping:
- >>> df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
- >>> df.rename(columns={"A": "a", "B": "c"})
- a c
- 0 1 4
- 1 2 5
- 2 3 6
- Rename index using a mapping:
- >>> df.rename(index={0: "x", 1: "y", 2: "z"})
- A B
- x 1 4
- y 2 5
- z 3 6
- Cast index labels to a different type:
- >>> df.index
- RangeIndex(start=0, stop=3, step=1)
- >>> df.rename(index=str).index
- Index(['0', '1', '2'], dtype='object')
- >>> df.rename(columns={"A": "a", "B": "b", "C": "c"}, errors="raise")
- Traceback (most recent call last):
- KeyError: ['C'] not found in axis
- Using axis-style parameters:
- >>> df.rename(str.lower, axis='columns')
- a b
- 0 1 4
- 1 2 5
- 2 3 6
- >>> df.rename({1: 2, 2: 4}, axis='index')
- A B
- 0 1 4
- 2 2 5
- 4 3 6
- """
- return super()._rename(
- mapper=mapper,
- index=index,
- columns=columns,
- axis=axis,
- copy=copy,
- inplace=inplace,
- level=level,
- errors=errors,
- )
- @overload
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: Literal[False] = ...,
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> DataFrame:
- ...
- @overload
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: Literal[True],
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> None:
- ...
- @overload
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = ...,
- *,
- method: FillnaOptions | None = ...,
- axis: Axis | None = ...,
- inplace: bool = ...,
- limit: int | None = ...,
- downcast: dict | None = ...,
- ) -> DataFrame | None:
- ...
- @doc(NDFrame.fillna, **_shared_doc_kwargs)
- def fillna(
- self,
- value: Hashable | Mapping | Series | DataFrame = None,
- *,
- method: FillnaOptions | None = None,
- axis: Axis | None = None,
- inplace: bool = False,
- limit: int | None = None,
- downcast: dict | None = None,
- ) -> DataFrame | None:
- return super().fillna(
- value=value,
- method=method,
- axis=axis,
- inplace=inplace,
- limit=limit,
- downcast=downcast,
- )
- def pop(self, item: Hashable) -> Series:
- """
- Return item and drop from frame. Raise KeyError if not found.
- Parameters
- ----------
- item : label
- Label of column to be popped.
- Returns
- -------
- Series
- Examples
- --------
- >>> df = pd.DataFrame([('falcon', 'bird', 389.0),
- ... ('parrot', 'bird', 24.0),
- ... ('lion', 'mammal', 80.5),
- ... ('monkey', 'mammal', np.nan)],
- ... columns=('name', 'class', 'max_speed'))
- >>> df
- name class max_speed
- 0 falcon bird 389.0
- 1 parrot bird 24.0
- 2 lion mammal 80.5
- 3 monkey mammal NaN
- >>> df.pop('class')
- 0 bird
- 1 bird
- 2 mammal
- 3 mammal
- Name: class, dtype: object
- >>> df
- name max_speed
- 0 falcon 389.0
- 1 parrot 24.0
- 2 lion 80.5
- 3 monkey NaN
- """
- return super().pop(item=item)
- @overload
- def replace(
- self,
- to_replace=...,
- value=...,
- *,
- inplace: Literal[False] = ...,
- limit: int | None = ...,
- regex: bool = ...,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
- ) -> DataFrame:
- ...
- @overload
- def replace(
- self,
- to_replace=...,
- value=...,
- *,
- inplace: Literal[True],
- limit: int | None = ...,
- regex: bool = ...,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = ...,
- ) -> None:
- ...
- @doc(NDFrame.replace, **_shared_doc_kwargs)
- def replace(
- self,
- to_replace=None,
- value=lib.no_default,
- *,
- inplace: bool = False,
- limit: int | None = None,
- regex: bool = False,
- method: Literal["pad", "ffill", "bfill"] | lib.NoDefault = lib.no_default,
- ) -> DataFrame | None:
- return super().replace(
- to_replace=to_replace,
- value=value,
- inplace=inplace,
- limit=limit,
- regex=regex,
- method=method,
- )
- def _replace_columnwise(
- self, mapping: dict[Hashable, tuple[Any, Any]], inplace: bool, regex
- ):
- """
- Dispatch to Series.replace column-wise.
- Parameters
- ----------
- mapping : dict
- of the form {col: (target, value)}
- inplace : bool
- regex : bool or same types as `to_replace` in DataFrame.replace
- Returns
- -------
- DataFrame or None
- """
- # Operate column-wise
- res = self if inplace else self.copy(deep=None)
- ax = self.columns
- for i, ax_value in enumerate(ax):
- if ax_value in mapping:
- ser = self.iloc[:, i]
- target, value = mapping[ax_value]
- newobj = ser.replace(target, value, regex=regex)
- res._iset_item(i, newobj)
- if inplace:
- return
- return res.__finalize__(self)
- @doc(NDFrame.shift, klass=_shared_doc_kwargs["klass"])
- def shift(
- self,
- periods: int = 1,
- freq: Frequency | None = None,
- axis: Axis = 0,
- fill_value: Hashable = lib.no_default,
- ) -> DataFrame:
- axis = self._get_axis_number(axis)
- ncols = len(self.columns)
- if (
- axis == 1
- and periods != 0
- and freq is None
- and fill_value is lib.no_default
- and ncols > 0
- ):
- # We will infer fill_value to match the closest column
- # Use a column that we know is valid for our column's dtype GH#38434
- label = self.columns[0]
- if periods > 0:
- result = self.iloc[:, :-periods]
- for col in range(min(ncols, abs(periods))):
- # TODO(EA2D): doing this in a loop is unnecessary with 2D EAs
- # Define filler inside loop so we get a copy
- filler = self.iloc[:, 0].shift(len(self))
- result.insert(0, label, filler, allow_duplicates=True)
- else:
- result = self.iloc[:, -periods:]
- for col in range(min(ncols, abs(periods))):
- # Define filler inside loop so we get a copy
- filler = self.iloc[:, -1].shift(len(self))
- result.insert(
- len(result.columns), label, filler, allow_duplicates=True
- )
- result.columns = self.columns.copy()
- return result
- elif (
- axis == 1
- and periods != 0
- and fill_value is not lib.no_default
- and ncols > 0
- ):
- arrays = self._mgr.arrays
- if len(arrays) > 1 or (
- # If we only have one block and we know that we can't
- # keep the same dtype (i.e. the _can_hold_element check)
- # then we can go through the reindex_indexer path
- # (and avoid casting logic in the Block method).
- not can_hold_element(arrays[0], fill_value)
- ):
- # GH#35488 we need to watch out for multi-block cases
- # We only get here when fill_value is not lib.no_default
- nper = abs(periods)
- nper = min(nper, ncols)
- if periods > 0:
- indexer = np.array(
- [-1] * nper + list(range(ncols - periods)), dtype=np.intp
- )
- else:
- indexer = np.array(
- list(range(nper, ncols)) + [-1] * nper, dtype=np.intp
- )
- mgr = self._mgr.reindex_indexer(
- self.columns,
- indexer,
- axis=0,
- fill_value=fill_value,
- allow_dups=True,
- )
- res_df = self._constructor(mgr)
- return res_df.__finalize__(self, method="shift")
- return super().shift(
- periods=periods, freq=freq, axis=axis, fill_value=fill_value
- )
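- # Illustration of the axis=1 fast paths above (hypothetical data):
- # shifting pd.DataFrame({"a": [1, 2], "b": [3, 4]}) with periods=1 and
- # axis=1 moves the values of "a" into "b" and leaves "a" as all-NaN
- # (or filled with ``fill_value`` when one is provided).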
- @overload
- def set_index(
- self,
- keys,
- *,
- drop: bool = ...,
- append: bool = ...,
- inplace: Literal[False] = ...,
- verify_integrity: bool = ...,
- ) -> DataFrame:
- ...
- @overload
- def set_index(
- self,
- keys,
- *,
- drop: bool = ...,
- append: bool = ...,
- inplace: Literal[True],
- verify_integrity: bool = ...,
- ) -> None:
- ...
- def set_index(
- self,
- keys,
- *,
- drop: bool = True,
- append: bool = False,
- inplace: bool = False,
- verify_integrity: bool = False,
- ) -> DataFrame | None:
- """
- Set the DataFrame index using existing columns.
- Set the DataFrame index (row labels) using one or more existing
- columns or arrays (of the correct length). The index can replace the
- existing index or expand on it.
- Parameters
- ----------
- keys : label or array-like or list of labels/arrays
- This parameter can be either a single column key, a single array of
- the same length as the calling DataFrame, or a list containing an
- arbitrary combination of column keys and arrays. Here, "array"
- encompasses :class:`Series`, :class:`Index`, ``np.ndarray``, and
- instances of :class:`~collections.abc.Iterator`.
- drop : bool, default True
- Delete columns to be used as the new index.
- append : bool, default False
- Whether to append columns to existing index.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- verify_integrity : bool, default False
- Check the new index for duplicates. Otherwise defer the check until
- necessary. Setting to False will improve the performance of this
- method.
- Returns
- -------
- DataFrame or None
- Changed row labels or None if ``inplace=True``.
- See Also
- --------
- DataFrame.reset_index : Opposite of set_index.
- DataFrame.reindex : Change to new indices or expand indices.
- DataFrame.reindex_like : Change to same indices as other DataFrame.
- Examples
- --------
- >>> df = pd.DataFrame({'month': [1, 4, 7, 10],
- ... 'year': [2012, 2014, 2013, 2014],
- ... 'sale': [55, 40, 84, 31]})
- >>> df
- month year sale
- 0 1 2012 55
- 1 4 2014 40
- 2 7 2013 84
- 3 10 2014 31
- Set the index to become the 'month' column:
- >>> df.set_index('month')
- year sale
- month
- 1 2012 55
- 4 2014 40
- 7 2013 84
- 10 2014 31
- Create a MultiIndex using columns 'year' and 'month':
- >>> df.set_index(['year', 'month'])
- sale
- year month
- 2012 1 55
- 2014 4 40
- 2013 7 84
- 2014 10 31
- Create a MultiIndex using an Index and a column:
- >>> df.set_index([pd.Index([1, 2, 3, 4]), 'year'])
- month sale
- year
- 1 2012 1 55
- 2 2014 4 40
- 3 2013 7 84
- 4 2014 10 31
- Create a MultiIndex using two Series:
- >>> s = pd.Series([1, 2, 3, 4])
- >>> df.set_index([s, s**2])
- month year sale
- 1 1 1 2012 55
- 2 4 4 2014 40
- 3 9 7 2013 84
- 4 16 10 2014 31
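- Passing ``append=True`` keeps the existing index and adds the new
- level to it (illustrative):
- >>> df.set_index('month', append=True)
- year sale
- month
- 0 1 2012 55
- 1 4 2014 40
- 2 7 2013 84
- 3 10 2014 31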
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- self._check_inplace_and_allows_duplicate_labels(inplace)
- if not isinstance(keys, list):
- keys = [keys]
- err_msg = (
- 'The parameter "keys" may be a column key, one-dimensional '
- "array, or a list containing only valid column keys and "
- "one-dimensional arrays."
- )
- missing: list[Hashable] = []
- for col in keys:
- if isinstance(col, (Index, Series, np.ndarray, list, abc.Iterator)):
- # arrays are fine as long as they are one-dimensional
- # iterators get converted to list below
- if getattr(col, "ndim", 1) != 1:
- raise ValueError(err_msg)
- else:
- # everything else gets tried as a key; see GH 24969
- try:
- found = col in self.columns
- except TypeError as err:
- raise TypeError(
- f"{err_msg}. Received column of type {type(col)}"
- ) from err
- else:
- if not found:
- missing.append(col)
- if missing:
- raise KeyError(f"None of {missing} are in the columns")
- if inplace:
- frame = self
- else:
- # GH 49473 Use "lazy copy" with Copy-on-Write
- frame = self.copy(deep=None)
- arrays = []
- names: list[Hashable] = []
- if append:
- names = list(self.index.names)
- if isinstance(self.index, MultiIndex):
- for i in range(self.index.nlevels):
- arrays.append(self.index._get_level_values(i))
- else:
- arrays.append(self.index)
- to_remove: list[Hashable] = []
- for col in keys:
- if isinstance(col, MultiIndex):
- for n in range(col.nlevels):
- arrays.append(col._get_level_values(n))
- names.extend(col.names)
- elif isinstance(col, (Index, Series)):
- # if Index then not MultiIndex (treated above)
- # error: Argument 1 to "append" of "list" has incompatible type
- # "Union[Index, Series]"; expected "Index"
- arrays.append(col) # type:ignore[arg-type]
- names.append(col.name)
- elif isinstance(col, (list, np.ndarray)):
- # error: Argument 1 to "append" of "list" has incompatible type
- # "Union[List[Any], ndarray]"; expected "Index"
- arrays.append(col) # type: ignore[arg-type]
- names.append(None)
- elif isinstance(col, abc.Iterator):
- # error: Argument 1 to "append" of "list" has incompatible type
- # "List[Any]"; expected "Index"
- arrays.append(list(col)) # type: ignore[arg-type]
- names.append(None)
- # from here, col can only be a column label
- else:
- arrays.append(frame[col])
- names.append(col)
- if drop:
- to_remove.append(col)
- if len(arrays[-1]) != len(self):
- # check newest element against length of calling frame, since
- # ensure_index_from_sequences would not raise for append=False.
- raise ValueError(
- f"Length mismatch: Expected {len(self)} rows, "
- f"received array of length {len(arrays[-1])}"
- )
- index = ensure_index_from_sequences(arrays, names)
- if verify_integrity and not index.is_unique:
- duplicates = index[index.duplicated()].unique()
- raise ValueError(f"Index has duplicate keys: {duplicates}")
- # use set to handle duplicate column names gracefully in case of drop
- for c in set(to_remove):
- del frame[c]
- # clear up memory usage
- index._cleanup()
- frame.index = index
- if not inplace:
- return frame
- return None
- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: bool = ...,
- inplace: Literal[False] = ...,
- col_level: Hashable = ...,
- col_fill: Hashable = ...,
- allow_duplicates: bool | lib.NoDefault = ...,
- names: Hashable | Sequence[Hashable] = ...,
- ) -> DataFrame:
- ...
- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: bool = ...,
- inplace: Literal[True],
- col_level: Hashable = ...,
- col_fill: Hashable = ...,
- allow_duplicates: bool | lib.NoDefault = ...,
- names: Hashable | Sequence[Hashable] = ...,
- ) -> None:
- ...
- @overload
- def reset_index(
- self,
- level: IndexLabel = ...,
- *,
- drop: bool = ...,
- inplace: bool = ...,
- col_level: Hashable = ...,
- col_fill: Hashable = ...,
- allow_duplicates: bool | lib.NoDefault = ...,
- names: Hashable | Sequence[Hashable] = ...,
- ) -> DataFrame | None:
- ...
- def reset_index(
- self,
- level: IndexLabel = None,
- *,
- drop: bool = False,
- inplace: bool = False,
- col_level: Hashable = 0,
- col_fill: Hashable = "",
- allow_duplicates: bool | lib.NoDefault = lib.no_default,
- names: Hashable | Sequence[Hashable] = None,
- ) -> DataFrame | None:
- """
- Reset the index, or a level of it.
- Reset the index of the DataFrame, and use the default one instead.
- If the DataFrame has a MultiIndex, this method can remove one or more
- levels.
- Parameters
- ----------
- level : int, str, tuple, or list, default None
- Only remove the given levels from the index. Removes all levels by
- default.
- drop : bool, default False
- Do not try to insert index into dataframe columns. This resets
- the index to the default integer index.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- col_level : int or str, default 0
- If the columns have multiple levels, determines which level the
- labels are inserted into. By default it is inserted into the first
- level.
- col_fill : object, default ''
- If the columns have multiple levels, determines how the other
- levels are named. If None then the index name is repeated.
- allow_duplicates : bool, optional, default lib.no_default
- Allow duplicate column labels to be created.
- .. versionadded:: 1.5.0
- names : int, str or 1-dimensional list, default None
- Using the given string, rename the DataFrame column which contains the
- index data. If the DataFrame has a MultiIndex, this has to be a list or
- tuple with length equal to the number of levels.
- .. versionadded:: 1.5.0
- Returns
- -------
- DataFrame or None
- DataFrame with the new index or None if ``inplace=True``.
- See Also
- --------
- DataFrame.set_index : Opposite of reset_index.
- DataFrame.reindex : Change to new indices or expand indices.
- DataFrame.reindex_like : Change to same indices as other DataFrame.
- Examples
- --------
- >>> df = pd.DataFrame([('bird', 389.0),
- ... ('bird', 24.0),
- ... ('mammal', 80.5),
- ... ('mammal', np.nan)],
- ... index=['falcon', 'parrot', 'lion', 'monkey'],
- ... columns=('class', 'max_speed'))
- >>> df
- class max_speed
- falcon bird 389.0
- parrot bird 24.0
- lion mammal 80.5
- monkey mammal NaN
- When we reset the index, the old index is added as a column, and a
- new sequential index is used:
- >>> df.reset_index()
- index class max_speed
- 0 falcon bird 389.0
- 1 parrot bird 24.0
- 2 lion mammal 80.5
- 3 monkey mammal NaN
- We can use the `drop` parameter to avoid the old index being added as
- a column:
- >>> df.reset_index(drop=True)
- class max_speed
- 0 bird 389.0
- 1 bird 24.0
- 2 mammal 80.5
- 3 mammal NaN
- You can also use `reset_index` with `MultiIndex`.
- >>> index = pd.MultiIndex.from_tuples([('bird', 'falcon'),
- ... ('bird', 'parrot'),
- ... ('mammal', 'lion'),
- ... ('mammal', 'monkey')],
- ... names=['class', 'name'])
- >>> columns = pd.MultiIndex.from_tuples([('speed', 'max'),
- ... ('species', 'type')])
- >>> df = pd.DataFrame([(389.0, 'fly'),
- ... (24.0, 'fly'),
- ... (80.5, 'run'),
- ... (np.nan, 'jump')],
- ... index=index,
- ... columns=columns)
- >>> df
- speed species
- max type
- class name
- bird falcon 389.0 fly
- parrot 24.0 fly
- mammal lion 80.5 run
- monkey NaN jump
- Using the `names` parameter, choose a name for the index column:
- >>> df.reset_index(names=['classes', 'names'])
- classes names speed species
- max type
- 0 bird falcon 389.0 fly
- 1 bird parrot 24.0 fly
- 2 mammal lion 80.5 run
- 3 mammal monkey NaN jump
- If the index has multiple levels, we can reset a subset of them:
- >>> df.reset_index(level='class')
- class speed species
- max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
- If we are not dropping the index, by default, it is placed in the top
- level. We can place it in another level:
- >>> df.reset_index(level='class', col_level=1)
- speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
- When the index is inserted under another level, we can specify under
- which one with the parameter `col_fill`:
- >>> df.reset_index(level='class', col_level=1, col_fill='species')
- species speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
- If we specify a nonexistent level for `col_fill`, it is created:
- >>> df.reset_index(level='class', col_level=1, col_fill='genus')
- genus speed species
- class max type
- name
- falcon bird 389.0 fly
- parrot bird 24.0 fly
- lion mammal 80.5 run
- monkey mammal NaN jump
- """
- inplace = validate_bool_kwarg(inplace, "inplace")
- self._check_inplace_and_allows_duplicate_labels(inplace)
- if inplace:
- new_obj = self
- else:
- new_obj = self.copy(deep=None)
- if allow_duplicates is not lib.no_default:
- allow_duplicates = validate_bool_kwarg(allow_duplicates, "allow_duplicates")
- new_index = default_index(len(new_obj))
- if level is not None:
- if not isinstance(level, (tuple, list)):
- level = [level]
- level = [self.index._get_level_number(lev) for lev in level]
- if len(level) < self.index.nlevels:
- new_index = self.index.droplevel(level)
- if not drop:
- to_insert: Iterable[tuple[Any, Any | None]]
- default = "index" if "index" not in self else "level_0"
- names = self.index._get_default_index_names(names, default)
- if isinstance(self.index, MultiIndex):
- to_insert = zip(self.index.levels, self.index.codes)
- else:
- to_insert = ((self.index, None),)
- multi_col = isinstance(self.columns, MultiIndex)
- for i, (lev, lab) in reversed(list(enumerate(to_insert))):
- if level is not None and i not in level:
- continue
- name = names[i]
- if multi_col:
- col_name = list(name) if isinstance(name, tuple) else [name]
- if col_fill is None:
- if len(col_name) not in (1, self.columns.nlevels):
- raise ValueError(
- "col_fill=None is incompatible "
- f"with incomplete column name {name}"
- )
- col_fill = col_name[0]
- lev_num = self.columns._get_level_number(col_level)
- name_lst = [col_fill] * lev_num + col_name
- missing = self.columns.nlevels - len(name_lst)
- name_lst += [col_fill] * missing
- name = tuple(name_lst)
- # to ndarray and maybe infer different dtype
- level_values = lev._values
- if level_values.dtype == np.object_:
- level_values = lib.maybe_convert_objects(level_values)
- if lab is not None:
- # if we have the codes, extract the values with a mask
- level_values = algorithms.take(
- level_values, lab, allow_fill=True, fill_value=lev._na_value
- )
- new_obj.insert(
- 0,
- name,
- level_values,
- allow_duplicates=allow_duplicates,
- )
- new_obj.index = new_index
- if not inplace:
- return new_obj
- return None
- # ----------------------------------------------------------------------
- # Reindex-based selection methods
- @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
- def isna(self) -> DataFrame:
- result = self._constructor(self._mgr.isna(func=isna))
- return result.__finalize__(self, method="isna")
- @doc(NDFrame.isna, klass=_shared_doc_kwargs["klass"])
- def isnull(self) -> DataFrame:
- """
- DataFrame.isnull is an alias for DataFrame.isna.
- """
- return self.isna()
- @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
- def notna(self) -> DataFrame:
- return ~self.isna()
- @doc(NDFrame.notna, klass=_shared_doc_kwargs["klass"])
- def notnull(self) -> DataFrame:
- """
- DataFrame.notnull is an alias for DataFrame.notna.
- """
- return ~self.isna()
- @overload
- def dropna(
- self,
- *,
- axis: Axis = ...,
- how: AnyAll | NoDefault = ...,
- thresh: int | NoDefault = ...,
- subset: IndexLabel = ...,
- inplace: Literal[False] = ...,
- ignore_index: bool = ...,
- ) -> DataFrame:
- ...
- @overload
- def dropna(
- self,
- *,
- axis: Axis = ...,
- how: AnyAll | NoDefault = ...,
- thresh: int | NoDefault = ...,
- subset: IndexLabel = ...,
- inplace: Literal[True],
- ignore_index: bool = ...,
- ) -> None:
- ...
- def dropna(
- self,
- *,
- axis: Axis = 0,
- how: AnyAll | NoDefault = no_default,
- thresh: int | NoDefault = no_default,
- subset: IndexLabel = None,
- inplace: bool = False,
- ignore_index: bool = False,
- ) -> DataFrame | None:
- """
- Remove missing values.
- See the :ref:`User Guide <missing_data>` for more on which values are
- considered missing, and how to work with missing data.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Determine if rows or columns which contain missing values are
- removed.
- * 0, or 'index' : Drop rows which contain missing values.
- * 1, or 'columns' : Drop columns which contain missing values.
- Only a single axis is allowed; passing a tuple or list
- raises a ``TypeError``.
- how : {'any', 'all'}, default 'any'
- Determine if row or column is removed from DataFrame, when we have
- at least one NA or all NA.
- * 'any' : If any NA values are present, drop that row or column.
- * 'all' : If all values are NA, drop that row or column.
- thresh : int, optional
- Require that many non-NA values. Cannot be combined with how.
- subset : column label or sequence of labels, optional
- Labels along other axis to consider, e.g. if you are dropping rows
- these would be a list of columns to include.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- ignore_index : bool, default ``False``
- If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
- .. versionadded:: 2.0.0
- Returns
- -------
- DataFrame or None
- DataFrame with NA entries dropped from it or None if ``inplace=True``.
- See Also
- --------
- DataFrame.isna: Indicate missing values.
- DataFrame.notna : Indicate existing (non-missing) values.
- DataFrame.fillna : Replace missing values.
- Series.dropna : Drop missing values.
- Index.dropna : Drop missing indices.
- Examples
- --------
- >>> df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
- ... "toy": [np.nan, 'Batmobile', 'Bullwhip'],
- ... "born": [pd.NaT, pd.Timestamp("1940-04-25"),
- ... pd.NaT]})
- >>> df
- name toy born
- 0 Alfred NaN NaT
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT
- Drop the rows where at least one element is missing.
- >>> df.dropna()
- name toy born
- 1 Batman Batmobile 1940-04-25
- Drop the columns where at least one element is missing.
- >>> df.dropna(axis='columns')
- name
- 0 Alfred
- 1 Batman
- 2 Catwoman
- Drop the rows where all elements are missing.
- >>> df.dropna(how='all')
- name toy born
- 0 Alfred NaN NaT
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT
- Keep only the rows with at least 2 non-NA values.
- >>> df.dropna(thresh=2)
- name toy born
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT
- Define in which columns to look for missing values.
- >>> df.dropna(subset=['name', 'toy'])
- name toy born
- 1 Batman Batmobile 1940-04-25
- 2 Catwoman Bullwhip NaT
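- Relabel the remaining rows 0, 1, ..., n - 1 with ``ignore_index=True``
- (illustrative):
- >>> df.dropna(ignore_index=True)
- name toy born
- 0 Batman Batmobile 1940-04-25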
- """
- if (how is not no_default) and (thresh is not no_default):
- raise TypeError(
- "You cannot set both the how and thresh arguments at the same time."
- )
- if how is no_default:
- how = "any"
- inplace = validate_bool_kwarg(inplace, "inplace")
- if isinstance(axis, (tuple, list)):
- # GH20987
- raise TypeError("supplying multiple axes to axis is no longer supported.")
- axis = self._get_axis_number(axis)
- agg_axis = 1 - axis
- agg_obj = self
- if subset is not None:
- # subset needs to be list
- if not is_list_like(subset):
- subset = [subset]
- ax = self._get_axis(agg_axis)
- indices = ax.get_indexer_for(subset)
- check = indices == -1
- if check.any():
- raise KeyError(np.array(subset)[check].tolist())
- agg_obj = self.take(indices, axis=agg_axis)
- if thresh is not no_default:
- count = agg_obj.count(axis=agg_axis)
- mask = count >= thresh
- elif how == "any":
- # faster equivalent to 'agg_obj.count(agg_axis) == self.shape[agg_axis]'
- mask = notna(agg_obj).all(axis=agg_axis, bool_only=False)
- elif how == "all":
- # faster equivalent to 'agg_obj.count(agg_axis) > 0'
- mask = notna(agg_obj).any(axis=agg_axis, bool_only=False)
- else:
- raise ValueError(f"invalid how option: {how}")
- if np.all(mask):
- result = self.copy(deep=None)
- else:
- result = self.loc(axis=axis)[mask]
- if ignore_index:
- result.index = default_index(len(result))
- if not inplace:
- return result
- self._update_inplace(result)
- return None
- def drop_duplicates(
- self,
- subset: Hashable | Sequence[Hashable] | None = None,
- *,
- keep: DropKeep = "first",
- inplace: bool = False,
- ignore_index: bool = False,
- ) -> DataFrame | None:
- """
- Return DataFrame with duplicate rows removed.
- Considering certain columns is optional. Indexes, including time indexes,
- are ignored.
- Parameters
- ----------
- subset : column label or sequence of labels, optional
- Only consider certain columns for identifying duplicates, by
- default use all of the columns.
- keep : {'first', 'last', ``False``}, default 'first'
- Determines which duplicates (if any) to keep.
- - 'first' : Drop duplicates except for the first occurrence.
- - 'last' : Drop duplicates except for the last occurrence.
- - ``False`` : Drop all duplicates.
- inplace : bool, default ``False``
- Whether to modify the DataFrame rather than creating a new one.
- ignore_index : bool, default ``False``
- If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
- Returns
- -------
- DataFrame or None
- DataFrame with duplicates removed or None if ``inplace=True``.
- See Also
- --------
- DataFrame.value_counts: Count unique combinations of columns.
- Examples
- --------
- Consider a dataset containing ramen ratings.
- >>> df = pd.DataFrame({
- ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
- ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
- ... 'rating': [4, 4, 3.5, 15, 5]
- ... })
- >>> df
- brand style rating
- 0 Yum Yum cup 4.0
- 1 Yum Yum cup 4.0
- 2 Indomie cup 3.5
- 3 Indomie pack 15.0
- 4 Indomie pack 5.0
- By default, it removes duplicate rows based on all columns.
- >>> df.drop_duplicates()
- brand style rating
- 0 Yum Yum cup 4.0
- 2 Indomie cup 3.5
- 3 Indomie pack 15.0
- 4 Indomie pack 5.0
- To remove duplicates on specific column(s), use ``subset``.
- >>> df.drop_duplicates(subset=['brand'])
- brand style rating
- 0 Yum Yum cup 4.0
- 2 Indomie cup 3.5
- To remove duplicates and keep last occurrences, use ``keep``.
- >>> df.drop_duplicates(subset=['brand', 'style'], keep='last')
- brand style rating
- 1 Yum Yum cup 4.0
- 2 Indomie cup 3.5
- 4 Indomie pack 5.0
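- To drop every row that has a duplicate, pass ``keep=False``
- (illustrative):
- >>> df.drop_duplicates(subset=['brand', 'style'], keep=False)
- brand style rating
- 2 Indomie cup 3.5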
- """
- if self.empty:
- return self.copy(deep=None)
- inplace = validate_bool_kwarg(inplace, "inplace")
- ignore_index = validate_bool_kwarg(ignore_index, "ignore_index")
- result = self[~self.duplicated(subset, keep=keep)]
- if ignore_index:
- result.index = default_index(len(result))
- if inplace:
- self._update_inplace(result)
- return None
- else:
- return result
- def duplicated(
- self,
- subset: Hashable | Sequence[Hashable] | None = None,
- keep: DropKeep = "first",
- ) -> Series:
- """
- Return boolean Series denoting duplicate rows.
- Considering certain columns is optional.
- Parameters
- ----------
- subset : column label or sequence of labels, optional
- Only consider certain columns for identifying duplicates, by
- default use all of the columns.
- keep : {'first', 'last', False}, default 'first'
- Determines which duplicates (if any) to mark.
- - ``first`` : Mark duplicates as ``True`` except for the first occurrence.
- - ``last`` : Mark duplicates as ``True`` except for the last occurrence.
- - False : Mark all duplicates as ``True``.
- Returns
- -------
- Series
- Boolean Series indicating which rows are duplicates.
- See Also
- --------
- Index.duplicated : Equivalent method on index.
- Series.duplicated : Equivalent method on Series.
- Series.drop_duplicates : Remove duplicate values from Series.
- DataFrame.drop_duplicates : Remove duplicate values from DataFrame.
- Examples
- --------
- Consider a dataset containing ramen ratings.
- >>> df = pd.DataFrame({
- ... 'brand': ['Yum Yum', 'Yum Yum', 'Indomie', 'Indomie', 'Indomie'],
- ... 'style': ['cup', 'cup', 'cup', 'pack', 'pack'],
- ... 'rating': [4, 4, 3.5, 15, 5]
- ... })
- >>> df
- brand style rating
- 0 Yum Yum cup 4.0
- 1 Yum Yum cup 4.0
- 2 Indomie cup 3.5
- 3 Indomie pack 15.0
- 4 Indomie pack 5.0
- By default, for each set of duplicated values, the first occurrence
- is set to False and all others to True.
- >>> df.duplicated()
- 0 False
- 1 True
- 2 False
- 3 False
- 4 False
- dtype: bool
- By using 'last', the last occurrence of each set of duplicated values
- is set to False and all others to True.
- >>> df.duplicated(keep='last')
- 0 True
- 1 False
- 2 False
- 3 False
- 4 False
- dtype: bool
- By setting ``keep`` to False, all duplicates are marked True.
- >>> df.duplicated(keep=False)
- 0 True
- 1 True
- 2 False
- 3 False
- 4 False
- dtype: bool
- To find duplicates on specific column(s), use ``subset``.
- >>> df.duplicated(subset=['brand'])
- 0 False
- 1 True
- 2 False
- 3 True
- 4 True
- dtype: bool
- """
- if self.empty:
- return self._constructor_sliced(dtype=bool)
- def f(vals) -> tuple[np.ndarray, int]:
- labels, shape = algorithms.factorize(vals, size_hint=len(self))
- return labels.astype("i8", copy=False), len(shape)
- if subset is None:
- # https://github.com/pandas-dev/pandas/issues/28770
- # Incompatible types in assignment (expression has type "Index", variable
- # has type "Sequence[Any]")
- subset = self.columns # type: ignore[assignment]
- elif (
- not np.iterable(subset)
- or isinstance(subset, str)
- or isinstance(subset, tuple)
- and subset in self.columns
- ):
- subset = (subset,)
- # needed for mypy since can't narrow types using np.iterable
- subset = cast(Sequence, subset)
- # Verify all columns in subset exist in the queried dataframe
- # Otherwise, raise a KeyError, same as if you try to __getitem__ with a
- # key that doesn't exist.
- diff = set(subset) - set(self.columns)
- if diff:
- raise KeyError(Index(diff))
- if len(subset) == 1 and self.columns.is_unique:
- # GH#45236 This is faster than get_group_index below
- result = self[subset[0]].duplicated(keep)
- result.name = None
- else:
- vals = (col.values for name, col in self.items() if name in subset)
- labels, shape = map(list, zip(*map(f, vals)))
- ids = get_group_index(
- labels,
- # error: Argument 1 to "tuple" has incompatible type "List[_T]";
- # expected "Iterable[int]"
- tuple(shape), # type: ignore[arg-type]
- sort=False,
- xnull=False,
- )
- result = self._constructor_sliced(duplicated(ids, keep), index=self.index)
- return result.__finalize__(self, method="duplicated")
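- # Illustrative sketch (exposition only, not part of the implementation):
- # each subset column is factorized to integer codes and the per-row code
- # tuples are collapsed into a single group id, so row-duplicate detection
- # reduces to duplicate detection on a 1-D integer array:
- #
- #   >>> df = pd.DataFrame({"a": ["x", "x", "y"], "b": [1, 1, 2]})
- #   >>> codes_a, _ = pd.factorize(df["a"])  # array([0, 0, 1])
- #   >>> codes_b, _ = pd.factorize(df["b"])  # array([0, 0, 1])
- #   >>> ids = codes_a * 2 + codes_b  # 2 == number of uniques in "b"
- #   >>> ids  # array([0, 0, 3]); row 1 repeats row 0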
- # ----------------------------------------------------------------------
- # Sorting
- # error: Signature of "sort_values" incompatible with supertype "NDFrame"
- @overload # type: ignore[override]
- def sort_values(
- self,
- by: IndexLabel,
- *,
- axis: Axis = ...,
- ascending=...,
- inplace: Literal[False] = ...,
- kind: str = ...,
- na_position: str = ...,
- ignore_index: bool = ...,
- key: ValueKeyFunc = ...,
- ) -> DataFrame:
- ...
- @overload
- def sort_values(
- self,
- by: IndexLabel,
- *,
- axis: Axis = ...,
- ascending=...,
- inplace: Literal[True],
- kind: str = ...,
- na_position: str = ...,
- ignore_index: bool = ...,
- key: ValueKeyFunc = ...,
- ) -> None:
- ...
- # TODO: Just move the sort_values doc here.
- @Substitution(**_shared_doc_kwargs)
- @Appender(NDFrame.sort_values.__doc__)
- def sort_values(
- self,
- by: IndexLabel,
- *,
- axis: Axis = 0,
- ascending: bool | list[bool] | tuple[bool, ...] = True,
- inplace: bool = False,
- kind: str = "quicksort",
- na_position: str = "last",
- ignore_index: bool = False,
- key: ValueKeyFunc = None,
- ) -> DataFrame | None:
- inplace = validate_bool_kwarg(inplace, "inplace")
- axis = self._get_axis_number(axis)
- ascending = validate_ascending(ascending)
- if not isinstance(by, list):
- by = [by]
- # error: Argument 1 to "len" has incompatible type "Union[bool, List[bool]]";
- # expected "Sized"
- if is_sequence(ascending) and (
- len(by) != len(ascending) # type: ignore[arg-type]
- ):
- # error: Argument 1 to "len" has incompatible type "Union[bool,
- # List[bool]]"; expected "Sized"
- raise ValueError(
- f"Length of ascending ({len(ascending)})" # type: ignore[arg-type]
- f" != length of by ({len(by)})"
- )
- if len(by) > 1:
- keys = [self._get_label_or_level_values(x, axis=axis) for x in by]
- # need to rewrap columns in Series to apply key function
- if key is not None:
- # error: List comprehension has incompatible type List[Series];
- # expected List[ndarray]
- keys = [
- Series(k, name=name) # type: ignore[misc]
- for (k, name) in zip(keys, by)
- ]
- indexer = lexsort_indexer(
- keys, orders=ascending, na_position=na_position, key=key
- )
- elif len(by):
- # len(by) == 1
- by = by[0]
- k = self._get_label_or_level_values(by, axis=axis)
- # need to rewrap column in Series to apply key function
- if key is not None:
- # error: Incompatible types in assignment (expression has type
- # "Series", variable has type "ndarray")
- k = Series(k, name=by) # type: ignore[assignment]
- if isinstance(ascending, (tuple, list)):
- ascending = ascending[0]
- indexer = nargsort(
- k, kind=kind, ascending=ascending, na_position=na_position, key=key
- )
- else:
- if inplace:
- return self._update_inplace(self)
- else:
- return self.copy(deep=None)
- if is_range_indexer(indexer, len(indexer)):
- result = self.copy(deep=(not inplace and not using_copy_on_write()))
- if ignore_index:
- result.index = default_index(len(result))
- if inplace:
- return self._update_inplace(result)
- else:
- return result
- new_data = self._mgr.take(
- indexer, axis=self._get_block_manager_axis(axis), verify=False
- )
- if ignore_index:
- new_data.set_axis(
- self._get_block_manager_axis(axis), default_index(len(indexer))
- )
- result = self._constructor(new_data)
- if inplace:
- return self._update_inplace(result)
- else:
- return result.__finalize__(self, method="sort_values")
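- # Illustrative note: sorting by several keys is an indirect argsort over
- # the key columns, comparable to np.lexsort, which treats its *last* key
- # as the primary one:
- #
- #   >>> import numpy as np
- #   >>> by1, by2 = np.array([1, 1, 0]), np.array([5, 3, 9])
- #   >>> np.lexsort((by2, by1))  # primary key by1, ties broken by by2
- #   array([2, 1, 0])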
- @overload
- def sort_index(
- self,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool | Sequence[bool] = ...,
- inplace: Literal[True],
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool = ...,
- ignore_index: bool = ...,
- key: IndexKeyFunc = ...,
- ) -> None:
- ...
- @overload
- def sort_index(
- self,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool | Sequence[bool] = ...,
- inplace: Literal[False] = ...,
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool = ...,
- ignore_index: bool = ...,
- key: IndexKeyFunc = ...,
- ) -> DataFrame:
- ...
- @overload
- def sort_index(
- self,
- *,
- axis: Axis = ...,
- level: IndexLabel = ...,
- ascending: bool | Sequence[bool] = ...,
- inplace: bool = ...,
- kind: SortKind = ...,
- na_position: NaPosition = ...,
- sort_remaining: bool = ...,
- ignore_index: bool = ...,
- key: IndexKeyFunc = ...,
- ) -> DataFrame | None:
- ...
- def sort_index(
- self,
- *,
- axis: Axis = 0,
- level: IndexLabel = None,
- ascending: bool | Sequence[bool] = True,
- inplace: bool = False,
- kind: SortKind = "quicksort",
- na_position: NaPosition = "last",
- sort_remaining: bool = True,
- ignore_index: bool = False,
- key: IndexKeyFunc = None,
- ) -> DataFrame | None:
- """
- Sort object by labels (along an axis).
- Returns a new DataFrame sorted by label if `inplace` argument is
- ``False``, otherwise updates the original DataFrame and returns None.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis along which to sort. The value 0 identifies the rows,
- and 1 identifies the columns.
- level : int or level name or list of ints or list of level names
- If not None, sort on values in specified index level(s).
- ascending : bool or list-like of bools, default True
- Sort ascending vs. descending. When the index is a MultiIndex the
- sort direction can be controlled for each level individually.
- inplace : bool, default False
- Whether to modify the DataFrame rather than creating a new one.
- kind : {'quicksort', 'mergesort', 'heapsort', 'stable'}, default 'quicksort'
- Choice of sorting algorithm. See also :func:`numpy.sort` for more
- information. `mergesort` and `stable` are the only stable algorithms. For
- DataFrames, this option is only applied when sorting on a single
- column or label.
- na_position : {'first', 'last'}, default 'last'
- Puts NaNs at the beginning if `first`; `last` puts NaNs at the end.
- Not implemented for MultiIndex.
- sort_remaining : bool, default True
- If True and sorting by level and index is multilevel, sort by other
- levels too (in order) after sorting by specified level.
- ignore_index : bool, default False
- If True, the resulting axis will be labeled 0, 1, …, n - 1.
- key : callable, optional
- If not None, apply the key function to the index values
- before sorting. This is similar to the `key` argument in the
- builtin :meth:`sorted` function, with the notable difference that
- this `key` function should be *vectorized*. It should expect an
- ``Index`` and return an ``Index`` of the same shape. For MultiIndex
- inputs, the key is applied *per level*.
- .. versionadded:: 1.1.0
- Returns
- -------
- DataFrame or None
- The original DataFrame sorted by the labels or None if ``inplace=True``.
- See Also
- --------
- Series.sort_index : Sort Series by the index.
- DataFrame.sort_values : Sort DataFrame by the value.
- Series.sort_values : Sort Series by the value.
- Examples
- --------
- >>> df = pd.DataFrame([1, 2, 3, 4, 5], index=[100, 29, 234, 1, 150],
- ... columns=['A'])
- >>> df.sort_index()
- A
- 1 4
- 29 2
- 100 1
- 150 5
- 234 3
- By default, it sorts in ascending order; to sort in descending order,
- use ``ascending=False``.
- >>> df.sort_index(ascending=False)
- A
- 234 3
- 150 5
- 100 1
- 29 2
- 1 4
- A key function can be specified which is applied to the index before
- sorting. For a ``MultiIndex`` this is applied to each level separately.
- >>> df = pd.DataFrame({"a": [1, 2, 3, 4]}, index=['A', 'b', 'C', 'd'])
- >>> df.sort_index(key=lambda x: x.str.lower())
- a
- A 1
- b 2
- C 3
- d 4
- """
- return super().sort_index(
- axis=axis,
- level=level,
- ascending=ascending,
- inplace=inplace,
- kind=kind,
- na_position=na_position,
- sort_remaining=sort_remaining,
- ignore_index=ignore_index,
- key=key,
- )
- def value_counts(
- self,
- subset: Sequence[Hashable] | None = None,
- normalize: bool = False,
- sort: bool = True,
- ascending: bool = False,
- dropna: bool = True,
- ) -> Series:
- """
- Return a Series containing counts of unique rows in the DataFrame.
- .. versionadded:: 1.1.0
- Parameters
- ----------
- subset : label or list of labels, optional
- Columns to use when counting unique combinations.
- normalize : bool, default False
- Return proportions rather than frequencies.
- sort : bool, default True
- Sort by frequencies.
- ascending : bool, default False
- Sort in ascending order.
- dropna : bool, default True
- Don't include counts of rows that contain NA values.
- .. versionadded:: 1.3.0
- Returns
- -------
- Series
- See Also
- --------
- Series.value_counts: Equivalent method on Series.
- Notes
- -----
- The returned Series will have a MultiIndex with one level per input
- column but an Index (non-multi) for a single label. By default, rows
- that contain any NA values are omitted from the result. By default,
- the resulting Series will be in descending order so that the first
- element is the most frequently-occurring row.
- Examples
- --------
- >>> df = pd.DataFrame({'num_legs': [2, 4, 4, 6],
- ... 'num_wings': [2, 0, 0, 0]},
- ... index=['falcon', 'dog', 'cat', 'ant'])
- >>> df
- num_legs num_wings
- falcon 2 2
- dog 4 0
- cat 4 0
- ant 6 0
- >>> df.value_counts()
- num_legs num_wings
- 4 0 2
- 2 2 1
- 6 0 1
- Name: count, dtype: int64
- >>> df.value_counts(sort=False)
- num_legs num_wings
- 2 2 1
- 4 0 2
- 6 0 1
- Name: count, dtype: int64
- >>> df.value_counts(ascending=True)
- num_legs num_wings
- 2 2 1
- 6 0 1
- 4 0 2
- Name: count, dtype: int64
- >>> df.value_counts(normalize=True)
- num_legs num_wings
- 4 0 0.50
- 2 2 0.25
- 6 0 0.25
- Name: proportion, dtype: float64
- With `dropna` set to `False` we can also count rows with NA values.
- >>> df = pd.DataFrame({'first_name': ['John', 'Anne', 'John', 'Beth'],
- ... 'middle_name': ['Smith', pd.NA, pd.NA, 'Louise']})
- >>> df
- first_name middle_name
- 0 John Smith
- 1 Anne <NA>
- 2 John <NA>
- 3 Beth Louise
- >>> df.value_counts()
- first_name middle_name
- Beth Louise 1
- John Smith 1
- Name: count, dtype: int64
- >>> df.value_counts(dropna=False)
- first_name middle_name
- Anne NaN 1
- Beth Louise 1
- John Smith 1
- NaN 1
- Name: count, dtype: int64
- >>> df.value_counts("first_name")
- first_name
- John 2
- Anne 1
- Beth 1
- Name: count, dtype: int64
- """
- if subset is None:
- subset = self.columns.tolist()
- name = "proportion" if normalize else "count"
- counts = self.groupby(subset, dropna=dropna).grouper.size()
- counts.name = name
- if sort:
- counts = counts.sort_values(ascending=ascending)
- if normalize:
- counts /= counts.sum()
- # Force MultiIndex for single column
- if is_list_like(subset) and len(subset) == 1:
- counts.index = MultiIndex.from_arrays(
- [counts.index], names=[counts.index.name]
- )
- return counts
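- # Illustrative note: modulo index details, value_counts amounts to a
- # groupby-size, roughly
- #
- #   >>> df.groupby(list(df.columns), dropna=True).size().sort_values(
- #   ...     ascending=False)
- #
- # with a division by the total count when ``normalize=True``.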
- def nlargest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
- """
- Return the first `n` rows ordered by `columns` in descending order.
- Return the first `n` rows with the largest values in `columns`, in
- descending order. The columns that are not specified are returned as
- well, but not used for ordering.
- This method is equivalent to
- ``df.sort_values(columns, ascending=False).head(n)``, but more
- performant.
- Parameters
- ----------
- n : int
- Number of rows to return.
- columns : label or list of labels
- Column label(s) to order by.
- keep : {'first', 'last', 'all'}, default 'first'
- Where there are duplicate values:
- - ``first`` : prioritize the first occurrence(s)
- - ``last`` : prioritize the last occurrence(s)
- - ``all`` : do not drop any duplicates, even if it means
- selecting more than `n` items.
- Returns
- -------
- DataFrame
- The first `n` rows ordered by the given columns in descending
- order.
- See Also
- --------
- DataFrame.nsmallest : Return the first `n` rows ordered by `columns` in
- ascending order.
- DataFrame.sort_values : Sort DataFrame by the values.
- DataFrame.head : Return the first `n` rows without re-ordering.
- Notes
- -----
- This function cannot be used with all column types. For example, when
- specifying columns with `object` or `category` dtypes, ``TypeError`` is
- raised.
- Examples
- --------
- >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
- ... 434000, 434000, 337000, 11300,
- ... 11300, 11300],
- ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
- ... 17036, 182, 38, 311],
- ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
- ... "IS", "NR", "TV", "AI"]},
- ... index=["Italy", "France", "Malta",
- ... "Maldives", "Brunei", "Iceland",
- ... "Nauru", "Tuvalu", "Anguilla"])
- >>> df
- population GDP alpha-2
- Italy 59000000 1937894 IT
- France 65000000 2583560 FR
- Malta 434000 12011 MT
- Maldives 434000 4520 MV
- Brunei 434000 12128 BN
- Iceland 337000 17036 IS
- Nauru 11300 182 NR
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- In the following example, we will use ``nlargest`` to select the three
- rows having the largest values in column "population".
- >>> df.nlargest(3, 'population')
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Malta 434000 12011 MT
- When using ``keep='last'``, ties are resolved in reverse order:
- >>> df.nlargest(3, 'population', keep='last')
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Brunei 434000 12128 BN
- When using ``keep='all'``, all duplicate items are maintained:
- >>> df.nlargest(3, 'population', keep='all')
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Malta 434000 12011 MT
- Maldives 434000 4520 MV
- Brunei 434000 12128 BN
- To order by the largest values in column "population" and then "GDP",
- we can specify multiple columns like in the next example.
- >>> df.nlargest(3, ['population', 'GDP'])
- population GDP alpha-2
- France 65000000 2583560 FR
- Italy 59000000 1937894 IT
- Brunei 434000 12128 BN
- """
- return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nlargest()
- def nsmallest(self, n: int, columns: IndexLabel, keep: str = "first") -> DataFrame:
- """
- Return the first `n` rows ordered by `columns` in ascending order.
- Return the first `n` rows with the smallest values in `columns`, in
- ascending order. The columns that are not specified are returned as
- well, but not used for ordering.
- This method is equivalent to
- ``df.sort_values(columns, ascending=True).head(n)``, but more
- performant.
- Parameters
- ----------
- n : int
- Number of items to retrieve.
- columns : list or str
- Column name or names to order by.
- keep : {'first', 'last', 'all'}, default 'first'
- Where there are duplicate values:
- - ``first`` : take the first occurrence.
- - ``last`` : take the last occurrence.
- - ``all`` : do not drop any duplicates, even if it means
- selecting more than `n` items.
- Returns
- -------
- DataFrame
- See Also
- --------
- DataFrame.nlargest : Return the first `n` rows ordered by `columns` in
- descending order.
- DataFrame.sort_values : Sort DataFrame by the values.
- DataFrame.head : Return the first `n` rows without re-ordering.
- Examples
- --------
- >>> df = pd.DataFrame({'population': [59000000, 65000000, 434000,
- ... 434000, 434000, 337000, 337000,
- ... 11300, 11300],
- ... 'GDP': [1937894, 2583560 , 12011, 4520, 12128,
- ... 17036, 182, 38, 311],
- ... 'alpha-2': ["IT", "FR", "MT", "MV", "BN",
- ... "IS", "NR", "TV", "AI"]},
- ... index=["Italy", "France", "Malta",
- ... "Maldives", "Brunei", "Iceland",
- ... "Nauru", "Tuvalu", "Anguilla"])
- >>> df
- population GDP alpha-2
- Italy 59000000 1937894 IT
- France 65000000 2583560 FR
- Malta 434000 12011 MT
- Maldives 434000 4520 MV
- Brunei 434000 12128 BN
- Iceland 337000 17036 IS
- Nauru 337000 182 NR
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- In the following example, we will use ``nsmallest`` to select the
- three rows having the smallest values in column "population".
- >>> df.nsmallest(3, 'population')
- population GDP alpha-2
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- Iceland 337000 17036 IS
- When using ``keep='last'``, ties are resolved in reverse order:
- >>> df.nsmallest(3, 'population', keep='last')
- population GDP alpha-2
- Anguilla 11300 311 AI
- Tuvalu 11300 38 TV
- Nauru 337000 182 NR
- When using ``keep='all'``, all duplicate items are maintained:
- >>> df.nsmallest(3, 'population', keep='all')
- population GDP alpha-2
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- Iceland 337000 17036 IS
- Nauru 337000 182 NR
- To order by the smallest values in column "population" and then "GDP", we can
- specify multiple columns like in the next example.
- >>> df.nsmallest(3, ['population', 'GDP'])
- population GDP alpha-2
- Tuvalu 11300 38 TV
- Anguilla 11300 311 AI
- Nauru 337000 182 NR
- """
- return selectn.SelectNFrame(self, n=n, keep=keep, columns=columns).nsmallest()
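- # Illustrative note: SelectNFrame performs a partial selection of the n
- # extreme rows rather than a full sort, which is why nlargest/nsmallest
- # can beat the equivalent ``sort_values(...).head(n)``.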
- @doc(
- Series.swaplevel,
- klass=_shared_doc_kwargs["klass"],
- extra_params=dedent(
- """axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to swap levels on. 0 or 'index' for row-wise, 1 or
- 'columns' for column-wise."""
- ),
- examples=dedent(
- """\
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {"Grade": ["A", "B", "A", "C"]},
- ... index=[
- ... ["Final exam", "Final exam", "Coursework", "Coursework"],
- ... ["History", "Geography", "History", "Geography"],
- ... ["January", "February", "March", "April"],
- ... ],
- ... )
- >>> df
- Grade
- Final exam History January A
- Geography February B
- Coursework History March A
- Geography April C
- In the following example, we will swap the levels of the row index.
- Here, we swap the levels row-wise, but levels can be swapped column-wise
- in a similar manner. Note that row-wise is the default behaviour.
- By not supplying any arguments for i and j, we swap the last and
- second-to-last levels.
- >>> df.swaplevel()
- Grade
- Final exam January History A
- February Geography B
- Coursework March History A
- April Geography C
- By supplying one argument, we can choose which level to swap the last
- level with. We can, for example, swap the first level with the last one as
- follows.
- >>> df.swaplevel(0)
- Grade
- January History Final exam A
- February Geography Final exam B
- March History Coursework A
- April Geography Coursework C
- We can also define explicitly which levels we want to swap by supplying values
- for both i and j. Here, for example, we swap the first and second levels.
- >>> df.swaplevel(0, 1)
- Grade
- History Final exam January A
- Geography Final exam February B
- History Coursework March A
- Geography Coursework April C"""
- ),
- )
- def swaplevel(self, i: Axis = -2, j: Axis = -1, axis: Axis = 0) -> DataFrame:
- result = self.copy(deep=None)
- axis = self._get_axis_number(axis)
- if not isinstance(result._get_axis(axis), MultiIndex): # pragma: no cover
- raise TypeError("Can only swap levels on a hierarchical axis.")
- if axis == 0:
- assert isinstance(result.index, MultiIndex)
- result.index = result.index.swaplevel(i, j)
- else:
- assert isinstance(result.columns, MultiIndex)
- result.columns = result.columns.swaplevel(i, j)
- return result
- def reorder_levels(self, order: Sequence[int | str], axis: Axis = 0) -> DataFrame:
- """
- Rearrange index levels using input order. May not drop or duplicate levels.
- Parameters
- ----------
- order : list of int or list of str
- List representing new level order. Reference level by number
- (position) or by key (label).
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Where to reorder levels.
- Returns
- -------
- DataFrame
- Examples
- --------
- >>> data = {
- ... "class": ["Mammals", "Mammals", "Reptiles"],
- ... "diet": ["Omnivore", "Carnivore", "Carnivore"],
- ... "species": ["Humans", "Dogs", "Snakes"],
- ... }
- >>> df = pd.DataFrame(data, columns=["class", "diet", "species"])
- >>> df = df.set_index(["class", "diet"])
- >>> df
- species
- class diet
- Mammals Omnivore Humans
- Carnivore Dogs
- Reptiles Carnivore Snakes
- Let's reorder the levels of the index:
- >>> df.reorder_levels(["diet", "class"])
- species
- diet class
- Omnivore Mammals Humans
- Carnivore Mammals Dogs
- Reptiles Snakes
- """
- axis = self._get_axis_number(axis)
- if not isinstance(self._get_axis(axis), MultiIndex): # pragma: no cover
- raise TypeError("Can only reorder levels on a hierarchical axis.")
- result = self.copy(deep=None)
- if axis == 0:
- assert isinstance(result.index, MultiIndex)
- result.index = result.index.reorder_levels(order)
- else:
- assert isinstance(result.columns, MultiIndex)
- result.columns = result.columns.reorder_levels(order)
- return result
- # ----------------------------------------------------------------------
- # Arithmetic Methods
- def _cmp_method(self, other, op):
- axis: Literal[1] = 1 # only relevant for Series other case
- self, other = ops.align_method_FRAME(self, other, axis, flex=False, level=None)
- # See GH#4537 for discussion of scalar op behavior
- new_data = self._dispatch_frame_op(other, op, axis=axis)
- return self._construct_result(new_data)
- def _arith_method(self, other, op):
- if ops.should_reindex_frame_op(self, other, op, 1, None, None):
- return ops.frame_arith_method_with_reindex(self, other, op)
- axis: Literal[1] = 1 # only relevant for Series other case
- other = ops.maybe_prepare_scalar_for_op(other, (self.shape[axis],))
- self, other = ops.align_method_FRAME(self, other, axis, flex=True, level=None)
- new_data = self._dispatch_frame_op(other, op, axis=axis)
- return self._construct_result(new_data)
- _logical_method = _arith_method
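- # Illustrative note on the reindexing fast path above: with partially
- # overlapping columns the operation is computed on the column
- # intersection and then reindexed to the union, e.g.
- #
- #   >>> df1 = pd.DataFrame({"a": [1], "b": [2]})
- #   >>> df2 = pd.DataFrame({"b": [3], "c": [4]})
- #   >>> df1 + df2
- #       a  b   c
- #   0 NaN  5 NaN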
- def _dispatch_frame_op(self, right, func: Callable, axis: AxisInt | None = None):
- """
- Evaluate the frame operation func(left, right) by evaluating
- column-by-column, dispatching to the Series implementation.
- Parameters
- ----------
- right : scalar, Series, or DataFrame
- func : arithmetic or comparison operator
- axis : {None, 0, 1}
- Returns
- -------
- DataFrame
- """
- # Get the appropriate array-op to apply to each column/block's values.
- array_op = ops.get_array_op(func)
- right = lib.item_from_zerodim(right)
- if not is_list_like(right):
- # i.e. scalar, faster than checking np.ndim(right) == 0
- with np.errstate(all="ignore"):
- bm = self._mgr.apply(array_op, right=right)
- return self._constructor(bm)
- elif isinstance(right, DataFrame):
- assert self.index.equals(right.index)
- assert self.columns.equals(right.columns)
- # TODO: The previous assertion `assert right._indexed_same(self)`
- # fails in cases with empty columns reached via
- # _frame_arith_method_with_reindex
- # TODO operate_blockwise expects a manager of the same type
- with np.errstate(all="ignore"):
- bm = self._mgr.operate_blockwise(
- # error: Argument 1 to "operate_blockwise" of "ArrayManager" has
- # incompatible type "Union[ArrayManager, BlockManager]"; expected
- # "ArrayManager"
- # error: Argument 1 to "operate_blockwise" of "BlockManager" has
- # incompatible type "Union[ArrayManager, BlockManager]"; expected
- # "BlockManager"
- right._mgr, # type: ignore[arg-type]
- array_op,
- )
- return self._constructor(bm)
- elif isinstance(right, Series) and axis == 1:
- # axis=1 means we want to operate row-by-row
- assert right.index.equals(self.columns)
- right = right._values
- # maybe_align_as_frame ensures we do not have an ndarray here
- assert not isinstance(right, np.ndarray)
- with np.errstate(all="ignore"):
- arrays = [
- array_op(_left, _right)
- for _left, _right in zip(self._iter_column_arrays(), right)
- ]
- elif isinstance(right, Series):
- assert right.index.equals(self.index) # Handle other cases later
- right = right._values
- with np.errstate(all="ignore"):
- arrays = [array_op(left, right) for left in self._iter_column_arrays()]
- else:
- # Remaining cases have less-obvious dispatch rules
- raise NotImplementedError(right)
- return type(self)._from_arrays(
- arrays, self.columns, self.index, verify_integrity=False
- )
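- # Illustrative note: in the ``axis == 1`` branch above, ``right`` is
- # aligned to the columns, so the i-th column array is combined with the
- # i-th element of ``right`` -- the same row-wise broadcasting exposed by
- # ``df.add(ser, axis="columns")``.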
- def _combine_frame(self, other: DataFrame, func, fill_value=None):
- # at this point we have `self._indexed_same(other)`
- if fill_value is None:
- # since _arith_op may be called in a loop, avoid function call
- # overhead if possible by doing this check once
- _arith_op = func
- else:
- def _arith_op(left, right):
- # for the mixed_type case where we iterate over columns,
- # _arith_op(left, right) is equivalent to
- # left._binop(right, func, fill_value=fill_value)
- left, right = ops.fill_binop(left, right, fill_value)
- return func(left, right)
- new_data = self._dispatch_frame_op(other, _arith_op)
- return new_data
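- # Illustrative note on ops.fill_binop: the fill is one-sided, so positions
- # missing on *both* sides stay missing, mirroring the public behaviour:
- #
- #   >>> s1 = pd.Series([1.0, np.nan, np.nan])
- #   >>> s2 = pd.Series([1.0, 1.0, np.nan])
- #   >>> s1.add(s2, fill_value=0)
- #   0    2.0
- #   1    1.0
- #   2    NaN
- #   dtype: float64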
- def _construct_result(self, result) -> DataFrame:
- """
- Wrap the result of an arithmetic, comparison, or logical operation.
- Parameters
- ----------
- result : DataFrame
- Returns
- -------
- DataFrame
- """
- out = self._constructor(result, copy=False).__finalize__(self)
- # Pin columns instead of passing to constructor for compat with
- # non-unique columns case
- out.columns = self.columns
- out.index = self.index
- return out
- def __divmod__(self, other) -> tuple[DataFrame, DataFrame]:
- # Naive implementation, room for optimization
- div = self // other
- mod = self - div * other
- return div, mod
- def __rdivmod__(self, other) -> tuple[DataFrame, DataFrame]:
- # Naive implementation, room for optimization
- div = other // self
- mod = other - div * self
- return div, mod
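- # Illustrative check of the identity used above: floor division and modulo
- # satisfy ``df == (df // other) * other + (df % other)`` elementwise:
- #
- #   >>> df = pd.DataFrame({"a": [7, -7]})
- #   >>> div, mod = divmod(df, 3)
- #   >>> (div * 3 + mod).equals(df)
- #   True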
- # ----------------------------------------------------------------------
- # Combination-Related
- @doc(
- _shared_docs["compare"],
- """
- Returns
- -------
- DataFrame
- DataFrame that shows the differences stacked side by side.
- The resulting index will be a MultiIndex with 'self' and 'other'
- stacked alternately at the inner level.
- Raises
- ------
- ValueError
- When the two DataFrames don't have identical labels or shape.
- See Also
- --------
- Series.compare : Compare with another Series and show differences.
- DataFrame.equals : Test whether two objects contain the same elements.
- Notes
- -----
- Matching NaNs will not appear as a difference.
- Can only compare identically-labeled
- (i.e. same shape, identical row and column labels) DataFrames.
- Examples
- --------
- >>> df = pd.DataFrame(
- ... {{
- ... "col1": ["a", "a", "b", "b", "a"],
- ... "col2": [1.0, 2.0, 3.0, np.nan, 5.0],
- ... "col3": [1.0, 2.0, 3.0, 4.0, 5.0]
- ... }},
- ... columns=["col1", "col2", "col3"],
- ... )
- >>> df
- col1 col2 col3
- 0 a 1.0 1.0
- 1 a 2.0 2.0
- 2 b 3.0 3.0
- 3 b NaN 4.0
- 4 a 5.0 5.0
- >>> df2 = df.copy()
- >>> df2.loc[0, 'col1'] = 'c'
- >>> df2.loc[2, 'col3'] = 4.0
- >>> df2
- col1 col2 col3
- 0 c 1.0 1.0
- 1 a 2.0 2.0
- 2 b 3.0 4.0
- 3 b NaN 4.0
- 4 a 5.0 5.0
- Align the differences on columns
- >>> df.compare(df2)
- col1 col3
- self other self other
- 0 a c NaN NaN
- 2 NaN NaN 3.0 4.0
- Assign result_names
- >>> df.compare(df2, result_names=("left", "right"))
- col1 col3
- left right left right
- 0 a c NaN NaN
- 2 NaN NaN 3.0 4.0
- Stack the differences on rows
- >>> df.compare(df2, align_axis=0)
- col1 col3
- 0 self a NaN
- other c NaN
- 2 self NaN 3.0
- other NaN 4.0
- Keep the equal values
- >>> df.compare(df2, keep_equal=True)
- col1 col3
- self other self other
- 0 a c 1.0 1.0
- 2 b b 3.0 4.0
- Keep all original rows and columns
- >>> df.compare(df2, keep_shape=True)
- col1 col2 col3
- self other self other self other
- 0 a c NaN NaN NaN NaN
- 1 NaN NaN NaN NaN NaN NaN
- 2 NaN NaN NaN NaN 3.0 4.0
- 3 NaN NaN NaN NaN NaN NaN
- 4 NaN NaN NaN NaN NaN NaN
- Keep all original rows and columns and also all original values
- >>> df.compare(df2, keep_shape=True, keep_equal=True)
- col1 col2 col3
- self other self other self other
- 0 a c 1.0 1.0 1.0 1.0
- 1 a a 2.0 2.0 2.0 2.0
- 2 b b 3.0 3.0 3.0 4.0
- 3 b b NaN NaN 4.0 4.0
- 4 a a 5.0 5.0 5.0 5.0
- """,
- klass=_shared_doc_kwargs["klass"],
- )
- def compare(
- self,
- other: DataFrame,
- align_axis: Axis = 1,
- keep_shape: bool = False,
- keep_equal: bool = False,
- result_names: Suffixes = ("self", "other"),
- ) -> DataFrame:
- return super().compare(
- other=other,
- align_axis=align_axis,
- keep_shape=keep_shape,
- keep_equal=keep_equal,
- result_names=result_names,
- )
- def combine(
- self,
- other: DataFrame,
- func: Callable[[Series, Series], Series | Hashable],
- fill_value=None,
- overwrite: bool = True,
- ) -> DataFrame:
- """
- Perform column-wise combine with another DataFrame.
- Combines a DataFrame with `other` DataFrame using `func`
- to element-wise combine columns. The row and column indexes of the
- resulting DataFrame will be the union of the two.
- Parameters
- ----------
- other : DataFrame
- The DataFrame to merge column-wise.
- func : function
- Function that takes two series as inputs and returns a Series or a
- scalar. Used to merge the two dataframes column by column.
- fill_value : scalar value, default None
- The value to fill NaNs with prior to passing any column to the
- merge func.
- overwrite : bool, default True
- If True, columns in `self` that do not exist in `other` will be
- overwritten with NaNs.
- Returns
- -------
- DataFrame
- Combination of the provided DataFrames.
- See Also
- --------
- DataFrame.combine_first : Combine two DataFrame objects and default to
- non-null values in frame calling the method.
- Examples
- --------
- Combine using a simple function that chooses the smaller column.
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> take_smaller = lambda s1, s2: s1 if s1.sum() < s2.sum() else s2
- >>> df1.combine(df2, take_smaller)
- A B
- 0 0 3
- 1 0 3
- Example using a true element-wise combine function.
- >>> df1 = pd.DataFrame({'A': [5, 0], 'B': [2, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> df1.combine(df2, np.minimum)
- A B
- 0 1 2
- 1 0 3
- Using `fill_value` fills Nones prior to passing the column to the
- merge function.
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> df1.combine(df2, take_smaller, fill_value=-5)
- A B
- 0 0 -5.0
- 1 0 4.0
- However, if the same element in both dataframes is None, that None
- is preserved
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [None, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [None, 3]})
- >>> df1.combine(df2, take_smaller, fill_value=-5)
- A B
- 0 0 -5.0
- 1 0 3.0
- Example that demonstrates the use of `overwrite` and the behavior when
- the axes differ between the dataframes.
- >>> df1 = pd.DataFrame({'A': [0, 0], 'B': [4, 4]})
- >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [-10, 1], }, index=[1, 2])
- >>> df1.combine(df2, take_smaller)
- A B C
- 0 NaN NaN NaN
- 1 NaN 3.0 -10.0
- 2 NaN 3.0 1.0
- >>> df1.combine(df2, take_smaller, overwrite=False)
- A B C
- 0 0.0 NaN NaN
- 1 0.0 3.0 -10.0
- 2 NaN 3.0 1.0
- Demonstrating the preference of the passed-in dataframe.
- >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1], }, index=[1, 2])
- >>> df2.combine(df1, take_smaller)
- A B C
- 0 0.0 NaN NaN
- 1 0.0 3.0 NaN
- 2 NaN 3.0 NaN
- >>> df2.combine(df1, take_smaller, overwrite=False)
- A B C
- 0 0.0 NaN NaN
- 1 0.0 3.0 1.0
- 2 NaN 3.0 1.0
- """
- other_idxlen = len(other.index) # save for compare
- this, other = self.align(other, copy=False)
- new_index = this.index
- if other.empty and len(new_index) == len(self.index):
- return self.copy()
- if self.empty and len(other) == other_idxlen:
- return other.copy()
- # sorts if possible; otherwise align above ensures that these are set-equal
- new_columns = this.columns.union(other.columns)
- do_fill = fill_value is not None
- result = {}
- for col in new_columns:
- series = this[col]
- other_series = other[col]
- this_dtype = series.dtype
- other_dtype = other_series.dtype
- this_mask = isna(series)
- other_mask = isna(other_series)
- # don't overwrite columns unnecessarily
- # DO propagate if this column is not in the intersection
- if not overwrite and other_mask.all():
- result[col] = this[col].copy()
- continue
- if do_fill:
- series = series.copy()
- other_series = other_series.copy()
- series[this_mask] = fill_value
- other_series[other_mask] = fill_value
- if col not in self.columns:
- # If col exists only in `other`, the aligned `series` is all NaN;
- # try to cast it to `other_dtype`.
- new_dtype = other_dtype
- try:
- series = series.astype(new_dtype, copy=False)
- except ValueError:
- # e.g. new_dtype is integer types
- pass
- else:
- # if we have different dtypes, possibly promote
- new_dtype = find_common_type([this_dtype, other_dtype])
- series = series.astype(new_dtype, copy=False)
- other_series = other_series.astype(new_dtype, copy=False)
- arr = func(series, other_series)
- if isinstance(new_dtype, np.dtype):
- # if new_dtype is an EA Dtype, then `func` is expected to return
- # the correct dtype without any additional casting
- # error: No overload variant of "maybe_downcast_to_dtype" matches
- # argument types "Union[Series, Hashable]", "dtype[Any]"
- arr = maybe_downcast_to_dtype( # type: ignore[call-overload]
- arr, new_dtype
- )
- result[col] = arr
- # convert_objects just in case
- return self._constructor(result, index=new_index, columns=new_columns)
- def combine_first(self, other: DataFrame) -> DataFrame:
- """
- Update null elements with value in the same location in `other`.
- Combine two DataFrame objects by filling null values in one DataFrame
- with non-null values from other DataFrame. The row and column indexes
- of the resulting DataFrame will be the union of the two. Upon calling
- ``first.combine_first(second)``, the resulting dataframe contains the
- values of `first`, overriding the values of `second` wherever both
- ``first.loc[index, col]`` and ``second.loc[index, col]`` are not
- missing values.
- Parameters
- ----------
- other : DataFrame
- Provided DataFrame to use to fill null values.
- Returns
- -------
- DataFrame
- The result of combining the provided DataFrame with the other object.
- See Also
- --------
- DataFrame.combine : Perform series-wise operation on two DataFrames
- using a given function.
- Examples
- --------
- >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [None, 4]})
- >>> df2 = pd.DataFrame({'A': [1, 1], 'B': [3, 3]})
- >>> df1.combine_first(df2)
- A B
- 0 1.0 3.0
- 1 0.0 4.0
- Null values still persist if the location of that null value
- does not exist in `other`
- >>> df1 = pd.DataFrame({'A': [None, 0], 'B': [4, None]})
- >>> df2 = pd.DataFrame({'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
- >>> df1.combine_first(df2)
- A B C
- 0 NaN 4.0 NaN
- 1 0.0 3.0 1.0
- 2 NaN 3.0 1.0
- """
- from pandas.core.computation import expressions
- def combiner(x, y):
- mask = extract_array(isna(x))
- x_values = extract_array(x, extract_numpy=True)
- y_values = extract_array(y, extract_numpy=True)
- # If the column y in other DataFrame is not in first DataFrame,
- # just return y_values.
- if y.name not in self.columns:
- return y_values
- return expressions.where(mask, y_values, x_values)
- combined = self.combine(other, combiner, overwrite=False)
- dtypes = {
- col: find_common_type([self.dtypes[col], other.dtypes[col]])
- for col in self.columns.intersection(other.columns)
- if not is_dtype_equal(combined.dtypes[col], self.dtypes[col])
- }
- if dtypes:
- combined = combined.astype(dtypes)
- return combined
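- # Illustrative note: per column, the combiner above is a null-mask select,
- # equivalent in spirit to
- #
- #   >>> x = pd.Series([np.nan, 0.0])
- #   >>> y = pd.Series([1.0, 1.0])
- #   >>> x.where(x.notna(), y)  # take y only where x is null
- #   0    1.0
- #   1    0.0
- #   dtype: float64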
- def update(
- self,
- other,
- join: str = "left",
- overwrite: bool = True,
- filter_func=None,
- errors: str = "ignore",
- ) -> None:
- """
- Modify in place using non-NA values from another DataFrame.
- Aligns on indices. There is no return value.
- Parameters
- ----------
- other : DataFrame, or object coercible into a DataFrame
- Should have at least one matching index/column label
- with the original DataFrame. If a Series is passed,
- its name attribute must be set, and that will be
- used as the column name to align with the original DataFrame.
- join : {'left'}, default 'left'
- Only left join is implemented, keeping the index and columns of the
- original object.
- overwrite : bool, default True
- How to handle non-NA values for overlapping keys:
- * True: overwrite original DataFrame's values
- with values from `other`.
- * False: only update values that are NA in
- the original DataFrame.
- filter_func : callable(1d-array) -> bool 1d-array, optional
- Can choose to replace values other than NA. Return True for values
- that should be updated.
- errors : {'raise', 'ignore'}, default 'ignore'
- If 'raise', will raise a ValueError if the DataFrame and `other`
- both contain non-NA data in the same place.
- Returns
- -------
- None
- This method directly changes calling object.
- Raises
- ------
- ValueError
- * When `errors='raise'` and there's overlapping non-NA data.
- * When `errors` is not either `'ignore'` or `'raise'`
- NotImplementedError
- * If `join != 'left'`
- See Also
- --------
- dict.update : Similar method for dictionaries.
- DataFrame.merge : For column(s)-on-column(s) operations.
- Examples
- --------
- >>> df = pd.DataFrame({'A': [1, 2, 3],
- ... 'B': [400, 500, 600]})
- >>> new_df = pd.DataFrame({'B': [4, 5, 6],
- ... 'C': [7, 8, 9]})
- >>> df.update(new_df)
- >>> df
- A B
- 0 1 4
- 1 2 5
- 2 3 6
- The DataFrame's length does not increase as a result of the update;
- only values at matching index/column labels are updated.
- >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
- ... 'B': ['x', 'y', 'z']})
- >>> new_df = pd.DataFrame({'B': ['d', 'e', 'f', 'g', 'h', 'i']})
- >>> df.update(new_df)
- >>> df
- A B
- 0 a d
- 1 b e
- 2 c f
- For a Series, its name attribute must be set.
- >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
- ... 'B': ['x', 'y', 'z']})
- >>> new_column = pd.Series(['d', 'e'], name='B', index=[0, 2])
- >>> df.update(new_column)
- >>> df
- A B
- 0 a d
- 1 b y
- 2 c e
- >>> df = pd.DataFrame({'A': ['a', 'b', 'c'],
- ... 'B': ['x', 'y', 'z']})
- >>> new_df = pd.DataFrame({'B': ['d', 'e']}, index=[1, 2])
- >>> df.update(new_df)
- >>> df
- A B
- 0 a x
- 1 b d
- 2 c e
- If `other` contains NaNs, the corresponding values are not updated
- in the original dataframe.
- >>> df = pd.DataFrame({'A': [1, 2, 3],
- ... 'B': [400, 500, 600]})
- >>> new_df = pd.DataFrame({'B': [4, np.nan, 6]})
- >>> df.update(new_df)
- >>> df
- A B
- 0 1 4
- 1 2 500
- 2 3 6
- """
- from pandas.core.computation import expressions
- # TODO: Support other joins
- if join != "left": # pragma: no cover
- raise NotImplementedError("Only left join is supported")
- if errors not in ["ignore", "raise"]:
- raise ValueError("The parameter errors must be either 'ignore' or 'raise'")
- if not isinstance(other, DataFrame):
- other = DataFrame(other)
- other = other.reindex(self.index)
- for col in self.columns.intersection(other.columns):
- this = self[col]._values
- that = other[col]._values
- if filter_func is not None:
- with np.errstate(all="ignore"):
- mask = ~filter_func(this) | isna(that)
- else:
- if errors == "raise":
- mask_this = notna(that)
- mask_that = notna(this)
- if any(mask_this & mask_that):
- raise ValueError("Data overlaps.")
- if overwrite:
- mask = isna(that)
- else:
- mask = notna(this)
- # don't overwrite columns unnecessarily
- if mask.all():
- continue
- self.loc[:, col] = expressions.where(mask, this, that)
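- # Illustrative note: with the defaults (overwrite=True, no filter_func)
- # the loop above amounts to a per-column
- #
- #   self.loc[:, col] = np.where(isna(that), this, that)
- #
- # i.e. keep the existing value exactly where ``other`` is missing.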
- # ----------------------------------------------------------------------
- # Data reshaping
- @Appender(
- """
- Examples
- --------
- >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
- ... 'Parrot', 'Parrot'],
- ... 'Max Speed': [380., 370., 24., 26.]})
- >>> df
- Animal Max Speed
- 0 Falcon 380.0
- 1 Falcon 370.0
- 2 Parrot 24.0
- 3 Parrot 26.0
- >>> df.groupby(['Animal']).mean()
- Max Speed
- Animal
- Falcon 375.0
- Parrot 25.0
- **Hierarchical Indexes**
- We can groupby different levels of a hierarchical index
- using the `level` parameter:
- >>> arrays = [['Falcon', 'Falcon', 'Parrot', 'Parrot'],
- ... ['Captive', 'Wild', 'Captive', 'Wild']]
- >>> index = pd.MultiIndex.from_arrays(arrays, names=('Animal', 'Type'))
- >>> df = pd.DataFrame({'Max Speed': [390., 350., 30., 20.]},
- ... index=index)
- >>> df
- Max Speed
- Animal Type
- Falcon Captive 390.0
- Wild 350.0
- Parrot Captive 30.0
- Wild 20.0
- >>> df.groupby(level=0).mean()
- Max Speed
- Animal
- Falcon 370.0
- Parrot 25.0
- >>> df.groupby(level="Type").mean()
- Max Speed
- Type
- Captive 210.0
- Wild 185.0
- We can also choose to include NA in group keys or not by setting the
- `dropna` parameter; the default setting is `True`.
- >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
- >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
- >>> df.groupby(by=["b"]).sum()
- a c
- b
- 1.0 2 3
- 2.0 2 5
- >>> df.groupby(by=["b"], dropna=False).sum()
- a c
- b
- 1.0 2 3
- 2.0 2 5
- NaN 1 4
- >>> l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
- >>> df = pd.DataFrame(l, columns=["a", "b", "c"])
- >>> df.groupby(by="a").sum()
- b c
- a
- a 13.0 13.0
- b 12.3 123.0
- >>> df.groupby(by="a", dropna=False).sum()
- b c
- a
- a 13.0 13.0
- b 12.3 123.0
- NaN 12.3 33.0
- When using ``.apply()``, use ``group_keys`` to include or exclude the group keys.
- The ``group_keys`` argument defaults to ``True`` (include).
- >>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
- ... 'Parrot', 'Parrot'],
- ... 'Max Speed': [380., 370., 24., 26.]})
- >>> df.groupby("Animal", group_keys=True).apply(lambda x: x)
- Animal Max Speed
- Animal
- Falcon 0 Falcon 380.0
- 1 Falcon 370.0
- Parrot 2 Parrot 24.0
- 3 Parrot 26.0
- >>> df.groupby("Animal", group_keys=False).apply(lambda x: x)
- Animal Max Speed
- 0 Falcon 380.0
- 1 Falcon 370.0
- 2 Parrot 24.0
- 3 Parrot 26.0
- """
- )
- @Appender(_shared_docs["groupby"] % _shared_doc_kwargs)
- def groupby(
- self,
- by=None,
- axis: Axis = 0,
- level: IndexLabel | None = None,
- as_index: bool = True,
- sort: bool = True,
- group_keys: bool = True,
- observed: bool = False,
- dropna: bool = True,
- ) -> DataFrameGroupBy:
- from pandas.core.groupby.generic import DataFrameGroupBy
- if level is None and by is None:
- raise TypeError("You have to supply one of 'by' and 'level'")
- axis = self._get_axis_number(axis)
- return DataFrameGroupBy(
- obj=self,
- keys=by,
- axis=axis,
- level=level,
- as_index=as_index,
- sort=sort,
- group_keys=group_keys,
- observed=observed,
- dropna=dropna,
- )
- _shared_docs[
- "pivot"
- ] = """
- Return reshaped DataFrame organized by given index / column values.
- Reshape data (produce a "pivot" table) based on column values. Uses
- unique values from specified `index` / `columns` to form axes of the
- resulting DataFrame. This function does not support data
- aggregation; multiple values will result in a MultiIndex in the
- columns. See the :ref:`User Guide <reshaping>` for more on reshaping.
- Parameters
- ----------%s
- columns : str or object or a list of str
- Column to use to make new frame's columns.
- .. versionchanged:: 1.1.0
- Also accept list of columns names.
- index : str or object or a list of str, optional
- Column to use to make new frame's index. If not given, uses existing index.
- .. versionchanged:: 1.1.0
- Also accept list of index names.
- values : str, object or a list of the previous, optional
- Column(s) to use for populating new frame's values. If not
- specified, all remaining columns will be used and the result will
- have hierarchically indexed columns.
- Returns
- -------
- DataFrame
- Returns reshaped DataFrame.
- Raises
- ------
- ValueError:
- When there are any `index`, `columns` combinations with multiple
- values. Use `DataFrame.pivot_table` when you need to aggregate.
- See Also
- --------
- DataFrame.pivot_table : Generalization of pivot that can handle
- duplicate values for one index/column pair.
- DataFrame.unstack : Pivot based on the index values instead of a
- column.
- wide_to_long : Wide panel to long format. Less flexible but more
- user-friendly than melt.
- Notes
- -----
- For finer-tuned control, see hierarchical indexing documentation along
- with the related stack/unstack methods.
- Reference :ref:`the user guide <reshaping.pivot>` for more examples.
- Examples
- --------
- >>> df = pd.DataFrame({'foo': ['one', 'one', 'one', 'two', 'two',
- ... 'two'],
- ... 'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
- ... 'baz': [1, 2, 3, 4, 5, 6],
- ... 'zoo': ['x', 'y', 'z', 'q', 'w', 't']})
- >>> df
- foo bar baz zoo
- 0 one A 1 x
- 1 one B 2 y
- 2 one C 3 z
- 3 two A 4 q
- 4 two B 5 w
- 5 two C 6 t
- >>> df.pivot(index='foo', columns='bar', values='baz')
- bar A B C
- foo
- one 1 2 3
- two 4 5 6
- >>> df.pivot(index='foo', columns='bar')['baz']
- bar A B C
- foo
- one 1 2 3
- two 4 5 6
- >>> df.pivot(index='foo', columns='bar', values=['baz', 'zoo'])
- baz zoo
- bar A B C A B C
- foo
- one 1 2 3 x y z
- two 4 5 6 q w t
- You could also assign a list of column names or a list of index names.
- >>> df = pd.DataFrame({
- ... "lev1": [1, 1, 1, 2, 2, 2],
- ... "lev2": [1, 1, 2, 1, 1, 2],
- ... "lev3": [1, 2, 1, 2, 1, 2],
- ... "lev4": [1, 2, 3, 4, 5, 6],
- ... "values": [0, 1, 2, 3, 4, 5]})
- >>> df
- lev1 lev2 lev3 lev4 values
- 0 1 1 1 1 0
- 1 1 1 2 2 1
- 2 1 2 1 3 2
- 3 2 1 2 4 3
- 4 2 1 1 5 4
- 5 2 2 2 6 5
- >>> df.pivot(index="lev1", columns=["lev2", "lev3"], values="values")
- lev2 1 2
- lev3 1 2 1 2
- lev1
- 1 0.0 1.0 2.0 NaN
- 2 4.0 3.0 NaN 5.0
- >>> df.pivot(index=["lev1", "lev2"], columns=["lev3"], values="values")
- lev3 1 2
- lev1 lev2
- 1 1 0.0 1.0
- 2 2.0 NaN
- 2 1 4.0 3.0
- 2 NaN 5.0
- A ValueError is raised if there are any duplicates.
- >>> df = pd.DataFrame({"foo": ['one', 'one', 'two', 'two'],
- ... "bar": ['A', 'A', 'B', 'C'],
- ... "baz": [1, 2, 3, 4]})
- >>> df
- foo bar baz
- 0 one A 1
- 1 one A 2
- 2 two B 3
- 3 two C 4
- Notice that the first two rows are the same for our `index`
- and `columns` arguments.
- >>> df.pivot(index='foo', columns='bar', values='baz')
- Traceback (most recent call last):
- ...
- ValueError: Index contains duplicate entries, cannot reshape
- """
- @Substitution("")
- @Appender(_shared_docs["pivot"])
- def pivot(self, *, columns, index=lib.NoDefault, values=lib.NoDefault) -> DataFrame:
- from pandas.core.reshape.pivot import pivot
- return pivot(self, index=index, columns=columns, values=values)
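- # Illustrative note: without aggregation, pivot is conceptually
- #
- #   df.set_index([index, columns])[values].unstack(columns)
- #
- # which is also why duplicate (index, columns) pairs raise ValueError.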
- _shared_docs[
- "pivot_table"
- ] = """
- Create a spreadsheet-style pivot table as a DataFrame.
- The levels in the pivot table will be stored in MultiIndex objects
- (hierarchical indexes) on the index and columns of the result DataFrame.
- Parameters
- ----------%s
- values : list-like or scalar, optional
- Column or columns to aggregate.
- index : column, Grouper, array, or list of the previous
- If an array is passed, it must be the same length as the data. The
- list can contain any of the other types (except list).
- Keys to group by on the pivot table index. If an array is passed,
- it is used in the same manner as column values.
- columns : column, Grouper, array, or list of the previous
- If an array is passed, it must be the same length as the data. The
- list can contain any of the other types (except list).
- Keys to group by on the pivot table column. If an array is passed,
- it is used in the same manner as column values.
- aggfunc : function, list of functions, dict, default numpy.mean
- If a list of functions is passed, the resulting pivot table will have
- hierarchical columns whose top level are the function names
- (inferred from the function objects themselves).
- If dict is passed, the key is column to aggregate and value
- is function or list of functions. If ``margins=True``,
- aggfunc will be used to calculate the partial aggregates.
- fill_value : scalar, default None
- Value to replace missing values with (in the resulting pivot table,
- after aggregation).
- margins : bool, default False
- If ``margins=True``, special ``All`` columns and rows
- will be added with partial group aggregates across the categories
- on the rows and columns.
- dropna : bool, default True
- Do not include columns whose entries are all NaN. If True,
- rows with a NaN value in any column will be omitted before
- computing margins.
- margins_name : str, default 'All'
- Name of the row / column that will contain the totals
- when margins is True.
- observed : bool, default False
- This only applies if any of the groupers are Categoricals.
- If True: only show observed values for categorical groupers.
- If False: show all values for categorical groupers.
- sort : bool, default True
- Specifies if the result should be sorted.
- .. versionadded:: 1.3.0
- Returns
- -------
- DataFrame
- An Excel style pivot table.
- See Also
- --------
- DataFrame.pivot : Pivot without aggregation that can handle
- non-numeric data.
- DataFrame.melt: Unpivot a DataFrame from wide to long format,
- optionally leaving identifiers set.
- wide_to_long : Wide panel to long format. Less flexible but more
- user-friendly than melt.
- Notes
- -----
- Reference :ref:`the user guide <reshaping.pivot>` for more examples.
- Examples
- --------
- >>> df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo",
- ... "bar", "bar", "bar", "bar"],
- ... "B": ["one", "one", "one", "two", "two",
- ... "one", "one", "two", "two"],
- ... "C": ["small", "large", "large", "small",
- ... "small", "large", "small", "small",
- ... "large"],
- ... "D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
- ... "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]})
- >>> df
- A B C D E
- 0 foo one small 1 2
- 1 foo one large 2 4
- 2 foo one large 2 5
- 3 foo two small 3 5
- 4 foo two small 3 6
- 5 bar one large 4 6
- 6 bar one small 5 8
- 7 bar two small 6 9
- 8 bar two large 7 9
- This first example aggregates values by taking the sum.
- >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
- ... columns=['C'], aggfunc=np.sum)
- >>> table
- C large small
- A B
- bar one 4.0 5.0
- two 7.0 6.0
- foo one 4.0 1.0
- two NaN 6.0
- We can also fill missing values using the `fill_value` parameter.
- >>> table = pd.pivot_table(df, values='D', index=['A', 'B'],
- ... columns=['C'], aggfunc=np.sum, fill_value=0)
- >>> table
- C large small
- A B
- bar one 4 5
- two 7 6
- foo one 4 1
- two 0 6
- The next example aggregates by taking the mean across multiple columns.
- >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
- ... aggfunc={'D': np.mean, 'E': np.mean})
- >>> table
- D E
- A C
- bar large 5.500000 7.500000
- small 5.500000 8.500000
- foo large 2.000000 4.500000
- small 2.333333 4.333333
- We can also calculate multiple types of aggregations for any given
- value column.
- >>> table = pd.pivot_table(df, values=['D', 'E'], index=['A', 'C'],
- ... aggfunc={'D': np.mean,
- ... 'E': [min, max, np.mean]})
- >>> table
- D E
- mean max mean min
- A C
- bar large 5.500000 9 7.500000 6
- small 5.500000 9 8.500000 8
- foo large 2.000000 5 4.500000 4
- small 2.333333 6 4.333333 2
- """
- @Substitution("")
- @Appender(_shared_docs["pivot_table"])
- def pivot_table(
- self,
- values=None,
- index=None,
- columns=None,
- aggfunc: AggFuncType = "mean",
- fill_value=None,
- margins: bool = False,
- dropna: bool = True,
- margins_name: Level = "All",
- observed: bool = False,
- sort: bool = True,
- ) -> DataFrame:
- from pandas.core.reshape.pivot import pivot_table
- return pivot_table(
- self,
- values=values,
- index=index,
- columns=columns,
- aggfunc=aggfunc,
- fill_value=fill_value,
- margins=margins,
- dropna=dropna,
- margins_name=margins_name,
- observed=observed,
- sort=sort,
- )
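- # Illustrative note: without margins, a pivot_table is conceptually a
- # groupby-aggregate followed by an unstack, roughly
- #
- #   df.groupby([index, columns])[values].agg(aggfunc).unstack(columns)
- #
- # with ``fill_value`` applied to the reshaped result.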
- def stack(self, level: Level = -1, dropna: bool = True):
- """
- Stack the prescribed level(s) from columns to index.
- Return a reshaped DataFrame or Series having a multi-level
- index with one or more new inner-most levels compared to the current
- DataFrame. The new inner-most levels are created by pivoting the
- columns of the current dataframe:
- - if the columns have a single level, the output is a Series;
- - if the columns have multiple levels, the new index
- level(s) is (are) taken from the prescribed level(s) and
- the output is a DataFrame.
- Parameters
- ----------
- level : int, str, list, default -1
- Level(s) to stack from the column axis onto the index
- axis, defined as one index or label, or a list of indices
- or labels.
- dropna : bool, default True
- Whether to drop rows in the resulting Frame/Series with
- missing values. Stacking a column level onto the index
- axis can create combinations of index and column values
- that are missing from the original dataframe. See Examples
- section.
- Returns
- -------
- DataFrame or Series
- Stacked dataframe or series.
- See Also
- --------
- DataFrame.unstack : Unstack prescribed level(s) from index axis
- onto column axis.
- DataFrame.pivot : Reshape dataframe from long format to wide
- format.
- DataFrame.pivot_table : Create a spreadsheet-style pivot table
- as a DataFrame.
- Notes
- -----
- The function is named by analogy with a collection of books
- being reorganized from being side by side on a horizontal
- position (the columns of the dataframe) to being stacked
- vertically on top of each other (in the index of the
- dataframe).
- Reference :ref:`the user guide <reshaping.stacking>` for more examples.
- Examples
- --------
- **Single level columns**
- >>> df_single_level_cols = pd.DataFrame([[0, 1], [2, 3]],
- ... index=['cat', 'dog'],
- ... columns=['weight', 'height'])
- Stacking a dataframe with a single level column axis returns a Series:
- >>> df_single_level_cols
- weight height
- cat 0 1
- dog 2 3
- >>> df_single_level_cols.stack()
- cat weight 0
- height 1
- dog weight 2
- height 3
- dtype: int64
- **Multi level columns: simple case**
- >>> multicol1 = pd.MultiIndex.from_tuples([('weight', 'kg'),
- ... ('weight', 'pounds')])
- >>> df_multi_level_cols1 = pd.DataFrame([[1, 2], [2, 4]],
- ... index=['cat', 'dog'],
- ... columns=multicol1)
- Stacking a dataframe with a multi-level column axis:
- >>> df_multi_level_cols1
- weight
- kg pounds
- cat 1 2
- dog 2 4
- >>> df_multi_level_cols1.stack()
- weight
- cat kg 1
- pounds 2
- dog kg 2
- pounds 4
- **Missing values**
- >>> multicol2 = pd.MultiIndex.from_tuples([('weight', 'kg'),
- ... ('height', 'm')])
- >>> df_multi_level_cols2 = pd.DataFrame([[1.0, 2.0], [3.0, 4.0]],
- ... index=['cat', 'dog'],
- ... columns=multicol2)
- It is common to have missing values when stacking a dataframe
- with multi-level columns, as the stacked dataframe typically
- has more values than the original dataframe. Missing values
- are filled with NaNs:
- >>> df_multi_level_cols2
- weight height
- kg m
- cat 1.0 2.0
- dog 3.0 4.0
- >>> df_multi_level_cols2.stack()
- height weight
- cat kg NaN 1.0
- m 2.0 NaN
- dog kg NaN 3.0
- m 4.0 NaN
- **Prescribing the level(s) to be stacked**
- The first parameter controls which level or levels are stacked:
- >>> df_multi_level_cols2.stack(0)
- kg m
- cat height NaN 2.0
- weight 1.0 NaN
- dog height NaN 4.0
- weight 3.0 NaN
- >>> df_multi_level_cols2.stack([0, 1])
- cat height m 2.0
- weight kg 1.0
- dog height m 4.0
- weight kg 3.0
- dtype: float64
- **Dropping missing values**
- >>> df_multi_level_cols3 = pd.DataFrame([[None, 1.0], [2.0, 3.0]],
- ... index=['cat', 'dog'],
- ... columns=multicol2)
- Note that rows where all values are missing are dropped by
- default but this behaviour can be controlled via the dropna
- keyword parameter:
- >>> df_multi_level_cols3
- weight height
- kg m
- cat NaN 1.0
- dog 2.0 3.0
- >>> df_multi_level_cols3.stack(dropna=False)
- height weight
- cat kg NaN NaN
- m 1.0 NaN
- dog kg NaN 2.0
- m 3.0 NaN
- >>> df_multi_level_cols3.stack(dropna=True)
- height weight
- cat m 1.0 NaN
- dog kg NaN 2.0
- m 3.0 NaN
- """
- from pandas.core.reshape.reshape import (
- stack,
- stack_multiple,
- )
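- # A list or tuple of levels dispatches to stack_multiple, which stacks
- # the requested levels one at a time; a single level (int or label)
- # goes through stack directly.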
- if isinstance(level, (tuple, list)):
- result = stack_multiple(self, level, dropna=dropna)
- else:
- result = stack(self, level, dropna=dropna)
- return result.__finalize__(self, method="stack")
- def explode(
- self,
- column: IndexLabel,
- ignore_index: bool = False,
- ) -> DataFrame:
- """
- Transform each element of a list-like to a row, replicating index values.
- Parameters
- ----------
- column : IndexLabel
- Column(s) to explode.
- For multiple columns, specify a non-empty list in which each element
- is a str or tuple, and the list-like data in all specified columns
- must have matching lengths on each row of the frame.
- .. versionadded:: 1.3.0
- Multi-column explode
- ignore_index : bool, default False
- If True, the resulting index will be labeled 0, 1, …, n - 1.
- .. versionadded:: 1.1.0
- Returns
- -------
- DataFrame
- Exploded lists to rows of the subset columns;
- index will be duplicated for these rows.
- Raises
- ------
- ValueError :
- * If columns of the frame are not unique.
- * If the specified columns to explode form an empty list.
- * If the specified columns to explode do not have matching counts of
- elements row-wise in the frame.
- See Also
- --------
- DataFrame.unstack : Pivot a level of the (necessarily hierarchical)
- index labels.
- DataFrame.melt : Unpivot a DataFrame from wide format to long format.
- Series.explode : Transform each element of a list-like to a row.
- Notes
- -----
- This routine will explode list-likes including lists, tuples, sets,
- Series, and np.ndarray. The result dtype of the subset rows will
- be object. Scalars will be returned unchanged, and empty list-likes will
- result in np.nan for that row. In addition, the ordering of rows in the
- output will be non-deterministic when exploding sets.
- Reference :ref:`the user guide <reshaping.explode>` for more examples.
- Examples
- --------
- >>> df = pd.DataFrame({'A': [[0, 1, 2], 'foo', [], [3, 4]],
- ... 'B': 1,
- ... 'C': [['a', 'b', 'c'], np.nan, [], ['d', 'e']]})
- >>> df
- A B C
- 0 [0, 1, 2] 1 [a, b, c]
- 1 foo 1 NaN
- 2 [] 1 []
- 3 [3, 4] 1 [d, e]
- Single-column explode.
- >>> df.explode('A')
- A B C
- 0 0 1 [a, b, c]
- 0 1 1 [a, b, c]
- 0 2 1 [a, b, c]
- 1 foo 1 NaN
- 2 NaN 1 []
- 3 3 1 [d, e]
- 3 4 1 [d, e]
- Multi-column explode.
- >>> df.explode(list('AC'))
- A B C
- 0 0 1 a
- 0 1 1 b
- 0 2 1 c
- 1 foo 1 NaN
- 2 NaN 1 NaN
- 3 3 1 d
- 3 4 1 e
- """
- if not self.columns.is_unique:
- duplicate_cols = self.columns[self.columns.duplicated()].tolist()
- raise ValueError(
- f"DataFrame columns must be unique. Duplicate columns: {duplicate_cols}"
- )
- columns: list[Hashable]
- if is_scalar(column) or isinstance(column, tuple):
- columns = [column]
- elif isinstance(column, list) and all(
- is_scalar(c) or isinstance(c, tuple) for c in column
- ):
- if not column:
- raise ValueError("column must be nonempty")
- if len(column) > len(set(column)):
- raise ValueError("column must be unique")
- columns = column
- else:
- raise ValueError("column must be a scalar, tuple, or list thereof")
- df = self.reset_index(drop=True)
- if len(columns) == 1:
- result = df[columns[0]].explode()
- else:
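- # For multi-column explode, every listed column must have the same
- # element count row-wise; scalars and empty list-likes count as
- # length 1 for this check.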
- mylen = lambda x: len(x) if (is_list_like(x) and len(x) > 0) else 1
- counts0 = self[columns[0]].apply(mylen)
- for c in columns[1:]:
- if not all(counts0 == self[c].apply(mylen)):
- raise ValueError("columns must have matching element counts")
- result = DataFrame({c: df[c].explode() for c in columns})
- result = df.drop(columns, axis=1).join(result)
- if ignore_index:
- result.index = default_index(len(result))
- else:
- result.index = self.index.take(result.index)
- result = result.reindex(columns=self.columns, copy=False)
- return result.__finalize__(self, method="explode")
- def unstack(self, level: Level = -1, fill_value=None):
- """
- Pivot a level of the (necessarily hierarchical) index labels.
- Returns a DataFrame having a new level of column labels whose inner-most level
- consists of the pivoted index labels.
- If the index is not a MultiIndex, the output will be a Series
- (the analogue of stack when the columns are not a MultiIndex).
- Parameters
- ----------
- level : int, str, or list of these, default -1 (last level)
- Level(s) of index to unstack, can pass level name.
- fill_value : int, str or dict
- Replace NaN with this value if the unstack produces missing values.
- Returns
- -------
- Series or DataFrame
- See Also
- --------
- DataFrame.pivot : Pivot a table based on column values.
- DataFrame.stack : Pivot a level of the column labels (inverse operation
- from `unstack`).
- Notes
- -----
- Reference :ref:`the user guide <reshaping.stacking>` for more examples.
- Examples
- --------
- >>> index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
- ... ('two', 'a'), ('two', 'b')])
- >>> s = pd.Series(np.arange(1.0, 5.0), index=index)
- >>> s
- one a 1.0
- b 2.0
- two a 3.0
- b 4.0
- dtype: float64
- >>> s.unstack(level=-1)
- a b
- one 1.0 2.0
- two 3.0 4.0
- >>> s.unstack(level=0)
- one two
- a 1.0 3.0
- b 2.0 4.0
- >>> df = s.unstack(level=0)
- >>> df.unstack()
- one a 1.0
- b 2.0
- two a 3.0
- b 4.0
- dtype: float64
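- As a round-trip check, stacking the unstacked frame recovers the
- original Series:
- >>> s.unstack(level=-1).stack()
- one a 1.0
- b 2.0
- two a 3.0
- b 4.0
- dtype: float64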
- """
- from pandas.core.reshape.reshape import unstack
- result = unstack(self, level, fill_value)
- return result.__finalize__(self, method="unstack")
- @Appender(_shared_docs["melt"] % {"caller": "df.melt(", "other": "melt"})
- def melt(
- self,
- id_vars=None,
- value_vars=None,
- var_name=None,
- value_name: Hashable = "value",
- col_level: Level = None,
- ignore_index: bool = True,
- ) -> DataFrame:
- return melt(
- self,
- id_vars=id_vars,
- value_vars=value_vars,
- var_name=var_name,
- value_name=value_name,
- col_level=col_level,
- ignore_index=ignore_index,
- ).__finalize__(self, method="melt")
- # ----------------------------------------------------------------------
- # Time series-related
- @doc(
- Series.diff,
- klass="DataFrame",
- extra_params="axis : {0 or 'index', 1 or 'columns'}, default 0\n "
- "Take difference over rows (0) or columns (1).\n",
- other_klass="Series",
- examples=dedent(
- """
- Difference with previous row
- >>> df = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6],
- ... 'b': [1, 1, 2, 3, 5, 8],
- ... 'c': [1, 4, 9, 16, 25, 36]})
- >>> df
- a b c
- 0 1 1 1
- 1 2 1 4
- 2 3 2 9
- 3 4 3 16
- 4 5 5 25
- 5 6 8 36
- >>> df.diff()
- a b c
- 0 NaN NaN NaN
- 1 1.0 0.0 3.0
- 2 1.0 1.0 5.0
- 3 1.0 1.0 7.0
- 4 1.0 2.0 9.0
- 5 1.0 3.0 11.0
- Difference with previous column
- >>> df.diff(axis=1)
- a b c
- 0 NaN 0 0
- 1 NaN -1 3
- 2 NaN -1 7
- 3 NaN -1 13
- 4 NaN 0 20
- 5 NaN 2 28
- Difference with 3rd previous row
- >>> df.diff(periods=3)
- a b c
- 0 NaN NaN NaN
- 1 NaN NaN NaN
- 2 NaN NaN NaN
- 3 3.0 2.0 15.0
- 4 3.0 4.0 21.0
- 5 3.0 6.0 27.0
- Difference with following row
- >>> df.diff(periods=-1)
- a b c
- 0 -1.0 0.0 -3.0
- 1 -1.0 -1.0 -5.0
- 2 -1.0 -1.0 -7.0
- 3 -1.0 -2.0 -9.0
- 4 -1.0 -3.0 -11.0
- 5 NaN NaN NaN
- Overflow in input dtype
- >>> df = pd.DataFrame({'a': [1, 0]}, dtype=np.uint8)
- >>> df.diff()
- a
- 0 NaN
- 1 255.0"""
- ),
- )
- def diff(self, periods: int = 1, axis: Axis = 0) -> DataFrame:
- if not lib.is_integer(periods):
- if not (
- is_float(periods)
- # error: "int" has no attribute "is_integer"
- and periods.is_integer() # type: ignore[attr-defined]
- ):
- raise ValueError("periods must be an integer")
- periods = int(periods)
- axis = self._get_axis_number(axis)
- if axis == 1:
- if periods != 0:
- # in the periods == 0 case, this is equivalent to a diff of 0 periods
- # along axis=0, and the Manager method may be somewhat more
- # performant, so we dispatch in that case.
- return self - self.shift(periods, axis=axis)
- # With periods=0 this is equivalent to a diff with axis=0
- axis = 0
- new_data = self._mgr.diff(n=periods, axis=axis)
- return self._constructor(new_data).__finalize__(self, "diff")
- # ----------------------------------------------------------------------
- # Function application
- def _gotitem(
- self,
- key: IndexLabel,
- ndim: int,
- subset: DataFrame | Series | None = None,
- ) -> DataFrame | Series:
- """
- Sub-classes to define. Return a sliced object.
- Parameters
- ----------
- key : string / list of selections
- ndim : {1, 2}
- requested ndim of result
- subset : object, default None
- subset to act on
- """
- if subset is None:
- subset = self
- elif subset.ndim == 1: # is Series
- return subset
- # TODO: _shallow_copy(subset)?
- return subset[key]
- _agg_summary_and_see_also_doc = dedent(
- """
- The aggregation operations are always performed over an axis, either the
- index (default) or the column axis. This behavior is different from
- `numpy` aggregation functions (`mean`, `median`, `prod`, `sum`, `std`,
- `var`), where the default is to compute the aggregation of the flattened
- array, e.g., ``numpy.mean(arr_2d)`` as opposed to
- ``numpy.mean(arr_2d, axis=0)``.
- `agg` is an alias for `aggregate`. Use the alias.
- See Also
- --------
- DataFrame.apply : Perform any type of operations.
- DataFrame.transform : Perform transformation type operations.
- core.groupby.GroupBy : Perform operations over groups.
- core.resample.Resampler : Perform operations over resampled bins.
- core.window.Rolling : Perform operations over rolling window.
- core.window.Expanding : Perform operations over expanding window.
- core.window.ExponentialMovingWindow : Perform operation over exponential weighted
- window.
- """
- )
- _agg_examples_doc = dedent(
- """
- Examples
- --------
- >>> df = pd.DataFrame([[1, 2, 3],
- ... [4, 5, 6],
- ... [7, 8, 9],
- ... [np.nan, np.nan, np.nan]],
- ... columns=['A', 'B', 'C'])
- Aggregate these functions over the rows.
- >>> df.agg(['sum', 'min'])
- A B C
- sum 12.0 15.0 18.0
- min 1.0 2.0 3.0
- Different aggregations per column.
- >>> df.agg({'A' : ['sum', 'min'], 'B' : ['min', 'max']})
- A B
- sum 12.0 NaN
- min 1.0 2.0
- max NaN 8.0
- Aggregate different functions over the columns and rename the index of the resulting
- DataFrame.
- >>> df.agg(x=('A', max), y=('B', 'min'), z=('C', np.mean))
- A B C
- x 7.0 NaN NaN
- y NaN 2.0 NaN
- z NaN NaN 6.0
- Aggregate over the columns.
- >>> df.agg("mean", axis="columns")
- 0 2.0
- 1 5.0
- 2 8.0
- 3 NaN
- dtype: float64
- """
- )
- @doc(
- _shared_docs["aggregate"],
- klass=_shared_doc_kwargs["klass"],
- axis=_shared_doc_kwargs["axis"],
- see_also=_agg_summary_and_see_also_doc,
- examples=_agg_examples_doc,
- )
- def aggregate(self, func=None, axis: Axis = 0, *args, **kwargs):
- from pandas.core.apply import frame_apply
- axis = self._get_axis_number(axis)
- relabeling, func, columns, order = reconstruct_func(func, **kwargs)
- op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
- result = op.agg()
- if relabeling:
- # Keep the occurrence order of the original columns, and also the
- # occurrence order of the newly created columns, unchanged.
- # For the return values of reconstruct_func, if relabeling is
- # False, columns and order will be None.
- assert columns is not None
- assert order is not None
- result_in_dict = relabel_result(result, func, columns, order)
- result = DataFrame(result_in_dict, index=columns)
- return result
- agg = aggregate
- # error: Signature of "any" incompatible with supertype "NDFrame" [override]
- @overload # type: ignore[override]
- def any(
- self,
- *,
- axis: Axis = ...,
- bool_only: bool | None = ...,
- skipna: bool = ...,
- level: None = ...,
- **kwargs,
- ) -> Series:
- ...
- @overload
- def any(
- self,
- *,
- axis: Axis = ...,
- bool_only: bool | None = ...,
- skipna: bool = ...,
- level: Level,
- **kwargs,
- ) -> DataFrame | Series:
- ...
- # error: Missing return statement
- @doc(NDFrame.any, **_shared_doc_kwargs)
- def any( # type: ignore[empty-body]
- self,
- axis: Axis = 0,
- bool_only: bool | None = None,
- skipna: bool = True,
- level: Level = None,
- **kwargs,
- ) -> DataFrame | Series:
- ...
- @doc(
- _shared_docs["transform"],
- klass=_shared_doc_kwargs["klass"],
- axis=_shared_doc_kwargs["axis"],
- )
- def transform(
- self, func: AggFuncType, axis: Axis = 0, *args, **kwargs
- ) -> DataFrame:
- from pandas.core.apply import frame_apply
- op = frame_apply(self, func=func, axis=axis, args=args, kwargs=kwargs)
- result = op.transform()
- assert isinstance(result, DataFrame)
- return result
- def apply(
- self,
- func: AggFuncType,
- axis: Axis = 0,
- raw: bool = False,
- result_type: Literal["expand", "reduce", "broadcast"] | None = None,
- args=(),
- **kwargs,
- ):
- """
- Apply a function along an axis of the DataFrame.
- Objects passed to the function are Series objects whose index is
- either the DataFrame's index (``axis=0``) or the DataFrame's columns
- (``axis=1``). By default (``result_type=None``), the final return type
- is inferred from the return type of the applied function. Otherwise,
- it depends on the `result_type` argument.
- Parameters
- ----------
- func : function
- Function to apply to each column or row.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Axis along which the function is applied:
- * 0 or 'index': apply function to each column.
- * 1 or 'columns': apply function to each row.
- raw : bool, default False
- Determines if row or column is passed as a Series or ndarray object:
- * ``False`` : passes each row or column as a Series to the
- function.
- * ``True`` : the passed function will receive ndarray objects
- instead.
- If you are just applying a NumPy reduction function this will
- achieve much better performance.
- result_type : {'expand', 'reduce', 'broadcast', None}, default None
- These only act when ``axis=1`` (columns):
- * 'expand' : list-like results will be turned into columns.
- * 'reduce' : returns a Series if possible rather than expanding
- list-like results. This is the opposite of 'expand'.
- * 'broadcast' : results will be broadcast to the original shape
- of the DataFrame, the original index and columns will be
- retained.
- The default behaviour (None) depends on the return value of the
- applied function: list-like results will be returned as a Series
- of those. However, if the applied function returns a Series, these
- are expanded to columns.
- args : tuple
- Positional arguments to pass to `func` in addition to the
- array/series.
- **kwargs
- Additional keyword arguments to pass as keyword arguments to
- `func`.
- Returns
- -------
- Series or DataFrame
- Result of applying ``func`` along the given axis of the
- DataFrame.
- See Also
- --------
- DataFrame.applymap: For elementwise operations.
- DataFrame.aggregate: Only perform aggregating type operations.
- DataFrame.transform: Only perform transforming type operations.
- Notes
- -----
- Functions that mutate the passed object can produce unexpected
- behavior or errors and are not supported. See :ref:`gotchas.udf-mutation`
- for more details.
- Examples
- --------
- >>> df = pd.DataFrame([[4, 9]] * 3, columns=['A', 'B'])
- >>> df
- A B
- 0 4 9
- 1 4 9
- 2 4 9
- Using a numpy universal function (in this case the same as
- ``np.sqrt(df)``):
- >>> df.apply(np.sqrt)
- A B
- 0 2.0 3.0
- 1 2.0 3.0
- 2 2.0 3.0
- Using a reducing function on either axis
- >>> df.apply(np.sum, axis=0)
- A 12
- B 27
- dtype: int64
- >>> df.apply(np.sum, axis=1)
- 0 13
- 1 13
- 2 13
- dtype: int64
- Returning a list-like will result in a Series
- >>> df.apply(lambda x: [1, 2], axis=1)
- 0 [1, 2]
- 1 [1, 2]
- 2 [1, 2]
- dtype: object
- Passing ``result_type='expand'`` will expand list-like results
- to columns of a Dataframe
- >>> df.apply(lambda x: [1, 2], axis=1, result_type='expand')
- 0 1
- 0 1 2
- 1 1 2
- 2 1 2
- Returning a Series inside the function is similar to passing
- ``result_type='expand'``. The resulting column names
- will be the Series index.
- >>> df.apply(lambda x: pd.Series([1, 2], index=['foo', 'bar']), axis=1)
- foo bar
- 0 1 2
- 1 1 2
- 2 1 2
- Passing ``result_type='broadcast'`` will ensure the same shape
- result, whether list-like or scalar is returned by the function,
- and broadcast it along the axis. The resulting column names will
- be the originals.
- >>> df.apply(lambda x: [1, 2], axis=1, result_type='broadcast')
- A B
- 0 1 2
- 1 1 2
- 2 1 2
- """
- from pandas.core.apply import frame_apply
- op = frame_apply(
- self,
- func=func,
- axis=axis,
- raw=raw,
- result_type=result_type,
- args=args,
- kwargs=kwargs,
- )
- return op.apply().__finalize__(self, method="apply")
- def applymap(
- self, func: PythonFuncType, na_action: str | None = None, **kwargs
- ) -> DataFrame:
- """
- Apply a function to a Dataframe elementwise.
- This method applies a function that accepts and returns a scalar
- to every element of a DataFrame.
- Parameters
- ----------
- func : callable
- Python function, returns a single value from a single value.
- na_action : {None, 'ignore'}, default None
- If 'ignore', propagate NaN values without passing them to func.
- .. versionadded:: 1.2
- **kwargs
- Additional keyword arguments to pass as keyword arguments to
- `func`.
- .. versionadded:: 1.3.0
- Returns
- -------
- DataFrame
- Transformed DataFrame.
- See Also
- --------
- DataFrame.apply : Apply a function along input axis of DataFrame.
- Examples
- --------
- >>> df = pd.DataFrame([[1, 2.12], [3.356, 4.567]])
- >>> df
- 0 1
- 0 1.000 2.120
- 1 3.356 4.567
- >>> df.applymap(lambda x: len(str(x)))
- 0 1
- 0 3 4
- 1 5 5
- Like Series.map, NA values can be ignored:
- >>> df_copy = df.copy()
- >>> df_copy.iloc[0, 0] = pd.NA
- >>> df_copy.applymap(lambda x: len(str(x)), na_action='ignore')
- 0 1
- 0 NaN 4
- 1 5.0 5
- Note that a vectorized version of `func` often exists, which will
- be much faster. You could square each number elementwise.
- >>> df.applymap(lambda x: x**2)
- 0 1
- 0 1.000000 4.494400
- 1 11.262736 20.857489
- But it's better to avoid applymap in that case.
- >>> df ** 2
- 0 1
- 0 1.000000 4.494400
- 1 11.262736 20.857489
- """
- if na_action not in {"ignore", None}:
- raise ValueError(
- f"na_action must be 'ignore' or None. Got {repr(na_action)}"
- )
- ignore_na = na_action == "ignore"
- func = functools.partial(func, **kwargs)
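- # bind the extra keyword arguments once so the mapped function only
- # ever receives the element itself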
- # if we have a dtype == 'M8[ns]', provide boxed values
- def infer(x):
- if x.empty:
- return lib.map_infer(x, func, ignore_na=ignore_na)
- return lib.map_infer(x.astype(object)._values, func, ignore_na=ignore_na)
- return self.apply(infer).__finalize__(self, "applymap")
- # ----------------------------------------------------------------------
- # Merging / joining methods
- def _append(
- self,
- other,
- ignore_index: bool = False,
- verify_integrity: bool = False,
- sort: bool = False,
- ) -> DataFrame:
- if isinstance(other, (Series, dict)):
- if isinstance(other, dict):
- if not ignore_index:
- raise TypeError("Can only append a dict if ignore_index=True")
- other = Series(other)
- if other.name is None and not ignore_index:
- raise TypeError(
- "Can only append a Series if ignore_index=True "
- "or if the Series has a name"
- )
- index = Index(
- [other.name],
- name=self.index.names
- if isinstance(self.index, MultiIndex)
- else self.index.name,
- )
- row_df = other.to_frame().T
- # infer_objects is needed for
- # test_append_empty_frame_to_series_with_dateutil_tz
- other = row_df.infer_objects(copy=False).rename_axis(
- index.names, copy=False
- )
- elif isinstance(other, list):
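- # A list of rows (e.g. dicts or Series) is coerced to a DataFrame;
- # a list that already contains DataFrames passes through unchanged
- # to the concat below.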
- if not other:
- pass
- elif not isinstance(other[0], DataFrame):
- other = DataFrame(other)
- if self.index.name is not None and not ignore_index:
- other.index.name = self.index.name
- from pandas.core.reshape.concat import concat
- if isinstance(other, (list, tuple)):
- to_concat = [self, *other]
- else:
- to_concat = [self, other]
- result = concat(
- to_concat,
- ignore_index=ignore_index,
- verify_integrity=verify_integrity,
- sort=sort,
- )
- return result.__finalize__(self, method="append")
- def join(
- self,
- other: DataFrame | Series | Iterable[DataFrame | Series],
- on: IndexLabel | None = None,
- how: MergeHow = "left",
- lsuffix: str = "",
- rsuffix: str = "",
- sort: bool = False,
- validate: str | None = None,
- ) -> DataFrame:
- """
- Join columns of another DataFrame.
- Join columns with `other` DataFrame either on index or on a key
- column. Efficiently join multiple DataFrame objects by index at once by
- passing a list.
- Parameters
- ----------
- other : DataFrame, Series, or a list containing any combination of them
- Index should be similar to one of the columns in this one. If a
- Series is passed, its name attribute must be set, and that will be
- used as the column name in the resulting joined DataFrame.
- on : str, list of str, or array-like, optional
- Column or index level name(s) in the caller to join on the index
- in `other`, otherwise joins index-on-index. If multiple
- values given, the `other` DataFrame must have a MultiIndex. Can
- pass an array as the join key if it is not already contained in
- the calling DataFrame. Like an Excel VLOOKUP operation.
- how : {'left', 'right', 'outer', 'inner', 'cross'}, default 'left'
- How to handle the operation of the two objects.
- * left: use calling frame's index (or column if on is specified)
- * right: use `other`'s index.
- * outer: form union of calling frame's index (or column if on is
- specified) with `other`'s index, and sort it lexicographically.
- * inner: form intersection of calling frame's index (or column if
- on is specified) with `other`'s index, preserving the order
- of the calling frame's index.
- * cross: creates the cartesian product from both frames, preserves the order
- of the left keys.
- .. versionadded:: 1.2.0
- lsuffix : str, default ''
- Suffix to use from left frame's overlapping columns.
- rsuffix : str, default ''
- Suffix to use from right frame's overlapping columns.
- sort : bool, default False
- Order result DataFrame lexicographically by the join key. If False,
- the order of the join key depends on the join type (how keyword).
- validate : str, optional
- If specified, checks if join is of specified type.
- * "one_to_one" or "1:1": check if join keys are unique in both left
- and right datasets.
- * "one_to_many" or "1:m": check if join keys are unique in left dataset.
- * "many_to_one" or "m:1": check if join keys are unique in right dataset.
- * "many_to_many" or "m:m": allowed, but does not result in checks.
- .. versionadded:: 1.5.0
- Returns
- -------
- DataFrame
- A dataframe containing columns from both the caller and `other`.
- See Also
- --------
- DataFrame.merge : For column(s)-on-column(s) operations.
- Notes
- -----
- Parameters `on`, `lsuffix`, and `rsuffix` are not supported when
- passing a list of `DataFrame` objects.
- Support for specifying index levels as the `on` parameter was added
- in version 0.23.0.
- Examples
- --------
- >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3', 'K4', 'K5'],
- ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
- >>> df
- key A
- 0 K0 A0
- 1 K1 A1
- 2 K2 A2
- 3 K3 A3
- 4 K4 A4
- 5 K5 A5
- >>> other = pd.DataFrame({'key': ['K0', 'K1', 'K2'],
- ... 'B': ['B0', 'B1', 'B2']})
- >>> other
- key B
- 0 K0 B0
- 1 K1 B1
- 2 K2 B2
- Join DataFrames using their indexes.
- >>> df.join(other, lsuffix='_caller', rsuffix='_other')
- key_caller A key_other B
- 0 K0 A0 K0 B0
- 1 K1 A1 K1 B1
- 2 K2 A2 K2 B2
- 3 K3 A3 NaN NaN
- 4 K4 A4 NaN NaN
- 5 K5 A5 NaN NaN
- If we want to join using the key columns, we need to set key to be
- the index in both `df` and `other`. The joined DataFrame will have
- key as its index.
- >>> df.set_index('key').join(other.set_index('key'))
- A B
- key
- K0 A0 B0
- K1 A1 B1
- K2 A2 B2
- K3 A3 NaN
- K4 A4 NaN
- K5 A5 NaN
- Another option to join using the key columns is to use the `on`
- parameter. DataFrame.join always uses `other`'s index but we can use
- any column in `df`. This method preserves the original DataFrame's
- index in the result.
- >>> df.join(other.set_index('key'), on='key')
- key A B
- 0 K0 A0 B0
- 1 K1 A1 B1
- 2 K2 A2 B2
- 3 K3 A3 NaN
- 4 K4 A4 NaN
- 5 K5 A5 NaN
- Using non-unique key values shows how they are matched.
- >>> df = pd.DataFrame({'key': ['K0', 'K1', 'K1', 'K3', 'K0', 'K1'],
- ... 'A': ['A0', 'A1', 'A2', 'A3', 'A4', 'A5']})
- >>> df
- key A
- 0 K0 A0
- 1 K1 A1
- 2 K1 A2
- 3 K3 A3
- 4 K0 A4
- 5 K1 A5
- >>> df.join(other.set_index('key'), on='key', validate='m:1')
- key A B
- 0 K0 A0 B0
- 1 K1 A1 B1
- 2 K1 A2 B1
- 3 K3 A3 NaN
- 4 K0 A4 B0
- 5 K1 A5 B1
- """
- return self._join_compat(
- other,
- on=on,
- how=how,
- lsuffix=lsuffix,
- rsuffix=rsuffix,
- sort=sort,
- validate=validate,
- )
- def _join_compat(
- self,
- other: DataFrame | Series | Iterable[DataFrame | Series],
- on: IndexLabel | None = None,
- how: MergeHow = "left",
- lsuffix: str = "",
- rsuffix: str = "",
- sort: bool = False,
- validate: str | None = None,
- ):
- from pandas.core.reshape.concat import concat
- from pandas.core.reshape.merge import merge
- if isinstance(other, Series):
- if other.name is None:
- raise ValueError("Other Series must have a name")
- other = DataFrame({other.name: other})
- if isinstance(other, DataFrame):
- if how == "cross":
- return merge(
- self,
- other,
- how=how,
- on=on,
- suffixes=(lsuffix, rsuffix),
- sort=sort,
- validate=validate,
- )
- return merge(
- self,
- other,
- left_on=on,
- how=how,
- left_index=on is None,
- right_index=True,
- suffixes=(lsuffix, rsuffix),
- sort=sort,
- validate=validate,
- )
- else:
- if on is not None:
- raise ValueError(
- "Joining multiple DataFrames only supported for joining on index"
- )
- if rsuffix or lsuffix:
- raise ValueError(
- "Suffixes not supported when joining multiple DataFrames"
- )
- # Mypy thinks the RHS is a
- # "Union[DataFrame, Series, Iterable[Union[DataFrame, Series]]]" whereas
- # the LHS is an "Iterable[DataFrame]", but in reality both types are
- # "Iterable[Union[DataFrame, Series]]" due to the if statements
- frames = [cast("DataFrame | Series", self)] + list(other)
- can_concat = all(df.index.is_unique for df in frames)
- # join indexes only using concat
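- # (with unique indexes, an axis=1 concat matches the result of the
- # iterated merges below and avoids the pairwise merge loop)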
- if can_concat:
- if how == "left":
- res = concat(
- frames, axis=1, join="outer", verify_integrity=True, sort=sort
- )
- return res.reindex(self.index, copy=False)
- else:
- return concat(
- frames, axis=1, join=how, verify_integrity=True, sort=sort
- )
- joined = frames[0]
- for frame in frames[1:]:
- joined = merge(
- joined,
- frame,
- how=how,
- left_index=True,
- right_index=True,
- validate=validate,
- )
- return joined
- @Substitution("")
- @Appender(_merge_doc, indents=2)
- def merge(
- self,
- right: DataFrame | Series,
- how: MergeHow = "inner",
- on: IndexLabel | None = None,
- left_on: IndexLabel | None = None,
- right_on: IndexLabel | None = None,
- left_index: bool = False,
- right_index: bool = False,
- sort: bool = False,
- suffixes: Suffixes = ("_x", "_y"),
- copy: bool | None = None,
- indicator: str | bool = False,
- validate: str | None = None,
- ) -> DataFrame:
- from pandas.core.reshape.merge import merge
- return merge(
- self,
- right,
- how=how,
- on=on,
- left_on=left_on,
- right_on=right_on,
- left_index=left_index,
- right_index=right_index,
- sort=sort,
- suffixes=suffixes,
- copy=copy,
- indicator=indicator,
- validate=validate,
- )
- def round(
- self, decimals: int | dict[IndexLabel, int] | Series = 0, *args, **kwargs
- ) -> DataFrame:
- """
- Round a DataFrame to a variable number of decimal places.
- Parameters
- ----------
- decimals : int, dict, Series
- Number of decimal places to round each column to. If an int is
- given, round each column to the same number of places.
- Otherwise dict and Series round to variable numbers of places.
- Column names should be in the keys if `decimals` is a
- dict-like, or in the index if `decimals` is a Series. Any
- columns not included in `decimals` will be left as is. Elements
- of `decimals` which are not columns of the input will be
- ignored.
- *args
- Additional keywords have no effect but might be accepted for
- compatibility with numpy.
- **kwargs
- Additional keywords have no effect but might be accepted for
- compatibility with numpy.
- Returns
- -------
- DataFrame
- A DataFrame with the affected columns rounded to the specified
- number of decimal places.
- See Also
- --------
- numpy.around : Round a numpy array to the given number of decimals.
- Series.round : Round a Series to the given number of decimals.
- Examples
- --------
- >>> df = pd.DataFrame([(.21, .32), (.01, .67), (.66, .03), (.21, .18)],
- ... columns=['dogs', 'cats'])
- >>> df
- dogs cats
- 0 0.21 0.32
- 1 0.01 0.67
- 2 0.66 0.03
- 3 0.21 0.18
- By providing an integer each column is rounded to the same number
- of decimal places
- >>> df.round(1)
- dogs cats
- 0 0.2 0.3
- 1 0.0 0.7
- 2 0.7 0.0
- 3 0.2 0.2
- With a dict, the number of places for specific columns can be
- specified with the column names as key and the number of decimal
- places as value
- >>> df.round({'dogs': 1, 'cats': 0})
- dogs cats
- 0 0.2 0.0
- 1 0.0 1.0
- 2 0.7 0.0
- 3 0.2 0.0
- Using a Series, the number of places for specific columns can be
- specified with the column names as index and the number of
- decimal places as value
- >>> decimals = pd.Series([0, 1], index=['cats', 'dogs'])
- >>> df.round(decimals)
- dogs cats
- 0 0.2 0.0
- 1 0.0 1.0
- 2 0.7 0.0
- 3 0.2 0.0
- """
- from pandas.core.reshape.concat import concat
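- # Helpers: _dict_round yields each column rounded to the per-column
- # decimals from the dict/Series mapping (columns without an entry pass
- # through unchanged); _series_round only touches integer/float dtypes.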
- def _dict_round(df: DataFrame, decimals):
- for col, vals in df.items():
- try:
- yield _series_round(vals, decimals[col])
- except KeyError:
- yield vals
- def _series_round(ser: Series, decimals: int) -> Series:
- if is_integer_dtype(ser.dtype) or is_float_dtype(ser.dtype):
- return ser.round(decimals)
- return ser
- nv.validate_round(args, kwargs)
- if isinstance(decimals, (dict, Series)):
- if isinstance(decimals, Series) and not decimals.index.is_unique:
- raise ValueError("Index of decimals must be unique")
- if is_dict_like(decimals) and not all(
- is_integer(value) for _, value in decimals.items()
- ):
- raise TypeError("Values in decimals must be integers")
- new_cols = list(_dict_round(self, decimals))
- elif is_integer(decimals):
- # Dispatch to Block.round
- return self._constructor(
- self._mgr.round(decimals=decimals, using_cow=using_copy_on_write()),
- ).__finalize__(self, method="round")
- else:
- raise TypeError("decimals must be an integer, a dict-like or a Series")
- if new_cols is not None and len(new_cols) > 0:
- return self._constructor(
- concat(new_cols, axis=1), index=self.index, columns=self.columns
- ).__finalize__(self, method="round")
- else:
- return self.copy(deep=False)
- # ----------------------------------------------------------------------
- # Statistical methods, etc.
- def corr(
- self,
- method: CorrelationMethod = "pearson",
- min_periods: int = 1,
- numeric_only: bool = False,
- ) -> DataFrame:
- """
- Compute pairwise correlation of columns, excluding NA/null values.
- Parameters
- ----------
- method : {'pearson', 'kendall', 'spearman'} or callable
- Method of correlation:
- * pearson : standard correlation coefficient
- * kendall : Kendall Tau correlation coefficient
- * spearman : Spearman rank correlation
- * callable: callable with input two 1d ndarrays
- and returning a float. Note that the returned matrix from corr
- will have 1 along the diagonals and will be symmetric
- regardless of the callable's behavior.
- min_periods : int, optional
- Minimum number of observations required per pair of columns
- to have a valid result. Currently only available for Pearson
- and Spearman correlation.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionadded:: 1.5.0
- .. versionchanged:: 2.0.0
- The default value of ``numeric_only`` is now ``False``.
- Returns
- -------
- DataFrame
- Correlation matrix.
- See Also
- --------
- DataFrame.corrwith : Compute pairwise correlation with another
- DataFrame or Series.
- Series.corr : Compute the correlation between two Series.
- Notes
- -----
- Pearson, Kendall and Spearman correlation are currently computed using pairwise complete observations.
- * `Pearson correlation coefficient <https://en.wikipedia.org/wiki/Pearson_correlation_coefficient>`_
- * `Kendall rank correlation coefficient <https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient>`_
- * `Spearman's rank correlation coefficient <https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient>`_
- Examples
- --------
- >>> def histogram_intersection(a, b):
- ... v = np.minimum(a, b).sum().round(decimals=1)
- ... return v
- >>> df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
- ... columns=['dogs', 'cats'])
- >>> df.corr(method=histogram_intersection)
- dogs cats
- dogs 1.0 0.3
- cats 0.3 1.0
- >>> df = pd.DataFrame([(1, 1), (2, np.nan), (np.nan, 3), (4, 4)],
- ... columns=['dogs', 'cats'])
- >>> df.corr(min_periods=3)
- dogs cats
- dogs 1.0 NaN
- cats NaN 1.0
- """ # noqa:E501
- data = self._get_numeric_data() if numeric_only else self
- cols = data.columns
- idx = cols.copy()
- mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
- if method == "pearson":
- correl = libalgos.nancorr(mat, minp=min_periods)
- elif method == "spearman":
- correl = libalgos.nancorr_spearman(mat, minp=min_periods)
- elif method == "kendall" or callable(method):
- if min_periods is None:
- min_periods = 1
- mat = mat.T
- corrf = nanops.get_corr_func(method)
- K = len(cols)
- correl = np.empty((K, K), dtype=float)
- mask = np.isfinite(mat)
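- # Compute only the upper triangle and mirror it: the matrix is
- # symmetric by construction. Pairs sharing fewer than min_periods
- # finite observations get NaN, and the diagonal is fixed at 1.0.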
- for i, ac in enumerate(mat):
- for j, bc in enumerate(mat):
- if i > j:
- continue
- valid = mask[i] & mask[j]
- if valid.sum() < min_periods:
- c = np.nan
- elif i == j:
- c = 1.0
- elif not valid.all():
- c = corrf(ac[valid], bc[valid])
- else:
- c = corrf(ac, bc)
- correl[i, j] = c
- correl[j, i] = c
- else:
- raise ValueError(
- "method must be either 'pearson', "
- "'spearman', 'kendall', or a callable, "
- f"'{method}' was supplied"
- )
- result = self._constructor(correl, index=idx, columns=cols, copy=False)
- return result.__finalize__(self, method="corr")
- def cov(
- self,
- min_periods: int | None = None,
- ddof: int | None = 1,
- numeric_only: bool = False,
- ) -> DataFrame:
- """
- Compute pairwise covariance of columns, excluding NA/null values.
- Compute the pairwise covariance among the series of a DataFrame.
- The returned data frame is the `covariance matrix
- <https://en.wikipedia.org/wiki/Covariance_matrix>`__ of the columns
- of the DataFrame.
- Both NA and null values are automatically excluded from the
- calculation. (See the note below about bias from missing values.)
- A threshold can be set for the minimum number of
- observations for each value created. Comparisons with observations
- below this threshold will be returned as ``NaN``.
- This method is generally used for the analysis of time series data to
- understand the relationship between different measures
- across time.
- Parameters
- ----------
- min_periods : int, optional
- Minimum number of observations required per pair of columns
- to have a valid result.
- ddof : int, default 1
- Delta degrees of freedom. The divisor used in calculations
- is ``N - ddof``, where ``N`` represents the number of elements.
- .. versionadded:: 1.1.0
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionadded:: 1.5.0
- .. versionchanged:: 2.0.0
- The default value of ``numeric_only`` is now ``False``.
- Returns
- -------
- DataFrame
- The covariance matrix of the series of the DataFrame.
- See Also
- --------
- Series.cov : Compute covariance with another Series.
- core.window.ewm.ExponentialMovingWindow.cov : Exponential weighted sample
- covariance.
- core.window.expanding.Expanding.cov : Expanding sample covariance.
- core.window.rolling.Rolling.cov : Rolling sample covariance.
- Notes
- -----
- Returns the covariance matrix of the DataFrame's time series.
- The covariance is normalized by N-ddof.
- For DataFrames that have Series that are missing data (assuming that
- data is `missing at random
- <https://en.wikipedia.org/wiki/Missing_data#Missing_at_random>`__)
- the returned covariance matrix will be an unbiased estimate
- of the variance and covariance between the member Series.
- However, for many applications this estimate may not be acceptable
- because the estimate covariance matrix is not guaranteed to be positive
- semi-definite. This could lead to estimate correlations having
- absolute values which are greater than one, and/or a non-invertible
- covariance matrix. See `Estimation of covariance matrices
- <https://en.wikipedia.org/w/index.php?title=Estimation_of_covariance_matrices>`__
- for more details.
- Examples
- --------
- >>> df = pd.DataFrame([(1, 2), (0, 3), (2, 0), (1, 1)],
- ... columns=['dogs', 'cats'])
- >>> df.cov()
- dogs cats
- dogs 0.666667 -1.000000
- cats -1.000000 1.666667
- >>> np.random.seed(42)
- >>> df = pd.DataFrame(np.random.randn(1000, 5),
- ... columns=['a', 'b', 'c', 'd', 'e'])
- >>> df.cov()
- a b c d e
- a 0.998438 -0.020161 0.059277 -0.008943 0.014144
- b -0.020161 1.059352 -0.008543 -0.024738 0.009826
- c 0.059277 -0.008543 1.010670 -0.001486 -0.000271
- d -0.008943 -0.024738 -0.001486 0.921297 -0.013692
- e 0.014144 0.009826 -0.000271 -0.013692 0.977795
- **Minimum number of periods**
- This method also supports an optional ``min_periods`` keyword
- that specifies the required minimum number of non-NA observations for
- each column pair in order to have a valid result:
- >>> np.random.seed(42)
- >>> df = pd.DataFrame(np.random.randn(20, 3),
- ... columns=['a', 'b', 'c'])
- >>> df.loc[df.index[:5], 'a'] = np.nan
- >>> df.loc[df.index[5:10], 'b'] = np.nan
- >>> df.cov(min_periods=12)
- a b c
- a 0.316741 NaN -0.150812
- b NaN 1.248003 0.191417
- c -0.150812 0.191417 0.895202
- """
- data = self._get_numeric_data() if numeric_only else self
- cols = data.columns
- idx = cols.copy()
- mat = data.to_numpy(dtype=float, na_value=np.nan, copy=False)
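- # Fast path: with no missing values we can defer to np.cov directly;
- # otherwise fall back to the pairwise-complete nancorr with cov=True.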
- if notna(mat).all():
- if min_periods is not None and min_periods > len(mat):
- base_cov = np.empty((mat.shape[1], mat.shape[1]))
- base_cov.fill(np.nan)
- else:
- base_cov = np.cov(mat.T, ddof=ddof)
- base_cov = base_cov.reshape((len(cols), len(cols)))
- else:
- base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)
- result = self._constructor(base_cov, index=idx, columns=cols, copy=False)
- return result.__finalize__(self, method="cov")
- def corrwith(
- self,
- other: DataFrame | Series,
- axis: Axis = 0,
- drop: bool = False,
- method: CorrelationMethod = "pearson",
- numeric_only: bool = False,
- ) -> Series:
- """
- Compute pairwise correlation.
- Pairwise correlation is computed between rows or columns of
- DataFrame with rows or columns of Series or DataFrame. DataFrames
- are first aligned along both axes before computing the
- correlations.
- Parameters
- ----------
- other : DataFrame, Series
- Object with which to compute correlations.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to use. 0 or 'index' to compute row-wise, 1 or 'columns' for
- column-wise.
- drop : bool, default False
- Drop missing indices from result.
- method : {'pearson', 'kendall', 'spearman'} or callable
- Method of correlation:
- * pearson : standard correlation coefficient
- * kendall : Kendall Tau correlation coefficient
- * spearman : Spearman rank correlation
- * callable: callable with input two 1d ndarrays
- and returning a float.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionadded:: 1.5.0
- .. versionchanged:: 2.0.0
- The default value of ``numeric_only`` is now ``False``.
- Returns
- -------
- Series
- Pairwise correlations.
- See Also
- --------
- DataFrame.corr : Compute pairwise correlation of columns.
- Examples
- --------
- >>> index = ["a", "b", "c", "d", "e"]
- >>> columns = ["one", "two", "three", "four"]
- >>> df1 = pd.DataFrame(np.arange(20).reshape(5, 4), index=index, columns=columns)
- >>> df2 = pd.DataFrame(np.arange(16).reshape(4, 4), index=index[:4], columns=columns)
- >>> df1.corrwith(df2)
- one 1.0
- two 1.0
- three 1.0
- four 1.0
- dtype: float64
- >>> df2.corrwith(df1, axis=1)
- a 1.0
- b 1.0
- c 1.0
- d 1.0
- e NaN
- dtype: float64
- """ # noqa:E501
- axis = self._get_axis_number(axis)
- this = self._get_numeric_data() if numeric_only else self
- if isinstance(other, Series):
- return this.apply(lambda x: other.corr(x, method=method), axis=axis)
- if numeric_only:
- other = other._get_numeric_data()
- left, right = this.align(other, join="inner", copy=False)
- if axis == 1:
- left = left.T
- right = right.T
- if method == "pearson":
- # mask missing values
- left = left + right * 0
- right = right + left * 0
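- # adding 0 * other propagates NaNs from either frame into both, so
- # the demeaning below only uses pairwise-complete observations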
- # demeaned data
- ldem = left - left.mean(numeric_only=numeric_only)
- rdem = right - right.mean(numeric_only=numeric_only)
- num = (ldem * rdem).sum()
- dom = (
- (left.count() - 1)
- * left.std(numeric_only=numeric_only)
- * right.std(numeric_only=numeric_only)
- )
- correl = num / dom
- elif method in ["kendall", "spearman"] or callable(method):
- def c(x):
- return nanops.nancorr(x[0], x[1], method=method)
- correl = self._constructor_sliced(
- map(c, zip(left.values.T, right.values.T)),
- index=left.columns,
- copy=False,
- )
- else:
- raise ValueError(
- f"Invalid method {method} was passed, "
- "valid methods are: 'pearson', 'kendall', "
- "'spearman', or callable"
- )
- if not drop:
- # Find non-matching labels along the given axis
- # and append missing correlations (GH 22375)
- raxis: AxisInt = 1 if axis == 0 else 0
- result_index = this._get_axis(raxis).union(other._get_axis(raxis))
- idx_diff = result_index.difference(correl.index)
- if len(idx_diff) > 0:
- correl = correl._append(
- Series([np.nan] * len(idx_diff), index=idx_diff)
- )
- return correl
- # ----------------------------------------------------------------------
- # ndarray-like stats methods
- def count(self, axis: Axis = 0, numeric_only: bool = False):
- """
- Count non-NA cells for each column or row.
- The values `None`, `NaN`, `NaT`, and optionally `numpy.inf` (depending
- on `pandas.options.mode.use_inf_as_na`) are considered NA.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- If 0 or 'index' counts are generated for each column.
- If 1 or 'columns' counts are generated for each row.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- Returns
- -------
- Series or DataFrame
- For each column/row the number of non-NA/null entries.
- See Also
- --------
- Series.count: Number of non-NA elements in a Series.
- DataFrame.value_counts: Count unique combinations of columns.
- DataFrame.shape: Number of DataFrame rows and columns (including NA
- elements).
- DataFrame.isna: Boolean same-sized DataFrame showing places of NA
- elements.
- Examples
- --------
- Constructing DataFrame from a dictionary:
- >>> df = pd.DataFrame({"Person":
- ... ["John", "Myla", "Lewis", "John", "Myla"],
- ... "Age": [24., np.nan, 21., 33, 26],
- ... "Single": [False, True, True, True, False]})
- >>> df
- Person Age Single
- 0 John 24.0 False
- 1 Myla NaN True
- 2 Lewis 21.0 True
- 3 John 33.0 True
- 4 Myla 26.0 False
- Notice the uncounted NA values:
- >>> df.count()
- Person 5
- Age 4
- Single 5
- dtype: int64
- Counts for each **row**:
- >>> df.count(axis='columns')
- 0 3
- 1 2
- 2 3
- 3 3
- 4 3
- dtype: int64
- """
- axis = self._get_axis_number(axis)
- if numeric_only:
- frame = self._get_numeric_data()
- else:
- frame = self
- # GH #423
- if len(frame._get_axis(axis)) == 0:
- result = self._constructor_sliced(0, index=frame._get_agg_axis(axis))
- else:
- if frame._is_mixed_type or frame._mgr.any_extension_types:
- # the or any_extension_types is really only hit for single-
- # column frames with an extension array
- result = notna(frame).sum(axis=axis)
- else:
- # GH13407
- series_counts = notna(frame).sum(axis=axis)
- counts = series_counts._values
- result = self._constructor_sliced(
- counts, index=frame._get_agg_axis(axis), copy=False
- )
- return result.astype("int64").__finalize__(self, method="count")
- def _reduce(
- self,
- op,
- name: str,
- *,
- axis: Axis = 0,
- skipna: bool = True,
- numeric_only: bool = False,
- filter_type=None,
- **kwds,
- ):
- assert filter_type is None or filter_type == "bool", filter_type
- out_dtype = "bool" if filter_type == "bool" else None
- if axis is not None:
- axis = self._get_axis_number(axis)
- def func(values: np.ndarray):
- # Only used when axis is None, in which case we operate on self.values
- return op(values, axis=axis, skipna=skipna, **kwds)
- def blk_func(values, axis: Axis = 1):
- if isinstance(values, ExtensionArray):
- if not is_1d_only_ea_dtype(values.dtype) and not isinstance(
- self._mgr, ArrayManager
- ):
- return values._reduce(name, axis=1, skipna=skipna, **kwds)
- return values._reduce(name, skipna=skipna, **kwds)
- else:
- return op(values, axis=axis, skipna=skipna, **kwds)
- def _get_data() -> DataFrame:
- if filter_type is None:
- data = self._get_numeric_data()
- else:
- # GH#25101, GH#24434
- assert filter_type == "bool"
- data = self._get_bool_data()
- return data
- # Case with EAs see GH#35881
- df = self
- if numeric_only:
- df = _get_data()
- if axis is None:
- return func(df.values)
- elif axis == 1:
- if len(df.index) == 0:
- # Taking a transpose would result in no columns, losing the dtype.
- # In the empty case, reducing along axis 0 or 1 gives the same
- # result dtype, so reduce with axis=0 and ignore values
- result = df._reduce(
- op,
- name,
- axis=0,
- skipna=skipna,
- numeric_only=False,
- filter_type=filter_type,
- **kwds,
- ).iloc[:0]
- result.index = df.index
- return result
- df = df.T
- # After possibly _get_data and transposing, we are now in the
- # simple case where we can use BlockManager.reduce
- res = df._mgr.reduce(blk_func)
- out = df._constructor(res).iloc[0]
- if out_dtype is not None:
- out = out.astype(out_dtype)
- elif (df._mgr.get_dtypes() == object).any():
- out = out.astype(object)
- elif len(self) == 0 and name in ("sum", "prod"):
- # Even if we are object dtype, follow numpy and return
- # float64, see test_apply_funcs_over_empty
- out = out.astype(np.float64)
- return out
- def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
- """
- Special case for _reduce to try to avoid a potentially-expensive transpose.
- Apply the reduction block-wise along axis=1 and then reduce the resulting
- 1D arrays.
- """
- if name == "all":
- result = np.ones(len(self), dtype=bool)
- ufunc = np.logical_and
- elif name == "any":
- result = np.zeros(len(self), dtype=bool)
- # error: Incompatible types in assignment
- # (expression has type "_UFunc_Nin2_Nout1[Literal['logical_or'],
- # Literal[20], Literal[False]]", variable has type
- # "_UFunc_Nin2_Nout1[Literal['logical_and'], Literal[20],
- # Literal[True]]")
- ufunc = np.logical_or # type: ignore[assignment]
- else:
- raise NotImplementedError(name)
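- # Reduce block by block: each block collapses along axis=0 to one
- # value per frame row, which is then folded into the running result
- # via the logical ufunc, avoiding a full transpose of the frame.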
- for arr in self._mgr.arrays:
- middle = func(arr, axis=0, skipna=skipna)
- result = ufunc(result, middle)
- res_ser = self._constructor_sliced(result, index=self.index, copy=False)
- return res_ser
- def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
- """
- Count number of distinct elements in specified axis.
- Return Series with number of distinct elements. Can ignore NaN
- values.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for
- column-wise.
- dropna : bool, default True
- Don't include NaN in the counts.
- Returns
- -------
- Series
- See Also
- --------
- Series.nunique: Method nunique for Series.
- DataFrame.count: Count non-NA cells for each column or row.
- Examples
- --------
- >>> df = pd.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]})
- >>> df.nunique()
- A 3
- B 2
- dtype: int64
- >>> df.nunique(axis=1)
- 0 1
- 1 2
- 2 2
- dtype: int64
- """
- return self.apply(Series.nunique, axis=axis, dropna=dropna)
- @doc(_shared_docs["idxmin"], numeric_only_default="False")
- def idxmin(
- self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
- ) -> Series:
- axis = self._get_axis_number(axis)
- if numeric_only:
- data = self._get_numeric_data()
- else:
- data = self
- res = data._reduce(
- nanops.nanargmin, "argmin", axis=axis, skipna=skipna, numeric_only=False
- )
- indices = res._values
- # indices will always be np.ndarray since axis is not None and
- # values is a 2d array for DataFrame
- # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
- assert isinstance(indices, np.ndarray) # for mypy
- index = data._get_axis(axis)
- result = [index[i] if i >= 0 else np.nan for i in indices]
- final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
- return final_result.__finalize__(self, method="idxmin")
- @doc(_shared_docs["idxmax"], numeric_only_default="False")
- def idxmax(
- self, axis: Axis = 0, skipna: bool = True, numeric_only: bool = False
- ) -> Series:
- axis = self._get_axis_number(axis)
- if numeric_only:
- data = self._get_numeric_data()
- else:
- data = self
- res = data._reduce(
- nanops.nanargmax, "argmax", axis=axis, skipna=skipna, numeric_only=False
- )
- indices = res._values
- # indices will always be np.ndarray since axis is not None and
- # values is a 2d array for DataFrame
- # error: Item "int" of "Union[int, Any]" has no attribute "__iter__"
- assert isinstance(indices, np.ndarray) # for mypy
- index = data._get_axis(axis)
- result = [index[i] if i >= 0 else np.nan for i in indices]
- final_result = data._constructor_sliced(result, index=data._get_agg_axis(axis))
- return final_result.__finalize__(self, method="idxmax")
- def _get_agg_axis(self, axis_num: int) -> Index:
- """
- Let's be explicit about this.
- """
- if axis_num == 0:
- return self.columns
- elif axis_num == 1:
- return self.index
- else:
- raise ValueError(f"Axis must be 0 or 1 (got {repr(axis_num)})")
- def mode(
- self, axis: Axis = 0, numeric_only: bool = False, dropna: bool = True
- ) -> DataFrame:
- """
- Get the mode(s) of each element along the selected axis.
- The mode of a set of values is the value that appears most often.
- It can be multiple values.
- Parameters
- ----------
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to iterate over while searching for the mode:
- * 0 or 'index' : get mode of each column
- * 1 or 'columns' : get mode of each row.
- numeric_only : bool, default False
- If True, only apply to numeric columns.
- dropna : bool, default True
- Don't consider counts of NaN/NaT.
- Returns
- -------
- DataFrame
- The modes of each column or row.
- See Also
- --------
- Series.mode : Return the highest frequency value in a Series.
- Series.value_counts : Return the counts of values in a Series.
- Examples
- --------
- >>> df = pd.DataFrame([('bird', 2, 2),
- ... ('mammal', 4, np.nan),
- ... ('arthropod', 8, 0),
- ... ('bird', 2, np.nan)],
- ... index=('falcon', 'horse', 'spider', 'ostrich'),
- ... columns=('species', 'legs', 'wings'))
- >>> df
- species legs wings
- falcon bird 2 2.0
- horse mammal 4 NaN
- spider arthropod 8 0.0
- ostrich bird 2 NaN
- By default, missing values are not considered, and the modes of
- ``wings`` are both 0 and 2. Because the resulting DataFrame has two
- rows, the second row of ``species`` and ``legs`` contains ``NaN``.
- >>> df.mode()
- species legs wings
- 0 bird 2.0 0.0
- 1 NaN NaN 2.0
- With ``dropna=False``, ``NaN`` values are considered, and they can be
- the mode (as for ``wings``).
- >>> df.mode(dropna=False)
- species legs wings
- 0 bird 2 NaN
- Setting ``numeric_only=True``, only the mode of numeric columns is
- computed, and columns of other types are ignored.
- >>> df.mode(numeric_only=True)
- legs wings
- 0 2.0 0.0
- 1 NaN 2.0
- To compute the mode over columns and not rows, use the axis parameter:
- >>> df.mode(axis='columns', numeric_only=True)
- 0 1
- falcon 2.0 NaN
- horse 4.0 NaN
- spider 0.0 8.0
- ostrich 2.0 NaN
- """
- data = self if not numeric_only else self._get_numeric_data()
- def f(s):
- return s.mode(dropna=dropna)
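- # Series.mode can return a different number of modes per column;
- # apply aligns the results into one frame, padding shorter columns
- # with NaN.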
- data = data.apply(f, axis=axis)
- # Ensure index is type stable (should always use int index)
- if data.empty:
- data.index = default_index(0)
- return data
- @overload
- def quantile(
- self,
- q: float = ...,
- axis: Axis = ...,
- numeric_only: bool = ...,
- interpolation: QuantileInterpolation = ...,
- ) -> Series:
- ...
- @overload
- def quantile(
- self,
- q: AnyArrayLike | Sequence[float],
- axis: Axis = ...,
- numeric_only: bool = ...,
- interpolation: QuantileInterpolation = ...,
- ) -> Series | DataFrame:
- ...
- @overload
- def quantile(
- self,
- q: float | AnyArrayLike | Sequence[float] = ...,
- axis: Axis = ...,
- numeric_only: bool = ...,
- interpolation: QuantileInterpolation = ...,
- ) -> Series | DataFrame:
- ...
- def quantile(
- self,
- q: float | AnyArrayLike | Sequence[float] = 0.5,
- axis: Axis = 0,
- numeric_only: bool = False,
- interpolation: QuantileInterpolation = "linear",
- method: Literal["single", "table"] = "single",
- ) -> Series | DataFrame:
- """
- Return values at the given quantile over requested axis.
- Parameters
- ----------
- q : float or array-like, default 0.5 (50% quantile)
- Value(s) between 0 and 1 inclusive, the quantile(s) to compute.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- Equals 0 or 'index' for row-wise, 1 or 'columns' for column-wise.
- numeric_only : bool, default False
- Include only `float`, `int` or `boolean` data.
- .. versionchanged:: 2.0.0
- The default value of ``numeric_only`` is now ``False``.
- interpolation : {'linear', 'lower', 'higher', 'midpoint', 'nearest'}
- This optional parameter specifies the interpolation method to use,
- when the desired quantile lies between two data points `i` and `j`:
- * linear: `i + (j - i) * fraction`, where `fraction` is the
- fractional part of the index surrounded by `i` and `j`.
- * lower: `i`.
- * higher: `j`.
- * nearest: `i` or `j` whichever is nearest.
- * midpoint: (`i` + `j`) / 2.
- method : {'single', 'table'}, default 'single'
- Whether to compute quantiles per-column ('single') or over all columns
- ('table'). When 'table', the only allowed interpolation methods are
- 'nearest', 'lower', and 'higher'.
- Returns
- -------
- Series or DataFrame
- If ``q`` is an array, a DataFrame will be returned where the
- index is ``q``, the columns are the columns of self, and the
- values are the quantiles.
- If ``q`` is a float, a Series will be returned where the
- index is the columns of self and the values are the quantiles.
- See Also
- --------
- core.window.rolling.Rolling.quantile: Rolling quantile.
- numpy.percentile: Numpy function to compute the percentile.
- Examples
- --------
- >>> df = pd.DataFrame(np.array([[1, 1], [2, 10], [3, 100], [4, 100]]),
- ... columns=['a', 'b'])
- >>> df.quantile(.1)
- a 1.3
- b 3.7
- Name: 0.1, dtype: float64
- >>> df.quantile([.1, .5])
- a b
- 0.1 1.3 3.7
- 0.5 2.5 55.0
- Specifying `method='table'` will compute the quantile over all columns.
- >>> df.quantile(.1, method="table", interpolation="nearest")
- a 1
- b 1
- Name: 0.1, dtype: int64
- >>> df.quantile([.1, .5], method="table", interpolation="nearest")
- a b
- 0.1 1 1
- 0.5 3 100
- Specifying `numeric_only=False` will also compute the quantile of
- datetime and timedelta data.
- >>> df = pd.DataFrame({'A': [1, 2],
- ... 'B': [pd.Timestamp('2010'),
- ... pd.Timestamp('2011')],
- ... 'C': [pd.Timedelta('1 days'),
- ... pd.Timedelta('2 days')]})
- >>> df.quantile(0.5, numeric_only=False)
- A 1.5
- B 2010-07-02 12:00:00
- C 1 days 12:00:00
- Name: 0.5, dtype: object
- """
- validate_percentile(q)
- axis = self._get_axis_number(axis)
- if not is_list_like(q):
- # BlockManager.quantile expects listlike, so we wrap and unwrap here
- # error: List item 0 has incompatible type "Union[float, Union[Union[
- # ExtensionArray, ndarray[Any, Any]], Index, Series], Sequence[float]]";
- # expected "float"
- res_df = self.quantile( # type: ignore[call-overload]
- [q],
- axis=axis,
- numeric_only=numeric_only,
- interpolation=interpolation,
- method=method,
- )
- if method == "single":
- res = res_df.iloc[0]
- else:
- # cannot directly iloc over sparse arrays
- res = res_df.T.iloc[:, 0]
- if axis == 1 and len(self) == 0:
- # GH#41544 try to get an appropriate dtype
- dtype = find_common_type(list(self.dtypes))
- if needs_i8_conversion(dtype):
- return res.astype(dtype)
- return res
- q = Index(q, dtype=np.float64)
- data = self._get_numeric_data() if numeric_only else self
- if axis == 1:
- data = data.T
- if len(data.columns) == 0:
- # GH#23925 _get_numeric_data may have dropped all columns
- cols = Index([], name=self.columns.name)
- dtype = np.float64
- if axis == 1:
- # GH#41544 try to get an appropriate dtype
- cdtype = find_common_type(list(self.dtypes))
- if needs_i8_conversion(cdtype):
- dtype = cdtype
- res = self._constructor([], index=q, columns=cols, dtype=dtype)
- return res.__finalize__(self, method="quantile")
- valid_method = {"single", "table"}
- if method not in valid_method:
- raise ValueError(
- f"Invalid method: {method}. Method must be in {valid_method}."
- )
- if method == "single":
- res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation)
- elif method == "table":
- valid_interpolation = {"nearest", "lower", "higher"}
- if interpolation not in valid_interpolation:
- raise ValueError(
- f"Invalid interpolation: {interpolation}. "
- f"Interpolation must be in {valid_interpolation}"
- )
- # handle degenerate case
- if len(data) == 0:
- if data.ndim == 2:
- dtype = find_common_type(list(self.dtypes))
- else:
- dtype = self.dtype
- return self._constructor([], index=q, columns=data.columns, dtype=dtype)
- q_idx = np.quantile( # type: ignore[call-overload]
- np.arange(len(data)), q, **{np_percentile_argname: interpolation}
- )
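- # np_percentile_argname bridges NumPy's rename of this keyword
- # ("interpolation" before NumPy 1.22, "method" from 1.22 on)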
- by = data.columns
- if len(by) > 1:
- keys = [data._get_label_or_level_values(x) for x in by]
- indexer = lexsort_indexer(keys)
- else:
- by = by[0]
- k = data._get_label_or_level_values(by) # type: ignore[arg-type]
- indexer = nargsort(k)
- res = data._mgr.take(indexer[q_idx], verify=False)
- res.axes[1] = q
- result = self._constructor(res)
- return result.__finalize__(self, method="quantile")
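- # Usage sketch (illustrative only; ``df`` below is hypothetical): a scalar
- # ``q`` takes the wrap-and-unwrap path above and comes back as a Series,
- # while a listlike ``q`` yields a DataFrame indexed by ``q``.
- # >>> df = pd.DataFrame({"a": [1, 2, 3, 4], "b": [1, 10, 100, 100]})
- # >>> df.quantile(0.5)                       # Series indexed by column labels
- # >>> df.quantile([0.25, 0.75])              # DataFrame indexed by q
- # >>> df.quantile(0.5, method="table", interpolation="nearest")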
- @doc(NDFrame.asfreq, **_shared_doc_kwargs)
- def asfreq(
- self,
- freq: Frequency,
- method: FillnaOptions | None = None,
- how: str | None = None,
- normalize: bool = False,
- fill_value: Hashable = None,
- ) -> DataFrame:
- return super().asfreq(
- freq=freq,
- method=method,
- how=how,
- normalize=normalize,
- fill_value=fill_value,
- )
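- # Usage sketch (illustrative, hypothetical data): ``asfreq`` reindexes to the
- # target frequency; ``fill_value`` fills only the slots created by that
- # reindex, never NaNs that were already present.
- # >>> dfa = pd.DataFrame({"s": [0.0, None, 2.0]},
- # ...                    index=pd.date_range("2000-01-01", periods=3, freq="min"))
- # >>> dfa.asfreq(freq="30s")                  # new rows appear as NaN
- # >>> dfa.asfreq(freq="30s", fill_value=9.0)  # pre-existing NaN stays NaN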
- @doc(NDFrame.resample, **_shared_doc_kwargs)
- def resample(
- self,
- rule,
- axis: Axis = 0,
- closed: str | None = None,
- label: str | None = None,
- convention: str = "start",
- kind: str | None = None,
- on: Level = None,
- level: Level = None,
- origin: str | TimestampConvertibleTypes = "start_day",
- offset: TimedeltaConvertibleTypes | None = None,
- group_keys: bool = False,
- ) -> Resampler:
- return super().resample(
- rule=rule,
- axis=axis,
- closed=closed,
- label=label,
- convention=convention,
- kind=kind,
- on=on,
- level=level,
- origin=origin,
- offset=offset,
- group_keys=group_keys,
- )
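- # Usage sketch (illustrative, hypothetical names): bin by a rule string,
- # grouping on the index by default or on a datetime column via ``on``.
- # >>> dfr = pd.DataFrame({"price": [10, 11, 9]},
- # ...                    index=pd.date_range("2000-01-01", periods=3, freq="D"))
- # >>> dfr.resample("2D").mean()                        # 2-day bins on the index
- # >>> dfr.reset_index(names="ts").resample("2D", on="ts").mean()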
- def to_timestamp(
- self,
- freq: Frequency | None = None,
- how: str = "start",
- axis: Axis = 0,
- copy: bool | None = None,
- ) -> DataFrame:
- """
- Cast to DatetimeIndex of timestamps, at *beginning* of period.
- Parameters
- ----------
- freq : str, default frequency of PeriodIndex
- Desired frequency.
- how : {'s', 'e', 'start', 'end'}
- Convention for converting period to timestamp; start of period
- vs. end.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to convert (the index by default).
- copy : bool, default True
- If False then underlying input data is not copied.
- Returns
- -------
- DataFrame
- The DataFrame has a DatetimeIndex.
- Examples
- --------
- >>> idx = pd.PeriodIndex(['2023', '2024'], freq='Y')
- >>> d = {'col1': [1, 2], 'col2': [3, 4]}
- >>> df1 = pd.DataFrame(data=d, index=idx)
- >>> df1
- col1 col2
- 2023 1 3
- 2024 2 4
- The resulting timestamps will be at the beginning of the year in this case:
- >>> df1 = df1.to_timestamp()
- >>> df1
- col1 col2
- 2023-01-01 1 3
- 2024-01-01 2 4
- >>> df1.index
- DatetimeIndex(['2023-01-01', '2024-01-01'], dtype='datetime64[ns]', freq=None)
- Using `freq`, the offset that the resulting Timestamps will have:
- >>> df2 = pd.DataFrame(data=d, index=idx)
- >>> df2 = df2.to_timestamp(freq='M')
- >>> df2
- col1 col2
- 2023-01-31 1 3
- 2024-01-31 2 4
- >>> df2.index
- DatetimeIndex(['2023-01-31', '2024-01-31'], dtype='datetime64[ns]', freq=None)
- """
- new_obj = self.copy(deep=copy and not using_copy_on_write())
- axis_name = self._get_axis_name(axis)
- old_ax = getattr(self, axis_name)
- if not isinstance(old_ax, PeriodIndex):
- raise TypeError(f"unsupported Type {type(old_ax).__name__}")
- new_ax = old_ax.to_timestamp(freq=freq, how=how)
- setattr(new_obj, axis_name, new_ax)
- return new_obj
- def to_period(
- self, freq: Frequency | None = None, axis: Axis = 0, copy: bool | None = None
- ) -> DataFrame:
- """
- Convert DataFrame from DatetimeIndex to PeriodIndex.
- Convert DataFrame from DatetimeIndex to PeriodIndex with desired
- frequency (inferred from index if not passed).
- Parameters
- ----------
- freq : str, optional
- Frequency of the PeriodIndex; inferred from the index if not passed.
- axis : {0 or 'index', 1 or 'columns'}, default 0
- The axis to convert (the index by default).
- copy : bool, default True
- If False then underlying input data is not copied.
- Returns
- -------
- DataFrame
- The DataFrame has a PeriodIndex.
- Examples
- --------
- >>> idx = pd.to_datetime(
- ... [
- ... "2001-03-31 00:00:00",
- ... "2002-05-31 00:00:00",
- ... "2003-08-31 00:00:00",
- ... ]
- ... )
- >>> idx
- DatetimeIndex(['2001-03-31', '2002-05-31', '2003-08-31'],
- dtype='datetime64[ns]', freq=None)
- >>> idx.to_period("M")
- PeriodIndex(['2001-03', '2002-05', '2003-08'], dtype='period[M]')
- For the yearly frequency:
- >>> idx.to_period("Y")
- PeriodIndex(['2001', '2002', '2003'], dtype='period[A-DEC]')
- """
- new_obj = self.copy(deep=copy and not using_copy_on_write())
- axis_name = self._get_axis_name(axis)
- old_ax = getattr(self, axis_name)
- if not isinstance(old_ax, DatetimeIndex):
- raise TypeError(f"unsupported Type {type(old_ax).__name__}")
- new_ax = old_ax.to_period(freq=freq)
- setattr(new_obj, axis_name, new_ax)
- return new_obj
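- # Frame-level sketch mirroring the Index examples in the docstring above
- # (hypothetical data):
- # >>> dfp = pd.DataFrame({"col": [1, 2]},
- # ...                    index=pd.to_datetime(["2023-01-01", "2024-01-01"]))
- # >>> dfp.to_period("Y").index
- # PeriodIndex(['2023', '2024'], dtype='period[A-DEC]')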
- def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
- """
- Whether each element in the DataFrame is contained in values.
- Parameters
- ----------
- values : iterable, Series, DataFrame or dict
- The result will only be true at a location if all the
- labels match. If `values` is a Series, its index labels
- must match. If `values` is a dict, its keys must be
- column names, and matching is done per column. If `values`
- is a DataFrame, then both the index and column labels must match.
- Returns
- -------
- DataFrame
- DataFrame of booleans showing whether each element in the DataFrame
- is contained in values.
- See Also
- --------
- DataFrame.eq: Equality test for DataFrame.
- Series.isin: Equivalent method on Series.
- Series.str.contains: Test if pattern or regex is contained within a
- string of a Series or Index.
- Examples
- --------
- >>> df = pd.DataFrame({'num_legs': [2, 4], 'num_wings': [2, 0]},
- ... index=['falcon', 'dog'])
- >>> df
- num_legs num_wings
- falcon 2 2
- dog 4 0
- When ``values`` is a list, check whether every value in the DataFrame
- is present in the list (here: which animals have 0 or 2 legs or wings).
- >>> df.isin([0, 2])
- num_legs num_wings
- falcon True True
- dog False True
- To check if ``values`` is *not* in the DataFrame, use the ``~`` operator:
- >>> ~df.isin([0, 2])
- num_legs num_wings
- falcon False False
- dog True False
- When ``values`` is a dict, we can pass values to check for each
- column separately:
- >>> df.isin({'num_wings': [0, 3]})
- num_legs num_wings
- falcon False False
- dog False True
- When ``values`` is a Series or DataFrame, the index and columns must
- match. Note that 'falcon' does not match based on the number of legs
- in other.
- >>> other = pd.DataFrame({'num_legs': [8, 3], 'num_wings': [0, 2]},
- ... index=['spider', 'falcon'])
- >>> df.isin(other)
- num_legs num_wings
- falcon False True
- dog False False
- """
- if isinstance(values, dict):
- from pandas.core.reshape.concat import concat
- values = collections.defaultdict(list, values)
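- # columns absent from ``values`` fall back to an empty list via the
- # defaultdict, so they come out all-False instead of raising a KeyError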
- result = concat(
- (
- self.iloc[:, [i]].isin(values[col])
- for i, col in enumerate(self.columns)
- ),
- axis=1,
- )
- elif isinstance(values, Series):
- if not values.index.is_unique:
- raise ValueError("cannot compute isin with a duplicate axis.")
- result = self.eq(values.reindex_like(self), axis="index")
- elif isinstance(values, DataFrame):
- if not (values.columns.is_unique and values.index.is_unique):
- raise ValueError("cannot compute isin with a duplicate axis.")
- result = self.eq(values.reindex_like(self))
- else:
- if not is_list_like(values):
- raise TypeError(
- "only list-like or dict-like objects are allowed "
- "to be passed to DataFrame.isin(), "
- f"you passed a '{type(values).__name__}'"
- )
- # error: Argument 2 to "isin" has incompatible type "Union[Sequence[Any],
- # Mapping[Any, Any]]"; expected "Union[Union[ExtensionArray,
- # ndarray[Any, Any]], Index, Series]"
- result = self._constructor(
- algorithms.isin(
- self.values.ravel(), values # type: ignore[arg-type]
- ).reshape(self.shape),
- self.index,
- self.columns,
- copy=False,
- )
- return result.__finalize__(self, method="isin")
- # ----------------------------------------------------------------------
- # Add index and columns
- _AXIS_ORDERS: list[Literal["index", "columns"]] = ["index", "columns"]
- _AXIS_TO_AXIS_NUMBER: dict[Axis, int] = {
- **NDFrame._AXIS_TO_AXIS_NUMBER,
- 1: 1,
- "columns": 1,
- }
- _AXIS_LEN = len(_AXIS_ORDERS)
- _info_axis_number: Literal[1] = 1
- _info_axis_name: Literal["columns"] = "columns"
- index = properties.AxisProperty(
- axis=1, doc="The index (row labels) of the DataFrame."
- )
- columns = properties.AxisProperty(axis=0, doc="The column labels of the DataFrame.")
- # ----------------------------------------------------------------------
- # Add plotting methods to DataFrame
- plot = CachedAccessor("plot", pandas.plotting.PlotAccessor)
- hist = pandas.plotting.hist_frame
- boxplot = pandas.plotting.boxplot_frame
- sparse = CachedAccessor("sparse", SparseFrameAccessor)
- # ----------------------------------------------------------------------
- # Internal Interface Methods
- def _to_dict_of_blocks(self, copy: bool = True):
- """
- Return a dict of dtype (as string) -> DataFrame, where each
- DataFrame holds the columns of that single, homogeneous dtype.
- Internal ONLY - only works for BlockManager
- """
- mgr = self._mgr
- # convert to BlockManager if needed -> this way support ArrayManager as well
- mgr = mgr_to_mgr(mgr, "block")
- mgr = cast(BlockManager, mgr)
- return {
- k: self._constructor(v).__finalize__(self)
- for k, v in mgr.to_dict(copy=copy).items()
- }
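- # Illustrative result shape (hypothetical frame with one int64 and one
- # float64 column): {"int64": <DataFrame>, "float64": <DataFrame>}, keyed
- # by the string form of each dtype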
- @property
- def values(self) -> np.ndarray:
- """
- Return a Numpy representation of the DataFrame.
- .. warning::
- We recommend using :meth:`DataFrame.to_numpy` instead.
- Only the values in the DataFrame will be returned, the axes labels
- will be removed.
- Returns
- -------
- numpy.ndarray
- The values of the DataFrame.
- See Also
- --------
- DataFrame.to_numpy : Recommended alternative to this method.
- DataFrame.index : Retrieve the index labels.
- DataFrame.columns : Retrieving the column names.
- Notes
- -----
- The dtype will be a lower-common-denominator dtype (implicit
- upcasting); that is to say if the dtypes (even of numeric types)
- are mixed, the one that accommodates all will be chosen. Use this
- with care if you are not dealing with the blocks.
- e.g. If the dtypes are float16 and float32, dtype will be upcast to
- float32. If dtypes are int32 and uint8, dtype will be upcast to
- int32. By :func:`numpy.find_common_type` convention, mixing int64
- and uint64 will result in a float64 dtype.
- Examples
- --------
- A DataFrame where all columns are the same type (e.g., int64) results
- in an array of the same type.
- >>> df = pd.DataFrame({'age': [ 3, 29],
- ... 'height': [94, 170],
- ... 'weight': [31, 115]})
- >>> df
- age height weight
- 0 3 94 31
- 1 29 170 115
- >>> df.dtypes
- age int64
- height int64
- weight int64
- dtype: object
- >>> df.values
- array([[ 3, 94, 31],
- [ 29, 170, 115]])
- A DataFrame with mixed-type columns (e.g., str/object, int64, float32)
- results in an ndarray of the broadest type that accommodates these
- mixed types (e.g., object).
- >>> df2 = pd.DataFrame([('parrot', 24.0, 'second'),
- ... ('lion', 80.5, 1),
- ... ('monkey', np.nan, None)],
- ... columns=('name', 'max_speed', 'rank'))
- >>> df2.dtypes
- name object
- max_speed float64
- rank object
- dtype: object
- >>> df2.values
- array([['parrot', 24.0, 'second'],
- ['lion', 80.5, 1],
- ['monkey', nan, None]], dtype=object)
- """
- return self._mgr.as_array()
- @overload
- def ffill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[False] = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> DataFrame:
- ...
- @overload
- def ffill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[True],
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> None:
- ...
- @overload
- def ffill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: bool = ...,
- limit: None | int = ...,
- downcast: dict | None = ...,
- ) -> DataFrame | None:
- ...
- def ffill(
- self,
- *,
- axis: None | Axis = None,
- inplace: bool = False,
- limit: None | int = None,
- downcast: dict | None = None,
- ) -> DataFrame | None:
- return super().ffill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
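- # The @overload stubs above exist purely for static type narrowing: a
- # literal ``inplace=False`` types as DataFrame, a literal ``True`` as None,
- # and a plain bool as DataFrame | None. Illustrative (hypothetical names):
- # >>> out = df.ffill()              # checker infers DataFrame
- # >>> ret = df.ffill(inplace=True)  # checker infers None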
- @overload
- def bfill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[False] = ...,
- limit: None | int = ...,
- downcast=...,
- ) -> DataFrame:
- ...
- @overload
- def bfill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: Literal[True],
- limit: None | int = ...,
- downcast=...,
- ) -> None:
- ...
- @overload
- def bfill(
- self,
- *,
- axis: None | Axis = ...,
- inplace: bool = ...,
- limit: None | int = ...,
- downcast=...,
- ) -> DataFrame | None:
- ...
- def bfill(
- self,
- *,
- axis: None | Axis = None,
- inplace: bool = False,
- limit: None | int = None,
- downcast=None,
- ) -> DataFrame | None:
- return super().bfill(axis=axis, inplace=inplace, limit=limit, downcast=downcast)
- def clip(
- self: DataFrame,
- lower: float | None = None,
- upper: float | None = None,
- *,
- axis: Axis | None = None,
- inplace: bool = False,
- **kwargs,
- ) -> DataFrame | None:
- return super().clip(lower, upper, axis=axis, inplace=inplace, **kwargs)
- def interpolate(
- self: DataFrame,
- method: str = "linear",
- *,
- axis: Axis = 0,
- limit: int | None = None,
- inplace: bool = False,
- limit_direction: str | None = None,
- limit_area: str | None = None,
- downcast: str | None = None,
- **kwargs,
- ) -> DataFrame | None:
- return super().interpolate(
- method=method,
- axis=axis,
- limit=limit,
- inplace=inplace,
- limit_direction=limit_direction,
- limit_area=limit_area,
- downcast=downcast,
- **kwargs,
- )
- @overload
- def where(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[False] = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> DataFrame:
- ...
- @overload
- def where(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[True],
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> None:
- ...
- @overload
- def where(
- self,
- cond,
- other=...,
- *,
- inplace: bool = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> DataFrame | None:
- ...
- def where(
- self,
- cond,
- other=lib.no_default,
- *,
- inplace: bool = False,
- axis: Axis | None = None,
- level: Level = None,
- ) -> DataFrame | None:
- return super().where(
- cond,
- other,
- inplace=inplace,
- axis=axis,
- level=level,
- )
- @overload
- def mask(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[False] = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> DataFrame:
- ...
- @overload
- def mask(
- self,
- cond,
- other=...,
- *,
- inplace: Literal[True],
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> None:
- ...
- @overload
- def mask(
- self,
- cond,
- other=...,
- *,
- inplace: bool = ...,
- axis: Axis | None = ...,
- level: Level = ...,
- ) -> DataFrame | None:
- ...
- def mask(
- self,
- cond,
- other=lib.no_default,
- *,
- inplace: bool = False,
- axis: Axis | None = None,
- level: Level = None,
- ) -> DataFrame | None:
- return super().mask(
- cond,
- other,
- inplace=inplace,
- axis=axis,
- level=level,
- )
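- # where/mask are boolean complements; a minimal sketch (hypothetical data):
- # >>> dfm = pd.DataFrame({"x": [1, -2, 3]})
- # >>> dfm.where(dfm > 0)  # keep entries where cond holds, NaN elsewhere
- # >>> dfm.mask(dfm > 0)   # the inverse: NaN where cond holds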
- DataFrame._add_numeric_operations()
- ops.add_flex_arithmetic_methods(DataFrame)
- def _from_nested_dict(data) -> collections.defaultdict:
- new_data: collections.defaultdict = collections.defaultdict(dict)
- for index, s in data.items():
- for col, v in s.items():
- new_data[col][index] = v
- return new_data
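- # Transposes the nesting: outer keys (index labels) move inside and inner
- # keys (column labels) move outside, e.g.
- # {"r1": {"a": 1}, "r2": {"a": 2}} -> {"a": {"r1": 1, "r2": 2}}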
- def _reindex_for_setitem(value: DataFrame | Series, index: Index) -> ArrayLike:
- # reindex if necessary
- if value.index.equals(index) or not len(index):
- return value._values.copy()
- # GH#4107
- try:
- reindexed_value = value.reindex(index)._values
- except ValueError as err:
- # raised in MultiIndex.from_tuples, see test_insert_error_msmgs
- if not value.index.is_unique:
- # duplicate axis
- raise err
- raise TypeError(
- "incompatible index of inserted column with frame index"
- ) from err
- return reindexed_value
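- # Illustrative effect (hypothetical names): assigning a Series with a
- # different index, e.g. ``df["new"] = ser``, routes through this helper and
- # reindexes ``ser`` to ``df.index``, inserting NaN where no label matches.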