  1. """
  2. High level interface to PyTables for reading and writing pandas data structures
  3. to disk
  4. """
  5. from __future__ import annotations
  6. from contextlib import suppress
  7. import copy
  8. from datetime import (
  9. date,
  10. tzinfo,
  11. )
  12. import itertools
  13. import os
  14. import re
  15. from textwrap import dedent
  16. from types import TracebackType
  17. from typing import (
  18. TYPE_CHECKING,
  19. Any,
  20. Callable,
  21. Final,
  22. Hashable,
  23. Iterator,
  24. Literal,
  25. Sequence,
  26. cast,
  27. overload,
  28. )
  29. import warnings
  30. import numpy as np
  31. from pandas._config import (
  32. config,
  33. get_option,
  34. )
  35. from pandas._libs import (
  36. lib,
  37. writers as libwriters,
  38. )
  39. from pandas._libs.tslibs import timezones
  40. from pandas._typing import (
  41. AnyArrayLike,
  42. ArrayLike,
  43. AxisInt,
  44. DtypeArg,
  45. FilePath,
  46. Shape,
  47. npt,
  48. )
  49. from pandas.compat._optional import import_optional_dependency
  50. from pandas.compat.pickle_compat import patch_pickle
  51. from pandas.errors import (
  52. AttributeConflictWarning,
  53. ClosedFileError,
  54. IncompatibilityWarning,
  55. PerformanceWarning,
  56. PossibleDataLossError,
  57. )
  58. from pandas.util._decorators import cache_readonly
  59. from pandas.util._exceptions import find_stack_level
  60. from pandas.core.dtypes.common import (
  61. ensure_object,
  62. is_bool_dtype,
  63. is_categorical_dtype,
  64. is_complex_dtype,
  65. is_datetime64_dtype,
  66. is_datetime64tz_dtype,
  67. is_extension_array_dtype,
  68. is_integer_dtype,
  69. is_list_like,
  70. is_object_dtype,
  71. is_string_dtype,
  72. is_timedelta64_dtype,
  73. needs_i8_conversion,
  74. )
  75. from pandas.core.dtypes.missing import array_equivalent
  76. from pandas import (
  77. DataFrame,
  78. DatetimeIndex,
  79. Index,
  80. MultiIndex,
  81. PeriodIndex,
  82. RangeIndex,
  83. Series,
  84. TimedeltaIndex,
  85. concat,
  86. isna,
  87. )
  88. from pandas.core.arrays import (
  89. Categorical,
  90. DatetimeArray,
  91. PeriodArray,
  92. )
  93. import pandas.core.common as com
  94. from pandas.core.computation.pytables import (
  95. PyTablesExpr,
  96. maybe_expression,
  97. )
  98. from pandas.core.construction import extract_array
  99. from pandas.core.indexes.api import ensure_index
  100. from pandas.core.internals import (
  101. ArrayManager,
  102. BlockManager,
  103. )
  104. from pandas.io.common import stringify_path
  105. from pandas.io.formats.printing import (
  106. adjoin,
  107. pprint_thing,
  108. )
  109. if TYPE_CHECKING:
  110. from tables import (
  111. Col,
  112. File,
  113. Node,
  114. )
  115. from pandas.core.internals import Block
  116. # versioning attribute
  117. _version = "0.15.2"
  118. # encoding
  119. _default_encoding = "UTF-8"
  120. def _ensure_decoded(s):
  121. """if we have bytes, decode them to unicode"""
  122. if isinstance(s, np.bytes_):
  123. s = s.decode("UTF-8")
  124. return s
  125. def _ensure_encoding(encoding: str | None) -> str:
  126. # set the encoding if we need
  127. if encoding is None:
  128. encoding = _default_encoding
  129. return encoding
  130. def _ensure_str(name):
  131. """
  132. Ensure that an index / column name is a str (python 3); otherwise they
  133. may be np.string dtype. Non-string dtypes are passed through unchanged.
  134. https://github.com/pandas-dev/pandas/issues/13492
  135. """
  136. if isinstance(name, str):
  137. name = str(name)
  138. return name
  139. Term = PyTablesExpr
  140. def _ensure_term(where, scope_level: int):
  141. """
  142. Ensure that the where is a Term or a list of Term.
  143. This makes sure that we are capturing the scope of variables that are
  144. passed create the terms here with a frame_level=2 (we are 2 levels down)
  145. """
  146. # only consider list/tuple here as an ndarray is automatically a coordinate
  147. # list
  148. level = scope_level + 1
  149. if isinstance(where, (list, tuple)):
  150. where = [
  151. Term(term, scope_level=level + 1) if maybe_expression(term) else term
  152. for term in where
  153. if term is not None
  154. ]
  155. elif maybe_expression(where):
  156. where = Term(where, scope_level=level)
  157. return where if where is None or len(where) else None


incompatibility_doc: Final = """
where criteria is being ignored as this version [%s] is too old (or
not-defined); read the file in and write it out to a new file to upgrade (with
the copy_to method)
"""

attribute_conflict_doc: Final = """
the [%s] attribute of the existing index is [%s] which conflicts with the new
[%s], resetting the attribute to None
"""

performance_doc: Final = """
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->%s,key->%s] [items->%s]
"""

# formats
_FORMAT_MAP = {"f": "fixed", "fixed": "fixed", "t": "table", "table": "table"}

# axes map
_AXES_MAP = {DataFrame: [0]}

# register our configuration options
dropna_doc: Final = """
: boolean
    drop ALL nan rows when appending to a table
"""

format_doc: Final = """
: format
    default format for writing; if None, then
    put will default to 'fixed' and append will default to 'table'
"""

with config.config_prefix("io.hdf"):
    config.register_option("dropna_table", False, dropna_doc, validator=config.is_bool)
    config.register_option(
        "default_format",
        None,
        format_doc,
        validator=config.is_one_of_factory(["fixed", "table", None]),
    )

# oh the troubles to reduce import time
_table_mod = None
_table_file_open_policy_is_strict = False


def _tables():
    global _table_mod
    global _table_file_open_policy_is_strict
    if _table_mod is None:
        import tables

        _table_mod = tables

        # set the file open policy
        # return the file open policy; this changes as of pytables 3.1
        # depending on the HDF5 version
        with suppress(AttributeError):
            _table_file_open_policy_is_strict = (
                tables.file._FILE_OPEN_POLICY == "strict"
            )

    return _table_mod


# interface to/from ###


def to_hdf(
    path_or_buf: FilePath | HDFStore,
    key: str,
    value: DataFrame | Series,
    mode: str = "a",
    complevel: int | None = None,
    complib: str | None = None,
    append: bool = False,
    format: str | None = None,
    index: bool = True,
    min_itemsize: int | dict[str, int] | None = None,
    nan_rep=None,
    dropna: bool | None = None,
    data_columns: Literal[True] | list[str] | None = None,
    errors: str = "strict",
    encoding: str = "UTF-8",
) -> None:
    """store this object, close it if we opened it"""
    if append:
        f = lambda store: store.append(
            key,
            value,
            format=format,
            index=index,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            dropna=dropna,
            data_columns=data_columns,
            errors=errors,
            encoding=encoding,
        )
    else:
        # NB: dropna is not passed to `put`
        f = lambda store: store.put(
            key,
            value,
            format=format,
            index=index,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
            errors=errors,
            encoding=encoding,
            dropna=dropna,
        )

    path_or_buf = stringify_path(path_or_buf)
    if isinstance(path_or_buf, str):
        with HDFStore(
            path_or_buf, mode=mode, complevel=complevel, complib=complib
        ) as store:
            f(store)
    else:
        f(path_or_buf)
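

# Usage sketch for the to_hdf/read_hdf pair (illustrative only; "demo.h5" and
# the key "df" are hypothetical, and users typically reach these functions via
# DataFrame.to_hdf and pd.read_hdf):
#
#   >>> df = DataFrame({"a": [1, 2, 3]})
#   >>> to_hdf("demo.h5", "df", df, mode="w", format="table")
#   >>> read_hdf("demo.h5", "df")  # round-trips the frame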


def read_hdf(
    path_or_buf: FilePath | HDFStore,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where: str | list | None = None,
    start: int | None = None,
    stop: int | None = None,
    columns: list[str] | None = None,
    iterator: bool = False,
    chunksize: int | None = None,
    **kwargs,
):
    """
    Read from the store, close it if we opened it.

    Retrieve pandas object stored in file, optionally based on where
    criteria.

    .. warning::

       Pandas uses PyTables for reading and writing HDF5 files, which allows
       serializing object-dtype data with pickle when using the "fixed" format.
       Loading pickled data received from untrusted sources can be unsafe.

       See: https://docs.python.org/3/library/pickle.html for more.

    Parameters
    ----------
    path_or_buf : str, path object, pandas.HDFStore
        Any valid string path is acceptable. Only supports the local file
        system; remote URLs and file-like objects are not supported.
        If you want to pass in a path object, pandas accepts any
        ``os.PathLike``.

        Alternatively, pandas accepts an open :class:`pandas.HDFStore` object.
    key : object, optional
        The group identifier in the store. Can be omitted if the HDF file
        contains a single pandas object.
    mode : {'r', 'r+', 'a'}, default 'r'
        Mode to use when opening the file. Ignored if path_or_buf is a
        :class:`pandas.HDFStore`. Default is 'r'.
    errors : str, default 'strict'
        Specifies how encoding and decoding errors are to be handled.
        See the errors argument for :func:`open` for a full list
        of options.
    where : list, optional
        A list of Term (or convertible) objects.
    start : int, optional
        Row number to start selection.
    stop : int, optional
        Row number to stop selection.
    columns : list, optional
        A list of column names to return.
    iterator : bool, optional
        Return an iterator object.
    chunksize : int, optional
        Number of rows to include in an iteration when using an iterator.
    **kwargs
        Additional keyword arguments passed to HDFStore.

    Returns
    -------
    object
        The selected object. Return type depends on the object stored.

    See Also
    --------
    DataFrame.to_hdf : Write a HDF file from a DataFrame.
    HDFStore : Low-level access to HDF files.

    Examples
    --------
    >>> df = pd.DataFrame([[1, 1.0, 'a']], columns=['x', 'y', 'z'])  # doctest: +SKIP
    >>> df.to_hdf('./store.h5', 'data')  # doctest: +SKIP
    >>> reread = pd.read_hdf('./store.h5')  # doctest: +SKIP
    """
    if mode not in ["r", "r+", "a"]:
        raise ValueError(
            f"mode {mode} is not allowed while performing a read. "
            f"Allowed modes are r, r+ and a."
        )
    # grab the scope
    if where is not None:
        where = _ensure_term(where, scope_level=1)

    if isinstance(path_or_buf, HDFStore):
        if not path_or_buf.is_open:
            raise OSError("The HDFStore must be open for reading.")

        store = path_or_buf
        auto_close = False
    else:
        path_or_buf = stringify_path(path_or_buf)
        if not isinstance(path_or_buf, str):
            raise NotImplementedError(
                "Support for generic buffers has not been implemented."
            )
        try:
            exists = os.path.exists(path_or_buf)
        # if filepath is too long
        except (TypeError, ValueError):
            exists = False

        if not exists:
            raise FileNotFoundError(f"File {path_or_buf} does not exist")

        store = HDFStore(path_or_buf, mode=mode, errors=errors, **kwargs)
        # can't auto open/close if we are using an iterator
        # so delegate to the iterator
        auto_close = True

    try:
        if key is None:
            groups = store.groups()
            if len(groups) == 0:
                raise ValueError(
                    "Dataset(s) incompatible with Pandas data types, "
                    "not table, or no datasets found in HDF5 file."
                )
            candidate_only_group = groups[0]

            # For the HDF file to have only one dataset, all other groups
            # should then be metadata groups for that candidate group. (This
            # assumes that the groups() method enumerates parent groups
            # before their children.)
            for group_to_check in groups[1:]:
                if not _is_metadata_of(group_to_check, candidate_only_group):
                    raise ValueError(
                        "key must be provided when HDF5 "
                        "file contains multiple datasets."
                    )
            key = candidate_only_group._v_pathname
        return store.select(
            key,
            where=where,
            start=start,
            stop=stop,
            columns=columns,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )
    except (ValueError, TypeError, KeyError):
        if not isinstance(path_or_buf, HDFStore):
            # if there is an error, close the store if we opened it.
            with suppress(AttributeError):
                store.close()
        raise


def _is_metadata_of(group: Node, parent_group: Node) -> bool:
    """Check if a given group is a metadata group for a given parent_group."""
    if group._v_depth <= parent_group._v_depth:
        return False

    current = group
    while current._v_depth > 1:
        parent = current._v_parent
        if parent == parent_group and current._v_name == "meta":
            return True
        current = current._v_parent
    return False


class HDFStore:
    """
    Dict-like IO interface for storing pandas objects in PyTables.

    Either Fixed or Table format.

    .. warning::

       Pandas uses PyTables for reading and writing HDF5 files, which allows
       serializing object-dtype data with pickle when using the "fixed" format.
       Loading pickled data received from untrusted sources can be unsafe.

       See: https://docs.python.org/3/library/pickle.html for more.

    Parameters
    ----------
    path : str
        File path to HDF5 file.
    mode : {'a', 'w', 'r', 'r+'}, default 'a'

        ``'r'``
            Read-only; no data can be modified.
        ``'w'``
            Write; a new file is created (an existing file with the same
            name would be deleted).
        ``'a'``
            Append; an existing file is opened for reading and writing,
            and if the file does not exist it is created.
        ``'r+'``
            It is similar to ``'a'``, but the file must already exist.
    complevel : int, 0-9, default None
        Specifies a compression level for data.
        A value of 0 or None disables compression.
    complib : {'zlib', 'lzo', 'bzip2', 'blosc'}, default 'zlib'
        Specifies the compression library to be used.
        As of v0.20.2 these additional compressors for Blosc are supported
        (default if no compressor specified: 'blosc:blosclz'):
        {'blosc:blosclz', 'blosc:lz4', 'blosc:lz4hc', 'blosc:snappy',
        'blosc:zlib', 'blosc:zstd'}.
        Specifying a compression library which is not available issues
        a ValueError.
    fletcher32 : bool, default False
        If applying compression use the fletcher32 checksum.
    **kwargs
        These parameters will be passed to the PyTables open_file method.

    Examples
    --------
    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5')
    >>> store['foo'] = bar   # write to HDF5
    >>> bar = store['foo']   # retrieve
    >>> store.close()

    **Create or load HDF5 file in-memory**

    When passing the `driver` option to the PyTables open_file method through
    **kwargs, the HDF5 file is loaded or created in-memory and will only be
    written when closed:

    >>> bar = pd.DataFrame(np.random.randn(10, 4))
    >>> store = pd.HDFStore('test.h5', driver='H5FD_CORE')
    >>> store['foo'] = bar
    >>> store.close()   # only now, data is written to disk
    """

    _handle: File | None
    _mode: str

    def __init__(
        self,
        path,
        mode: str = "a",
        complevel: int | None = None,
        complib=None,
        fletcher32: bool = False,
        **kwargs,
    ) -> None:
        if "format" in kwargs:
            raise ValueError("format is not a defined argument for HDFStore")

        tables = import_optional_dependency("tables")

        if complib is not None and complib not in tables.filters.all_complibs:
            raise ValueError(
                f"complib only supports {tables.filters.all_complibs} compression."
            )

        if complib is None and complevel is not None:
            complib = tables.filters.default_complib

        self._path = stringify_path(path)
        if mode is None:
            mode = "a"
        self._mode = mode
        self._handle = None
        self._complevel = complevel if complevel else 0
        self._complib = complib
        self._fletcher32 = fletcher32
        self._filters = None
        self.open(mode=mode, **kwargs)

    def __fspath__(self) -> str:
        return self._path

    @property
    def root(self):
        """return the root node"""
        self._check_if_open()
        assert self._handle is not None  # for mypy
        return self._handle.root

    @property
    def filename(self) -> str:
        return self._path

    def __getitem__(self, key: str):
        return self.get(key)

    def __setitem__(self, key: str, value) -> None:
        self.put(key, value)

    def __delitem__(self, key: str) -> None:
        return self.remove(key)

    def __getattr__(self, name: str):
        """allow attribute access to get stores"""
        try:
            return self.get(name)
        except (KeyError, ClosedFileError):
            pass
        raise AttributeError(
            f"'{type(self).__name__}' object has no attribute '{name}'"
        )

    def __contains__(self, key: str) -> bool:
        """
        check for existence of this key
        can match the exact pathname or the pathname w/o the leading '/'
        """
        node = self.get_node(key)
        if node is not None:
            name = node._v_pathname
            if key in (name, name[1:]):
                return True
        return False

    def __len__(self) -> int:
        return len(self.groups())

    def __repr__(self) -> str:
        pstr = pprint_thing(self._path)
        return f"{type(self)}\nFile path: {pstr}\n"

    def __enter__(self) -> HDFStore:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()

    def keys(self, include: str = "pandas") -> list[str]:
        """
        Return a list of keys corresponding to objects stored in HDFStore.

        Parameters
        ----------
        include : str, default 'pandas'
            When include equals 'pandas' return pandas objects.
            When include equals 'native' return native HDF5 Table objects.

            .. versionadded:: 1.1.0

        Returns
        -------
        list
            List of ABSOLUTE path-names (e.g. have the leading '/').

        Raises
        ------
        raises ValueError if include has an illegal value
        """
        if include == "pandas":
            return [n._v_pathname for n in self.groups()]
        elif include == "native":
            assert self._handle is not None  # mypy
            return [
                n._v_pathname for n in self._handle.walk_nodes("/", classname="Table")
            ]
        raise ValueError(
            f"`include` should be either 'pandas' or 'native' but is '{include}'"
        )
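
    # Usage sketch for keys() (illustrative; assumes a store holding a single
    # pandas object under the hypothetical key "df", written in 'table' format):
    #
    #   >>> store.keys()                  # pandas-level keys
    #   ['/df']
    #   >>> store.keys(include="native")  # underlying PyTables Table nodes
    #   ['/df/table']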

    def __iter__(self) -> Iterator[str]:
        return iter(self.keys())

    def items(self) -> Iterator[tuple[str, list]]:
        """
        iterate on key->group
        """
        for g in self.groups():
            yield g._v_pathname, g

    def open(self, mode: str = "a", **kwargs) -> None:
        """
        Open the file in the specified mode

        Parameters
        ----------
        mode : {'a', 'w', 'r', 'r+'}, default 'a'
            See HDFStore docstring or tables.open_file for info about modes
        **kwargs
            These parameters will be passed to the PyTables open_file method.
        """
        tables = _tables()

        if self._mode != mode:
            # if we are changing a write mode to read, ok
            if self._mode in ["a", "w"] and mode in ["r", "r+"]:
                pass
            elif mode in ["w"]:
                # this would truncate, raise here
                if self.is_open:
                    raise PossibleDataLossError(
                        f"Re-opening the file [{self._path}] with mode [{self._mode}] "
                        "will delete the current file!"
                    )

            self._mode = mode

        # close and reopen the handle
        if self.is_open:
            self.close()

        if self._complevel and self._complevel > 0:
            self._filters = _tables().Filters(
                self._complevel, self._complib, fletcher32=self._fletcher32
            )

        if _table_file_open_policy_is_strict and self.is_open:
            msg = (
                "Cannot open HDF5 file, which is already opened, "
                "even in read-only mode."
            )
            raise ValueError(msg)

        self._handle = tables.open_file(self._path, self._mode, **kwargs)

    def close(self) -> None:
        """
        Close the PyTables file handle
        """
        if self._handle is not None:
            self._handle.close()
        self._handle = None

    @property
    def is_open(self) -> bool:
        """
        return a boolean indicating whether the file is open
        """
        if self._handle is None:
            return False
        return bool(self._handle.isopen)

    def flush(self, fsync: bool = False) -> None:
        """
        Force all buffered modifications to be written to disk.

        Parameters
        ----------
        fsync : bool (default False)
            call ``os.fsync()`` on the file handle to force writing to disk.

        Notes
        -----
        Without ``fsync=True``, flushing may not guarantee that the OS writes
        to disk. With fsync, the operation will block until the OS claims the
        file has been written; however, other caching layers may still
        interfere.
        """
        if self._handle is not None:
            self._handle.flush()
            if fsync:
                with suppress(OSError):
                    os.fsync(self._handle.fileno())
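
    # Usage sketch for flush() (illustrative; "df" is a hypothetical key):
    # after writes, flush PyTables buffers, optionally fsync-ing to push the
    # bytes through the OS cache.
    #
    #   >>> store.put("df", df)
    #   >>> store.flush(fsync=True)  # block until the OS claims durability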

    def get(self, key: str):
        """
        Retrieve pandas object stored in file.

        Parameters
        ----------
        key : str

        Returns
        -------
        object
            Same type as object stored in file.
        """
        with patch_pickle():
            # GH#31167 Without this patch, pickle doesn't know how to unpickle
            # old DateOffset objects now that they are cdef classes.
            group = self.get_node(key)
            if group is None:
                raise KeyError(f"No object named {key} in the file")
            return self._read_group(group)

    def select(
        self,
        key: str,
        where=None,
        start=None,
        stop=None,
        columns=None,
        iterator: bool = False,
        chunksize=None,
        auto_close: bool = False,
    ):
        """
        Retrieve pandas object stored in file, optionally based on where criteria.

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
            Object being retrieved from file.
        where : list, optional
            List of Term (or convertible) objects.
        start : int, optional
            Row number to start selection.
        stop : int, optional
            Row number to stop selection.
        columns : list, optional
            A list of columns that, if not None, will limit the returned columns.
        iterator : bool, default False
            Return an iterator.
        chunksize : int, optional
            Number of rows to include in an iteration; returns an iterator.
        auto_close : bool, default False
            Should automatically close the store when finished.

        Returns
        -------
        object
            Retrieved object from file.
        """
        group = self.get_node(key)
        if group is None:
            raise KeyError(f"No object named {key} in the file")

        # create the storer and axes
        where = _ensure_term(where, scope_level=1)
        s = self._create_storer(group)
        s.infer_axes()

        # function to call on iteration
        def func(_start, _stop, _where):
            return s.read(start=_start, stop=_stop, where=_where, columns=columns)

        # create the iterator
        it = TableIterator(
            self,
            s,
            func,
            where=where,
            nrows=s.nrows,
            start=start,
            stop=stop,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )

        return it.get_result()
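
    # Usage sketch for select() (illustrative; assumes the hypothetical key
    # "df" was written with format="table" so where-clauses work, and that
    # column "a" was declared in data_columns to make it queryable):
    #
    #   >>> store.append("df", df, data_columns=["a"])
    #   >>> store.select("df", where="a > 2", columns=["a"])
    #   >>> for chunk in store.select("df", chunksize=100_000):
    #   ...     ...  # process each chunk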

    def select_as_coordinates(
        self,
        key: str,
        where=None,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        return the selection as an Index

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        """
        where = _ensure_term(where, scope_level=1)
        tbl = self.get_storer(key)
        if not isinstance(tbl, Table):
            raise TypeError("can only read_coordinates with a table")
        return tbl.read_coordinates(where=where, start=start, stop=stop)

    def select_column(
        self,
        key: str,
        column: str,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        return a single column from the table. This is generally only useful to
        select an indexable

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        key : str
        column : str
            The column of interest.
        start : int or None, default None
        stop : int or None, default None

        Raises
        ------
        raises KeyError if the column is not found (or key is not a valid
            store)
        raises ValueError if the column can not be extracted individually (it
            is part of a data block)
        """
        tbl = self.get_storer(key)
        if not isinstance(tbl, Table):
            raise TypeError("can only read_column with a table")
        return tbl.read_column(column=column, start=start, stop=stop)

    def select_as_multiple(
        self,
        keys,
        where=None,
        selector=None,
        columns=None,
        start=None,
        stop=None,
        iterator: bool = False,
        chunksize=None,
        auto_close: bool = False,
    ):
        """
        Retrieve pandas objects from multiple tables.

        .. warning::

           Pandas uses PyTables for reading and writing HDF5 files, which allows
           serializing object-dtype data with pickle when using the "fixed" format.
           Loading pickled data received from untrusted sources can be unsafe.

           See: https://docs.python.org/3/library/pickle.html for more.

        Parameters
        ----------
        keys : a list of the tables
        selector : the table to apply the where criteria (defaults to keys[0]
            if not supplied)
        columns : the columns to return
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection
        iterator : bool, return an iterator, default False
        chunksize : nrows to include in iteration, return an iterator
        auto_close : bool, default False
            Should automatically close the store when finished.

        Raises
        ------
        raises KeyError if keys or selector is not found or keys is empty
        raises TypeError if keys is not a list or tuple
        raises ValueError if the tables are not ALL THE SAME DIMENSIONS
        """
        # default to single select
        where = _ensure_term(where, scope_level=1)
        if isinstance(keys, (list, tuple)) and len(keys) == 1:
            keys = keys[0]
        if isinstance(keys, str):
            return self.select(
                key=keys,
                where=where,
                columns=columns,
                start=start,
                stop=stop,
                iterator=iterator,
                chunksize=chunksize,
                auto_close=auto_close,
            )

        if not isinstance(keys, (list, tuple)):
            raise TypeError("keys must be a list/tuple")

        if not len(keys):
            raise ValueError("keys must have a non-zero length")

        if selector is None:
            selector = keys[0]

        # collect the tables
        tbls = [self.get_storer(k) for k in keys]
        s = self.get_storer(selector)

        # validate rows
        nrows = None
        for t, k in itertools.chain([(s, selector)], zip(tbls, keys)):
            if t is None:
                raise KeyError(f"Invalid table [{k}]")
            if not t.is_table:
                raise TypeError(
                    f"object [{t.pathname}] is not a table, and cannot be used in all "
                    "select as multiple"
                )

            if nrows is None:
                nrows = t.nrows
            elif t.nrows != nrows:
                raise ValueError("all tables must have exactly the same nrows!")

        # The isinstance checks here are redundant with the check above,
        # but necessary for mypy; see GH#29757
        _tbls = [x for x in tbls if isinstance(x, Table)]

        # axis is the concatenation axis
        axis = {t.non_index_axes[0][0] for t in _tbls}.pop()

        def func(_start, _stop, _where):
            # retrieve the objs, _where is always passed as a set of
            # coordinates here
            objs = [
                t.read(where=_where, columns=columns, start=_start, stop=_stop)
                for t in tbls
            ]

            # concat and return
            return concat(objs, axis=axis, verify_integrity=False)._consolidate()

        # create the iterator
        it = TableIterator(
            self,
            s,
            func,
            where=where,
            nrows=nrows,
            start=start,
            stop=stop,
            iterator=iterator,
            chunksize=chunksize,
            auto_close=auto_close,
        )

        return it.get_result(coordinates=True)
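
    # Usage sketch for select_as_multiple() (illustrative; assumes two
    # hypothetical, row-aligned tables, e.g. written via append_to_multiple,
    # with "a" a data column of the selector table):
    #
    #   >>> store.select_as_multiple(
    #   ...     ["df1", "df2"], where="a > 0", selector="df1"
    #   ... )  # matching rows, with the columns of both tables concatenated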

    def put(
        self,
        key: str,
        value: DataFrame | Series,
        format=None,
        index: bool = True,
        append: bool = False,
        complib=None,
        complevel: int | None = None,
        min_itemsize: int | dict[str, int] | None = None,
        nan_rep=None,
        data_columns: Literal[True] | list[str] | None = None,
        encoding=None,
        errors: str = "strict",
        track_times: bool = True,
        dropna: bool = False,
    ) -> None:
        """
        Store object in HDFStore.

        Parameters
        ----------
        key : str
        value : {Series, DataFrame}
        format : 'fixed(f)|table(t)', default is 'fixed'
            Format to use when storing object in HDFStore. Value can be one of:

            ``'fixed'``
                Fixed format. Fast writing/reading. Not-appendable, nor searchable.
            ``'table'``
                Table format. Write as a PyTables Table structure which may perform
                worse but allow more flexible operations like searching / selecting
                subsets of the data.
        index : bool, default True
            Write DataFrame index as a column.
        append : bool, default False
            This will force Table format, append the input data to the existing.
        data_columns : list of columns or True, default None
            List of columns to create as data columns, or True to use all columns.
            See `here
            <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
        encoding : str, default None
            Provide an encoding for strings.
        track_times : bool, default True
            Parameter is propagated to the 'create_table' method of 'PyTables'.
            If set to False it allows having the same h5 files (same hashes)
            independent of creation time.
        dropna : bool, default False, optional
            Remove missing values.

            .. versionadded:: 1.1.0
        """
        if format is None:
            format = get_option("io.hdf.default_format") or "fixed"
        format = self._validate_format(format)
        self._write_to_group(
            key,
            value,
            format=format,
            index=index,
            append=append,
            complib=complib,
            complevel=complevel,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
            encoding=encoding,
            errors=errors,
            track_times=track_times,
            dropna=dropna,
        )

    def remove(self, key: str, where=None, start=None, stop=None) -> None:
        """
        Remove pandas object partially by specifying the where condition

        Parameters
        ----------
        key : str
            Node to remove or delete rows from
        where : list of Term (or convertible) objects, optional
        start : integer (defaults to None), row number to start selection
        stop : integer (defaults to None), row number to stop selection

        Returns
        -------
        number of rows removed (or None if not a Table)

        Raises
        ------
        raises KeyError if key is not a valid store
        """
        where = _ensure_term(where, scope_level=1)
        try:
            s = self.get_storer(key)
        except KeyError:
            # the key is not a valid store, re-raising KeyError
            raise
        except AssertionError:
            # surface any assertion errors for e.g. debugging
            raise
        except Exception as err:
            # In tests we get here with ClosedFileError, TypeError, and
            # _table_mod.NoSuchNodeError. TODO: Catch only these?
            if where is not None:
                raise ValueError(
                    "trying to remove a node with a non-None where clause!"
                ) from err

            # we are actually trying to remove a node (with children)
            node = self.get_node(key)
            if node is not None:
                node._f_remove(recursive=True)
                return None

        # remove the node
        if com.all_none(where, start, stop):
            s.group._f_remove(recursive=True)

        # delete from the table
        else:
            if not s.is_table:
                raise ValueError(
                    "can only remove with where on objects written as tables"
                )
            return s.delete(where=where, start=start, stop=stop)
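
    # Usage sketch for remove() (illustrative; keys are hypothetical, and a
    # where-clause requires the target to be in 'table' format):
    #
    #   >>> store.remove("df_fixed")                 # drop a whole node
    #   >>> store.remove("df", where="index > 10")   # delete matching rows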
  1012. def append(
  1013. self,
  1014. key: str,
  1015. value: DataFrame | Series,
  1016. format=None,
  1017. axes=None,
  1018. index: bool | list[str] = True,
  1019. append: bool = True,
  1020. complib=None,
  1021. complevel: int | None = None,
  1022. columns=None,
  1023. min_itemsize: int | dict[str, int] | None = None,
  1024. nan_rep=None,
  1025. chunksize=None,
  1026. expectedrows=None,
  1027. dropna: bool | None = None,
  1028. data_columns: Literal[True] | list[str] | None = None,
  1029. encoding=None,
  1030. errors: str = "strict",
  1031. ) -> None:
  1032. """
  1033. Append to Table in file.
  1034. Node must already exist and be Table format.
  1035. Parameters
  1036. ----------
  1037. key : str
  1038. value : {Series, DataFrame}
  1039. format : 'table' is the default
  1040. Format to use when storing object in HDFStore. Value can be one of:
  1041. ``'table'``
  1042. Table format. Write as a PyTables Table structure which may perform
  1043. worse but allow more flexible operations like searching / selecting
  1044. subsets of the data.
  1045. index : bool, default True
  1046. Write DataFrame index as a column.
  1047. append : bool, default True
  1048. Append the input data to the existing.
  1049. data_columns : list of columns, or True, default None
  1050. List of columns to create as indexed data columns for on-disk
  1051. queries, or True to use all columns. By default only the axes
  1052. of the object are indexed. See `here
  1053. <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#query-via-data-columns>`__.
  1054. min_itemsize : dict of columns that specify minimum str sizes
  1055. nan_rep : str to use as str nan representation
  1056. chunksize : size to chunk the writing
  1057. expectedrows : expected TOTAL row size of this table
  1058. encoding : default None, provide an encoding for str
  1059. dropna : bool, default False, optional
  1060. Do not write an ALL nan row to the store settable
  1061. by the option 'io.hdf.dropna_table'.
  1062. Notes
  1063. -----
  1064. Does *not* check if data being appended overlaps with existing
  1065. data in the table, so be careful
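
        Examples
        --------
        Illustrative sketch, not part of the original docstring; the file name
        ``store.h5`` and key ``"df"`` are placeholders:

        >>> store = pd.HDFStore("store.h5", mode="w")  # doctest: +SKIP
        >>> store.append("df", pd.DataFrame({"A": [1, 2]}))  # doctest: +SKIP
        >>> store.append("df", pd.DataFrame({"A": [3, 4]}))  # doctest: +SKIP
        >>> len(store.select("df"))  # doctest: +SKIP
        4
        >>> store.close()  # doctest: +SKIP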
  1066. """
  1067. if columns is not None:
  1068. raise TypeError(
  1069. "columns is not a supported keyword in append, try data_columns"
  1070. )
  1071. if dropna is None:
  1072. dropna = get_option("io.hdf.dropna_table")
  1073. if format is None:
  1074. format = get_option("io.hdf.default_format") or "table"
  1075. format = self._validate_format(format)
  1076. self._write_to_group(
  1077. key,
  1078. value,
  1079. format=format,
  1080. axes=axes,
  1081. index=index,
  1082. append=append,
  1083. complib=complib,
  1084. complevel=complevel,
  1085. min_itemsize=min_itemsize,
  1086. nan_rep=nan_rep,
  1087. chunksize=chunksize,
  1088. expectedrows=expectedrows,
  1089. dropna=dropna,
  1090. data_columns=data_columns,
  1091. encoding=encoding,
  1092. errors=errors,
  1093. )

    def append_to_multiple(
        self,
        d: dict,
        value,
        selector,
        data_columns=None,
        axes=None,
        dropna: bool = False,
        **kwargs,
    ) -> None:
  1104. """
  1105. Append to multiple tables
  1106. Parameters
  1107. ----------
  1108. d : a dict of table_name to table_columns, None is acceptable as the
  1109. values of one node (this will get all the remaining columns)
  1110. value : a pandas object
  1111. selector : a string that designates the indexable table; all of its
  1112. columns will be designed as data_columns, unless data_columns is
  1113. passed, in which case these are used
  1114. data_columns : list of columns to create as data columns, or True to
  1115. use all columns
  1116. dropna : if evaluates to True, drop rows from all tables if any single
  1117. row in each table has all NaN. Default False.
  1118. Notes
  1119. -----
  1120. axes parameter is currently not accepted
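
        Examples
        --------
        Illustrative sketch, not part of the original docstring; assumes an
        open, writable ``store``, and the table names are placeholders:

        >>> df = pd.DataFrame({"A": [1.0, 2.0], "B": ["x", "y"]})  # doctest: +SKIP
        >>> store.append_to_multiple(
        ...     {"df1_mt": ["A"], "df2_mt": None}, df, selector="df1_mt"
        ... )  # doctest: +SKIP
        >>> store.select_as_multiple(["df1_mt", "df2_mt"], selector="df1_mt")  # doctest: +SKIP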
  1121. """
  1122. if axes is not None:
  1123. raise TypeError(
  1124. "axes is currently not accepted as a parameter to append_to_multiple; "
  1125. "you can create the tables independently instead"
  1126. )
  1127. if not isinstance(d, dict):
  1128. raise ValueError(
  1129. "append_to_multiple must have a dictionary specified as the "
  1130. "way to split the value"
  1131. )
  1132. if selector not in d:
  1133. raise ValueError(
  1134. "append_to_multiple requires a selector that is in passed dict"
  1135. )
  1136. # figure out the splitting axis (the non_index_axis)
  1137. axis = list(set(range(value.ndim)) - set(_AXES_MAP[type(value)]))[0]
  1138. # figure out how to split the value
  1139. remain_key = None
  1140. remain_values: list = []
  1141. for k, v in d.items():
  1142. if v is None:
  1143. if remain_key is not None:
  1144. raise ValueError(
  1145. "append_to_multiple can only have one value in d that is None"
  1146. )
  1147. remain_key = k
  1148. else:
  1149. remain_values.extend(v)
  1150. if remain_key is not None:
  1151. ordered = value.axes[axis]
  1152. ordd = ordered.difference(Index(remain_values))
  1153. ordd = sorted(ordered.get_indexer(ordd))
  1154. d[remain_key] = ordered.take(ordd)
  1155. # data_columns
  1156. if data_columns is None:
  1157. data_columns = d[selector]
  1158. # ensure rows are synchronized across the tables
  1159. if dropna:
  1160. idxs = (value[cols].dropna(how="all").index for cols in d.values())
  1161. valid_index = next(idxs)
  1162. for index in idxs:
  1163. valid_index = valid_index.intersection(index)
  1164. value = value.loc[valid_index]
  1165. min_itemsize = kwargs.pop("min_itemsize", None)
  1166. # append
  1167. for k, v in d.items():
  1168. dc = data_columns if k == selector else None
  1169. # compute the val
  1170. val = value.reindex(v, axis=axis)
  1171. filtered = (
  1172. {key: value for (key, value) in min_itemsize.items() if key in v}
  1173. if min_itemsize is not None
  1174. else None
  1175. )
  1176. self.append(k, val, data_columns=dc, min_itemsize=filtered, **kwargs)

    def create_table_index(
        self,
        key: str,
        columns=None,
        optlevel: int | None = None,
        kind: str | None = None,
    ) -> None:
  1184. """
  1185. Create a pytables index on the table.
  1186. Parameters
  1187. ----------
  1188. key : str
  1189. columns : None, bool, or listlike[str]
  1190. Indicate which columns to create an index on.
  1191. * False : Do not create any indexes.
  1192. * True : Create indexes on all columns.
  1193. * None : Create indexes on all columns.
  1194. * listlike : Create indexes on the given columns.
  1195. optlevel : int or None, default None
  1196. Optimization level, if None, pytables defaults to 6.
  1197. kind : str or None, default None
  1198. Kind of index, if None, pytables defaults to "medium".
  1199. Raises
  1200. ------
  1201. TypeError: raises if the node is not a table
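
        Examples
        --------
        Illustrative sketch, not part of the original docstring; assumes an
        open ``store`` and a frame ``df`` that has a column ``"B"``:

        >>> store.append("df", df, data_columns=["B"], index=False)  # doctest: +SKIP
        >>> store.create_table_index("df", columns=["B"], optlevel=9, kind="full")  # doctest: +SKIP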
  1202. """
  1203. # version requirements
  1204. _tables()
  1205. s = self.get_storer(key)
  1206. if s is None:
  1207. return
  1208. if not isinstance(s, Table):
  1209. raise TypeError("cannot create table index on a Fixed format store")
  1210. s.create_index(columns=columns, optlevel=optlevel, kind=kind)

    def groups(self) -> list:
        """
        Return a list of all the top-level nodes.

        Each node returned is not a pandas storage object.

        Returns
        -------
        list
            List of objects.
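
        Examples
        --------
        Illustrative sketch, not part of the original docstring; the key
        ``"data"`` is a placeholder:

        >>> store.put("data", pd.DataFrame({"A": [1, 2]}))  # doctest: +SKIP
        >>> store.groups()  # doctest: +SKIP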
  1219. """
  1220. _tables()
  1221. self._check_if_open()
  1222. assert self._handle is not None # for mypy
  1223. assert _table_mod is not None # for mypy
  1224. return [
  1225. g
  1226. for g in self._handle.walk_groups()
  1227. if (
  1228. not isinstance(g, _table_mod.link.Link)
  1229. and (
  1230. getattr(g._v_attrs, "pandas_type", None)
  1231. or getattr(g, "table", None)
  1232. or (isinstance(g, _table_mod.table.Table) and g._v_name != "table")
  1233. )
  1234. )
  1235. ]

    def walk(self, where: str = "/") -> Iterator[tuple[str, list[str], list[str]]]:
        """
        Walk the pytables group hierarchy for pandas objects.

        This generator will yield the group path, subgroups and pandas object
        names for each group.

        Any non-pandas PyTables objects that are not a group will be ignored.

        The `where` group itself is listed first (preorder), then each of its
        child groups (following an alphanumerical order) is also traversed,
        following the same procedure.

        Parameters
        ----------
        where : str, default "/"
            Group where to start walking.

        Yields
        ------
        path : str
            Full path to a group (without trailing '/').
        groups : list
            Names (strings) of the groups contained in `path`.
        leaves : list
            Names (strings) of the pandas objects contained in `path`.
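
        Examples
        --------
        Illustrative sketch, not part of the original docstring; the key
        ``"grp/df"`` is a placeholder:

        >>> store.put("grp/df", pd.DataFrame({"A": [1]}))  # doctest: +SKIP
        >>> for path, subgroups, leaves in store.walk():  # doctest: +SKIP
        ...     print(path, subgroups, leaves)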
  1257. """
  1258. _tables()
  1259. self._check_if_open()
  1260. assert self._handle is not None # for mypy
  1261. assert _table_mod is not None # for mypy
  1262. for g in self._handle.walk_groups(where):
  1263. if getattr(g._v_attrs, "pandas_type", None) is not None:
  1264. continue
  1265. groups = []
  1266. leaves = []
  1267. for child in g._v_children.values():
  1268. pandas_type = getattr(child._v_attrs, "pandas_type", None)
  1269. if pandas_type is None:
  1270. if isinstance(child, _table_mod.group.Group):
  1271. groups.append(child._v_name)
  1272. else:
  1273. leaves.append(child._v_name)
  1274. yield (g._v_pathname.rstrip("/"), groups, leaves)

    def get_node(self, key: str) -> Node | None:
        """return the node with the key or None if it does not exist"""
        self._check_if_open()
        if not key.startswith("/"):
            key = "/" + key

        assert self._handle is not None
        assert _table_mod is not None  # for mypy
        try:
            node = self._handle.get_node(self.root, key)
        except _table_mod.exceptions.NoSuchNodeError:
            return None

        assert isinstance(node, _table_mod.Node), type(node)
        return node

    def get_storer(self, key: str) -> GenericFixed | Table:
        """return the storer object for a key, raise if not in the file"""
        group = self.get_node(key)
        if group is None:
            raise KeyError(f"No object named {key} in the file")

        s = self._create_storer(group)
        s.infer_axes()
        return s

    def copy(
        self,
        file,
        mode: str = "w",
        propindexes: bool = True,
        keys=None,
        complib=None,
        complevel: int | None = None,
        fletcher32: bool = False,
        overwrite: bool = True,
    ) -> HDFStore:
        """
        Copy the existing store to a new file, updating in place.

        Parameters
        ----------
        propindexes : bool, default True
            Restore indexes in copied file.
        keys : list, optional
            List of keys to include in the copy (defaults to all).
        overwrite : bool, default True
            Whether to overwrite (remove and replace) existing nodes in the new store.
        mode, complib, complevel, fletcher32 same as in HDFStore.__init__

        Returns
        -------
        open file handle of the new store
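
        Examples
        --------
        Illustrative sketch, not part of the original docstring; the file name
        ``copy.h5`` is a placeholder:

        >>> new_store = store.copy("copy.h5", complib="blosc", complevel=9)  # doctest: +SKIP
        >>> new_store.close()  # doctest: +SKIP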
  1321. """
  1322. new_store = HDFStore(
  1323. file, mode=mode, complib=complib, complevel=complevel, fletcher32=fletcher32
  1324. )
  1325. if keys is None:
  1326. keys = list(self.keys())
  1327. if not isinstance(keys, (tuple, list)):
  1328. keys = [keys]
  1329. for k in keys:
  1330. s = self.get_storer(k)
  1331. if s is not None:
  1332. if k in new_store:
  1333. if overwrite:
  1334. new_store.remove(k)
  1335. data = self.select(k)
  1336. if isinstance(s, Table):
  1337. index: bool | list[str] = False
  1338. if propindexes:
  1339. index = [a.name for a in s.axes if a.is_indexed]
  1340. new_store.append(
  1341. k,
  1342. data,
  1343. index=index,
  1344. data_columns=getattr(s, "data_columns", None),
  1345. encoding=s.encoding,
  1346. )
  1347. else:
  1348. new_store.put(k, data, encoding=s.encoding)
  1349. return new_store

    def info(self) -> str:
        """
        Print detailed information on the store.

        Returns
        -------
        str
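
        Examples
        --------
        Illustrative sketch, not part of the original docstring:

        >>> print(store.info())  # doctest: +SKIP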
  1356. """
  1357. path = pprint_thing(self._path)
  1358. output = f"{type(self)}\nFile path: {path}\n"
  1359. if self.is_open:
  1360. lkeys = sorted(self.keys())
  1361. if len(lkeys):
  1362. keys = []
  1363. values = []
  1364. for k in lkeys:
  1365. try:
  1366. s = self.get_storer(k)
  1367. if s is not None:
  1368. keys.append(pprint_thing(s.pathname or k))
  1369. values.append(pprint_thing(s or "invalid_HDFStore node"))
  1370. except AssertionError:
  1371. # surface any assertion errors for e.g. debugging
  1372. raise
  1373. except Exception as detail:
  1374. keys.append(k)
  1375. dstr = pprint_thing(detail)
  1376. values.append(f"[invalid_HDFStore node: {dstr}]")
  1377. output += adjoin(12, keys, values)
  1378. else:
  1379. output += "Empty"
  1380. else:
  1381. output += "File is CLOSED"
  1382. return output

    # ------------------------------------------------------------------------
    # private methods

    def _check_if_open(self):
        if not self.is_open:
            raise ClosedFileError(f"{self._path} file is not open!")

    def _validate_format(self, format: str) -> str:
        """validate / deprecate formats"""
        # validate
        try:
            format = _FORMAT_MAP[format.lower()]
        except KeyError as err:
            raise TypeError(f"invalid HDFStore format specified [{format}]") from err

        return format

    def _create_storer(
        self,
        group,
        format=None,
        value: DataFrame | Series | None = None,
        encoding: str = "UTF-8",
        errors: str = "strict",
    ) -> GenericFixed | Table:
        """return a suitable class to operate"""
        cls: type[GenericFixed] | type[Table]

        if value is not None and not isinstance(value, (Series, DataFrame)):
            raise TypeError("value must be None, Series, or DataFrame")

        pt = _ensure_decoded(getattr(group._v_attrs, "pandas_type", None))
        tt = _ensure_decoded(getattr(group._v_attrs, "table_type", None))

        # infer the pt from the passed value
        if pt is None:
            if value is None:
                _tables()
                assert _table_mod is not None  # for mypy
                if getattr(group, "table", None) or isinstance(
                    group, _table_mod.table.Table
                ):
                    pt = "frame_table"
                    tt = "generic_table"
                else:
                    raise TypeError(
                        "cannot create a storer if the object does not exist "
                        "and no value is passed"
                    )
            else:
                if isinstance(value, Series):
                    pt = "series"
                else:
                    pt = "frame"

                # we are actually a table
                if format == "table":
                    pt += "_table"

        # a storer node
        if "table" not in pt:
            _STORER_MAP = {"series": SeriesFixed, "frame": FrameFixed}
            try:
                cls = _STORER_MAP[pt]
            except KeyError as err:
                raise TypeError(
                    f"cannot properly create the storer for: [_STORER_MAP] [group->"
                    f"{group},value->{type(value)},format->{format}]"
                ) from err
            return cls(self, group, encoding=encoding, errors=errors)

        # existing node (and must be a table)
        if tt is None:
            # if we are a writer, determine the tt
            if value is not None:
                if pt == "series_table":
                    index = getattr(value, "index", None)
                    if index is not None:
                        if index.nlevels == 1:
                            tt = "appendable_series"
                        elif index.nlevels > 1:
                            tt = "appendable_multiseries"
                elif pt == "frame_table":
                    index = getattr(value, "index", None)
                    if index is not None:
                        if index.nlevels == 1:
                            tt = "appendable_frame"
                        elif index.nlevels > 1:
                            tt = "appendable_multiframe"

        _TABLE_MAP = {
            "generic_table": GenericTable,
            "appendable_series": AppendableSeriesTable,
            "appendable_multiseries": AppendableMultiSeriesTable,
            "appendable_frame": AppendableFrameTable,
            "appendable_multiframe": AppendableMultiFrameTable,
            "worm": WORMTable,
        }
        try:
            cls = _TABLE_MAP[tt]
        except KeyError as err:
            raise TypeError(
                f"cannot properly create the storer for: [_TABLE_MAP] [group->"
                f"{group},value->{type(value)},format->{format}]"
            ) from err

        return cls(self, group, encoding=encoding, errors=errors)

    def _write_to_group(
        self,
        key: str,
        value: DataFrame | Series,
        format,
        axes=None,
        index: bool | list[str] = True,
        append: bool = False,
        complib=None,
        complevel: int | None = None,
        fletcher32=None,
        min_itemsize: int | dict[str, int] | None = None,
        chunksize=None,
        expectedrows=None,
        dropna: bool = False,
        nan_rep=None,
        data_columns=None,
        encoding=None,
        errors: str = "strict",
        track_times: bool = True,
    ) -> None:
        # we don't want to store a table node at all if our object is 0-len
        # as there are no dtypes
        if getattr(value, "empty", None) and (format == "table" or append):
            return

        group = self._identify_group(key, append)

        s = self._create_storer(group, format, value, encoding=encoding, errors=errors)
        if append:
            # raise if we are trying to append to a Fixed format,
            # or a table that exists (and we are putting)
            if not s.is_table or (s.is_table and format == "fixed" and s.is_exists):
                raise ValueError("Can only append to Tables")
            if not s.is_exists:
                s.set_object_info()
        else:
            s.set_object_info()

        if not s.is_table and complib:
            raise ValueError("Compression not supported on Fixed format stores")

        # write the object
        s.write(
            obj=value,
            axes=axes,
            append=append,
            complib=complib,
            complevel=complevel,
            fletcher32=fletcher32,
            min_itemsize=min_itemsize,
            chunksize=chunksize,
            expectedrows=expectedrows,
            dropna=dropna,
            nan_rep=nan_rep,
            data_columns=data_columns,
            track_times=track_times,
        )

        if isinstance(s, Table) and index:
            s.create_index(columns=index)

    def _read_group(self, group: Node):
        s = self._create_storer(group)
        s.infer_axes()
        return s.read()

    def _identify_group(self, key: str, append: bool) -> Node:
        """Identify HDF5 group based on key, delete/create group if needed."""
        group = self.get_node(key)

        # we make this assertion for mypy; the get_node call will already
        # have raised if this is incorrect
        assert self._handle is not None

        # remove the node if we are not appending
        if group is not None and not append:
            self._handle.remove_node(group, recursive=True)
            group = None

        if group is None:
            group = self._create_nodes_and_group(key)

        return group

    def _create_nodes_and_group(self, key: str) -> Node:
        """Create nodes from key and return group name."""
        # assertion for mypy
        assert self._handle is not None

        paths = key.split("/")
        # recursively create the groups
        path = "/"
        for p in paths:
            if not len(p):
                continue
            new_path = path
            if not path.endswith("/"):
                new_path += "/"
            new_path += p
            group = self.get_node(new_path)
            if group is None:
                group = self._handle.create_group(path, p)
            path = new_path
        return group


class TableIterator:
    """
    Define the iteration interface on a table

    Parameters
    ----------
    store : HDFStore
    s : the referred storer
    func : the function to execute the query
    where : the where of the query
    nrows : the rows to iterate on
    start : the passed start value (default is None)
    stop : the passed stop value (default is None)
    iterator : bool, default False
        Whether to use the default iterator.
    chunksize : the passed chunking value (default is 100000)
    auto_close : bool, default False
        Whether to automatically close the store at the end of iteration.
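
    Examples
    --------
    Illustrative sketch, not part of the original docstring; instances are
    normally created indirectly, e.g. via ``HDFStore.select`` with a
    ``chunksize`` (file and key names are placeholders):

    >>> with pd.HDFStore("store.h5") as store:  # doctest: +SKIP
    ...     for chunk in store.select("df", chunksize=2):
    ...         print(len(chunk))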
  1587. """
  1588. chunksize: int | None
  1589. store: HDFStore
  1590. s: GenericFixed | Table
  1591. def __init__(
  1592. self,
  1593. store: HDFStore,
  1594. s: GenericFixed | Table,
  1595. func,
  1596. where,
  1597. nrows,
  1598. start=None,
  1599. stop=None,
  1600. iterator: bool = False,
  1601. chunksize: int | None = None,
  1602. auto_close: bool = False,
  1603. ) -> None:
  1604. self.store = store
  1605. self.s = s
  1606. self.func = func
  1607. self.where = where
  1608. # set start/stop if they are not set if we are a table
  1609. if self.s.is_table:
  1610. if nrows is None:
  1611. nrows = 0
  1612. if start is None:
  1613. start = 0
  1614. if stop is None:
  1615. stop = nrows
  1616. stop = min(nrows, stop)
  1617. self.nrows = nrows
  1618. self.start = start
  1619. self.stop = stop
  1620. self.coordinates = None
  1621. if iterator or chunksize is not None:
  1622. if chunksize is None:
  1623. chunksize = 100000
  1624. self.chunksize = int(chunksize)
  1625. else:
  1626. self.chunksize = None
  1627. self.auto_close = auto_close
  1628. def __iter__(self) -> Iterator:
  1629. # iterate
  1630. current = self.start
  1631. if self.coordinates is None:
  1632. raise ValueError("Cannot iterate until get_result is called.")
  1633. while current < self.stop:
  1634. stop = min(current + self.chunksize, self.stop)
  1635. value = self.func(None, None, self.coordinates[current:stop])
  1636. current = stop
  1637. if value is None or not len(value):
  1638. continue
  1639. yield value
  1640. self.close()
  1641. def close(self) -> None:
  1642. if self.auto_close:
  1643. self.store.close()
  1644. def get_result(self, coordinates: bool = False):
  1645. # return the actual iterator
  1646. if self.chunksize is not None:
  1647. if not isinstance(self.s, Table):
  1648. raise TypeError("can only use an iterator or chunksize on a table")
  1649. self.coordinates = self.s.read_coordinates(where=self.where)
  1650. return self
  1651. # if specified read via coordinates (necessary for multiple selections
  1652. if coordinates:
  1653. if not isinstance(self.s, Table):
  1654. raise TypeError("can only read_coordinates on a table")
  1655. where = self.s.read_coordinates(
  1656. where=self.where, start=self.start, stop=self.stop
  1657. )
  1658. else:
  1659. where = self.where
  1660. # directly return the result
  1661. results = self.func(self.start, self.stop, where)
  1662. self.close()
  1663. return results


class IndexCol:
    """
    an index column description class

    Parameters
    ----------
    axis : axis which I reference
    values : the ndarray like converted values
    kind : a string description of this type
    typ : the pytables type
    pos : the position in the pytables
    """

    is_an_indexable: bool = True
    is_data_indexable: bool = True
    _info_fields = ["freq", "tz", "index_name"]

    def __init__(
        self,
        name: str,
        values=None,
        kind=None,
        typ=None,
        cname: str | None = None,
        axis=None,
        pos=None,
        freq=None,
        tz=None,
        index_name=None,
        ordered=None,
        table=None,
        meta=None,
        metadata=None,
    ) -> None:
        if not isinstance(name, str):
            raise ValueError("`name` must be a str.")

        self.values = values
        self.kind = kind
        self.typ = typ
        self.name = name
        self.cname = cname or name
        self.axis = axis
        self.pos = pos
        self.freq = freq
        self.tz = tz
        self.index_name = index_name
        self.ordered = ordered
        self.table = table
        self.meta = meta
        self.metadata = metadata

        if pos is not None:
            self.set_pos(pos)

        # These are ensured as long as the passed arguments match the
        # constructor annotations.
        assert isinstance(self.name, str)
        assert isinstance(self.cname, str)

    @property
    def itemsize(self) -> int:
        # Assumes self.typ has already been initialized
        return self.typ.itemsize

    @property
    def kind_attr(self) -> str:
        return f"{self.name}_kind"

    def set_pos(self, pos: int) -> None:
        """set the position of this column in the Table"""
        self.pos = pos
        if pos is not None and self.typ is not None:
            self.typ._v_pos = pos

    def __repr__(self) -> str:
        temp = tuple(
            map(pprint_thing, (self.name, self.cname, self.axis, self.pos, self.kind))
        )
        return ",".join(
            [
                f"{key}->{value}"
                for key, value in zip(["name", "cname", "axis", "pos", "kind"], temp)
            ]
        )

    def __eq__(self, other: Any) -> bool:
        """compare 2 col items"""
        return all(
            getattr(self, a, None) == getattr(other, a, None)
            for a in ["name", "cname", "axis", "pos"]
        )

    def __ne__(self, other) -> bool:
        return not self.__eq__(other)

    @property
    def is_indexed(self) -> bool:
        """return whether I am an indexed column"""
        if not hasattr(self.table, "cols"):
            # e.g. if infer hasn't been called yet, self.table will be None.
            return False
        return getattr(self.table.cols, self.cname).is_indexed

    def convert(
        self, values: np.ndarray, nan_rep, encoding: str, errors: str
    ) -> tuple[np.ndarray, np.ndarray] | tuple[Index, Index]:
        """
        Convert the data from this selection to the appropriate pandas type.
        """
        assert isinstance(values, np.ndarray), type(values)

        # values is a recarray
        if values.dtype.fields is not None:
            # Copy, otherwise values will be a view,
            # preventing the original recarray from being freed
            values = values[self.cname].copy()

        val_kind = _ensure_decoded(self.kind)
        values = _maybe_convert(values, val_kind, encoding, errors)

        kwargs = {}
        kwargs["name"] = _ensure_decoded(self.index_name)

        if self.freq is not None:
            kwargs["freq"] = _ensure_decoded(self.freq)

        factory: type[Index] | type[DatetimeIndex] = Index
        if is_datetime64_dtype(values.dtype) or is_datetime64tz_dtype(values.dtype):
            factory = DatetimeIndex
        elif values.dtype == "i8" and "freq" in kwargs:
            # PeriodIndex data is stored as i8
            # error: Incompatible types in assignment (expression has type
            # "Callable[[Any, KwArg(Any)], PeriodIndex]", variable has type
            # "Union[Type[Index], Type[DatetimeIndex]]")
            factory = lambda x, **kwds: PeriodIndex(  # type: ignore[assignment]
                ordinal=x, **kwds
            )

        # making an Index instance could throw a number of different errors
        try:
            new_pd_index = factory(values, **kwargs)
        except ValueError:
            # if the output freq is different from what we recorded,
            # it should be None (see also 'doc example part 2')
            if "freq" in kwargs:
                kwargs["freq"] = None
            new_pd_index = factory(values, **kwargs)

        final_pd_index = _set_tz(new_pd_index, self.tz)
        return final_pd_index, final_pd_index

    def take_data(self):
        """return the values"""
        return self.values

    @property
    def attrs(self):
        return self.table._v_attrs

    @property
    def description(self):
        return self.table.description

    @property
    def col(self):
        """return my current col description"""
        return getattr(self.description, self.cname, None)

    @property
    def cvalues(self):
        """return my cython values"""
        return self.values

    def __iter__(self) -> Iterator:
        return iter(self.values)

    def maybe_set_size(self, min_itemsize=None) -> None:
        """
        maybe set a string col itemsize:
            min_itemsize can be an integer or a dict with this columns name
            with an integer size
        """
        if _ensure_decoded(self.kind) == "string":
            if isinstance(min_itemsize, dict):
                min_itemsize = min_itemsize.get(self.name)

            if min_itemsize is not None and self.typ.itemsize < min_itemsize:
                self.typ = _tables().StringCol(itemsize=min_itemsize, pos=self.pos)

    def validate_names(self) -> None:
        pass

    def validate_and_set(self, handler: AppendableTable, append: bool) -> None:
        self.table = handler.table
        self.validate_col()
        self.validate_attr(append)
        self.validate_metadata(handler)
        self.write_metadata(handler)
        self.set_attr()

    def validate_col(self, itemsize=None):
        """validate this column: return the compared against itemsize"""
        # validate this column for string truncation (or reset to the max size)
        if _ensure_decoded(self.kind) == "string":
            c = self.col
            if c is not None:
                if itemsize is None:
                    itemsize = self.itemsize
                if c.itemsize < itemsize:
                    raise ValueError(
                        f"Trying to store a string with len [{itemsize}] in "
                        f"[{self.cname}] column but\nthis column has a limit of "
                        f"[{c.itemsize}]!\nConsider using min_itemsize to "
                        "preset the sizes on these columns"
                    )
                return c.itemsize

        return None

    def validate_attr(self, append: bool) -> None:
        # check for backwards incompatibility
        if append:
            existing_kind = getattr(self.attrs, self.kind_attr, None)
            if existing_kind is not None and existing_kind != self.kind:
                raise TypeError(
                    f"incompatible kind in col [{existing_kind} - {self.kind}]"
                )

    def update_info(self, info) -> None:
        """
        set/update the info for this indexable with the key/value
        if there is a conflict raise/warn as needed
        """
        for key in self._info_fields:
            value = getattr(self, key, None)
            idx = info.setdefault(self.name, {})

            existing_value = idx.get(key)
            if key in idx and value is not None and existing_value != value:
                # frequency/name just warn
                if key in ["freq", "index_name"]:
                    ws = attribute_conflict_doc % (key, existing_value, value)
                    warnings.warn(
                        ws, AttributeConflictWarning, stacklevel=find_stack_level()
                    )

                    # reset
                    idx[key] = None
                    setattr(self, key, None)

                else:
                    raise ValueError(
                        f"invalid info for [{self.name}] for [{key}], "
                        f"existing_value [{existing_value}] conflicts with "
                        f"new value [{value}]"
                    )
            else:
                if value is not None or existing_value is not None:
                    idx[key] = value

    def set_info(self, info) -> None:
        """set my state from the passed info"""
        idx = info.get(self.name)
        if idx is not None:
            self.__dict__.update(idx)

    def set_attr(self) -> None:
        """set the kind for this column"""
        setattr(self.attrs, self.kind_attr, self.kind)

    def validate_metadata(self, handler: AppendableTable) -> None:
        """validate that kind=category does not change the categories"""
        if self.meta == "category":
            new_metadata = self.metadata
            cur_metadata = handler.read_metadata(self.cname)
            if (
                new_metadata is not None
                and cur_metadata is not None
                and not array_equivalent(new_metadata, cur_metadata)
            ):
                raise ValueError(
                    "cannot append a categorical with "
                    "different categories to the existing"
                )

    def write_metadata(self, handler: AppendableTable) -> None:
        """set the meta data"""
        if self.metadata is not None:
            handler.write_metadata(self.cname, self.metadata)


class GenericIndexCol(IndexCol):
    """an index which is not represented in the data of the table"""

    @property
    def is_indexed(self) -> bool:
        return False

    def convert(
        self, values: np.ndarray, nan_rep, encoding: str, errors: str
    ) -> tuple[Index, Index]:
        """
        Convert the data from this selection to the appropriate pandas type.

        Parameters
        ----------
        values : np.ndarray
        nan_rep : str
        encoding : str
        errors : str
        """
        assert isinstance(values, np.ndarray), type(values)

        index = RangeIndex(len(values))
        return index, index

    def set_attr(self) -> None:
        pass


class DataCol(IndexCol):
    """
    a data holding column, by definition this is not indexable

    Parameters
    ----------
    data : the actual data
    cname : the column name in the table to hold the data (typically
        values)
    meta : a string description of the metadata
    metadata : the actual metadata
    """

    is_an_indexable = False
    is_data_indexable = False
    _info_fields = ["tz", "ordered"]

    def __init__(
        self,
        name: str,
        values=None,
        kind=None,
        typ=None,
        cname: str | None = None,
        pos=None,
        tz=None,
        ordered=None,
        table=None,
        meta=None,
        metadata=None,
        dtype: DtypeArg | None = None,
        data=None,
    ) -> None:
        super().__init__(
            name=name,
            values=values,
            kind=kind,
            typ=typ,
            pos=pos,
            cname=cname,
            tz=tz,
            ordered=ordered,
            table=table,
            meta=meta,
            metadata=metadata,
        )
        self.dtype = dtype
        self.data = data

    @property
    def dtype_attr(self) -> str:
        return f"{self.name}_dtype"

    @property
    def meta_attr(self) -> str:
        return f"{self.name}_meta"

    def __repr__(self) -> str:
        temp = tuple(
            map(
                pprint_thing, (self.name, self.cname, self.dtype, self.kind, self.shape)
            )
        )
        return ",".join(
            [
                f"{key}->{value}"
                for key, value in zip(["name", "cname", "dtype", "kind", "shape"], temp)
            ]
        )

    def __eq__(self, other: Any) -> bool:
        """compare 2 col items"""
        return all(
            getattr(self, a, None) == getattr(other, a, None)
            for a in ["name", "cname", "dtype", "pos"]
        )

    def set_data(self, data: ArrayLike) -> None:
        assert data is not None
        assert self.dtype is None

        data, dtype_name = _get_data_and_dtype_name(data)

        self.data = data
        self.dtype = dtype_name
        self.kind = _dtype_to_kind(dtype_name)

    def take_data(self):
        """return the data"""
        return self.data

    @classmethod
    def _get_atom(cls, values: ArrayLike) -> Col:
        """
        Get an appropriately typed and shaped pytables.Col object for values.
        """
        dtype = values.dtype
        # error: Item "ExtensionDtype" of "Union[ExtensionDtype, dtype[Any]]" has no
        # attribute "itemsize"
        itemsize = dtype.itemsize  # type: ignore[union-attr]

        shape = values.shape
        if values.ndim == 1:
            # EA, use block shape pretending it is 2D
            # TODO(EA2D): not necessary with 2D EAs
            shape = (1, values.size)

        if isinstance(values, Categorical):
            codes = values.codes
            atom = cls.get_atom_data(shape, kind=codes.dtype.name)
        elif is_datetime64_dtype(dtype) or is_datetime64tz_dtype(dtype):
            atom = cls.get_atom_datetime64(shape)
        elif is_timedelta64_dtype(dtype):
            atom = cls.get_atom_timedelta64(shape)
        elif is_complex_dtype(dtype):
            atom = _tables().ComplexCol(itemsize=itemsize, shape=shape[0])
        elif is_string_dtype(dtype):
            atom = cls.get_atom_string(shape, itemsize)
        else:
            atom = cls.get_atom_data(shape, kind=dtype.name)

        return atom

    @classmethod
    def get_atom_string(cls, shape, itemsize):
        return _tables().StringCol(itemsize=itemsize, shape=shape[0])

    @classmethod
    def get_atom_coltype(cls, kind: str) -> type[Col]:
        """return the PyTables column class for this column"""
        if kind.startswith("uint"):
            k4 = kind[4:]
            col_name = f"UInt{k4}Col"
        elif kind.startswith("period"):
            # we store as integer
            col_name = "Int64Col"
        else:
            kcap = kind.capitalize()
            col_name = f"{kcap}Col"

        return getattr(_tables(), col_name)

    @classmethod
    def get_atom_data(cls, shape, kind: str) -> Col:
        return cls.get_atom_coltype(kind=kind)(shape=shape[0])

    @classmethod
    def get_atom_datetime64(cls, shape):
        return _tables().Int64Col(shape=shape[0])

    @classmethod
    def get_atom_timedelta64(cls, shape):
        return _tables().Int64Col(shape=shape[0])

    @property
    def shape(self):
        return getattr(self.data, "shape", None)

    @property
    def cvalues(self):
        """return my cython values"""
        return self.data

    def validate_attr(self, append) -> None:
        """validate that we have the same order as the existing & same dtype"""
        if append:
            existing_fields = getattr(self.attrs, self.kind_attr, None)
            if existing_fields is not None and existing_fields != list(self.values):
                raise ValueError("appended items do not match existing items in table!")

            existing_dtype = getattr(self.attrs, self.dtype_attr, None)
            if existing_dtype is not None and existing_dtype != self.dtype:
                raise ValueError(
                    "appended items dtype do not match existing items dtype in table!"
                )

    def convert(self, values: np.ndarray, nan_rep, encoding: str, errors: str):
        """
        Convert the data from this selection to the appropriate pandas type.

        Parameters
        ----------
        values : np.ndarray
        nan_rep :
        encoding : str
        errors : str

        Returns
        -------
        index : listlike to become an Index
        data : ndarraylike to become a column
        """
        assert isinstance(values, np.ndarray), type(values)

        # values is a recarray
        if values.dtype.fields is not None:
            values = values[self.cname]

        assert self.typ is not None
        if self.dtype is None:
            # Note: in tests we never have timedelta64 or datetime64,
            # so the _get_data_and_dtype_name may be unnecessary
            converted, dtype_name = _get_data_and_dtype_name(values)
            kind = _dtype_to_kind(dtype_name)
        else:
            converted = values
            dtype_name = self.dtype
            kind = self.kind

        assert isinstance(converted, np.ndarray)  # for mypy

        # use the meta if needed
        meta = _ensure_decoded(self.meta)
        metadata = self.metadata
        ordered = self.ordered
        tz = self.tz

        assert dtype_name is not None
        # convert to the correct dtype
        dtype = _ensure_decoded(dtype_name)

        # reverse converts
        if dtype == "datetime64":
            # recreate with tz if indicated
            converted = _set_tz(converted, tz, coerce=True)

        elif dtype == "timedelta64":
            converted = np.asarray(converted, dtype="m8[ns]")
        elif dtype == "date":
            try:
                converted = np.asarray(
                    [date.fromordinal(v) for v in converted], dtype=object
                )
            except ValueError:
                converted = np.asarray(
                    [date.fromtimestamp(v) for v in converted], dtype=object
                )

        elif meta == "category":
            # we have a categorical
            categories = metadata
            codes = converted.ravel()

            # if we have stored a NaN in the categories
            # then strip it; in theory we could have BOTH
            # -1s in the codes and nulls :<
            if categories is None:
                # Handle case of NaN-only categorical columns in which case
                # the categories are an empty array; when this is stored,
                # pytables cannot write a zero-len array, so on readback
                # the categories would be None and `read_hdf()` would fail.
                categories = Index([], dtype=np.float64)
            else:
                mask = isna(categories)
                if mask.any():
                    categories = categories[~mask]
                    codes[codes != -1] -= mask.astype(int).cumsum()._values

            converted = Categorical.from_codes(
                codes, categories=categories, ordered=ordered
            )

        else:
            try:
                converted = converted.astype(dtype, copy=False)
            except TypeError:
                converted = converted.astype("O", copy=False)

        # convert nans / decode
        if _ensure_decoded(kind) == "string":
            converted = _unconvert_string_array(
                converted, nan_rep=nan_rep, encoding=encoding, errors=errors
            )

        return self.values, converted

    def set_attr(self) -> None:
        """set the data for this column"""
        setattr(self.attrs, self.kind_attr, self.values)
        setattr(self.attrs, self.meta_attr, self.meta)
        assert self.dtype is not None
        setattr(self.attrs, self.dtype_attr, self.dtype)


class DataIndexableCol(DataCol):
    """represent a data column that can be indexed"""

    is_data_indexable = True

    def validate_names(self) -> None:
        if not is_object_dtype(Index(self.values)):
            # TODO: should the message here be more specifically non-str?
            raise ValueError("cannot have non-object label DataIndexableCol")

    @classmethod
    def get_atom_string(cls, shape, itemsize):
        return _tables().StringCol(itemsize=itemsize)

    @classmethod
    def get_atom_data(cls, shape, kind: str) -> Col:
        return cls.get_atom_coltype(kind=kind)()

    @classmethod
    def get_atom_datetime64(cls, shape):
        return _tables().Int64Col()

    @classmethod
    def get_atom_timedelta64(cls, shape):
        return _tables().Int64Col()


class GenericDataIndexableCol(DataIndexableCol):
    """represent a generic pytables data column"""


class Fixed:
    """
    represent an object in my store
    facilitate read/write of various types of objects
    this is an abstract base class

    Parameters
    ----------
    parent : HDFStore
    group : Node
        The group node where the table resides.
    """

    pandas_kind: str
    format_type: str = "fixed"  # GH#30962 needed by dask
    obj_type: type[DataFrame | Series]
    ndim: int
    parent: HDFStore
    is_table: bool = False

    def __init__(
        self,
        parent: HDFStore,
        group: Node,
        encoding: str | None = "UTF-8",
        errors: str = "strict",
    ) -> None:
        assert isinstance(parent, HDFStore), type(parent)
        assert _table_mod is not None  # needed for mypy
        assert isinstance(group, _table_mod.Node), type(group)
        self.parent = parent
        self.group = group
        self.encoding = _ensure_encoding(encoding)
        self.errors = errors

    @property
    def is_old_version(self) -> bool:
        return self.version[0] <= 0 and self.version[1] <= 10 and self.version[2] < 1

    @property
    def version(self) -> tuple[int, int, int]:
        """compute and set our version"""
        version = _ensure_decoded(getattr(self.group._v_attrs, "pandas_version", None))
        try:
            version = tuple(int(x) for x in version.split("."))
            if len(version) == 2:
                version = version + (0,)
        except AttributeError:
            version = (0, 0, 0)
        return version

    @property
    def pandas_type(self):
        return _ensure_decoded(getattr(self.group._v_attrs, "pandas_type", None))

    def __repr__(self) -> str:
        """return a pretty representation of myself"""
        self.infer_axes()
        s = self.shape
        if s is not None:
            if isinstance(s, (list, tuple)):
                jshape = ",".join([pprint_thing(x) for x in s])
                s = f"[{jshape}]"
            return f"{self.pandas_type:12.12} (shape->{s})"
        return self.pandas_type

    def set_object_info(self) -> None:
        """set my pandas type & version"""
        self.attrs.pandas_type = str(self.pandas_kind)
        self.attrs.pandas_version = str(_version)

    def copy(self) -> Fixed:
        new_self = copy.copy(self)
        return new_self

    @property
    def shape(self):
        return self.nrows

    @property
    def pathname(self):
        return self.group._v_pathname

    @property
    def _handle(self):
        return self.parent._handle

    @property
    def _filters(self):
        return self.parent._filters

    @property
    def _complevel(self) -> int:
        return self.parent._complevel

    @property
    def _fletcher32(self) -> bool:
        return self.parent._fletcher32

    @property
    def attrs(self):
        return self.group._v_attrs

    def set_attrs(self) -> None:
        """set our object attributes"""

    def get_attrs(self) -> None:
        """get our object attributes"""

    @property
    def storable(self):
        """return my storable"""
        return self.group

    @property
    def is_exists(self) -> bool:
        return False

    @property
    def nrows(self):
        return getattr(self.storable, "nrows", None)

    def validate(self, other) -> Literal[True] | None:
        """validate against an existing storable"""
        if other is None:
            return None
        return True

    def validate_version(self, where=None) -> None:
        """are we trying to operate on an old version?"""

    def infer_axes(self) -> bool:
        """
        infer the axes of my storer

        return a boolean indicating if we have a valid storer or not
        """
        s = self.storable
        if s is None:
            return False
        self.get_attrs()
        return True

    def read(
        self,
        where=None,
        columns=None,
        start: int | None = None,
        stop: int | None = None,
    ):
        raise NotImplementedError(
            "cannot read on an abstract storer: subclasses should implement"
        )

    def write(self, **kwargs):
        raise NotImplementedError(
            "cannot write on an abstract storer: subclasses should implement"
        )

    def delete(
        self, where=None, start: int | None = None, stop: int | None = None
    ) -> None:
        """
        support fully deleting the node in its entirety (only) - where
        specification must be None
        """
        if com.all_none(where, start, stop):
            self._handle.remove_node(self.group, recursive=True)
            return None

        raise TypeError("cannot delete on an abstract storer")


class GenericFixed(Fixed):
    """a generified fixed version"""

    _index_type_map = {DatetimeIndex: "datetime", PeriodIndex: "period"}
    _reverse_index_map = {v: k for k, v in _index_type_map.items()}
    attributes: list[str] = []

    # indexer helpers
    def _class_to_alias(self, cls) -> str:
        return self._index_type_map.get(cls, "")

    def _alias_to_class(self, alias):
        if isinstance(alias, type):  # pragma: no cover
            # compat: for a short period of time master stored types
            return alias
        return self._reverse_index_map.get(alias, Index)

    def _get_index_factory(self, attrs):
        index_class = self._alias_to_class(
            _ensure_decoded(getattr(attrs, "index_class", ""))
        )

        factory: Callable

        if index_class == DatetimeIndex:

            def f(values, freq=None, tz=None):
                # data are already in UTC, localize and convert if tz present
                dta = DatetimeArray._simple_new(values.values, freq=freq)
                result = DatetimeIndex._simple_new(dta, name=None)
                if tz is not None:
                    result = result.tz_localize("UTC").tz_convert(tz)
                return result

            factory = f
        elif index_class == PeriodIndex:

            def f(values, freq=None, tz=None):
                parr = PeriodArray._simple_new(values, freq=freq)
                return PeriodIndex._simple_new(parr, name=None)

            factory = f
        else:
            factory = index_class

        kwargs = {}
        if "freq" in attrs:
            kwargs["freq"] = attrs["freq"]
            if index_class is Index:
                # DTI/PI would be gotten by _alias_to_class
                factory = TimedeltaIndex

        if "tz" in attrs:
            if isinstance(attrs["tz"], bytes):
                # created by python2
                kwargs["tz"] = attrs["tz"].decode("utf-8")
            else:
                # created by python3
                kwargs["tz"] = attrs["tz"]
            assert index_class is DatetimeIndex  # just checking

        return factory, kwargs

    def validate_read(self, columns, where) -> None:
        """
        raise if any keywords are passed which are not-None
        """
        if columns is not None:
            raise TypeError(
                "cannot pass a column specification when reading "
                "a Fixed format store. this store must be selected in its entirety"
            )
        if where is not None:
            raise TypeError(
                "cannot pass a where specification when reading "
                "from a Fixed format store. this store must be selected in its entirety"
            )

    @property
    def is_exists(self) -> bool:
        return True

    def set_attrs(self) -> None:
        """set our object attributes"""
        self.attrs.encoding = self.encoding
        self.attrs.errors = self.errors

    def get_attrs(self) -> None:
        """retrieve our attributes"""
        self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
        self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
        for n in self.attributes:
            setattr(self, n, _ensure_decoded(getattr(self.attrs, n, None)))

    # error: Signature of "write" incompatible with supertype "Fixed"
    def write(self, obj, **kwargs) -> None:  # type: ignore[override]
        self.set_attrs()

    def read_array(self, key: str, start: int | None = None, stop: int | None = None):
        """read an array for the specified node (off of group)"""
        import tables

        node = getattr(self.group, key)
        attrs = node._v_attrs

        transposed = getattr(attrs, "transposed", False)

        if isinstance(node, tables.VLArray):
            ret = node[0][start:stop]
        else:
            dtype = _ensure_decoded(getattr(attrs, "value_type", None))
            shape = getattr(attrs, "shape", None)

            if shape is not None:
                # length 0 axis
                ret = np.empty(shape, dtype=dtype)
            else:
                ret = node[start:stop]

            if dtype == "datetime64":
                # reconstruct a timezone if indicated
                tz = getattr(attrs, "tz", None)
                ret = _set_tz(ret, tz, coerce=True)

            elif dtype == "timedelta64":
                ret = np.asarray(ret, dtype="m8[ns]")

        if transposed:
            return ret.T
        else:
            return ret

    def read_index(
        self, key: str, start: int | None = None, stop: int | None = None
    ) -> Index:
        variety = _ensure_decoded(getattr(self.attrs, f"{key}_variety"))

        if variety == "multi":
            return self.read_multi_index(key, start=start, stop=stop)
        elif variety == "regular":
            node = getattr(self.group, key)
            index = self.read_index_node(node, start=start, stop=stop)
            return index
        else:  # pragma: no cover
            raise TypeError(f"unrecognized index variety: {variety}")

    def write_index(self, key: str, index: Index) -> None:
        if isinstance(index, MultiIndex):
            setattr(self.attrs, f"{key}_variety", "multi")
            self.write_multi_index(key, index)
        else:
            setattr(self.attrs, f"{key}_variety", "regular")
            converted = _convert_index("index", index, self.encoding, self.errors)

            self.write_array(key, converted.values)

            node = getattr(self.group, key)
            node._v_attrs.kind = converted.kind
            node._v_attrs.name = index.name

            if isinstance(index, (DatetimeIndex, PeriodIndex)):
                node._v_attrs.index_class = self._class_to_alias(type(index))

            if isinstance(index, (DatetimeIndex, PeriodIndex, TimedeltaIndex)):
                node._v_attrs.freq = index.freq

            if isinstance(index, DatetimeIndex) and index.tz is not None:
                node._v_attrs.tz = _get_tz(index.tz)

    def write_multi_index(self, key: str, index: MultiIndex) -> None:
        setattr(self.attrs, f"{key}_nlevels", index.nlevels)

        for i, (lev, level_codes, name) in enumerate(
            zip(index.levels, index.codes, index.names)
        ):
            # write the level
            if is_extension_array_dtype(lev):
                raise NotImplementedError(
                    "Saving a MultiIndex with an extension dtype is not supported."
                )
            level_key = f"{key}_level{i}"
            conv_level = _convert_index(level_key, lev, self.encoding, self.errors)
            self.write_array(level_key, conv_level.values)
            node = getattr(self.group, level_key)
            node._v_attrs.kind = conv_level.kind
            node._v_attrs.name = name

            # write the name
            setattr(node._v_attrs, f"{key}_name{name}", name)

            # write the labels
            label_key = f"{key}_label{i}"
            self.write_array(label_key, level_codes)

    def read_multi_index(
        self, key: str, start: int | None = None, stop: int | None = None
    ) -> MultiIndex:
        nlevels = getattr(self.attrs, f"{key}_nlevels")

        levels = []
        codes = []
        names: list[Hashable] = []
        for i in range(nlevels):
            level_key = f"{key}_level{i}"
            node = getattr(self.group, level_key)
            lev = self.read_index_node(node, start=start, stop=stop)
            levels.append(lev)
            names.append(lev.name)

            label_key = f"{key}_label{i}"
            level_codes = self.read_array(label_key, start=start, stop=stop)
            codes.append(level_codes)

        return MultiIndex(
            levels=levels, codes=codes, names=names, verify_integrity=True
        )

    def read_index_node(
        self, node: Node, start: int | None = None, stop: int | None = None
    ) -> Index:
        data = node[start:stop]
        # If the index was an empty array write_array_empty() will
        # have written a sentinel. Here we replace it with the original.
        if "shape" in node._v_attrs and np.prod(node._v_attrs.shape) == 0:
            data = np.empty(node._v_attrs.shape, dtype=node._v_attrs.value_type)
        kind = _ensure_decoded(node._v_attrs.kind)
        name = None

        if "name" in node._v_attrs:
            name = _ensure_str(node._v_attrs.name)
            name = _ensure_decoded(name)

        attrs = node._v_attrs
        factory, kwargs = self._get_index_factory(attrs)

        if kind in ("date", "object"):
            index = factory(
                _unconvert_index(
                    data, kind, encoding=self.encoding, errors=self.errors
                ),
                dtype=object,
                **kwargs,
            )
        else:
            index = factory(
                _unconvert_index(
                    data, kind, encoding=self.encoding, errors=self.errors
                ),
                **kwargs,
            )

        index.name = name

        return index
  2543. def write_array_empty(self, key: str, value: ArrayLike) -> None:
  2544. """write a 0-len array"""
  2545. # ugly hack for length 0 axes
  2546. arr = np.empty((1,) * value.ndim)
  2547. self._handle.create_array(self.group, key, arr)
  2548. node = getattr(self.group, key)
  2549. node._v_attrs.value_type = str(value.dtype)
  2550. node._v_attrs.shape = value.shape

    def write_array(
        self, key: str, obj: AnyArrayLike, items: Index | None = None
    ) -> None:
        # TODO: we only have a few tests that get here, the only EA
        #  that gets passed is DatetimeArray, and we never have
        #  both self._filters and EA

        value = extract_array(obj, extract_numpy=True)

        if key in self.group:
            self._handle.remove_node(self.group, key)

        # Transform needed to interface with pytables row/col notation
        empty_array = value.size == 0
        transposed = False

        if is_categorical_dtype(value.dtype):
            raise NotImplementedError(
                "Cannot store a category dtype in a HDF5 dataset that uses format="
                '"fixed". Use format="table".'
            )

        if not empty_array:
            if hasattr(value, "T"):
                # ExtensionArrays (1d) may not have transpose.
                value = value.T
                transposed = True

        atom = None
        if self._filters is not None:
            with suppress(ValueError):
                # get the atom for this datatype
                atom = _tables().Atom.from_dtype(value.dtype)

        if atom is not None:
            # We only get here if self._filters is non-None and
            #  the Atom.from_dtype call succeeded

            # create an empty chunked array and fill it from value
            if not empty_array:
                ca = self._handle.create_carray(
                    self.group, key, atom, value.shape, filters=self._filters
                )
                ca[:] = value

            else:
                self.write_array_empty(key, value)

        elif value.dtype.type == np.object_:
            # infer the type, warn if we have a non-string type here (for
            #  performance)
            inferred_type = lib.infer_dtype(value, skipna=False)
            if empty_array:
                pass
            elif inferred_type == "string":
                pass
            else:
                ws = performance_doc % (inferred_type, key, items)
                warnings.warn(ws, PerformanceWarning, stacklevel=find_stack_level())

            vlarr = self._handle.create_vlarray(self.group, key, _tables().ObjectAtom())
            vlarr.append(value)

        elif is_datetime64_dtype(value.dtype):
            self._handle.create_array(self.group, key, value.view("i8"))
            getattr(self.group, key)._v_attrs.value_type = "datetime64"
        elif is_datetime64tz_dtype(value.dtype):
            # store as UTC
            # with a zone

            # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
            # attribute "asi8"
            self._handle.create_array(
                self.group, key, value.asi8  # type: ignore[union-attr]
            )

            node = getattr(self.group, key)
            # error: Item "ExtensionArray" of "Union[Any, ExtensionArray]" has no
            # attribute "tz"
            node._v_attrs.tz = _get_tz(value.tz)  # type: ignore[union-attr]
            node._v_attrs.value_type = "datetime64"
        elif is_timedelta64_dtype(value.dtype):
            self._handle.create_array(self.group, key, value.view("i8"))
            getattr(self.group, key)._v_attrs.value_type = "timedelta64"
        elif empty_array:
            self.write_array_empty(key, value)
        else:
            self._handle.create_array(self.group, key, value)

        getattr(self.group, key)._v_attrs.transposed = transposed
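
    # A hedged note on the dispatch above (example assumes an open HDFStore
    # `store`): object-dtype data that is not all-string falls into the
    # VLArray/ObjectAtom branch, which pickles values and is slow, hence the
    # PerformanceWarning, e.g.
    #
    #   store.put("df", DataFrame({"a": [1, "x"]}), format="fixed")  # warns
    #
    # whereas numeric, datetime64(tz) and timedelta64 data take the typed
    # create_array branches.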


class SeriesFixed(GenericFixed):
    pandas_kind = "series"
    attributes = ["name"]

    name: Hashable

    @property
    def shape(self):
        try:
            return (len(self.group.values),)
        except (TypeError, AttributeError):
            return None

    def read(
        self,
        where=None,
        columns=None,
        start: int | None = None,
        stop: int | None = None,
    ) -> Series:
        self.validate_read(columns, where)
        index = self.read_index("index", start=start, stop=stop)
        values = self.read_array("values", start=start, stop=stop)
        return Series(values, index=index, name=self.name, copy=False)

    # error: Signature of "write" incompatible with supertype "Fixed"
    def write(self, obj, **kwargs) -> None:  # type: ignore[override]
        super().write(obj, **kwargs)
        self.write_index("index", obj.index)
        self.write_array("values", obj)
        self.attrs.name = obj.name
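
    # Storage layout sketch (illustrative; assumes an open HDFStore `store`):
    #
    #   store.put("s", Series([1.0, 2.0], index=["a", "b"]))  # format="fixed"
    #
    # writes two array nodes under the group ("index" and "values") plus a
    # "name" attribute; read() above reassembles the Series from them.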


class BlockManagerFixed(GenericFixed):
    attributes = ["ndim", "nblocks"]

    nblocks: int

    @property
    def shape(self) -> Shape | None:
        try:
            ndim = self.ndim

            # items
            items = 0
            for i in range(self.nblocks):
                node = getattr(self.group, f"block{i}_items")
                shape = getattr(node, "shape", None)
                if shape is not None:
                    items += shape[0]

            # data shape
            node = self.group.block0_values
            shape = getattr(node, "shape", None)
            if shape is not None:
                shape = list(shape[0 : (ndim - 1)])
            else:
                shape = []

            shape.append(items)

            return shape
        except AttributeError:
            return None

    def read(
        self,
        where=None,
        columns=None,
        start: int | None = None,
        stop: int | None = None,
    ) -> DataFrame:
        # start, stop applied to rows, so 0th axis only
        self.validate_read(columns, where)
        select_axis = self.obj_type()._get_block_manager_axis(0)

        axes = []
        for i in range(self.ndim):
            _start, _stop = (start, stop) if i == select_axis else (None, None)
            ax = self.read_index(f"axis{i}", start=_start, stop=_stop)
            axes.append(ax)

        items = axes[0]
        dfs = []

        for i in range(self.nblocks):
            blk_items = self.read_index(f"block{i}_items")
            values = self.read_array(f"block{i}_values", start=_start, stop=_stop)

            columns = items[items.get_indexer(blk_items)]
            df = DataFrame(values.T, columns=columns, index=axes[1], copy=False)
            dfs.append(df)

        if len(dfs) > 0:
            out = concat(dfs, axis=1, copy=True)
            out = out.reindex(columns=items, copy=False)
            return out

        return DataFrame(columns=axes[0], index=axes[1])

    # error: Signature of "write" incompatible with supertype "Fixed"
    def write(self, obj, **kwargs) -> None:  # type: ignore[override]
        super().write(obj, **kwargs)

        # TODO(ArrayManager) HDFStore relies on accessing the blocks
        if isinstance(obj._mgr, ArrayManager):
            obj = obj._as_manager("block")

        data = obj._mgr
        if not data.is_consolidated():
            data = data.consolidate()

        self.attrs.ndim = data.ndim
        for i, ax in enumerate(data.axes):
            if i == 0 and (not ax.is_unique):
                raise ValueError("Columns index has to be unique for fixed format")
            self.write_index(f"axis{i}", ax)

        # Supporting mixed-type DataFrame objects...nontrivial
        self.attrs.nblocks = len(data.blocks)
        for i, blk in enumerate(data.blocks):
            # I have no idea why, but writing values before items fixed #2299
            blk_items = data.items.take(blk.mgr_locs)
            self.write_array(f"block{i}_values", blk.values, items=blk_items)
            self.write_index(f"block{i}_items", blk_items)
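
    # Layout note (illustrative; assumes an open HDFStore `store`):
    #
    #   store.put("df", DataFrame({"a": [1, 2], "b": [1.5, 2.5]}))
    #
    # consolidates into one int64 and one float64 block, stored as axis0/axis1
    # nodes plus block{i}_values/block{i}_items node pairs; read() stitches the
    # per-block frames back together and reindexes to the original columns.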


class FrameFixed(BlockManagerFixed):
    pandas_kind = "frame"
    obj_type = DataFrame


class Table(Fixed):
    """
    represent a table:
        facilitate read/write of various types of tables

    Attrs in Table Node
    -------------------
    These are attributes that are stored in the main table node, they are
    necessary to recreate these tables when read back in.

    index_axes    : a list of tuples of the (original indexing axis and
        index column)
    non_index_axes: a list of tuples of the (original index axis and
        columns on a non-indexing axis)
    values_axes   : a list of the columns which comprise the data of this
        table
    data_columns  : a list of the columns that we are allowing indexing
        (these become single columns in values_axes)
    nan_rep       : the string to use for nan representations for string
        objects
    levels        : the names of levels
    metadata      : the names of the metadata columns
    """

    pandas_kind = "wide_table"
    format_type: str = "table"  # GH#30962 needed by dask
    table_type: str
    levels: int | list[Hashable] = 1
    is_table = True

    metadata: list

    def __init__(
        self,
        parent: HDFStore,
        group: Node,
        encoding: str | None = None,
        errors: str = "strict",
        index_axes: list[IndexCol] | None = None,
        non_index_axes: list[tuple[AxisInt, Any]] | None = None,
        values_axes: list[DataCol] | None = None,
        data_columns: list | None = None,
        info: dict | None = None,
        nan_rep=None,
    ) -> None:
        super().__init__(parent, group, encoding=encoding, errors=errors)
        self.index_axes = index_axes or []
        self.non_index_axes = non_index_axes or []
        self.values_axes = values_axes or []
        self.data_columns = data_columns or []
        self.info = info or {}
        self.nan_rep = nan_rep

    @property
    def table_type_short(self) -> str:
        return self.table_type.split("_")[0]

    def __repr__(self) -> str:
        """return a pretty representation of myself"""
        self.infer_axes()
        jdc = ",".join(self.data_columns) if len(self.data_columns) else ""
        dc = f",dc->[{jdc}]"

        ver = ""
        if self.is_old_version:
            jver = ".".join([str(x) for x in self.version])
            ver = f"[{jver}]"

        jindex_axes = ",".join([a.name for a in self.index_axes])
        return (
            f"{self.pandas_type:12.12}{ver} "
            f"(typ->{self.table_type_short},nrows->{self.nrows},"
            f"ncols->{self.ncols},indexers->[{jindex_axes}]{dc})"
        )

    def __getitem__(self, c: str):
        """return the axis for c"""
        for a in self.axes:
            if c == a.name:
                return a
        return None

    def validate(self, other) -> None:
        """validate against an existing table"""
        if other is None:
            return

        if other.table_type != self.table_type:
            raise TypeError(
                "incompatible table_type with existing "
                f"[{other.table_type} - {self.table_type}]"
            )

        for c in ["index_axes", "non_index_axes", "values_axes"]:
            sv = getattr(self, c, None)
            ov = getattr(other, c, None)
            if sv != ov:
                # show the error for the specific axes
                # Argument 1 to "enumerate" has incompatible type
                # "Optional[Any]"; expected "Iterable[Any]"  [arg-type]
                for i, sax in enumerate(sv):  # type: ignore[arg-type]
                    # Value of type "Optional[Any]" is not indexable  [index]
                    oax = ov[i]  # type: ignore[index]
                    if sax != oax:
                        raise ValueError(
                            f"invalid combination of [{c}] on appending data "
                            f"[{sax}] vs current table [{oax}]"
                        )

                # should never get here
                raise Exception(
                    f"invalid combination of [{c}] on appending data [{sv}] vs "
                    f"current table [{ov}]"
                )

    @property
    def is_multi_index(self) -> bool:
        """the levels attribute is 1 or a list in the case of a multi-index"""
        return isinstance(self.levels, list)

    def validate_multiindex(
        self, obj: DataFrame | Series
    ) -> tuple[DataFrame, list[Hashable]]:
        """
        validate that we can store the multi-index; reset and return the
        new object
        """
        levels = com.fill_missing_names(obj.index.names)
        try:
            reset_obj = obj.reset_index()
        except ValueError as err:
            raise ValueError(
                "duplicate names/columns in the multi-index when storing as a table"
            ) from err
        assert isinstance(reset_obj, DataFrame)  # for mypy
        return reset_obj, levels

    @property
    def nrows_expected(self) -> int:
        """based on our axes, compute the expected nrows"""
        return np.prod([i.cvalues.shape[0] for i in self.index_axes])

    @property
    def is_exists(self) -> bool:
        """has this table been created"""
        return "table" in self.group

    @property
    def storable(self):
        return getattr(self.group, "table", None)

    @property
    def table(self):
        """return the table group (this is my storable)"""
        return self.storable

    @property
    def dtype(self):
        return self.table.dtype

    @property
    def description(self):
        return self.table.description

    @property
    def axes(self):
        return itertools.chain(self.index_axes, self.values_axes)

    @property
    def ncols(self) -> int:
        """the number of total columns in the values axes"""
        return sum(len(a.values) for a in self.values_axes)

    @property
    def is_transposed(self) -> bool:
        return False

    @property
    def data_orientation(self) -> tuple[int, ...]:
        """return a tuple of my permuted axes, non_indexable at the front"""
        return tuple(
            itertools.chain(
                [int(a[0]) for a in self.non_index_axes],
                [int(a.axis) for a in self.index_axes],
            )
        )

    def queryables(self) -> dict[str, Any]:
        """return a dict of the kinds of allowable columns for this object"""
        # mypy doesn't recognize DataFrame._AXIS_NAMES, so we re-write it here
        axis_names = {0: "index", 1: "columns"}

        # compute the values_axes queryables
        d1 = [(a.cname, a) for a in self.index_axes]
        d2 = [(axis_names[axis], None) for axis, values in self.non_index_axes]
        d3 = [
            (v.cname, v) for v in self.values_axes if v.name in set(self.data_columns)
        ]

        return dict(d1 + d2 + d3)
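
    # Illustrative result (hypothetical table whose index column is "index"
    # and whose data_columns include "A"): queryables() maps each queryable
    # name to its column object, or None for a plain axis, e.g.
    #
    #   {"index": <IndexCol>, "columns": None, "A": <DataIndexableCol>}
    #
    # which is what where-clause resolution consults.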

    def index_cols(self):
        """return a list of my index cols"""
        # Note: each `i.cname` below is assured to be a str.
        return [(i.axis, i.cname) for i in self.index_axes]

    def values_cols(self) -> list[str]:
        """return a list of my values cols"""
        return [i.cname for i in self.values_axes]

    def _get_metadata_path(self, key: str) -> str:
        """return the metadata pathname for this key"""
        group = self.group._v_pathname
        return f"{group}/meta/{key}/meta"

    def write_metadata(self, key: str, values: np.ndarray) -> None:
        """
        Write out a metadata array to the key as a fixed-format Series.

        Parameters
        ----------
        key : str
        values : ndarray
        """
        self.parent.put(
            self._get_metadata_path(key),
            Series(values, copy=False),
            format="table",
            encoding=self.encoding,
            errors=self.errors,
            nan_rep=self.nan_rep,
        )

    def read_metadata(self, key: str):
        """return the meta data array for this key"""
        if getattr(getattr(self.group, "meta", None), key, None) is not None:
            return self.parent.select(self._get_metadata_path(key))
        return None

    def set_attrs(self) -> None:
        """set our table type & indexables"""
        self.attrs.table_type = str(self.table_type)
        self.attrs.index_cols = self.index_cols()
        self.attrs.values_cols = self.values_cols()
        self.attrs.non_index_axes = self.non_index_axes
        self.attrs.data_columns = self.data_columns
        self.attrs.nan_rep = self.nan_rep
        self.attrs.encoding = self.encoding
        self.attrs.errors = self.errors
        self.attrs.levels = self.levels
        self.attrs.info = self.info

    def get_attrs(self) -> None:
        """retrieve our attributes"""
        self.non_index_axes = getattr(self.attrs, "non_index_axes", None) or []
        self.data_columns = getattr(self.attrs, "data_columns", None) or []
        self.info = getattr(self.attrs, "info", None) or {}
        self.nan_rep = getattr(self.attrs, "nan_rep", None)
        self.encoding = _ensure_encoding(getattr(self.attrs, "encoding", None))
        self.errors = _ensure_decoded(getattr(self.attrs, "errors", "strict"))
        self.levels: list[Hashable] = getattr(self.attrs, "levels", None) or []
        self.index_axes = [a for a in self.indexables if a.is_an_indexable]
        self.values_axes = [a for a in self.indexables if not a.is_an_indexable]

    def validate_version(self, where=None) -> None:
        """are we trying to operate on an old version?"""
        if where is not None:
            if self.is_old_version:
                ws = incompatibility_doc % ".".join([str(x) for x in self.version])
                warnings.warn(
                    ws,
                    IncompatibilityWarning,
                    stacklevel=find_stack_level(),
                )

    def validate_min_itemsize(self, min_itemsize) -> None:
        """
        validate that min_itemsize doesn't contain items that are not in the
        axes; this needs data_columns to be defined
        """
        if min_itemsize is None:
            return
        if not isinstance(min_itemsize, dict):
            return

        q = self.queryables()
        for k in min_itemsize:
            # ok, apply generally
            if k == "values":
                continue
            if k not in q:
                raise ValueError(
                    f"min_itemsize has the key [{k}] which is not an axis or "
                    "data_column"
                )

    @cache_readonly
    def indexables(self):
        """create/cache the indexables if they don't exist"""
        _indexables = []

        desc = self.description
        table_attrs = self.table.attrs

        # Note: each of the `name` kwargs below are str, ensured
        #  by the definition in index_cols.
        # index columns
        for i, (axis, name) in enumerate(self.attrs.index_cols):
            atom = getattr(desc, name)
            md = self.read_metadata(name)
            meta = "category" if md is not None else None

            kind_attr = f"{name}_kind"
            kind = getattr(table_attrs, kind_attr, None)

            index_col = IndexCol(
                name=name,
                axis=axis,
                pos=i,
                kind=kind,
                typ=atom,
                table=self.table,
                meta=meta,
                metadata=md,
            )
            _indexables.append(index_col)

        # values columns
        dc = set(self.data_columns)
        base_pos = len(_indexables)

        def f(i, c):
            assert isinstance(c, str)
            klass = DataCol
            if c in dc:
                klass = DataIndexableCol

            atom = getattr(desc, c)
            adj_name = _maybe_adjust_name(c, self.version)

            # TODO: why kind_attr here?
            values = getattr(table_attrs, f"{adj_name}_kind", None)
            dtype = getattr(table_attrs, f"{adj_name}_dtype", None)
            # Argument 1 to "_dtype_to_kind" has incompatible type
            # "Optional[Any]"; expected "str"  [arg-type]
            kind = _dtype_to_kind(dtype)  # type: ignore[arg-type]

            md = self.read_metadata(c)
            # TODO: figure out why these two versions of `meta` don't always match.
            #  meta = "category" if md is not None else None
            meta = getattr(table_attrs, f"{adj_name}_meta", None)

            obj = klass(
                name=adj_name,
                cname=c,
                values=values,
                kind=kind,
                pos=base_pos + i,
                typ=atom,
                table=self.table,
                meta=meta,
                metadata=md,
                dtype=dtype,
            )
            return obj

        # Note: the definition of `values_cols` ensures that each
        #  `c` below is a str.
        _indexables.extend([f(i, c) for i, c in enumerate(self.attrs.values_cols)])

        return _indexables

    def create_index(
        self, columns=None, optlevel=None, kind: str | None = None
    ) -> None:
        """
        Create a pytables index on the specified columns.

        Parameters
        ----------
        columns : None, bool, or listlike[str]
            Indicate which columns to create an index on.

            * False : Do not create any indexes.
            * True : Create indexes on all columns.
            * None : Create indexes on all columns.
            * listlike : Create indexes on the given columns.

        optlevel : int or None, default None
            Optimization level, if None, pytables defaults to 6.
        kind : str or None, default None
            Kind of index, if None, pytables defaults to "medium".

        Raises
        ------
        TypeError if trying to create an index on a complex-type column.

        Notes
        -----
        Cannot index Time64Col or ComplexCol.
        Pytables must be >= 3.0.
        """
        if not self.infer_axes():
            return
        if columns is False:
            return

        # index all indexables and data_columns
        if columns is None or columns is True:
            columns = [a.cname for a in self.axes if a.is_data_indexable]
        if not isinstance(columns, (tuple, list)):
            columns = [columns]

        kw = {}
        if optlevel is not None:
            kw["optlevel"] = optlevel
        if kind is not None:
            kw["kind"] = kind

        table = self.table
        for c in columns:
            v = getattr(table.cols, c, None)
            if v is not None:
                # remove the index if the kind/optlevel have changed
                if v.is_indexed:
                    index = v.index
                    cur_optlevel = index.optlevel
                    cur_kind = index.kind

                    if kind is not None and cur_kind != kind:
                        v.remove_index()
                    else:
                        kw["kind"] = cur_kind

                    if optlevel is not None and cur_optlevel != optlevel:
                        v.remove_index()
                    else:
                        kw["optlevel"] = cur_optlevel

                # create the index
                if not v.is_indexed:
                    if v.type.startswith("complex"):
                        raise TypeError(
                            "Columns containing complex values can be stored but "
                            "cannot be indexed when using table format. Either use "
                            "fixed format, set index=False, or do not include "
                            "the columns containing complex values in "
                            "data_columns when initializing the table."
                        )
                    v.create_index(**kw)
            elif c in self.non_index_axes[0][1]:
                # GH 28156
                raise AttributeError(
                    f"column {c} is not a data_column.\n"
                    f"In order to read column {c} you must reload the dataframe \n"
                    f"into HDFStore and include {c} with the data_columns argument."
                )

    def _read_axes(
        self, where, start: int | None = None, stop: int | None = None
    ) -> list[tuple[ArrayLike, ArrayLike]]:
        """
        Create the axes sniffed from the table.

        Parameters
        ----------
        where : ???
        start : int or None, default None
        stop : int or None, default None

        Returns
        -------
        List[Tuple[index_values, column_values]]
        """
        # create the selection
        selection = Selection(self, where=where, start=start, stop=stop)
        values = selection.select()

        results = []
        # convert the data
        for a in self.axes:
            a.set_info(self.info)
            res = a.convert(
                values,
                nan_rep=self.nan_rep,
                encoding=self.encoding,
                errors=self.errors,
            )
            results.append(res)

        return results

    @classmethod
    def get_object(cls, obj, transposed: bool):
        """return the data for this obj"""
        return obj

    def validate_data_columns(self, data_columns, min_itemsize, non_index_axes):
        """
        take the input data_columns and min_itemsize and create a data
        columns spec
        """
        if not len(non_index_axes):
            return []

        axis, axis_labels = non_index_axes[0]
        info = self.info.get(axis, {})
        if info.get("type") == "MultiIndex" and data_columns:
            raise ValueError(
                f"cannot use a multi-index on axis [{axis}] with "
                f"data_columns {data_columns}"
            )

        # evaluate the passed data_columns, True == use all columns
        # take only valid axis labels
        if data_columns is True:
            data_columns = list(axis_labels)
        elif data_columns is None:
            data_columns = []

        # if min_itemsize is a dict, add the keys (exclude 'values')
        if isinstance(min_itemsize, dict):
            existing_data_columns = set(data_columns)
            data_columns = list(data_columns)  # ensure we do not modify
            data_columns.extend(
                [
                    k
                    for k in min_itemsize.keys()
                    if k != "values" and k not in existing_data_columns
                ]
            )

        # return valid columns in the order of our axis
        return [c for c in data_columns if c in axis_labels]
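
    # Behavior sketch (hypothetical inputs): with columns ["A", "B", "C"],
    # data_columns=["B"] and min_itemsize={"C": 20, "values": 50}, every
    # min_itemsize key except "values" is promoted to a data column, so the
    # resulting spec is ["B", "C"].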

    def _create_axes(
        self,
        axes,
        obj: DataFrame,
        validate: bool = True,
        nan_rep=None,
        data_columns=None,
        min_itemsize=None,
    ):
        """
        Create and return the axes.

        Parameters
        ----------
        axes: list or None
            The names or numbers of the axes to create.
        obj : DataFrame
            The object to create axes on.
        validate: bool, default True
            Whether to validate the obj against an existing object already written.
        nan_rep :
            A value to use for string column nan_rep.
        data_columns : List[str], True, or None, default None
            Specify the columns that we want to create to allow indexing on.

            * True : Use all available columns.
            * None : Use no columns.
            * List[str] : Use the specified columns.

        min_itemsize: Dict[str, int] or None, default None
            The min itemsize for a column in bytes.
        """
        if not isinstance(obj, DataFrame):
            group = self.group._v_name
            raise TypeError(
                f"cannot properly create the storer for: [group->{group},"
                f"value->{type(obj)}]"
            )

        # set the default axes if needed
        if axes is None:
            axes = [0]

        # map axes to numbers
        axes = [obj._get_axis_number(a) for a in axes]

        # do we have an existing table (if so, use its axes & data_columns)
        if self.infer_axes():
            table_exists = True
            axes = [a.axis for a in self.index_axes]
            data_columns = list(self.data_columns)
            nan_rep = self.nan_rep
            # TODO: do we always have validate=True here?
        else:
            table_exists = False

        new_info = self.info

        assert self.ndim == 2  # with next check, we must have len(axes) == 1
        # currently only support ndim-1 axes
        if len(axes) != self.ndim - 1:
            raise ValueError(
                "currently only support ndim-1 indexers in an AppendableTable"
            )

        # create according to the new data
        new_non_index_axes: list = []

        # nan_representation
        if nan_rep is None:
            nan_rep = "nan"

        # We construct the non-index-axis first, since that alters new_info
        idx = [x for x in [0, 1] if x not in axes][0]

        a = obj.axes[idx]
        # we might be able to change the axes on the appending data if necessary
        append_axis = list(a)
        if table_exists:
            indexer = len(new_non_index_axes)  # i.e. 0
            exist_axis = self.non_index_axes[indexer][1]
            if not array_equivalent(np.array(append_axis), np.array(exist_axis)):
                # ahah! -> reindex
                if array_equivalent(
                    np.array(sorted(append_axis)), np.array(sorted(exist_axis))
                ):
                    append_axis = exist_axis

        # the non_index_axes info
        info = new_info.setdefault(idx, {})
        info["names"] = list(a.names)
        info["type"] = type(a).__name__

        new_non_index_axes.append((idx, append_axis))

        # Now we can construct our new index axis
        idx = axes[0]
        a = obj.axes[idx]
        axis_name = obj._get_axis_name(idx)
        new_index = _convert_index(axis_name, a, self.encoding, self.errors)
        new_index.axis = idx

        # Because we are always 2D, there is only one new_index, so
        #  we know it will have pos=0
        new_index.set_pos(0)
        new_index.update_info(new_info)
        new_index.maybe_set_size(min_itemsize)  # check for column conflicts

        new_index_axes = [new_index]
        j = len(new_index_axes)  # i.e. 1
        assert j == 1

        # reindex by our non_index_axes & compute data_columns
        assert len(new_non_index_axes) == 1
        for a in new_non_index_axes:
            obj = _reindex_axis(obj, a[0], a[1])

        transposed = new_index.axis == 1

        # figure out data_columns and get out blocks
        data_columns = self.validate_data_columns(
            data_columns, min_itemsize, new_non_index_axes
        )

        frame = self.get_object(obj, transposed)._consolidate()

        blocks, blk_items = self._get_blocks_and_items(
            frame, table_exists, new_non_index_axes, self.values_axes, data_columns
        )

        # add my values
        vaxes = []
        for i, (blk, b_items) in enumerate(zip(blocks, blk_items)):
            # shape of the data column are the indexable axes
            klass = DataCol
            name = None

            # we have a data_column
            if data_columns and len(b_items) == 1 and b_items[0] in data_columns:
                klass = DataIndexableCol
                name = b_items[0]
                if not (name is None or isinstance(name, str)):
                    # TODO: should the message here be more specifically non-str?
                    raise ValueError("cannot have non-object label DataIndexableCol")

            # make sure that we match up the existing columns
            #  if we have an existing table
            existing_col: DataCol | None
            if table_exists and validate:
                try:
                    existing_col = self.values_axes[i]
                except (IndexError, KeyError) as err:
                    raise ValueError(
                        f"Incompatible appended table [{blocks}]"
                        f"with existing table [{self.values_axes}]"
                    ) from err
            else:
                existing_col = None

            new_name = name or f"values_block_{i}"
            data_converted = _maybe_convert_for_string_atom(
                new_name,
                blk.values,
                existing_col=existing_col,
                min_itemsize=min_itemsize,
                nan_rep=nan_rep,
                encoding=self.encoding,
                errors=self.errors,
                columns=b_items,
            )
            adj_name = _maybe_adjust_name(new_name, self.version)

            typ = klass._get_atom(data_converted)
            kind = _dtype_to_kind(data_converted.dtype.name)
            tz = None
            if getattr(data_converted, "tz", None) is not None:
                tz = _get_tz(data_converted.tz)

            meta = metadata = ordered = None
            if is_categorical_dtype(data_converted.dtype):
                ordered = data_converted.ordered
                meta = "category"
                metadata = np.array(data_converted.categories, copy=False).ravel()

            data, dtype_name = _get_data_and_dtype_name(data_converted)

            col = klass(
                name=adj_name,
                cname=new_name,
                values=list(b_items),
                typ=typ,
                pos=j,
                kind=kind,
                tz=tz,
                ordered=ordered,
                meta=meta,
                metadata=metadata,
                dtype=dtype_name,
                data=data,
            )
            col.update_info(new_info)

            vaxes.append(col)

            j += 1

        dcs = [col.name for col in vaxes if col.is_data_indexable]

        new_table = type(self)(
            parent=self.parent,
            group=self.group,
            encoding=self.encoding,
            errors=self.errors,
            index_axes=new_index_axes,
            non_index_axes=new_non_index_axes,
            values_axes=vaxes,
            data_columns=dcs,
            info=new_info,
            nan_rep=nan_rep,
        )
        if hasattr(self, "levels"):
            # TODO: get this into constructor, only for appropriate subclass
            new_table.levels = self.levels

        new_table.validate_min_itemsize(min_itemsize)

        if validate and table_exists:
            new_table.validate(self)

        return new_table

    @staticmethod
    def _get_blocks_and_items(
        frame: DataFrame,
        table_exists: bool,
        new_non_index_axes,
        values_axes,
        data_columns,
    ):
        # Helper to clarify non-state-altering parts of _create_axes

        # TODO(ArrayManager) HDFStore relies on accessing the blocks
        if isinstance(frame._mgr, ArrayManager):
            frame = frame._as_manager("block")

        def get_blk_items(mgr):
            return [mgr.items.take(blk.mgr_locs) for blk in mgr.blocks]

        mgr = frame._mgr
        mgr = cast(BlockManager, mgr)
        blocks: list[Block] = list(mgr.blocks)
        blk_items: list[Index] = get_blk_items(mgr)

        if len(data_columns):
            # TODO: prove that we only get here with axis == 1?
            #  It is the case in all extant tests, but NOT the case
            #  outside this `if len(data_columns)` check.

            axis, axis_labels = new_non_index_axes[0]
            new_labels = Index(axis_labels).difference(Index(data_columns))
            mgr = frame.reindex(new_labels, axis=axis)._mgr
            mgr = cast(BlockManager, mgr)

            blocks = list(mgr.blocks)
            blk_items = get_blk_items(mgr)
            for c in data_columns:
                # This reindex would raise ValueError if we had a duplicate
                #  index, so we can infer that (as long as axis==1) we
                #  get a single column back, so a single block.
                mgr = frame.reindex([c], axis=axis)._mgr
                mgr = cast(BlockManager, mgr)
                blocks.extend(mgr.blocks)
                blk_items.extend(get_blk_items(mgr))

        # reorder the blocks in the same order as the existing table if we can
        if table_exists:
            by_items = {
                tuple(b_items.tolist()): (b, b_items)
                for b, b_items in zip(blocks, blk_items)
            }
            new_blocks: list[Block] = []
            new_blk_items = []
            for ea in values_axes:
                items = tuple(ea.values)
                try:
                    b, b_items = by_items.pop(items)
                    new_blocks.append(b)
                    new_blk_items.append(b_items)
                except (IndexError, KeyError) as err:
                    jitems = ",".join([pprint_thing(item) for item in items])
                    raise ValueError(
                        f"cannot match existing table structure for [{jitems}] "
                        "on appending data"
                    ) from err
            blocks = new_blocks
            blk_items = new_blk_items

        return blocks, blk_items

    def process_axes(self, obj, selection: Selection, columns=None) -> DataFrame:
        """process axes filters"""
        # make a copy to avoid side effects
        if columns is not None:
            columns = list(columns)

        # make sure to include levels if we have them
        if columns is not None and self.is_multi_index:
            assert isinstance(self.levels, list)  # assured by is_multi_index
            for n in self.levels:
                if n not in columns:
                    columns.insert(0, n)

        # reorder by any non_index_axes & limit to the select columns
        for axis, labels in self.non_index_axes:
            obj = _reindex_axis(obj, axis, labels, columns)

            def process_filter(field, filt, op):
                for axis_name in obj._AXIS_ORDERS:
                    axis_number = obj._get_axis_number(axis_name)
                    axis_values = obj._get_axis(axis_name)
                    assert axis_number is not None

                    # see if the field is the name of an axis
                    if field == axis_name:
                        # if we have a multi-index, then need to include
                        #  the levels
                        if self.is_multi_index:
                            filt = filt.union(Index(self.levels))

                        takers = op(axis_values, filt)
                        return obj.loc(axis=axis_number)[takers]

                    # this might be the name of a field IN an axis
                    elif field in axis_values:
                        # we need to filter on this dimension
                        values = ensure_index(getattr(obj, field).values)
                        filt = ensure_index(filt)

                        # hack until we support reversed dim flags
                        if isinstance(obj, DataFrame):
                            axis_number = 1 - axis_number

                        takers = op(values, filt)
                        return obj.loc(axis=axis_number)[takers]

                raise ValueError(f"cannot find the field [{field}] for filtering!")

            # apply the selection filters (but keep in the same order)
            if selection.filter is not None:
                for field, op, filt in selection.filter.format():
                    obj = process_filter(field, filt, op)

        return obj

    def create_description(
        self,
        complib,
        complevel: int | None,
        fletcher32: bool,
        expectedrows: int | None,
    ) -> dict[str, Any]:
        """create the description of the table from the axes & values"""
        # provide expectedrows if it is passed
        if expectedrows is None:
            expectedrows = max(self.nrows_expected, 10000)

        d = {"name": "table", "expectedrows": expectedrows}

        # description from the axes & values
        d["description"] = {a.cname: a.typ for a in self.axes}

        if complib:
            if complevel is None:
                complevel = self._complevel or 9
            filters = _tables().Filters(
                complevel=complevel,
                complib=complib,
                fletcher32=fletcher32 or self._fletcher32,
            )
            d["filters"] = filters
        elif self._filters is not None:
            d["filters"] = self._filters

        return d

    def read_coordinates(
        self, where=None, start: int | None = None, stop: int | None = None
    ):
        """
        select coordinates (row numbers) from a table; return the
        coordinates object
        """
        # validate the version
        self.validate_version(where)

        # infer the data kind
        if not self.infer_axes():
            return False

        # create the selection
        selection = Selection(self, where=where, start=start, stop=stop)
        coords = selection.select_coords()
        if selection.filter is not None:
            for field, op, filt in selection.filter.format():
                data = self.read_column(
                    field, start=coords.min(), stop=coords.max() + 1
                )
                coords = coords[op(data.iloc[coords - coords.min()], filt).values]

        return Index(coords)
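
    # Usage sketch via the public wrapper (assumes an open HDFStore `store`
    # with a table-format frame at "df"):
    #
    #   coords = store.select_as_coordinates("df", where="index > 5")
    #   store.select("df", where=coords)  # reuse the row numbers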

    def read_column(
        self,
        column: str,
        where=None,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        return a single column from the table, generally only indexables
        are interesting
        """
        # validate the version
        self.validate_version()

        # infer the data kind
        if not self.infer_axes():
            return False

        if where is not None:
            raise TypeError("read_column does not currently accept a where clause")

        # find the axes
        for a in self.axes:
            if column == a.name:
                if not a.is_data_indexable:
                    raise ValueError(
                        f"column [{column}] can not be extracted individually; "
                        "it is not data indexable"
                    )

                # column must be an indexable or a data column
                c = getattr(self.table.cols, column)
                a.set_info(self.info)
                col_values = a.convert(
                    c[start:stop],
                    nan_rep=self.nan_rep,
                    encoding=self.encoding,
                    errors=self.errors,
                )
                return Series(_set_tz(col_values[1], a.tz), name=column, copy=False)

        raise KeyError(f"column [{column}] not found in the table")
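
    # Usage sketch via the public wrapper (assumes an open HDFStore `store`
    # with a table-format frame at "df" where "A" is a data_column):
    #
    #   store.select_column("df", "A")      # -> Series of column A
    #   store.select_column("df", "index")  # -> Series of the index values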


class WORMTable(Table):
    """
    a write-once read-many table: this format DOES NOT ALLOW appending to a
    table. writing is a one-time operation; the data are stored in a format
    that allows for searching the data on disk
    """

    table_type = "worm"

    def read(
        self,
        where=None,
        columns=None,
        start: int | None = None,
        stop: int | None = None,
    ):
        """
        read the indices and the indexing array, calculate offset rows and return
        """
        raise NotImplementedError("WORMTable needs to implement read")

    def write(self, **kwargs) -> None:
        """
        write in a format that we can search later on (but cannot append
        to): write out the indices and the values using _write_array
        (e.g. a CArray); create an indexing table so that we can search
        """
        raise NotImplementedError("WORMTable needs to implement write")


class AppendableTable(Table):
    """support the new appendable table formats"""

    table_type = "appendable"

    # error: Signature of "write" incompatible with supertype "Fixed"
    def write(  # type: ignore[override]
        self,
        obj,
        axes=None,
        append: bool = False,
        complib=None,
        complevel=None,
        fletcher32=None,
        min_itemsize=None,
        chunksize=None,
        expectedrows=None,
        dropna: bool = False,
        nan_rep=None,
        data_columns=None,
        track_times: bool = True,
    ) -> None:
        if not append and self.is_exists:
            self._handle.remove_node(self.group, "table")

        # create the axes
        table = self._create_axes(
            axes=axes,
            obj=obj,
            validate=append,
            min_itemsize=min_itemsize,
            nan_rep=nan_rep,
            data_columns=data_columns,
        )

        for a in table.axes:
            a.validate_names()

        if not table.is_exists:
            # create the table
            options = table.create_description(
                complib=complib,
                complevel=complevel,
                fletcher32=fletcher32,
                expectedrows=expectedrows,
            )

            # set the table attributes
            table.set_attrs()

            options["track_times"] = track_times

            # create the table
            table._handle.create_table(table.group, **options)

        # update my info
        table.attrs.info = table.info

        # validate the axes and set the kinds
        for a in table.axes:
            a.validate_and_set(table, append)

        # add the rows
        table.write_data(chunksize, dropna=dropna)

    def write_data(self, chunksize: int | None, dropna: bool = False) -> None:
        """
        we form the data into a 2-d including indexes, values, mask and write
        chunk-by-chunk
        """
        names = self.dtype.names
        nrows = self.nrows_expected

        # if dropna==True, then drop ALL nan rows
        masks = []
        if dropna:
            for a in self.values_axes:
                # figure the mask: only do if we can successfully process this
                # column, otherwise ignore the mask
                mask = isna(a.data).all(axis=0)
                if isinstance(mask, np.ndarray):
                    masks.append(mask.astype("u1", copy=False))

        # consolidate masks
        if len(masks):
            mask = masks[0]
            for m in masks[1:]:
                mask = mask & m
            mask = mask.ravel()
        else:
            mask = None

        # broadcast the indexes if needed
        indexes = [a.cvalues for a in self.index_axes]
        nindexes = len(indexes)
        assert nindexes == 1, nindexes  # ensures we don't need to broadcast

        # transpose the values so first dimension is last
        # reshape the values if needed
        values = [a.take_data() for a in self.values_axes]
        values = [v.transpose(np.roll(np.arange(v.ndim), v.ndim - 1)) for v in values]
        bvalues = []
        for i, v in enumerate(values):
            new_shape = (nrows,) + self.dtype[names[nindexes + i]].shape
            bvalues.append(v.reshape(new_shape))

        # write the chunks
        if chunksize is None:
            chunksize = 100000

        rows = np.empty(min(chunksize, nrows), dtype=self.dtype)
        chunks = nrows // chunksize + 1
        for i in range(chunks):
            start_i = i * chunksize
            end_i = min((i + 1) * chunksize, nrows)
            if start_i >= end_i:
                break

            self.write_data_chunk(
                rows,
                indexes=[a[start_i:end_i] for a in indexes],
                mask=mask[start_i:end_i] if mask is not None else None,
                values=[v[start_i:end_i] for v in bvalues],
            )

    def write_data_chunk(
        self,
        rows: np.ndarray,
        indexes: list[np.ndarray],
        mask: npt.NDArray[np.bool_] | None,
        values: list[np.ndarray],
    ) -> None:
        """
        Parameters
        ----------
        rows : an empty memory space where we are putting the chunk
        indexes : an array of the indexes
        mask : an array of the masks
        values : an array of the values
        """
        # 0 len
        for v in values:
            if not np.prod(v.shape):
                return

        nrows = indexes[0].shape[0]
        if nrows != len(rows):
            rows = np.empty(nrows, dtype=self.dtype)

        names = self.dtype.names
        nindexes = len(indexes)

        # indexes
        for i, idx in enumerate(indexes):
            rows[names[i]] = idx

        # values
        for i, v in enumerate(values):
            rows[names[i + nindexes]] = v

        # mask
        if mask is not None:
            m = ~mask.ravel().astype(bool, copy=False)
            if not m.all():
                rows = rows[m]

        if len(rows):
            self.table.append(rows)
            self.table.flush()

    def delete(self, where=None, start: int | None = None, stop: int | None = None):
        # delete all rows (and return the nrows)
        if where is None or not len(where):
            if start is None and stop is None:
                nrows = self.nrows
                self._handle.remove_node(self.group, recursive=True)
            else:
                # pytables<3.0 would remove a single row with stop=None
                if stop is None:
                    stop = self.nrows
                nrows = self.table.remove_rows(start=start, stop=stop)
                self.table.flush()
            return nrows

        # infer the data kind
        if not self.infer_axes():
            return None

        # create the selection
        table = self.table
        selection = Selection(self, where, start=start, stop=stop)
        values = selection.select_coords()

        # delete the rows in reverse order
        sorted_series = Series(values, copy=False).sort_values()
        ln = len(sorted_series)

        if ln:
            # construct groups of consecutive rows
            diff = sorted_series.diff()
            groups = list(diff[diff > 1].index)

            # 1 group
            if not len(groups):
                groups = [0]

            # final element
            if groups[-1] != ln:
                groups.append(ln)

            # initial element
            if groups[0] != 0:
                groups.insert(0, 0)

            # we must remove in reverse order!
            pg = groups.pop()
            for g in reversed(groups):
                rows = sorted_series.take(range(g, pg))
                table.remove_rows(
                    start=rows[rows.index[0]], stop=rows[rows.index[-1]] + 1
                )
                pg = g

            self.table.flush()

        # return the number of rows removed
        return ln
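
    # Usage sketch via the public wrapper (assumes an open HDFStore `store`
    # with a table-format frame at "df"):
    #
    #   store.remove("df", where="index > 100")  # -> number of rows removed
    #
    # Consecutive coordinates are grouped and removed back-to-front so that
    # earlier row numbers stay valid while deleting.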


class AppendableFrameTable(AppendableTable):
    """support the new appendable table formats"""

    pandas_kind = "frame_table"
    table_type = "appendable_frame"
    ndim = 2
    obj_type: type[DataFrame | Series] = DataFrame

    @property
    def is_transposed(self) -> bool:
        return self.index_axes[0].axis == 1

    @classmethod
    def get_object(cls, obj, transposed: bool):
        """these are written transposed"""
        if transposed:
            obj = obj.T
        return obj

    def read(
        self,
        where=None,
        columns=None,
        start: int | None = None,
        stop: int | None = None,
    ):
        # validate the version
        self.validate_version(where)

        # infer the data kind
        if not self.infer_axes():
            return None

        result = self._read_axes(where=where, start=start, stop=stop)

        info = (
            self.info.get(self.non_index_axes[0][0], {})
            if len(self.non_index_axes)
            else {}
        )

        inds = [i for i, ax in enumerate(self.axes) if ax is self.index_axes[0]]
        assert len(inds) == 1
        ind = inds[0]

        index = result[ind][0]

        frames = []
        for i, a in enumerate(self.axes):
            if a not in self.values_axes:
                continue
            index_vals, cvalues = result[i]

            # we could have a multi-index constructor here
            # ensure_index doesn't recognize our list-of-tuples here
            if info.get("type") != "MultiIndex":
                cols = Index(index_vals)
            else:
                cols = MultiIndex.from_tuples(index_vals)

            names = info.get("names")
            if names is not None:
                cols.set_names(names, inplace=True)

            if self.is_transposed:
                values = cvalues
                index_ = cols
                cols_ = Index(index, name=getattr(index, "name", None))
            else:
                values = cvalues.T
                index_ = Index(index, name=getattr(index, "name", None))
                cols_ = cols

            # if we have a DataIndexableCol, its shape will only be 1 dim
            if values.ndim == 1 and isinstance(values, np.ndarray):
                values = values.reshape((1, values.shape[0]))

            if isinstance(values, np.ndarray):
                df = DataFrame(values.T, columns=cols_, index=index_, copy=False)
            elif isinstance(values, Index):
                df = DataFrame(values, columns=cols_, index=index_)
            else:
                # Categorical
                df = DataFrame._from_arrays([values], columns=cols_, index=index_)
            assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
            frames.append(df)

        if len(frames) == 1:
            df = frames[0]
        else:
            df = concat(frames, axis=1)

        selection = Selection(self, where=where, start=start, stop=stop)
        # apply the selection filters & axis orderings
        df = self.process_axes(df, selection=selection, columns=columns)

        return df


class AppendableSeriesTable(AppendableFrameTable):
    """support the new appendable table formats"""

    pandas_kind = "series_table"
    table_type = "appendable_series"
    ndim = 2
    obj_type = Series

    @property
    def is_transposed(self) -> bool:
        return False

    @classmethod
    def get_object(cls, obj, transposed: bool):
        return obj

    def write(self, obj, data_columns=None, **kwargs):
        """we are going to write this as a frame table"""
        if not isinstance(obj, DataFrame):
            name = obj.name or "values"
            obj = obj.to_frame(name)
        return super().write(obj=obj, data_columns=obj.columns.tolist(), **kwargs)

    def read(
        self,
        where=None,
        columns=None,
        start: int | None = None,
        stop: int | None = None,
    ) -> Series:
        is_multi_index = self.is_multi_index
        if columns is not None and is_multi_index:
            assert isinstance(self.levels, list)  # needed for mypy
            for n in self.levels:
                if n not in columns:
                    columns.insert(0, n)
        s = super().read(where=where, columns=columns, start=start, stop=stop)
        if is_multi_index:
            s.set_index(self.levels, inplace=True)

        s = s.iloc[:, 0]

        # remove the default name
        if s.name == "values":
            s.name = None
        return s


class AppendableMultiSeriesTable(AppendableSeriesTable):
    """support the new appendable table formats"""

    pandas_kind = "series_table"
    table_type = "appendable_multiseries"

    def write(self, obj, **kwargs):
        """we are going to write this as a frame table"""
        name = obj.name or "values"
        newobj, self.levels = self.validate_multiindex(obj)
        assert isinstance(self.levels, list)  # for mypy
        cols = list(self.levels)
        cols.append(name)
        newobj.columns = Index(cols)
        return super().write(obj=newobj, **kwargs)


class GenericTable(AppendableFrameTable):
    """a table that reads/writes the generic pytables table format"""

    pandas_kind = "frame_table"
    table_type = "generic_table"
    ndim = 2
    obj_type = DataFrame
    levels: list[Hashable]

    @property
    def pandas_type(self) -> str:
        return self.pandas_kind

    @property
    def storable(self):
        return getattr(self.group, "table", None) or self.group

    def get_attrs(self) -> None:
        """retrieve our attributes"""
        self.non_index_axes = []
        self.nan_rep = None
        self.levels = []

        self.index_axes = [a for a in self.indexables if a.is_an_indexable]
        self.values_axes = [a for a in self.indexables if not a.is_an_indexable]
        self.data_columns = [a.name for a in self.values_axes]

    @cache_readonly
    def indexables(self):
        """create the indexables from the table description"""
        d = self.description

        # TODO: can we get a typ for this?  AFAICT it is the only place
        #  where we aren't passing one
        # the index column is just a simple index
        md = self.read_metadata("index")
        meta = "category" if md is not None else None
        index_col = GenericIndexCol(
            name="index", axis=0, table=self.table, meta=meta, metadata=md
        )

        _indexables: list[GenericIndexCol | GenericDataIndexableCol] = [index_col]

        for i, n in enumerate(d._v_names):
            assert isinstance(n, str)

            atom = getattr(d, n)
            md = self.read_metadata(n)
            meta = "category" if md is not None else None
            dc = GenericDataIndexableCol(
                name=n,
                pos=i,
                values=[n],
                typ=atom,
                table=self.table,
                meta=meta,
                metadata=md,
            )
            _indexables.append(dc)

        return _indexables

    def write(self, **kwargs):
        raise NotImplementedError("cannot write on a generic table")


class AppendableMultiFrameTable(AppendableFrameTable):
    """a frame with a multi-index"""

    table_type = "appendable_multiframe"
    obj_type = DataFrame
    ndim = 2
    _re_levels = re.compile(r"^level_\d+$")

    @property
    def table_type_short(self) -> str:
        return "appendable_multi"

    def write(self, obj, data_columns=None, **kwargs):
        if data_columns is None:
            data_columns = []
        elif data_columns is True:
            data_columns = obj.columns.tolist()
        obj, self.levels = self.validate_multiindex(obj)
        assert isinstance(self.levels, list)  # for mypy
        for n in self.levels:
            if n not in data_columns:
                data_columns.insert(0, n)
        return super().write(obj=obj, data_columns=data_columns, **kwargs)

    def read(
        self,
        where=None,
        columns=None,
        start: int | None = None,
        stop: int | None = None,
    ):
        df = super().read(where=where, columns=columns, start=start, stop=stop)
        df = df.set_index(self.levels)

        # remove names for 'level_%d'
        df.index = df.index.set_names(
            [None if self._re_levels.search(name) else name for name in df.index.names]
        )

        return df


def _reindex_axis(
    obj: DataFrame, axis: AxisInt, labels: Index, other=None
) -> DataFrame:
    ax = obj._get_axis(axis)
    labels = ensure_index(labels)

    # try not to reindex even if other is provided
    # if it equals our current index
    if other is not None:
        other = ensure_index(other)
    if (other is None or labels.equals(other)) and labels.equals(ax):
        return obj

    labels = ensure_index(labels.unique())
    if other is not None:
        labels = ensure_index(other.unique()).intersection(labels, sort=False)
    if not labels.equals(ax):
        slicer: list[slice | Index] = [slice(None, None)] * obj.ndim
        slicer[axis] = labels
        obj = obj.loc[tuple(slicer)]
    return obj


# tz to/from coercion


def _get_tz(tz: tzinfo) -> str | tzinfo:
    """for a tz-aware type, return an encoded zone"""
    zone = timezones.get_timezone(tz)
    return zone


@overload
def _set_tz(
    values: np.ndarray | Index, tz: str | tzinfo, coerce: bool = False
) -> DatetimeIndex:
    ...


@overload
def _set_tz(values: np.ndarray | Index, tz: None, coerce: bool = False) -> np.ndarray:
    ...


def _set_tz(
    values: np.ndarray | Index, tz: str | tzinfo | None, coerce: bool = False
) -> np.ndarray | DatetimeIndex:
    """
    coerce the values to a DatetimeIndex if tz is set
    preserve the input shape if possible

    Parameters
    ----------
    values : ndarray or Index
    tz : str or tzinfo
    coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
    """
    if isinstance(values, DatetimeIndex):
        # If values is tzaware, the tz gets dropped in the values.ravel()
        #  call below (which returns an ndarray).  So we are only non-lossy
        #  if `tz` matches `values.tz`.
        assert values.tz is None or values.tz == tz

    if tz is not None:
        if isinstance(values, DatetimeIndex):
            name = values.name
            values = values.asi8
        else:
            name = None
            values = values.ravel()

        tz = _ensure_decoded(tz)
        values = DatetimeIndex(values, name=name)
        values = values.tz_localize("UTC").tz_convert(tz)
    elif coerce:
        values = np.asarray(values, dtype="M8[ns]")

    # error: Incompatible return value type (got "Union[ndarray, Index]",
    # expected "Union[ndarray, DatetimeIndex]")
    return values  # type: ignore[return-value]
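

# Illustrative behavior of _set_tz (hypothetical i8 values stored as UTC):
#
#   _set_tz(np.array([0], dtype="i8"), tz="US/Eastern")
#   # -> DatetimeIndex(['1969-12-31 19:00:00-05:00'], tz='US/Eastern')
#
# i.e. stored epoch nanoseconds are localized to UTC and then converted to
# the zone recorded on the node, mirroring how write_array stores tz-aware
# data as UTC i8 plus a ``tz`` attribute.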


def _convert_index(name: str, index: Index, encoding: str, errors: str) -> IndexCol:
    assert isinstance(name, str)

    index_name = index.name
    # error: Argument 1 to "_get_data_and_dtype_name" has incompatible type "Index";
    # expected "Union[ExtensionArray, ndarray]"
    converted, dtype_name = _get_data_and_dtype_name(index)  # type: ignore[arg-type]
    kind = _dtype_to_kind(dtype_name)
    atom = DataIndexableCol._get_atom(converted)

    if (
        (isinstance(index.dtype, np.dtype) and is_integer_dtype(index))
        or needs_i8_conversion(index.dtype)
        or is_bool_dtype(index.dtype)
    ):
        # Includes Index, RangeIndex, DatetimeIndex, TimedeltaIndex, PeriodIndex,
        # in which case "kind" is "integer", "integer", "datetime64",
        # "timedelta64", and "integer", respectively.
        return IndexCol(
            name,
            values=converted,
            kind=kind,
            typ=atom,
            freq=getattr(index, "freq", None),
            tz=getattr(index, "tz", None),
            index_name=index_name,
        )

    if isinstance(index, MultiIndex):
        raise TypeError("MultiIndex not supported here!")

    inferred_type = lib.infer_dtype(index, skipna=False)
    # we won't get inferred_type of "datetime64" or "timedelta64" as these
    # would go through the DatetimeIndex/TimedeltaIndex paths above

    values = np.asarray(index)

    if inferred_type == "date":
        converted = np.asarray([v.toordinal() for v in values], dtype=np.int32)
        return IndexCol(
            name, converted, "date", _tables().Time32Col(), index_name=index_name
        )
    elif inferred_type == "string":
        converted = _convert_string_array(values, encoding, errors)
        itemsize = converted.dtype.itemsize
        return IndexCol(
            name,
            converted,
            "string",
            _tables().StringCol(itemsize),
            index_name=index_name,
        )
    elif inferred_type in ["integer", "floating"]:
        return IndexCol(
            name, values=converted, kind=kind, typ=atom, index_name=index_name
        )
    else:
        assert isinstance(converted, np.ndarray) and converted.dtype == object
        assert kind == "object", kind
        atom = _tables().ObjectAtom()
        return IndexCol(name, converted, kind, atom, index_name=index_name)
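
# Illustrative usage (a sketch; the index and the encoding/errors arguments
# are assumptions, not values taken from callers in this module):
#
#   >>> col = _convert_index("index", Index([1, 2, 3]), "UTF-8", "strict")
#   >>> col.kind, col.values.dtype
#   ('integer', dtype('int64'))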


def _unconvert_index(data, kind: str, encoding: str, errors: str) -> np.ndarray | Index:
    index: Index | np.ndarray

    if kind == "datetime64":
        index = DatetimeIndex(data)
    elif kind == "timedelta64":
        index = TimedeltaIndex(data)
    elif kind == "date":
        try:
            index = np.asarray([date.fromordinal(v) for v in data], dtype=object)
        except ValueError:
            index = np.asarray([date.fromtimestamp(v) for v in data], dtype=object)
    elif kind in ("integer", "float", "bool"):
        index = np.asarray(data)
    elif kind == "string":
        index = _unconvert_string_array(
            data, nan_rep=None, encoding=encoding, errors=errors
        )
    elif kind == "object":
        index = np.asarray(data[0])
    else:  # pragma: no cover
        raise ValueError(f"unrecognized index type {kind}")

    return index
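
# Illustrative usage (a sketch; the stored i8 values are an assumption):
#
#   >>> _unconvert_index(np.array([0]), "datetime64", "UTF-8", "strict")
#   DatetimeIndex(['1970-01-01'], dtype='datetime64[ns]', freq=None)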


def _maybe_convert_for_string_atom(
    name: str,
    bvalues: ArrayLike,
    existing_col,
    min_itemsize,
    nan_rep,
    encoding,
    errors,
    columns: list[str],
):
    if bvalues.dtype != object:
        return bvalues

    bvalues = cast(np.ndarray, bvalues)

    dtype_name = bvalues.dtype.name
    inferred_type = lib.infer_dtype(bvalues, skipna=False)

    if inferred_type == "date":
        raise TypeError("[date] is not implemented as a table column")
    if inferred_type == "datetime":
        # after GH#8260
        # this only would be hit for a multi-timezone dtype which is an error
        raise TypeError(
            "too many timezones in this block, create separate data columns"
        )

    if not (inferred_type == "string" or dtype_name == "object"):
        return bvalues

    mask = isna(bvalues)
    data = bvalues.copy()
    data[mask] = nan_rep

    # see if we have a valid string type
    inferred_type = lib.infer_dtype(data, skipna=False)
    if inferred_type != "string":
        # we cannot serialize this data, so report an exception on a column
        # by column basis

        # expected behaviour:
        # search block for a non-string object column by column
        for i in range(data.shape[0]):
            col = data[i]
            inferred_type = lib.infer_dtype(col, skipna=False)
            if inferred_type != "string":
                error_column_label = columns[i] if len(columns) > i else f"No.{i}"
                raise TypeError(
                    f"Cannot serialize the column [{error_column_label}]\n"
                    f"because its data contents are not [string] but "
                    f"[{inferred_type}] object dtype"
                )

    # itemsize is the maximum length of a string (along any dimension)
    data_converted = _convert_string_array(data, encoding, errors).reshape(data.shape)
    itemsize = data_converted.itemsize

    # specified min_itemsize?
    if isinstance(min_itemsize, dict):
        min_itemsize = int(min_itemsize.get(name) or min_itemsize.get("values") or 0)
    itemsize = max(min_itemsize or 0, itemsize)

    # check for itemsize conflicts with an existing column
    if existing_col is not None:
        eci = existing_col.validate_col(itemsize)
        if eci is not None and eci > itemsize:
            itemsize = eci

    data_converted = data_converted.astype(f"|S{itemsize}", copy=False)
    return data_converted
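
# Illustrative usage (a sketch; the block values, column names, and keyword
# choices below are assumptions, not values taken from callers):
#
#   >>> bv = np.array([["a", "bb"]], dtype=object)  # one column, two rows
#   >>> _maybe_convert_for_string_atom(
#   ...     "values", bv, existing_col=None, min_itemsize={}, nan_rep="nan",
#   ...     encoding="UTF-8", errors="strict", columns=["c"],
#   ... )
#   array([[b'a', b'bb']], dtype='|S2')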


def _convert_string_array(data: np.ndarray, encoding: str, errors: str) -> np.ndarray:
    """
    Take a string-like that is object dtype and coerce to a fixed size string type.

    Parameters
    ----------
    data : np.ndarray[object]
    encoding : str
    errors : str
        Handler for encoding errors.

    Returns
    -------
    np.ndarray[fixed-length-string]
    """
    # encode if needed
    if len(data):
        data = (
            Series(data.ravel(), copy=False)
            .str.encode(encoding, errors)
            ._values.reshape(data.shape)
        )

    # create the sized dtype
    ensured = ensure_object(data.ravel())
    itemsize = max(1, libwriters.max_len_string_array(ensured))

    data = np.asarray(data, dtype=f"S{itemsize}")
    return data
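
# Illustrative usage (a sketch; the input strings are assumptions):
#
#   >>> _convert_string_array(np.array(["a", "bcd"], dtype=object), "UTF-8", "strict")
#   array([b'a', b'bcd'], dtype='|S3')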


def _unconvert_string_array(
    data: np.ndarray, nan_rep, encoding: str, errors: str
) -> np.ndarray:
    """
    Inverse of _convert_string_array.

    Parameters
    ----------
    data : np.ndarray[fixed-length-string]
    nan_rep : the storage repr of NaN
    encoding : str
    errors : str
        Handler for encoding errors.

    Returns
    -------
    np.ndarray[object]
        Decoded data.
    """
    shape = data.shape
    data = np.asarray(data.ravel(), dtype=object)

    if len(data):
        itemsize = libwriters.max_len_string_array(ensure_object(data))
        dtype = f"U{itemsize}"

        if isinstance(data[0], bytes):
            data = Series(data, copy=False).str.decode(encoding, errors=errors)._values
        else:
            data = data.astype(dtype, copy=False).astype(object, copy=False)

    if nan_rep is None:
        nan_rep = "nan"

    libwriters.string_array_replace_from_nan_rep(data, nan_rep)
    return data.reshape(shape)
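
# Illustrative round trip through the two helpers above (a sketch; inputs
# and the nan_rep choice are assumptions):
#
#   >>> packed = _convert_string_array(
#   ...     np.array(["a", "nan"], dtype=object), "UTF-8", "strict"
#   ... )
#   >>> _unconvert_string_array(packed, nan_rep="nan", encoding="UTF-8",
#   ...                         errors="strict")
#   array(['a', nan], dtype=object)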


def _maybe_convert(values: np.ndarray, val_kind: str, encoding: str, errors: str):
    assert isinstance(val_kind, str), type(val_kind)
    if _need_convert(val_kind):
        conv = _get_converter(val_kind, encoding, errors)
        values = conv(values)
    return values


def _get_converter(kind: str, encoding: str, errors: str):
    if kind == "datetime64":
        return lambda x: np.asarray(x, dtype="M8[ns]")
    elif kind == "string":
        return lambda x: _unconvert_string_array(
            x, nan_rep=None, encoding=encoding, errors=errors
        )
    else:  # pragma: no cover
        raise ValueError(f"invalid kind {kind}")


def _need_convert(kind: str) -> bool:
    if kind in ("datetime64", "string"):
        return True
    return False
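
# Illustrative usage of the three converter helpers above (a sketch; the
# input array is an assumption):
#
#   >>> _need_convert("integer")
#   False
#   >>> _maybe_convert(np.array([0]), "datetime64", "UTF-8", "strict")
#   array(['1970-01-01T00:00:00.000000000'], dtype='datetime64[ns]')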


def _maybe_adjust_name(name: str, version: Sequence[int]) -> str:
    """
    Prior to 0.10.1, we named values blocks like ``values_block_0`` and the
    name ``values_0``; adjust the given name if necessary.

    Parameters
    ----------
    name : str
    version : Sequence[int], of length 3

    Returns
    -------
    str
    """
    if isinstance(version, str) or len(version) < 3:
        raise ValueError("Version is incorrect, expected sequence of 3 integers.")

    if version[0] == 0 and version[1] <= 10 and version[2] == 0:
        m = re.search(r"values_block_(\d+)", name)
        if m:
            grp = m.groups()[0]
            name = f"values_{grp}"

    return name
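
# Illustrative usage (deterministic given the logic above; the version
# tuples are arbitrary examples):
#
#   >>> _maybe_adjust_name("values_block_0", (0, 10, 0))
#   'values_0'
#   >>> _maybe_adjust_name("values_block_0", (1, 5, 3))
#   'values_block_0'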


def _dtype_to_kind(dtype_str: str) -> str:
    """
    Find the "kind" string describing the given dtype name.
    """
    dtype_str = _ensure_decoded(dtype_str)

    if dtype_str.startswith("string") or dtype_str.startswith("bytes"):
        kind = "string"
    elif dtype_str.startswith("float"):
        kind = "float"
    elif dtype_str.startswith("complex"):
        kind = "complex"
    elif dtype_str.startswith("int") or dtype_str.startswith("uint"):
        kind = "integer"
    elif dtype_str.startswith("datetime64"):
        kind = "datetime64"
    elif dtype_str.startswith("timedelta"):
        kind = "timedelta64"
    elif dtype_str.startswith("bool"):
        kind = "bool"
    elif dtype_str.startswith("category"):
        kind = "category"
    elif dtype_str.startswith("period"):
        # We store the `freq` attr so we can restore from integers
        kind = "integer"
    elif dtype_str == "object":
        kind = "object"
    else:
        raise ValueError(f"cannot interpret dtype of [{dtype_str}]")

    return kind
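
# Illustrative usage (deterministic given the prefix checks above; the dtype
# names are arbitrary examples):
#
#   >>> _dtype_to_kind("datetime64[ns, UTC]")
#   'datetime64'
#   >>> _dtype_to_kind("uint8")
#   'integer'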


def _get_data_and_dtype_name(data: ArrayLike):
    """
    Convert the passed data into a storable form and a dtype string.
    """
    if isinstance(data, Categorical):
        data = data.codes

    # For datetime64tz we need to drop the TZ in tests TODO: why?
    dtype_name = data.dtype.name.split("[")[0]

    if data.dtype.kind in ["m", "M"]:
        data = np.asarray(data.view("i8"))
        # TODO: we used to reshape for the dt64tz case, but no longer
        # doing that doesn't seem to break anything. why?

    elif isinstance(data, PeriodIndex):
        data = data.asi8

    data = np.asarray(data)
    return data, dtype_name
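
# Illustrative usage (a sketch; the input array is an assumption):
#
#   >>> arr, name = _get_data_and_dtype_name(
#   ...     np.array(["2016-01-01"], dtype="M8[ns]")
#   ... )
#   >>> arr.dtype, name
#   (dtype('int64'), 'datetime64')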


class Selection:
    """
    Carries out a selection operation on a tables.Table object.

    Parameters
    ----------
    table : a Table object
    where : list of Terms (or convertible to)
    start, stop : indices to start and/or stop selection
    """

    def __init__(
        self,
        table: Table,
        where=None,
        start: int | None = None,
        stop: int | None = None,
    ) -> None:
        self.table = table
        self.where = where
        self.start = start
        self.stop = stop
        self.condition = None
        self.filter = None
        self.terms = None
        self.coordinates = None

        if is_list_like(where):
            # see if we have a passed coordinate like
            with suppress(ValueError):
                inferred = lib.infer_dtype(where, skipna=False)
                if inferred in ("integer", "boolean"):
                    where = np.asarray(where)
                    if where.dtype == np.bool_:
                        start, stop = self.start, self.stop
                        if start is None:
                            start = 0
                        if stop is None:
                            stop = self.table.nrows
                        self.coordinates = np.arange(start, stop)[where]
                    elif issubclass(where.dtype.type, np.integer):
                        if (self.start is not None and (where < self.start).any()) or (
                            self.stop is not None and (where >= self.stop).any()
                        ):
                            raise ValueError(
                                "where must have index locations >= start and < stop"
                            )
                        self.coordinates = where

        if self.coordinates is None:
            self.terms = self.generate(where)

            # create the numexpr & the filter
            if self.terms is not None:
                self.condition, self.filter = self.terms.evaluate()

    def generate(self, where):
        """where can be a dict, list, tuple, or string"""
        if where is None:
            return None

        q = self.table.queryables()
        try:
            return PyTablesExpr(where, queryables=q, encoding=self.table.encoding)
        except NameError as err:
            # raise a nice message, suggesting that the user should use
            # data_columns
            qkeys = ",".join(q.keys())
            msg = dedent(
                f"""\
                The passed where expression: {where}
                            contains an invalid variable reference
                            all of the variable references must be a reference to
                            an axis (e.g. 'index' or 'columns'), or a data_column
                            The currently defined references are: {qkeys}
                """
            )
            raise ValueError(msg) from err

    def select(self):
        """
        generate the selection
        """
        if self.condition is not None:
            return self.table.table.read_where(
                self.condition.format(), start=self.start, stop=self.stop
            )
        elif self.coordinates is not None:
            return self.table.table.read_coordinates(self.coordinates)
        return self.table.table.read(start=self.start, stop=self.stop)

    def select_coords(self):
        """
        generate the selection
        """
        start, stop = self.start, self.stop
        nrows = self.table.nrows
        if start is None:
            start = 0
        elif start < 0:
            start += nrows
        if stop is None:
            stop = nrows
        elif stop < 0:
            stop += nrows

        if self.condition is not None:
            return self.table.table.get_where_list(
                self.condition.format(), start=start, stop=stop, sort=True
            )
        elif self.coordinates is not None:
            return self.coordinates

        return np.arange(start, stop)
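

# Illustrative usage of Selection (a sketch; the file name, the key "df",
# and the data column "A" are assumptions -- the key must hold a table
# written with format="table", and ``import pandas as pd`` is assumed):
#
#   >>> with pd.HDFStore("demo.h5") as store:
#   ...     storer = store.get_storer("df")
#   ...     sel = Selection(storer, where="A > 0")
#   ...     coords = sel.select_coords()  # row numbers matching the condition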