- # Copyright (c) 2012, Lambda Foundry, Inc.
- # See LICENSE for the license
- from collections import defaultdict
- from csv import (
- QUOTE_MINIMAL,
- QUOTE_NONE,
- QUOTE_NONNUMERIC,
- )
- import sys
- import time
- import warnings
- from pandas.errors import ParserError
- from pandas.util._exceptions import find_stack_level
- from pandas import StringDtype
- from pandas.core.arrays import (
- ArrowExtensionArray,
- BooleanArray,
- FloatingArray,
- IntegerArray,
- )
- cimport cython
- from cpython.bytes cimport PyBytes_AsString
- from cpython.exc cimport (
- PyErr_Fetch,
- PyErr_Occurred,
- )
- from cpython.object cimport PyObject
- from cpython.ref cimport (
- Py_INCREF,
- Py_XDECREF,
- )
- from cpython.unicode cimport (
- PyUnicode_AsUTF8String,
- PyUnicode_Decode,
- PyUnicode_DecodeUTF8,
- )
- from cython cimport Py_ssize_t
- from libc.stdlib cimport free
- from libc.string cimport (
- strcasecmp,
- strlen,
- strncpy,
- )
- cdef extern from "Python.h":
- # TODO(cython3): get this from cpython.unicode
- object PyUnicode_FromString(char *v)
- import numpy as np
- cimport numpy as cnp
- from numpy cimport (
- float64_t,
- int64_t,
- ndarray,
- uint8_t,
- uint64_t,
- )
- cnp.import_array()
- from pandas._libs cimport util
- from pandas._libs.util cimport (
- INT64_MAX,
- INT64_MIN,
- UINT64_MAX,
- )
- from pandas._libs import lib
- from pandas._libs.khash cimport (
- kh_destroy_float64,
- kh_destroy_str,
- kh_destroy_str_starts,
- kh_destroy_strbox,
- kh_exist_str,
- kh_float64_t,
- kh_get_float64,
- kh_get_str,
- kh_get_str_starts_item,
- kh_get_strbox,
- kh_init_float64,
- kh_init_str,
- kh_init_str_starts,
- kh_init_strbox,
- kh_put_float64,
- kh_put_str,
- kh_put_str_starts_item,
- kh_put_strbox,
- kh_resize_float64,
- kh_resize_str_starts,
- kh_str_starts_t,
- kh_str_t,
- kh_strbox_t,
- khiter_t,
- )
- from pandas.errors import (
- EmptyDataError,
- ParserError,
- ParserWarning,
- )
- from pandas.core.dtypes.common import (
- is_bool_dtype,
- is_datetime64_dtype,
- is_extension_array_dtype,
- is_float_dtype,
- is_integer_dtype,
- is_object_dtype,
- )
- from pandas.core.dtypes.dtypes import CategoricalDtype
- from pandas.core.dtypes.inference import is_dict_like
- from pandas.core.arrays.boolean import BooleanDtype
- cdef:
- float64_t INF = <float64_t>np.inf
- float64_t NEGINF = -INF
- int64_t DEFAULT_CHUNKSIZE = 256 * 1024
- cdef extern from "headers/portable.h":
- # I *think* this is here so that strcasecmp is defined on Windows
- # so we don't get
- # `parsers.obj : error LNK2001: unresolved external symbol strcasecmp`
- # in Appveyor.
- # In a sane world, the `from libc.string cimport` above would fail
- # loudly.
- pass
- cdef extern from "parser/tokenizer.h":
- ctypedef enum ParserState:
- START_RECORD
- START_FIELD
- ESCAPED_CHAR
- IN_FIELD
- IN_QUOTED_FIELD
- ESCAPE_IN_QUOTED_FIELD
- QUOTE_IN_QUOTED_FIELD
- EAT_CRNL
- EAT_CRNL_NOP
- EAT_WHITESPACE
- EAT_COMMENT
- EAT_LINE_COMMENT
- WHITESPACE_LINE
- SKIP_LINE
- FINISHED
- enum: ERROR_OVERFLOW
- ctypedef enum BadLineHandleMethod:
- ERROR,
- WARN,
- SKIP
- ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
- int *status, const char *encoding_errors)
- ctypedef int (*io_cleanup)(void *src)
- ctypedef struct parser_t:
- void *source
- io_callback cb_io
- io_cleanup cb_cleanup
- int64_t chunksize # Number of bytes to prepare for each chunk
- char *data # pointer to data to be processed
- int64_t datalen # amount of data available
- int64_t datapos
- # where to write out tokenized data
- char *stream
- uint64_t stream_len
- uint64_t stream_cap
- # Store words in (potentially ragged) matrix for now, hmm
- char **words
- int64_t *word_starts # where we are in the stream
- uint64_t words_len
- uint64_t words_cap
- uint64_t max_words_cap # maximum word cap encountered
- char *pword_start # pointer to stream start of current field
- int64_t word_start # position start of current field
- int64_t *line_start # position in words for start of line
- int64_t *line_fields # Number of fields in each line
- uint64_t lines # Number of lines observed
- uint64_t file_lines # Number of lines observed (with bad/skipped)
- uint64_t lines_cap # Vector capacity
- # Tokenizing stuff
- ParserState state
- int doublequote # is " represented by ""? */
- char delimiter # field separator */
- int delim_whitespace # consume tabs / spaces instead
- char quotechar # quote character */
- char escapechar # escape character */
- char lineterminator
- int skipinitialspace # ignore spaces following delimiter? */
- int quoting # style of quoting to write */
- char commentchar
- int allow_embedded_newline
- int usecols
- Py_ssize_t expected_fields
- BadLineHandleMethod on_bad_lines
- # floating point options
- char decimal
- char sci
- # thousands separator (comma, period)
- char thousands
- int header # Boolean: 1: has header, 0: no header
- int64_t header_start # header row start
- uint64_t header_end # header row end
- void *skipset
- PyObject *skipfunc
- int64_t skip_first_N_rows
- int64_t skipfooter
- # pick one, depending on whether the converter requires GIL
- float64_t (*double_converter)(const char *, char **,
- char, char, char,
- int, int *, int *) nogil
- # error handling
- char *warn_msg
- char *error_msg
- int64_t skip_empty_lines
- ctypedef struct coliter_t:
- char **words
- int64_t *line_start
- int64_t col
- ctypedef struct uint_state:
- int seen_sint
- int seen_uint
- int seen_null
- void uint_state_init(uint_state *self)
- int uint64_conflict(uint_state *self)
- void coliter_setup(coliter_t *it, parser_t *parser,
- int64_t i, int64_t start) nogil
- void COLITER_NEXT(coliter_t, const char *) nogil
- parser_t* parser_new()
- int parser_init(parser_t *self) nogil
- void parser_free(parser_t *self) nogil
- void parser_del(parser_t *self) nogil
- int parser_add_skiprow(parser_t *self, int64_t row)
- int parser_set_skipfirstnrows(parser_t *self, int64_t nrows)
- void parser_set_default_options(parser_t *self)
- int parser_consume_rows(parser_t *self, size_t nrows)
- int parser_trim_buffers(parser_t *self)
- int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
- int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil
- int64_t str_to_int64(char *p_item, int64_t int_min,
- int64_t int_max, int *error, char tsep) nogil
- uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
- uint64_t uint_max, int *error, char tsep) nogil
- float64_t xstrtod(const char *p, char **q, char decimal,
- char sci, char tsep, int skip_trailing,
- int *error, int *maybe_int) nogil
- float64_t precise_xstrtod(const char *p, char **q, char decimal,
- char sci, char tsep, int skip_trailing,
- int *error, int *maybe_int) nogil
- float64_t round_trip(const char *p, char **q, char decimal,
- char sci, char tsep, int skip_trailing,
- int *error, int *maybe_int) nogil
- int to_boolean(const char *item, uint8_t *val) nogil
- cdef extern from "parser/io.h":
- void *new_rd_source(object obj) except NULL
- int del_rd_source(void *src)
- void* buffer_rd_bytes(void *source, size_t nbytes,
- size_t *bytes_read, int *status, const char *encoding_errors)
- cdef class TextReader:
- """
- # source: StringIO or file object
- ..versionchange:: 1.2.0
- removed 'compression', 'memory_map', and 'encoding' argument.
- These arguments are outsourced to CParserWrapper.
- 'source' has to be a file handle.
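- Examples
- --------
- A minimal sketch (illustrative only; pandas.read_csv is the supported
- entry point). It assumes a bytes source, which the C reader consumes
- directly:
- >>> import io
- >>> reader = TextReader(io.BytesIO(b"a,b\n1,2\n3,4\n"), header=0)
- >>> sorted(reader.read())  # dict of column index -> array
- [0, 1]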
- """
- cdef:
- parser_t *parser
- object na_fvalues
- object true_values, false_values
- object handle
- object orig_header
- bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
- bint allow_leading_cols
- uint64_t parser_start # this is modified after __init__
- list clocks
- const char *encoding_errors
- kh_str_starts_t *false_set
- kh_str_starts_t *true_set
- int64_t buffer_lines, skipfooter
- list dtype_cast_order # list[np.dtype]
- list names # can be None
- set noconvert # set[int]
- cdef public:
- int64_t leading_cols, table_width
- object delimiter # bytes or str
- object converters
- object na_values
- list header # list[list[non-negative integers]]
- object index_col
- object skiprows
- object dtype
- object usecols
- set unnamed_cols # set[str]
- str dtype_backend
- def __cinit__(self, source,
- delimiter=b",", # bytes | str
- header=0,
- int64_t header_start=0,
- uint64_t header_end=0,
- index_col=None,
- names=None,
- tokenize_chunksize=DEFAULT_CHUNKSIZE,
- bint delim_whitespace=False,
- converters=None,
- bint skipinitialspace=False,
- escapechar=None, # bytes | str
- bint doublequote=True,
- quotechar=b'"',
- quoting=0, # int
- lineterminator=None, # bytes | str
- comment=None,
- decimal=b".", # bytes | str
- thousands=None, # bytes | str
- dtype=None,
- usecols=None,
- on_bad_lines=ERROR,
- bint na_filter=True,
- na_values=None,
- na_fvalues=None,
- bint keep_default_na=True,
- true_values=None,
- false_values=None,
- bint allow_leading_cols=True,
- skiprows=None,
- skipfooter=0, # int64_t
- bint verbose=False,
- float_precision=None,
- bint skip_blank_lines=True,
- encoding_errors=b"strict",
- dtype_backend="numpy"):
- # set encoding for native Python and C library
- if isinstance(encoding_errors, str):
- encoding_errors = encoding_errors.encode("utf-8")
- elif encoding_errors is None:
- encoding_errors = b"strict"
- Py_INCREF(encoding_errors)
- self.encoding_errors = PyBytes_AsString(encoding_errors)
- self.parser = parser_new()
- self.parser.chunksize = tokenize_chunksize
- # For timekeeping
- self.clocks = []
- self.parser.usecols = (usecols is not None)
- self._setup_parser_source(source)
- parser_set_default_options(self.parser)
- parser_init(self.parser)
- if delim_whitespace:
- self.parser.delim_whitespace = delim_whitespace
- else:
- if len(delimiter) > 1:
- raise ValueError("only length-1 separators excluded right now")
- self.parser.delimiter = <char>ord(delimiter)
- # ----------------------------------------
- # parser options
- self.parser.doublequote = doublequote
- self.parser.skipinitialspace = skipinitialspace
- self.parser.skip_empty_lines = skip_blank_lines
- if lineterminator is not None:
- if len(lineterminator) != 1:
- raise ValueError("Only length-1 line terminators supported")
- self.parser.lineterminator = <char>ord(lineterminator)
- if len(decimal) != 1:
- raise ValueError("Only length-1 decimal markers supported")
- self.parser.decimal = <char>ord(decimal)
- if thousands is not None:
- if len(thousands) != 1:
- raise ValueError("Only length-1 thousands markers supported")
- self.parser.thousands = <char>ord(thousands)
- if escapechar is not None:
- if len(escapechar) != 1:
- raise ValueError("Only length-1 escapes supported")
- self.parser.escapechar = <char>ord(escapechar)
- self._set_quoting(quotechar, quoting)
- dtype_order = ["int64", "float64", "bool", "object"]
- if quoting == QUOTE_NONNUMERIC:
- # consistent with csv module semantics, cast all to float
- dtype_order = dtype_order[1:]
- self.dtype_cast_order = [np.dtype(x) for x in dtype_order]
- if comment is not None:
- if len(comment) > 1:
- raise ValueError("Only length-1 comment characters supported")
- self.parser.commentchar = <char>ord(comment)
- self.parser.on_bad_lines = on_bad_lines
- self.skiprows = skiprows
- if skiprows is not None:
- self._make_skiprow_set()
- self.skipfooter = skipfooter
- if usecols is not None:
- self.has_usecols = 1
- # GH-20558, validate usecols at higher level and only pass clean
- # usecols into TextReader.
- self.usecols = usecols
- if skipfooter > 0:
- self.parser.on_bad_lines = SKIP
- self.delimiter = delimiter
- self.na_values = na_values
- if na_fvalues is None:
- na_fvalues = set()
- self.na_fvalues = na_fvalues
- self.true_values = _maybe_encode(true_values) + _true_values
- self.false_values = _maybe_encode(false_values) + _false_values
- self.true_set = kset_from_list(self.true_values)
- self.false_set = kset_from_list(self.false_values)
- self.keep_default_na = keep_default_na
- self.converters = converters
- self.na_filter = na_filter
- self.verbose = verbose
- if float_precision == "round_trip":
- # see gh-15140
- self.parser.double_converter = round_trip
- elif float_precision == "legacy":
- self.parser.double_converter = xstrtod
- elif float_precision == "high" or float_precision is None:
- self.parser.double_converter = precise_xstrtod
- else:
- raise ValueError(f"Unrecognized float_precision option: "
- f"{float_precision}")
- # Caller is responsible for ensuring we have one of
- # - None
- # - DtypeObj
- # - dict[Any, DtypeObj]
- self.dtype = dtype
- self.dtype_backend = dtype_backend
- self.noconvert = set()
- self.index_col = index_col
- # ----------------------------------------
- # header stuff
- self.allow_leading_cols = allow_leading_cols
- self.leading_cols = 0 # updated in _get_header
- # TODO: no header vs. header is not the first row
- self.has_mi_columns = 0
- self.orig_header = header
- if header is None:
- # sentinel value
- self.parser.header_start = -1
- self.parser.header_end = -1
- self.parser.header = -1
- self.parser_start = 0
- prelim_header = []
- else:
- if isinstance(header, list):
- if len(header) > 1:
- # need to artificially skip the final line
- # which is still a header line
- header = list(header)
- header.append(header[-1] + 1)
- self.parser.header_end = header[-1]
- self.has_mi_columns = 1
- else:
- self.parser.header_end = header[0]
- self.parser_start = header[-1] + 1
- self.parser.header_start = header[0]
- self.parser.header = header[0]
- prelim_header = header
- else:
- self.parser.header_start = header
- self.parser.header_end = header
- self.parser_start = header + 1
- self.parser.header = header
- prelim_header = [header]
- self.names = names
- header, table_width, unnamed_cols = self._get_header(prelim_header)
- # header, table_width, and unnamed_cols are set here, never changed
- self.header = header
- self.table_width = table_width
- self.unnamed_cols = unnamed_cols
- if not self.table_width:
- raise EmptyDataError("No columns to parse from file")
- # Compute buffer_lines as function of table width.
- heuristic = 2**20 // self.table_width
- self.buffer_lines = 1
- while self.buffer_lines * 2 < heuristic:
- self.buffer_lines *= 2
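- # Illustrative arithmetic: with table_width == 10 the heuristic is
- # 2**20 // 10 == 104857, and the loop above leaves buffer_lines at
- # 65536, the largest power of two below the heuristic.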
- def __init__(self, *args, **kwargs):
- pass
- def __dealloc__(self):
- _close(self)
- parser_del(self.parser)
- def close(self):
- _close(self)
- def _set_quoting(self, quote_char: str | bytes | None, quoting: int):
- if not isinstance(quoting, int):
- raise TypeError('"quoting" must be an integer')
- if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
- raise TypeError('bad "quoting" value')
- if not isinstance(quote_char, (str, bytes)) and quote_char is not None:
- dtype = type(quote_char).__name__
- raise TypeError(f'"quotechar" must be string, not {dtype}')
- if quote_char is None or quote_char == "":
- if quoting != QUOTE_NONE:
- raise TypeError("quotechar must be set if quoting enabled")
- self.parser.quoting = quoting
- self.parser.quotechar = -1
- elif len(quote_char) > 1: # 0-len case handled earlier
- raise TypeError('"quotechar" must be a 1-character string')
- else:
- self.parser.quoting = quoting
- self.parser.quotechar = <char>ord(quote_char)
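- # Illustrative: quoting follows the csv module constants
- # (QUOTE_MINIMAL == 0 through QUOTE_NONE == 3); e.g. quote_char=None
- # is only valid together with quoting=QUOTE_NONE, as enforced above.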
- cdef _make_skiprow_set(self):
- if util.is_integer_object(self.skiprows):
- parser_set_skipfirstnrows(self.parser, self.skiprows)
- elif not callable(self.skiprows):
- for i in self.skiprows:
- parser_add_skiprow(self.parser, i)
- else:
- self.parser.skipfunc = <PyObject *>self.skiprows
- cdef _setup_parser_source(self, source):
- cdef:
- void *ptr
- ptr = new_rd_source(source)
- self.parser.source = ptr
- self.parser.cb_io = &buffer_rd_bytes
- self.parser.cb_cleanup = &del_rd_source
- cdef _get_header(self, list prelim_header):
- # header is now a list of lists, so field_count should use header[0]
- #
- # modifies:
- # self.parser attributes
- # self.parser_start
- # self.leading_cols
- cdef:
- Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
- char *word
- str name
- uint64_t hr, data_line = 0
- list header = []
- set unnamed_cols = set()
- if self.parser.header_start >= 0:
- # Header is in the file
- for level, hr in enumerate(prelim_header):
- this_header = []
- if self.parser.lines < hr + 1:
- self._tokenize_rows(hr + 2)
- if self.parser.lines == 0:
- field_count = 0
- start = self.parser.line_start[0]
- # e.g., if header=3 and file only has 2 lines
- elif (self.parser.lines < hr + 1
- and not isinstance(self.orig_header, list)) or (
- self.parser.lines < hr):
- msg = self.orig_header
- if isinstance(msg, list):
- joined = ",".join(str(m) for m in msg)
- msg = f"[{joined}], len of {len(msg)},"
- raise ParserError(
- f"Passed header={msg} but only "
- f"{self.parser.lines} lines in file")
- else:
- field_count = self.parser.line_fields[hr]
- start = self.parser.line_start[hr]
- unnamed_count = 0
- unnamed_col_indices = []
- for i in range(field_count):
- word = self.parser.words[start + i]
- name = PyUnicode_DecodeUTF8(word, strlen(word),
- self.encoding_errors)
- if name == "":
- if self.has_mi_columns:
- name = f"Unnamed: {i}_level_{level}"
- else:
- name = f"Unnamed: {i}"
- unnamed_count += 1
- unnamed_col_indices.append(i)
- this_header.append(name)
- if not self.has_mi_columns:
- # Ensure that regular columns are used before unnamed ones
- # to keep given names and mangle unnamed columns
- col_loop_order = [i for i in range(len(this_header))
- if i not in unnamed_col_indices
- ] + unnamed_col_indices
- counts = {}
- for i in col_loop_order:
- col = this_header[i]
- old_col = col
- cur_count = counts.get(col, 0)
- if cur_count > 0:
- while cur_count > 0:
- counts[old_col] = cur_count + 1
- col = f"{old_col}.{cur_count}"
- if col in this_header:
- cur_count += 1
- else:
- cur_count = counts.get(col, 0)
- if (
- self.dtype is not None
- and is_dict_like(self.dtype)
- and self.dtype.get(old_col) is not None
- and self.dtype.get(col) is None
- ):
- self.dtype.update({col: self.dtype.get(old_col)})
- this_header[i] = col
- counts[col] = cur_count + 1
- if self.has_mi_columns:
- # If we have grabbed an extra line, but it's not in our
- # format, save in the buffer, and create a blank extra
- # line for the rest of the parsing code.
- if hr == prelim_header[-1]:
- lc = len(this_header)
- ic = (len(self.index_col) if self.index_col
- is not None else 0)
- # if wrong number of blanks or no index, not our format
- if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0:
- hr -= 1
- self.parser_start -= 1
- this_header = [None] * lc
- data_line = hr + 1
- header.append(this_header)
- unnamed_cols.update({this_header[i] for i in unnamed_col_indices})
- if self.names is not None:
- header = [self.names]
- elif self.names is not None:
- # Names passed
- if self.parser.lines < 1:
- if not self.has_usecols:
- self.parser.expected_fields = len(self.names)
- self._tokenize_rows(1)
- header = [self.names]
- if self.parser.lines < 1:
- field_count = len(header[0])
- else:
- field_count = self.parser.line_fields[data_line]
- # Enforce this unless usecols
- if not self.has_usecols:
- self.parser.expected_fields = max(field_count, len(self.names))
- else:
- # No header passed nor to be found in the file
- if self.parser.lines < 1:
- self._tokenize_rows(1)
- return None, self.parser.line_fields[0], unnamed_cols
- # Corner case, not enough lines in the file
- if self.parser.lines < data_line + 1:
- field_count = len(header[0])
- else:
- field_count = self.parser.line_fields[data_line]
- # #2981
- if self.names is not None:
- field_count = max(field_count, len(self.names))
- passed_count = len(header[0])
- if (self.has_usecols and self.allow_leading_cols and
- not callable(self.usecols)):
- nuse = len(self.usecols)
- if nuse == passed_count:
- self.leading_cols = 0
- elif self.names is None and nuse < passed_count:
- self.leading_cols = field_count - passed_count
- elif passed_count != field_count:
- raise ValueError("Number of passed names did not match number of "
- "header fields in the file")
- # oh boy, #2442, #2981
- elif self.allow_leading_cols and passed_count < field_count:
- self.leading_cols = field_count - passed_count
- return header, field_count, unnamed_cols
- def read(self, rows: int | None = None) -> dict[int, "ArrayLike"]:
- """
- rows=None --> read all rows
- """
- # Don't care about memory usage
- columns = self._read_rows(rows, 1)
- return columns
- def read_low_memory(self, rows: int | None) -> list[dict[int, "ArrayLike"]]:
- """
- rows=None --> read all rows
- """
- # Conserve intermediate space
- # Caller is responsible for concatenating chunks,
- # see c_parser_wrapper._concatenate_chunks
- cdef:
- size_t rows_read = 0
- list chunks = []
- if rows is None:
- while True:
- try:
- chunk = self._read_rows(self.buffer_lines, 0)
- if len(chunk) == 0:
- break
- except StopIteration:
- break
- else:
- chunks.append(chunk)
- else:
- while rows_read < rows:
- try:
- crows = min(self.buffer_lines, rows - rows_read)
- chunk = self._read_rows(crows, 0)
- if len(chunk) == 0:
- break
- rows_read += len(list(chunk.values())[0])
- except StopIteration:
- break
- else:
- chunks.append(chunk)
- parser_trim_buffers(self.parser)
- if len(chunks) == 0:
- raise StopIteration
- return chunks
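- # Illustrative sketch of how a caller stitches the chunks back together
- # (names assumed; the real logic lives in
- # c_parser_wrapper._concatenate_chunks):
- # chunks = reader.read_low_memory(None)
- # col0 = np.concatenate([chunk[0] for chunk in chunks])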
- cdef _tokenize_rows(self, size_t nrows):
- cdef:
- int status
- with nogil:
- status = tokenize_nrows(self.parser, nrows, self.encoding_errors)
- self._check_tokenize_status(status)
- cdef _check_tokenize_status(self, int status):
- if self.parser.warn_msg != NULL:
- print(PyUnicode_DecodeUTF8(
- self.parser.warn_msg, strlen(self.parser.warn_msg),
- self.encoding_errors), file=sys.stderr)
- free(self.parser.warn_msg)
- self.parser.warn_msg = NULL
- if status < 0:
- raise_parser_error("Error tokenizing data", self.parser)
- # -> dict[int, "ArrayLike"]
- cdef _read_rows(self, rows, bint trim):
- cdef:
- int64_t buffered_lines
- int64_t irows
- self._start_clock()
- if rows is not None:
- irows = rows
- buffered_lines = self.parser.lines - self.parser_start
- if buffered_lines < irows:
- self._tokenize_rows(irows - buffered_lines)
- if self.skipfooter > 0:
- raise ValueError("skipfooter can only be used to read "
- "the whole file")
- else:
- with nogil:
- status = tokenize_all_rows(self.parser, self.encoding_errors)
- self._check_tokenize_status(status)
- if self.parser_start >= self.parser.lines:
- raise StopIteration
- self._end_clock("Tokenization")
- self._start_clock()
- columns = self._convert_column_data(rows)
- self._end_clock("Type conversion")
- self._start_clock()
- if len(columns) > 0:
- rows_read = len(list(columns.values())[0])
- # trim
- parser_consume_rows(self.parser, rows_read)
- if trim:
- parser_trim_buffers(self.parser)
- self.parser_start -= rows_read
- self._end_clock("Parser memory cleanup")
- return columns
- cdef _start_clock(self):
- self.clocks.append(time.time())
- cdef _end_clock(self, str what):
- if self.verbose:
- elapsed = time.time() - self.clocks.pop(-1)
- print(f"{what} took: {elapsed * 1000:.2f} ms")
- def set_noconvert(self, i: int) -> None:
- self.noconvert.add(i)
- def remove_noconvert(self, i: int) -> None:
- self.noconvert.remove(i)
- def _convert_column_data(self, rows: int | None) -> dict[int, "ArrayLike"]:
- cdef:
- int64_t i
- int nused
- kh_str_starts_t *na_hashset = NULL
- int64_t start, end
- object name, na_flist, col_dtype = None
- bint na_filter = 0
- int64_t num_cols
- dict results
- start = self.parser_start
- if rows is None:
- end = self.parser.lines
- else:
- end = min(start + rows, self.parser.lines)
- num_cols = -1
- # Py_ssize_t cast prevents build warning
- for i in range(<Py_ssize_t>self.parser.lines):
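- # branchless equivalent of: num_cols = max(num_cols, line_fields[i])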
- num_cols = (num_cols < self.parser.line_fields[i]) * \
- self.parser.line_fields[i] + \
- (num_cols >= self.parser.line_fields[i]) * num_cols
- usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols
- names_larger_num_cols = (self.names and
- len(self.names) - self.leading_cols > num_cols)
- if self.table_width - self.leading_cols > num_cols:
- if (usecols_not_callable_and_exists
- and self.table_width - self.leading_cols < len(self.usecols)
- or names_larger_num_cols):
- raise ParserError(f"Too many columns specified: expected "
- f"{self.table_width - self.leading_cols} "
- f"and found {num_cols}")
- if (usecols_not_callable_and_exists and
- all(isinstance(u, int) for u in self.usecols)):
- missing_usecols = [col for col in self.usecols if col >= num_cols]
- if missing_usecols:
- raise ParserError(
- "Defining usecols without of bounds indices is not allowed. "
- f"{missing_usecols} are out of bounds.",
- )
- results = {}
- nused = 0
- is_default_dict_dtype = isinstance(self.dtype, defaultdict)
- for i in range(self.table_width):
- if i < self.leading_cols:
- # Pass through leading columns always
- name = i
- elif (self.usecols and not callable(self.usecols) and
- nused == len(self.usecols)):
- # Once we've gathered all requested columns, stop. GH5766
- break
- else:
- name = self._get_column_name(i, nused)
- usecols = set()
- if callable(self.usecols):
- if self.usecols(name):
- usecols = {i}
- else:
- usecols = self.usecols
- if self.has_usecols and not (i in usecols or
- name in usecols):
- continue
- nused += 1
- conv = self._get_converter(i, name)
- col_dtype = None
- if self.dtype is not None:
- if isinstance(self.dtype, dict):
- if name in self.dtype:
- col_dtype = self.dtype[name]
- elif i in self.dtype:
- col_dtype = self.dtype[i]
- elif is_default_dict_dtype:
- col_dtype = self.dtype[name]
- else:
- if self.dtype.names:
- # structured array
- col_dtype = np.dtype(self.dtype.descr[i][1])
- else:
- col_dtype = self.dtype
- if conv:
- if col_dtype is not None:
- warnings.warn((f"Both a converter and dtype were specified "
- f"for column {name} - only the converter will "
- f"be used."), ParserWarning,
- stacklevel=find_stack_level())
- results[i] = _apply_converter(conv, self.parser, i, start, end)
- continue
- # Collect the list of NaN values associated with the column.
- # If we aren't supposed to do that, or none are collected,
- # we set `na_filter` to `0` (`1` otherwise).
- na_flist = set()
- if self.na_filter:
- na_list, na_flist = self._get_na_list(i, name)
- if na_list is None:
- na_filter = 0
- else:
- na_filter = 1
- na_hashset = kset_from_list(na_list)
- else:
- na_filter = 0
- # Attempt to parse tokens and infer dtype of the column.
- # Should return as the desired dtype (inferred or specified).
- try:
- col_res, na_count = self._convert_tokens(
- i, start, end, name, na_filter, na_hashset,
- na_flist, col_dtype)
- finally:
- # gh-21353
- #
- # Cleanup the NaN hash that we generated
- # to avoid memory leaks.
- if na_filter:
- self._free_na_set(na_hashset)
- # don't try to upcast EAs
- if (
- na_count > 0 and not is_extension_array_dtype(col_dtype)
- or self.dtype_backend != "numpy"
- ):
- use_dtype_backend = self.dtype_backend != "numpy" and col_dtype is None
- col_res = _maybe_upcast(
- col_res,
- use_dtype_backend=use_dtype_backend,
- dtype_backend=self.dtype_backend,
- )
- if col_res is None:
- raise ParserError(f"Unable to parse column {i}")
- results[i] = col_res
- self.parser_start += end - start
- return results
- # -> tuple["ArrayLike", int]:
- cdef _convert_tokens(self, Py_ssize_t i, int64_t start,
- int64_t end, object name, bint na_filter,
- kh_str_starts_t *na_hashset,
- object na_flist, object col_dtype):
- if col_dtype is not None:
- col_res, na_count = self._convert_with_dtype(
- col_dtype, i, start, end, na_filter,
- 1, na_hashset, na_flist)
- # Fall back on the parse (e.g. we requested int dtype,
- # but it's actually a float).
- if col_res is not None:
- return col_res, na_count
- if i in self.noconvert:
- return self._string_convert(i, start, end, na_filter, na_hashset)
- else:
- col_res = None
- for dt in self.dtype_cast_order:
- try:
- col_res, na_count = self._convert_with_dtype(
- dt, i, start, end, na_filter, 0, na_hashset, na_flist)
- except ValueError:
- # This error is raised from trying to convert to uint64,
- # and we discover that we cannot convert to any numerical
- # dtype successfully. As a result, we leave the data
- # column AS IS with object dtype.
- col_res, na_count = self._convert_with_dtype(
- np.dtype("object"), i, start, end, 0,
- 0, na_hashset, na_flist)
- except OverflowError:
- col_res, na_count = self._convert_with_dtype(
- np.dtype("object"), i, start, end, na_filter,
- 0, na_hashset, na_flist)
- if col_res is not None:
- break
- # we had a fallback parse on the dtype, so now try to cast
- if col_res is not None and col_dtype is not None:
- # If col_res is bool, it might actually be a bool array mixed with NaNs
- # (see _try_bool_flex()). Usually this would be taken care of using
- # _maybe_upcast(), but if col_dtype is a floating type we should just
- # take care of that cast here.
- if col_res.dtype == np.bool_ and is_float_dtype(col_dtype):
- mask = col_res.view(np.uint8) == na_values[np.uint8]
- col_res = col_res.astype(col_dtype)
- np.putmask(col_res, mask, np.nan)
- return col_res, na_count
- # NaNs are already cast to True here, so we cannot use astype
- if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype):
- if na_count > 0:
- raise ValueError(
- f"cannot safely convert passed user dtype of "
- f"{col_dtype} for {np.bool_} dtyped data in "
- f"column {i} due to NA values"
- )
- # only allow safe casts, e.g. with a nan you cannot safely cast to int
- try:
- col_res = col_res.astype(col_dtype, casting="safe")
- except TypeError:
- # float -> int conversions can fail the above
- # even with no nans
- col_res_orig = col_res
- col_res = col_res.astype(col_dtype)
- if (col_res != col_res_orig).any():
- raise ValueError(
- f"cannot safely convert passed user dtype of "
- f"{col_dtype} for {col_res_orig.dtype.name} dtyped data in "
- f"column {i}")
- return col_res, na_count
- cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
- int64_t start, int64_t end,
- bint na_filter,
- bint user_dtype,
- kh_str_starts_t *na_hashset,
- object na_flist):
- if isinstance(dtype, CategoricalDtype):
- # TODO: I suspect that _categorical_convert could be
- # optimized when dtype is an instance of CategoricalDtype
- codes, cats, na_count = _categorical_convert(
- self.parser, i, start, end, na_filter, na_hashset)
- # Method accepts list of strings, not encoded ones.
- true_values = [x.decode() for x in self.true_values]
- array_type = dtype.construct_array_type()
- cat = array_type._from_inferred_categories(
- cats, codes, dtype, true_values=true_values)
- return cat, na_count
- elif is_extension_array_dtype(dtype):
- result, na_count = self._string_convert(i, start, end, na_filter,
- na_hashset)
- array_type = dtype.construct_array_type()
- try:
- # use _from_sequence_of_strings if the class defines it
- if isinstance(dtype, BooleanDtype):
- # xref GH 47534: BooleanArray._from_sequence_of_strings has extra
- # kwargs
- true_values = [x.decode() for x in self.true_values]
- false_values = [x.decode() for x in self.false_values]
- result = array_type._from_sequence_of_strings(
- result, dtype=dtype, true_values=true_values,
- false_values=false_values)
- else:
- result = array_type._from_sequence_of_strings(result, dtype=dtype)
- except NotImplementedError:
- raise NotImplementedError(
- f"Extension Array: {array_type} must implement "
- f"_from_sequence_of_strings in order "
- f"to be used in parser methods")
- return result, na_count
- elif is_integer_dtype(dtype):
- try:
- result, na_count = _try_int64(self.parser, i, start,
- end, na_filter, na_hashset)
- if user_dtype and na_count is not None:
- if na_count > 0:
- raise ValueError(f"Integer column has NA values in column {i}")
- except OverflowError:
- result = _try_uint64(self.parser, i, start, end,
- na_filter, na_hashset)
- na_count = 0
- if result is not None and dtype != "int64":
- result = result.astype(dtype)
- return result, na_count
- elif is_float_dtype(dtype):
- result, na_count = _try_double(self.parser, i, start, end,
- na_filter, na_hashset, na_flist)
- if result is not None and dtype != "float64":
- result = result.astype(dtype)
- return result, na_count
- elif is_bool_dtype(dtype):
- result, na_count = _try_bool_flex(self.parser, i, start, end,
- na_filter, na_hashset,
- self.true_set, self.false_set)
- if user_dtype and na_count is not None:
- if na_count > 0:
- raise ValueError(f"Bool column has NA values in column {i}")
- return result, na_count
- elif dtype.kind == "S":
- # TODO: na handling
- width = dtype.itemsize
- if width > 0:
- result = _to_fw_string(self.parser, i, start, end, width)
- return result, 0
- # treat as a regular string parsing
- return self._string_convert(i, start, end, na_filter,
- na_hashset)
- elif dtype.kind == "U":
- width = dtype.itemsize
- if width > 0:
- raise TypeError(f"the dtype {dtype} is not supported for parsing")
- # unicode variable width
- return self._string_convert(i, start, end, na_filter,
- na_hashset)
- elif is_object_dtype(dtype):
- return self._string_convert(i, start, end, na_filter,
- na_hashset)
- elif is_datetime64_dtype(dtype):
- raise TypeError(f"the dtype {dtype} is not supported "
- f"for parsing, pass this column "
- f"using parse_dates instead")
- else:
- raise TypeError(f"the dtype {dtype} is not supported for parsing")
- # -> tuple[ndarray[object], int]
- cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
- bint na_filter, kh_str_starts_t *na_hashset):
- return _string_box_utf8(self.parser, i, start, end, na_filter,
- na_hashset, self.encoding_errors)
- def _get_converter(self, i: int, name):
- if self.converters is None:
- return None
- if name is not None and name in self.converters:
- return self.converters[name]
- # Converter for position, if any
- return self.converters.get(i)
- cdef _get_na_list(self, Py_ssize_t i, name):
- # Note: updates self.na_values, self.na_fvalues
- if self.na_values is None:
- return None, set()
- if isinstance(self.na_values, dict):
- key = None
- values = None
- if name is not None and name in self.na_values:
- key = name
- elif i in self.na_values:
- key = i
- else: # No na_values provided for this column.
- if self.keep_default_na:
- return _NA_VALUES, set()
- return list(), set()
- values = self.na_values[key]
- if values is not None and not isinstance(values, list):
- values = list(values)
- fvalues = self.na_fvalues[key]
- if fvalues is not None and not isinstance(fvalues, set):
- fvalues = set(fvalues)
- return _ensure_encoded(values), fvalues
- else:
- if not isinstance(self.na_values, list):
- self.na_values = list(self.na_values)
- if not isinstance(self.na_fvalues, set):
- self.na_fvalues = set(self.na_fvalues)
- return _ensure_encoded(self.na_values), self.na_fvalues
- cdef _free_na_set(self, kh_str_starts_t *table):
- kh_destroy_str_starts(table)
- cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
- cdef int64_t j
- if self.has_usecols and self.names is not None:
- if (not callable(self.usecols) and
- len(self.names) == len(self.usecols)):
- return self.names[nused]
- else:
- return self.names[i - self.leading_cols]
- else:
- if self.header is not None:
- j = i - self.leading_cols
- # generate extra (bogus) headers if there are more columns than headers
- # These should be strings, not integers, because otherwise we might get
- # issues with callables as usecols GH#46997
- if j >= len(self.header[0]):
- return str(j)
- elif self.has_mi_columns:
- return tuple(header_row[j] for header_row in self.header)
- else:
- return self.header[0][j]
- else:
- return None
- # Factor out code common to TextReader.__dealloc__ and TextReader.close
- # It cannot be a class method, since calling self.close() in __dealloc__
- # would cause a class attribute lookup, which violates best practices:
- # https://cython.readthedocs.io/en/latest/src/userguide/special_methods.html#finalization-method-dealloc
- cdef _close(TextReader reader):
- # also preemptively free all allocated memory
- parser_free(reader.parser)
- if reader.true_set:
- kh_destroy_str_starts(reader.true_set)
- reader.true_set = NULL
- if reader.false_set:
- kh_destroy_str_starts(reader.false_set)
- reader.false_set = NULL
- cdef:
- object _true_values = [b"True", b"TRUE", b"true"]
- object _false_values = [b"False", b"FALSE", b"false"]
- def _ensure_encoded(list lst):
- cdef:
- list result = []
- for x in lst:
- if isinstance(x, str):
- x = PyUnicode_AsUTF8String(x)
- elif not isinstance(x, bytes):
- x = str(x).encode("utf-8")
- result.append(x)
- return result
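- # For example: _ensure_encoded(["NA", b"NaN", 1.5]) returns
- # [b"NA", b"NaN", b"1.5"]: str is UTF-8 encoded, bytes pass through,
- # anything else goes through str() first.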
- # common NA values
- # no longer excluding inf representations
- # '1.#INF','-1.#INF', '1.#INF000000',
- STR_NA_VALUES = {
- "-1.#IND",
- "1.#QNAN",
- "1.#IND",
- "-1.#QNAN",
- "#N/A N/A",
- "#N/A",
- "N/A",
- "n/a",
- "NA",
- "<NA>",
- "#NA",
- "NULL",
- "null",
- "NaN",
- "-NaN",
- "nan",
- "-nan",
- "",
- "None",
- }
- _NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))
- def _maybe_upcast(
- arr, use_dtype_backend: bool = False, dtype_backend: str = "numpy"
- ):
- """Sets nullable dtypes or upcasts if nans are present.
- Upcast, if use_dtype_backend is false and nans are present so that the
- current dtype can not hold the na value. We use nullable dtypes if the
- flag is true for every array.
- Parameters
- ----------
- arr: ndarray
- Numpy array that is potentially being upcast.
- use_dtype_backend: bool, default False
- If true, we cast to the associated nullable dtypes.
- Returns
- -------
- The casted array.
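- Examples
- --------
- Illustrative, using the int64 NA sentinel from na_values:
- >>> import numpy as np
- >>> arr = np.array([1, 2, np.iinfo(np.int64).min], dtype=np.int64)
- >>> _maybe_upcast(arr)
- array([ 1.,  2., nan])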
- """
- if is_extension_array_dtype(arr.dtype):
- # TODO: the docstring says arr is an ndarray, in which case this cannot
- # be reached. Is that incorrect?
- return arr
- na_value = na_values[arr.dtype]
- if issubclass(arr.dtype.type, np.integer):
- mask = arr == na_value
- if use_dtype_backend:
- arr = IntegerArray(arr, mask)
- else:
- arr = arr.astype(float)
- np.putmask(arr, mask, np.nan)
- elif arr.dtype == np.bool_:
- mask = arr.view(np.uint8) == na_value
- if use_dtype_backend:
- arr = BooleanArray(arr, mask)
- else:
- arr = arr.astype(object)
- np.putmask(arr, mask, np.nan)
- elif issubclass(arr.dtype.type, float) or arr.dtype.type == np.float32:
- if use_dtype_backend:
- mask = np.isnan(arr)
- arr = FloatingArray(arr, mask)
- elif arr.dtype == np.object_:
- if use_dtype_backend:
- arr = StringDtype().construct_array_type()._from_sequence(arr)
- if use_dtype_backend and dtype_backend == "pyarrow":
- import pyarrow as pa
- if isinstance(arr, IntegerArray) and arr.isna().all():
- # use null instead of int64 in pyarrow
- arr = arr.to_numpy()
- arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))
- return arr
- # ----------------------------------------------------------------------
- # Type conversions / inference support code
- # -> tuple[ndarray[object], int]
- cdef _string_box_utf8(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset,
- const char *encoding_errors):
- cdef:
- int na_count = 0
- Py_ssize_t i, lines
- coliter_t it
- const char *word = NULL
- ndarray[object] result
- int ret = 0
- kh_strbox_t *table
- object pyval
- object NA = na_values[np.object_]
- khiter_t k
- table = kh_init_strbox()
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.object_)
- coliter_setup(&it, parser, col, line_start)
- for i in range(lines):
- COLITER_NEXT(it, word)
- if na_filter:
- if kh_get_str_starts_item(na_hashset, word):
- # in the hash table
- na_count += 1
- result[i] = NA
- continue
- k = kh_get_strbox(table, word)
- # in the hash table
- if k != table.n_buckets:
- # this increments the refcount, but need to test
- pyval = <object>table.vals[k]
- else:
- # box it. new ref?
- pyval = PyUnicode_Decode(word, strlen(word), "utf-8", encoding_errors)
- k = kh_put_strbox(table, word, &ret)
- table.vals[k] = <PyObject *>pyval
- result[i] = pyval
- kh_destroy_strbox(table)
- return result, na_count
- @cython.boundscheck(False)
- cdef _categorical_convert(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset):
- "Convert column data into codes, categories"
- cdef:
- int na_count = 0
- Py_ssize_t i, lines
- coliter_t it
- const char *word = NULL
- int64_t NA = -1
- int64_t[::1] codes
- int64_t current_category = 0
- int ret = 0
- kh_str_t *table
- khiter_t k
- lines = line_end - line_start
- codes = np.empty(lines, dtype=np.int64)
- # factorize parsed values, creating a hash table
- # bytes -> category code
- with nogil:
- table = kh_init_str()
- coliter_setup(&it, parser, col, line_start)
- for i in range(lines):
- COLITER_NEXT(it, word)
- if na_filter:
- if kh_get_str_starts_item(na_hashset, word):
- # is in NA values
- na_count += 1
- codes[i] = NA
- continue
- k = kh_get_str(table, word)
- # not in the hash table
- if k == table.n_buckets:
- k = kh_put_str(table, word, &ret)
- table.vals[k] = current_category
- current_category += 1
- codes[i] = table.vals[k]
- # parse and box categories to python strings
- result = np.empty(table.n_occupied, dtype=np.object_)
- for k in range(table.n_buckets):
- if kh_exist_str(table, k):
- result[table.vals[k]] = PyUnicode_FromString(table.keys[k])
- kh_destroy_str(table)
- return np.asarray(codes), result, na_count
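- # Illustrative: a column with tokens ["a", "b", "a"] factorizes to
- # codes [0, 1, 0] and categories ["a", "b"]; NA tokens (when na_filter
- # is set) are assigned the sentinel code -1.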
- # -> ndarray[f'|S{width}']
- cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
- int64_t line_end, int64_t width):
- cdef:
- char *data
- ndarray result
- result = np.empty(line_end - line_start, dtype=f"|S{width}")
- data = <char*>result.data
- with nogil:
- _to_fw_string_nogil(parser, col, line_start, line_end, width, data)
- return result
- cdef void _to_fw_string_nogil(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- size_t width, char *data) nogil:
- cdef:
- int64_t i
- coliter_t it
- const char *word = NULL
- coliter_setup(&it, parser, col, line_start)
- for i in range(line_end - line_start):
- COLITER_NEXT(it, word)
- strncpy(data, word, width)
- data += width
- cdef:
- char* cinf = b"inf"
- char* cposinf = b"+inf"
- char* cneginf = b"-inf"
- char* cinfty = b"Infinity"
- char* cposinfty = b"+Infinity"
- char* cneginfty = b"-Infinity"
- # -> tuple[ndarray[float64_t], int] | tuple[None, None]
- cdef _try_double(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset, object na_flist):
- cdef:
- int error, na_count = 0
- Py_ssize_t lines
- float64_t *data
- float64_t NA = na_values[np.float64]
- kh_float64_t *na_fset
- ndarray[float64_t] result
- bint use_na_flist = len(na_flist) > 0
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.float64)
- data = <float64_t *>result.data
- na_fset = kset_float64_from_list(na_flist)
- with nogil:
- error = _try_double_nogil(parser, parser.double_converter,
- col, line_start, line_end,
- na_filter, na_hashset, use_na_flist,
- na_fset, NA, data, &na_count)
- kh_destroy_float64(na_fset)
- if error != 0:
- return None, None
- return result, na_count
- cdef int _try_double_nogil(parser_t *parser,
- float64_t (*double_converter)(
- const char *, char **, char,
- char, char, int, int *, int *) nogil,
- int64_t col, int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset,
- bint use_na_flist,
- const kh_float64_t *na_flist,
- float64_t NA, float64_t *data,
- int *na_count) nogil:
- cdef:
- int error = 0
- Py_ssize_t i, lines = line_end - line_start
- coliter_t it
- const char *word = NULL
- char *p_end
- khiter_t k64
- na_count[0] = 0
- coliter_setup(&it, parser, col, line_start)
- if na_filter:
- for i in range(lines):
- COLITER_NEXT(it, word)
- if kh_get_str_starts_item(na_hashset, word):
- # in the hash table
- na_count[0] += 1
- data[0] = NA
- else:
- data[0] = double_converter(word, &p_end, parser.decimal,
- parser.sci, parser.thousands,
- 1, &error, NULL)
- if error != 0 or p_end == word or p_end[0]:
- error = 0
- if (strcasecmp(word, cinf) == 0 or
- strcasecmp(word, cposinf) == 0 or
- strcasecmp(word, cinfty) == 0 or
- strcasecmp(word, cposinfty) == 0):
- data[0] = INF
- elif (strcasecmp(word, cneginf) == 0 or
- strcasecmp(word, cneginfty) == 0):
- data[0] = NEGINF
- else:
- return 1
- if use_na_flist:
- k64 = kh_get_float64(na_flist, data[0])
- if k64 != na_flist.n_buckets:
- na_count[0] += 1
- data[0] = NA
- data += 1
- else:
- for i in range(lines):
- COLITER_NEXT(it, word)
- data[0] = double_converter(word, &p_end, parser.decimal,
- parser.sci, parser.thousands,
- 1, &error, NULL)
- if error != 0 or p_end == word or p_end[0]:
- error = 0
- if (strcasecmp(word, cinf) == 0 or
- strcasecmp(word, cposinf) == 0 or
- strcasecmp(word, cinfty) == 0 or
- strcasecmp(word, cposinfty) == 0):
- data[0] = INF
- elif (strcasecmp(word, cneginf) == 0 or
- strcasecmp(word, cneginfty) == 0):
- data[0] = NEGINF
- else:
- return 1
- data += 1
- return 0
- cdef _try_uint64(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset):
- cdef:
- int error
- Py_ssize_t lines
- coliter_t it
- uint64_t *data
- ndarray result
- uint_state state
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.uint64)
- data = <uint64_t *>result.data
- uint_state_init(&state)
- coliter_setup(&it, parser, col, line_start)
- with nogil:
- error = _try_uint64_nogil(parser, col, line_start, line_end,
- na_filter, na_hashset, data, &state)
- if error != 0:
- if error == ERROR_OVERFLOW:
- # Can't get the word variable
- raise OverflowError("Overflow")
- return None
- if uint64_conflict(&state):
- raise ValueError("Cannot convert to numerical dtype")
- if state.seen_sint:
- raise OverflowError("Overflow")
- return result
- cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
- int64_t line_start,
- int64_t line_end, bint na_filter,
- const kh_str_starts_t *na_hashset,
- uint64_t *data, uint_state *state) nogil:
- cdef:
- int error
- Py_ssize_t i, lines = line_end - line_start
- coliter_t it
- const char *word = NULL
- coliter_setup(&it, parser, col, line_start)
- if na_filter:
- for i in range(lines):
- COLITER_NEXT(it, word)
- if kh_get_str_starts_item(na_hashset, word):
- # in the hash table
- state.seen_null = 1
- data[i] = 0
- continue
- data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
- &error, parser.thousands)
- if error != 0:
- return error
- else:
- for i in range(lines):
- COLITER_NEXT(it, word)
- data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
- &error, parser.thousands)
- if error != 0:
- return error
- return 0
- cdef _try_int64(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, kh_str_starts_t *na_hashset):
- cdef:
- int error, na_count = 0
- Py_ssize_t lines
- coliter_t it
- int64_t *data
- ndarray result
- int64_t NA = na_values[np.int64]
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.int64)
- data = <int64_t *>result.data
- coliter_setup(&it, parser, col, line_start)
- with nogil:
- error = _try_int64_nogil(parser, col, line_start, line_end,
- na_filter, na_hashset, NA, data, &na_count)
- if error != 0:
- if error == ERROR_OVERFLOW:
- # Can't get the word variable
- raise OverflowError("Overflow")
- return None, None
- return result, na_count
- cdef int _try_int64_nogil(parser_t *parser, int64_t col,
- int64_t line_start,
- int64_t line_end, bint na_filter,
- const kh_str_starts_t *na_hashset, int64_t NA,
- int64_t *data, int *na_count) nogil:
- cdef:
- int error
- Py_ssize_t i, lines = line_end - line_start
- coliter_t it
- const char *word = NULL
- na_count[0] = 0
- coliter_setup(&it, parser, col, line_start)
- if na_filter:
- for i in range(lines):
- COLITER_NEXT(it, word)
- if kh_get_str_starts_item(na_hashset, word):
- # in the hash table
- na_count[0] += 1
- data[i] = NA
- continue
- data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
- &error, parser.thousands)
- if error != 0:
- return error
- else:
- for i in range(lines):
- COLITER_NEXT(it, word)
- data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
- &error, parser.thousands)
- if error != 0:
- return error
- return 0
- # -> tuple[ndarray[bool], int]
- cdef _try_bool_flex(parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end,
- bint na_filter, const kh_str_starts_t *na_hashset,
- const kh_str_starts_t *true_hashset,
- const kh_str_starts_t *false_hashset):
- cdef:
- int error, na_count = 0
- Py_ssize_t lines
- uint8_t *data
- ndarray result
- uint8_t NA = na_values[np.bool_]
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.uint8)
- data = <uint8_t *>result.data
- with nogil:
- error = _try_bool_flex_nogil(parser, col, line_start, line_end,
- na_filter, na_hashset, true_hashset,
- false_hashset, NA, data, &na_count)
- if error != 0:
- return None, None
- return result.view(np.bool_), na_count
- cdef int _try_bool_flex_nogil(parser_t *parser, int64_t col,
- int64_t line_start,
- int64_t line_end, bint na_filter,
- const kh_str_starts_t *na_hashset,
- const kh_str_starts_t *true_hashset,
- const kh_str_starts_t *false_hashset,
- uint8_t NA, uint8_t *data,
- int *na_count) nogil:
- cdef:
- int error = 0
- Py_ssize_t i, lines = line_end - line_start
- coliter_t it
- const char *word = NULL
- na_count[0] = 0
- coliter_setup(&it, parser, col, line_start)
- if na_filter:
- for i in range(lines):
- COLITER_NEXT(it, word)
- if kh_get_str_starts_item(na_hashset, word):
- # in the hash table
- na_count[0] += 1
- data[0] = NA
- data += 1
- continue
- if kh_get_str_starts_item(true_hashset, word):
- data[0] = 1
- data += 1
- continue
- if kh_get_str_starts_item(false_hashset, word):
- data[0] = 0
- data += 1
- continue
- error = to_boolean(word, data)
- if error != 0:
- return error
- data += 1
- else:
- for i in range(lines):
- COLITER_NEXT(it, word)
- if kh_get_str_starts_item(true_hashset, word):
- data[0] = 1
- data += 1
- continue
- if kh_get_str_starts_item(false_hashset, word):
- data[0] = 0
- data += 1
- continue
- error = to_boolean(word, data)
- if error != 0:
- return error
- data += 1
- return 0
- cdef kh_str_starts_t* kset_from_list(list values) except NULL:
- # caller takes responsibility for freeing the hash table
- cdef:
- Py_ssize_t i
- kh_str_starts_t *table
- int ret = 0
- object val
- table = kh_init_str_starts()
- for i in range(len(values)):
- val = values[i]
- # None creeps in sometimes, which isn't possible here
- if not isinstance(val, bytes):
- kh_destroy_str_starts(table)
- raise ValueError("Must be all encoded bytes")
- kh_put_str_starts_item(table, PyBytes_AsString(val), &ret)
- if table.table.n_buckets <= 128:
- # Resize the hash table to make it almost empty, this
- # reduces amount of hash collisions on lookup thus
- # "key not in table" case is faster.
- # Note that this trades table memory footprint for lookup speed.
- kh_resize_str_starts(table, table.table.n_buckets * 8)
- return table
- cdef kh_float64_t* kset_float64_from_list(values) except NULL:
- # caller takes responsibility for freeing the hash table
- cdef:
- kh_float64_t *table
- int ret = 0
- float64_t val
- object value
- table = kh_init_float64()
- for value in values:
- val = float(value)
- kh_put_float64(table, val, &ret)
- if table.n_buckets <= 128:
- # See reasoning in kset_from_list
- kh_resize_float64(table, table.n_buckets * 8)
- return table
- cdef raise_parser_error(object base, parser_t *parser):
- cdef:
- object old_exc
- object exc_type
- PyObject *type
- PyObject *value
- PyObject *traceback
- if PyErr_Occurred():
- PyErr_Fetch(&type, &value, &traceback)
- Py_XDECREF(traceback)
- if value != NULL:
- old_exc = <object>value
- Py_XDECREF(value)
- # PyErr_Fetch only returned the error message in *value,
- # so the Exception class must be extracted from *type.
- if isinstance(old_exc, str):
- if type != NULL:
- exc_type = <object>type
- else:
- exc_type = ParserError
- Py_XDECREF(type)
- raise exc_type(old_exc)
- else:
- Py_XDECREF(type)
- raise old_exc
- message = f"{base}. C error: "
- if parser.error_msg != NULL:
- message += parser.error_msg.decode("utf-8")
- else:
- message += "no error message set"
- raise ParserError(message)
- # ----------------------------------------------------------------------
- # NA values
- def _compute_na_values():
- int64info = np.iinfo(np.int64)
- int32info = np.iinfo(np.int32)
- int16info = np.iinfo(np.int16)
- int8info = np.iinfo(np.int8)
- uint64info = np.iinfo(np.uint64)
- uint32info = np.iinfo(np.uint32)
- uint16info = np.iinfo(np.uint16)
- uint8info = np.iinfo(np.uint8)
- na_values = {
- np.float32: np.nan,
- np.float64: np.nan,
- np.int64: int64info.min,
- np.int32: int32info.min,
- np.int16: int16info.min,
- np.int8: int8info.min,
- np.uint64: uint64info.max,
- np.uint32: uint32info.max,
- np.uint16: uint16info.max,
- np.uint8: uint8info.max,
- np.bool_: uint8info.max,
- np.object_: np.nan,
- }
- return na_values
- na_values = _compute_na_values()
- for k in list(na_values):
- na_values[np.dtype(k)] = na_values[k]
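- # Illustrative: na_values[np.int64] is np.iinfo(np.int64).min
- # (-9223372036854775808), and after the aliasing loop above the same
- # sentinel is also reachable via np.dtype("int64").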
- # -> ArrayLike
- cdef _apply_converter(object f, parser_t *parser, int64_t col,
- int64_t line_start, int64_t line_end):
- cdef:
- Py_ssize_t i, lines
- coliter_t it
- const char *word = NULL
- ndarray[object] result
- object val
- lines = line_end - line_start
- result = np.empty(lines, dtype=np.object_)
- coliter_setup(&it, parser, col, line_start)
- for i in range(lines):
- COLITER_NEXT(it, word)
- val = PyUnicode_FromString(word)
- result[i] = f(val)
- return lib.maybe_convert_objects(result)
- cdef list _maybe_encode(list values):
- if values is None:
- return []
- return [x.encode("utf-8") if isinstance(x, str) else x for x in values]
- def sanitize_objects(ndarray[object] values, set na_values) -> int:
- """
- Convert specified values, including the given set na_values to np.nan.
- Parameters
- ----------
- values : ndarray[object]
- na_values : set
- Returns
- -------
- na_count : int
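- Examples
- --------
- >>> import numpy as np
- >>> vals = np.array(["a", "b", "a"], dtype=object)
- >>> sanitize_objects(vals, {"b"})
- 1
- >>> vals
- array(['a', nan, 'a'], dtype=object)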
- """
- cdef:
- Py_ssize_t i, n
- object val, onan
- Py_ssize_t na_count = 0
- dict memo = {}
- n = len(values)
- onan = np.nan
- for i in range(n):
- val = values[i]
- if val in na_values:
- values[i] = onan
- na_count += 1
- elif val in memo:
- values[i] = memo[val]
- else:
- memo[val] = val
- return na_count