# Copyright (c) 2012, Lambda Foundry, Inc.
# See LICENSE for the license
from collections import defaultdict
from csv import (
    QUOTE_MINIMAL,
    QUOTE_NONE,
    QUOTE_NONNUMERIC,
)
import sys
import time
import warnings
from pandas.util._exceptions import find_stack_level

from pandas import StringDtype
from pandas.core.arrays import (
    ArrowExtensionArray,
    BooleanArray,
    FloatingArray,
    IntegerArray,
)

cimport cython
from cpython.bytes cimport PyBytes_AsString
from cpython.exc cimport (
    PyErr_Fetch,
    PyErr_Occurred,
)
from cpython.object cimport PyObject
from cpython.ref cimport (
    Py_INCREF,
    Py_XDECREF,
)
from cpython.unicode cimport (
    PyUnicode_AsUTF8String,
    PyUnicode_Decode,
    PyUnicode_DecodeUTF8,
)
from cython cimport Py_ssize_t
from libc.stdlib cimport free
from libc.string cimport (
    strcasecmp,
    strlen,
    strncpy,
)


cdef extern from "Python.h":
    # TODO(cython3): get this from cpython.unicode
    object PyUnicode_FromString(char *v)


import numpy as np

cimport numpy as cnp
from numpy cimport (
    float64_t,
    int64_t,
    ndarray,
    uint8_t,
    uint64_t,
)

cnp.import_array()

from pandas._libs cimport util
from pandas._libs.util cimport (
    INT64_MAX,
    INT64_MIN,
    UINT64_MAX,
)

from pandas._libs import lib

from pandas._libs.khash cimport (
    kh_destroy_float64,
    kh_destroy_str,
    kh_destroy_str_starts,
    kh_destroy_strbox,
    kh_exist_str,
    kh_float64_t,
    kh_get_float64,
    kh_get_str,
    kh_get_str_starts_item,
    kh_get_strbox,
    kh_init_float64,
    kh_init_str,
    kh_init_str_starts,
    kh_init_strbox,
    kh_put_float64,
    kh_put_str,
    kh_put_str_starts_item,
    kh_put_strbox,
    kh_resize_float64,
    kh_resize_str_starts,
    kh_str_starts_t,
    kh_str_t,
    kh_strbox_t,
    khiter_t,
)

from pandas.errors import (
    EmptyDataError,
    ParserError,
    ParserWarning,
)

from pandas.core.dtypes.common import (
    is_bool_dtype,
    is_datetime64_dtype,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer_dtype,
    is_object_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.inference import is_dict_like

from pandas.core.arrays.boolean import BooleanDtype

cdef:
    float64_t INF = <float64_t>np.inf
    float64_t NEGINF = -INF
    int64_t DEFAULT_CHUNKSIZE = 256 * 1024


cdef extern from "headers/portable.h":
    # I *think* this is here so that strcasecmp is defined on Windows
    # so we don't get
    # `parsers.obj : error LNK2001: unresolved external symbol strcasecmp`
    # in Appveyor.
    # In a sane world, the `from libc.string cimport` above would fail
    # loudly.
    pass

cdef extern from "parser/tokenizer.h":

    ctypedef enum ParserState:
        START_RECORD
        START_FIELD
        ESCAPED_CHAR
        IN_FIELD
        IN_QUOTED_FIELD
        ESCAPE_IN_QUOTED_FIELD
        QUOTE_IN_QUOTED_FIELD
        EAT_CRNL
        EAT_CRNL_NOP
        EAT_WHITESPACE
        EAT_COMMENT
        EAT_LINE_COMMENT
        WHITESPACE_LINE
        SKIP_LINE
        FINISHED

    enum: ERROR_OVERFLOW

    ctypedef enum BadLineHandleMethod:
        ERROR,
        WARN,
        SKIP

    ctypedef void* (*io_callback)(void *src, size_t nbytes, size_t *bytes_read,
                                  int *status, const char *encoding_errors)
    ctypedef int (*io_cleanup)(void *src)

    ctypedef struct parser_t:
        void *source
        io_callback cb_io
        io_cleanup cb_cleanup

        int64_t chunksize       # Number of bytes to prepare for each chunk
        char *data              # pointer to data to be processed
        int64_t datalen         # amount of data available
        int64_t datapos

        # where to write out tokenized data
        char *stream
        uint64_t stream_len
        uint64_t stream_cap

        # Store words in (potentially ragged) matrix for now, hmm
        char **words
        int64_t *word_starts    # where we are in the stream
        uint64_t words_len
        uint64_t words_cap
        uint64_t max_words_cap  # maximum word cap encountered

        char *pword_start       # pointer to stream start of current field
        int64_t word_start      # position start of current field

        int64_t *line_start     # position in words for start of line
        int64_t *line_fields    # Number of fields in each line
        uint64_t lines          # Number of lines observed
        uint64_t file_lines     # Number of lines observed (with bad/skipped)
        uint64_t lines_cap      # Vector capacity

        # Tokenizing stuff
        ParserState state
        int doublequote         # is " represented by ""?
        char delimiter          # field separator
        int delim_whitespace    # consume tabs / spaces instead
        char quotechar          # quote character
        char escapechar         # escape character
        char lineterminator
        int skipinitialspace    # ignore spaces following delimiter?
        int quoting             # style of quoting to write

        char commentchar
        int allow_embedded_newline

        int usecols

        Py_ssize_t expected_fields
        BadLineHandleMethod on_bad_lines

        # floating point options
        char decimal
        char sci

        # thousands separator (comma, period)
        char thousands

        int header              # Boolean: 1: has header, 0: no header
        int64_t header_start    # header row start
        uint64_t header_end     # header row end

        void *skipset
        PyObject *skipfunc
        int64_t skip_first_N_rows
        int64_t skipfooter
        # pick one, depending on whether the converter requires GIL
        float64_t (*double_converter)(const char *, char **,
                                      char, char, char,
                                      int, int *, int *) nogil

        # error handling
        char *warn_msg
        char *error_msg

        int64_t skip_empty_lines

    ctypedef struct coliter_t:
        char **words
        int64_t *line_start
        int64_t col

    ctypedef struct uint_state:
        int seen_sint
        int seen_uint
        int seen_null

    void uint_state_init(uint_state *self)
    int uint64_conflict(uint_state *self)

    void coliter_setup(coliter_t *it, parser_t *parser,
                       int64_t i, int64_t start) nogil
    void COLITER_NEXT(coliter_t, const char *) nogil

    parser_t* parser_new()

    int parser_init(parser_t *self) nogil
    void parser_free(parser_t *self) nogil
    void parser_del(parser_t *self) nogil
    int parser_add_skiprow(parser_t *self, int64_t row)

    int parser_set_skipfirstnrows(parser_t *self, int64_t nrows)

    void parser_set_default_options(parser_t *self)

    int parser_consume_rows(parser_t *self, size_t nrows)

    int parser_trim_buffers(parser_t *self)

    int tokenize_all_rows(parser_t *self, const char *encoding_errors) nogil
    int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil

    int64_t str_to_int64(char *p_item, int64_t int_min,
                         int64_t int_max, int *error, char tsep) nogil
    uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
                           uint64_t uint_max, int *error, char tsep) nogil

    float64_t xstrtod(const char *p, char **q, char decimal,
                      char sci, char tsep, int skip_trailing,
                      int *error, int *maybe_int) nogil
    float64_t precise_xstrtod(const char *p, char **q, char decimal,
                              char sci, char tsep, int skip_trailing,
                              int *error, int *maybe_int) nogil
    float64_t round_trip(const char *p, char **q, char decimal,
                         char sci, char tsep, int skip_trailing,
                         int *error, int *maybe_int) nogil

    int to_boolean(const char *item, uint8_t *val) nogil

cdef extern from "parser/io.h":
    void *new_rd_source(object obj) except NULL

    int del_rd_source(void *src)

    void* buffer_rd_bytes(void *source, size_t nbytes,
                          size_t *bytes_read, int *status,
                          const char *encoding_errors)

cdef class TextReader:
    """
    # source: StringIO or file object

    .. versionchanged:: 1.2.0
        Removed the 'compression', 'memory_map', and 'encoding' arguments.
        These are handled by CParserWrapper instead; 'source' has to be a
        file handle.
    """

    cdef:
        parser_t *parser
        object na_fvalues
        object true_values, false_values
        object handle
        object orig_header
        bint na_filter, keep_default_na, verbose, has_usecols, has_mi_columns
        bint allow_leading_cols
        uint64_t parser_start  # this is modified after __init__
        list clocks
        const char *encoding_errors
        kh_str_starts_t *false_set
        kh_str_starts_t *true_set

        int64_t buffer_lines, skipfooter
        list dtype_cast_order  # list[np.dtype]
        list names  # can be None
        set noconvert  # set[int]

    cdef public:
        int64_t leading_cols, table_width
        object delimiter  # bytes or str
        object converters
        object na_values
        list header  # list[list[non-negative integers]]
        object index_col
        object skiprows
        object dtype
        object usecols
        set unnamed_cols  # set[str]
        str dtype_backend

    def __cinit__(self, source,
                  delimiter=b",",  # bytes | str
                  header=0,
                  int64_t header_start=0,
                  uint64_t header_end=0,
                  index_col=None,
                  names=None,
                  tokenize_chunksize=DEFAULT_CHUNKSIZE,
                  bint delim_whitespace=False,
                  converters=None,
                  bint skipinitialspace=False,
                  escapechar=None,  # bytes | str
                  bint doublequote=True,
                  quotechar=b'"',
                  quoting=0,  # int
                  lineterminator=None,  # bytes | str
                  comment=None,
                  decimal=b".",  # bytes | str
                  thousands=None,  # bytes | str
                  dtype=None,
                  usecols=None,
                  on_bad_lines=ERROR,
                  bint na_filter=True,
                  na_values=None,
                  na_fvalues=None,
                  bint keep_default_na=True,
                  true_values=None,
                  false_values=None,
                  bint allow_leading_cols=True,
                  skiprows=None,
                  skipfooter=0,  # int64_t
                  bint verbose=False,
                  float_precision=None,
                  bint skip_blank_lines=True,
                  encoding_errors=b"strict",
                  dtype_backend="numpy"):

        # set encoding for native Python and C library
        if isinstance(encoding_errors, str):
            encoding_errors = encoding_errors.encode("utf-8")
        elif encoding_errors is None:
            encoding_errors = b"strict"
        Py_INCREF(encoding_errors)
        self.encoding_errors = PyBytes_AsString(encoding_errors)

        self.parser = parser_new()
        self.parser.chunksize = tokenize_chunksize

        # For timekeeping
        self.clocks = []

        self.parser.usecols = (usecols is not None)

        self._setup_parser_source(source)
        parser_set_default_options(self.parser)

        parser_init(self.parser)

        if delim_whitespace:
            self.parser.delim_whitespace = delim_whitespace
        else:
            if len(delimiter) > 1:
                raise ValueError("only length-1 separators are supported "
                                 "right now")
            self.parser.delimiter = <char>ord(delimiter)

        # ----------------------------------------
        # parser options

        self.parser.doublequote = doublequote
        self.parser.skipinitialspace = skipinitialspace
        self.parser.skip_empty_lines = skip_blank_lines

        if lineterminator is not None:
            if len(lineterminator) != 1:
                raise ValueError("Only length-1 line terminators supported")
            self.parser.lineterminator = <char>ord(lineterminator)

        if len(decimal) != 1:
            raise ValueError("Only length-1 decimal markers supported")
        self.parser.decimal = <char>ord(decimal)

        if thousands is not None:
            if len(thousands) != 1:
                raise ValueError("Only length-1 thousands markers supported")
            self.parser.thousands = <char>ord(thousands)

        if escapechar is not None:
            if len(escapechar) != 1:
                raise ValueError("Only length-1 escapes supported")
            self.parser.escapechar = <char>ord(escapechar)

        self._set_quoting(quotechar, quoting)

        dtype_order = ["int64", "float64", "bool", "object"]
        if quoting == QUOTE_NONNUMERIC:
            # consistent with csv module semantics, cast all to float
            dtype_order = dtype_order[1:]
        self.dtype_cast_order = [np.dtype(x) for x in dtype_order]

        if comment is not None:
            if len(comment) > 1:
                raise ValueError("Only length-1 comment characters supported")
            self.parser.commentchar = <char>ord(comment)

        self.parser.on_bad_lines = on_bad_lines

        self.skiprows = skiprows
        if skiprows is not None:
            self._make_skiprow_set()

        self.skipfooter = skipfooter

        if usecols is not None:
            self.has_usecols = 1
            # GH-20558, validate usecols at higher level and only pass clean
            # usecols into TextReader.
            self.usecols = usecols

        if skipfooter > 0:
            self.parser.on_bad_lines = SKIP

        self.delimiter = delimiter

        self.na_values = na_values
        if na_fvalues is None:
            na_fvalues = set()
        self.na_fvalues = na_fvalues

        self.true_values = _maybe_encode(true_values) + _true_values
        self.false_values = _maybe_encode(false_values) + _false_values

        self.true_set = kset_from_list(self.true_values)
        self.false_set = kset_from_list(self.false_values)

        self.keep_default_na = keep_default_na
        self.converters = converters
        self.na_filter = na_filter

        self.verbose = verbose

        if float_precision == "round_trip":
            # see gh-15140
            self.parser.double_converter = round_trip
        elif float_precision == "legacy":
            self.parser.double_converter = xstrtod
        elif float_precision == "high" or float_precision is None:
            self.parser.double_converter = precise_xstrtod
        else:
            raise ValueError(f"Unrecognized float_precision option: "
                             f"{float_precision}")

        # Caller is responsible for ensuring we have one of
        # - None
        # - DtypeObj
        # - dict[Any, DtypeObj]
        self.dtype = dtype
        self.dtype_backend = dtype_backend

        self.noconvert = set()

        self.index_col = index_col

        # ----------------------------------------
        # header stuff

        self.allow_leading_cols = allow_leading_cols
        self.leading_cols = 0  # updated in _get_header

        # TODO: no header vs. header is not the first row
        self.has_mi_columns = 0
        self.orig_header = header
        if header is None:
            # sentinel value
            self.parser.header_start = -1
            self.parser.header_end = -1
            self.parser.header = -1
            self.parser_start = 0
            prelim_header = []
        else:
            if isinstance(header, list):
                if len(header) > 1:
                    # need to artificially skip the final line
                    # which is still a header line
                    header = list(header)
                    header.append(header[-1] + 1)
                    self.parser.header_end = header[-1]
                    self.has_mi_columns = 1
                else:
                    self.parser.header_end = header[0]

                self.parser_start = header[-1] + 1
                self.parser.header_start = header[0]
                self.parser.header = header[0]
                prelim_header = header
            else:
                self.parser.header_start = header
                self.parser.header_end = header
                self.parser_start = header + 1
                self.parser.header = header
                prelim_header = [header]

        self.names = names
        header, table_width, unnamed_cols = self._get_header(prelim_header)
        # header, table_width, and unnamed_cols are set here, never changed
        self.header = header
        self.table_width = table_width
        self.unnamed_cols = unnamed_cols

        if not self.table_width:
            raise EmptyDataError("No columns to parse from file")

        # Compute buffer_lines as a function of table width.
        heuristic = 2**20 // self.table_width
        self.buffer_lines = 1
        while self.buffer_lines * 2 < heuristic:
            self.buffer_lines *= 2
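
        # e.g. with table_width == 100: heuristic == 2**20 // 100 == 10485,
        # so buffer_lines doubles 1 -> 2 -> ... -> 8192 and stops there,
        # since 8192 * 2 is no longer below the heuristic.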

    def __init__(self, *args, **kwargs):
        pass

    def __dealloc__(self):
        _close(self)
        parser_del(self.parser)

    def close(self):
        _close(self)

    def _set_quoting(self, quote_char: str | bytes | None, quoting: int):
        if not isinstance(quoting, int):
            raise TypeError('"quoting" must be an integer')

        if not QUOTE_MINIMAL <= quoting <= QUOTE_NONE:
            raise TypeError('bad "quoting" value')

        if not isinstance(quote_char, (str, bytes)) and quote_char is not None:
            dtype = type(quote_char).__name__
            raise TypeError(f'"quotechar" must be string, not {dtype}')

        if quote_char is None or quote_char == "":
            if quoting != QUOTE_NONE:
                raise TypeError("quotechar must be set if quoting enabled")
            self.parser.quoting = quoting
            self.parser.quotechar = -1
        elif len(quote_char) > 1:  # 0-len case handled earlier
            raise TypeError('"quotechar" must be a 1-character string')
        else:
            self.parser.quoting = quoting
            self.parser.quotechar = <char>ord(quote_char)
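
    # Note: the range check above relies on the csv-module constants,
    # where QUOTE_MINIMAL == 0 and QUOTE_NONE == 3.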

    cdef _make_skiprow_set(self):
        if util.is_integer_object(self.skiprows):
            parser_set_skipfirstnrows(self.parser, self.skiprows)
        elif not callable(self.skiprows):
            for i in self.skiprows:
                parser_add_skiprow(self.parser, i)
        else:
            self.parser.skipfunc = <PyObject *>self.skiprows

    cdef _setup_parser_source(self, source):
        cdef:
            void *ptr

        ptr = new_rd_source(source)
        self.parser.source = ptr
        self.parser.cb_io = &buffer_rd_bytes
        self.parser.cb_cleanup = &del_rd_source

    cdef _get_header(self, list prelim_header):
        # header is now a list of lists, so field_count should use header[0]
        #
        # modifies:
        #   self.parser attributes
        #   self.parser_start
        #   self.leading_cols

        cdef:
            Py_ssize_t i, start, field_count, passed_count, unnamed_count, level
            char *word
            str name
            uint64_t hr, data_line = 0
            list header = []
            set unnamed_cols = set()

        if self.parser.header_start >= 0:

            # Header is in the file
            for level, hr in enumerate(prelim_header):

                this_header = []

                if self.parser.lines < hr + 1:
                    self._tokenize_rows(hr + 2)

                if self.parser.lines == 0:
                    field_count = 0
                    start = self.parser.line_start[0]

                # e.g., if header=3 and file only has 2 lines
                elif (self.parser.lines < hr + 1
                      and not isinstance(self.orig_header, list)) or (
                          self.parser.lines < hr):
                    msg = self.orig_header
                    if isinstance(msg, list):
                        joined = ",".join(str(m) for m in msg)
                        msg = f"[{joined}], len of {len(msg)},"
                    raise ParserError(
                        f"Passed header={msg} but only "
                        f"{self.parser.lines} lines in file")

                else:
                    field_count = self.parser.line_fields[hr]
                    start = self.parser.line_start[hr]

                unnamed_count = 0
                unnamed_col_indices = []

                for i in range(field_count):
                    word = self.parser.words[start + i]

                    name = PyUnicode_DecodeUTF8(word, strlen(word),
                                                self.encoding_errors)

                    if name == "":
                        if self.has_mi_columns:
                            name = f"Unnamed: {i}_level_{level}"
                        else:
                            name = f"Unnamed: {i}"

                        unnamed_count += 1
                        unnamed_col_indices.append(i)

                    this_header.append(name)

                if not self.has_mi_columns:
                    # Ensure that regular columns are used before unnamed ones
                    # to keep given names and mangle unnamed columns
                    col_loop_order = [i for i in range(len(this_header))
                                      if i not in unnamed_col_indices
                                      ] + unnamed_col_indices
                    counts = {}

                    for i in col_loop_order:
                        col = this_header[i]
                        old_col = col
                        cur_count = counts.get(col, 0)

                        if cur_count > 0:
                            while cur_count > 0:
                                counts[old_col] = cur_count + 1
                                col = f"{old_col}.{cur_count}"
                                if col in this_header:
                                    cur_count += 1
                                else:
                                    cur_count = counts.get(col, 0)

                            if (
                                self.dtype is not None
                                and is_dict_like(self.dtype)
                                and self.dtype.get(old_col) is not None
                                and self.dtype.get(col) is None
                            ):
                                self.dtype.update({col: self.dtype.get(old_col)})

                            this_header[i] = col
                        counts[col] = cur_count + 1
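
                    # e.g. a duplicated header row ["a", "a", "a"] is
                    # mangled to ["a", "a.1", "a.2"]; unnamed columns are
                    # visited last so user-given names keep their spelling.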

                if self.has_mi_columns:

                    # If we have grabbed an extra line, but it's not in our
                    # format, save in the buffer, and create a blank extra
                    # line for the rest of the parsing code.
                    if hr == prelim_header[-1]:
                        lc = len(this_header)
                        ic = (len(self.index_col) if self.index_col
                              is not None else 0)

                        # if wrong number of blanks or no index, not our format
                        if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0:
                            hr -= 1
                            self.parser_start -= 1
                            this_header = [None] * lc

                data_line = hr + 1
                header.append(this_header)
                unnamed_cols.update({this_header[i] for i in unnamed_col_indices})

            if self.names is not None:
                header = [self.names]

        elif self.names is not None:
            # Names passed
            if self.parser.lines < 1:
                if not self.has_usecols:
                    self.parser.expected_fields = len(self.names)
                self._tokenize_rows(1)

            header = [self.names]

            if self.parser.lines < 1:
                field_count = len(header[0])
            else:
                field_count = self.parser.line_fields[data_line]

            # Enforce this unless usecols
            if not self.has_usecols:
                self.parser.expected_fields = max(field_count, len(self.names))

        else:
            # No header passed nor to be found in the file
            if self.parser.lines < 1:
                self._tokenize_rows(1)
            return None, self.parser.line_fields[0], unnamed_cols

        # Corner case, not enough lines in the file
        if self.parser.lines < data_line + 1:
            field_count = len(header[0])
        else:
            field_count = self.parser.line_fields[data_line]

        # #2981
        if self.names is not None:
            field_count = max(field_count, len(self.names))

        passed_count = len(header[0])

        if (self.has_usecols and self.allow_leading_cols and
                not callable(self.usecols)):
            nuse = len(self.usecols)
            if nuse == passed_count:
                self.leading_cols = 0
            elif self.names is None and nuse < passed_count:
                self.leading_cols = field_count - passed_count
            elif passed_count != field_count:
                raise ValueError("Number of passed names did not match number "
                                 "of header fields in the file")
        # oh boy, #2442, #2981
        elif self.allow_leading_cols and passed_count < field_count:
            self.leading_cols = field_count - passed_count

        return header, field_count, unnamed_cols

    def read(self, rows: int | None = None) -> dict[int, "ArrayLike"]:
        """
        rows=None --> read all rows
        """
        # Don't care about memory usage
        columns = self._read_rows(rows, 1)
        return columns

    def read_low_memory(self, rows: int | None) -> list[dict[int, "ArrayLike"]]:
        """
        rows=None --> read all rows
        """
        # Conserve intermediate space
        # Caller is responsible for concatenating chunks,
        #  see c_parser_wrapper._concatenate_chunks
        cdef:
            size_t rows_read = 0
            list chunks = []

        if rows is None:
            while True:
                try:
                    chunk = self._read_rows(self.buffer_lines, 0)
                    if len(chunk) == 0:
                        break
                except StopIteration:
                    break
                else:
                    chunks.append(chunk)
        else:
            while rows_read < rows:
                try:
                    crows = min(self.buffer_lines, rows - rows_read)

                    chunk = self._read_rows(crows, 0)
                    if len(chunk) == 0:
                        break

                    rows_read += len(list(chunk.values())[0])
                except StopIteration:
                    break
                else:
                    chunks.append(chunk)

        parser_trim_buffers(self.parser)

        if len(chunks) == 0:
            raise StopIteration

        return chunks
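
    # Illustrative (hypothetical) driver loop, assuming the caller owns
    # concatenation of the returned chunks as noted above:
    #
    #     for chunk in reader.read_low_memory(None):
    #         ...  # each chunk maps column index -> array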

    cdef _tokenize_rows(self, size_t nrows):
        cdef:
            int status

        with nogil:
            status = tokenize_nrows(self.parser, nrows, self.encoding_errors)

        self._check_tokenize_status(status)

    cdef _check_tokenize_status(self, int status):
        if self.parser.warn_msg != NULL:
            print(PyUnicode_DecodeUTF8(
                self.parser.warn_msg, strlen(self.parser.warn_msg),
                self.encoding_errors), file=sys.stderr)
            free(self.parser.warn_msg)
            self.parser.warn_msg = NULL

        if status < 0:
            raise_parser_error("Error tokenizing data", self.parser)

    # -> dict[int, "ArrayLike"]
    cdef _read_rows(self, rows, bint trim):
        cdef:
            int64_t buffered_lines
            int64_t irows

        self._start_clock()

        if rows is not None:
            irows = rows
            buffered_lines = self.parser.lines - self.parser_start
            if buffered_lines < irows:
                self._tokenize_rows(irows - buffered_lines)

            if self.skipfooter > 0:
                raise ValueError("skipfooter can only be used to read "
                                 "the whole file")
        else:
            with nogil:
                status = tokenize_all_rows(self.parser, self.encoding_errors)

            self._check_tokenize_status(status)

        if self.parser_start >= self.parser.lines:
            raise StopIteration
        self._end_clock("Tokenization")

        self._start_clock()
        columns = self._convert_column_data(rows)
        self._end_clock("Type conversion")

        self._start_clock()
        if len(columns) > 0:
            rows_read = len(list(columns.values())[0])
            # trim
            parser_consume_rows(self.parser, rows_read)
            if trim:
                parser_trim_buffers(self.parser)
            self.parser_start -= rows_read
        self._end_clock("Parser memory cleanup")

        return columns

    cdef _start_clock(self):
        self.clocks.append(time.time())

    cdef _end_clock(self, str what):
        if self.verbose:
            elapsed = time.time() - self.clocks.pop(-1)
            print(f"{what} took: {elapsed * 1000:.2f} ms")

    def set_noconvert(self, i: int) -> None:
        self.noconvert.add(i)

    def remove_noconvert(self, i: int) -> None:
        self.noconvert.remove(i)

    def _convert_column_data(self, rows: int | None) -> dict[int, "ArrayLike"]:
        cdef:
            int64_t i
            int nused
            kh_str_starts_t *na_hashset = NULL
            int64_t start, end
            object name, na_flist, col_dtype = None
            bint na_filter = 0
            int64_t num_cols
            dict results

        start = self.parser_start

        if rows is None:
            end = self.parser.lines
        else:
            end = min(start + rows, self.parser.lines)

        num_cols = -1
        # Py_ssize_t cast prevents build warning
        for i in range(<Py_ssize_t>self.parser.lines):
            num_cols = (num_cols < self.parser.line_fields[i]) * \
                self.parser.line_fields[i] + \
                (num_cols >= self.parser.line_fields[i]) * num_cols

        usecols_not_callable_and_exists = not callable(self.usecols) and self.usecols
        names_larger_num_cols = (self.names and
                                 len(self.names) - self.leading_cols > num_cols)

        if self.table_width - self.leading_cols > num_cols:
            if (usecols_not_callable_and_exists
                    and self.table_width - self.leading_cols < len(self.usecols)
                    or names_larger_num_cols):
                raise ParserError(f"Too many columns specified: expected "
                                  f"{self.table_width - self.leading_cols} "
                                  f"and found {num_cols}")

        if (usecols_not_callable_and_exists and
                all(isinstance(u, int) for u in self.usecols)):
            missing_usecols = [col for col in self.usecols if col >= num_cols]
            if missing_usecols:
                raise ParserError(
                    "Defining usecols with out-of-bounds indices is not allowed. "
                    f"{missing_usecols} are out of bounds.",
                )

        results = {}
        nused = 0
        is_default_dict_dtype = isinstance(self.dtype, defaultdict)

        for i in range(self.table_width):
            if i < self.leading_cols:
                # Pass through leading columns always
                name = i
            elif (self.usecols and not callable(self.usecols) and
                    nused == len(self.usecols)):
                # Once we've gathered all requested columns, stop. GH5766
                break
            else:
                name = self._get_column_name(i, nused)
                usecols = set()
                if callable(self.usecols):
                    if self.usecols(name):
                        usecols = {i}
                else:
                    usecols = self.usecols
                if self.has_usecols and not (i in usecols or
                                             name in usecols):
                    continue
                nused += 1

            conv = self._get_converter(i, name)

            col_dtype = None
            if self.dtype is not None:
                if isinstance(self.dtype, dict):
                    if name in self.dtype:
                        col_dtype = self.dtype[name]
                    elif i in self.dtype:
                        col_dtype = self.dtype[i]
                    elif is_default_dict_dtype:
                        col_dtype = self.dtype[name]
                else:
                    if self.dtype.names:
                        # structured array
                        col_dtype = np.dtype(self.dtype.descr[i][1])
                    else:
                        col_dtype = self.dtype

            if conv:
                if col_dtype is not None:
                    warnings.warn((f"Both a converter and dtype were specified "
                                   f"for column {name} - only the converter will "
                                   f"be used."), ParserWarning,
                                  stacklevel=find_stack_level())
                results[i] = _apply_converter(conv, self.parser, i, start, end)
                continue

            # Collect the list of NaN values associated with the column.
            # If we aren't supposed to do that, or none are collected,
            # we set `na_filter` to `0` (`1` otherwise).
            na_flist = set()

            if self.na_filter:
                na_list, na_flist = self._get_na_list(i, name)
                if na_list is None:
                    na_filter = 0
                else:
                    na_filter = 1
                    na_hashset = kset_from_list(na_list)
            else:
                na_filter = 0

            # Attempt to parse tokens and infer dtype of the column.
            # Should return as the desired dtype (inferred or specified).
            try:
                col_res, na_count = self._convert_tokens(
                    i, start, end, name, na_filter, na_hashset,
                    na_flist, col_dtype)
            finally:
                # gh-21353
                #
                # Cleanup the NaN hash that we generated
                # to avoid memory leaks.
                if na_filter:
                    self._free_na_set(na_hashset)

            # don't try to upcast EAs
            if (
                na_count > 0 and not is_extension_array_dtype(col_dtype)
                or self.dtype_backend != "numpy"
            ):
                use_dtype_backend = self.dtype_backend != "numpy" and col_dtype is None
                col_res = _maybe_upcast(
                    col_res,
                    use_dtype_backend=use_dtype_backend,
                    dtype_backend=self.dtype_backend,
                )

            if col_res is None:
                raise ParserError(f"Unable to parse column {i}")

            results[i] = col_res

        self.parser_start += end - start

        return results
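
    # The mapping returned above is keyed by column position, e.g.
    # {0: ndarray(dtype=int64), 1: ndarray(dtype=object)}.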

    # -> tuple["ArrayLike", int]:
    cdef _convert_tokens(self, Py_ssize_t i, int64_t start,
                         int64_t end, object name, bint na_filter,
                         kh_str_starts_t *na_hashset,
                         object na_flist, object col_dtype):

        if col_dtype is not None:
            col_res, na_count = self._convert_with_dtype(
                col_dtype, i, start, end, na_filter,
                1, na_hashset, na_flist)

            # Fallback on the parse (e.g. we requested int dtype,
            # but it's actually a float).
            if col_res is not None:
                return col_res, na_count

        if i in self.noconvert:
            return self._string_convert(i, start, end, na_filter, na_hashset)
        else:
            col_res = None
            for dt in self.dtype_cast_order:
                try:
                    col_res, na_count = self._convert_with_dtype(
                        dt, i, start, end, na_filter, 0, na_hashset, na_flist)
                except ValueError:
                    # This error is raised from trying to convert to uint64,
                    # and we discover that we cannot convert to any numerical
                    # dtype successfully. As a result, we leave the data
                    # column AS IS with object dtype.
                    col_res, na_count = self._convert_with_dtype(
                        np.dtype("object"), i, start, end, 0,
                        0, na_hashset, na_flist)
                except OverflowError:
                    col_res, na_count = self._convert_with_dtype(
                        np.dtype("object"), i, start, end, na_filter,
                        0, na_hashset, na_flist)

                if col_res is not None:
                    break

        # we had a fallback parse on the dtype, so now try to cast
        if col_res is not None and col_dtype is not None:
            # If col_res is bool, it might actually be a bool array mixed
            # with NaNs (see _try_bool_flex()). Usually this would be taken
            # care of using _maybe_upcast(), but if col_dtype is a floating
            # type we should just take care of that cast here.
            if col_res.dtype == np.bool_ and is_float_dtype(col_dtype):
                mask = col_res.view(np.uint8) == na_values[np.uint8]
                col_res = col_res.astype(col_dtype)
                np.putmask(col_res, mask, np.nan)
                return col_res, na_count

            # NaNs are already cast to True here, so cannot use astype
            if col_res.dtype == np.bool_ and is_integer_dtype(col_dtype):
                if na_count > 0:
                    raise ValueError(
                        f"cannot safely convert passed user dtype of "
                        f"{col_dtype} for {np.bool_} dtyped data in "
                        f"column {i} due to NA values"
                    )

            # only allow safe casts, e.g. with a nan you cannot safely
            # cast to int
            try:
                col_res = col_res.astype(col_dtype, casting="safe")
            except TypeError:

                # float -> int conversions can fail the above
                # even with no nans
                col_res_orig = col_res
                col_res = col_res.astype(col_dtype)
                if (col_res != col_res_orig).any():
                    raise ValueError(
                        f"cannot safely convert passed user dtype of "
                        f"{col_dtype} for {col_res_orig.dtype.name} dtyped data in "
                        f"column {i}")

        return col_res, na_count

    cdef _convert_with_dtype(self, object dtype, Py_ssize_t i,
                             int64_t start, int64_t end,
                             bint na_filter,
                             bint user_dtype,
                             kh_str_starts_t *na_hashset,
                             object na_flist):
        if isinstance(dtype, CategoricalDtype):
            # TODO: I suspect that _categorical_convert could be
            #  optimized when dtype is an instance of CategoricalDtype
            codes, cats, na_count = _categorical_convert(
                self.parser, i, start, end, na_filter, na_hashset)

            # Method accepts list of strings, not encoded ones.
            true_values = [x.decode() for x in self.true_values]
            array_type = dtype.construct_array_type()
            cat = array_type._from_inferred_categories(
                cats, codes, dtype, true_values=true_values)
            return cat, na_count

        elif is_extension_array_dtype(dtype):
            result, na_count = self._string_convert(i, start, end, na_filter,
                                                    na_hashset)

            array_type = dtype.construct_array_type()
            try:
                # use _from_sequence_of_strings if the class defines it
                if isinstance(dtype, BooleanDtype):
                    # xref GH 47534: BooleanArray._from_sequence_of_strings
                    #  has extra kwargs
                    true_values = [x.decode() for x in self.true_values]
                    false_values = [x.decode() for x in self.false_values]
                    result = array_type._from_sequence_of_strings(
                        result, dtype=dtype, true_values=true_values,
                        false_values=false_values)
                else:
                    result = array_type._from_sequence_of_strings(
                        result, dtype=dtype)
            except NotImplementedError:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    f"_from_sequence_of_strings in order "
                    f"to be used in parser methods")

            return result, na_count

        elif is_integer_dtype(dtype):
            try:
                result, na_count = _try_int64(self.parser, i, start,
                                              end, na_filter, na_hashset)
                if user_dtype and na_count is not None:
                    if na_count > 0:
                        raise ValueError(f"Integer column has NA values in "
                                         f"column {i}")
            except OverflowError:
                result = _try_uint64(self.parser, i, start, end,
                                     na_filter, na_hashset)
                na_count = 0

            if result is not None and dtype != "int64":
                result = result.astype(dtype)

            return result, na_count

        elif is_float_dtype(dtype):
            result, na_count = _try_double(self.parser, i, start, end,
                                           na_filter, na_hashset, na_flist)

            if result is not None and dtype != "float64":
                result = result.astype(dtype)
            return result, na_count

        elif is_bool_dtype(dtype):
            result, na_count = _try_bool_flex(self.parser, i, start, end,
                                              na_filter, na_hashset,
                                              self.true_set, self.false_set)
            if user_dtype and na_count is not None:
                if na_count > 0:
                    raise ValueError(f"Bool column has NA values in column {i}")
            return result, na_count

        elif dtype.kind == "S":
            # TODO: na handling
            width = dtype.itemsize
            if width > 0:
                result = _to_fw_string(self.parser, i, start, end, width)
                return result, 0

            # treat as a regular string parsing
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
        elif dtype.kind == "U":
            width = dtype.itemsize
            if width > 0:
                raise TypeError(f"the dtype {dtype} is not supported "
                                f"for parsing")

            # unicode variable width
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
        elif is_object_dtype(dtype):
            return self._string_convert(i, start, end, na_filter,
                                        na_hashset)
        elif is_datetime64_dtype(dtype):
            raise TypeError(f"the dtype {dtype} is not supported "
                            f"for parsing, pass this column "
                            f"using parse_dates instead")
        else:
            raise TypeError(f"the dtype {dtype} is not supported for parsing")

    # -> tuple[ndarray[object], int]
    cdef _string_convert(self, Py_ssize_t i, int64_t start, int64_t end,
                         bint na_filter, kh_str_starts_t *na_hashset):
        return _string_box_utf8(self.parser, i, start, end, na_filter,
                                na_hashset, self.encoding_errors)

    def _get_converter(self, i: int, name):
        if self.converters is None:
            return None

        if name is not None and name in self.converters:
            return self.converters[name]

        # Converter for position, if any
        return self.converters.get(i)

    cdef _get_na_list(self, Py_ssize_t i, name):
        # Note: updates self.na_values, self.na_fvalues
        if self.na_values is None:
            return None, set()

        if isinstance(self.na_values, dict):
            key = None
            values = None

            if name is not None and name in self.na_values:
                key = name
            elif i in self.na_values:
                key = i
            else:  # No na_values provided for this column.
                if self.keep_default_na:
                    return _NA_VALUES, set()

                return list(), set()

            values = self.na_values[key]
            if values is not None and not isinstance(values, list):
                values = list(values)

            fvalues = self.na_fvalues[key]
            if fvalues is not None and not isinstance(fvalues, set):
                fvalues = set(fvalues)

            return _ensure_encoded(values), fvalues
        else:
            if not isinstance(self.na_values, list):
                self.na_values = list(self.na_values)
            if not isinstance(self.na_fvalues, set):
                self.na_fvalues = set(self.na_fvalues)

            return _ensure_encoded(self.na_values), self.na_fvalues

    cdef _free_na_set(self, kh_str_starts_t *table):
        kh_destroy_str_starts(table)

    cdef _get_column_name(self, Py_ssize_t i, Py_ssize_t nused):
        cdef int64_t j
        if self.has_usecols and self.names is not None:
            if (not callable(self.usecols) and
                    len(self.names) == len(self.usecols)):
                return self.names[nused]
            else:
                return self.names[i - self.leading_cols]
        else:
            if self.header is not None:
                j = i - self.leading_cols
                # generate extra (bogus) headers if there are more columns
                # than headers. These should be strings, not integers,
                # because otherwise we might get issues with callables as
                # usecols GH#46997
                if j >= len(self.header[0]):
                    return str(j)
                elif self.has_mi_columns:
                    return tuple(header_row[j] for header_row in self.header)
                else:
                    return self.header[0][j]
            else:
                return None


# Factor out code common to TextReader.__dealloc__ and TextReader.close.
# It cannot be a class method, since calling self.close() in __dealloc__
# causes a class attribute lookup and violates best practice:
# https://cython.readthedocs.io/en/latest/src/userguide/special_methods.html#finalization-method-dealloc
cdef _close(TextReader reader):
    # also preemptively free all allocated memory
    parser_free(reader.parser)
    if reader.true_set:
        kh_destroy_str_starts(reader.true_set)
        reader.true_set = NULL
    if reader.false_set:
        kh_destroy_str_starts(reader.false_set)
        reader.false_set = NULL


cdef:
    object _true_values = [b"True", b"TRUE", b"true"]
    object _false_values = [b"False", b"FALSE", b"false"]


def _ensure_encoded(list lst):
    cdef:
        list result = []
    for x in lst:
        if isinstance(x, str):
            x = PyUnicode_AsUTF8String(x)
        elif not isinstance(x, bytes):
            x = str(x).encode("utf-8")

        result.append(x)
    return result
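
# e.g. _ensure_encoded(["NA", b"nan", 1.5]) -> [b"NA", b"nan", b"1.5"]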


# common NA values
# no longer excluding inf representations
# '1.#INF','-1.#INF', '1.#INF000000',
STR_NA_VALUES = {
    "-1.#IND",
    "1.#QNAN",
    "1.#IND",
    "-1.#QNAN",
    "#N/A N/A",
    "#N/A",
    "N/A",
    "n/a",
    "NA",
    "<NA>",
    "#NA",
    "NULL",
    "null",
    "NaN",
    "-NaN",
    "nan",
    "-nan",
    "",
    "None",
}
_NA_VALUES = _ensure_encoded(list(STR_NA_VALUES))


def _maybe_upcast(
    arr, use_dtype_backend: bool = False, dtype_backend: str = "numpy"
):
    """Sets nullable dtypes or upcasts if nans are present.

    If use_dtype_backend is false and NaNs are present, upcast so that the
    resulting dtype can hold the NA value. If the flag is true, we use
    nullable dtypes for every array instead.

    Parameters
    ----------
    arr: ndarray
        Numpy array that is potentially being upcast.

    use_dtype_backend: bool, default False
        If true, we cast to the associated nullable dtypes.

    dtype_backend: str, default "numpy"
        Which nullable backend to use; "pyarrow" additionally wraps the
        result in an ArrowExtensionArray.

    Returns
    -------
    The casted array.
    """
    if is_extension_array_dtype(arr.dtype):
        # TODO: the docstring says arr is an ndarray, in which case this
        #  cannot be reached. Is that incorrect?
        return arr

    na_value = na_values[arr.dtype]

    if issubclass(arr.dtype.type, np.integer):
        mask = arr == na_value

        if use_dtype_backend:
            arr = IntegerArray(arr, mask)
        else:
            arr = arr.astype(float)
            np.putmask(arr, mask, np.nan)

    elif arr.dtype == np.bool_:
        mask = arr.view(np.uint8) == na_value

        if use_dtype_backend:
            arr = BooleanArray(arr, mask)
        else:
            arr = arr.astype(object)
            np.putmask(arr, mask, np.nan)

    elif issubclass(arr.dtype.type, float) or arr.dtype.type == np.float32:
        if use_dtype_backend:
            mask = np.isnan(arr)
            arr = FloatingArray(arr, mask)

    elif arr.dtype == np.object_:
        if use_dtype_backend:
            arr = StringDtype().construct_array_type()._from_sequence(arr)

    if use_dtype_backend and dtype_backend == "pyarrow":
        import pyarrow as pa
        if isinstance(arr, IntegerArray) and arr.isna().all():
            # use null instead of int64 in pyarrow
            arr = arr.to_numpy()
        arr = ArrowExtensionArray(pa.array(arr, from_pandas=True))

    return arr


# ----------------------------------------------------------------------
# Type conversions / inference support code


# -> tuple[ndarray[object], int]
cdef _string_box_utf8(parser_t *parser, int64_t col,
                      int64_t line_start, int64_t line_end,
                      bint na_filter, kh_str_starts_t *na_hashset,
                      const char *encoding_errors):
    cdef:
        int na_count = 0
        Py_ssize_t i, lines
        coliter_t it
        const char *word = NULL
        ndarray[object] result
        int ret = 0
        kh_strbox_t *table

        object pyval

        object NA = na_values[np.object_]
        khiter_t k

    table = kh_init_strbox()
    lines = line_end - line_start
    result = np.empty(lines, dtype=np.object_)

    coliter_setup(&it, parser, col, line_start)

    for i in range(lines):
        COLITER_NEXT(it, word)

        if na_filter:
            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count += 1
                result[i] = NA
                continue

        k = kh_get_strbox(table, word)

        # in the hash table
        if k != table.n_buckets:
            # this increments the refcount, but need to test
            pyval = <object>table.vals[k]
        else:
            # box it. new ref?
            pyval = PyUnicode_Decode(word, strlen(word), "utf-8",
                                     encoding_errors)

            k = kh_put_strbox(table, word, &ret)
            table.vals[k] = <PyObject *>pyval

        result[i] = pyval

    kh_destroy_strbox(table)

    return result, na_count


@cython.boundscheck(False)
cdef _categorical_convert(parser_t *parser, int64_t col,
                          int64_t line_start, int64_t line_end,
                          bint na_filter, kh_str_starts_t *na_hashset):
    "Convert column data into codes, categories"
    cdef:
        int na_count = 0
        Py_ssize_t i, lines
        coliter_t it
        const char *word = NULL

        int64_t NA = -1
        int64_t[::1] codes
        int64_t current_category = 0

        int ret = 0
        kh_str_t *table
        khiter_t k

    lines = line_end - line_start
    codes = np.empty(lines, dtype=np.int64)

    # factorize parsed values, creating a hash table
    # bytes -> category code
    with nogil:
        table = kh_init_str()
        coliter_setup(&it, parser, col, line_start)

        for i in range(lines):
            COLITER_NEXT(it, word)

            if na_filter:
                if kh_get_str_starts_item(na_hashset, word):
                    # is in NA values
                    na_count += 1
                    codes[i] = NA
                    continue

            k = kh_get_str(table, word)
            # not in the hash table
            if k == table.n_buckets:
                k = kh_put_str(table, word, &ret)
                table.vals[k] = current_category
                current_category += 1

            codes[i] = table.vals[k]

    # parse and box categories to python strings
    result = np.empty(table.n_occupied, dtype=np.object_)
    for k in range(table.n_buckets):
        if kh_exist_str(table, k):
            result[table.vals[k]] = PyUnicode_FromString(table.keys[k])

    kh_destroy_str(table)
    return np.asarray(codes), result, na_count
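
# e.g. a column ["a", "b", "a"] factorizes to codes [0, 1, 0] with
# categories ["a", "b"]; NA tokens get the sentinel code -1.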


# -> ndarray[f'|S{width}']
cdef _to_fw_string(parser_t *parser, int64_t col, int64_t line_start,
                   int64_t line_end, int64_t width):
    cdef:
        char *data
        ndarray result

    result = np.empty(line_end - line_start, dtype=f"|S{width}")
    data = <char*>result.data

    with nogil:
        _to_fw_string_nogil(parser, col, line_start, line_end, width, data)

    return result


cdef void _to_fw_string_nogil(parser_t *parser, int64_t col,
                              int64_t line_start, int64_t line_end,
                              size_t width, char *data) nogil:
    cdef:
        int64_t i
        coliter_t it
        const char *word = NULL

    coliter_setup(&it, parser, col, line_start)

    for i in range(line_end - line_start):
        COLITER_NEXT(it, word)
        strncpy(data, word, width)
        data += width


cdef:
    char* cinf = b"inf"
    char* cposinf = b"+inf"
    char* cneginf = b"-inf"

    char* cinfty = b"Infinity"
    char* cposinfty = b"+Infinity"
    char* cneginfty = b"-Infinity"


# -> tuple[ndarray[float64_t], int] | tuple[None, None]
cdef _try_double(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset,
                 object na_flist):
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        float64_t *data
        float64_t NA = na_values[np.float64]
        kh_float64_t *na_fset
        ndarray[float64_t] result
        bint use_na_flist = len(na_flist) > 0

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.float64)
    data = <float64_t *>result.data
    na_fset = kset_float64_from_list(na_flist)
    with nogil:
        error = _try_double_nogil(parser, parser.double_converter,
                                  col, line_start, line_end,
                                  na_filter, na_hashset, use_na_flist,
                                  na_fset, NA, data, &na_count)

    kh_destroy_float64(na_fset)
    if error != 0:
        return None, None
    return result, na_count


cdef int _try_double_nogil(parser_t *parser,
                           float64_t (*double_converter)(
                               const char *, char **, char,
                               char, char, int, int *, int *) nogil,
                           int64_t col, int64_t line_start, int64_t line_end,
                           bint na_filter, kh_str_starts_t *na_hashset,
                           bint use_na_flist,
                           const kh_float64_t *na_flist,
                           float64_t NA, float64_t *data,
                           int *na_count) nogil:
    cdef:
        int error = 0
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL
        char *p_end
        khiter_t k64

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count[0] += 1
                data[0] = NA
            else:
                data[0] = double_converter(word, &p_end, parser.decimal,
                                           parser.sci, parser.thousands,
                                           1, &error, NULL)
                if error != 0 or p_end == word or p_end[0]:
                    # conversion failed; retry against the inf spellings
                    error = 0
                    if (strcasecmp(word, cinf) == 0 or
                            strcasecmp(word, cposinf) == 0 or
                            strcasecmp(word, cinfty) == 0 or
                            strcasecmp(word, cposinfty) == 0):
                        data[0] = INF
                    elif (strcasecmp(word, cneginf) == 0 or
                            strcasecmp(word, cneginfty) == 0):
                        data[0] = NEGINF
                    else:
                        return 1
                if use_na_flist:
                    k64 = kh_get_float64(na_flist, data[0])
                    if k64 != na_flist.n_buckets:
                        na_count[0] += 1
                        data[0] = NA
            data += 1
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            data[0] = double_converter(word, &p_end, parser.decimal,
                                       parser.sci, parser.thousands,
                                       1, &error, NULL)
            if error != 0 or p_end == word or p_end[0]:
                # conversion failed; retry against the inf spellings
                error = 0
                if (strcasecmp(word, cinf) == 0 or
                        strcasecmp(word, cposinf) == 0 or
                        strcasecmp(word, cinfty) == 0 or
                        strcasecmp(word, cposinfty) == 0):
                    data[0] = INF
                elif (strcasecmp(word, cneginf) == 0 or
                        strcasecmp(word, cneginfty) == 0):
                    data[0] = NEGINF
                else:
                    return 1
            data += 1

    return 0
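
# Illustrative sketch (hypothetical column, not from the original source):
# with na_hashset containing b"NA" and na_flist containing -999.0, a
# parsed column
#     b"1.5", b"NA", b"-Infinity", b"-999"
# would come back from _try_double as roughly
#     array([1.5, nan, -inf, nan]), na_count == 2
# since unparseable tokens are retried case-insensitively against the
# inf spellings above, and parsed values found in na_flist become NA.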


cdef _try_uint64(parser_t *parser, int64_t col,
                 int64_t line_start, int64_t line_end,
                 bint na_filter, kh_str_starts_t *na_hashset):
    cdef:
        int error
        Py_ssize_t lines
        coliter_t it
        uint64_t *data
        ndarray result
        uint_state state

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.uint64)
    data = <uint64_t *>result.data

    uint_state_init(&state)
    coliter_setup(&it, parser, col, line_start)
    with nogil:
        error = _try_uint64_nogil(parser, col, line_start, line_end,
                                  na_filter, na_hashset, data, &state)
    if error != 0:
        if error == ERROR_OVERFLOW:
            # Can't get the word variable
            raise OverflowError("Overflow")
        return None

    if uint64_conflict(&state):
        raise ValueError("Cannot convert to numerical dtype")

    if state.seen_sint:
        raise OverflowError("Overflow")

    return result
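
# Illustrative sketch (hypothetical values, not from the original source)
# of why the uint_state checks above exist: a column containing both
# b"-1" (sets state.seen_sint) and b"18446744073709551615" (sets
# state.seen_uint, since it exceeds INT64_MAX) fits neither int64 nor
# uint64, so uint64_conflict reports it and a ValueError is raised; a
# column with only negatives instead trips the seen_sint OverflowError,
# signalling the caller to retry as int64.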


cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
                           int64_t line_start,
                           int64_t line_end, bint na_filter,
                           const kh_str_starts_t *na_hashset,
                           uint64_t *data, uint_state *state) nogil:
    cdef:
        int error
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL

    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)
            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                state.seen_null = 1
                data[i] = 0
                continue

            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
                                    &error, parser.thousands)
            if error != 0:
                return error
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            data[i] = str_to_uint64(state, word, INT64_MAX, UINT64_MAX,
                                    &error, parser.thousands)
            if error != 0:
                return error

    return 0


cdef _try_int64(parser_t *parser, int64_t col,
                int64_t line_start, int64_t line_end,
                bint na_filter, kh_str_starts_t *na_hashset):
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        coliter_t it
        int64_t *data
        ndarray result
        int64_t NA = na_values[np.int64]

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.int64)
    data = <int64_t *>result.data
    coliter_setup(&it, parser, col, line_start)
    with nogil:
        error = _try_int64_nogil(parser, col, line_start, line_end,
                                 na_filter, na_hashset, NA, data, &na_count)
    if error != 0:
        if error == ERROR_OVERFLOW:
            # Can't get the word variable
            raise OverflowError("Overflow")
        return None, None
    return result, na_count


cdef int _try_int64_nogil(parser_t *parser, int64_t col,
                          int64_t line_start,
                          int64_t line_end, bint na_filter,
                          const kh_str_starts_t *na_hashset, int64_t NA,
                          int64_t *data, int *na_count) nogil:
    cdef:
        int error
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)
            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count[0] += 1
                data[i] = NA
                continue

            data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                   &error, parser.thousands)
            if error != 0:
                return error
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)
            data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
                                   &error, parser.thousands)
            if error != 0:
                return error

    return 0


# -> tuple[ndarray[bool], int]
cdef _try_bool_flex(parser_t *parser, int64_t col,
                    int64_t line_start, int64_t line_end,
                    bint na_filter, const kh_str_starts_t *na_hashset,
                    const kh_str_starts_t *true_hashset,
                    const kh_str_starts_t *false_hashset):
    cdef:
        int error, na_count = 0
        Py_ssize_t lines
        uint8_t *data
        ndarray result
        uint8_t NA = na_values[np.bool_]

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.uint8)
    data = <uint8_t *>result.data
    with nogil:
        error = _try_bool_flex_nogil(parser, col, line_start, line_end,
                                     na_filter, na_hashset, true_hashset,
                                     false_hashset, NA, data, &na_count)
    if error != 0:
        return None, None
    return result.view(np.bool_), na_count


cdef int _try_bool_flex_nogil(parser_t *parser, int64_t col,
                              int64_t line_start,
                              int64_t line_end, bint na_filter,
                              const kh_str_starts_t *na_hashset,
                              const kh_str_starts_t *true_hashset,
                              const kh_str_starts_t *false_hashset,
                              uint8_t NA, uint8_t *data,
                              int *na_count) nogil:
    cdef:
        int error = 0
        Py_ssize_t i, lines = line_end - line_start
        coliter_t it
        const char *word = NULL

    na_count[0] = 0
    coliter_setup(&it, parser, col, line_start)

    if na_filter:
        for i in range(lines):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(na_hashset, word):
                # in the hash table
                na_count[0] += 1
                data[0] = NA
                data += 1
                continue

            if kh_get_str_starts_item(true_hashset, word):
                data[0] = 1
                data += 1
                continue

            if kh_get_str_starts_item(false_hashset, word):
                data[0] = 0
                data += 1
                continue

            error = to_boolean(word, data)
            if error != 0:
                return error
            data += 1
    else:
        for i in range(lines):
            COLITER_NEXT(it, word)

            if kh_get_str_starts_item(true_hashset, word):
                data[0] = 1
                data += 1
                continue

            if kh_get_str_starts_item(false_hashset, word):
                data[0] = 0
                data += 1
                continue

            error = to_boolean(word, data)
            if error != 0:
                return error
            data += 1

    return 0
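
# Illustrative sketch (hypothetical token sets, not from the original
# source): with true_hashset = {b"TRUE", b"yes"} and
# false_hashset = {b"FALSE", b"no"}, a parsed column
#     b"yes", b"FALSE", b"no"
# maps to uint8 values 1, 0, 0, later viewed as np.bool_ by the caller.
# Tokens found in none of the sets fall through to to_boolean, which
# handles the default spellings and signals an error for anything else.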


cdef kh_str_starts_t* kset_from_list(list values) except NULL:
    # caller takes responsibility for freeing the hash table
    cdef:
        Py_ssize_t i
        kh_str_starts_t *table
        int ret = 0
        object val

    table = kh_init_str_starts()

    for i in range(len(values)):
        val = values[i]

        # None creeps in sometimes; it is not a valid value here
        if not isinstance(val, bytes):
            kh_destroy_str_starts(table)
            raise ValueError("Must be all encoded bytes")

        kh_put_str_starts_item(table, PyBytes_AsString(val), &ret)

    if table.table.n_buckets <= 128:
        # Resize the hash table so it is almost empty; this reduces
        # hash collisions on lookup and makes the "key not in table"
        # case faster.
        # Note that this trades table memory footprint for lookup speed.
        kh_resize_str_starts(table, table.table.n_buckets * 8)

    return table
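
# Illustrative sketch (hypothetical numbers, not from the original
# source): a handful of NA strings might normally occupy a table with
# n_buckets = 4; resizing to 32 buckets leaves the table mostly empty,
# so a probe for a token that is not an NA value almost always hits an
# empty bucket immediately instead of walking a collision chain.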


cdef kh_float64_t* kset_float64_from_list(values) except NULL:
    # caller takes responsibility for freeing the hash table
    cdef:
        kh_float64_t *table
        int ret = 0
        float64_t val
        object value

    table = kh_init_float64()

    for value in values:
        val = float(value)

        kh_put_float64(table, val, &ret)

    if table.n_buckets <= 128:
        # See reasoning in kset_from_list
        kh_resize_float64(table, table.n_buckets * 8)
    return table


cdef raise_parser_error(object base, parser_t *parser):
    cdef:
        object old_exc
        object exc_type
        PyObject *type
        PyObject *value
        PyObject *traceback

    if PyErr_Occurred():
        PyErr_Fetch(&type, &value, &traceback)
        Py_XDECREF(traceback)

        if value != NULL:
            old_exc = <object>value
            Py_XDECREF(value)

            # PyErr_Fetch only returned the error message in *value,
            # so the Exception class must be extracted from *type.
            if isinstance(old_exc, str):
                if type != NULL:
                    exc_type = <object>type
                else:
                    exc_type = ParserError
                Py_XDECREF(type)
                raise exc_type(old_exc)
            else:
                Py_XDECREF(type)
                raise old_exc

    message = f"{base}. C error: "
    if parser.error_msg != NULL:
        message += parser.error_msg.decode("utf-8")
    else:
        message += "no error message set"

    raise ParserError(message)


# ----------------------------------------------------------------------
# NA values
def _compute_na_values():
    int64info = np.iinfo(np.int64)
    int32info = np.iinfo(np.int32)
    int16info = np.iinfo(np.int16)
    int8info = np.iinfo(np.int8)
    uint64info = np.iinfo(np.uint64)
    uint32info = np.iinfo(np.uint32)
    uint16info = np.iinfo(np.uint16)
    uint8info = np.iinfo(np.uint8)
    na_values = {
        np.float32: np.nan,
        np.float64: np.nan,
        np.int64: int64info.min,
        np.int32: int32info.min,
        np.int16: int16info.min,
        np.int8: int8info.min,
        np.uint64: uint64info.max,
        np.uint32: uint32info.max,
        np.uint16: uint16info.max,
        np.uint8: uint8info.max,
        np.bool_: uint8info.max,
        np.object_: np.nan,
    }
    return na_values


na_values = _compute_na_values()
for k in list(na_values):
    na_values[np.dtype(k)] = na_values[k]
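
# Illustrative values of the sentinels above (properties of the numpy
# dtypes, shown here for clarity):
#     na_values[np.int64]  == -9223372036854775808   (iinfo(int64).min)
#     na_values[np.uint64] == 18446744073709551615   (iinfo(uint64).max)
#     na_values[np.bool_]  == 255                    (iinfo(uint8).max)
# The loop above also registers each entry under its np.dtype key, so
# na_values[np.dtype("int64")] resolves to the same sentinel.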


# -> ArrayLike
cdef _apply_converter(object f, parser_t *parser, int64_t col,
                      int64_t line_start, int64_t line_end):
    cdef:
        Py_ssize_t i, lines
        coliter_t it
        const char *word = NULL
        ndarray[object] result
        object val

    lines = line_end - line_start
    result = np.empty(lines, dtype=np.object_)

    coliter_setup(&it, parser, col, line_start)

    for i in range(lines):
        COLITER_NEXT(it, word)
        val = PyUnicode_FromString(word)
        result[i] = f(val)

    return lib.maybe_convert_objects(result)
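
# Illustrative sketch (hypothetical converter, not from the original
# source): with f=float, a parsed column b"1", b"2.5" becomes the object
# array [1.0, 2.5], which lib.maybe_convert_objects then tightens to a
# float64 ndarray; a converter returning mixed types stays object dtype.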


cdef list _maybe_encode(list values):
    if values is None:
        return []
    return [x.encode("utf-8") if isinstance(x, str) else x for x in values]


def sanitize_objects(ndarray[object] values, set na_values) -> int:
    """
    Convert, in place, any value contained in the set na_values to np.nan.

    Parameters
    ----------
    values : ndarray[object]
    na_values : set

    Returns
    -------
    na_count : int
    """
    cdef:
        Py_ssize_t i, n
        object val, onan
        Py_ssize_t na_count = 0
        dict memo = {}

    n = len(values)
    onan = np.nan

    for i in range(n):
        val = values[i]
        if val in na_values:
            values[i] = onan
            na_count += 1
        elif val in memo:
            # reuse the first-seen object for equal values
            values[i] = memo[val]
        else:
            memo[val] = val

    return na_count
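
# Illustrative usage (hypothetical inputs, not from the original source):
#     arr = np.array(["a", "NA", "a"], dtype=object)
#     sanitize_objects(arr, {"NA"})  # returns 1
#     # arr is now ["a", nan, "a"], mutated in place, and the two "a"
#     # entries share a single object thanks to the memo dict.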