# python_parser.py

from __future__ import annotations

from collections import (
    abc,
    defaultdict,
)
import csv
from io import StringIO
import re
import sys
from typing import (
    IO,
    TYPE_CHECKING,
    DefaultDict,
    Hashable,
    Iterator,
    List,
    Literal,
    Mapping,
    Sequence,
    cast,
)

import numpy as np

from pandas._libs import lib
from pandas._typing import (
    ArrayLike,
    ReadCsvBuffer,
    Scalar,
)
from pandas.errors import (
    EmptyDataError,
    ParserError,
)

from pandas.core.dtypes.common import is_integer
from pandas.core.dtypes.inference import is_dict_like

from pandas.io.common import (
    dedup_names,
    is_potential_multi_index,
)
from pandas.io.parsers.base_parser import (
    ParserBase,
    parser_defaults,
)

if TYPE_CHECKING:
    from pandas import (
        Index,
        MultiIndex,
    )

# BOM character (byte order mark)
# This exists at the beginning of a file to indicate the byte order
# of the file (stream). Unfortunately, this marker screws up parsing,
# so we need to remove it if we see it.
_BOM = "\ufeff"


class PythonParser(ParserBase):
    def __init__(self, f: ReadCsvBuffer[str] | list, **kwds) -> None:
        """
        Workhorse class for processing a nested list into a DataFrame.
        """
        super().__init__(kwds)

        self.data: Iterator[str] | None = None
        self.buf: list = []
        self.pos = 0
        self.line_pos = 0

        self.skiprows = kwds["skiprows"]

        if callable(self.skiprows):
            self.skipfunc = self.skiprows
        else:
            self.skipfunc = lambda x: x in self.skiprows

        self.skipfooter = _validate_skipfooter_arg(kwds["skipfooter"])
        self.delimiter = kwds["delimiter"]

        self.quotechar = kwds["quotechar"]
        if isinstance(self.quotechar, str):
            self.quotechar = str(self.quotechar)

        self.escapechar = kwds["escapechar"]
        self.doublequote = kwds["doublequote"]
        self.skipinitialspace = kwds["skipinitialspace"]
        self.lineterminator = kwds["lineterminator"]
        self.quoting = kwds["quoting"]
        self.skip_blank_lines = kwds["skip_blank_lines"]

        self.names_passed = kwds["names"] or None

        self.has_index_names = False
        if "has_index_names" in kwds:
            self.has_index_names = kwds["has_index_names"]

        self.verbose = kwds["verbose"]

        self.thousands = kwds["thousands"]
        self.decimal = kwds["decimal"]

        self.comment = kwds["comment"]

        # Set self.data to something that can read lines.
        if isinstance(f, list):
            # read_excel: f is a list
            self.data = cast(Iterator[str], f)
        else:
            assert hasattr(f, "readline")
            self._make_reader(f)

        # Get columns in two steps: infer from data, then
        # infer column indices from self.usecols if it is specified.
        self._col_indices: list[int] | None = None
        columns: list[list[Scalar | None]]
        (
            columns,
            self.num_original_columns,
            self.unnamed_cols,
        ) = self._infer_columns()

        # Now self.columns has the set of columns that we will process.
        # The original set is stored in self.original_columns.
        # error: Cannot determine type of 'index_names'
        (
            self.columns,
            self.index_names,
            self.col_names,
            _,
        ) = self._extract_multi_indexer_columns(
            columns,
            self.index_names,  # type: ignore[has-type]
        )

        # get popped off for index
        self.orig_names: list[Hashable] = list(self.columns)

        # needs to be cleaned up/refactored: the multiple-date-column
        # handling is turning into a real spaghetti factory
        if not self._has_complex_date_col:
            (index_names, self.orig_names, self.columns) = self._get_index_name(
                self.columns
            )
            self._name_processed = True
            if self.index_names is None:
                self.index_names = index_names

        if self._col_indices is None:
            self._col_indices = list(range(len(self.columns)))

        self._parse_date_cols = self._validate_parse_dates_presence(self.columns)
        no_thousands_columns: set[int] | None = None
        if self.parse_dates:
            no_thousands_columns = self._set_noconvert_dtype_columns(
                self._col_indices, self.columns
            )
        self._no_thousands_columns = no_thousands_columns

        if len(self.decimal) != 1:
            raise ValueError("Only length-1 decimal markers supported")

        decimal = re.escape(self.decimal)
        if self.thousands is None:
            regex = rf"^[\-\+]?[0-9]*({decimal}[0-9]*)?([0-9]?(E|e)\-?[0-9]+)?$"
        else:
            thousands = re.escape(self.thousands)
            regex = (
                rf"^[\-\+]?([0-9]+{thousands}|[0-9])*({decimal}[0-9]*)?"
                rf"([0-9]?(E|e)\-?[0-9]+)?$"
            )
        self.num = re.compile(regex)
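
        # A hedged illustration, not from the source: with the defaults
        # (decimal=".", thousands=None), self.num matches strings such as
        # "123", "-4.5", and "1e-3" but not "1,000"; with thousands=",",
        # the second pattern also matches "1,000.5".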
    def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
        sep = self.delimiter

        if sep is None or len(sep) == 1:
            if self.lineterminator:
                raise ValueError(
                    "Custom line terminators not supported in python parser (yet)"
                )

            class MyDialect(csv.Dialect):
                delimiter = self.delimiter
                quotechar = self.quotechar
                escapechar = self.escapechar
                doublequote = self.doublequote
                skipinitialspace = self.skipinitialspace
                quoting = self.quoting
                lineterminator = "\n"

            dia = MyDialect

            if sep is not None:
                dia.delimiter = sep
            else:
                # attempt to sniff the delimiter from the first valid line,
                # i.e. no comment line and not in skiprows
                line = f.readline()
                lines = self._check_comments([[line]])[0]
                while self.skipfunc(self.pos) or not lines:
                    self.pos += 1
                    line = f.readline()
                    lines = self._check_comments([[line]])[0]
                lines_str = cast(List[str], lines)

                # since `line` was a string, lines will be a list containing
                # only a single string
                line = lines_str[0]

                self.pos += 1
                self.line_pos += 1
                sniffed = csv.Sniffer().sniff(line)
                dia.delimiter = sniffed.delimiter

                # Note: encoding is irrelevant here
                line_rdr = csv.reader(StringIO(line), dialect=dia)
                self.buf.extend(list(line_rdr))

            # Note: encoding is irrelevant here
            reader = csv.reader(f, dialect=dia, strict=True)

        else:

            def _read():
                line = f.readline()
                pat = re.compile(sep)

                yield pat.split(line.strip())

                for line in f:
                    yield pat.split(line.strip())

            reader = _read()

        # error: Incompatible types in assignment (expression has type "_reader",
        # variable has type "Union[IO[Any], RawIOBase, BufferedIOBase, TextIOBase,
        # TextIOWrapper, mmap, None]")
        self.data = reader  # type: ignore[assignment]
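
        # A hedged illustration, not from the source: a multi-character or
        # regex separator such as sep=r"\s+" takes the _read() generator path
        # above, where pat.split("a  b\tc") -> ["a", "b", "c"]; note that
        # quoting is not honored on this path.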
    def read(
        self, rows: int | None = None
    ) -> tuple[
        Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
    ]:
        try:
            content = self._get_lines(rows)
        except StopIteration:
            if self._first_chunk:
                content = []
            else:
                self.close()
                raise

        # done with first read, next time raise StopIteration
        self._first_chunk = False

        columns: Sequence[Hashable] = list(self.orig_names)
        if not len(content):  # pragma: no cover
            # DataFrame with the right metadata, even though it's length 0
            # error: Cannot determine type of 'index_col'
            names = dedup_names(
                self.orig_names,
                is_potential_multi_index(
                    self.orig_names,
                    self.index_col,  # type: ignore[has-type]
                ),
            )
            # error: Cannot determine type of 'index_col'
            index, columns, col_dict = self._get_empty_meta(
                names,
                self.index_col,  # type: ignore[has-type]
                self.index_names,
                self.dtype,
            )
            conv_columns = self._maybe_make_multi_index_columns(columns, self.col_names)
            return index, conv_columns, col_dict

        # handle new style for names in index
        count_empty_content_vals = count_empty_vals(content[0])
        indexnamerow = None
        if self.has_index_names and count_empty_content_vals == len(columns):
            indexnamerow = content[0]
            content = content[1:]

        alldata = self._rows_to_cols(content)
        data, columns = self._exclude_implicit_index(alldata)

        conv_data = self._convert_data(data)
        columns, conv_data = self._do_date_conversions(columns, conv_data)

        index, result_columns = self._make_index(
            conv_data, alldata, columns, indexnamerow
        )

        return index, result_columns, conv_data

    def _exclude_implicit_index(
        self,
        alldata: list[np.ndarray],
    ) -> tuple[Mapping[Hashable, np.ndarray], Sequence[Hashable]]:
        # error: Cannot determine type of 'index_col'
        names = dedup_names(
            self.orig_names,
            is_potential_multi_index(
                self.orig_names,
                self.index_col,  # type: ignore[has-type]
            ),
        )

        offset = 0
        if self._implicit_index:
            # error: Cannot determine type of 'index_col'
            offset = len(self.index_col)  # type: ignore[has-type]

        len_alldata = len(alldata)
        self._check_data_length(names, alldata)

        return {
            name: alldata[i + offset] for i, name in enumerate(names) if i < len_alldata
        }, names

    # legacy
    def get_chunk(
        self, size: int | None = None
    ) -> tuple[
        Index | None, Sequence[Hashable] | MultiIndex, Mapping[Hashable, ArrayLike]
    ]:
        if size is None:
            # error: "PythonParser" has no attribute "chunksize"
            size = self.chunksize  # type: ignore[attr-defined]
        return self.read(rows=size)

    def _convert_data(
        self,
        data: Mapping[Hashable, np.ndarray],
    ) -> Mapping[Hashable, ArrayLike]:
        # apply converters
        clean_conv = self._clean_mapping(self.converters)
        clean_dtypes = self._clean_mapping(self.dtype)

        # Apply NA values.
        clean_na_values = {}
        clean_na_fvalues = {}

        if isinstance(self.na_values, dict):
            for col in self.na_values:
                na_value = self.na_values[col]
                na_fvalue = self.na_fvalues[col]

                if isinstance(col, int) and col not in self.orig_names:
                    col = self.orig_names[col]

                clean_na_values[col] = na_value
                clean_na_fvalues[col] = na_fvalue
        else:
            clean_na_values = self.na_values
            clean_na_fvalues = self.na_fvalues

        return self._convert_to_ndarrays(
            data,
            clean_na_values,
            clean_na_fvalues,
            self.verbose,
            clean_conv,
            clean_dtypes,
        )
    def _infer_columns(
        self,
    ) -> tuple[list[list[Scalar | None]], int, set[Scalar | None]]:
        names = self.names
        num_original_columns = 0
        clear_buffer = True
        unnamed_cols: set[Scalar | None] = set()
        self._header_line = None

        if self.header is not None:
            header = self.header

            if isinstance(header, (list, tuple, np.ndarray)):
                have_mi_columns = len(header) > 1
                # we have multi-index columns, so read an extra line
                if have_mi_columns:
                    header = list(header) + [header[-1] + 1]
            else:
                have_mi_columns = False
                header = [header]

            columns: list[list[Scalar | None]] = []
            for level, hr in enumerate(header):
                try:
                    line = self._buffered_line()

                    while self.line_pos <= hr:
                        line = self._next_line()

                except StopIteration as err:
                    if 0 < self.line_pos <= hr and (
                        not have_mi_columns or hr != header[-1]
                    ):
                        # If no rows we want to raise a different message and if
                        # we have mi columns, the last line is not part of the header
                        joi = list(map(str, header[:-1] if have_mi_columns else header))
                        msg = f"[{','.join(joi)}], len of {len(joi)}, "
                        raise ValueError(
                            f"Passed header={msg}"
                            f"but only {self.line_pos} lines in file"
                        ) from err

                    # We have an empty file, so check
                    # if columns are provided. That will
                    # serve as the 'line' for parsing
                    if have_mi_columns and hr > 0:
                        if clear_buffer:
                            self._clear_buffer()
                        columns.append([None] * len(columns[-1]))
                        return columns, num_original_columns, unnamed_cols

                    if not self.names:
                        raise EmptyDataError("No columns to parse from file") from err

                    line = self.names[:]

                this_columns: list[Scalar | None] = []
                this_unnamed_cols = []

                for i, c in enumerate(line):
                    if c == "":
                        if have_mi_columns:
                            col_name = f"Unnamed: {i}_level_{level}"
                        else:
                            col_name = f"Unnamed: {i}"

                        this_unnamed_cols.append(i)
                        this_columns.append(col_name)
                    else:
                        this_columns.append(c)

                if not have_mi_columns:
                    counts: DefaultDict = defaultdict(int)
                    # Ensure that regular columns are used before unnamed ones
                    # to keep given names and mangle unnamed columns
                    col_loop_order = [
                        i
                        for i in range(len(this_columns))
                        if i not in this_unnamed_cols
                    ] + this_unnamed_cols

                    # TODO: Use pandas.io.common.dedup_names instead (see #50371)
                    for i in col_loop_order:
                        col = this_columns[i]
                        old_col = col
                        cur_count = counts[col]

                        if cur_count > 0:
                            while cur_count > 0:
                                counts[old_col] = cur_count + 1
                                col = f"{old_col}.{cur_count}"
                                if col in this_columns:
                                    cur_count += 1
                                else:
                                    cur_count = counts[col]

                            if (
                                self.dtype is not None
                                and is_dict_like(self.dtype)
                                and self.dtype.get(old_col) is not None
                                and self.dtype.get(col) is None
                            ):
                                self.dtype.update({col: self.dtype.get(old_col)})

                        this_columns[i] = col
                        counts[col] = cur_count + 1

                elif have_mi_columns:
                    # if we have grabbed an extra line, but it's not in our
                    # format, save it in the buffer, and create a blank extra
                    # line for the rest of the parsing code
                    if hr == header[-1]:
                        lc = len(this_columns)
                        # error: Cannot determine type of 'index_col'
                        sic = self.index_col  # type: ignore[has-type]
                        ic = len(sic) if sic is not None else 0
                        unnamed_count = len(this_unnamed_cols)

                        # if wrong number of blanks or no index, not our format
                        if (lc != unnamed_count and lc - ic > unnamed_count) or ic == 0:
                            clear_buffer = False
                            this_columns = [None] * lc
                            self.buf = [self.buf[-1]]

                columns.append(this_columns)
                unnamed_cols.update({this_columns[i] for i in this_unnamed_cols})

                if len(columns) == 1:
                    num_original_columns = len(this_columns)

            if clear_buffer:
                self._clear_buffer()

            first_line: list[Scalar] | None
            if names is not None:
                # Read first row after header to check if data are longer
                try:
                    first_line = self._next_line()
                except StopIteration:
                    first_line = None

                len_first_data_row = 0 if first_line is None else len(first_line)

                if len(names) > len(columns[0]) and len(names) > len_first_data_row:
                    raise ValueError(
                        "Number of passed names did not match "
                        "number of header fields in the file"
                    )
                if len(columns) > 1:
                    raise TypeError("Cannot pass names with multi-index columns")

                if self.usecols is not None:
                    # Set _use_cols. We don't store columns because they are
                    # overwritten.
                    self._handle_usecols(columns, names, num_original_columns)
                else:
                    num_original_columns = len(names)
                    if self._col_indices is not None and len(names) != len(
                        self._col_indices
                    ):
                        columns = [[names[i] for i in sorted(self._col_indices)]]
                    else:
                        columns = [names]
            else:
                columns = self._handle_usecols(
                    columns, columns[0], num_original_columns
                )
        else:
            try:
                line = self._buffered_line()

            except StopIteration as err:
                if not names:
                    raise EmptyDataError("No columns to parse from file") from err

                line = names[:]

            # Store line, otherwise it is lost for guessing the index
            self._header_line = line
            ncols = len(line)
            num_original_columns = ncols

            if not names:
                columns = [list(range(ncols))]
                columns = self._handle_usecols(
                    columns, columns[0], num_original_columns
                )
            else:
                if self.usecols is None or len(names) >= num_original_columns:
                    columns = self._handle_usecols([names], names, num_original_columns)
                    num_original_columns = len(names)
                else:
                    if not callable(self.usecols) and len(names) != len(self.usecols):
                        raise ValueError(
                            "Number of passed names did not match number of "
                            "header fields in the file"
                        )
                    # Ignore output but set used columns.
                    self._handle_usecols([names], names, ncols)
                    columns = [names]
                    num_original_columns = ncols

        return columns, num_original_columns, unnamed_cols
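
    # A hedged illustration, not from the source: with header=[0, 1],
    # _infer_columns reads one extra line (header becomes [0, 1, 2]) and
    # tentatively treats it as a header level, so the parser can later
    # decide whether that row holds index names rather than data.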
    def _handle_usecols(
        self,
        columns: list[list[Scalar | None]],
        usecols_key: list[Scalar | None],
        num_original_columns: int,
    ) -> list[list[Scalar | None]]:
        """
        Sets self._col_indices

        usecols_key is used if there are string usecols.
        """
        col_indices: set[int] | list[int]
        if self.usecols is not None:
            if callable(self.usecols):
                col_indices = self._evaluate_usecols(self.usecols, usecols_key)
            elif any(isinstance(u, str) for u in self.usecols):
                if len(columns) > 1:
                    raise ValueError(
                        "If using multiple headers, usecols must be integers."
                    )
                col_indices = []

                for col in self.usecols:
                    if isinstance(col, str):
                        try:
                            col_indices.append(usecols_key.index(col))
                        except ValueError:
                            self._validate_usecols_names(self.usecols, usecols_key)
                    else:
                        col_indices.append(col)
            else:
                missing_usecols = [
                    col for col in self.usecols if col >= num_original_columns
                ]
                if missing_usecols:
                    raise ParserError(
                        "Defining usecols with out-of-bounds indices is not allowed. "
                        f"{missing_usecols} are out of bounds.",
                    )
                col_indices = self.usecols

            columns = [
                [n for i, n in enumerate(column) if i in col_indices]
                for column in columns
            ]
            self._col_indices = sorted(col_indices)
        return columns
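
    # A hedged illustration, not from the source: for a header
    # ["a", "b", "c"] and usecols=["c", "a"], _handle_usecols resolves the
    # string labels to positions 2 and 0, filters each header row to
    # ["a", "c"], and stores self._col_indices == [0, 2].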
    def _buffered_line(self) -> list[Scalar]:
        """
        Return a line from buffer, filling buffer if required.
        """
        if len(self.buf) > 0:
            return self.buf[0]
        else:
            return self._next_line()

    def _check_for_bom(self, first_row: list[Scalar]) -> list[Scalar]:
        """
        Checks whether the file begins with the BOM character.
        If it does, remove it. In addition, if there is quoting
        in the field subsequent to the BOM, remove it as well
        because it technically takes place at the beginning of
        the name, not the middle of it.
        """
        # first_row will be a list, so we need to check
        # that the list is not empty before proceeding.
        if not first_row:
            return first_row

        # The first element of this row is the one that could have the
        # BOM that we want to remove. Check that the first element is a
        # string before proceeding.
        if not isinstance(first_row[0], str):
            return first_row

        # Check that the string is not empty, as that would
        # obviously not have a BOM at the start of it.
        if not first_row[0]:
            return first_row

        # Since the string is non-empty, check that it does
        # in fact begin with a BOM.
        first_elt = first_row[0][0]
        if first_elt != _BOM:
            return first_row

        first_row_bom = first_row[0]
        new_row: str

        if len(first_row_bom) > 1 and first_row_bom[1] == self.quotechar:
            start = 2
            quote = first_row_bom[1]
            end = first_row_bom[2:].index(quote) + 2

            # Extract the data between the quotation marks
            new_row = first_row_bom[start:end]

            # Extract any remaining data after the second
            # quotation mark.
            if len(first_row_bom) > end + 1:
                new_row += first_row_bom[end + 1 :]

        else:
            # No quotation so just remove BOM from first element
            new_row = first_row_bom[1:]

        new_row_list: list[Scalar] = [new_row]
        return new_row_list + first_row[1:]
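
    # A hedged illustration, not from the source: a first row of
    # ['\ufeff"name"', "col2"] comes back from _check_for_bom as
    # ["name", "col2"]; both the BOM and the quoting around the first
    # field are stripped.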
    def _is_line_empty(self, line: list[Scalar]) -> bool:
        """
        Check if a line is empty or not.

        Parameters
        ----------
        line : str, array-like
            The line of data to check.

        Returns
        -------
        boolean : Whether or not the line is empty.
        """
        return not line or all(not x for x in line)

    def _next_line(self) -> list[Scalar]:
        if isinstance(self.data, list):
            while self.skipfunc(self.pos):
                if self.pos >= len(self.data):
                    break
                self.pos += 1

            while True:
                try:
                    line = self._check_comments([self.data[self.pos]])[0]
                    self.pos += 1
                    # either uncommented or blank to begin with
                    if not self.skip_blank_lines and (
                        self._is_line_empty(self.data[self.pos - 1]) or line
                    ):
                        break
                    if self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])
                        if ret:
                            line = ret[0]
                            break
                except IndexError:
                    raise StopIteration
        else:
            while self.skipfunc(self.pos):
                self.pos += 1
                # assert for mypy, data is Iterator[str] or None, would error in next
                assert self.data is not None
                next(self.data)

            while True:
                orig_line = self._next_iter_line(row_num=self.pos + 1)
                self.pos += 1

                if orig_line is not None:
                    line = self._check_comments([orig_line])[0]

                    if self.skip_blank_lines:
                        ret = self._remove_empty_lines([line])

                        if ret:
                            line = ret[0]
                            break
                    elif self._is_line_empty(orig_line) or line:
                        break

        # This was the first line of the file,
        # which could contain the BOM at the
        # beginning of it.
        if self.pos == 1:
            line = self._check_for_bom(line)

        self.line_pos += 1
        self.buf.append(line)
        return line
    def _alert_malformed(self, msg: str, row_num: int) -> None:
        """
        Alert a user about a malformed row, depending on the value of
        the `self.on_bad_lines` enum.

        If `self.on_bad_lines` is ERROR, the alert will be `ParserError`.
        If `self.on_bad_lines` is WARN, the alert will be printed out.

        Parameters
        ----------
        msg : str
            The error message to display.
        row_num : int
            The row number where the parsing error occurred.
            Because this row number is displayed, we 1-index,
            even though we 0-index internally.
        """
        if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
            raise ParserError(msg)
        if self.on_bad_lines == self.BadLineHandleMethod.WARN:
            base = f"Skipping line {row_num}: "
            sys.stderr.write(base + msg + "\n")

    def _next_iter_line(self, row_num: int) -> list[Scalar] | None:
        """
        Wrapper around iterating through `self.data` (CSV source).

        When a CSV error is raised, we check for specific
        error messages that allow us to customize the
        error message displayed to the user.

        Parameters
        ----------
        row_num : int
            The row number of the line being parsed.
        """
        try:
            # assert for mypy, data is Iterator[str] or None, would error in next
            assert self.data is not None
            line = next(self.data)
            # for mypy
            assert isinstance(line, list)
            return line
        except csv.Error as e:
            if self.on_bad_lines in (
                self.BadLineHandleMethod.ERROR,
                self.BadLineHandleMethod.WARN,
            ):
                msg = str(e)

                if "NULL byte" in msg or "line contains NUL" in msg:
                    msg = (
                        "NULL byte detected. This byte "
                        "cannot be processed in Python's "
                        "native csv library at the moment, "
                        "so please pass in engine='c' instead"
                    )

                if self.skipfooter > 0:
                    reason = (
                        "Error could possibly be due to "
                        "parsing errors in the skipped footer rows "
                        "(the skipfooter keyword is only applied "
                        "after Python's csv library has parsed "
                        "all rows)."
                    )
                    msg += ". " + reason

                self._alert_malformed(msg, row_num)
            return None

    def _check_comments(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
        if self.comment is None:
            return lines
        ret = []
        for line in lines:
            rl = []
            for x in line:
                if (
                    not isinstance(x, str)
                    or self.comment not in x
                    or x in self.na_values
                ):
                    rl.append(x)
                else:
                    x = x[: x.find(self.comment)]
                    if len(x) > 0:
                        rl.append(x)
                    break
            ret.append(rl)
        return ret
    def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
        """
        Iterate through the lines and remove any that are
        either empty or contain only one whitespace value

        Parameters
        ----------
        lines : list of list of Scalars
            The array of lines that we are to filter.

        Returns
        -------
        filtered_lines : list of list of Scalars
            The same array of lines with the "empty" ones removed.
        """
        ret = []
        for line in lines:
            # Remove empty lines and lines with only one whitespace value
            if (
                len(line) > 1
                or len(line) == 1
                and (not isinstance(line[0], str) or line[0].strip())
            ):
                ret.append(line)
        return ret
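
    # A hedged illustration, not from the source:
    # _remove_empty_lines([["a", "b"], [""], ["  "], [0]]) returns
    # [["a", "b"], [0]]; a single-field row survives only if its field is
    # a non-string or contains non-whitespace characters.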
    def _check_thousands(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
        if self.thousands is None:
            return lines

        return self._search_replace_num_columns(
            lines=lines, search=self.thousands, replace=""
        )

    def _search_replace_num_columns(
        self, lines: list[list[Scalar]], search: str, replace: str
    ) -> list[list[Scalar]]:
        ret = []
        for line in lines:
            rl = []
            for i, x in enumerate(line):
                if (
                    not isinstance(x, str)
                    or search not in x
                    or (self._no_thousands_columns and i in self._no_thousands_columns)
                    or not self.num.search(x.strip())
                ):
                    rl.append(x)
                else:
                    rl.append(x.replace(search, replace))
            ret.append(rl)
        return ret
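
    # A hedged illustration, not from the source: with thousands="," the
    # cell "1,000" matches self.num and becomes "1000", while "a,b" fails
    # the numeric regex and is left untouched.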
    def _check_decimal(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
        if self.decimal == parser_defaults["decimal"]:
            return lines

        return self._search_replace_num_columns(
            lines=lines, search=self.decimal, replace="."
        )

    def _clear_buffer(self) -> None:
        self.buf = []

    _implicit_index = False

    def _get_index_name(
        self, columns: Sequence[Hashable]
    ) -> tuple[Sequence[Hashable] | None, list[Hashable], list[Hashable]]:
        """
        Try several cases to get lines:

        0) There are headers on row 0 and row 1 and their
           total summed lengths equal the length of the next line.
           Treat row 0 as columns and row 1 as indices
        1) Look for implicit index: there are more columns
           on row 1 than row 0. If this is true, assume that row
           1 lists index columns and row 0 lists normal columns.
        2) Get index from the columns if it was listed.
        """
        orig_names = list(columns)
        columns = list(columns)

        line: list[Scalar] | None
        if self._header_line is not None:
            line = self._header_line
        else:
            try:
                line = self._next_line()
            except StopIteration:
                line = None

        next_line: list[Scalar] | None
        try:
            next_line = self._next_line()
        except StopIteration:
            next_line = None

        # implicitly index_col=0 b/c 1 fewer column names
        implicit_first_cols = 0
        if line is not None:
            # leave it 0, #2442
            # Case 1
            # error: Cannot determine type of 'index_col'
            index_col = self.index_col  # type: ignore[has-type]
            if index_col is not False:
                implicit_first_cols = len(line) - self.num_original_columns

            # Case 0
            if (
                next_line is not None
                and self.header is not None
                and index_col is not False
            ):
                if len(next_line) == len(line) + self.num_original_columns:
                    # column and index names on diff rows
                    self.index_col = list(range(len(line)))
                    self.buf = self.buf[1:]

                    for c in reversed(line):
                        columns.insert(0, c)

                    # Update list of original names to include all indices.
                    orig_names = list(columns)
                    self.num_original_columns = len(columns)
                    return line, orig_names, columns

        if implicit_first_cols > 0:
            # Case 1
            self._implicit_index = True
            if self.index_col is None:
                self.index_col = list(range(implicit_first_cols))

            index_name = None

        else:
            # Case 2
            (index_name, _, self.index_col) = self._clean_index_names(
                columns, self.index_col
            )

        return index_name, orig_names, columns
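
    # A hedged illustration, not from the source: given a header row "a,b"
    # followed by a data row "x,1,2", the data row has one extra field, so
    # implicit_first_cols == 1 and the first column becomes the index
    # (Case 1 above).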
    def _rows_to_cols(self, content: list[list[Scalar]]) -> list[np.ndarray]:
        col_len = self.num_original_columns

        if self._implicit_index:
            col_len += len(self.index_col)

        max_len = max(len(row) for row in content)

        # Check that there are no rows with too many
        # elements in their row (rows with too few
        # elements are padded with NaN).
        # error: Non-overlapping identity check (left operand type: "List[int]",
        # right operand type: "Literal[False]")
        if (
            max_len > col_len
            and self.index_col is not False  # type: ignore[comparison-overlap]
            and self.usecols is None
        ):
            footers = self.skipfooter if self.skipfooter else 0
            bad_lines = []

            iter_content = enumerate(content)
            content_len = len(content)
            content = []

            for i, _content in iter_content:
                actual_len = len(_content)

                if actual_len > col_len:
                    if callable(self.on_bad_lines):
                        new_l = self.on_bad_lines(_content)
                        if new_l is not None:
                            content.append(new_l)
                    elif self.on_bad_lines in (
                        self.BadLineHandleMethod.ERROR,
                        self.BadLineHandleMethod.WARN,
                    ):
                        row_num = self.pos - (content_len - i + footers)
                        bad_lines.append((row_num, actual_len))

                        if self.on_bad_lines == self.BadLineHandleMethod.ERROR:
                            break
                else:
                    content.append(_content)

            for row_num, actual_len in bad_lines:
                msg = (
                    f"Expected {col_len} fields in line {row_num + 1}, saw "
                    f"{actual_len}"
                )
                if (
                    self.delimiter
                    and len(self.delimiter) > 1
                    and self.quoting != csv.QUOTE_NONE
                ):
                    # see gh-13374
                    reason = (
                        "Error could possibly be due to quotes being "
                        "ignored when a multi-char delimiter is used."
                    )
                    msg += ". " + reason
                self._alert_malformed(msg, row_num + 1)

        # see gh-13320
        zipped_content = list(lib.to_object_array(content, min_width=col_len).T)

        if self.usecols:
            assert self._col_indices is not None
            col_indices = self._col_indices

            if self._implicit_index:
                zipped_content = [
                    a
                    for i, a in enumerate(zipped_content)
                    if (
                        i < len(self.index_col)
                        or i - len(self.index_col) in col_indices
                    )
                ]
            else:
                zipped_content = [
                    a for i, a in enumerate(zipped_content) if i in col_indices
                ]
        return zipped_content
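
    # A hedged note, not from the source: lib.to_object_array pads ragged
    # rows out to at least col_len fields, so the transpose above yields
    # one equal-length object array per column.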
    def _get_lines(self, rows: int | None = None) -> list[list[Scalar]]:
        lines = self.buf
        new_rows = None

        # already fetched some number
        if rows is not None:
            # we already have the lines in the buffer
            if len(self.buf) >= rows:
                new_rows, self.buf = self.buf[:rows], self.buf[rows:]

            # need some lines
            else:
                rows -= len(self.buf)

        if new_rows is None:
            if isinstance(self.data, list):
                if self.pos > len(self.data):
                    raise StopIteration
                if rows is None:
                    new_rows = self.data[self.pos :]
                    new_pos = len(self.data)
                else:
                    new_rows = self.data[self.pos : self.pos + rows]
                    new_pos = self.pos + rows

                new_rows = self._remove_skipped_rows(new_rows)
                lines.extend(new_rows)
                self.pos = new_pos

            else:
                new_rows = []
                try:
                    if rows is not None:
                        rows_to_skip = 0
                        if self.skiprows is not None and self.pos is not None:
                            # Only read additional rows if pos is in skiprows
                            rows_to_skip = len(
                                set(self.skiprows) - set(range(self.pos))
                            )

                        for _ in range(rows + rows_to_skip):
                            # assert for mypy, data is Iterator[str] or None, would
                            # error in next
                            assert self.data is not None
                            new_rows.append(next(self.data))

                        len_new_rows = len(new_rows)
                        new_rows = self._remove_skipped_rows(new_rows)
                        lines.extend(new_rows)
                    else:
                        rows = 0

                        while True:
                            new_row = self._next_iter_line(row_num=self.pos + rows + 1)
                            rows += 1

                            if new_row is not None:
                                new_rows.append(new_row)
                        len_new_rows = len(new_rows)

                except StopIteration:
                    len_new_rows = len(new_rows)
                    new_rows = self._remove_skipped_rows(new_rows)
                    lines.extend(new_rows)
                    if len(lines) == 0:
                        raise
                self.pos += len_new_rows

                self.buf = []
        else:
            lines = new_rows

        if self.skipfooter:
            lines = lines[: -self.skipfooter]

        lines = self._check_comments(lines)
        if self.skip_blank_lines:
            lines = self._remove_empty_lines(lines)
        lines = self._check_thousands(lines)
        return self._check_decimal(lines)
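
    # A hedged note, not from the source: every batch returned by
    # _get_lines passes through the same pipeline, in order: comment
    # stripping, blank-line removal (when enabled), thousands-separator
    # removal, then decimal normalization.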
    def _remove_skipped_rows(self, new_rows: list[list[Scalar]]) -> list[list[Scalar]]:
        if self.skiprows:
            return [
                row for i, row in enumerate(new_rows) if not self.skipfunc(i + self.pos)
            ]
        return new_rows


class FixedWidthReader(abc.Iterator):
    """
    A reader of fixed-width lines.
    """

    def __init__(
        self,
        f: IO[str] | ReadCsvBuffer[str],
        colspecs: list[tuple[int, int]] | Literal["infer"],
        delimiter: str | None,
        comment: str | None,
        skiprows: set[int] | None = None,
        infer_nrows: int = 100,
    ) -> None:
        self.f = f
        self.buffer: Iterator | None = None
        self.delimiter = "\r\n" + delimiter if delimiter else "\n\r\t "
        self.comment = comment
        if colspecs == "infer":
            self.colspecs = self.detect_colspecs(
                infer_nrows=infer_nrows, skiprows=skiprows
            )
        else:
            self.colspecs = colspecs

        if not isinstance(self.colspecs, (tuple, list)):
            raise TypeError(
                "column specifications must be a list or tuple, "
                f"input was a {type(colspecs).__name__}"
            )

        for colspec in self.colspecs:
            if not (
                isinstance(colspec, (tuple, list))
                and len(colspec) == 2
                and isinstance(colspec[0], (int, np.integer, type(None)))
                and isinstance(colspec[1], (int, np.integer, type(None)))
            ):
                raise TypeError(
                    "Each column specification must be "
                    "a 2-element tuple or list of integers"
                )

    def get_rows(self, infer_nrows: int, skiprows: set[int] | None = None) -> list[str]:
        """
        Read rows from self.f, skipping as specified.

        We distinguish buffer_rows (the first <= infer_nrows
        lines) from the rows returned to detect_colspecs
        because it's simpler to leave the other locations
        with skiprows logic alone than to modify them to
        deal with the fact we skipped some rows here as
        well.

        Parameters
        ----------
        infer_nrows : int
            Number of rows to read from self.f, not counting
            rows that are skipped.
        skiprows : set, optional
            Indices of rows to skip.

        Returns
        -------
        detect_rows : list of str
            A list containing the rows to read.
        """
        if skiprows is None:
            skiprows = set()
        buffer_rows = []
        detect_rows = []
        for i, row in enumerate(self.f):
            if i not in skiprows:
                detect_rows.append(row)
            buffer_rows.append(row)
            if len(detect_rows) >= infer_nrows:
                break
        self.buffer = iter(buffer_rows)
        return detect_rows

    def detect_colspecs(
        self, infer_nrows: int = 100, skiprows: set[int] | None = None
    ) -> list[tuple[int, int]]:
        # Regex-escape the delimiters
        delimiters = "".join([rf"\{x}" for x in self.delimiter])
        pattern = re.compile(f"([^{delimiters}]+)")
        rows = self.get_rows(infer_nrows, skiprows)
        if not rows:
            raise EmptyDataError("No rows from which to infer column width")
        max_len = max(map(len, rows))
        mask = np.zeros(max_len + 1, dtype=int)
        if self.comment is not None:
            rows = [row.partition(self.comment)[0] for row in rows]
        for row in rows:
            for m in pattern.finditer(row):
                mask[m.start() : m.end()] = 1
        shifted = np.roll(mask, 1)
        shifted[0] = 0
        edges = np.where((mask ^ shifted) == 1)[0]
        edge_pairs = list(zip(edges[::2], edges[1::2]))
        return edge_pairs
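
    # A hedged illustration, not from the source: for the rows
    # ["col1  col2", "a     bb"], the non-delimiter runs cover character
    # positions 0-4 and 6-10, so detect_colspecs returns [(0, 4), (6, 10)].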
    def __next__(self) -> list[str]:
        # Argument 1 to "next" has incompatible type "Union[IO[str],
        # ReadCsvBuffer[str]]"; expected "SupportsNext[str]"
        if self.buffer is not None:
            try:
                line = next(self.buffer)
            except StopIteration:
                self.buffer = None
                line = next(self.f)  # type: ignore[arg-type]
        else:
            line = next(self.f)  # type: ignore[arg-type]
        # Note: 'colspecs' is a sequence of half-open intervals.
        return [line[from_:to].strip(self.delimiter) for (from_, to) in self.colspecs]
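
    # A hedged illustration, not from the source: with colspecs
    # [(0, 4), (6, 10)], the line "col1  col2\n" yields ["col1", "col2"];
    # each slice is half-open and the result is stripped of delimiter
    # characters.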


class FixedWidthFieldParser(PythonParser):
    """
    Specialization that converts fixed-width fields into DataFrames.
    See PythonParser for details.
    """

    def __init__(self, f: ReadCsvBuffer[str], **kwds) -> None:
        # Support iterators, convert to a list.
        self.colspecs = kwds.pop("colspecs")
        self.infer_nrows = kwds.pop("infer_nrows")
        PythonParser.__init__(self, f, **kwds)

    def _make_reader(self, f: IO[str] | ReadCsvBuffer[str]) -> None:
        self.data = FixedWidthReader(
            f,
            self.colspecs,
            self.delimiter,
            self.comment,
            self.skiprows,
            self.infer_nrows,
        )

    def _remove_empty_lines(self, lines: list[list[Scalar]]) -> list[list[Scalar]]:
        """
        Returns the list of lines without the empty ones. With fixed-width
        fields, empty lines become arrays of empty strings.

        See PythonParser._remove_empty_lines.
        """
        return [
            line
            for line in lines
            if any(not isinstance(e, str) or e.strip() for e in line)
        ]


def count_empty_vals(vals) -> int:
    return sum(1 for v in vals if v == "" or v is None)


def _validate_skipfooter_arg(skipfooter: int) -> int:
    """
    Validate the 'skipfooter' parameter.

    Checks whether 'skipfooter' is a non-negative integer.
    Raises a ValueError if that is not the case.

    Parameters
    ----------
    skipfooter : non-negative integer
        The number of rows to skip at the end of the file.

    Returns
    -------
    validated_skipfooter : non-negative integer
        The original input if the validation succeeds.

    Raises
    ------
    ValueError : 'skipfooter' was not a non-negative integer.
    """
    if not is_integer(skipfooter):
        raise ValueError("skipfooter must be an integer")

    if skipfooter < 0:
        raise ValueError("skipfooter cannot be negative")

    return skipfooter
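

# A hedged usage sketch, not part of this module: these classes back the
# pure-Python engine of the public read_csv/read_fwf API, e.g.
#
#     import pandas as pd
#
#     df = pd.read_csv("data.csv", engine="python", skipfooter=1)
#
# Options such as skipfooter and regex separators are handled only by this
# engine; the C parser rejects them.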