# c_parser_wrapper.py

from __future__ import annotations

from collections import defaultdict
from typing import (
    TYPE_CHECKING,
    Hashable,
    Mapping,
    Sequence,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    parsers,
)
from pandas._typing import (
    ArrayLike,
    DtypeArg,
    DtypeObj,
    ReadCsvBuffer,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import DtypeWarning
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.common import (
    is_categorical_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.concat import (
    concat_compat,
    union_categoricals,
)

from pandas.core.indexes.api import ensure_index_from_sequences

from pandas.io.common import (
    dedup_names,
    is_potential_multi_index,
)
from pandas.io.parsers.base_parser import (
    ParserBase,
    ParserError,
    is_index_col,
)

if TYPE_CHECKING:
    from pandas import (
        Index,
        MultiIndex,
    )

class CParserWrapper(ParserBase):
    low_memory: bool
    _reader: parsers.TextReader

    def __init__(self, src: ReadCsvBuffer[str], **kwds) -> None:
        super().__init__(kwds)
        self.kwds = kwds
        kwds = kwds.copy()

        self.low_memory = kwds.pop("low_memory", False)

        # #2442
        # error: Cannot determine type of 'index_col'
        kwds["allow_leading_cols"] = (
            self.index_col is not False  # type: ignore[has-type]
        )

        # GH20529, validate usecol arg before TextReader
        kwds["usecols"] = self.usecols

        # Have to pass int, would break tests using TextReader directly otherwise :(
        kwds["on_bad_lines"] = self.on_bad_lines.value

        for key in (
            "storage_options",
            "encoding",
            "memory_map",
            "compression",
        ):
            kwds.pop(key, None)

        kwds["dtype"] = ensure_dtype_objs(kwds.get("dtype", None))
        if "dtype_backend" not in kwds or kwds["dtype_backend"] is lib.no_default:
            kwds["dtype_backend"] = "numpy"
        if kwds["dtype_backend"] == "pyarrow":
            # Fail here loudly instead of in cython after reading
            import_optional_dependency("pyarrow")
        self._reader = parsers.TextReader(src, **kwds)

        self.unnamed_cols = self._reader.unnamed_cols

        # error: Cannot determine type of 'names'
        passed_names = self.names is None  # type: ignore[has-type]

        if self._reader.header is None:
            self.names = None
        else:
            # error: Cannot determine type of 'names'
            # error: Cannot determine type of 'index_names'
            (
                self.names,  # type: ignore[has-type]
                self.index_names,
                self.col_names,
                passed_names,
            ) = self._extract_multi_indexer_columns(
                self._reader.header,
                self.index_names,  # type: ignore[has-type]
                passed_names,
            )

        # error: Cannot determine type of 'names'
        if self.names is None:  # type: ignore[has-type]
            self.names = list(range(self._reader.table_width))

        # gh-9755
        #
        # need to set orig_names here first
        # so that proper indexing can be done
        # with _set_noconvert_columns
        #
        # once names has been filtered, we will
        # then set orig_names again to names
        # error: Cannot determine type of 'names'
        self.orig_names = self.names[:]  # type: ignore[has-type]

        if self.usecols:
            usecols = self._evaluate_usecols(self.usecols, self.orig_names)

            # GH 14671
            # assert for mypy, orig_names is List or None, None would error in issubset
            assert self.orig_names is not None
            if self.usecols_dtype == "string" and not set(usecols).issubset(
                self.orig_names
            ):
                self._validate_usecols_names(usecols, self.orig_names)

            # error: Cannot determine type of 'names'
            if len(self.names) > len(usecols):  # type: ignore[has-type]
                # error: Cannot determine type of 'names'
                self.names = [  # type: ignore[has-type]
                    n
                    # error: Cannot determine type of 'names'
                    for i, n in enumerate(self.names)  # type: ignore[has-type]
                    if (i in usecols or n in usecols)
                ]

            # error: Cannot determine type of 'names'
            if len(self.names) < len(usecols):  # type: ignore[has-type]
                # error: Cannot determine type of 'names'
                self._validate_usecols_names(
                    usecols,
                    self.names,  # type: ignore[has-type]
                )

        # error: Cannot determine type of 'names'
        self._validate_parse_dates_presence(self.names)  # type: ignore[has-type]
        self._set_noconvert_columns()

        # error: Cannot determine type of 'names'
        self.orig_names = self.names  # type: ignore[has-type]

        if not self._has_complex_date_col:
            # error: Cannot determine type of 'index_col'
            if self._reader.leading_cols == 0 and is_index_col(
                self.index_col  # type: ignore[has-type]
            ):
                self._name_processed = True
                (
                    index_names,
                    # error: Cannot determine type of 'names'
                    self.names,  # type: ignore[has-type]
                    self.index_col,
                ) = self._clean_index_names(
                    # error: Cannot determine type of 'names'
                    self.names,  # type: ignore[has-type]
                    # error: Cannot determine type of 'index_col'
                    self.index_col,  # type: ignore[has-type]
                )

                if self.index_names is None:
                    self.index_names = index_names

            if self._reader.header is None and not passed_names:
                assert self.index_names is not None
                self.index_names = [None] * len(self.index_names)

        self._implicit_index = self._reader.leading_cols > 0
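
    # Illustrative sketch (not part of pandas): the header wiring above is
    # what makes multi-row headers work through the C engine. With
    # header=[0, 1], _extract_multi_indexer_columns turns the two header rows
    # into MultiIndex column names, e.g.:
    #
    #   import io
    #   import pandas as pd
    #
    #   buf = io.StringIO("a,a\nx,y\n1,2\n")
    #   df = pd.read_csv(buf, header=[0, 1], engine="c")
    #   # df.columns is a MultiIndex: [("a", "x"), ("a", "y")]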

    def close(self) -> None:
        # close handles opened by C parser
        try:
            self._reader.close()
        except ValueError:
            pass

    def _set_noconvert_columns(self) -> None:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions.
        """
        assert self.orig_names is not None
        # error: Cannot determine type of 'names'

        # much faster than using orig_names.index(x) xref GH#44106
        names_dict = {x: i for i, x in enumerate(self.orig_names)}
        col_indices = [names_dict[x] for x in self.names]  # type: ignore[has-type]
        # error: Cannot determine type of 'names'
        noconvert_columns = self._set_noconvert_dtype_columns(
            col_indices,
            self.names,  # type: ignore[has-type]
        )
        for col in noconvert_columns:
            self._reader.set_noconvert(col)
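
    # Illustrative sketch (not part of pandas): columns named in parse_dates
    # are registered via set_noconvert above, so the C reader leaves their raw
    # strings alone for the date parser instead of inferring a dtype, e.g.:
    #
    #   import io
    #   import pandas as pd
    #
    #   buf = io.StringIO("d,v\n20230101,1\n20230102,2\n")
    #   df = pd.read_csv(buf, parse_dates=["d"], engine="c")
    #   # "d" becomes datetime64[ns]; without parse_dates it would parse as int64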

    def read(
        self,
        nrows: int | None = None,
    ) -> tuple[
        Index | MultiIndex | None,
        Sequence[Hashable] | MultiIndex,
        Mapping[Hashable, ArrayLike],
    ]:
        index: Index | MultiIndex | None
        column_names: Sequence[Hashable] | MultiIndex

        try:
            if self.low_memory:
                chunks = self._reader.read_low_memory(nrows)
                # destructive to chunks
                data = _concatenate_chunks(chunks)
            else:
                data = self._reader.read(nrows)
        except StopIteration:
            if self._first_chunk:
                self._first_chunk = False
                names = dedup_names(
                    self.orig_names,
                    is_potential_multi_index(self.orig_names, self.index_col),
                )
                index, columns, col_dict = self._get_empty_meta(
                    names,
                    self.index_col,
                    self.index_names,
                    dtype=self.kwds.get("dtype"),
                )
                columns = self._maybe_make_multi_index_columns(columns, self.col_names)

                if self.usecols is not None:
                    columns = self._filter_usecols(columns)

                col_dict = {k: v for k, v in col_dict.items() if k in columns}

                return index, columns, col_dict

            else:
                self.close()
                raise

        # Done with first read, next time raise StopIteration
        self._first_chunk = False

        # error: Cannot determine type of 'names'
        names = self.names  # type: ignore[has-type]

        if self._reader.leading_cols:
            if self._has_complex_date_col:
                raise NotImplementedError("file structure not yet supported")

            # implicit index, no index names
            arrays = []

            if self.index_col and self._reader.leading_cols != len(self.index_col):
                raise ParserError(
                    "Could not construct index. Requested to use "
                    f"{len(self.index_col)} number of columns, but "
                    f"{self._reader.leading_cols} left to parse."
                )

            for i in range(self._reader.leading_cols):
                if self.index_col is None:
                    values = data.pop(i)
                else:
                    values = data.pop(self.index_col[i])

                values = self._maybe_parse_dates(values, i, try_parse_dates=True)
                arrays.append(values)

            index = ensure_index_from_sequences(arrays)

            if self.usecols is not None:
                names = self._filter_usecols(names)

            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

            # rename dict keys
            data_tups = sorted(data.items())
            data = {k: v for k, (i, v) in zip(names, data_tups)}

            column_names, date_data = self._do_date_conversions(names, data)

            # maybe create a mi on the columns
            column_names = self._maybe_make_multi_index_columns(
                column_names, self.col_names
            )
        else:
            # rename dict keys
            data_tups = sorted(data.items())

            # ugh, mutation

            # assert for mypy, orig_names is List or None, None would error in list(...)
            assert self.orig_names is not None
            names = list(self.orig_names)
            names = dedup_names(names, is_potential_multi_index(names, self.index_col))

            if self.usecols is not None:
                names = self._filter_usecols(names)

            # columns as list
            alldata = [x[1] for x in data_tups]
            if self.usecols is None:
                self._check_data_length(names, alldata)

            data = {k: v for k, (i, v) in zip(names, data_tups)}

            names, date_data = self._do_date_conversions(names, data)
            index, column_names = self._make_index(date_data, alldata, names)

        return index, column_names, date_data
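
    # Illustrative sketch (not part of pandas): with low_memory=True (the
    # read_csv default for the C engine), the try-block above reads the file
    # in internal chunks and stitches them together via _concatenate_chunks.
    # Mixed-type columns can then surface a DtypeWarning, since each chunk
    # infers dtypes independently:
    #
    #   import pandas as pd
    #
    #   # A column whose early rows look numeric but whose later rows are
    #   # strings may be inferred differently per chunk on a large file;
    #   # passing dtype=... or low_memory=False avoids the ambiguity.
    #   df = pd.read_csv("big.csv", low_memory=False)  # "big.csv" is hypothetical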

    def _filter_usecols(self, names: Sequence[Hashable]) -> Sequence[Hashable]:
        # hackish
        usecols = self._evaluate_usecols(self.usecols, names)
        if usecols is not None and len(names) != len(usecols):
            names = [
                name for i, name in enumerate(names) if i in usecols or name in usecols
            ]
        return names
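
    # Illustrative sketch (not part of pandas): _evaluate_usecols resolves
    # usecols to either positions or labels, so the comprehension above keeps
    # a name when either its index or the name itself matches:
    #
    #   names = ["a", "b", "c"]
    #   usecols = {0, 2}  # positional form
    #   [n for i, n in enumerate(names) if i in usecols or n in usecols]
    #   # -> ["a", "c"]; the label form usecols = {"a", "c"} gives the same result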

    def _get_index_names(self):
        # Pull column names from the first header row and split off any names
        # consumed by index_col.
        names = list(self._reader.header[0])
        idx_names = None

        if self._reader.leading_cols == 0 and self.index_col is not None:
            (idx_names, names, self.index_col) = self._clean_index_names(
                names, self.index_col
            )

        return names, idx_names

    def _maybe_parse_dates(self, values, index: int, try_parse_dates: bool = True):
        if try_parse_dates and self._should_parse_dates(index):
            values = self._date_conv(
                values,
                col=self.index_names[index] if self.index_names is not None else None,
            )
        return values


def _concatenate_chunks(chunks: list[dict[int, ArrayLike]]) -> dict:
    """
    Concatenate chunks of data read with low_memory=True.

    The tricky part is handling Categoricals, where different chunks
    may have different inferred categories.
    """
    names = list(chunks[0].keys())
    warning_columns = []

    result: dict = {}
    for name in names:
        arrs = [chunk.pop(name) for chunk in chunks]
        # Check each arr for consistent types.
        dtypes = {a.dtype for a in arrs}
        non_cat_dtypes = {x for x in dtypes if not is_categorical_dtype(x)}

        dtype = dtypes.pop()
        if is_categorical_dtype(dtype):
            result[name] = union_categoricals(arrs, sort_categories=False)
        else:
            result[name] = concat_compat(arrs)
            if len(non_cat_dtypes) > 1 and result[name].dtype == np.dtype(object):
                warning_columns.append(str(name))

    if warning_columns:
        warning_names = ",".join(warning_columns)
        warning_message = " ".join(
            [
                f"Columns ({warning_names}) have mixed types. "
                f"Specify dtype option on import or set low_memory=False."
            ]
        )
        warnings.warn(warning_message, DtypeWarning, stacklevel=find_stack_level())
    return result
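
# Illustrative sketch (not part of pandas): union_categoricals is what lets
# chunks with different inferred categories be combined without falling back
# to object dtype:
#
#   import pandas as pd
#   from pandas.api.types import union_categoricals
#
#   a = pd.Categorical(["x", "y"])
#   b = pd.Categorical(["y", "z"])
#   union_categoricals([a, b], sort_categories=False)
#   # -> ['x', 'y', 'y', 'z'] with categories ['x', 'y', 'z']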


def ensure_dtype_objs(
    dtype: DtypeArg | dict[Hashable, DtypeArg] | None
) -> DtypeObj | dict[Hashable, DtypeObj] | None:
    """
    Ensure we have either None, a dtype object, or a dictionary mapping to
    dtype objects.
    """
    if isinstance(dtype, defaultdict):
        # "None" not callable  [misc]
        default_dtype = pandas_dtype(dtype.default_factory())  # type: ignore[misc]
        dtype_converted: defaultdict = defaultdict(lambda: default_dtype)
        for key in dtype.keys():
            dtype_converted[key] = pandas_dtype(dtype[key])
        return dtype_converted
    elif isinstance(dtype, dict):
        return {k: pandas_dtype(dtype[k]) for k in dtype}
    elif dtype is not None:
        return pandas_dtype(dtype)
    return dtype
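
# Illustrative sketch (not part of pandas): the defaultdict branch above is
# what supports per-column dtypes with a fallback via read_csv's dtype
# argument:
#
#   from collections import defaultdict
#   import io
#   import pandas as pd
#
#   dtype = defaultdict(lambda: "float64", a="int64")
#   buf = io.StringIO("a,b,c\n1,2,3\n")
#   df = pd.read_csv(buf, dtype=dtype)
#   # "a" is int64; "b" and "c" fall back to the default_factory's float64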