from __future__ import annotations

from collections import defaultdict
from copy import copy
import csv
import datetime
from enum import Enum
import itertools
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Hashable,
    Iterable,
    List,
    Mapping,
    Sequence,
    Tuple,
    cast,
    final,
    overload,
)
import warnings

import numpy as np

from pandas._libs import (
    lib,
    parsers,
)
import pandas._libs.ops as libops
from pandas._libs.parsers import STR_NA_VALUES
from pandas._libs.tslibs import parsing
from pandas._typing import (
    ArrayLike,
    DtypeArg,
    DtypeObj,
    Scalar,
)
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    ParserError,
    ParserWarning,
)
from pandas.util._exceptions import find_stack_level

from pandas.core.dtypes.astype import astype_array
from pandas.core.dtypes.common import (
    ensure_object,
    is_bool_dtype,
    is_dict_like,
    is_dtype_equal,
    is_extension_array_dtype,
    is_float_dtype,
    is_integer,
    is_integer_dtype,
    is_list_like,
    is_object_dtype,
    is_scalar,
    is_string_dtype,
    pandas_dtype,
)
from pandas.core.dtypes.dtypes import (
    CategoricalDtype,
    ExtensionDtype,
)
from pandas.core.dtypes.missing import isna

from pandas import (
    ArrowDtype,
    DatetimeIndex,
    StringDtype,
)
from pandas.core import algorithms
from pandas.core.arrays import (
    ArrowExtensionArray,
    BooleanArray,
    Categorical,
    ExtensionArray,
    FloatingArray,
    IntegerArray,
)
from pandas.core.arrays.boolean import BooleanDtype
from pandas.core.indexes.api import (
    Index,
    MultiIndex,
    default_index,
    ensure_index_from_sequences,
)
from pandas.core.series import Series
from pandas.core.tools import datetimes as tools

from pandas.io.common import is_potential_multi_index

if TYPE_CHECKING:
    from pandas import DataFrame


class ParserBase:
    class BadLineHandleMethod(Enum):
        ERROR = 0
        WARN = 1
        SKIP = 2

    _implicit_index: bool = False
    _first_chunk: bool

    def __init__(self, kwds) -> None:
        self.names = kwds.get("names")
        self.orig_names: Sequence[Hashable] | None = None

        self.index_col = kwds.get("index_col", None)
        self.unnamed_cols: set = set()
        self.index_names: Sequence[Hashable] | None = None
        self.col_names: Sequence[Hashable] | None = None

        self.parse_dates = _validate_parse_dates_arg(kwds.pop("parse_dates", False))
        self._parse_date_cols: Iterable = []
        self.date_parser = kwds.pop("date_parser", lib.no_default)
        self.date_format = kwds.pop("date_format", None)
        self.dayfirst = kwds.pop("dayfirst", False)
        self.keep_date_col = kwds.pop("keep_date_col", False)

        self.na_values = kwds.get("na_values")
        self.na_fvalues = kwds.get("na_fvalues")
        self.na_filter = kwds.get("na_filter", False)
        self.keep_default_na = kwds.get("keep_default_na", True)

        self.dtype = copy(kwds.get("dtype", None))
        self.converters = kwds.get("converters")
        self.dtype_backend = kwds.get("dtype_backend")

        self.true_values = kwds.get("true_values")
        self.false_values = kwds.get("false_values")
        self.cache_dates = kwds.pop("cache_dates", True)

        self._date_conv = _make_date_converter(
            date_parser=self.date_parser,
            date_format=self.date_format,
            dayfirst=self.dayfirst,
            cache_dates=self.cache_dates,
        )

        # validate header options for mi
        self.header = kwds.get("header")
        if is_list_like(self.header, allow_sets=False):
            if kwds.get("usecols"):
                raise ValueError(
                    "cannot specify usecols when specifying a multi-index header"
                )
            if kwds.get("names"):
                raise ValueError(
                    "cannot specify names when specifying a multi-index header"
                )

            # validate index_col that only contains integers
            if self.index_col is not None:
                if not (
                    is_list_like(self.index_col, allow_sets=False)
                    and all(map(is_integer, self.index_col))
                    or is_integer(self.index_col)
                ):
                    raise ValueError(
                        "index_col must only contain row numbers "
                        "when specifying a multi-index header"
                    )

        self._name_processed = False

        self._first_chunk = True

        self.usecols, self.usecols_dtype = self._validate_usecols_arg(kwds["usecols"])

        # Fallback to error to pass a sketchy test(test_override_set_noconvert_columns)
        # Normally, this arg would get pre-processed earlier on
        self.on_bad_lines = kwds.get("on_bad_lines", self.BadLineHandleMethod.ERROR)
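
    # Illustrative sketch (not part of the original source): the kwds consumed
    # above mirror the public read_csv keywords, so the wiring can be seen from
    # a plain call. Assumes only the documented read_csv interface.
    #
    # >>> import io
    # >>> import pandas as pd
    # >>> buf = io.StringIO("a,b\n1,2\n3,4\n")
    # >>> df = pd.read_csv(buf, index_col=0, dtype={"b": "float64"})
    # ... "index_col" and "dtype" arrive here through the kwds dict.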

    def _validate_parse_dates_presence(self, columns: Sequence[Hashable]) -> Iterable:
        """
        Check if parse_dates are in columns.

        If user has provided names for parse_dates, check if those columns
        are available.

        Parameters
        ----------
        columns : list
            List of names of the dataframe.

        Returns
        -------
        The names of the columns which will get parsed later if a dict or list
        is given as specification.

        Raises
        ------
        ValueError
            If a column named in parse_dates is not in the dataframe.
        """
        cols_needed: Iterable
        if is_dict_like(self.parse_dates):
            cols_needed = itertools.chain(*self.parse_dates.values())
        elif is_list_like(self.parse_dates):
            # a column in parse_dates could be represented
            # ColReference = Union[int, str]
            # DateGroups = List[ColReference]
            # ParseDates = Union[DateGroups, List[DateGroups],
            #     Dict[ColReference, DateGroups]]
            cols_needed = itertools.chain.from_iterable(
                col if is_list_like(col) and not isinstance(col, tuple) else [col]
                for col in self.parse_dates
            )
        else:
            cols_needed = []

        cols_needed = list(cols_needed)

        # get only columns that are references using names (str), not by index
        missing_cols = ", ".join(
            sorted(
                {
                    col
                    for col in cols_needed
                    if isinstance(col, str) and col not in columns
                }
            )
        )
        if missing_cols:
            raise ValueError(
                f"Missing column provided to 'parse_dates': '{missing_cols}'"
            )
        # Convert positions to actual column names
        return [
            col if (isinstance(col, str) or col in columns) else columns[col]
            for col in cols_needed
        ]
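
    # Illustrative sketch (not part of the original source): the three
    # parse_dates spec shapes the validator above handles, exercised through
    # the public read_csv API.
    #
    # >>> import io
    # >>> import pandas as pd
    # >>> data = "date,time,x\n2023-01-01,12:00,1\n"
    # >>> pd.read_csv(io.StringIO(data), parse_dates=["date"])  # list
    # >>> pd.read_csv(io.StringIO(data), parse_dates=[["date", "time"]])  # groups
    # >>> pd.read_csv(io.StringIO(data), parse_dates={"ts": ["date", "time"]})  # dict
    # >>> pd.read_csv(io.StringIO(data), parse_dates=["missing"])  # raises ValueError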

    def close(self) -> None:
        pass

    @final
    @property
    def _has_complex_date_col(self) -> bool:
        return isinstance(self.parse_dates, dict) or (
            isinstance(self.parse_dates, list)
            and len(self.parse_dates) > 0
            and isinstance(self.parse_dates[0], list)
        )

    @final
    def _should_parse_dates(self, i: int) -> bool:
        if isinstance(self.parse_dates, bool):
            return self.parse_dates
        else:
            if self.index_names is not None:
                name = self.index_names[i]
            else:
                name = None
            j = i if self.index_col is None else self.index_col[i]

            if is_scalar(self.parse_dates):
                return (j == self.parse_dates) or (
                    name is not None and name == self.parse_dates
                )
            else:
                return (j in self.parse_dates) or (
                    name is not None and name in self.parse_dates
                )

    @final
    def _extract_multi_indexer_columns(
        self,
        header,
        index_names: Sequence[Hashable] | None,
        passed_names: bool = False,
    ) -> tuple[
        Sequence[Hashable], Sequence[Hashable] | None, Sequence[Hashable] | None, bool
    ]:
        """
        Extract and return the names, index_names, col_names if the column
        names are a MultiIndex.

        Parameters
        ----------
        header: list of lists
            The header rows
        index_names: list, optional
            The names of the future index
        passed_names: bool, default False
            A flag specifying if names were passed
        """
        if len(header) < 2:
            return header[0], index_names, None, passed_names

        # the names are the tuples of the header that are not the index cols
        # 0 is the name of the index, assuming index_col is a list of column
        # numbers
        ic = self.index_col
        if ic is None:
            ic = []

        if not isinstance(ic, (list, tuple, np.ndarray)):
            ic = [ic]
        sic = set(ic)

        # clean the index_names
        index_names = header.pop(-1)
        index_names, _, _ = self._clean_index_names(index_names, self.index_col)

        # extract the columns
        field_count = len(header[0])

        # check if header lengths are equal
        if not all(len(header_iter) == field_count for header_iter in header[1:]):
            raise ParserError("Header rows must have an equal number of columns.")

        def extract(r):
            return tuple(r[i] for i in range(field_count) if i not in sic)

        columns = list(zip(*(extract(r) for r in header)))
        names = columns.copy()
        for single_ic in sorted(ic):
            names.insert(single_ic, single_ic)

        # Clean the column names (if we have an index_col).
        if len(ic):
            col_names = [
                r[ic[0]]
                if ((r[ic[0]] is not None) and r[ic[0]] not in self.unnamed_cols)
                else None
                for r in header
            ]
        else:
            col_names = [None] * len(header)

        passed_names = True

        return names, index_names, col_names, passed_names
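
    # Illustrative sketch (not part of the original source): a two-row header
    # that becomes MultiIndex columns, exercised through read_csv.
    #
    # >>> import io
    # >>> import pandas as pd
    # >>> data = "grp,a,a\n,x,y\n0,1,2\n"
    # >>> df = pd.read_csv(io.StringIO(data), header=[0, 1], index_col=0)
    # >>> df.columns  # MultiIndex with tuples like ("a", "x") and ("a", "y")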

    @final
    def _maybe_make_multi_index_columns(
        self,
        columns: Sequence[Hashable],
        col_names: Sequence[Hashable] | None = None,
    ) -> Sequence[Hashable] | MultiIndex:
        # possibly create a column mi here
        if is_potential_multi_index(columns):
            list_columns = cast(List[Tuple], columns)
            return MultiIndex.from_tuples(list_columns, names=col_names)
        return columns

    @final
    def _make_index(
        self, data, alldata, columns, indexnamerow: list[Scalar] | None = None
    ) -> tuple[Index | None, Sequence[Hashable] | MultiIndex]:
        index: Index | None
        if not is_index_col(self.index_col) or not self.index_col:
            index = None

        elif not self._has_complex_date_col:
            simple_index = self._get_simple_index(alldata, columns)
            index = self._agg_index(simple_index)
        elif self._has_complex_date_col:
            if not self._name_processed:
                (self.index_names, _, self.index_col) = self._clean_index_names(
                    list(columns), self.index_col
                )
                self._name_processed = True
            date_index = self._get_complex_date_index(data, columns)
            index = self._agg_index(date_index, try_parse_dates=False)

        # add names for the index
        if indexnamerow:
            coffset = len(indexnamerow) - len(columns)
            assert index is not None
            index = index.set_names(indexnamerow[:coffset])

        # maybe create a mi on the columns
        columns = self._maybe_make_multi_index_columns(columns, self.col_names)

        return index, columns

    @final
    def _get_simple_index(self, data, columns):
        def ix(col):
            if not isinstance(col, str):
                return col
            raise ValueError(f"Index {col} invalid")

        to_remove = []
        index = []
        for idx in self.index_col:
            i = ix(idx)
            to_remove.append(i)
            index.append(data[i])

        # remove index items from content and columns, don't pop in
        # loop
        for i in sorted(to_remove, reverse=True):
            data.pop(i)
            if not self._implicit_index:
                columns.pop(i)

        return index

    @final
    def _get_complex_date_index(self, data, col_names):
        def _get_name(icol):
            if isinstance(icol, str):
                return icol

            if col_names is None:
                raise ValueError(f"Must supply column order to use {icol!s} as index")

            for i, c in enumerate(col_names):
                if i == icol:
                    return c

        to_remove = []
        index = []
        for idx in self.index_col:
            name = _get_name(idx)
            to_remove.append(name)
            index.append(data[name])

        # remove index items from content and columns, don't pop in
        # loop
        for c in sorted(to_remove, reverse=True):
            data.pop(c)
            col_names.remove(c)

        return index

    def _clean_mapping(self, mapping):
        """converts col numbers to names"""
        if not isinstance(mapping, dict):
            return mapping
        clean = {}
        # for mypy
        assert self.orig_names is not None

        for col, v in mapping.items():
            if isinstance(col, int) and col not in self.orig_names:
                col = self.orig_names[col]
            clean[col] = v
        if isinstance(mapping, defaultdict):
            remaining_cols = set(self.orig_names) - set(clean.keys())
            clean.update({col: mapping[col] for col in remaining_cols})
        return clean
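
    # Illustrative sketch (not part of the original source): positional keys
    # in dtype/converters mappings resolve to column names here, so via the
    # public API:
    #
    # >>> import io
    # >>> import pandas as pd
    # >>> buf = io.StringIO("a,b\n1,2\n")
    # >>> pd.read_csv(buf, dtype={0: "float64"}).dtypes  # key 0 means column "a"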

    @final
    def _agg_index(self, index, try_parse_dates: bool = True) -> Index:
        arrays = []
        converters = self._clean_mapping(self.converters)

        for i, arr in enumerate(index):
            if try_parse_dates and self._should_parse_dates(i):
                arr = self._date_conv(
                    arr,
                    col=self.index_names[i] if self.index_names is not None else None,
                )

            if self.na_filter:
                col_na_values = self.na_values
                col_na_fvalues = self.na_fvalues
            else:
                col_na_values = set()
                col_na_fvalues = set()

            if isinstance(self.na_values, dict):
                assert self.index_names is not None
                col_name = self.index_names[i]
                if col_name is not None:
                    col_na_values, col_na_fvalues = _get_na_values(
                        col_name, self.na_values, self.na_fvalues, self.keep_default_na
                    )

            clean_dtypes = self._clean_mapping(self.dtype)

            cast_type = None
            index_converter = False
            if self.index_names is not None:
                if isinstance(clean_dtypes, dict):
                    cast_type = clean_dtypes.get(self.index_names[i], None)

                if isinstance(converters, dict):
                    index_converter = converters.get(self.index_names[i]) is not None

            try_num_bool = not (
                cast_type and is_string_dtype(cast_type) or index_converter
            )

            arr, _ = self._infer_types(
                arr, col_na_values | col_na_fvalues, cast_type is None, try_num_bool
            )
            arrays.append(arr)

        names = self.index_names
        index = ensure_index_from_sequences(arrays, names)

        return index

    @final
    def _convert_to_ndarrays(
        self,
        dct: Mapping,
        na_values,
        na_fvalues,
        verbose: bool = False,
        converters=None,
        dtypes=None,
    ):
        result = {}
        for c, values in dct.items():
            conv_f = None if converters is None else converters.get(c, None)
            if isinstance(dtypes, dict):
                cast_type = dtypes.get(c, None)
            else:
                # single dtype or None
                cast_type = dtypes

            if self.na_filter:
                col_na_values, col_na_fvalues = _get_na_values(
                    c, na_values, na_fvalues, self.keep_default_na
                )
            else:
                col_na_values, col_na_fvalues = set(), set()

            if c in self._parse_date_cols:
                # GH#26203 Do not convert columns which get converted to dates
                # but replace nans to ensure to_datetime works
                mask = algorithms.isin(values, set(col_na_values) | col_na_fvalues)
                np.putmask(values, mask, np.nan)
                result[c] = values
                continue

            if conv_f is not None:
                # conv_f applied to data before inference
                if cast_type is not None:
                    warnings.warn(
                        (
                            "Both a converter and dtype were specified "
                            f"for column {c} - only the converter will be used."
                        ),
                        ParserWarning,
                        stacklevel=find_stack_level(),
                    )

                try:
                    values = lib.map_infer(values, conv_f)
                except ValueError:
                    # error: Argument 2 to "isin" has incompatible type "List[Any]";
                    # expected "Union[Union[ExtensionArray, ndarray], Index, Series]"
                    mask = algorithms.isin(
                        values, list(na_values)  # type: ignore[arg-type]
                    ).view(np.uint8)
                    values = lib.map_infer_mask(values, conv_f, mask)

                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool=False,
                )
            else:
                is_ea = is_extension_array_dtype(cast_type)
                is_str_or_ea_dtype = is_ea or is_string_dtype(cast_type)
                # skip inference if specified dtype is object
                # or casting to an EA
                try_num_bool = not (cast_type and is_str_or_ea_dtype)

                # general type inference and conversion
                cvals, na_count = self._infer_types(
                    values,
                    set(col_na_values) | col_na_fvalues,
                    cast_type is None,
                    try_num_bool,
                )

                # type specified in dtype param or cast_type is an EA
                if cast_type and (not is_dtype_equal(cvals, cast_type) or is_ea):
                    if not is_ea and na_count > 0:
                        if is_bool_dtype(cast_type):
                            raise ValueError(
                                f"Bool column has NA values in column {c}"
                            )
                    cast_type = pandas_dtype(cast_type)
                    cvals = self._cast_types(cvals, cast_type, c)

            result[c] = cvals
            if verbose and na_count:
                print(f"Filled {na_count} NA values in column {c!s}")
        return result
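
    # Illustrative sketch (not part of the original source): the converter vs.
    # dtype precedence warned about above, seen from read_csv.
    #
    # >>> import io
    # >>> import pandas as pd
    # >>> buf = io.StringIO("a\n1\n2\n")
    # >>> pd.read_csv(buf, converters={"a": str}, dtype={"a": "int64"})
    # ... emits ParserWarning; the converter wins and "a" holds strings.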

    @final
    def _set_noconvert_dtype_columns(
        self, col_indices: list[int], names: Sequence[Hashable]
    ) -> set[int]:
        """
        Set the columns that should not undergo dtype conversions.

        Currently, any column that is involved with date parsing will not
        undergo such conversions. If usecols is specified, the positions of the
        columns not to cast are relative to usecols, not to all columns.

        Parameters
        ----------
        col_indices: The indices specifying order and positions of the columns
        names: The column names whose order corresponds to the order of
            col_indices

        Returns
        -------
        A set of integers containing the positions of the columns not to convert.
        """
        usecols: list[int] | list[str] | None
        noconvert_columns = set()
        if self.usecols_dtype == "integer":
            # A set of integers will be converted to a list in
            # the correct order every single time.
            usecols = sorted(self.usecols)
        elif callable(self.usecols) or self.usecols_dtype not in ("empty", None):
            # The names attribute should have the correct columns
            # in the proper order for indexing with parse_dates.
            usecols = col_indices
        else:
            # Usecols is empty.
            usecols = None

        def _set(x) -> int:
            if usecols is not None and is_integer(x):
                x = usecols[x]

            if not is_integer(x):
                x = col_indices[names.index(x)]

            return x

        if isinstance(self.parse_dates, list):
            for val in self.parse_dates:
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif isinstance(self.parse_dates, dict):
            for val in self.parse_dates.values():
                if isinstance(val, list):
                    for k in val:
                        noconvert_columns.add(_set(k))
                else:
                    noconvert_columns.add(_set(val))

        elif self.parse_dates:
            if isinstance(self.index_col, list):
                for k in self.index_col:
                    noconvert_columns.add(_set(k))
            elif self.index_col is not None:
                noconvert_columns.add(_set(self.index_col))

        return noconvert_columns
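
    # Illustrative sketch (not part of the original source): date columns are
    # exempt from numeric conversion, which is why this round-trips as a date.
    #
    # >>> import io
    # >>> import pandas as pd
    # >>> buf = io.StringIO("d,x\n20230101,1\n")
    # >>> pd.read_csv(buf, parse_dates=["d"]).dtypes
    # ... "d" is parsed as datetime64[ns] rather than inferred as an integer.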

    def _infer_types(
        self, values, na_values, no_dtype_specified, try_num_bool: bool = True
    ) -> tuple[ArrayLike, int]:
        """
        Infer types of values, possibly casting

        Parameters
        ----------
        values : ndarray
        na_values : set
        no_dtype_specified : bool
            True if the user did not specify a dtype for this column, in which
            case conversion to a non-default dtype backend is allowed.
        try_num_bool : bool, default True
            try to cast values to numeric (first preference) or boolean

        Returns
        -------
        converted : ndarray or ExtensionArray
        na_count : int
        """
        na_count = 0
        if issubclass(values.dtype.type, (np.number, np.bool_)):
            # If our array has numeric dtype, we don't have to check for strings in isin
            na_values = np.array([val for val in na_values if not isinstance(val, str)])
            mask = algorithms.isin(values, na_values)
            na_count = mask.astype("uint8", copy=False).sum()
            if na_count > 0:
                if is_integer_dtype(values):
                    values = values.astype(np.float64)
                np.putmask(values, mask, np.nan)
            return values, na_count

        dtype_backend = self.dtype_backend
        non_default_dtype_backend = (
            no_dtype_specified and dtype_backend is not lib.no_default
        )
        result: ArrayLike

        if try_num_bool and is_object_dtype(values.dtype):
            # exclude e.g DatetimeIndex here
            try:
                result, result_mask = lib.maybe_convert_numeric(
                    values,
                    na_values,
                    False,
                    convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]  # noqa
                )
            except (ValueError, TypeError):
                # e.g. encountering datetime string gets ValueError
                # TypeError can be raised in floatify
                na_count = parsers.sanitize_objects(values, na_values)
                result = values
            else:
                if non_default_dtype_backend:
                    if result_mask is None:
                        result_mask = np.zeros(result.shape, dtype=np.bool_)

                    if result_mask.all():
                        result = IntegerArray(
                            np.ones(result_mask.shape, dtype=np.int64), result_mask
                        )
                    elif is_integer_dtype(result):
                        result = IntegerArray(result, result_mask)
                    elif is_bool_dtype(result):
                        result = BooleanArray(result, result_mask)
                    elif is_float_dtype(result):
                        result = FloatingArray(result, result_mask)

                    na_count = result_mask.sum()
                else:
                    na_count = isna(result).sum()
        else:
            result = values
            if values.dtype == np.object_:
                na_count = parsers.sanitize_objects(values, na_values)

        if result.dtype == np.object_ and try_num_bool:
            result, bool_mask = libops.maybe_convert_bool(
                np.asarray(values),
                true_values=self.true_values,
                false_values=self.false_values,
                convert_to_masked_nullable=non_default_dtype_backend,  # type: ignore[arg-type]  # noqa
            )
            if result.dtype == np.bool_ and non_default_dtype_backend:
                if bool_mask is None:
                    bool_mask = np.zeros(result.shape, dtype=np.bool_)
                result = BooleanArray(result, bool_mask)
            elif result.dtype == np.object_ and non_default_dtype_backend:
                # read_excel sends array of datetime objects
                inferred_type = lib.infer_dtype(result)
                if inferred_type != "datetime":
                    result = StringDtype().construct_array_type()._from_sequence(values)

        if dtype_backend == "pyarrow":
            pa = import_optional_dependency("pyarrow")
            if isinstance(result, np.ndarray):
                result = ArrowExtensionArray(pa.array(result, from_pandas=True))
            else:
                # ExtensionArray
                result = ArrowExtensionArray(
                    pa.array(result.to_numpy(), from_pandas=True)
                )

        return result, na_count
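
    # Illustrative sketch (not part of the original source): type inference
    # with NA values, as it surfaces through read_csv.
    #
    # >>> import io
    # >>> import pandas as pd
    # >>> buf = io.StringIO("a,b\n1,True\n,False\n")
    # >>> pd.read_csv(buf).dtypes
    # ... "a" becomes float64 (int upcast to hold NaN), "b" becomes bool;
    # ... with dtype_backend="numpy_nullable", "a" would instead stay Int64.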

    def _cast_types(self, values: ArrayLike, cast_type: DtypeObj, column) -> ArrayLike:
        """
        Cast values to specified type

        Parameters
        ----------
        values : ndarray or ExtensionArray
        cast_type : np.dtype or ExtensionDtype
            dtype to cast values to
        column : string
            column name - used only for error reporting

        Returns
        -------
        converted : ndarray or ExtensionArray
        """
        if isinstance(cast_type, CategoricalDtype):
            known_cats = cast_type.categories is not None

            if not is_object_dtype(values.dtype) and not known_cats:
                # TODO: this is for consistency with
                # c-parser which parses all categories
                # as strings
                values = lib.ensure_string_array(
                    values, skipna=False, convert_na_value=False
                )

            cats = Index(values).unique().dropna()
            values = Categorical._from_inferred_categories(
                cats, cats.get_indexer(values), cast_type, true_values=self.true_values
            )

        # use the EA's implementation of casting
        elif isinstance(cast_type, ExtensionDtype):
            array_type = cast_type.construct_array_type()
            try:
                if isinstance(cast_type, BooleanDtype):
                    # error: Unexpected keyword argument "true_values" for
                    # "_from_sequence_of_strings" of "ExtensionArray"
                    return array_type._from_sequence_of_strings(  # type: ignore[call-arg]  # noqa:E501
                        values,
                        dtype=cast_type,
                        true_values=self.true_values,
                        false_values=self.false_values,
                    )
                else:
                    return array_type._from_sequence_of_strings(values, dtype=cast_type)
            except NotImplementedError as err:
                raise NotImplementedError(
                    f"Extension Array: {array_type} must implement "
                    "_from_sequence_of_strings in order to be used in parser methods"
                ) from err

        elif isinstance(values, ExtensionArray):
            values = values.astype(cast_type, copy=False)
        elif issubclass(cast_type.type, str):
            # TODO: why skipna=True here and False above? some tests depend
            # on it here, but nothing fails if we change it above
            # (as no tests get there as of 2022-12-06)
            values = lib.ensure_string_array(
                values, skipna=True, convert_na_value=False
            )
        else:
            try:
                values = astype_array(values, cast_type, copy=True)
            except ValueError as err:
                raise ValueError(
                    f"Unable to convert column {column} to type {cast_type}"
                ) from err
        return values
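
    # Illustrative sketch (not part of the original source): the Categorical
    # branch above is what dtype="category" reaches via read_csv.
    #
    # >>> import io
    # >>> import pandas as pd
    # >>> buf = io.StringIO("a\nx\ny\nx\n")
    # >>> pd.read_csv(buf, dtype={"a": "category"})["a"].dtype
    # ... a CategoricalDtype with categories inferred from the parsed strings.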

    @overload
    def _do_date_conversions(
        self,
        names: Index,
        data: DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, DataFrame]:
        ...

    @overload
    def _do_date_conversions(
        self,
        names: Sequence[Hashable],
        data: Mapping[Hashable, ArrayLike],
    ) -> tuple[Sequence[Hashable], Mapping[Hashable, ArrayLike]]:
        ...

    def _do_date_conversions(
        self,
        names: Sequence[Hashable] | Index,
        data: Mapping[Hashable, ArrayLike] | DataFrame,
    ) -> tuple[Sequence[Hashable] | Index, Mapping[Hashable, ArrayLike] | DataFrame]:
        # returns data, columns
        if self.parse_dates is not None:
            data, names = _process_date_conversion(
                data,
                self._date_conv,
                self.parse_dates,
                self.index_col,
                self.index_names,
                names,
                keep_date_col=self.keep_date_col,
                dtype_backend=self.dtype_backend,
            )

        return names, data

    def _check_data_length(
        self,
        columns: Sequence[Hashable],
        data: Sequence[ArrayLike],
    ) -> None:
        """Checks if length of data is equal to length of column names.

        One set of trailing commas is allowed. When self.index_col is not
        False, a length mismatch has already raised a ParserError earlier,
        so only the index_col=False case needs a warning here.

        Parameters
        ----------
        columns: list of column names
        data: list of array-likes containing the data column-wise.
        """
        if not self.index_col and len(columns) != len(data) and columns:
            empty_str = is_object_dtype(data[-1]) and data[-1] == ""
            # error: No overload variant of "__ror__" of "ndarray" matches
            # argument type "ExtensionArray"
            empty_str_or_na = empty_str | isna(data[-1])  # type: ignore[operator]
            if len(columns) == len(data) - 1 and np.all(empty_str_or_na):
                return
            warnings.warn(
                "Length of header or names does not match length of data. This leads "
                "to a loss of data with index_col=False.",
                ParserWarning,
                stacklevel=find_stack_level(),
            )
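
    # Illustrative sketch (not part of the original source): one column of
    # trailing commas is tolerated; a genuine mismatch warns.
    #
    # >>> import io
    # >>> import pandas as pd
    # >>> pd.read_csv(io.StringIO("a,b\n1,2,\n3,4,\n"), index_col=False)  # silent
    # >>> pd.read_csv(io.StringIO("a,b\n1,2,5\n"), index_col=False)
    # ... emits ParserWarning: the extra data column cannot be kept.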

    @overload
    def _evaluate_usecols(
        self,
        usecols: set[int] | Callable[[Hashable], object],
        names: Sequence[Hashable],
    ) -> set[int]:
        ...

    @overload
    def _evaluate_usecols(
        self, usecols: set[str], names: Sequence[Hashable]
    ) -> set[str]:
        ...

    def _evaluate_usecols(
        self,
        usecols: Callable[[Hashable], object] | set[str] | set[int],
        names: Sequence[Hashable],
    ) -> set[str] | set[int]:
        """
        Check whether or not the 'usecols' parameter
        is a callable. If so, enumerates the 'names'
        parameter and returns a set of indices for
        each entry in 'names' that evaluates to True.
        If not a callable, returns 'usecols'.
        """
        if callable(usecols):
            return {i for i, name in enumerate(names) if usecols(name)}
        return usecols
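
    # Illustrative sketch (not part of the original source): a callable
    # usecols resolved to positional indices, as exposed by read_csv.
    #
    # >>> import io
    # >>> import pandas as pd
    # >>> buf = io.StringIO("a,bb,c\n1,2,3\n")
    # >>> pd.read_csv(buf, usecols=lambda name: len(name) == 1).columns
    # ... selects "a" and "c"; internally the callable maps to indices {0, 2}.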

    def _validate_usecols_names(self, usecols, names):
        """
        Validates that all usecols are present in a given
        list of names. If not, raise a ValueError that
        shows what usecols are missing.

        Parameters
        ----------
        usecols : iterable of usecols
            The columns to validate are present in names.
        names : iterable of names
            The column names to check against.

        Returns
        -------
        usecols : iterable of usecols
            The `usecols` parameter if the validation succeeds.

        Raises
        ------
        ValueError : Columns were missing. Error message will list them.
        """
        missing = [c for c in usecols if c not in names]
        if len(missing) > 0:
            raise ValueError(
                f"Usecols do not match columns, columns expected but not found: "
                f"{missing}"
            )

        return usecols

    def _validate_usecols_arg(self, usecols):
        """
        Validate the 'usecols' parameter.

        Checks whether or not the 'usecols' parameter contains all integers
        (column selection by index), strings (column by name) or is a callable.
        Raises a ValueError if that is not the case.

        Parameters
        ----------
        usecols : list-like, callable, or None
            List of columns to use when parsing or a callable that can be used
            to filter a list of table columns.

        Returns
        -------
        usecols_tuple : tuple
            A tuple of (verified_usecols, usecols_dtype).

            'verified_usecols' is either a set if an array-like is passed in or
            'usecols' if a callable or None is passed in.

            'usecols_dtype' is the inferred dtype of 'usecols' if an array-like
            is passed in or None if a callable or None is passed in.
        """
        msg = (
            "'usecols' must either be list-like of all strings, all unicode, "
            "all integers or a callable."
        )
        if usecols is not None:
            if callable(usecols):
                return usecols, None

            if not is_list_like(usecols):
                # see gh-20529
                #
                # Ensure it is iterable container but not string.
                raise ValueError(msg)

            usecols_dtype = lib.infer_dtype(usecols, skipna=False)

            if usecols_dtype not in ("empty", "integer", "string"):
                raise ValueError(msg)

            usecols = set(usecols)

            return usecols, usecols_dtype
        return usecols, None

    def _clean_index_names(self, columns, index_col) -> tuple[list | None, list, list]:
        if not is_index_col(index_col):
            return None, columns, index_col

        columns = list(columns)

        # In case of no rows and multiindex columns we have to set index_names to
        # list of Nones GH#38292
        if not columns:
            return [None] * len(index_col), columns, index_col

        cp_cols = list(columns)
        index_names: list[str | int | None] = []

        # don't mutate
        index_col = list(index_col)

        for i, c in enumerate(index_col):
            if isinstance(c, str):
                index_names.append(c)
                for j, name in enumerate(cp_cols):
                    if name == c:
                        index_col[i] = j
                        columns.remove(name)
                        break
            else:
                name = cp_cols[c]
                columns.remove(name)
                index_names.append(name)

        # Only clean index names that were placeholders.
        for i, name in enumerate(index_names):
            if isinstance(name, str) and name in self.unnamed_cols:
                index_names[i] = None

        return index_names, columns, index_col

    def _get_empty_meta(
        self, columns, index_col, index_names, dtype: DtypeArg | None = None
    ):
        columns = list(columns)

        # Convert `dtype` to a defaultdict of some kind.
        # This will enable us to write `dtype[col_name]`
        # without worrying about KeyError issues later on.
        dtype_dict: defaultdict[Hashable, Any]
        if not is_dict_like(dtype):
            # if dtype == None, default will be object.
            default_dtype = dtype or object
            dtype_dict = defaultdict(lambda: default_dtype)
        else:
            dtype = cast(dict, dtype)
            dtype_dict = defaultdict(
                lambda: object,
                {columns[k] if is_integer(k) else k: v for k, v in dtype.items()},
            )

        # Even though we have no data, the "index" of the empty DataFrame
        # could for example still be an empty MultiIndex. Thus, we need to
        # check whether we have any index columns specified, via either:
        #
        # 1) index_col (column indices)
        # 2) index_names (column names)
        #
        # Both must be non-null to ensure a successful construction. Otherwise,
        # we have to create a generic empty Index.
        index: Index
        if (index_col is None or index_col is False) or index_names is None:
            index = default_index(0)
        else:
            data = [Series([], dtype=dtype_dict[name]) for name in index_names]
            index = ensure_index_from_sequences(data, names=index_names)
            index_col.sort()

            for i, n in enumerate(index_col):
                columns.pop(n - i)

        col_dict = {
            col_name: Series([], dtype=dtype_dict[col_name]) for col_name in columns
        }

        return index, columns, col_dict


def _make_date_converter(
    date_parser=lib.no_default,
    dayfirst: bool = False,
    cache_dates: bool = True,
    date_format: dict[Hashable, str] | str | None = None,
):
    if date_parser is not lib.no_default:
        warnings.warn(
            "The argument 'date_parser' is deprecated and will "
            "be removed in a future version. "
            "Please use 'date_format' instead, or read your data in as 'object' dtype "
            "and then call 'to_datetime'.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    if date_parser is not lib.no_default and date_format is not None:
        raise TypeError("Cannot use both 'date_parser' and 'date_format'")

    def unpack_if_single_element(arg):
        # NumPy 1.25 deprecation: https://github.com/numpy/numpy/pull/10615
        if isinstance(arg, np.ndarray) and arg.ndim == 1 and len(arg) == 1:
            return arg[0]
        return arg

    def converter(*date_cols, col: Hashable):
        if len(date_cols) == 1 and date_cols[0].dtype.kind in "Mm":
            return date_cols[0]

        if date_parser is lib.no_default:
            strs = parsing.concat_date_cols(date_cols)
            date_fmt = (
                date_format.get(col) if isinstance(date_format, dict) else date_format
            )

            result = tools.to_datetime(
                ensure_object(strs),
                format=date_fmt,
                utc=False,
                dayfirst=dayfirst,
                errors="ignore",
                cache=cache_dates,
            )
            if isinstance(result, DatetimeIndex):
                arr = result.to_numpy()
                arr.flags.writeable = True
                return arr
            return result._values
        else:
            try:
                result = tools.to_datetime(
                    date_parser(*(unpack_if_single_element(arg) for arg in date_cols)),
                    errors="ignore",
                    cache=cache_dates,
                )
                if isinstance(result, datetime.datetime):
                    raise Exception("scalar parser")
                return result
            except Exception:
                return tools.to_datetime(
                    parsing.try_parse_dates(
                        parsing.concat_date_cols(date_cols),
                        parser=date_parser,
                    ),
                    errors="ignore",
                )

    return converter
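
# Illustrative sketch (not part of the original source): how date_format
# flows into the converter built above, via the public read_csv API.
#
# >>> import io
# >>> import pandas as pd
# >>> buf = io.StringIO("d,x\n31/01/2023,1\n")
# >>> pd.read_csv(buf, parse_dates=["d"], date_format={"d": "%d/%m/%Y"})
# ... "d" is parsed with the per-column format instead of format guessing.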


parser_defaults = {
    "delimiter": None,
    "escapechar": None,
    "quotechar": '"',
    "quoting": csv.QUOTE_MINIMAL,
    "doublequote": True,
    "skipinitialspace": False,
    "lineterminator": None,
    "header": "infer",
    "index_col": None,
    "names": None,
    "skiprows": None,
    "skipfooter": 0,
    "nrows": None,
    "na_values": None,
    "keep_default_na": True,
    "true_values": None,
    "false_values": None,
    "converters": None,
    "dtype": None,
    "cache_dates": True,
    "thousands": None,
    "comment": None,
    "decimal": ".",
    # 'engine': 'c',
    "parse_dates": False,
    "keep_date_col": False,
    "dayfirst": False,
    "date_parser": lib.no_default,
    "date_format": None,
    "usecols": None,
    # 'iterator': False,
    "chunksize": None,
    "verbose": False,
    "encoding": None,
    "compression": None,
    "skip_blank_lines": True,
    "encoding_errors": "strict",
    "on_bad_lines": ParserBase.BadLineHandleMethod.ERROR,
    "dtype_backend": lib.no_default,
}


def _process_date_conversion(
    data_dict,
    converter: Callable,
    parse_spec,
    index_col,
    index_names,
    columns,
    keep_date_col: bool = False,
    dtype_backend=lib.no_default,
):
    def _isindex(colspec):
        return (isinstance(index_col, list) and colspec in index_col) or (
            isinstance(index_names, list) and colspec in index_names
        )

    new_cols = []
    new_data = {}

    orig_names = columns
    columns = list(columns)

    date_cols = set()

    if parse_spec is None or isinstance(parse_spec, bool):
        return data_dict, columns

    if isinstance(parse_spec, list):
        # list of column lists
        for colspec in parse_spec:
            if is_scalar(colspec) or isinstance(colspec, tuple):
                if isinstance(colspec, int) and colspec not in data_dict:
                    colspec = orig_names[colspec]
                if _isindex(colspec):
                    continue
                elif dtype_backend == "pyarrow":
                    import pyarrow as pa

                    dtype = data_dict[colspec].dtype
                    if isinstance(dtype, ArrowDtype) and (
                        pa.types.is_timestamp(dtype.pyarrow_dtype)
                        or pa.types.is_date(dtype.pyarrow_dtype)
                    ):
                        continue

                # Pyarrow engine returns Series which we need to convert to
                # numpy array before converter, it's a no-op for other parsers
                data_dict[colspec] = converter(
                    np.asarray(data_dict[colspec]), col=colspec
                )
            else:
                new_name, col, old_names = _try_convert_dates(
                    converter, colspec, data_dict, orig_names
                )
                if new_name in data_dict:
                    raise ValueError(f"New date column already in dict {new_name}")
                new_data[new_name] = col
                new_cols.append(new_name)
                date_cols.update(old_names)

    elif isinstance(parse_spec, dict):
        # dict of new name to column list
        for new_name, colspec in parse_spec.items():
            if new_name in data_dict:
                raise ValueError(f"Date column {new_name} already in dict")

            _, col, old_names = _try_convert_dates(
                converter,
                colspec,
                data_dict,
                orig_names,
                target_name=new_name,
            )

            new_data[new_name] = col

            # If original column can be converted to date we keep the converted values
            # This can only happen if values are from single column
            if len(colspec) == 1:
                new_data[colspec[0]] = col

            new_cols.append(new_name)
            date_cols.update(old_names)

    data_dict.update(new_data)
    new_cols.extend(columns)

    if not keep_date_col:
        for c in list(date_cols):
            data_dict.pop(c)
            new_cols.remove(c)

    return data_dict, new_cols
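
# Illustrative sketch (not part of the original source): combining columns
# into one date column while keeping the originals, via read_csv.
#
# >>> import io
# >>> import pandas as pd
# >>> data = "date,time,x\n2023-01-01,12:00,1\n"
# >>> pd.read_csv(
# ...     io.StringIO(data),
# ...     parse_dates={"ts": ["date", "time"]},
# ...     keep_date_col=True,
# ... ).columns
# ... yields ["ts", "date", "time", "x"]; without keep_date_col the source
# ... columns are dropped.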


def _try_convert_dates(
    parser: Callable, colspec, data_dict, columns, target_name: str | None = None
):
    colset = set(columns)
    colnames = []

    for c in colspec:
        if c in colset:
            colnames.append(c)
        elif isinstance(c, int) and c not in columns:
            colnames.append(columns[c])
        else:
            colnames.append(c)

    new_name: tuple | str
    if all(isinstance(x, tuple) for x in colnames):
        new_name = tuple(map("_".join, zip(*colnames)))
    else:
        new_name = "_".join([str(x) for x in colnames])
    to_parse = [np.asarray(data_dict[c]) for c in colnames if c in data_dict]

    new_col = parser(*to_parse, col=new_name if target_name is None else target_name)
    return new_name, new_col, colnames


def _get_na_values(col, na_values, na_fvalues, keep_default_na):
    """
    Get the NaN values for a given column.

    Parameters
    ----------
    col : str
        The name of the column.
    na_values : array-like, dict
        The object listing the NaN values as strings.
    na_fvalues : array-like, dict
        The object listing the NaN values as floats.
    keep_default_na : bool
        If `na_values` is a dict, and the column is not mapped in the
        dictionary, whether to return the default NaN values or the empty set.

    Returns
    -------
    nan_tuple : A length-two tuple composed of

        1) na_values : the string NaN values for that column.
        2) na_fvalues : the float NaN values for that column.
    """
    if isinstance(na_values, dict):
        if col in na_values:
            return na_values[col], na_fvalues[col]
        else:
            if keep_default_na:
                return STR_NA_VALUES, set()

            return set(), set()
    else:
        return na_values, na_fvalues
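
# Illustrative sketch (not part of the original source): per-column NA
# sentinels as resolved above, driven through read_csv.
#
# >>> import io
# >>> import pandas as pd
# >>> buf = io.StringIO("a,b\nmiss,miss\n")
# >>> pd.read_csv(buf, na_values={"a": ["miss"]})
# ... "a" parses as NaN; "b" keeps the literal string "miss", because only
# ... column "a" is mapped and the default NA set does not include "miss".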


def _validate_parse_dates_arg(parse_dates):
    """
    Check whether or not the 'parse_dates' parameter
    is a non-boolean scalar. Raises a TypeError if
    that is the case.
    """
    msg = (
        "Only booleans, lists, and dictionaries are accepted "
        "for the 'parse_dates' parameter"
    )

    if parse_dates is not None:
        if is_scalar(parse_dates):
            if not lib.is_bool(parse_dates):
                raise TypeError(msg)

        elif not isinstance(parse_dates, (list, dict)):
            raise TypeError(msg)

    return parse_dates


def is_index_col(col) -> bool:
    return col is not None and col is not False