12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127 |
- """
- Module contains tools for processing files into DataFrames or other objects
- GH#48849 provides a convenient way of deprecating keyword arguments
- """
- from __future__ import annotations
- from collections import abc
- import csv
- import sys
- from textwrap import fill
- from types import TracebackType
- from typing import (
- IO,
- Any,
- Callable,
- Hashable,
- Literal,
- NamedTuple,
- Sequence,
- overload,
- )
- import warnings
- import numpy as np
- from pandas._libs import lib
- from pandas._libs.parsers import STR_NA_VALUES
- from pandas._typing import (
- CompressionOptions,
- CSVEngine,
- DtypeArg,
- DtypeBackend,
- FilePath,
- IndexLabel,
- ReadCsvBuffer,
- StorageOptions,
- )
- from pandas.errors import (
- AbstractMethodError,
- ParserWarning,
- )
- from pandas.util._decorators import Appender
- from pandas.util._exceptions import find_stack_level
- from pandas.util._validators import check_dtype_backend
- from pandas.core.dtypes.common import (
- is_file_like,
- is_float,
- is_integer,
- is_list_like,
- )
- from pandas.core.frame import DataFrame
- from pandas.core.indexes.api import RangeIndex
- from pandas.core.shared_docs import _shared_docs
- from pandas.io.common import (
- IOHandles,
- get_handle,
- stringify_path,
- validate_header_arg,
- )
- from pandas.io.parsers.arrow_parser_wrapper import ArrowParserWrapper
- from pandas.io.parsers.base_parser import (
- ParserBase,
- is_index_col,
- parser_defaults,
- )
- from pandas.io.parsers.c_parser_wrapper import CParserWrapper
- from pandas.io.parsers.python_parser import (
- FixedWidthFieldParser,
- PythonParser,
- )
- _doc_read_csv_and_table = (
- r"""
- {summary}
- Also supports optionally iterating or breaking of the file
- into chunks.
- Additional help can be found in the online docs for
- `IO Tools <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
- Parameters
- ----------
- filepath_or_buffer : str, path object or file-like object
- Any valid string path is acceptable. The string could be a URL. Valid
- URL schemes include http, ftp, s3, gs, and file. For file URLs, a host is
- expected. A local file could be: file://localhost/path/to/table.csv.
- If you want to pass in a path object, pandas accepts any ``os.PathLike``.
- By file-like object, we refer to objects with a ``read()`` method, such as
- a file handle (e.g. via builtin ``open`` function) or ``StringIO``.
- sep : str, default {_default_sep}
- Delimiter to use. If sep is None, the C engine cannot automatically detect
- the separator, but the Python parsing engine can, meaning the latter will
- be used and automatically detect the separator by Python's builtin sniffer
- tool, ``csv.Sniffer``. In addition, separators longer than 1 character and
- different from ``'\s+'`` will be interpreted as regular expressions and
- will also force the use of the Python parsing engine. Note that regex
- delimiters are prone to ignoring quoted data. Regex example: ``'\r\t'``.
- delimiter : str, default ``None``
- Alias for sep.
- header : int, list of int, None, default 'infer'
- Row number(s) to use as the column names, and the start of the
- data. Default behavior is to infer the column names: if no names
- are passed the behavior is identical to ``header=0`` and column
- names are inferred from the first line of the file, if column
- names are passed explicitly then the behavior is identical to
- ``header=None``. Explicitly pass ``header=0`` to be able to
- replace existing names. The header can be a list of integers that
- specify row locations for a multi-index on the columns
- e.g. [0,1,3]. Intervening rows that are not specified will be
- skipped (e.g. 2 in this example is skipped). Note that this
- parameter ignores commented lines and empty lines if
- ``skip_blank_lines=True``, so ``header=0`` denotes the first line of
- data rather than the first line of the file.
- names : array-like, optional
- List of column names to use. If the file contains a header row,
- then you should explicitly pass ``header=0`` to override the column names.
- Duplicates in this list are not allowed.
- index_col : int, str, sequence of int / str, or False, optional, default ``None``
- Column(s) to use as the row labels of the ``DataFrame``, either given as
- string name or column index. If a sequence of int / str is given, a
- MultiIndex is used.
- Note: ``index_col=False`` can be used to force pandas to *not* use the first
- column as the index, e.g. when you have a malformed file with delimiters at
- the end of each line.
- usecols : list-like or callable, optional
- Return a subset of the columns. If list-like, all elements must either
- be positional (i.e. integer indices into the document columns) or strings
- that correspond to column names provided either by the user in `names` or
- inferred from the document header row(s). If ``names`` are given, the document
- header row(s) are not taken into account. For example, a valid list-like
- `usecols` parameter would be ``[0, 1, 2]`` or ``['foo', 'bar', 'baz']``.
- Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``.
- To instantiate a DataFrame from ``data`` with element order preserved use
- ``pd.read_csv(data, usecols=['foo', 'bar'])[['foo', 'bar']]`` for columns
- in ``['foo', 'bar']`` order or
- ``pd.read_csv(data, usecols=['foo', 'bar'])[['bar', 'foo']]``
- for ``['bar', 'foo']`` order.
- If callable, the callable function will be evaluated against the column
- names, returning names where the callable function evaluates to True. An
- example of a valid callable argument would be ``lambda x: x.upper() in
- ['AAA', 'BBB', 'DDD']``. Using this parameter results in much faster
- parsing time and lower memory usage.
- dtype : Type name or dict of column -> type, optional
- Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
- 'c': 'Int64'}}
- Use `str` or `object` together with suitable `na_values` settings
- to preserve and not interpret dtype.
- If converters are specified, they will be applied INSTEAD
- of dtype conversion.
- .. versionadded:: 1.5.0
- Support for defaultdict was added. Specify a defaultdict as input where
- the default determines the dtype of the columns which are not explicitly
- listed.
- engine : {{'c', 'python', 'pyarrow'}}, optional
- Parser engine to use. The C and pyarrow engines are faster, while the python engine
- is currently more feature-complete. Multithreading is currently only supported by
- the pyarrow engine.
- .. versionadded:: 1.4.0
- The "pyarrow" engine was added as an *experimental* engine, and some features
- are unsupported, or may not work correctly, with this engine.
- converters : dict, optional
- Dict of functions for converting values in certain columns. Keys can either
- be integers or column labels.
- true_values : list, optional
- Values to consider as True in addition to case-insensitive variants of "True".
- false_values : list, optional
- Values to consider as False in addition to case-insensitive variants of "False".
- skipinitialspace : bool, default False
- Skip spaces after delimiter.
- skiprows : list-like, int or callable, optional
- Line numbers to skip (0-indexed) or number of lines to skip (int)
- at the start of the file.
- If callable, the callable function will be evaluated against the row
- indices, returning True if the row should be skipped and False otherwise.
- An example of a valid callable argument would be ``lambda x: x in [0, 2]``.
- skipfooter : int, default 0
- Number of lines at bottom of file to skip (Unsupported with engine='c').
- nrows : int, optional
- Number of rows of file to read. Useful for reading pieces of large files.
- na_values : scalar, str, list-like, or dict, optional
- Additional strings to recognize as NA/NaN. If dict passed, specific
- per-column NA values. By default the following values are interpreted as
- NaN: '"""
- + fill("', '".join(sorted(STR_NA_VALUES)), 70, subsequent_indent=" ")
- + """'.
- keep_default_na : bool, default True
- Whether or not to include the default NaN values when parsing the data.
- Depending on whether `na_values` is passed in, the behavior is as follows:
- * If `keep_default_na` is True, and `na_values` are specified, `na_values`
- is appended to the default NaN values used for parsing.
- * If `keep_default_na` is True, and `na_values` are not specified, only
- the default NaN values are used for parsing.
- * If `keep_default_na` is False, and `na_values` are specified, only
- the NaN values specified in `na_values` are used for parsing.
- * If `keep_default_na` is False, and `na_values` are not specified, no
- strings will be parsed as NaN.
- Note that if `na_filter` is passed in as False, the `keep_default_na` and
- `na_values` parameters will be ignored.
- na_filter : bool, default True
- Detect missing value markers (empty strings and the value of na_values). In
- data without any NAs, passing na_filter=False can improve the performance
- of reading a large file.
- verbose : bool, default False
- Indicate number of NA values placed in non-numeric columns.
- skip_blank_lines : bool, default True
- If True, skip over blank lines rather than interpreting as NaN values.
- parse_dates : bool or list of int or names or list of lists or dict, \
- default False
- The behavior is as follows:
- * boolean. If True -> try parsing the index.
- * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
- each as a separate date column.
- * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
- a single date column.
- * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
- result 'foo'
- If a column or index cannot be represented as an array of datetimes,
- say because of an unparsable value or a mixture of timezones, the column
- or index will be returned unaltered as an object data type. For
- non-standard datetime parsing, use ``pd.to_datetime`` after
- ``pd.read_csv``.
- Note: A fast-path exists for iso8601-formatted dates.
- infer_datetime_format : bool, default False
- If True and `parse_dates` is enabled, pandas will attempt to infer the
- format of the datetime strings in the columns, and if it can be inferred,
- switch to a faster method of parsing them. In some cases this can increase
- the parsing speed by 5-10x.
- .. deprecated:: 2.0.0
- A strict version of this argument is now the default, passing it has no effect.
- keep_date_col : bool, default False
- If True and `parse_dates` specifies combining multiple columns then
- keep the original columns.
- date_parser : function, optional
- Function to use for converting a sequence of string columns to an array of
- datetime instances. The default uses ``dateutil.parser.parser`` to do the
- conversion. Pandas will try to call `date_parser` in three different ways,
- advancing to the next if an exception occurs: 1) Pass one or more arrays
- (as defined by `parse_dates`) as arguments; 2) concatenate (row-wise) the
- string values from the columns defined by `parse_dates` into a single array
- and pass that; and 3) call `date_parser` once for each row using one or
- more strings (corresponding to the columns defined by `parse_dates`) as
- arguments.
- .. deprecated:: 2.0.0
- Use ``date_format`` instead, or read in as ``object`` and then apply
- :func:`to_datetime` as-needed.
- date_format : str or dict of column -> format, default ``None``
- If used in conjunction with ``parse_dates``, will parse dates according to this
- format. For anything more complex,
- please read in as ``object`` and then apply :func:`to_datetime` as-needed.
- .. versionadded:: 2.0.0
- dayfirst : bool, default False
- DD/MM format dates, international and European format.
- cache_dates : bool, default True
- If True, use a cache of unique, converted dates to apply the datetime
- conversion. May produce significant speed-up when parsing duplicate
- date strings, especially ones with timezone offsets.
- iterator : bool, default False
- Return TextFileReader object for iteration or getting chunks with
- ``get_chunk()``.
- .. versionchanged:: 1.2
- ``TextFileReader`` is a context manager.
- chunksize : int, optional
- Return TextFileReader object for iteration.
- See the `IO Tools docs
- <https://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
- for more information on ``iterator`` and ``chunksize``.
- .. versionchanged:: 1.2
- ``TextFileReader`` is a context manager.
- {decompression_options}
- .. versionchanged:: 1.4.0 Zstandard support.
- thousands : str, optional
- Thousands separator.
- decimal : str, default '.'
- Character to recognize as decimal point (e.g. use ',' for European data).
- lineterminator : str (length 1), optional
- Character to break file into lines. Only valid with C parser.
- quotechar : str (length 1), optional
- The character used to denote the start and end of a quoted item. Quoted
- items can include the delimiter and it will be ignored.
- quoting : int or csv.QUOTE_* instance, default 0
- Control field quoting behavior per ``csv.QUOTE_*`` constants. Use one of
- QUOTE_MINIMAL (0), QUOTE_ALL (1), QUOTE_NONNUMERIC (2) or QUOTE_NONE (3).
- doublequote : bool, default ``True``
- When quotechar is specified and quoting is not ``QUOTE_NONE``, indicate
- whether or not to interpret two consecutive quotechar elements INSIDE a
- field as a single ``quotechar`` element.
- escapechar : str (length 1), optional
- One-character string used to escape other characters.
- comment : str, optional
- Indicates remainder of line should not be parsed. If found at the beginning
- of a line, the line will be ignored altogether. This parameter must be a
- single character. Like empty lines (as long as ``skip_blank_lines=True``),
- fully commented lines are ignored by the parameter `header` but not by
- `skiprows`. For example, if ``comment='#'``, parsing
- ``#empty\\na,b,c\\n1,2,3`` with ``header=0`` will result in 'a,b,c' being
- treated as the header.
- encoding : str, optional, default "utf-8"
- Encoding to use for UTF when reading/writing (ex. 'utf-8'). `List of Python
- standard encodings
- <https://docs.python.org/3/library/codecs.html#standard-encodings>`_ .
- .. versionchanged:: 1.2
- When ``encoding`` is ``None``, ``errors="replace"`` is passed to
- ``open()``. Otherwise, ``errors="strict"`` is passed to ``open()``.
- This behavior was previously only the case for ``engine="python"``.
- .. versionchanged:: 1.3.0
- ``encoding_errors`` is a new argument. ``encoding`` has no longer an
- influence on how encoding errors are handled.
- encoding_errors : str, optional, default "strict"
- How encoding errors are treated. `List of possible values
- <https://docs.python.org/3/library/codecs.html#error-handlers>`_ .
- .. versionadded:: 1.3.0
- dialect : str or csv.Dialect, optional
- If provided, this parameter will override values (default or not) for the
- following parameters: `delimiter`, `doublequote`, `escapechar`,
- `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
- override values, a ParserWarning will be issued. See csv.Dialect
- documentation for more details.
- on_bad_lines : {{'error', 'warn', 'skip'}} or callable, default 'error'
- Specifies what to do upon encountering a bad line (a line with too many fields).
- Allowed values are :
- - 'error', raise an Exception when a bad line is encountered.
- - 'warn', raise a warning when a bad line is encountered and skip that line.
- - 'skip', skip bad lines without raising or warning when they are encountered.
- .. versionadded:: 1.3.0
- .. versionadded:: 1.4.0
- - callable, function with signature
- ``(bad_line: list[str]) -> list[str] | None`` that will process a single
- bad line. ``bad_line`` is a list of strings split by the ``sep``.
- If the function returns ``None``, the bad line will be ignored.
- If the function returns a new list of strings with more elements than
- expected, a ``ParserWarning`` will be emitted while dropping extra elements.
- Only supported when ``engine="python"``
- delim_whitespace : bool, default False
- Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
- used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
- is set to True, nothing should be passed in for the ``delimiter``
- parameter.
- low_memory : bool, default True
- Internally process the file in chunks, resulting in lower memory use
- while parsing, but possibly mixed type inference. To ensure no mixed
- types either set False, or specify the type with the `dtype` parameter.
- Note that the entire file is read into a single DataFrame regardless,
- use the `chunksize` or `iterator` parameter to return the data in chunks.
- (Only valid with C parser).
- memory_map : bool, default False
- If a filepath is provided for `filepath_or_buffer`, map the file object
- directly onto memory and access the data directly from there. Using this
- option can improve performance because there is no longer any I/O overhead.
- float_precision : str, optional
- Specifies which converter the C engine should use for floating-point
- values. The options are ``None`` or 'high' for the ordinary converter,
- 'legacy' for the original lower precision pandas converter, and
- 'round_trip' for the round-trip converter.
- .. versionchanged:: 1.2
- {storage_options}
- .. versionadded:: 1.2
- dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
- Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
- arrays, nullable dtypes are used for all dtypes that have a nullable
- implementation when "numpy_nullable" is set, pyarrow is used for all
- dtypes if "pyarrow" is set.
- The dtype_backends are still experimental.
- .. versionadded:: 2.0
- Returns
- -------
- DataFrame or TextFileReader
- A comma-separated values (csv) file is returned as two-dimensional
- data structure with labeled axes.
- See Also
- --------
- DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
- read_csv : Read a comma-separated values (csv) file into DataFrame.
- read_fwf : Read a table of fixed-width formatted lines into DataFrame.
- Examples
- --------
- >>> pd.{func_name}('data.csv') # doctest: +SKIP
- """
- )
# Default values for the options listed here; the parameter documentation in
# this module describes ``low_memory`` and ``float_precision`` as C-parser
# specific.
_c_parser_defaults = dict(
    delim_whitespace=False,
    na_filter=True,
    low_memory=True,
    memory_map=False,
    float_precision=None,
)

# Default values for the fixed-width-format options.
_fwf_defaults = dict(colspecs="infer", infer_nrows=100, widths=None)

# Option names grouped by the engine that does not support them.
_c_unsupported = {"skipfooter"}
_python_unsupported = {"float_precision", "low_memory"}
_pyarrow_unsupported = {
    "chunksize",
    "comment",
    "converters",
    "dayfirst",
    "delim_whitespace",
    "dialect",
    "float_precision",
    "iterator",
    "lineterminator",
    "low_memory",
    "memory_map",
    "nrows",
    "on_bad_lines",
    "quoting",
    "skipfooter",
    "skipinitialspace",
    "thousands",
    "verbose",
}
class _DeprecationConfig(NamedTuple):
    """
    Pair of values describing one deprecated setting.

    Presumably used for deprecating keyword arguments (the module docstring
    refers to GH#48849) -- confirm against the call sites.
    """

    # Value stored as the default for the setting.
    default_value: Any
    # Optional message text; ``None`` when there is none.
    msg: str | None
@overload
def validate_integer(name: str, val: None, min_val: int = ...) -> None:
    ...


@overload
def validate_integer(name: str, val: float, min_val: int = ...) -> int:
    ...


@overload
def validate_integer(name: str, val: int | None, min_val: int = ...) -> int | None:
    ...


def validate_integer(
    name: str, val: int | float | None, min_val: int = 0
) -> int | None:
    """
    Check that a parsing parameter is an integer, or a float that can
    safely be cast to an integer without losing accuracy, and that it is
    not smaller than ``min_val``.

    Parameters
    ----------
    name : str
        Parameter name (used for error reporting).
    val : int, float or None
        The value to check. ``None`` is passed through unchanged.
    min_val : int, default 0
        Minimum allowed value (``val < min_val`` results in a ValueError).

    Returns
    -------
    int or None
        ``val`` converted to a builtin ``int``, or ``None`` if ``val`` is None.

    Raises
    ------
    ValueError
        If ``val`` is neither an integer nor an integral float (this includes
        NaN and infinities), or if it is smaller than ``min_val``.
    """
    if val is None:
        return val

    msg = f"'{name:s}' must be an integer >={min_val:d}"

    if is_float(val):
        try:
            as_int = int(val)
        except (ValueError, OverflowError):
            # NaN and +/-inf have no integer representation; report them with
            # the same message as every other invalid value.
            raise ValueError(msg) from None
        if as_int != val:
            raise ValueError(msg)
        val = as_int
    elif not is_integer(val):
        raise ValueError(msg)

    # The lower bound applies to integral floats as well as integers.
    if val < min_val:
        raise ValueError(msg)

    return int(val)
def _validate_names(names: Sequence[Hashable] | None) -> None:
    """
    Validate the ``names`` argument.

    Nothing is checked when ``names`` is None. Otherwise the entries must be
    unique and the container must be an ordered, list-like collection (a
    dict keys view is accepted; a set is not).

    Parameters
    ----------
    names : array-like or None
        The column names intended for the output DataFrame.

    Raises
    ------
    ValueError
        If names are not unique or are not ordered (e.g. set).
    """
    if names is None:
        return

    # The duplicate check runs first, as before.
    n_total = len(names)
    n_unique = len(set(names))
    if n_total != n_unique:
        raise ValueError("Duplicate names are not allowed.")

    if not is_list_like(names, allow_sets=False) and not isinstance(
        names, abc.KeysView
    ):
        raise ValueError("Names should be an ordered collection.")
def _read(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str], kwds
) -> DataFrame | TextFileReader:
    """
    Generic reader of line files.

    Builds a ``TextFileReader`` from ``kwds``. The reader itself is returned
    when ``iterator`` or ``chunksize`` is set; otherwise ``nrows`` rows are
    read from it inside a ``with`` block and the result is returned.

    ``kwds["parse_dates"]`` is filled in here when it is missing or None.
    """
    # A date_parser / date_format combined with an unset parse_dates decides
    # whether dates get parsed at all (GH#44366).
    if kwds.get("parse_dates", None) is None:
        no_date_parser = kwds.get("date_parser", lib.no_default) is lib.no_default
        no_date_format = kwds.get("date_format", None) is None
        kwds["parse_dates"] = not (no_date_parser and no_date_format)

    # Only peek at these options; they stay in kwds for the parser.
    iterator = kwds.get("iterator", False)
    chunksize = kwds.get("chunksize", None)

    if kwds.get("engine") != "pyarrow":
        chunksize = validate_integer("chunksize", chunksize, 1)
    else:
        if iterator:
            raise ValueError(
                "The 'iterator' option is not supported with the 'pyarrow' engine"
            )
        if chunksize is not None:
            raise ValueError(
                "The 'chunksize' option is not supported with the 'pyarrow' engine"
            )

    nrows = kwds.get("nrows", None)

    # Reject duplicated or unordered column names before building the parser.
    _validate_names(kwds.get("names", None))

    parser = TextFileReader(filepath_or_buffer, **kwds)

    if not (chunksize or iterator):
        with parser:
            return parser.read(nrows)
    return parser
# iterator=True -> TextFileReader
# Typing-only overload: `iterator` is required here as Literal[True] and the
# declared return type is TextFileReader. Every parameter after
# `filepath_or_buffer` is keyword-only (note the bare `*`).
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    # The discriminating parameter of this overload: no default, must be True.
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    # NOTE(review): the parameter documentation in this module also lists
    # 'round_trip' as a float_precision option -- confirm whether the Literal
    # should include it (and keep all overloads in sync if so).
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...
# chunksize=int -> TextFileReader
# Typing-only overload: `chunksize` is required here as an int (no default),
# `iterator` may be any bool, and the declared return type is TextFileReader.
# Every parameter after `filepath_or_buffer` is keyword-only.
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    # The discriminating parameter of this overload: no default, must be an int.
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    # NOTE(review): the parameter documentation in this module also lists
    # 'round_trip' as a float_precision option -- confirm whether the Literal
    # should include it (and keep all overloads in sync if so).
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...
# default case -> DataFrame
# Typing-only overload: with ``iterator`` left as (or set to) ``False`` and
# ``chunksize`` left as (or set to) ``None``, the call is typed as returning a
# ``DataFrame``. The ``...`` defaults and body are placeholders only.
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    # Both iteration switches are pinned to their "off" values in this overload.
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...
# Unions -> DataFrame | TextFileReader
# Typing-only catch-all overload: when ``iterator`` is a plain ``bool`` and
# ``chunksize`` is ``int | None`` (i.e. not narrowed to a literal), the result
# is typed as the union of both possible return types.
@overload
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] | None = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: Literal["high", "legacy"] | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...
# Runtime implementation behind the ``read_csv`` overloads. The user-facing
# docstring is produced by ``@Appender`` from the shared
# ``_doc_read_csv_and_table`` template, so no docstring is written in the body.
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_csv",
        summary="Read a comma-separated values (csv) file into DataFrame.",
        _default_sep="','",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_csv(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] | None = None,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool = False,
    date_parser=lib.no_default,
    date_format: str | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool = False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: Literal["high", "legacy"] | None = None,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    # ``lib.no_default`` is the "not passed" sentinel: any explicit value,
    # including False, triggers the deprecation warning.
    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    # locals() should never be modified
    # Snapshot every parameter into a plain dict. No local name may be bound
    # before this line: anything assigned earlier would leak into ``kwds`` and
    # be forwarded to the parser as if it were a keyword argument.
    kwds = locals().copy()
    # The path/buffer is passed positionally to ``_read``; ``sep`` is handed to
    # ``_refine_defaults_read`` below rather than forwarded under its own name.
    del kwds["filepath_or_buffer"]
    del kwds["sep"]
    # Resolve the interacting options; "," is supplied as the default delimiter
    # for read_csv. The returned mapping overrides the snapshot entries.
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": ","},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)
    return _read(filepath_or_buffer, kwds)
# iterator=True -> TextFileReader
# Typing-only overload: ``iterator`` is a required ``Literal[True]`` here (no
# ``...`` default), so a call passing ``iterator=True`` is typed as returning a
# ``TextFileReader``. The ``...`` defaults and body are placeholders only.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    # Required literal ``True`` is what selects this overload.
    iterator: Literal[True],
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...
# chunksize=int -> TextFileReader
# Typing-only overload: ``chunksize`` is a required ``int`` here (no ``...``
# default), so a call passing an integer chunk size is typed as returning a
# ``TextFileReader``. The ``...`` defaults and body are placeholders only.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    # ``iterator`` may be anything; the required ``chunksize`` below is what
    # distinguishes this overload.
    iterator: bool = ...,
    chunksize: int,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> TextFileReader:
    ...
# default -> DataFrame
# Typing-only overload: with ``iterator`` left as (or set to) ``False`` and
# ``chunksize`` left as (or set to) ``None``, the call is typed as returning a
# ``DataFrame``. The ``...`` defaults and body are placeholders only.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    # Both iteration switches are pinned to their "off" values in this overload.
    iterator: Literal[False] = ...,
    chunksize: None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame:
    ...
# Unions -> DataFrame | TextFileReader
# Typing-only catch-all overload: when ``iterator`` is a plain ``bool`` and
# ``chunksize`` is ``int | None`` (i.e. not narrowed to a literal), the result
# is typed as the union of both possible return types.
@overload
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = ...,
    delimiter: str | None | lib.NoDefault = ...,
    header: int | Sequence[int] | None | Literal["infer"] = ...,
    names: Sequence[Hashable] | None | lib.NoDefault = ...,
    index_col: IndexLabel | Literal[False] | None = ...,
    usecols=...,
    dtype: DtypeArg | None = ...,
    engine: CSVEngine | None = ...,
    converters=...,
    true_values=...,
    false_values=...,
    skipinitialspace: bool = ...,
    skiprows=...,
    skipfooter: int = ...,
    nrows: int | None = ...,
    na_values=...,
    keep_default_na: bool = ...,
    na_filter: bool = ...,
    verbose: bool = ...,
    skip_blank_lines: bool = ...,
    parse_dates: bool | Sequence[Hashable] = ...,
    infer_datetime_format: bool | lib.NoDefault = ...,
    keep_date_col: bool = ...,
    date_parser=...,
    date_format: str | None = ...,
    dayfirst: bool = ...,
    cache_dates: bool = ...,
    iterator: bool = ...,
    chunksize: int | None = ...,
    compression: CompressionOptions = ...,
    thousands: str | None = ...,
    decimal: str = ...,
    lineterminator: str | None = ...,
    quotechar: str = ...,
    quoting: int = ...,
    doublequote: bool = ...,
    escapechar: str | None = ...,
    comment: str | None = ...,
    encoding: str | None = ...,
    encoding_errors: str | None = ...,
    dialect: str | csv.Dialect | None = ...,
    on_bad_lines=...,
    delim_whitespace: bool = ...,
    low_memory=...,
    memory_map: bool = ...,
    float_precision: str | None = ...,
    storage_options: StorageOptions = ...,
    dtype_backend: DtypeBackend | lib.NoDefault = ...,
) -> DataFrame | TextFileReader:
    ...
# Runtime implementation behind the ``read_table`` overloads. The user-facing
# docstring is produced by ``@Appender`` from the shared
# ``_doc_read_csv_and_table`` template, so no docstring is written in the body.
@Appender(
    _doc_read_csv_and_table.format(
        func_name="read_table",
        summary="Read general delimited file into DataFrame.",
        _default_sep=r"'\\t' (tab-stop)",
        storage_options=_shared_docs["storage_options"],
        decompression_options=_shared_docs["decompression_options"]
        % "filepath_or_buffer",
    )
)
def read_table(
    filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
    *,
    sep: str | None | lib.NoDefault = lib.no_default,
    delimiter: str | None | lib.NoDefault = None,
    # Column and Index Locations and Names
    header: int | Sequence[int] | None | Literal["infer"] = "infer",
    names: Sequence[Hashable] | None | lib.NoDefault = lib.no_default,
    index_col: IndexLabel | Literal[False] | None = None,
    usecols=None,
    # General Parsing Configuration
    dtype: DtypeArg | None = None,
    engine: CSVEngine | None = None,
    converters=None,
    true_values=None,
    false_values=None,
    skipinitialspace: bool = False,
    skiprows=None,
    skipfooter: int = 0,
    nrows: int | None = None,
    # NA and Missing Data Handling
    na_values=None,
    keep_default_na: bool = True,
    na_filter: bool = True,
    verbose: bool = False,
    skip_blank_lines: bool = True,
    # Datetime Handling
    parse_dates: bool | Sequence[Hashable] = False,
    infer_datetime_format: bool | lib.NoDefault = lib.no_default,
    keep_date_col: bool = False,
    date_parser=lib.no_default,
    date_format: str | None = None,
    dayfirst: bool = False,
    cache_dates: bool = True,
    # Iteration
    iterator: bool = False,
    chunksize: int | None = None,
    # Quoting, Compression, and File Format
    compression: CompressionOptions = "infer",
    thousands: str | None = None,
    decimal: str = ".",
    lineterminator: str | None = None,
    quotechar: str = '"',
    quoting: int = csv.QUOTE_MINIMAL,
    doublequote: bool = True,
    escapechar: str | None = None,
    comment: str | None = None,
    encoding: str | None = None,
    encoding_errors: str | None = "strict",
    dialect: str | csv.Dialect | None = None,
    # Error Handling
    on_bad_lines: str = "error",
    # Internal
    delim_whitespace: bool = False,
    low_memory=_c_parser_defaults["low_memory"],
    memory_map: bool = False,
    float_precision: str | None = None,
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame | TextFileReader:
    # ``lib.no_default`` is the "not passed" sentinel: any explicit value,
    # including False, triggers the deprecation warning.
    if infer_datetime_format is not lib.no_default:
        warnings.warn(
            "The argument 'infer_datetime_format' is deprecated and will "
            "be removed in a future version. "
            "A strict version of it is now the default, see "
            "https://pandas.pydata.org/pdeps/0004-consistent-to-datetime-parsing.html. "
            "You can safely remove this argument.",
            FutureWarning,
            stacklevel=find_stack_level(),
        )
    # locals() should never be modified
    # Snapshot every parameter into a plain dict. No local name may be bound
    # before this line: anything assigned earlier would leak into ``kwds`` and
    # be forwarded to the parser as if it were a keyword argument.
    kwds = locals().copy()
    # The path/buffer is passed positionally to ``_read``; ``sep`` is handed to
    # ``_refine_defaults_read`` below rather than forwarded under its own name.
    del kwds["filepath_or_buffer"]
    del kwds["sep"]
    # Resolve the interacting options; a tab is supplied as the default
    # delimiter for read_table. The returned mapping overrides the snapshot.
    kwds_defaults = _refine_defaults_read(
        dialect,
        delimiter,
        delim_whitespace,
        engine,
        sep,
        on_bad_lines,
        names,
        defaults={"delimiter": "\t"},
        dtype_backend=dtype_backend,
    )
    kwds.update(kwds_defaults)
    return _read(filepath_or_buffer, kwds)
- def read_fwf(
- filepath_or_buffer: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str],
- *,
- colspecs: Sequence[tuple[int, int]] | str | None = "infer",
- widths: Sequence[int] | None = None,
- infer_nrows: int = 100,
- dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
- **kwds,
- ) -> DataFrame | TextFileReader:
- r"""
- Read a table of fixed-width formatted lines into DataFrame.
- Also supports optionally iterating or breaking of the file
- into chunks.
- Additional help can be found in the `online docs for IO Tools
- <https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html>`_.
- Parameters
- ----------
- filepath_or_buffer : str, path object, or file-like object
- String, path object (implementing ``os.PathLike[str]``), or file-like
- object implementing a text ``read()`` function.The string could be a URL.
- Valid URL schemes include http, ftp, s3, and file. For file URLs, a host is
- expected. A local file could be:
- ``file://localhost/path/to/table.csv``.
- colspecs : list of tuple (int, int) or 'infer'. optional
- A list of tuples giving the extents of the fixed-width
- fields of each line as half-open intervals (i.e., [from, to[ ).
- String value 'infer' can be used to instruct the parser to try
- detecting the column specifications from the first 100 rows of
- the data which are not being skipped via skiprows (default='infer').
- widths : list of int, optional
- A list of field widths which can be used instead of 'colspecs' if
- the intervals are contiguous.
- infer_nrows : int, default 100
- The number of rows to consider when letting the parser determine the
- `colspecs`.
- dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
- Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
- arrays, nullable dtypes are used for all dtypes that have a nullable
- implementation when "numpy_nullable" is set, pyarrow is used for all
- dtypes if "pyarrow" is set.
- The dtype_backends are still experimential.
- .. versionadded:: 2.0
- **kwds : optional
- Optional keyword arguments can be passed to ``TextFileReader``.
- Returns
- -------
- DataFrame or TextFileReader
- A comma-separated values (csv) file is returned as two-dimensional
- data structure with labeled axes.
- See Also
- --------
- DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.
- read_csv : Read a comma-separated values (csv) file into DataFrame.
- Examples
- --------
- >>> pd.read_fwf('data.csv') # doctest: +SKIP
- """
- # Check input arguments.
- if colspecs is None and widths is None:
- raise ValueError("Must specify either colspecs or widths")
- if colspecs not in (None, "infer") and widths is not None:
- raise ValueError("You must specify only one of 'widths' and 'colspecs'")
- # Compute 'colspecs' from 'widths', if specified.
- if widths is not None:
- colspecs, col = [], 0
- for w in widths:
- colspecs.append((col, col + w))
- col += w
- # for mypy
- assert colspecs is not None
- # GH#40830
- # Ensure length of `colspecs` matches length of `names`
- names = kwds.get("names")
- if names is not None:
- if len(names) != len(colspecs) and colspecs != "infer":
- # need to check len(index_col) as it might contain
- # unnamed indices, in which case it's name is not required
- len_index = 0
- if kwds.get("index_col") is not None:
- index_col: Any = kwds.get("index_col")
- if index_col is not False:
- if not is_list_like(index_col):
- len_index = 1
- else:
- len_index = len(index_col)
- if kwds.get("usecols") is None and len(names) + len_index != len(colspecs):
- # If usecols is used colspec may be longer than names
- raise ValueError("Length of colspecs must match length of names")
- kwds["colspecs"] = colspecs
- kwds["infer_nrows"] = infer_nrows
- kwds["engine"] = "python-fwf"
- check_dtype_backend(dtype_backend)
- kwds["dtype_backend"] = dtype_backend
- return _read(filepath_or_buffer, kwds)
class TextFileReader(abc.Iterator):
    """
    Iterator over parsed chunks of a delimited or fixed-width text source.

    The reader resolves the parsing options, picks (and if necessary falls
    back to) a parser engine, opens the underlying handle and exposes the
    parsed data through :meth:`read`, :meth:`get_chunk` and the iterator
    protocol. It is also a context manager; leaving the ``with`` block closes
    the handles and the engine.

    Passed dialect overrides any of the related parser options
    """

    def __init__(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list,
        engine: CSVEngine | None = None,
        **kwds,
    ) -> None:
        """
        Resolve options, validate the input and build the parser engine.

        Parameters
        ----------
        f : path, buffer or list
            Source to parse. A ``list`` is only accepted by the "python" engine.
        engine : str, optional
            Requested engine. When omitted, "python" is used and later
            fallbacks only warn instead of raising.
        **kwds
            Parser options; missing ones are filled in from the defaults.
        """
        if engine is not None:
            engine_specified = True
        else:
            engine = "python"
            engine_specified = False
        self.engine = engine
        # Callers may state explicitly whether the user picked the engine.
        self._engine_specified = kwds.get("engine_specified", engine_specified)

        _validate_skipfooter(kwds)

        dialect = _extract_dialect(kwds)
        if dialect is not None:
            if engine == "pyarrow":
                raise ValueError(
                    "The 'dialect' option is not supported with the 'pyarrow' engine"
                )
            kwds = _merge_with_dialect_properties(dialect, kwds)

        # With explicit names there is no header row to consume by default.
        if kwds.get("header", "infer") == "infer":
            kwds["header"] = 0 if kwds.get("names") is None else None

        self.orig_options = kwds

        # miscellanea
        self._currow = 0

        options = self._get_options_with_defaults(engine)
        options["storage_options"] = kwds.get("storage_options", None)

        # chunksize/nrows are handled by this class, not by the engine.
        self.chunksize = options.pop("chunksize", None)
        self.nrows = options.pop("nrows", None)

        self._check_file_or_buffer(f, engine)
        self.options, self.engine = self._clean_options(options, engine)

        if "has_index_names" in kwds:
            self.options["has_index_names"] = kwds["has_index_names"]

        self.handles: IOHandles | None = None
        self._engine = self._make_engine(f, self.engine)

    def close(self) -> None:
        """Close the opened handles (if any) and the parser engine."""
        if self.handles is not None:
            self.handles.close()
        self._engine.close()

    def _get_options_with_defaults(self, engine: CSVEngine) -> dict[str, Any]:
        """
        Combine the user options with the parser defaults.

        Raises
        ------
        ValueError
            If an option set to a non-default value is not supported by the
            requested engine.
        """
        kwds = self.orig_options

        options = {}
        default: object | None

        for argname, default in parser_defaults.items():
            value = kwds.get(argname, default)

            # see gh-12935
            if (
                engine == "pyarrow"
                and argname in _pyarrow_unsupported
                and value != default
                and value != getattr(value, "value", default)
            ):
                raise ValueError(
                    f"The {repr(argname)} option is not supported with the "
                    f"'pyarrow' engine"
                )
            options[argname] = value

        for argname, default in _c_parser_defaults.items():
            if argname in kwds:
                value = kwds[argname]

                if engine != "c" and value != default:
                    if "python" in engine and argname not in _python_unsupported:
                        pass
                    else:
                        raise ValueError(
                            f"The {repr(argname)} option is not supported with the "
                            f"{repr(engine)} engine"
                        )
            else:
                value = default
            options[argname] = value

        if engine == "python-fwf":
            for argname, default in _fwf_defaults.items():
                options[argname] = kwds.get(argname, default)

        return options

    def _check_file_or_buffer(self, f, engine: CSVEngine) -> None:
        """Reject file-like objects a non-C engine cannot iterate over."""
        # see gh-16530
        if is_file_like(f) and engine != "c" and not hasattr(f, "__iter__"):
            # The C engine doesn't need the file-like to have the "__iter__"
            # attribute. However, the Python engine needs "__iter__(...)"
            # when iterating through such an object, meaning it
            # needs to have that attribute
            raise ValueError(
                "The 'python' engine cannot iterate through this file buffer."
            )

    def _clean_options(
        self, options: dict[str, Any], engine: CSVEngine
    ) -> tuple[dict[str, Any], CSVEngine]:
        """
        Normalise the options and decide which engine will actually be used.

        Returns
        -------
        tuple of (dict, str)
            A cleaned copy of ``options`` and the (possibly changed) engine.

        Raises
        ------
        ValueError
            If a fallback to the "python" engine is needed although the engine
            was explicitly requested, if the fallback would drop an option the
            "python" engine does not support, or if an option value is invalid.
        TypeError
            If ``converters`` is given but is not a dict.
        """
        result = options.copy()

        fallback_reason = None

        # C engine not supported yet
        if engine == "c":
            if options["skipfooter"] > 0:
                fallback_reason = "the 'c' engine does not support skipfooter"
                engine = "python"

        sep = options["delimiter"]
        delim_whitespace = options["delim_whitespace"]

        if sep is None and not delim_whitespace:
            if engine in ("c", "pyarrow"):
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "sep=None with delim_whitespace=False"
                )
                engine = "python"
        elif sep is not None and len(sep) > 1:
            if engine == "c" and sep == r"\s+":
                result["delim_whitespace"] = True
                del result["delimiter"]
            elif engine not in ("python", "python-fwf"):
                # wait until regex engine integrated
                fallback_reason = (
                    f"the '{engine}' engine does not support "
                    "regex separators (separators > 1 char and "
                    r"different from '\s+' are interpreted as regex)"
                )
                engine = "python"
        elif delim_whitespace:
            if "python" in engine:
                result["delimiter"] = r"\s+"
        elif sep is not None:
            encodeable = True
            encoding = sys.getfilesystemencoding() or "utf-8"
            try:
                if len(sep.encode(encoding)) > 1:
                    encodeable = False
            except UnicodeError:
                # str.encode raises UnicodeEncodeError (never
                # UnicodeDecodeError); catching the common base class makes
                # an unencodable separator take the fallback path instead of
                # leaking the raw codec error.
                encodeable = False
            if not encodeable and engine not in ("python", "python-fwf"):
                fallback_reason = (
                    f"the separator encoded in {encoding} "
                    f"is > 1 char long, and the '{engine}' engine "
                    "does not support such separators"
                )
                engine = "python"

        quotechar = options["quotechar"]
        if quotechar is not None and isinstance(quotechar, (str, bytes)):
            if (
                len(quotechar) == 1
                and ord(quotechar) > 127
                and engine not in ("python", "python-fwf")
            ):
                fallback_reason = (
                    "ord(quotechar) > 127, meaning the "
                    "quotechar is larger than one byte, "
                    f"and the '{engine}' engine does not support such quotechars"
                )
                engine = "python"

        # An explicitly requested engine is never silently replaced.
        if fallback_reason and self._engine_specified:
            raise ValueError(fallback_reason)

        if engine == "c":
            for arg in _c_unsupported:
                del result[arg]

        if "python" in engine:
            for arg in _python_unsupported:
                if fallback_reason and result[arg] != _c_parser_defaults[arg]:
                    raise ValueError(
                        "Falling back to the 'python' engine because "
                        f"{fallback_reason}, but this causes {repr(arg)} to be "
                        "ignored as it is not supported by the 'python' engine."
                    )
                del result[arg]

        if fallback_reason:
            warnings.warn(
                (
                    "Falling back to the 'python' engine because "
                    f"{fallback_reason}; you can avoid this warning by specifying "
                    "engine='python'."
                ),
                ParserWarning,
                stacklevel=find_stack_level(),
            )

        index_col = options["index_col"]
        names = options["names"]
        converters = options["converters"]
        na_values = options["na_values"]
        skiprows = options["skiprows"]

        validate_header_arg(options["header"])

        if index_col is True:
            raise ValueError("The value of index_col couldn't be 'True'")
        if is_index_col(index_col):
            if not isinstance(index_col, (list, tuple, np.ndarray)):
                index_col = [index_col]
        result["index_col"] = index_col

        names = list(names) if names is not None else names

        # type conversion-related
        if converters is not None:
            if not isinstance(converters, dict):
                raise TypeError(
                    "Type converters must be a dict or subclass, "
                    f"input was a {type(converters).__name__}"
                )
        else:
            converters = {}

        # Converting values to NA
        keep_default_na = options["keep_default_na"]
        na_values, na_fvalues = _clean_na_values(na_values, keep_default_na)

        # handle skiprows; this is internally handled by the
        # c-engine, so only need for python and pyarrow parsers
        if engine == "pyarrow":
            if not is_integer(skiprows) and skiprows is not None:
                # pyarrow expects skiprows to be passed as an integer
                raise ValueError(
                    "skiprows argument must be an integer when using "
                    "engine='pyarrow'"
                )
        else:
            if is_integer(skiprows):
                skiprows = list(range(skiprows))
            if skiprows is None:
                skiprows = set()
            elif not callable(skiprows):
                skiprows = set(skiprows)

        # put stuff back
        result["names"] = names
        result["converters"] = converters
        result["na_values"] = na_values
        result["na_fvalues"] = na_fvalues
        result["skiprows"] = skiprows

        return result, engine

    def __next__(self) -> DataFrame:
        """Return the next chunk; close the reader once it is exhausted."""
        try:
            return self.get_chunk()
        except StopIteration:
            self.close()
            raise

    def _make_engine(
        self,
        f: FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str] | list | IO,
        engine: CSVEngine = "c",
    ) -> ParserBase:
        """
        Open ``f`` (unless it is a list) and instantiate the parser for ``engine``.

        Raises
        ------
        ValueError
            If ``engine`` is unknown, or if ``f`` is a list and the engine is
            not "python".
        """
        mapping: dict[str, type[ParserBase]] = {
            "c": CParserWrapper,
            "python": PythonParser,
            "pyarrow": ArrowParserWrapper,
            "python-fwf": FixedWidthFieldParser,
        }
        if engine not in mapping:
            raise ValueError(
                f"Unknown engine: {engine} (valid options are {mapping.keys()})"
            )
        if not isinstance(f, list):
            # open file here
            is_text = True
            mode = "r"
            if engine == "pyarrow":
                is_text = False
                mode = "rb"
            elif (
                engine == "c"
                and self.options.get("encoding", "utf-8") == "utf-8"
                and isinstance(stringify_path(f), str)
            ):
                # c engine can decode utf-8 bytes, adding TextIOWrapper makes
                # the c-engine especially for memory_map=True far slower
                is_text = False
                if "b" not in mode:
                    mode += "b"
            self.handles = get_handle(
                f,
                mode,
                encoding=self.options.get("encoding", None),
                compression=self.options.get("compression", None),
                memory_map=self.options.get("memory_map", False),
                is_text=is_text,
                errors=self.options.get("encoding_errors", "strict"),
                storage_options=self.options.get("storage_options", None),
            )
            assert self.handles is not None
            f = self.handles.handle

        elif engine != "python":
            msg = f"Invalid file path or buffer object type: {type(f)}"
            raise ValueError(msg)

        try:
            return mapping[engine](f, **self.options)
        except Exception:
            # Do not leak the handle when the parser cannot be constructed.
            if self.handles is not None:
                self.handles.close()
            raise

    def _failover_to_python(self) -> None:
        raise AbstractMethodError(self)

    def read(self, nrows: int | None = None) -> DataFrame:
        """
        Parse up to ``nrows`` rows and return them as a DataFrame.

        With the "pyarrow" engine ``nrows`` is not forwarded and the engine's
        result is returned directly. For the other engines the row counter is
        advanced so that a default index continues across successive chunks.
        The reader is closed if the engine raises.
        """
        if self.engine == "pyarrow":
            try:
                # error: "ParserBase" has no attribute "read"
                df = self._engine.read()  # type: ignore[attr-defined]
            except Exception:
                self.close()
                raise
        else:
            nrows = validate_integer("nrows", nrows)
            try:
                # error: "ParserBase" has no attribute "read"
                (
                    index,
                    columns,
                    col_dict,
                ) = self._engine.read(  # type: ignore[attr-defined]
                    nrows
                )
            except Exception:
                self.close()
                raise

            if index is None:
                if col_dict:
                    # Any column is actually fine:
                    new_rows = len(next(iter(col_dict.values())))
                    index = RangeIndex(self._currow, self._currow + new_rows)
                else:
                    new_rows = 0
            else:
                new_rows = len(index)

            df = DataFrame(col_dict, columns=columns, index=index)

            self._currow += new_rows
        return df

    def get_chunk(self, size: int | None = None) -> DataFrame:
        """
        Read the next chunk of at most ``size`` rows.

        ``size`` defaults to the reader's ``chunksize``. When ``nrows`` was
        given, the chunk is capped so that no more than ``nrows`` rows are
        returned in total; if neither ``size`` nor ``chunksize`` is set, the
        remaining rows up to ``nrows`` are read.

        Raises
        ------
        StopIteration
            If ``nrows`` rows have already been read.
        """
        if size is None:
            size = self.chunksize
        if self.nrows is not None:
            remaining = self.nrows - self._currow
            if remaining <= 0:
                raise StopIteration
            # ``size`` may still be None (no chunksize configured); comparing
            # it with an int via min() would raise TypeError.
            size = remaining if size is None else min(size, remaining)
        return self.read(nrows=size)

    def __enter__(self) -> TextFileReader:
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> None:
        self.close()
def TextParser(*args, **kwds) -> TextFileReader:
    """
    Build a ``TextFileReader`` that always uses the "python" engine.

    Converts lists of lists/tuples into DataFrames with proper type inference
    and optional (e.g. string to datetime) conversion. Also enables iterating
    lazily over chunks of large files. Any ``engine`` passed by the caller is
    replaced with ``"python"``.

    Parameters
    ----------
    data : file-like object or list
    delimiter : separator character to use
    dialect : str or csv.Dialect instance, optional
        Ignored if delimiter is longer than 1 character
    names : sequence, optional
    header : int, default 0
        Row to use to parse column labels. Defaults to the first row. Prior
        rows will be discarded
    index_col : int or list, optional
        Column or columns to use as the (possibly hierarchical) index
    has_index_names: bool, default False
        True if the cols defined in index_col have an index name and are
        not in the header.
    na_values : scalar, str, list-like, or dict, optional
        Additional strings to recognize as NA/NaN.
    keep_default_na : bool, default True
    thousands : str, optional
        Thousands separator
    comment : str, optional
        Comment out remainder of line
    parse_dates : bool, default False
    keep_date_col : bool, default False
    date_parser : function, optional

        .. deprecated:: 2.0.0
    date_format : str or dict of column -> format, default ``None``

        .. versionadded:: 2.0.0
    skiprows : list of integers
        Row numbers to skip
    skipfooter : int
        Number of lines at the bottom of the file to skip
    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.
    encoding : str, optional
        Encoding to use for UTF when reading/writing (ex. 'utf-8')
    float_precision : str, optional
        Specifies which converter the C engine should use for floating-point
        values. The options are `None` or `high` for the ordinary converter,
        `legacy` for the original lower precision pandas converter, and
        `round_trip` for the round-trip converter.

        .. versionchanged:: 1.2

    Returns
    -------
    TextFileReader
    """
    # The explicit entry comes last so it wins over a caller-supplied engine.
    reader_kwds = {**kwds, "engine": "python"}
    return TextFileReader(*args, **reader_kwds)
def _clean_na_values(na_values, keep_default_na: bool = True):
    """
    Normalise user-supplied NA markers.

    Parameters
    ----------
    na_values : None, scalar, list-like or dict
        NA markers. A dict maps a key to the markers for that key.
    keep_default_na : bool, default True
        Whether ``STR_NA_VALUES`` is merged into the result.

    Returns
    -------
    tuple
        ``(na_values, na_fvalues)``. For dict input both are dicts with the
        same keys; otherwise both are sets. ``na_fvalues`` holds the values
        produced by ``_floatify_na_values`` (an empty set for ``None`` input).
    """
    na_fvalues: set | dict

    if na_values is None:
        na_values = STR_NA_VALUES if keep_default_na else set()
        na_fvalues = set()
        return na_values, na_fvalues

    if isinstance(na_values, dict):
        # Build a brand-new mapping so the caller's dict is never aliased.
        per_key = {}
        for key, markers in na_values.items():
            # Promote scalars to a list, then optionally add the defaults.
            if not is_list_like(markers):
                markers = [markers]
            if keep_default_na:
                markers = set(markers) | STR_NA_VALUES
            per_key[key] = markers
        na_fvalues = {
            key: _floatify_na_values(markers) for key, markers in per_key.items()
        }
        return per_key, na_fvalues

    if not is_list_like(na_values):
        na_values = [na_values]
    na_values = _stringify_na_values(na_values)
    if keep_default_na:
        na_values = na_values | STR_NA_VALUES
    na_fvalues = _floatify_na_values(na_values)
    return na_values, na_fvalues
def _floatify_na_values(na_values):
    """
    Return the set of float versions of the given NA markers.

    Markers that ``float()`` rejects (``TypeError``, ``ValueError`` or
    ``OverflowError``) and markers that convert to NaN are left out.
    """
    floats = set()
    for marker in na_values:
        try:
            as_float = float(marker)
            is_nan = np.isnan(as_float)
        except (TypeError, ValueError, OverflowError):
            continue
        if not is_nan:
            floats.add(as_float)
    return floats
def _stringify_na_values(na_values):
    """
    Return a set holding string and numeric spellings of each NA marker.

    For every marker the set receives ``str(marker)`` and the marker itself.
    If the marker converts to a float, its numeric value is added as well
    (as an ``int`` plus the strings ``"<int>.0"`` and ``"<int>"`` when the
    float is integral). If ``int(marker)`` succeeds, that is added too.
    """
    conversion_errors = (TypeError, ValueError, OverflowError)
    spellings: list[str | float] = []
    for marker in na_values:
        spellings.extend((str(marker), marker))
        try:
            as_float = float(marker)
            # int() raises for nan/inf, in which case nothing numeric is added
            whole = int(as_float)
            if as_float == whole:
                # we are like 999 here
                spellings.extend((f"{whole}.0", str(whole), whole))
            else:
                spellings.append(as_float)
        except conversion_errors:
            pass
        try:
            spellings.append(int(marker))
        except conversion_errors:
            pass
    return set(spellings)
def _refine_defaults_read(
    dialect: str | csv.Dialect | None,
    delimiter: str | None | lib.NoDefault,
    delim_whitespace: bool,
    engine: CSVEngine | None,
    sep: str | None | lib.NoDefault,
    on_bad_lines: str | Callable,
    names: Sequence[Hashable] | None | lib.NoDefault,
    defaults: dict[str, Any],
    dtype_backend: DtypeBackend | lib.NoDefault,
):
    """Validate/refine default values of input parameters of read_csv, read_table.

    Parameters
    ----------
    dialect : str or csv.Dialect
        If provided, this parameter will override values (default or not) for the
        following parameters: `delimiter`, `doublequote`, `escapechar`,
        `skipinitialspace`, `quotechar`, and `quoting`. If it is necessary to
        override values, a ParserWarning will be issued. See csv.Dialect
        documentation for more details.
    delimiter : str or object
        Alias for sep.
    delim_whitespace : bool
        Specifies whether or not whitespace (e.g. ``' '`` or ``'\t'``) will be
        used as the sep. Equivalent to setting ``sep='\\s+'``. If this option
        is set to True, nothing should be passed in for the ``delimiter``
        parameter.
    engine : {'c', 'python'}
        Parser engine to use. The C engine is faster while the python engine is
        currently more feature-complete.
    sep : str or object
        A delimiter provided by the user (str) or a sentinel value, i.e.
        pandas._libs.lib.no_default.
    on_bad_lines : str, callable
        ``'error'``, ``'warn'``, ``'skip'`` or a callable. A callable is only
        accepted together with ``engine='python'``.
    names : array-like, optional
        List of column names to use. If the file contains a header row,
        then you should explicitly pass ``header=0`` to override the column names.
        Duplicates in this list are not allowed.
    defaults: dict
        Default values of input parameters.
    dtype_backend : object
        Checked with ``check_dtype_backend`` and stored unchanged in the result.

    Returns
    -------
    kwds : dict
        Input parameters with correct values.

    Raises
    ------
    ValueError :
        If a delimiter was specified with ``sep`` (or ``delimiter``) and
        ``delim_whitespace=True``; if both ``sep`` and ``delimiter`` were
        given; if the delimiter is ``"\\n"``; or if ``on_bad_lines`` is not
        an accepted value.
    """
    # fix types for sep, delimiter to Union(str, Any)
    default_delimiter = defaults["delimiter"]
    refined: dict[str, Any] = {}

    # gh-23761
    #
    # When a dialect is passed, it overrides any of the overlapping
    # parameters passed in directly. We don't want to warn if the
    # default parameters were passed in (since it probably means
    # that the user didn't pass them in explicitly in the first place).
    #
    # "delimiter" is the annoying corner case because we alias it to
    # "sep" before doing comparison to the dialect values later on.
    # Thus, we need a flag to indicate that we need to "override"
    # the comparison to dialect values by checking if default values
    # for BOTH "delimiter" and "sep" were provided.
    if dialect is not None:
        refined["sep_override"] = delimiter is None and (
            sep is lib.no_default or sep == default_delimiter
        )

    if delimiter and (sep is not lib.no_default):
        raise ValueError("Specified a sep and a delimiter; you can only specify one.")

    refined["names"] = names if names is not lib.no_default else None

    # Alias sep -> delimiter.
    if delimiter is None:
        delimiter = sep

    if delim_whitespace and (delimiter is not lib.no_default):
        raise ValueError(
            "Specified a delimiter with both sep and "
            "delim_whitespace=True; you can only specify one."
        )

    if delimiter == "\n":
        raise ValueError(
            r"Specified \n as separator or delimiter. This forces the python engine "
            "which does not accept a line terminator. Hence it is not allowed to use "
            "the line terminator as separator.",
        )

    # Fall back to the default separator when none was given.
    refined["delimiter"] = (
        default_delimiter if delimiter is lib.no_default else delimiter
    )

    engine_specified = engine is not None
    if not engine_specified:
        refined["engine"] = "c"
    refined["engine_specified"] = engine_specified

    handle_methods = ParserBase.BadLineHandleMethod
    for keyword, method in (
        ("error", handle_methods.ERROR),
        ("warn", handle_methods.WARN),
        ("skip", handle_methods.SKIP),
    ):
        if on_bad_lines == keyword:
            refined["on_bad_lines"] = method
            break
    else:
        # Not one of the keywords: only a callable is acceptable now.
        if not callable(on_bad_lines):
            raise ValueError(f"Argument {on_bad_lines} is invalid for on_bad_lines")
        if engine != "python":
            raise ValueError(
                "on_bad_line can only be a callable function if engine='python'"
            )
        refined["on_bad_lines"] = on_bad_lines

    check_dtype_backend(dtype_backend)
    refined["dtype_backend"] = dtype_backend

    return refined
def _extract_dialect(kwds: dict[str, Any]) -> csv.Dialect | None:
    """
    Extract concrete csv dialect instance.

    A dialect given by a name registered with the ``csv`` module is resolved
    through ``csv.get_dialect``; the result is validated before it is
    returned.

    Returns
    -------
    csv.Dialect or None
        ``None`` when ``kwds`` has no ``"dialect"`` entry or it is ``None``.
    """
    dialect = kwds.get("dialect")
    if dialect is None:
        return None

    if dialect in csv.list_dialects():
        dialect = csv.get_dialect(dialect)

    _validate_dialect(dialect)
    return dialect
# Attributes every dialect must expose; they are also the parser options
# that a dialect overrides.
MANDATORY_DIALECT_ATTRS = (
    "delimiter",
    "doublequote",
    "escapechar",
    "skipinitialspace",
    "quotechar",
    "quoting",
)


def _validate_dialect(dialect: csv.Dialect) -> None:
    """
    Validate csv dialect instance.

    Raises
    ------
    ValueError
        If the dialect lacks any attribute named in
        ``MANDATORY_DIALECT_ATTRS``.
    """
    if not all(hasattr(dialect, attr) for attr in MANDATORY_DIALECT_ATTRS):
        raise ValueError(f"Invalid dialect {dialect} provided")
def _merge_with_dialect_properties(
    dialect: csv.Dialect,
    defaults: dict[str, Any],
) -> dict[str, Any]:
    """
    Merge default kwargs in TextFileReader with dialect parameters.

    Every attribute in ``MANDATORY_DIALECT_ATTRS`` is overwritten with the
    dialect's value. A ``ParserWarning`` is issued when a provided value
    differs from both the parser default and the dialect value.

    Parameters
    ----------
    dialect : csv.Dialect
        Concrete csv dialect. See csv.Dialect documentation for more details.
    defaults : dict
        Keyword arguments passed to TextFileReader.

    Returns
    -------
    kwds : dict
        Updated keyword arguments, merged with dialect parameters. This is a
        copy; ``defaults`` is not modified.
    """
    merged = defaults.copy()

    for param in MANDATORY_DIALECT_ATTRS:
        dialect_val = getattr(dialect, param)
        parser_default = parser_defaults[param]
        provided = merged.get(param, parser_default)

        # Don't warn if the default parameter was passed in,
        # even if it conflicts with the dialect (gh-23761).
        if provided not in (parser_default, dialect_val):
            # Annoying corner case for not warning about
            # conflicts between dialect and delimiter parameter.
            # Refer to the outer "_read_" function for more info.
            # The flag is consumed (popped) only when this case is reached.
            silenced = param == "delimiter" and merged.pop("sep_override", False)
            if not silenced:
                warnings.warn(
                    f"Conflicting values for '{param}': '{provided}' was "
                    f"provided, but the dialect specifies '{dialect_val}'. "
                    "Using the dialect-specified value.",
                    ParserWarning,
                    stacklevel=find_stack_level(),
                )

        merged[param] = dialect_val

    return merged
def _validate_skipfooter(kwds: dict[str, Any]) -> None:
    """
    Check whether skipfooter is compatible with other kwargs in TextFileReader.

    Nothing is checked when ``skipfooter`` is missing or falsy.

    Parameters
    ----------
    kwds : dict
        Keyword arguments passed to TextFileReader.

    Raises
    ------
    ValueError
        If a truthy ``skipfooter`` is combined with a truthy ``iterator``,
        ``chunksize`` or ``nrows``.
    """
    if not kwds.get("skipfooter"):
        return

    iterating = kwds.get("iterator") or kwds.get("chunksize")
    if iterating:
        raise ValueError("'skipfooter' not supported for iteration")
    if kwds.get("nrows"):
        raise ValueError("'skipfooter' not supported with 'nrows'")
|