- """
- :mod:`pandas.io.html` is a module containing functionality for dealing with
- HTML IO.
- """
- from __future__ import annotations
- from collections import abc
- import numbers
- import re
- from typing import (
- TYPE_CHECKING,
- Iterable,
- Literal,
- Pattern,
- Sequence,
- cast,
- )
- from pandas._libs import lib
- from pandas._typing import (
- BaseBuffer,
- DtypeBackend,
- FilePath,
- ReadBuffer,
- )
- from pandas.compat._optional import import_optional_dependency
- from pandas.errors import (
- AbstractMethodError,
- EmptyDataError,
- )
- from pandas.util._validators import check_dtype_backend
- from pandas.core.dtypes.common import is_list_like
- from pandas import isna
- from pandas.core.indexes.base import Index
- from pandas.core.indexes.multi import MultiIndex
- from pandas.core.series import Series
- from pandas.io.common import (
- file_exists,
- get_handle,
- is_url,
- stringify_path,
- urlopen,
- validate_header_arg,
- )
- from pandas.io.formats.printing import pprint_thing
- from pandas.io.parsers import TextParser
- if TYPE_CHECKING:
- from pandas import DataFrame


_IMPORTS = False
_HAS_BS4 = False
_HAS_LXML = False
_HAS_HTML5LIB = False


def _importers() -> None:
    # Import the optional parser backends lazily, on first use.
    global _IMPORTS

    if _IMPORTS:
        return

    global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB

    bs4 = import_optional_dependency("bs4", errors="ignore")
    _HAS_BS4 = bs4 is not None

    lxml = import_optional_dependency("lxml.etree", errors="ignore")
    _HAS_LXML = lxml is not None

    html5lib = import_optional_dependency("html5lib", errors="ignore")
    _HAS_HTML5LIB = html5lib is not None

    _IMPORTS = True


#############
# READ HTML #
#############
_RE_WHITESPACE = re.compile(r"[\r\n]+|\s{2,}")


def _remove_whitespace(s: str, regex: Pattern = _RE_WHITESPACE) -> str:
    """
    Replace extra whitespace inside of a string with a single space.

    Parameters
    ----------
    s : str or unicode
        The string from which to remove extra whitespace.
    regex : re.Pattern
        The regular expression to use to remove extra whitespace.

    Returns
    -------
    subd : str or unicode
        `s` with all extra whitespace replaced with a single space.
    """
    return regex.sub(" ", s.strip())
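
# Illustrative sketch (not part of the original module): newlines and runs of
# two or more whitespace characters collapse to a single space, e.g.
#
#     >>> _remove_whitespace("  foo\n\nbar   baz ")
#     'foo bar baz'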


def _get_skiprows(skiprows: int | Sequence[int] | slice | None) -> int | Sequence[int]:
    """
    Get an iterator given an integer, slice or container.

    Parameters
    ----------
    skiprows : int, slice, container
        The iterator to use to skip rows; can also be a slice.

    Raises
    ------
    TypeError
        * If `skiprows` is not a slice, integer, or Container

    Returns
    -------
    it : iterable
        A proper iterator to use to skip rows of a DataFrame.
    """
    if isinstance(skiprows, slice):
        start, step = skiprows.start or 0, skiprows.step or 1
        return list(range(start, skiprows.stop, step))
    elif isinstance(skiprows, numbers.Integral) or is_list_like(skiprows):
        return cast("int | Sequence[int]", skiprows)
    elif skiprows is None:
        return 0
    raise TypeError(f"{type(skiprows).__name__} is not a valid type for skipping rows")


def _read(obj: FilePath | BaseBuffer, encoding: str | None) -> str | bytes:
    """
    Try to read from a url, file or string.

    Parameters
    ----------
    obj : str, unicode, path object, or file-like object

    Returns
    -------
    raw_text : str
    """
    text: str | bytes
    if (
        is_url(obj)
        or hasattr(obj, "read")
        or (isinstance(obj, str) and file_exists(obj))
    ):
        with get_handle(obj, "r", encoding=encoding) as handles:
            text = handles.handle.read()
    elif isinstance(obj, (str, bytes)):
        text = obj
    else:
        raise TypeError(f"Cannot read object of type '{type(obj).__name__}'")
    return text


class _HtmlFrameParser:
    """
    Base class for parsers that parse HTML into DataFrames.

    Parameters
    ----------
    io : str or file-like
        This can be either a string of raw HTML, a valid URL using the HTTP,
        FTP, or FILE protocols or a file-like object.
    match : str or regex
        The text to match in the document.
    attrs : dict
        Dictionary of HTML <table> element attributes to match.
    encoding : str
        Encoding to be used by parser
    displayed_only : bool
        Whether or not items with "display:none" should be ignored
    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Attributes
    ----------
    io : str or file-like
        raw HTML, URL, or file-like object
    match : regex
        The text to match in the raw HTML
    attrs : dict-like
        A dictionary of valid table attributes to use to search for table
        elements.
    encoding : str
        Encoding to be used by parser
    displayed_only : bool
        Whether or not items with "display:none" should be ignored
    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`_build_doc`
        * :func:`_attr_getter`
        * :func:`_href_getter`
        * :func:`_text_getter`
        * :func:`_parse_td`
        * :func:`_parse_thead_tr`
        * :func:`_parse_tbody_tr`
        * :func:`_parse_tfoot_tr`
        * :func:`_parse_tables`
        * :func:`_equals_tag`

    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        io: FilePath | ReadBuffer[str] | ReadBuffer[bytes],
        match: str | Pattern,
        attrs: dict[str, str] | None,
        encoding: str,
        displayed_only: bool,
        extract_links: Literal[None, "header", "footer", "body", "all"],
    ) -> None:
        self.io = io
        self.match = match
        self.attrs = attrs
        self.encoding = encoding
        self.displayed_only = displayed_only
        self.extract_links = extract_links

    def parse_tables(self):
        """
        Parse and return all tables from the DOM.

        Returns
        -------
        generator of parsed (header, body, footer) tuples from tables.
        """
        tables = self._parse_tables(self._build_doc(), self.match, self.attrs)
        return (self._parse_thead_tbody_tfoot(table) for table in tables)

    def _attr_getter(self, obj, attr):
        """
        Return the attribute value of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.
        attr : str or unicode
            The attribute, such as "colspan"

        Returns
        -------
        str or unicode
            The attribute value.
        """
        # Both lxml and BeautifulSoup have the same implementation:
        return obj.get(attr)

    def _href_getter(self, obj):
        """
        Return the href from a child <a> of the DOM node, or None if there
        is no child <a>.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        href : str or unicode
            The href from the <a> child of the DOM node.
        """
        raise AbstractMethodError(self)

    def _text_getter(self, obj):
        """
        Return the text of an individual DOM node.

        Parameters
        ----------
        obj : node-like
            A DOM node.

        Returns
        -------
        text : str or unicode
            The text from an individual DOM node.
        """
        raise AbstractMethodError(self)

    def _parse_td(self, obj):
        """
        Return the td elements from a row element.

        Parameters
        ----------
        obj : node-like
            A DOM <tr> node.

        Returns
        -------
        list of node-like
            These are the elements of each row, i.e., the columns.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tr(self, table):
        """
        Return the list of thead row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains zero or more thead elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tbody_tr(self, table):
        """
        Return the list of tbody row elements from the parsed table element.

        HTML5 table bodies consist of either 0 or more <tbody> elements (which
        only contain <tr> elements) or 0 or more <tr> elements. This method
        checks for both structures.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tfoot_tr(self, table):
        """
        Return the list of tfoot row elements from the parsed table element.

        Parameters
        ----------
        table : a table element that contains row elements.

        Returns
        -------
        list of node-like
            These are the <tr> row elements of a table.
        """
        raise AbstractMethodError(self)

    def _parse_tables(self, doc, match, attrs):
        """
        Return all tables from the parsed DOM.

        Parameters
        ----------
        doc : the DOM from which to parse the table element.
        match : str or regular expression
            The text to search for in the DOM tree.
        attrs : dict
            A dictionary of table attributes that can be used to disambiguate
            multiple tables on a page.

        Raises
        ------
        ValueError : `match` does not match any text in the document.

        Returns
        -------
        list of node-like
            HTML <table> elements to be parsed into raw data.
        """
        raise AbstractMethodError(self)

    def _equals_tag(self, obj, tag):
        """
        Return whether an individual DOM node matches a tag.

        Parameters
        ----------
        obj : node-like
            A DOM node.
        tag : str
            Tag name to be checked for equality.

        Returns
        -------
        boolean
            Whether `obj`'s tag name is `tag`
        """
        raise AbstractMethodError(self)

    def _build_doc(self):
        """
        Return a tree-like object that can be used to iterate over the DOM.

        Returns
        -------
        node-like
            The DOM from which to parse the table element.
        """
        raise AbstractMethodError(self)

    def _parse_thead_tbody_tfoot(self, table_html):
        """
        Given a table, return parsed header, body, and foot.

        Parameters
        ----------
        table_html : node-like

        Returns
        -------
        tuple of (header, body, footer), each a list of list-of-text rows.

        Notes
        -----
        Header and body are lists-of-lists. Top level list is a list of
        rows. Each row is a list of str text.

        Logic: Use <thead>, <tbody>, <tfoot> elements to identify
               header, body, and footer, otherwise:
               - Put all rows into body
               - Move rows from top of body to header only if
                 all elements inside row are <th>
               - Move rows from bottom of body to footer only if
                 all elements inside row are <th>
        """
        header_rows = self._parse_thead_tr(table_html)
        body_rows = self._parse_tbody_tr(table_html)
        footer_rows = self._parse_tfoot_tr(table_html)

        def row_is_all_th(row):
            return all(self._equals_tag(t, "th") for t in self._parse_td(row))

        if not header_rows:
            # The table has no <thead>. Move the top all-<th> rows from
            # body_rows to header_rows. (This is a common case because many
            # tables in the wild have no <thead> or <tfoot>.)
            while body_rows and row_is_all_th(body_rows[0]):
                header_rows.append(body_rows.pop(0))

        header = self._expand_colspan_rowspan(header_rows, section="header")
        body = self._expand_colspan_rowspan(body_rows, section="body")
        footer = self._expand_colspan_rowspan(footer_rows, section="footer")

        return header, body, footer
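
    # Illustrative sketch (not part of the original module): for a table with
    # no <thead>, such as
    #
    #     <table>
    #       <tr><th>name</th><th>age</th></tr>
    #       <tr><td>ann</td><td>3</td></tr>
    #     </table>
    #
    # the first row is all-<th> and is promoted to the header, giving
    # header == [["name", "age"]] and body == [["ann", "3"]].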

    def _expand_colspan_rowspan(
        self, rows, section: Literal["header", "footer", "body"]
    ):
        """
        Given a list of <tr>s, return a list of text rows.

        Parameters
        ----------
        rows : list of node-like
            List of <tr>s
        section : the section that the rows belong to (header, body or footer).

        Returns
        -------
        list of list
            Each returned row is a list of str text, or tuple (text, link)
            if extract_links is not None.

        Notes
        -----
        Any cell with ``rowspan`` or ``colspan`` will have its contents copied
        to subsequent cells.
        """
        all_texts = []  # list of rows, each a list of str
        text: str | tuple
        remainder: list[
            tuple[int, str | tuple, int]
        ] = []  # list of (index, text, nrows)

        for tr in rows:
            texts = []  # the output for this row
            next_remainder = []

            index = 0
            tds = self._parse_td(tr)
            for td in tds:
                # Append texts from previous rows with rowspan>1 that come
                # before this <td>
                while remainder and remainder[0][0] <= index:
                    prev_i, prev_text, prev_rowspan = remainder.pop(0)
                    texts.append(prev_text)
                    if prev_rowspan > 1:
                        next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
                    index += 1

                # Append the text from this <td>, colspan times
                text = _remove_whitespace(self._text_getter(td))
                if self.extract_links in ("all", section):
                    href = self._href_getter(td)
                    text = (text, href)
                rowspan = int(self._attr_getter(td, "rowspan") or 1)
                colspan = int(self._attr_getter(td, "colspan") or 1)

                for _ in range(colspan):
                    texts.append(text)
                    if rowspan > 1:
                        next_remainder.append((index, text, rowspan - 1))
                    index += 1

            # Append texts from previous rows at the final position
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))

            all_texts.append(texts)
            remainder = next_remainder

        # Append rows that only appear because the previous row had non-1
        # rowspan
        while remainder:
            next_remainder = []
            texts = []
            for prev_i, prev_text, prev_rowspan in remainder:
                texts.append(prev_text)
                if prev_rowspan > 1:
                    next_remainder.append((prev_i, prev_text, prev_rowspan - 1))
            all_texts.append(texts)
            remainder = next_remainder

        return all_texts
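
    # Illustrative sketch (not part of the original module): a cell spanning
    # two rows is copied into the row below it, e.g.
    #
    #     <tr><td rowspan="2">A</td><td>B</td></tr>
    #     <tr><td>C</td></tr>
    #
    # expands to [["A", "B"], ["A", "C"]].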

    def _handle_hidden_tables(self, tbl_list, attr_name):
        """
        Return list of tables, potentially removing hidden elements.

        Parameters
        ----------
        tbl_list : list of node-like
            Type of list elements will vary depending upon parser used
        attr_name : str
            Name of the accessor for retrieving HTML attributes

        Returns
        -------
        list of node-like
            Return type matches `tbl_list`
        """
        if not self.displayed_only:
            return tbl_list

        return [
            x
            for x in tbl_list
            if "display:none"
            not in getattr(x, attr_name).get("style", "").replace(" ", "")
        ]
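
    # Illustrative note (not part of the original module): with
    # displayed_only=True, a table styled "display: none" is dropped here;
    # spaces are stripped from the style string before matching, so both
    # "display:none" and "display: none" are caught.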


class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses BeautifulSoup under the hood.

    See Also
    --------
    pandas.io.html._HtmlFrameParser
    pandas.io.html._LxmlFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`pandas.io.html._HtmlFrameParser`.
    """

    def __init__(self, *args, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        from bs4 import SoupStrainer

        self._strainer = SoupStrainer("table")

    def _parse_tables(self, doc, match, attrs):
        element_name = self._strainer.name
        tables = doc.find_all(element_name, attrs=attrs)

        if not tables:
            raise ValueError("No tables found")

        result = []
        unique_tables = set()
        tables = self._handle_hidden_tables(tables, "attrs")

        for table in tables:
            if self.displayed_only:
                for elem in table.find_all(style=re.compile(r"display:\s*none")):
                    elem.decompose()

            if table not in unique_tables and table.find(string=match) is not None:
                result.append(table)
            unique_tables.add(table)

        if not result:
            raise ValueError(f"No tables found matching pattern {repr(match.pattern)}")
        return result

    def _href_getter(self, obj) -> str | None:
        a = obj.find("a", href=True)
        return None if not a else a["href"]

    def _text_getter(self, obj):
        return obj.text

    def _equals_tag(self, obj, tag):
        return obj.name == tag

    def _parse_td(self, row):
        return row.find_all(("td", "th"), recursive=False)

    def _parse_thead_tr(self, table):
        return table.select("thead tr")

    def _parse_tbody_tr(self, table):
        from_tbody = table.select("tbody tr")
        from_root = table.find_all("tr", recursive=False)
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.select("tfoot tr")

    def _setup_build_doc(self):
        raw_text = _read(self.io, self.encoding)
        if not raw_text:
            raise ValueError(f"No text parsed from document: {self.io}")
        return raw_text

    def _build_doc(self):
        from bs4 import BeautifulSoup

        bdoc = self._setup_build_doc()
        if isinstance(bdoc, bytes) and self.encoding is not None:
            udoc = bdoc.decode(self.encoding)
            from_encoding = None
        else:
            udoc = bdoc
            from_encoding = self.encoding

        soup = BeautifulSoup(udoc, features="html5lib", from_encoding=from_encoding)

        for br in soup.find_all("br"):
            br.replace_with("\n" + br.text)

        return soup
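
    # Illustrative note (not part of the original module): the <br> replacement
    # above turns markup like <td>a<br>b</td> into the text "a\nb", which
    # _remove_whitespace later collapses to "a b" rather than "ab".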


def _build_xpath_expr(attrs) -> str:
    """
    Build an xpath expression to simulate bs4's ability to pass in kwargs to
    search for attributes when using the lxml parser.

    Parameters
    ----------
    attrs : dict
        A dict of HTML attributes. These are NOT checked for validity.

    Returns
    -------
    expr : unicode
        An XPath expression that checks for the given HTML attributes.
    """
    # give class attribute as class_ because class is a python keyword
    if "class_" in attrs:
        attrs["class"] = attrs.pop("class_")

    s = " and ".join([f"@{k}={repr(v)}" for k, v in attrs.items()])
    return f"[{s}]"


_re_namespace = {"re": "http://exslt.org/regular-expressions"}


class _LxmlFrameParser(_HtmlFrameParser):
    """
    HTML to DataFrame parser that uses lxml under the hood.

    Warning
    -------
    This parser can only handle HTTP, FTP, and FILE urls.

    See Also
    --------
    _HtmlFrameParser
    _BeautifulSoupHtml5LibFrameParser

    Notes
    -----
    Documentation strings for this class are in the base class
    :class:`_HtmlFrameParser`.
    """

    def _href_getter(self, obj) -> str | None:
        href = obj.xpath(".//a/@href")
        return None if not href else href[0]

    def _text_getter(self, obj):
        return obj.text_content()

    def _parse_td(self, row):
        # Look for direct children only: the "row" element here may be a
        # <thead> or <tfoot> (see _parse_thead_tr).
        return row.xpath("./td|./th")

    def _parse_tables(self, doc, match, kwargs):
        pattern = match.pattern

        # 1. check all descendants for the given pattern and only search tables
        # GH 49929
        xpath_expr = f"//table[.//text()[re:test(., {repr(pattern)})]]"

        # if any table attributes were given build an xpath expression to
        # search for them
        if kwargs:
            xpath_expr += _build_xpath_expr(kwargs)

        tables = doc.xpath(xpath_expr, namespaces=_re_namespace)

        tables = self._handle_hidden_tables(tables, "attrib")
        if self.displayed_only:
            for table in tables:
                # lxml utilizes XPATH 1.0 which does not have regex
                # support. As a result, we find all elements with a style
                # attribute and iterate them to check for display:none
                for elem in table.xpath(".//*[@style]"):
                    if "display:none" in elem.attrib.get("style", "").replace(" ", ""):
                        elem.getparent().remove(elem)

        if not tables:
            raise ValueError(f"No tables found matching regex {repr(pattern)}")
        return tables

    def _equals_tag(self, obj, tag):
        return obj.tag == tag

    def _build_doc(self):
        """
        Raises
        ------
        ValueError
            * If a URL that lxml cannot parse is passed.

        Exception
            * Any other ``Exception`` thrown. For example, trying to parse a
              URL that is syntactically correct on a machine with no internet
              connection will fail.

        See Also
        --------
        pandas.io.html._HtmlFrameParser._build_doc
        """
        from lxml.etree import XMLSyntaxError
        from lxml.html import (
            HTMLParser,
            fromstring,
            parse,
        )

        parser = HTMLParser(recover=True, encoding=self.encoding)

        try:
            if is_url(self.io):
                with urlopen(self.io) as f:
                    r = parse(f, parser=parser)
            else:
                # try to parse the input in the simplest way
                r = parse(self.io, parser=parser)
            try:
                r = r.getroot()
            except AttributeError:
                pass
        except (UnicodeDecodeError, OSError) as e:
            # if the input is a blob of html goop
            if not is_url(self.io):
                r = fromstring(self.io, parser=parser)

                try:
                    r = r.getroot()
                except AttributeError:
                    pass
            else:
                raise e
        else:
            if not hasattr(r, "text_content"):
                raise XMLSyntaxError("no text parsed from document", 0, 0, 0)

        for br in r.xpath("*//br"):
            br.tail = "\n" + (br.tail or "")

        return r

    def _parse_thead_tr(self, table):
        rows = []

        for thead in table.xpath(".//thead"):
            rows.extend(thead.xpath("./tr"))

            # HACK: lxml does not clean up the clearly-erroneous
            # <thead><th>foo</th><th>bar</th></thead>. (Missing <tr>). Add
            # the <thead> and _pretend_ it's a <tr>; _parse_td() will find its
            # children as though it's a <tr>.
            #
            # Better solution would be to use html5lib.
            elements_at_root = thead.xpath("./td|./th")
            if elements_at_root:
                rows.append(thead)

        return rows

    def _parse_tbody_tr(self, table):
        from_tbody = table.xpath(".//tbody//tr")
        from_root = table.xpath("./tr")
        # HTML spec: at most one of these lists has content
        return from_tbody + from_root

    def _parse_tfoot_tr(self, table):
        return table.xpath(".//tfoot//tr")


def _expand_elements(body) -> None:
    data = [len(elem) for elem in body]
    lens = Series(data)
    lens_max = lens.max()
    not_max = lens[lens != lens_max]

    empty = [""]
    for ind, length in not_max.items():
        body[ind] += empty * (lens_max - length)
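
# Illustrative sketch (not part of the original module): ragged rows are
# right-padded with empty strings to the length of the longest row, e.g.
#
#     body = [["a", "b", "c"], ["d"]]
#     _expand_elements(body)
#     # body == [["a", "b", "c"], ["d", "", ""]]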


def _data_to_frame(**kwargs):
    head, body, foot = kwargs.pop("data")
    header = kwargs.pop("header")
    kwargs["skiprows"] = _get_skiprows(kwargs["skiprows"])
    if head:
        body = head + body

        # Infer header when there is a <thead> or top <th>-only rows
        if header is None:
            if len(head) == 1:
                header = 0
            else:
                # ignore all-empty-text rows
                header = [i for i, row in enumerate(head) if any(text for text in row)]

    if foot:
        body += foot

    # fill out elements of body that are "ragged"
    _expand_elements(body)
    with TextParser(body, header=header, **kwargs) as tp:
        return tp.read()


_valid_parsers = {
    "lxml": _LxmlFrameParser,
    None: _LxmlFrameParser,
    "html5lib": _BeautifulSoupHtml5LibFrameParser,
    "bs4": _BeautifulSoupHtml5LibFrameParser,
}


def _parser_dispatch(flavor: str | None) -> type[_HtmlFrameParser]:
    """
    Choose the parser based on the input flavor.

    Parameters
    ----------
    flavor : str
        The type of parser to use. This must be a valid backend.

    Returns
    -------
    cls : _HtmlFrameParser subclass
        The parser class based on the requested input flavor.

    Raises
    ------
    ValueError
        * If `flavor` is not a valid backend.
    ImportError
        * If you do not have the requested `flavor`
    """
    valid_parsers = list(_valid_parsers.keys())
    if flavor not in valid_parsers:
        raise ValueError(
            f"{repr(flavor)} is not a valid flavor, valid flavors are {valid_parsers}"
        )

    if flavor in ("bs4", "html5lib"):
        if not _HAS_HTML5LIB:
            raise ImportError("html5lib not found, please install it")
        if not _HAS_BS4:
            raise ImportError("BeautifulSoup4 (bs4) not found, please install it")
        # Although we call this above, we want to raise here right before use.
        bs4 = import_optional_dependency("bs4")  # noqa:F841
    else:
        if not _HAS_LXML:
            raise ImportError("lxml not found, please install it")

    return _valid_parsers[flavor]


def _print_as_set(s) -> str:
    arg = ", ".join([pprint_thing(el) for el in s])
    return f"{{{arg}}}"


def _validate_flavor(flavor):
    if flavor is None:
        flavor = "lxml", "bs4"
    elif isinstance(flavor, str):
        flavor = (flavor,)
    elif isinstance(flavor, abc.Iterable):
        if not all(isinstance(flav, str) for flav in flavor):
            raise TypeError(
                f"Object of type {repr(type(flavor).__name__)} "
                f"is not an iterable of strings"
            )
    else:
        msg = repr(flavor) if isinstance(flavor, str) else str(flavor)
        msg += " is not a valid flavor"
        raise ValueError(msg)

    flavor = tuple(flavor)
    valid_flavors = set(_valid_parsers)
    flavor_set = set(flavor)

    if not flavor_set & valid_flavors:
        raise ValueError(
            f"{_print_as_set(flavor_set)} is not a valid set of flavors, valid "
            f"flavors are {_print_as_set(valid_flavors)}"
        )
    return flavor
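
# Illustrative sketch (not part of the original module): the default flavor
# expands to the lxml-first fallback order, e.g.
#
#     >>> _validate_flavor(None)
#     ('lxml', 'bs4')
#     >>> _validate_flavor("bs4")
#     ('bs4',)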


def _parse(
    flavor, io, match, attrs, encoding, displayed_only, extract_links, **kwargs
):
    flavor = _validate_flavor(flavor)
    compiled_match = re.compile(match)  # you can pass a compiled regex here

    retained = None
    for flav in flavor:
        parser = _parser_dispatch(flav)
        p = parser(io, compiled_match, attrs, encoding, displayed_only, extract_links)

        try:
            tables = p.parse_tables()
        except ValueError as caught:
            # if `io` is an io-like object, check if it's seekable
            # and try to rewind it before trying the next parser
            if hasattr(io, "seekable") and io.seekable():
                io.seek(0)
            elif hasattr(io, "seekable") and not io.seekable():
                # if we couldn't rewind it, let the user know
                raise ValueError(
                    f"The flavor {flav} failed to parse your input. "
                    "Since you passed a non-rewindable file "
                    "object, we can't rewind it to try "
                    "another parser. Try read_html() with a different flavor."
                ) from caught

            retained = caught
        else:
            break
    else:
        assert retained is not None  # for mypy
        raise retained

    ret = []
    for table in tables:
        try:
            df = _data_to_frame(data=table, **kwargs)
            # Cast MultiIndex header to an Index of tuples when extracting header
            # links and replace nan with None (therefore can't use mi.to_flat_index()).
            # This maintains consistency of selection (e.g. df.columns.str[1])
            if extract_links in ("all", "header") and isinstance(
                df.columns, MultiIndex
            ):
                df.columns = Index(
                    ((col[0], None if isna(col[1]) else col[1]) for col in df.columns),
                    tupleize_cols=False,
                )

            ret.append(df)
        except EmptyDataError:  # empty table
            continue
    return ret


def read_html(
    io: FilePath | ReadBuffer[str],
    *,
    match: str | Pattern = ".+",
    flavor: str | None = None,
    header: int | Sequence[int] | None = None,
    index_col: int | Sequence[int] | None = None,
    skiprows: int | Sequence[int] | slice | None = None,
    attrs: dict[str, str] | None = None,
    parse_dates: bool = False,
    thousands: str | None = ",",
    encoding: str | None = None,
    decimal: str = ".",
    converters: dict | None = None,
    na_values: Iterable[object] | None = None,
    keep_default_na: bool = True,
    displayed_only: bool = True,
    extract_links: Literal[None, "header", "footer", "body", "all"] = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> list[DataFrame]:
    r"""
    Read HTML tables into a ``list`` of ``DataFrame`` objects.

    Parameters
    ----------
    io : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a string ``read()`` function.
        The string can represent a URL or the HTML itself. Note that
        lxml only accepts the http, ftp and file url protocols. If you have a
        URL that starts with ``'https'`` you might try removing the ``'s'``.
    match : str or compiled regular expression, optional
        The set of tables containing text matching this regex or string will be
        returned. Unless the HTML is extremely simple you will probably need to
        pass a non-empty string here. Defaults to '.+' (match any non-empty
        string). The default value will return all tables contained on a page.
        This value is converted to a regular expression so that there is
        consistent behavior between Beautiful Soup and lxml.
    flavor : str, optional
        The parsing engine to use. 'bs4' and 'html5lib' are synonymous with
        each other, they are both there for backwards compatibility. The
        default of ``None`` tries to use ``lxml`` to parse and if that fails it
        falls back on ``bs4`` + ``html5lib``.
    header : int or list-like, optional
        The row (or list of rows for a :class:`~pandas.MultiIndex`) to use to
        make the columns headers.
    index_col : int or list-like, optional
        The column (or list of columns) to use to create the index.
    skiprows : int, list-like or slice, optional
        Number of rows to skip, 0-based. If a
        sequence of integers or a slice is given, will skip the rows indexed by
        that sequence. Note that a single element sequence means 'skip the nth
        row' whereas an integer means 'skip n rows'.
    attrs : dict, optional
        This is a dictionary of attributes that you can pass to use to identify
        the table in the HTML. These are not checked for validity before being
        passed to lxml or Beautiful Soup. However, these attributes must be
        valid HTML table attributes to work correctly. For example, ::

            attrs = {'id': 'table'}

        is a valid attribute dictionary because the 'id' HTML tag attribute is
        a valid HTML attribute for *any* HTML tag as per `this document
        <https://html.spec.whatwg.org/multipage/dom.html#global-attributes>`__. ::

            attrs = {'asdf': 'table'}

        is *not* a valid attribute dictionary because 'asdf' is not a valid
        HTML attribute even if it is a valid XML attribute. Valid HTML 4.01
        table attributes can be found `here
        <http://www.w3.org/TR/REC-html40/struct/tables.html#h-11.2>`__. A
        working draft of the HTML 5 spec can be found `here
        <https://html.spec.whatwg.org/multipage/tables.html>`__. It contains the
        latest information on table attributes for the modern web.
    parse_dates : bool, optional
        See :func:`~read_csv` for more details.
    thousands : str, optional
        Separator to use to parse thousands. Defaults to ``','``.
    encoding : str, optional
        The encoding used to decode the web page. Defaults to ``None``.
        ``None`` preserves the previous encoding behavior, which depends on the
        underlying parser library (e.g., the parser library will try to use
        the encoding provided by the document).
    decimal : str, default '.'
        Character to recognize as decimal point (e.g. use ',' for European
        data).
    converters : dict, default None
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels, values are functions that take one
        input argument, the cell (not column) content, and return the
        transformed content.
    na_values : iterable, default None
        Custom NA values.
    keep_default_na : bool, default True
        If na_values are specified and keep_default_na is False the default NaN
        values are overridden, otherwise they're appended to.
    displayed_only : bool, default True
        Whether elements with "display: none" should be parsed.
    extract_links : {None, "all", "header", "body", "footer"}
        Table elements in the specified section(s) with <a> tags will have their
        href extracted.

        .. versionadded:: 1.5.0

    dtype_backend : {"numpy_nullable", "pyarrow"}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays. Nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, and pyarrow is used for
        all dtypes if "pyarrow" is set.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    Returns
    -------
    dfs
        A list of DataFrames.

    See Also
    --------
    read_csv : Read a comma-separated values (csv) file into DataFrame.

    Notes
    -----
    Before using this function you should read the :ref:`gotchas about the
    HTML parsing libraries <io.html.gotchas>`.

    Expect to do some cleanup after you call this function. For example, you
    might need to manually assign column names if the column names are
    converted to NaN when you pass the `header=0` argument. We try to assume as
    little as possible about the structure of the table and push the
    idiosyncrasies of the HTML contained in the table to the user.

    This function searches for ``<table>`` elements and only for ``<tr>``
    and ``<th>`` rows and ``<td>`` elements within each ``<tr>`` or ``<th>``
    element in the table. ``<td>`` stands for "table data". This function
    attempts to properly handle ``colspan`` and ``rowspan`` attributes.
    If the table has a ``<thead>`` element, it is used to construct
    the header, otherwise the function attempts to find the header within
    the body (by putting rows with only ``<th>`` elements into the header).

    Similar to :func:`~read_csv` the `header` argument is applied
    **after** `skiprows` is applied.

    This function will *always* return a list of :class:`DataFrame` *or*
    it will fail, e.g., it will *not* return an empty list.

    Examples
    --------
    See the :ref:`read_html documentation in the IO section of the docs
    <io.read_html>` for some examples of reading in HTML tables.
    """
    _importers()

    # Type check here. We don't want to parse only to fail because of an
    # invalid value of an integer skiprows.
    if isinstance(skiprows, numbers.Integral) and skiprows < 0:
        raise ValueError(
            "cannot skip rows starting from the end of the "
            "data (you passed a negative value)"
        )
    if extract_links not in [None, "header", "footer", "body", "all"]:
        raise ValueError(
            "`extract_links` must be one of "
            '{None, "header", "footer", "body", "all"}, got '
            f'"{extract_links}"'
        )
    validate_header_arg(header)
    check_dtype_backend(dtype_backend)

    io = stringify_path(io)

    return _parse(
        flavor=flavor,
        io=io,
        match=match,
        header=header,
        index_col=index_col,
        skiprows=skiprows,
        parse_dates=parse_dates,
        thousands=thousands,
        attrs=attrs,
        encoding=encoding,
        decimal=decimal,
        converters=converters,
        na_values=na_values,
        keep_default_na=keep_default_na,
        displayed_only=displayed_only,
        extract_links=extract_links,
        dtype_backend=dtype_backend,
    )
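

# Illustrative usage sketch (not part of the original module): read_html
# accepts raw HTML directly, so a minimal table parses as
#
#     >>> html = '''<table>
#     ...   <tr><th>a</th><th>b</th></tr>
#     ...   <tr><td>1</td><td>2</td></tr>
#     ... </table>'''
#     >>> read_html(html)[0]
#        a  b
#     0  1  2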