12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135 |
- """
- :mod:`pandas.io.xml` is a module for reading XML.
- """
- from __future__ import annotations
- import io
- from typing import (
- Any,
- Callable,
- Sequence,
- )
- from pandas._libs import lib
- from pandas._typing import (
- TYPE_CHECKING,
- CompressionOptions,
- ConvertersArg,
- DtypeArg,
- DtypeBackend,
- FilePath,
- ParseDatesArg,
- ReadBuffer,
- StorageOptions,
- XMLParsers,
- )
- from pandas.compat._optional import import_optional_dependency
- from pandas.errors import (
- AbstractMethodError,
- ParserError,
- )
- from pandas.util._decorators import doc
- from pandas.util._validators import check_dtype_backend
- from pandas.core.dtypes.common import is_list_like
- from pandas.core.shared_docs import _shared_docs
- from pandas.io.common import (
- file_exists,
- get_handle,
- infer_compression,
- is_fsspec_url,
- is_url,
- stringify_path,
- )
- from pandas.io.parsers import TextParser
- if TYPE_CHECKING:
- from xml.etree.ElementTree import Element
- from lxml import etree
- from pandas import DataFrame
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
class _XMLFrameParser:
    """
    Internal base class to parse XML into DataFrames.

    Parameters
    ----------
    path_or_buffer : a valid XML str, path object or file-like object
        Any valid string path is acceptable. The string could be a URL. Valid
        URL schemes include http, ftp, s3, and file.

    xpath : str or regex
        The XPath expression to parse required set of nodes for
        migration to `Data Frame`. `etree` supports limited XPath.

    namespaces : dict
        The namespaces defined in XML document (`xmlns:namespace='URI'`)
        as dicts with key being namespace and value the URI.

    elems_only : bool
        Parse only the child elements at the specified `xpath`.

    attrs_only : bool
        Parse only the attributes at the specified `xpath`.

    names : list
        Column names for Data Frame of parsed XML data.

    dtype : dict
        Data type for data or columns. E.g. {{'a': np.float64,
        'b': np.int32, 'c': 'Int64'}}

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can
        either be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict
        Converts either index or select columns to datetimes

        .. versionadded:: 1.5.0

    encoding : str
        Encoding of xml object or document.

    stylesheet : str or file-like
        URL, file, file-like object, or a raw string containing XSLT,
        `etree` does not support XSLT but retained for consistency.

    iterparse : dict, optional
        Dict with row element as key and list of descendant elements
        and/or attributes as value to be retrieved in iterparsing of
        XML document.

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    See also
    --------
    pandas.io.xml._EtreeFrameParser
    pandas.io.xml._LxmlFrameParser

    Notes
    -----
    To subclass this class effectively you must override the following methods:
        * :func:`parse_data`
        * :func:`_parse_nodes`
        * :func:`_iterparse_nodes`
        * :func:`_parse_doc`
        * :func:`_validate_names`
        * :func:`_validate_path`


    See each method's respective documentation for details on their
    functionality.
    """

    def __init__(
        self,
        path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
        xpath: str,
        namespaces: dict[str, str] | None,
        elems_only: bool,
        attrs_only: bool,
        names: Sequence[str] | None,
        dtype: DtypeArg | None,
        converters: ConvertersArg | None,
        parse_dates: ParseDatesArg | None,
        encoding: str | None,
        stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
        iterparse: dict[str, list[str]] | None,
        compression: CompressionOptions,
        storage_options: StorageOptions,
    ) -> None:
        # Plain attribute storage; no validation or I/O happens here.
        self.path_or_buffer = path_or_buffer
        self.xpath = xpath
        self.namespaces = namespaces
        self.elems_only = elems_only
        self.attrs_only = attrs_only
        self.names = names
        self.dtype = dtype
        self.converters = converters
        self.parse_dates = parse_dates
        self.encoding = encoding
        self.stylesheet = stylesheet
        self.iterparse = iterparse
        self.is_style = None
        self.compression = compression
        self.storage_options = storage_options

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        This method will call the other internal methods to
        validate xpath, names, parse and return specific nodes.
        """

        raise AbstractMethodError(self)

    def _fill_and_rename(
        self, dicts: list[dict[str, str | None]]
    ) -> list[dict[str, str | None]]:
        """
        Align row dicts on a common key set and apply ``names`` if given.

        Every row receives every key seen in any row (first-seen order),
        with ``None`` for keys it lacks. When ``self.names`` is truthy the
        keys are then replaced positionally by ``self.names``.
        """
        # dict.fromkeys de-duplicates while keeping first-seen order
        keys = list(dict.fromkeys(k for d in dicts for k in d))
        dicts = [{k: d.get(k) for k in keys} for d in dicts]

        if self.names:
            dicts = [dict(zip(self.names, d.values())) for d in dicts]

        return dicts

    def _parse_nodes(self, elems: list[Any]) -> list[dict[str, str | None]]:
        """
        Parse xml nodes.

        This method will parse the children and attributes of elements
        in xpath, conditionally for only elements, only attributes
        or both while optionally renaming node names.

        Raises
        ------
        ValueError
            * If only elements and only attributes are specified.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes compared to siblings
        will have optional keys filled with None values.
        """

        if self.elems_only and self.attrs_only:
            raise ValueError("Either element or attributes can be parsed not both.")

        def own_text(el: Any) -> dict[str, str | None]:
            # Text sitting directly under the row element; whitespace-only
            # text (pretty-printing indentation) is ignored.
            if el.text and not el.text.isspace():
                return {el.tag: el.text.strip()}
            return {}

        def child_values(el: Any) -> dict[str, str | None]:
            # Direct children keyed by ``names`` (positionally) when given,
            # otherwise by their own tag.
            children = el.findall("*")
            if self.names:
                keyed = zip(self.names, children)
            else:
                keyed = ((ch.tag, ch) for ch in children)
            return {key: ch.text.strip() if ch.text else None for key, ch in keyed}

        dicts: list[dict[str, str | None]]

        if self.elems_only:
            if self.names:
                dicts = [{**own_text(el), **child_values(el)} for el in elems]
            else:
                # without names the row element's own text is not included
                dicts = [child_values(el) for el in elems]

        elif self.attrs_only:
            dicts = [
                {k: v.strip() if v else None for k, v in el.attrib.items()}
                for el in elems
            ]

        else:
            # attribute values are passed through as-is in this mode
            dicts = [
                {**el.attrib, **own_text(el), **child_values(el)} for el in elems
            ]

        # drop the "{namespace-uri}" prefix from qualified names
        dicts = [
            {k.split("}")[1] if "}" in k else k: v for k, v in d.items()} for d in dicts
        ]

        return self._fill_and_rename(dicts)

    def _iterparse_nodes(self, iterparse: Callable) -> list[dict[str, str | None]]:
        """
        Iterparse xml nodes.

        This method will read in local disk, decompressed XML files for elements
        and underlying descendants using iterparse, a method to iterate through
        an XML tree without holding entire XML tree in memory.

        Raises
        ------
        TypeError
            * If `iterparse` is not a dict or its dict value is not list-like.
        ValueError
            * If `iterparse` is an empty dict.
        ParserError
            * If `path_or_buffer` is not a physical file on disk or file-like object.
            * If no data is returned from selected items in `iterparse`.

        Notes
        -----
        Namespace URIs will be removed from return node values. Also,
        elements with missing children or attributes in submitted list
        will have optional keys filled with None values.
        """

        dicts: list[dict[str, str | None]] = []
        row: dict[str, str | None] | None = None

        if not isinstance(self.iterparse, dict):
            raise TypeError(
                f"{type(self.iterparse).__name__} is not a valid type for iterparse"
            )

        if not self.iterparse:
            # an empty dict has no row element to look up
            raise ValueError(
                "iterparse must be a non-empty dict with the row element as key "
                "and a list of descendant elements or attributes as value"
            )

        # only the first key is used as the repeating row element
        row_node = next(iter(self.iterparse))
        columns = self.iterparse[row_node]

        if not is_list_like(columns):
            raise TypeError(
                f"{type(columns)} is not a valid type for value in iterparse"
            )

        if (not hasattr(self.path_or_buffer, "read")) and (
            not isinstance(self.path_or_buffer, str)
            or is_url(self.path_or_buffer)
            or is_fsspec_url(self.path_or_buffer)
            or self.path_or_buffer.startswith(("<?xml", "<"))
            or infer_compression(self.path_or_buffer, "infer") is not None
        ):
            raise ParserError(
                "iterparse is designed for large XML files that are fully extracted on "
                "local disk and not as compressed files or online sources."
            )

        # True when the requested descendants contain duplicate names
        iterparse_repeats = len(columns) != len(set(columns))

        for event, elem in iterparse(self.path_or_buffer, events=("start", "end")):
            curr_elem = elem.tag.split("}")[1] if "}" in elem.tag else elem.tag

            if event == "start":
                if curr_elem == row_node:
                    row = {}

            if row is not None:
                if self.names and iterparse_repeats:
                    # duplicates are told apart by pairing each requested
                    # item with its positional name
                    for col, nm in zip(columns, self.names):
                        if curr_elem == col:
                            elem_val = elem.text.strip() if elem.text else None
                            if elem_val not in row.values() and nm not in row:
                                row[nm] = elem_val

                        if col in elem.attrib:
                            if elem.attrib[col] not in row.values() and nm not in row:
                                row[nm] = elem.attrib[col]
                else:
                    for col in columns:
                        if curr_elem == col:
                            row[col] = elem.text.strip() if elem.text else None
                        if col in elem.attrib:
                            row[col] = elem.attrib[col]

            if event == "end":
                if curr_elem == row_node and row is not None:
                    dicts.append(row)
                    row = None

                # release processed nodes so the tree does not accumulate
                elem.clear()
                if hasattr(elem, "getprevious"):
                    while (
                        elem.getprevious() is not None and elem.getparent() is not None
                    ):
                        del elem.getparent()[0]

        if not dicts:
            raise ParserError("No result from selected items in iterparse.")

        return self._fill_and_rename(dicts)

    def _validate_path(self) -> list[Any]:
        """
        Validate xpath.

        This method checks for syntax, evaluation, or empty nodes return.

        Raises
        ------
        SyntaxError
            * If xpath is not supported or issues with namespaces.

        ValueError
            * If xpath does not return any nodes.
        """

        raise AbstractMethodError(self)

    def _validate_names(self) -> None:
        """
        Validate names.

        This method will check if names is a list-like and aligns
        with length of parse nodes.

        Raises
        ------
        ValueError
            * If names has fewer entries than the parsed child nodes.
        TypeError
            * If names is not list-like.
        """
        raise AbstractMethodError(self)

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element | etree._Element:
        """
        Build tree from path_or_buffer.

        This method will parse XML object into tree
        either from string/bytes or file location.
        """
        raise AbstractMethodError(self)
class _EtreeFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with the Python
    standard library XML module: `xml.etree.ElementTree`.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data with the standard library `etree` parser.

        Raises
        ------
        ValueError
            * If a stylesheet was supplied (XSLT requires lxml).
        """
        from xml.etree.ElementTree import iterparse

        if self.stylesheet is not None:
            raise ValueError(
                "To use stylesheet, you need lxml installed and selected as parser."
            )

        if self.iterparse is None:
            self.xml_doc = self._parse_doc(self.path_or_buffer)
            elems = self._validate_path()

        self._validate_names()

        xml_dicts: list[dict[str, str | None]] = (
            self._parse_nodes(elems)
            if self.iterparse is None
            else self._iterparse_nodes(iterparse)
        )

        return xml_dicts

    def _validate_path(self) -> list[Any]:
        """
        Evaluate ``xpath`` against the parsed document and return the nodes.

        Raises
        ------
        SyntaxError
            * If the expression is unsupported by etree or uses an
              undeclared namespace prefix.
        ValueError
            * If the matched nodes carry no children/attributes to parse.

        Notes
        -----
        `etree` supports limited XPath. If user attempts a more complex
        expression syntax error will raise.
        """

        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )
        try:
            # findall always returns a list (possibly empty), never None
            elems = self.xml_doc.findall(self.xpath, namespaces=self.namespaces)
        except (KeyError, SyntaxError) as err:
            raise SyntaxError(
                "You have used an incorrect or unsupported XPath "
                "expression for etree library or you used an "
                "undeclared namespace prefix."
            ) from err

        children = [ch for el in elems for ch in el.findall("*")]
        attrs = {k: v for el in elems for k, v in el.attrib.items()}

        # An empty match leaves both collections empty and fails the last test.
        if (
            (self.elems_only and not children)
            or (self.attrs_only and not attrs)
            or (not children and not attrs)
        ):
            raise ValueError(msg)

        return elems

    def _validate_names(self) -> None:
        """
        Check that ``names`` is list-like and covers the parsed children.

        Raises
        ------
        ValueError
            * If names has fewer entries than the child nodes.
        TypeError
            * If names is not list-like.
        """
        children: list[Any]

        if self.names:
            if self.iterparse:
                children = self.iterparse[next(iter(self.iterparse))]
            else:
                parent = self.xml_doc.find(self.xpath, namespaces=self.namespaces)
                # Compare against None explicitly: the truth value of an
                # Element reflects its child count and is deprecated.
                children = parent.findall("*") if parent is not None else []

            if is_list_like(self.names):
                if len(self.names) < len(children):
                    raise ValueError(
                        "names does not match length of child elements in xpath."
                    )
            else:
                raise TypeError(
                    f"{type(self.names).__name__} is not a valid type for names"
                )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> Element:
        """
        Build an element tree from ``raw_doc`` and return its root element.
        """
        from xml.etree.ElementTree import (
            XMLParser,
            parse,
        )

        handle_data = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(handle_data) as xml_data:
            curr_parser = XMLParser(encoding=self.encoding)
            document = parse(xml_data, parser=curr_parser)

        return document.getroot()
class _LxmlFrameParser(_XMLFrameParser):
    """
    Internal class to parse XML into DataFrames with third-party
    full-featured XML library, `lxml`, that supports
    XPath 1.0 and XSLT 1.0.
    """

    def parse_data(self) -> list[dict[str, str | None]]:
        """
        Parse xml data.

        Validates xpath and names, optionally parses and runs the XSLT
        stylesheet, then parses the original or transformed XML and
        returns the selected nodes as row dicts.
        """
        from lxml.etree import iterparse

        # Streaming mode: no tree is built, so only names are validated.
        if self.iterparse is not None:
            self._validate_names()
            return self._iterparse_nodes(iterparse)

        self.xml_doc = self._parse_doc(self.path_or_buffer)

        if self.stylesheet:
            self.xsl_doc = self._parse_doc(self.stylesheet)
            self.xml_doc = self._transform_doc()

        nodes = self._validate_path()
        self._validate_names()

        return self._parse_nodes(nodes)

    def _validate_path(self) -> list[Any]:
        """
        Evaluate ``xpath`` against the document and return the matched nodes.

        Raises
        ------
        ValueError
            * If nothing is matched, or the matches carry no
              children/attributes to parse.
        """
        msg = (
            "xpath does not return any nodes or attributes. "
            "Be sure to specify in `xpath` the parent nodes of "
            "children and attributes to parse. "
            "If document uses namespaces denoted with "
            "xmlns, be sure to define namespaces and "
            "use them in xpath."
        )

        nodes = self.xml_doc.xpath(self.xpath, namespaces=self.namespaces)
        child_nodes = [ch for el in nodes for ch in el.xpath("*")]
        attributes = {k: v for el in nodes for k, v in el.attrib.items()}

        if nodes == []:
            raise ValueError(msg)

        no_children = child_nodes == []
        no_attrs = attributes == {}

        if self.elems_only and no_children:
            raise ValueError(msg)
        if self.attrs_only and no_attrs:
            raise ValueError(msg)
        if no_children and no_attrs:
            raise ValueError(msg)

        return nodes

    def _validate_names(self) -> None:
        """
        Check that ``names`` is list-like and covers the parsed children.

        Raises
        ------
        ValueError
            * If names has fewer entries than the child nodes.
        TypeError
            * If names is not list-like.
        """
        if not self.names:
            return

        expected: list[Any]
        if self.iterparse:
            first_key = next(iter(self.iterparse))
            expected = self.iterparse[first_key]
        else:
            # children of the first node matched by xpath
            expected = self.xml_doc.xpath(
                self.xpath + "[1]/*", namespaces=self.namespaces
            )

        if not is_list_like(self.names):
            raise TypeError(
                f"{type(self.names).__name__} is not a valid type for names"
            )

        if len(self.names) < len(expected):
            raise ValueError(
                "names does not match length of child elements in xpath."
            )

    def _parse_doc(
        self, raw_doc: FilePath | ReadBuffer[bytes] | ReadBuffer[str]
    ) -> etree._Element:
        """
        Build an lxml tree from ``raw_doc``.

        Text held in a ``StringIO`` is encoded with ``self.encoding`` and
        parsed with ``fromstring``; any other input is handed to ``parse``.

        Raises
        ------
        TypeError
            * If the input is a ``StringIO`` and ``encoding`` is None.
        """
        from lxml.etree import (
            XMLParser,
            fromstring,
            parse,
        )

        source = get_data_from_filepath(
            filepath_or_buffer=raw_doc,
            encoding=self.encoding,
            compression=self.compression,
            storage_options=self.storage_options,
        )

        with preprocess_data(source) as xml_data:
            lxml_parser = XMLParser(encoding=self.encoding)

            if not isinstance(xml_data, io.StringIO):
                tree = parse(xml_data, parser=lxml_parser)
            elif self.encoding is None:
                raise TypeError(
                    "Can not pass encoding None when input is StringIO."
                )
            else:
                raw_bytes = xml_data.getvalue().encode(self.encoding)
                tree = fromstring(raw_bytes, parser=lxml_parser)

        return tree

    def _transform_doc(self) -> etree._XSLTResultTree:
        """
        Transform original tree using stylesheet.

        This method will transform original xml using XSLT script into
        an ideally flatter xml document for easier parsing and migration
        to Data Frame.
        """
        from lxml.etree import XSLT

        apply_stylesheet = XSLT(self.xsl_doc)
        return apply_stylesheet(self.xml_doc)
def get_data_from_filepath(
    filepath_or_buffer: FilePath | bytes | ReadBuffer[bytes] | ReadBuffer[str],
    encoding: str | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
) -> str | bytes | ReadBuffer[bytes] | ReadBuffer[str]:
    """
    Extract raw XML data.

    The method accepts three input types:
        1. filepath (string-like)
        2. file-like object (e.g. open file object, StringIO)
        3. XML string or bytes

    For (1) - a URL, an fsspec URL or an existing file - the target is
    opened and its contents are read and returned. Inputs of type (2)
    and (3) are returned unchanged, as is any string that neither looks
    like XML nor resolves to a readable location.
    """
    if not isinstance(filepath_or_buffer, bytes):
        # path-like objects become plain strings; buffers pass through
        filepath_or_buffer = stringify_path(filepath_or_buffer)

    if (
        isinstance(filepath_or_buffer, str)
        # a leading "<" marks literal XML content, not a location
        and not filepath_or_buffer.startswith(("<?xml", "<"))
        and (
            is_url(filepath_or_buffer)
            or is_fsspec_url(filepath_or_buffer)
            or file_exists(filepath_or_buffer)
        )
    ):
        with get_handle(
            filepath_or_buffer,
            "r",
            encoding=encoding,
            compression=compression,
            storage_options=storage_options,
        ) as handle_obj:
            filepath_or_buffer = (
                handle_obj.handle.read()
                if hasattr(handle_obj.handle, "read")
                else handle_obj.handle
            )

    return filepath_or_buffer
def preprocess_data(data) -> io.StringIO | io.BytesIO:
    """
    Convert extracted raw data.

    Wraps an XML document given as ``str`` in a ``StringIO`` and one given
    as ``bytes`` in a ``BytesIO``. Anything else (e.g. a file object or an
    existing StringIO/BytesIO) is returned as received.
    """
    if isinstance(data, bytes):
        return io.BytesIO(data)

    if isinstance(data, str):
        return io.StringIO(data)

    return data
def _data_to_frame(data, **kwargs) -> DataFrame:
    """
    Convert parsed data to Data Frame.

    The keys of the first row dict supply the column names and the values
    of every row dict supply the records, which are fed to the built-in
    TextParser so that it builds the Data Frame and infers dtypes.

    Raises
    ------
    ParserError
        * If TextParser cannot read the extracted rows.
    """

    header = next(iter(data))
    records = [list(row.values()) for row in data]

    try:
        with TextParser(records, names=header, **kwargs) as reader:
            frame = reader.read()
    except ParserError:
        raise ParserError(
            "XML document may be too complex for import. "
            "Try to flatten document and use distinct "
            "element and attribute names."
        )

    return frame
def _parse(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    xpath: str,
    namespaces: dict[str, str] | None,
    elems_only: bool,
    attrs_only: bool,
    names: Sequence[str] | None,
    dtype: DtypeArg | None,
    converters: ConvertersArg | None,
    parse_dates: ParseDatesArg | None,
    encoding: str | None,
    parser: XMLParsers,
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None,
    iterparse: dict[str, list[str]] | None,
    compression: CompressionOptions,
    storage_options: StorageOptions,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
    **kwargs,
) -> DataFrame:
    """
    Call internal parsers.

    Picks ``_LxmlFrameParser`` or ``_EtreeFrameParser`` according to
    ``parser``, runs it, and converts the resulting row dicts into a
    DataFrame.

    Raises
    ------
    ImportError
        * If lxml is not installed if selected as parser.

    ValueError
        * If parser is not lxml or etree.
    """

    parser_cls: type[_EtreeFrameParser] | type[_LxmlFrameParser]

    if parser == "lxml":
        # errors="ignore" yields None instead of raising when lxml is absent
        if import_optional_dependency("lxml.etree", errors="ignore") is None:
            raise ImportError("lxml not found, please install or use the etree parser.")
        parser_cls = _LxmlFrameParser
    elif parser == "etree":
        parser_cls = _EtreeFrameParser
    else:
        raise ValueError("Values for parser can only be lxml or etree.")

    frame_parser = parser_cls(
        path_or_buffer,
        xpath,
        namespaces,
        elems_only,
        attrs_only,
        names,
        dtype,
        converters,
        parse_dates,
        encoding,
        stylesheet,
        iterparse,
        compression,
        storage_options,
    )

    rows = frame_parser.parse_data()

    return _data_to_frame(
        data=rows,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        dtype_backend=dtype_backend,
        **kwargs,
    )
@doc(
    storage_options=_shared_docs["storage_options"],
    decompression_options=_shared_docs["decompression_options"] % "path_or_buffer",
)
def read_xml(
    path_or_buffer: FilePath | ReadBuffer[bytes] | ReadBuffer[str],
    *,
    xpath: str = "./*",
    namespaces: dict[str, str] | None = None,
    elems_only: bool = False,
    attrs_only: bool = False,
    names: Sequence[str] | None = None,
    dtype: DtypeArg | None = None,
    converters: ConvertersArg | None = None,
    parse_dates: ParseDatesArg | None = None,
    # encoding can not be None for lxml and StringIO input
    encoding: str | None = "utf-8",
    parser: XMLParsers = "lxml",
    stylesheet: FilePath | ReadBuffer[bytes] | ReadBuffer[str] | None = None,
    iterparse: dict[str, list[str]] | None = None,
    compression: CompressionOptions = "infer",
    storage_options: StorageOptions = None,
    dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
) -> DataFrame:
    r"""
    Read XML document into a ``DataFrame`` object.

    .. versionadded:: 1.3.0

    Parameters
    ----------
    path_or_buffer : str, path object, or file-like object
        String, path object (implementing ``os.PathLike[str]``), or file-like
        object implementing a ``read()`` function. The string can be any valid XML
        string or a path. The string can further be a URL. Valid URL schemes
        include http, ftp, s3, and file.

    xpath : str, optional, default './\*'
        The XPath to parse required set of nodes for migration to DataFrame.
        XPath should return a collection of elements and not a single
        element. Note: The ``etree`` parser supports limited XPath
        expressions. For more complex XPath, use ``lxml`` which requires
        installation.

    namespaces : dict, optional
        The namespaces defined in XML document as dicts with key being
        namespace prefix and value the URI. There is no need to include all
        namespaces in XML, only the ones used in ``xpath`` expression.
        Note: if XML document uses default namespace denoted as
        `xmlns='<URI>'` without a prefix, you must assign any temporary
        namespace prefix such as 'doc' to the URI in order to parse
        underlying nodes and/or attributes. For example, ::

            namespaces = {{"doc": "https://example.com"}}

    elems_only : bool, optional, default False
        Parse only the child elements at the specified ``xpath``. By default,
        all child elements and non-empty text nodes are returned.

    attrs_only : bool, optional, default False
        Parse only the attributes at the specified ``xpath``.
        By default, all attributes are returned.

    names : list-like, optional
        Column names for DataFrame of parsed XML data. Use this parameter to
        rename original element names and distinguish same named elements and
        attributes.

    dtype : Type name or dict of column -> type, optional
        Data type for data or columns. E.g. {{'a': np.float64, 'b': np.int32,
        'c': 'Int64'}}
        Use `str` or `object` together with suitable `na_values` settings
        to preserve and not interpret dtype.
        If converters are specified, they will be applied INSTEAD
        of dtype conversion.

        .. versionadded:: 1.5.0

    converters : dict, optional
        Dict of functions for converting values in certain columns. Keys can either
        be integers or column labels.

        .. versionadded:: 1.5.0

    parse_dates : bool or list of int or names or list of lists or dict, default False
        Identifiers to parse index or columns to datetime. The behavior is as follows:

        * boolean. If True -> try parsing the index.
        * list of int or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3
          each as a separate date column.
        * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as
          a single date column.
        * dict, e.g. {{'foo' : [1, 3]}} -> parse columns 1, 3 as date and call
          result 'foo'

        .. versionadded:: 1.5.0

    encoding : str, optional, default 'utf-8'
        Encoding of XML document.

    parser : {{'lxml','etree'}}, default 'lxml'
        Parser module to use for retrieval of data. Only 'lxml' and
        'etree' are supported. With 'lxml' more complex XPath searches
        and ability to use XSLT stylesheet are supported.

    stylesheet : str, path object or file-like object
        A URL, file-like object, or a raw string containing an XSLT script.
        This stylesheet should flatten complex, deeply nested XML documents
        for easier parsing. To use this feature you must have ``lxml`` module
        installed and specify 'lxml' as ``parser``. The ``xpath`` must
        reference nodes of transformed XML document generated after XSLT
        transformation and not the original XML document. Only XSLT 1.0
        scripts and not later versions is currently supported.

    iterparse : dict, optional
        The nodes or attributes to retrieve in iterparsing of XML document
        as a dict with key being the name of repeating element and value being
        list of elements or attribute names that are descendants of the repeated
        element. Note: If this option is used, it will replace ``xpath`` parsing
        and unlike xpath, descendants do not need to relate to each other but can
        exist any where in document under the repeating element. This memory-
        efficient method should be used for very large XML files (500MB, 1GB, or 5GB+).
        For example, ::

            iterparse = {{"row_element": ["child_elem", "attr", "grandchild_elem"]}}

        .. versionadded:: 1.5.0

    {decompression_options}

        .. versionchanged:: 1.4.0 Zstandard support.

    {storage_options}

    dtype_backend : {{"numpy_nullable", "pyarrow"}}, defaults to NumPy backed DataFrames
        Which dtype_backend to use, e.g. whether a DataFrame should have NumPy
        arrays, nullable dtypes are used for all dtypes that have a nullable
        implementation when "numpy_nullable" is set, pyarrow is used for all
        dtypes if "pyarrow" is set.

        The dtype_backends are still experimental.

        .. versionadded:: 2.0

    Returns
    -------
    df
        A DataFrame.

    See Also
    --------
    read_json : Convert a JSON string to pandas object.
    read_html : Read HTML tables into a list of DataFrame objects.

    Notes
    -----
    This method is best designed to import shallow XML documents in
    following format which is the ideal fit for the two-dimensions of a
    ``DataFrame`` (row by column). ::

            <root>
                <row>
                  <column1>data</column1>
                  <column2>data</column2>
                  <column3>data</column3>
                  ...
               </row>
               <row>
                  ...
               </row>
               ...
            </root>

    As a file format, XML documents can be designed any way including
    layout of elements and attributes as long as it conforms to W3C
    specifications. Therefore, this method is a convenience handler for
    a specific flatter design and not all possible XML structures.

    However, for more complex XML documents, ``stylesheet`` allows you to
    temporarily redesign original document with XSLT (a special purpose
    language) for a flatter version for migration to a DataFrame.

    This function will *always* return a single :class:`DataFrame` or raise
    exceptions due to issues with XML document, ``xpath``, or other
    parameters.

    See the :ref:`read_xml documentation in the IO section of the docs
    <io.read_xml>` for more information in using this method to parse XML
    files to DataFrames.

    Examples
    --------
    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data xmlns="http://example.com">
    ...  <row>
    ...    <shape>square</shape>
    ...    <degrees>360</degrees>
    ...    <sides>4.0</sides>
    ...  </row>
    ...  <row>
    ...    <shape>circle</shape>
    ...    <degrees>360</degrees>
    ...    <sides/>
    ...  </row>
    ...  <row>
    ...    <shape>triangle</shape>
    ...    <degrees>180</degrees>
    ...    <sides>3.0</sides>
    ...  </row>
    ... </data>'''

    >>> df = pd.read_xml(xml)
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <data>
    ...   <row shape="square" degrees="360" sides="4.0"/>
    ...   <row shape="circle" degrees="360"/>
    ...   <row shape="triangle" degrees="180" sides="3.0"/>
    ... </data>'''

    >>> df = pd.read_xml(xml, xpath=".//row")
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0

    >>> xml = '''<?xml version='1.0' encoding='utf-8'?>
    ... <doc:data xmlns:doc="https://example.com">
    ...   <doc:row>
    ...     <doc:shape>square</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides>4.0</doc:sides>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>circle</doc:shape>
    ...     <doc:degrees>360</doc:degrees>
    ...     <doc:sides/>
    ...   </doc:row>
    ...   <doc:row>
    ...     <doc:shape>triangle</doc:shape>
    ...     <doc:degrees>180</doc:degrees>
    ...     <doc:sides>3.0</doc:sides>
    ...   </doc:row>
    ... </doc:data>'''

    >>> df = pd.read_xml(xml,
    ...                  xpath="//doc:row",
    ...                  namespaces={{"doc": "https://example.com"}})
    >>> df
          shape  degrees  sides
    0    square      360    4.0
    1    circle      360    NaN
    2  triangle      180    3.0
    """
    # Reject an unsupported dtype_backend before any parsing work starts.
    check_dtype_backend(dtype_backend)

    frame = _parse(
        path_or_buffer=path_or_buffer,
        xpath=xpath,
        namespaces=namespaces,
        elems_only=elems_only,
        attrs_only=attrs_only,
        names=names,
        dtype=dtype,
        converters=converters,
        parse_dates=parse_dates,
        encoding=encoding,
        parser=parser,
        stylesheet=stylesheet,
        iterparse=iterparse,
        compression=compression,
        storage_options=storage_options,
        dtype_backend=dtype_backend,
    )
    return frame
|