xml.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555
  1. """
  2. :mod:`pandas.io.formats.xml` is a module for formatting data in XML.
  3. """
  4. from __future__ import annotations
  5. import codecs
  6. import io
  7. from typing import (
  8. TYPE_CHECKING,
  9. Any,
  10. )
  11. from pandas._typing import (
  12. CompressionOptions,
  13. FilePath,
  14. ReadBuffer,
  15. StorageOptions,
  16. WriteBuffer,
  17. )
  18. from pandas.errors import AbstractMethodError
  19. from pandas.util._decorators import doc
  20. from pandas.core.dtypes.common import is_list_like
  21. from pandas.core.dtypes.missing import isna
  22. from pandas.core.shared_docs import _shared_docs
  23. from pandas.io.common import get_handle
  24. from pandas.io.xml import (
  25. get_data_from_filepath,
  26. preprocess_data,
  27. )
  28. if TYPE_CHECKING:
  29. from pandas import DataFrame
  30. @doc(
  31. storage_options=_shared_docs["storage_options"],
  32. compression_options=_shared_docs["compression_options"] % "path_or_buffer",
  33. )
  34. class BaseXMLFormatter:
  35. """
  36. Subclass for formatting data in XML.
  37. Parameters
  38. ----------
  39. path_or_buffer : str or file-like
  40. This can be either a string of raw XML, a valid URL,
  41. file or file-like object.
  42. index : bool
  43. Whether to include index in xml document.
  44. row_name : str
  45. Name for root of xml document. Default is 'data'.
  46. root_name : str
  47. Name for row elements of xml document. Default is 'row'.
  48. na_rep : str
  49. Missing data representation.
  50. attrs_cols : list
  51. List of columns to write as attributes in row element.
  52. elem_cols : list
  53. List of columns to write as children in row element.
  54. namespaces : dict
  55. The namespaces to define in XML document as dicts with key
  56. being namespace and value the URI.
  57. prefix : str
  58. The prefix for each element in XML document including root.
  59. encoding : str
  60. Encoding of xml object or document.
  61. xml_declaration : bool
  62. Whether to include xml declaration at top line item in xml.
  63. pretty_print : bool
  64. Whether to write xml document with line breaks and indentation.
  65. stylesheet : str or file-like
  66. A URL, file, file-like object, or a raw string containing XSLT.
  67. {compression_options}
  68. .. versionchanged:: 1.4.0 Zstandard support.
  69. {storage_options}
  70. See also
  71. --------
  72. pandas.io.formats.xml.EtreeXMLFormatter
  73. pandas.io.formats.xml.LxmlXMLFormatter
  74. """
  75. def __init__(
  76. self,
  77. frame: DataFrame,
  78. path_or_buffer: FilePath | WriteBuffer[bytes] | WriteBuffer[str] | None = None,
  79. index: bool = True,
  80. root_name: str | None = "data",
  81. row_name: str | None = "row",
  82. na_rep: str | None = None,
  83. attr_cols: list[str] | None = None,
  84. elem_cols: list[str] | None = None,
  85. namespaces: dict[str | None, str] | None = None,
  86. prefix: str | None = None,
  87. encoding: str = "utf-8",
  88. xml_declaration: bool | None = True,
  89. pretty_print: bool | None = True,
  90. stylesheet: FilePath | ReadBuffer[str] | ReadBuffer[bytes] | None = None,
  91. compression: CompressionOptions = "infer",
  92. storage_options: StorageOptions = None,
  93. ) -> None:
  94. self.frame = frame
  95. self.path_or_buffer = path_or_buffer
  96. self.index = index
  97. self.root_name = root_name
  98. self.row_name = row_name
  99. self.na_rep = na_rep
  100. self.attr_cols = attr_cols
  101. self.elem_cols = elem_cols
  102. self.namespaces = namespaces
  103. self.prefix = prefix
  104. self.encoding = encoding
  105. self.xml_declaration = xml_declaration
  106. self.pretty_print = pretty_print
  107. self.stylesheet = stylesheet
  108. self.compression = compression
  109. self.storage_options = storage_options
  110. self.orig_cols = self.frame.columns.tolist()
  111. self.frame_dicts = self.process_dataframe()
  112. self.validate_columns()
  113. self.validate_encoding()
  114. self.prefix_uri = self.get_prefix_uri()
  115. self.handle_indexes()
  116. def build_tree(self) -> bytes:
  117. """
  118. Build tree from data.
  119. This method initializes the root and builds attributes and elements
  120. with optional namespaces.
  121. """
  122. raise AbstractMethodError(self)
  123. def validate_columns(self) -> None:
  124. """
  125. Validate elems_cols and attrs_cols.
  126. This method will check if columns is list-like.
  127. Raises
  128. ------
  129. ValueError
  130. * If value is not a list and less then length of nodes.
  131. """
  132. if self.attr_cols and not is_list_like(self.attr_cols):
  133. raise TypeError(
  134. f"{type(self.attr_cols).__name__} is not a valid type for attr_cols"
  135. )
  136. if self.elem_cols and not is_list_like(self.elem_cols):
  137. raise TypeError(
  138. f"{type(self.elem_cols).__name__} is not a valid type for elem_cols"
  139. )
  140. def validate_encoding(self) -> None:
  141. """
  142. Validate encoding.
  143. This method will check if encoding is among listed under codecs.
  144. Raises
  145. ------
  146. LookupError
  147. * If encoding is not available in codecs.
  148. """
  149. codecs.lookup(self.encoding)
  150. def process_dataframe(self) -> dict[int | str, dict[str, Any]]:
  151. """
  152. Adjust Data Frame to fit xml output.
  153. This method will adjust underlying data frame for xml output,
  154. including optionally replacing missing values and including indexes.
  155. """
  156. df = self.frame
  157. if self.index:
  158. df = df.reset_index()
  159. if self.na_rep is not None:
  160. df = df.fillna(self.na_rep)
  161. return df.to_dict(orient="index")
  162. def handle_indexes(self) -> None:
  163. """
  164. Handle indexes.
  165. This method will add indexes into attr_cols or elem_cols.
  166. """
  167. if not self.index:
  168. return
  169. first_key = next(iter(self.frame_dicts))
  170. indexes: list[str] = [
  171. x for x in self.frame_dicts[first_key].keys() if x not in self.orig_cols
  172. ]
  173. if self.attr_cols:
  174. self.attr_cols = indexes + self.attr_cols
  175. if self.elem_cols:
  176. self.elem_cols = indexes + self.elem_cols
  177. def get_prefix_uri(self) -> str:
  178. """
  179. Get uri of namespace prefix.
  180. This method retrieves corresponding URI to prefix in namespaces.
  181. Raises
  182. ------
  183. KeyError
  184. *If prefix is not included in namespace dict.
  185. """
  186. raise AbstractMethodError(self)
  187. def other_namespaces(self) -> dict:
  188. """
  189. Define other namespaces.
  190. This method will build dictionary of namespaces attributes
  191. for root element, conditionally with optional namespaces and
  192. prefix.
  193. """
  194. nmsp_dict: dict[str, str] = {}
  195. if self.namespaces and self.prefix is None:
  196. nmsp_dict = {"xmlns": n for p, n in self.namespaces.items() if p != ""}
  197. if self.namespaces and self.prefix:
  198. nmsp_dict = {"xmlns": n for p, n in self.namespaces.items() if p == ""}
  199. return nmsp_dict
  200. def build_attribs(self, d: dict[str, Any], elem_row: Any) -> Any:
  201. """
  202. Create attributes of row.
  203. This method adds attributes using attr_cols to row element and
  204. works with tuples for multindex or hierarchical columns.
  205. """
  206. if not self.attr_cols:
  207. return elem_row
  208. for col in self.attr_cols:
  209. attr_name = self._get_flat_col_name(col)
  210. try:
  211. if not isna(d[col]):
  212. elem_row.attrib[attr_name] = str(d[col])
  213. except KeyError:
  214. raise KeyError(f"no valid column, {col}")
  215. return elem_row
  216. def _get_flat_col_name(self, col: str | tuple) -> str:
  217. flat_col = col
  218. if isinstance(col, tuple):
  219. flat_col = (
  220. "".join([str(c) for c in col]).strip()
  221. if "" in col
  222. else "_".join([str(c) for c in col]).strip()
  223. )
  224. return f"{self.prefix_uri}{flat_col}"
  225. def build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
  226. """
  227. Create child elements of row.
  228. This method adds child elements using elem_cols to row element and
  229. works with tuples for multindex or hierarchical columns.
  230. """
  231. raise AbstractMethodError(self)
  232. def _build_elems(self, sub_element_cls, d: dict[str, Any], elem_row: Any) -> None:
  233. if not self.elem_cols:
  234. return
  235. for col in self.elem_cols:
  236. elem_name = self._get_flat_col_name(col)
  237. try:
  238. val = None if isna(d[col]) or d[col] == "" else str(d[col])
  239. sub_element_cls(elem_row, elem_name).text = val
  240. except KeyError:
  241. raise KeyError(f"no valid column, {col}")
  242. def write_output(self) -> str | None:
  243. xml_doc = self.build_tree()
  244. if self.path_or_buffer is not None:
  245. with get_handle(
  246. self.path_or_buffer,
  247. "wb",
  248. compression=self.compression,
  249. storage_options=self.storage_options,
  250. is_text=False,
  251. ) as handles:
  252. handles.handle.write(xml_doc)
  253. return None
  254. else:
  255. return xml_doc.decode(self.encoding).rstrip()
  256. class EtreeXMLFormatter(BaseXMLFormatter):
  257. """
  258. Class for formatting data in xml using Python standard library
  259. modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
  260. """
  261. def build_tree(self) -> bytes:
  262. from xml.etree.ElementTree import (
  263. Element,
  264. SubElement,
  265. tostring,
  266. )
  267. self.root = Element(
  268. f"{self.prefix_uri}{self.root_name}", attrib=self.other_namespaces()
  269. )
  270. for d in self.frame_dicts.values():
  271. elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
  272. if not self.attr_cols and not self.elem_cols:
  273. self.elem_cols = list(d.keys())
  274. self.build_elems(d, elem_row)
  275. else:
  276. elem_row = self.build_attribs(d, elem_row)
  277. self.build_elems(d, elem_row)
  278. self.out_xml = tostring(self.root, method="xml", encoding=self.encoding)
  279. if self.pretty_print:
  280. self.out_xml = self.prettify_tree()
  281. if self.xml_declaration:
  282. self.out_xml = self.add_declaration()
  283. else:
  284. self.out_xml = self.remove_declaration()
  285. if self.stylesheet is not None:
  286. raise ValueError(
  287. "To use stylesheet, you need lxml installed and selected as parser."
  288. )
  289. return self.out_xml
  290. def get_prefix_uri(self) -> str:
  291. from xml.etree.ElementTree import register_namespace
  292. uri = ""
  293. if self.namespaces:
  294. for p, n in self.namespaces.items():
  295. if isinstance(p, str) and isinstance(n, str):
  296. register_namespace(p, n)
  297. if self.prefix:
  298. try:
  299. uri = f"{{{self.namespaces[self.prefix]}}}"
  300. except KeyError:
  301. raise KeyError(f"{self.prefix} is not included in namespaces")
  302. else:
  303. uri = f'{{{self.namespaces[""]}}}'
  304. return uri
  305. def build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
  306. from xml.etree.ElementTree import SubElement
  307. self._build_elems(SubElement, d, elem_row)
  308. def prettify_tree(self) -> bytes:
  309. """
  310. Output tree for pretty print format.
  311. This method will pretty print xml with line breaks and indentation.
  312. """
  313. from xml.dom.minidom import parseString
  314. dom = parseString(self.out_xml)
  315. return dom.toprettyxml(indent=" ", encoding=self.encoding)
  316. def add_declaration(self) -> bytes:
  317. """
  318. Add xml declaration.
  319. This method will add xml declaration of working tree. Currently,
  320. xml_declaration is supported in etree starting in Python 3.8.
  321. """
  322. decl = f'<?xml version="1.0" encoding="{self.encoding}"?>\n'
  323. return (
  324. self.out_xml
  325. if self.out_xml.startswith(b"<?xml")
  326. else decl.encode(self.encoding) + self.out_xml
  327. )
  328. def remove_declaration(self) -> bytes:
  329. """
  330. Remove xml declaration.
  331. This method will remove xml declaration of working tree. Currently,
  332. pretty_print is not supported in etree.
  333. """
  334. return self.out_xml.split(b"?>")[-1].strip()
  335. class LxmlXMLFormatter(BaseXMLFormatter):
  336. """
  337. Class for formatting data in xml using Python standard library
  338. modules: `xml.etree.ElementTree` and `xml.dom.minidom`.
  339. """
  340. def __init__(self, *args, **kwargs) -> None:
  341. super().__init__(*args, **kwargs)
  342. self.convert_empty_str_key()
  343. def build_tree(self) -> bytes:
  344. """
  345. Build tree from data.
  346. This method initializes the root and builds attributes and elements
  347. with optional namespaces.
  348. """
  349. from lxml.etree import (
  350. Element,
  351. SubElement,
  352. tostring,
  353. )
  354. self.root = Element(f"{self.prefix_uri}{self.root_name}", nsmap=self.namespaces)
  355. for d in self.frame_dicts.values():
  356. elem_row = SubElement(self.root, f"{self.prefix_uri}{self.row_name}")
  357. if not self.attr_cols and not self.elem_cols:
  358. self.elem_cols = list(d.keys())
  359. self.build_elems(d, elem_row)
  360. else:
  361. elem_row = self.build_attribs(d, elem_row)
  362. self.build_elems(d, elem_row)
  363. self.out_xml = tostring(
  364. self.root,
  365. pretty_print=self.pretty_print,
  366. method="xml",
  367. encoding=self.encoding,
  368. xml_declaration=self.xml_declaration,
  369. )
  370. if self.stylesheet is not None:
  371. self.out_xml = self.transform_doc()
  372. return self.out_xml
  373. def convert_empty_str_key(self) -> None:
  374. """
  375. Replace zero-length string in `namespaces`.
  376. This method will replace '' with None to align to `lxml`
  377. requirement that empty string prefixes are not allowed.
  378. """
  379. if self.namespaces and "" in self.namespaces.keys():
  380. self.namespaces[None] = self.namespaces.pop("", "default")
  381. def get_prefix_uri(self) -> str:
  382. uri = ""
  383. if self.namespaces:
  384. if self.prefix:
  385. try:
  386. uri = f"{{{self.namespaces[self.prefix]}}}"
  387. except KeyError:
  388. raise KeyError(f"{self.prefix} is not included in namespaces")
  389. else:
  390. uri = f'{{{self.namespaces[""]}}}'
  391. return uri
  392. def build_elems(self, d: dict[str, Any], elem_row: Any) -> None:
  393. from lxml.etree import SubElement
  394. self._build_elems(SubElement, d, elem_row)
  395. def transform_doc(self) -> bytes:
  396. """
  397. Parse stylesheet from file or buffer and run it.
  398. This method will parse stylesheet object into tree for parsing
  399. conditionally by its specific object type, then transforms
  400. original tree with XSLT script.
  401. """
  402. from lxml.etree import (
  403. XSLT,
  404. XMLParser,
  405. fromstring,
  406. parse,
  407. )
  408. style_doc = self.stylesheet
  409. assert style_doc is not None # is ensured by caller
  410. handle_data = get_data_from_filepath(
  411. filepath_or_buffer=style_doc,
  412. encoding=self.encoding,
  413. compression=self.compression,
  414. storage_options=self.storage_options,
  415. )
  416. with preprocess_data(handle_data) as xml_data:
  417. curr_parser = XMLParser(encoding=self.encoding)
  418. if isinstance(xml_data, io.StringIO):
  419. xsl_doc = fromstring(
  420. xml_data.getvalue().encode(self.encoding), parser=curr_parser
  421. )
  422. else:
  423. xsl_doc = parse(xml_data, parser=curr_parser)
  424. transformer = XSLT(xsl_doc)
  425. new_doc = transformer(self.root)
  426. return bytes(new_doc)