12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349 |
- from __future__ import annotations
- from io import (
- BytesIO,
- StringIO,
- )
- import os
- import numpy as np
- import pytest
- import pandas.util._test_decorators as td
- from pandas import (
- NA,
- DataFrame,
- Index,
- )
- import pandas._testing as tm
- from pandas.tests.io.test_compression import _compression_to_extension
- from pandas.io.common import get_handle
- from pandas.io.xml import read_xml
- # CHECKLIST
- # [x] - ValueError: "Values for parser can only be lxml or etree."
- # etree
- # [x] - ImportError: "lxml not found, please install or use the etree parser."
- # [X] - TypeError: "...is not a valid type for attr_cols"
- # [X] - TypeError: "...is not a valid type for elem_cols"
- # [X] - LookupError: "unknown encoding"
- # [X] - KeyError: "...is not included in namespaces"
- # [X] - KeyError: "no valid column"
- # [X] - ValueError: "To use stylesheet, you need lxml installed..."
- # [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.)
- # [X] - FileNotFoundError: "No such file or directory"
- # [X] - PermissionError: "Forbidden"
- # lxml
- # [X] - TypeError: "...is not a valid type for attr_cols"
- # [X] - TypeError: "...is not a valid type for elem_cols"
- # [X] - LookupError: "unknown encoding"
- # [] - OSError: (NEED PERMISSION ISSUE, DISK FULL, ETC.)
- # [X] - FileNotFoundError: "No such file or directory"
- # [X] - KeyError: "...is not included in namespaces"
- # [X] - KeyError: "no valid column"
- # [X] - ValueError: "stylesheet is not a url, file, or xml string."
- # [] - LookupError: (NEED WRONG ENCODING FOR FILE OUTPUT)
- # [] - URLError: (USUALLY DUE TO NETWORKING)
- # [] - HTTPError: (NEED AN ONLINE STYLESHEET)
- # [X] - OSError: "failed to load external entity"
- # [X] - XMLSyntaxError: "Opening and ending tag mismatch"
- # [X] - XSLTApplyError: "Cannot resolve URI"
- # [X] - XSLTParseError: "failed to compile"
- # [X] - PermissionError: "Forbidden"
# Three-row frame with one missing value ("circle" has no sides); shared by
# most tests in this module.
geom_df = DataFrame(
    dict(
        shape=["square", "circle", "triangle"],
        degrees=[360, 360, 180],
        sides=[4, np.nan, 3],
    )
)
# Eight-row frame used by the grouping / pivot tests (hierarchical columns
# and MultiIndex output).
planet_df = DataFrame(
    {
        "planet": (
            "Mercury Venus Earth Mars Jupiter Saturn Uranus Neptune".split()
        ),
        "type": ["terrestrial"] * 4 + ["gas giant"] * 2 + ["ice giant"] * 2,
        "location": ["inner"] * 4 + ["outer"] * 4,
        "mass": [
            0.330114,
            4.86747,
            5.97237,
            0.641712,
            1898.187,
            568.3174,
            86.8127,
            102.4126,
        ],
    }
)
# Expected default ``to_xml`` output for the frame parsed from books.xml
# (index written as the first child element of every row).
from_file_expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <index>0</index>
    <category>cooking</category>
    <title>Everyday Italian</title>
    <author>Giada De Laurentiis</author>
    <year>2005</year>
    <price>30.0</price>
  </row>
  <row>
    <index>1</index>
    <category>children</category>
    <title>Harry Potter</title>
    <author>J K. Rowling</author>
    <year>2005</year>
    <price>29.99</price>
  </row>
  <row>
    <index>2</index>
    <category>web</category>
    <title>Learning XML</title>
    <author>Erik T. Ray</author>
    <year>2003</year>
    <price>39.95</price>
  </row>
</data>"""
def equalize_decl(doc, encoding="utf-8"):
    """
    Normalize the XML declaration of ``doc`` to the single-quoted form.

    etree and lxml differ on quotes and case in the XML declaration, so the
    double-quoted declaration is rewritten to the single-quoted one before
    outputs are compared against an expected string.

    Parameters
    ----------
    doc : str or None
        Serialized XML document. ``None`` is passed through unchanged.
    encoding : str, default "utf-8"
        Encoding name exactly as it appears in the declaration to rewrite.

    Returns
    -------
    str or None
        ``doc`` with the declaration normalized, or ``None`` if ``doc`` is
        ``None``.
    """
    if doc is None:
        return None

    return doc.replace(
        f'<?xml version="1.0" encoding="{encoding}"?',
        f"<?xml version='1.0' encoding='{encoding}'?",
    )
@pytest.fixture(params=("rb", "r"))
def mode(request):
    """Parametrized file-open mode: binary (``"rb"``) then text (``"r"``)."""
    return request.param
@pytest.fixture(
    params=(
        pytest.param("lxml", marks=td.skip_if_no("lxml")),
        "etree",
    )
)
def parser(request):
    """Parametrized XML parser name; the lxml case is skipped without lxml."""
    return request.param
- # FILE OUTPUT
def test_file_output_str_read(datapath, parser):
    """
    Write books.xml back out with ``to_xml`` and read the file as text.

    The file is opened in text mode (explicit UTF-8) so that this test covers
    reading the output as ``str``; previously it read bytes and decoded them,
    making it an exact duplicate of ``test_file_output_bytes_read``.
    """
    filename = datapath("io", "data", "xml", "books.xml")
    df_file = read_xml(filename, parser=parser)

    with tm.ensure_clean("test.xml") as path:
        df_file.to_xml(path, parser=parser)
        with open(path, encoding="utf-8") as f:
            output = f.read().strip()

    output = equalize_decl(output)

    assert output == from_file_expected
def test_file_output_bytes_read(datapath, parser):
    """Write books.xml back out with ``to_xml`` and read the file as bytes."""
    books = read_xml(datapath("io", "data", "xml", "books.xml"), parser=parser)

    with tm.ensure_clean("test.xml") as path:
        books.to_xml(path, parser=parser)
        with open(path, "rb") as fh:
            raw = fh.read()

    result = equalize_decl(raw.decode("utf-8").strip())

    assert result == from_file_expected
def test_str_output(datapath, parser):
    """``to_xml`` without a path returns the document as a string."""
    books = read_xml(datapath("io", "data", "xml", "books.xml"), parser=parser)

    result = equalize_decl(books.to_xml(parser=parser))

    assert result == from_file_expected
def test_wrong_file_path(parser):
    """Writing into a directory that does not exist raises ``OSError``."""
    missing_dir_target = "/my/fake/path/output.xml"
    err_msg = r"Cannot save file into a non-existent directory: .*path"

    with pytest.raises(OSError, match=err_msg):
        geom_df.to_xml(missing_dir_target, parser=parser)
- # INDEX
def test_index_false(datapath, parser):
    """``index=False`` leaves the ``<index>`` element out of every row."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <category>cooking</category>
    <title>Everyday Italian</title>
    <author>Giada De Laurentiis</author>
    <year>2005</year>
    <price>30.0</price>
  </row>
  <row>
    <category>children</category>
    <title>Harry Potter</title>
    <author>J K. Rowling</author>
    <year>2005</year>
    <price>29.99</price>
  </row>
  <row>
    <category>web</category>
    <title>Learning XML</title>
    <author>Erik T. Ray</author>
    <year>2003</year>
    <price>39.95</price>
  </row>
</data>"""

    books = read_xml(datapath("io", "data", "xml", "books.xml"), parser=parser)

    with tm.ensure_clean("test.xml") as path:
        books.to_xml(path, index=False, parser=parser)
        with open(path, "rb") as fh:
            raw = fh.read()

    result = equalize_decl(raw.decode("utf-8").strip())

    assert result == expected
def test_index_false_rename_row_root(datapath, parser):
    """``root_name`` / ``row_name`` rename the root and row elements."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<books>
  <book>
    <category>cooking</category>
    <title>Everyday Italian</title>
    <author>Giada De Laurentiis</author>
    <year>2005</year>
    <price>30.0</price>
  </book>
  <book>
    <category>children</category>
    <title>Harry Potter</title>
    <author>J K. Rowling</author>
    <year>2005</year>
    <price>29.99</price>
  </book>
  <book>
    <category>web</category>
    <title>Learning XML</title>
    <author>Erik T. Ray</author>
    <year>2003</year>
    <price>39.95</price>
  </book>
</books>"""

    books = read_xml(datapath("io", "data", "xml", "books.xml"), parser=parser)
    write_kwargs = {
        "index": False,
        "root_name": "books",
        "row_name": "book",
        "parser": parser,
    }

    with tm.ensure_clean("test.xml") as path:
        books.to_xml(path, **write_kwargs)
        with open(path, "rb") as fh:
            raw = fh.read()

    result = equalize_decl(raw.decode("utf-8").strip())

    assert result == expected
@pytest.mark.parametrize("offset_index", [[10, 11, 12], ["10", "11", "12"]])
def test_index_false_with_offset_input_index(parser, offset_index):
    """
    Tests that the output does not contain the `<index>` field when the index of the
    input DataFrame has an offset.

    This is a regression test for issue #42458.
    """
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <shape>square</shape>
    <degrees>360</degrees>
    <sides>4.0</sides>
  </row>
  <row>
    <shape>circle</shape>
    <degrees>360</degrees>
    <sides/>
  </row>
  <row>
    <shape>triangle</shape>
    <degrees>180</degrees>
    <sides>3.0</sides>
  </row>
</data>"""

    # Work on a copy so the shared module-level frame keeps its index.
    shifted = geom_df.copy()
    shifted.index = Index(offset_index)

    result = equalize_decl(shifted.to_xml(index=False, parser=parser))

    assert result == expected
- # NA_REP
# Expected default output for ``geom_df``: the missing "sides" value is
# rendered as an empty element.
na_expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <index>0</index>
    <shape>square</shape>
    <degrees>360</degrees>
    <sides>4.0</sides>
  </row>
  <row>
    <index>1</index>
    <shape>circle</shape>
    <degrees>360</degrees>
    <sides/>
  </row>
  <row>
    <index>2</index>
    <shape>triangle</shape>
    <degrees>180</degrees>
    <sides>3.0</sides>
  </row>
</data>"""
def test_na_elem_output(parser):
    """By default a missing value is written as an empty element."""
    result = equalize_decl(geom_df.to_xml(parser=parser))

    assert result == na_expected
def test_na_empty_str_elem_option(parser):
    """``na_rep=""`` yields the same output as the default."""
    result = equalize_decl(geom_df.to_xml(na_rep="", parser=parser))

    assert result == na_expected
def test_na_empty_elem_option(parser):
    """``na_rep="0.0"`` is written in place of the missing value."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <index>0</index>
    <shape>square</shape>
    <degrees>360</degrees>
    <sides>4.0</sides>
  </row>
  <row>
    <index>1</index>
    <shape>circle</shape>
    <degrees>360</degrees>
    <sides>0.0</sides>
  </row>
  <row>
    <index>2</index>
    <shape>triangle</shape>
    <degrees>180</degrees>
    <sides>3.0</sides>
  </row>
</data>"""

    result = equalize_decl(geom_df.to_xml(na_rep="0.0", parser=parser))

    assert result == expected
- # ATTR_COLS
def test_attrs_cols_nan_output(parser):
    """With ``attr_cols``, a missing value produces no attribute at all."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row index="0" shape="square" degrees="360" sides="4.0"/>
  <row index="1" shape="circle" degrees="360"/>
  <row index="2" shape="triangle" degrees="180" sides="3.0"/>
</data>"""

    attr_names = ["shape", "degrees", "sides"]
    result = equalize_decl(geom_df.to_xml(attr_cols=attr_names, parser=parser))

    assert result == expected
def test_attrs_cols_prefix(parser):
    """A namespace prefix is applied to elements and attributes alike."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<doc:data xmlns:doc="http://example.xom">
  <doc:row doc:index="0" doc:shape="square" \
doc:degrees="360" doc:sides="4.0"/>
  <doc:row doc:index="1" doc:shape="circle" \
doc:degrees="360"/>
  <doc:row doc:index="2" doc:shape="triangle" \
doc:degrees="180" doc:sides="3.0"/>
</doc:data>"""

    write_kwargs = {
        "attr_cols": ["index", "shape", "degrees", "sides"],
        "namespaces": {"doc": "http://example.xom"},
        "prefix": "doc",
        "parser": parser,
    }
    result = equalize_decl(geom_df.to_xml(**write_kwargs))

    assert result == expected
def test_attrs_unknown_column(parser):
    """An ``attr_cols`` entry that is not a column raises ``KeyError``."""
    # "degree" is deliberately misspelled; the frame's column is "degrees".
    bad_cols = ["shape", "degree", "sides"]

    with pytest.raises(KeyError, match="no valid column"):
        geom_df.to_xml(attr_cols=bad_cols, parser=parser)
def test_attrs_wrong_type(parser):
    """Passing ``attr_cols`` as a plain string raises ``TypeError``."""
    not_a_list = '"shape", "degree", "sides"'

    with pytest.raises(TypeError, match="is not a valid type for attr_cols"):
        geom_df.to_xml(attr_cols=not_a_list, parser=parser)
- # ELEM_COLS
def test_elems_cols_nan_output(parser):
    """``elem_cols`` controls both which columns appear and their order."""
    elems_cols_expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <degrees>360</degrees>
    <sides>4.0</sides>
    <shape>square</shape>
  </row>
  <row>
    <degrees>360</degrees>
    <sides/>
    <shape>circle</shape>
  </row>
  <row>
    <degrees>180</degrees>
    <sides>3.0</sides>
    <shape>triangle</shape>
  </row>
</data>"""

    reordered = ["degrees", "sides", "shape"]
    result = equalize_decl(
        geom_df.to_xml(index=False, elem_cols=reordered, parser=parser)
    )

    assert result == elems_cols_expected
def test_elems_unknown_column(parser):
    """An ``elem_cols`` entry that is not a column raises ``KeyError``."""
    # "degree" is deliberately misspelled; the frame's column is "degrees".
    bad_cols = ["shape", "degree", "sides"]

    with pytest.raises(KeyError, match="no valid column"):
        geom_df.to_xml(elem_cols=bad_cols, parser=parser)
def test_elems_wrong_type(parser):
    """Passing ``elem_cols`` as a plain string raises ``TypeError``."""
    not_a_list = '"shape", "degree", "sides"'

    with pytest.raises(TypeError, match="is not a valid type for elem_cols"):
        geom_df.to_xml(elem_cols=not_a_list, parser=parser)
def test_elems_and_attrs_cols(parser):
    """``elem_cols`` and ``attr_cols`` can be combined in one document."""
    elems_cols_expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row shape="square">
    <degrees>360</degrees>
    <sides>4.0</sides>
  </row>
  <row shape="circle">
    <degrees>360</degrees>
    <sides/>
  </row>
  <row shape="triangle">
    <degrees>180</degrees>
    <sides>3.0</sides>
  </row>
</data>"""

    write_kwargs = {
        "index": False,
        "elem_cols": ["degrees", "sides"],
        "attr_cols": ["shape"],
        "parser": parser,
    }
    result = equalize_decl(geom_df.to_xml(**write_kwargs))

    assert result == elems_cols_expected
- # HIERARCHICAL COLUMNS
def test_hierarchical_columns(parser):
    """Hierarchical pivot-table columns are flattened to ``<agg>_<col>`` names."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <location>inner</location>
    <type>terrestrial</type>
    <count_mass>4</count_mass>
    <sum_mass>11.81</sum_mass>
    <mean_mass>2.95</mean_mass>
  </row>
  <row>
    <location>outer</location>
    <type>gas giant</type>
    <count_mass>2</count_mass>
    <sum_mass>2466.5</sum_mass>
    <mean_mass>1233.25</mean_mass>
  </row>
  <row>
    <location>outer</location>
    <type>ice giant</type>
    <count_mass>2</count_mass>
    <sum_mass>189.23</sum_mass>
    <mean_mass>94.61</mean_mass>
  </row>
  <row>
    <location>All</location>
    <type/>
    <count_mass>8</count_mass>
    <sum_mass>2667.54</sum_mass>
    <mean_mass>333.44</mean_mass>
  </row>
</data>"""

    pivot_kwargs = {
        "index": ["location", "type"],
        "values": "mass",
        "aggfunc": ["count", "sum", "mean"],
        "margins": True,
    }
    pvt = planet_df.pivot_table(**pivot_kwargs).round(2)

    result = equalize_decl(pvt.to_xml(parser=parser))

    assert result == expected
def test_hierarchical_attrs_columns(parser):
    """Hierarchical pivot-table columns can be written as attributes."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row location="inner" type="terrestrial" count_mass="4" \
sum_mass="11.81" mean_mass="2.95"/>
  <row location="outer" type="gas giant" count_mass="2" \
sum_mass="2466.5" mean_mass="1233.25"/>
  <row location="outer" type="ice giant" count_mass="2" \
sum_mass="189.23" mean_mass="94.61"/>
  <row location="All" type="" count_mass="8" \
sum_mass="2667.54" mean_mass="333.44"/>
</data>"""

    pivot_kwargs = {
        "index": ["location", "type"],
        "values": "mass",
        "aggfunc": ["count", "sum", "mean"],
        "margins": True,
    }
    pvt = planet_df.pivot_table(**pivot_kwargs).round(2)

    # Index levels plus every (hierarchical) data column become attributes.
    attr_names = list(pvt.reset_index().columns.values)
    result = equalize_decl(pvt.to_xml(attr_cols=attr_names, parser=parser))

    assert result == expected
- # MULTIINDEX
def test_multi_index(parser):
    """Each MultiIndex level is written as its own child element."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <location>inner</location>
    <type>terrestrial</type>
    <count>4</count>
    <sum>11.81</sum>
    <mean>2.95</mean>
  </row>
  <row>
    <location>outer</location>
    <type>gas giant</type>
    <count>2</count>
    <sum>2466.5</sum>
    <mean>1233.25</mean>
  </row>
  <row>
    <location>outer</location>
    <type>ice giant</type>
    <count>2</count>
    <sum>189.23</sum>
    <mean>94.61</mean>
  </row>
</data>"""

    mass_by_group = planet_df.groupby(["location", "type"])["mass"]
    agg = mass_by_group.agg(["count", "sum", "mean"]).round(2)

    result = equalize_decl(agg.to_xml(parser=parser))

    assert result == expected
def test_multi_index_attrs_cols(parser):
    """MultiIndex levels and data columns can all be written as attributes."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row location="inner" type="terrestrial" count="4" \
sum="11.81" mean="2.95"/>
  <row location="outer" type="gas giant" count="2" \
sum="2466.5" mean="1233.25"/>
  <row location="outer" type="ice giant" count="2" \
sum="189.23" mean="94.61"/>
</data>"""

    mass_by_group = planet_df.groupby(["location", "type"])["mass"]
    agg = mass_by_group.agg(["count", "sum", "mean"]).round(2)

    attr_names = list(agg.reset_index().columns.values)
    result = equalize_decl(agg.to_xml(attr_cols=attr_names, parser=parser))

    assert result == expected
- # NAMESPACE
def test_default_namespace(parser):
    """An empty-string namespace key becomes the default ``xmlns``."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<data xmlns="http://example.com">
  <row>
    <index>0</index>
    <shape>square</shape>
    <degrees>360</degrees>
    <sides>4.0</sides>
  </row>
  <row>
    <index>1</index>
    <shape>circle</shape>
    <degrees>360</degrees>
    <sides/>
  </row>
  <row>
    <index>2</index>
    <shape>triangle</shape>
    <degrees>180</degrees>
    <sides>3.0</sides>
  </row>
</data>"""

    default_ns = {"": "http://example.com"}
    result = equalize_decl(geom_df.to_xml(namespaces=default_ns, parser=parser))

    assert result == expected
- # PREFIX
def test_namespace_prefix(parser):
    """``prefix`` qualifies the root, row and child element names."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<doc:data xmlns:doc="http://example.com">
  <doc:row>
    <doc:index>0</doc:index>
    <doc:shape>square</doc:shape>
    <doc:degrees>360</doc:degrees>
    <doc:sides>4.0</doc:sides>
  </doc:row>
  <doc:row>
    <doc:index>1</doc:index>
    <doc:shape>circle</doc:shape>
    <doc:degrees>360</doc:degrees>
    <doc:sides/>
  </doc:row>
  <doc:row>
    <doc:index>2</doc:index>
    <doc:shape>triangle</doc:shape>
    <doc:degrees>180</doc:degrees>
    <doc:sides>3.0</doc:sides>
  </doc:row>
</doc:data>"""

    prefixed_ns = {"doc": "http://example.com"}
    result = equalize_decl(
        geom_df.to_xml(namespaces=prefixed_ns, prefix="doc", parser=parser)
    )

    assert result == expected
def test_missing_prefix_in_nmsp(parser):
    """A ``prefix`` that is absent from ``namespaces`` raises ``KeyError``."""
    only_default_ns = {"": "http://example.com"}

    with pytest.raises(KeyError, match="doc is not included in namespaces"):
        geom_df.to_xml(namespaces=only_default_ns, prefix="doc", parser=parser)
def test_namespace_prefix_and_default(parser):
    """A default namespace and a prefixed namespace can be declared together."""
    expected = """\
<?xml version='1.0' encoding='utf-8'?>
<doc:data xmlns="http://example.com" xmlns:doc="http://other.org">
  <doc:row>
    <doc:index>0</doc:index>
    <doc:shape>square</doc:shape>
    <doc:degrees>360</doc:degrees>
    <doc:sides>4.0</doc:sides>
  </doc:row>
  <doc:row>
    <doc:index>1</doc:index>
    <doc:shape>circle</doc:shape>
    <doc:degrees>360</doc:degrees>
    <doc:sides/>
  </doc:row>
  <doc:row>
    <doc:index>2</doc:index>
    <doc:shape>triangle</doc:shape>
    <doc:degrees>180</doc:degrees>
    <doc:sides>3.0</doc:sides>
  </doc:row>
</doc:data>"""

    both_ns = {"": "http://example.com", "doc": "http://other.org"}
    result = equalize_decl(
        geom_df.to_xml(namespaces=both_ns, prefix="doc", parser=parser)
    )

    if result is not None:
        # etree and lxml differ on the order of the namespace declarations,
        # so rewrite the prefixed-first form to the default-first form.
        prefixed_first = 'xmlns:doc="http://other.org" xmlns="http://example.com"'
        default_first = 'xmlns="http://example.com" xmlns:doc="http://other.org"'
        result = result.replace(prefixed_first, default_first)

    assert result == expected
- # ENCODING
# Expected output for the first five rows of baby_names.xml when written
# with ``encoding="ISO-8859-1"``; contains non-ASCII characters.
encoding_expected = """\
<?xml version='1.0' encoding='ISO-8859-1'?>
<data>
  <row>
    <index>0</index>
    <rank>1</rank>
    <malename>José</malename>
    <femalename>Sofía</femalename>
  </row>
  <row>
    <index>1</index>
    <rank>2</rank>
    <malename>Luis</malename>
    <femalename>Valentina</femalename>
  </row>
  <row>
    <index>2</index>
    <rank>3</rank>
    <malename>Carlos</malename>
    <femalename>Isabella</femalename>
  </row>
  <row>
    <index>3</index>
    <rank>4</rank>
    <malename>Juan</malename>
    <femalename>Camila</femalename>
  </row>
  <row>
    <index>4</index>
    <rank>5</rank>
    <malename>Jorge</malename>
    <femalename>Valeria</femalename>
  </row>
</data>"""
def test_encoding_option_str(datapath, parser):
    """String output honours a non-default ``encoding`` in the declaration."""
    baby_names = read_xml(
        datapath("io", "data", "xml", "baby_names.xml"),
        parser=parser,
        encoding="ISO-8859-1",
    ).head(5)

    result = baby_names.to_xml(encoding="ISO-8859-1", parser=parser)

    if result is not None:
        # etree and lxml differ on quotes and case in xml declaration
        double_quoted = '<?xml version="1.0" encoding="ISO-8859-1"?'
        single_quoted = "<?xml version='1.0' encoding='ISO-8859-1'?"
        result = result.replace(double_quoted, single_quoted)

    assert result == encoding_expected
@td.skip_if_no("lxml")
def test_correct_encoding_file(datapath):
    """
    Write baby_names.xml to a file as ISO-8859-1 and read it back.

    The original test only checked that writing did not raise; reading the
    file back with the same encoding additionally verifies that the written
    document reproduces the frame.
    """
    filename = datapath("io", "data", "xml", "baby_names.xml")
    df_file = read_xml(filename, encoding="ISO-8859-1", parser="lxml")

    with tm.ensure_clean("test.xml") as path:
        df_file.to_xml(path, index=False, encoding="ISO-8859-1", parser="lxml")
        # Must happen inside the context: the temporary file is removed on exit.
        result = read_xml(path, encoding="ISO-8859-1", parser="lxml")

    tm.assert_frame_equal(result, df_file)
@td.skip_if_no("lxml")
@pytest.mark.parametrize("encoding", ["UTF-8", "UTF-16", "ISO-8859-1"])
def test_wrong_encoding_option_lxml(datapath, parser, encoding):
    """
    Smoke test: write ISO-8859-1 source data to a file under several encodings.

    Nothing is asserted about the written content; the test passes as long as
    ``to_xml`` does not raise.
    """
    baby_names = read_xml(
        datapath("io", "data", "xml", "baby_names.xml"),
        encoding="ISO-8859-1",
        parser="lxml",
    )

    with tm.ensure_clean("test.xml") as path:
        baby_names.to_xml(path, index=False, encoding=encoding, parser=parser)
def test_misspelled_encoding(parser):
    """An unknown codec name raises ``LookupError``."""
    # "uft-8" is a deliberate transposition of "utf-8".
    bad_encoding = "uft-8"

    with pytest.raises(LookupError, match="unknown encoding"):
        geom_df.to_xml(encoding=bad_encoding, parser=parser)
- # PRETTY PRINT
@td.skip_if_no("lxml")
def test_xml_declaration_pretty_print():
    """``xml_declaration=False`` drops the declaration but keeps indentation."""
    expected = """\
<data>
  <row>
    <index>0</index>
    <shape>square</shape>
    <degrees>360</degrees>
    <sides>4.0</sides>
  </row>
  <row>
    <index>1</index>
    <shape>circle</shape>
    <degrees>360</degrees>
    <sides/>
  </row>
  <row>
    <index>2</index>
    <shape>triangle</shape>
    <degrees>180</degrees>
    <sides>3.0</sides>
  </row>
</data>"""

    result = geom_df.to_xml(xml_declaration=False)

    assert result == expected
def test_no_pretty_print_with_decl(parser):
    """``pretty_print=False`` writes the body on one line after the declaration."""
    expected = (
        "<?xml version='1.0' encoding='utf-8'?>\n"
        "<data><row><index>0</index><shape>square</shape>"
        "<degrees>360</degrees><sides>4.0</sides></row><row>"
        "<index>1</index><shape>circle</shape><degrees>360"
        "</degrees><sides/></row><row><index>2</index><shape>"
        "triangle</shape><degrees>180</degrees><sides>3.0</sides>"
        "</row></data>"
    )

    result = equalize_decl(geom_df.to_xml(pretty_print=False, parser=parser))

    if result is not None:
        # etree adds space for closed tags
        result = result.replace(" />", "/>")

    assert result == expected
def test_no_pretty_print_no_decl(parser):
    """No declaration and no pretty printing yields a single-line document."""
    expected = (
        "<data><row><index>0</index><shape>square</shape>"
        "<degrees>360</degrees><sides>4.0</sides></row><row>"
        "<index>1</index><shape>circle</shape><degrees>360"
        "</degrees><sides/></row><row><index>2</index><shape>"
        "triangle</shape><degrees>180</degrees><sides>3.0</sides>"
        "</row></data>"
    )

    result = geom_df.to_xml(xml_declaration=False, pretty_print=False, parser=parser)

    if result is not None:
        # etree adds space for closed tags
        result = result.replace(" />", "/>")

    assert result == expected
- # PARSER
@td.skip_if_installed("lxml")
def test_default_parser_no_lxml():
    """Without lxml installed, the default parser raises ``ImportError``."""
    err_msg = "lxml not found, please install or use the etree parser."

    with pytest.raises(ImportError, match=err_msg):
        geom_df.to_xml()
def test_unknown_parser():
    """A parser other than lxml or etree raises ``ValueError``."""
    err_msg = "Values for parser can only be lxml or etree."

    with pytest.raises(ValueError, match=err_msg):
        geom_df.to_xml(parser="bs4")
- # STYLESHEET
# Expected result of transforming ``geom_df`` with row_field_output.xsl:
# every child element becomes <field field="<original name>">. Note the
# double-quoted declaration, unlike the other expected strings in this file.
xsl_expected = """\
<?xml version="1.0" encoding="utf-8"?>
<data>
  <row>
    <field field="index">0</field>
    <field field="shape">square</field>
    <field field="degrees">360</field>
    <field field="sides">4.0</field>
  </row>
  <row>
    <field field="index">1</field>
    <field field="shape">circle</field>
    <field field="degrees">360</field>
    <field field="sides"/>
  </row>
  <row>
    <field field="index">2</field>
    <field field="shape">triangle</field>
    <field field="degrees">180</field>
    <field field="sides">3.0</field>
  </row>
</data>"""
@td.skip_if_no("lxml")
def test_stylesheet_file_like(datapath, mode):
    """``stylesheet`` accepts an open file handle (binary or text)."""
    xsl_path = datapath("io", "data", "xml", "row_field_output.xsl")

    with open(xsl_path, mode) as xsl_handle:
        result = geom_df.to_xml(stylesheet=xsl_handle)
        assert result == xsl_expected
@td.skip_if_no("lxml")
def test_stylesheet_io(datapath, mode):
    """``stylesheet`` accepts an in-memory ``BytesIO`` / ``StringIO`` buffer."""
    xsl_path = datapath("io", "data", "xml", "row_field_output.xsl")

    # Pick the buffer type that matches what the file handle will return.
    buffer_cls = BytesIO if mode == "rb" else StringIO

    xsl_obj: BytesIO | StringIO
    with open(xsl_path, mode) as fh:
        xsl_obj = buffer_cls(fh.read())

    result = geom_df.to_xml(stylesheet=xsl_obj)

    assert result == xsl_expected
@td.skip_if_no("lxml")
def test_stylesheet_buffered_reader(datapath, mode):
    """``stylesheet`` accepts the raw file content (``bytes`` or ``str``)."""
    xsl_path = datapath("io", "data", "xml", "row_field_output.xsl")

    with open(xsl_path, mode) as fh:
        xsl_content = fh.read()

    result = geom_df.to_xml(stylesheet=xsl_content)

    assert result == xsl_expected
@td.skip_if_no("lxml")
def test_stylesheet_wrong_path():
    """A stylesheet path that cannot be used raises lxml's ``XMLSyntaxError``."""
    from lxml.etree import XMLSyntaxError

    bogus_xsl = os.path.join("data", "xml", "row_field_output.xslt")
    err_msg = "Start tag expected, '<' not found"

    with pytest.raises(XMLSyntaxError, match=err_msg):
        geom_df.to_xml(stylesheet=bogus_xsl)
@td.skip_if_no("lxml")
@pytest.mark.parametrize("val", ["", b""])
def test_empty_string_stylesheet(val):
    """An empty ``str`` / ``bytes`` stylesheet raises ``XMLSyntaxError``."""
    from lxml.etree import XMLSyntaxError

    # Alternation of the messages accepted for an empty stylesheet.
    msg = (
        "Document is empty"
        "|Start tag expected, '<' not found"
        # Seen on Mac with lxml 4.9.1
        r"|None \(line 0\)"
    )

    with pytest.raises(XMLSyntaxError, match=msg):
        geom_df.to_xml(stylesheet=val)
@td.skip_if_no("lxml")
def test_incorrect_xsl_syntax():
    """A malformed stylesheet raises ``XMLSyntaxError``."""
    from lxml.etree import XMLSyntaxError

    # The <xsl:output ...> element below is never closed.
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" encoding="utf-8" indent="yes" >
    <xsl:strip-space elements="*"/>

    <xsl:template match="@*|node()">
        <xsl:copy>
            <xsl:apply-templates select="@*|node()"/>
        </xsl:copy>
    </xsl:template>

    <xsl:template match="row/*">
        <field>
            <xsl:attribute name="field">
                <xsl:value-of select="name()"/>
            </xsl:attribute>
            <xsl:value-of select="text()"/>
        </field>
    </xsl:template>
</xsl:stylesheet>"""

    err_msg = "Opening and ending tag mismatch"

    with pytest.raises(XMLSyntaxError, match=err_msg):
        geom_df.to_xml(stylesheet=xsl)
@td.skip_if_no("lxml")
def test_incorrect_xsl_eval():
    """A stylesheet with an invalid match pattern raises ``XSLTParseError``."""
    from lxml.etree import XSLTParseError

    # Well-formed XML, but "node(*)" in the first template's match is invalid.
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" encoding="utf-8" indent="yes" />
    <xsl:strip-space elements="*"/>

    <xsl:template match="@*|node(*)">
        <xsl:copy>
            <xsl:apply-templates select="@*|node()"/>
        </xsl:copy>
    </xsl:template>

    <xsl:template match="row/*">
        <field>
            <xsl:attribute name="field">
                <xsl:value-of select="name()"/>
            </xsl:attribute>
            <xsl:value-of select="text()"/>
        </field>
    </xsl:template>
</xsl:stylesheet>"""

    err_msg = "failed to compile"

    with pytest.raises(XSLTParseError, match=err_msg):
        geom_df.to_xml(stylesheet=xsl)
@td.skip_if_no("lxml")
def test_incorrect_xsl_apply():
    """A stylesheet referencing a missing document raises ``XSLTApplyError``."""
    from lxml.etree import XSLTApplyError

    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" encoding="utf-8" indent="yes" />
    <xsl:strip-space elements="*"/>

    <xsl:template match="@*|node()">
        <xsl:copy>
            <xsl:copy-of select="document('non_existent.xml')/*"/>
        </xsl:copy>
    </xsl:template>
</xsl:stylesheet>"""

    err_msg = "Cannot resolve URI"

    with pytest.raises(XSLTApplyError, match=err_msg), tm.ensure_clean(
        "test.xml"
    ) as path:
        geom_df.to_xml(path, stylesheet=xsl)
def test_stylesheet_with_etree():
    """Combining ``parser="etree"`` with a stylesheet raises ``ValueError``."""
    # The closing </xsl:stylesheet> tag is absent, exactly as in the original
    # literal.
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" encoding="utf-8" indent="yes" />
    <xsl:strip-space elements="*"/>

    <xsl:template match="@*|node(*)">
        <xsl:copy>
            <xsl:apply-templates select="@*|node()"/>
        </xsl:copy>
    </xsl:template>"""

    err_msg = "To use stylesheet, you need lxml installed"

    with pytest.raises(ValueError, match=err_msg):
        geom_df.to_xml(parser="etree", stylesheet=xsl)
@td.skip_if_no("lxml")
def test_style_to_csv():
    """An XSLT text transform reproduces ``DataFrame.to_csv`` output."""
    # ``&#xa;`` is a line feed; it terminates the header and every row.
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text" indent="yes" />
    <xsl:strip-space elements="*"/>

    <xsl:param name="delim">,</xsl:param>
    <xsl:template match="/data">
        <xsl:text>,shape,degrees,sides&#xa;</xsl:text>
        <xsl:apply-templates select="row"/>
    </xsl:template>

    <xsl:template match="row">
        <xsl:value-of select="concat(index, $delim, shape, $delim,
                                     degrees, $delim, sides)"/>
        <xsl:text>&#xa;</xsl:text>
    </xsl:template>
</xsl:stylesheet>"""

    csv_text = geom_df.to_csv(lineterminator="\n")
    if csv_text is not None:
        csv_text = csv_text.strip()

    xml_text = geom_df.to_xml(stylesheet=xsl)

    assert csv_text == xml_text
@td.skip_if_no("lxml")
def test_style_to_string():
    """
    An XSLT text transform reproduces ``DataFrame.to_string`` output.

    The stylesheet right-aligns each value by prepending
    ``substring($delim, 1, n)``, so the ``delim`` parameter must contain at
    least as many spaces as the widest gap (six for this frame: a three
    character value under the seven character "degrees" header plus the
    two-space column separator). A ``delim`` collapsed to a single space
    cannot pad the columns; it is restored here to fifteen spaces, and the
    header line is restored to the spacing ``to_string`` produces.
    """
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text" indent="yes" />
    <xsl:strip-space elements="*"/>

    <xsl:param name="delim"><xsl:text>               </xsl:text></xsl:param>
    <xsl:template match="/data">
        <xsl:text>      shape  degrees  sides&#xa;</xsl:text>
        <xsl:apply-templates select="row"/>
    </xsl:template>

    <xsl:template match="row">
        <xsl:value-of select="concat(index, ' ',
            substring($delim, 1, string-length('triangle')
                      - string-length(shape) + 1),
            shape,
            substring($delim, 1, string-length(name(degrees))
                      - string-length(degrees) + 2),
            degrees,
            substring($delim, 1, string-length(name(sides))
                      - string-length(sides) + 2),
            sides)"/>
        <xsl:text>&#xa;</xsl:text>
    </xsl:template>
</xsl:stylesheet>"""

    out_str = geom_df.to_string()
    # na_rep="NaN" so the missing value is spelled the way to_string prints it.
    out_xml = geom_df.to_xml(na_rep="NaN", stylesheet=xsl)

    assert out_xml == out_str
@td.skip_if_no("lxml")
def test_style_to_json():
    """An XSLT text transform reproduces ``DataFrame.to_json`` output."""
    # Empty elements become null, numeric text is emitted bare, and any
    # other text is wrapped in double quotes.
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="text" indent="yes" />
    <xsl:strip-space elements="*"/>

    <xsl:param name="quot">"</xsl:param>

    <xsl:template match="/data">
        <xsl:text>{"shape":{</xsl:text>
        <xsl:apply-templates select="descendant::row/shape"/>
        <xsl:text>},"degrees":{</xsl:text>
        <xsl:apply-templates select="descendant::row/degrees"/>
        <xsl:text>},"sides":{</xsl:text>
        <xsl:apply-templates select="descendant::row/sides"/>
        <xsl:text>}}</xsl:text>
    </xsl:template>

    <xsl:template match="shape|degrees|sides">
        <xsl:variable name="val">
            <xsl:if test = ".=''">
                <xsl:value-of select="'null'"/>
            </xsl:if>
            <xsl:if test = "number(text()) = text()">
                <xsl:value-of select="text()"/>
            </xsl:if>
            <xsl:if test = "number(text()) != text()">
                <xsl:value-of select="concat($quot, text(), $quot)"/>
            </xsl:if>
        </xsl:variable>
        <xsl:value-of select="concat($quot, preceding-sibling::index,
                                     $quot,':', $val)"/>
        <xsl:if test="preceding-sibling::index != //row[last()]/index">
            <xsl:text>,</xsl:text>
        </xsl:if>
    </xsl:template>
</xsl:stylesheet>"""

    json_text = geom_df.to_json()
    xml_text = geom_df.to_xml(stylesheet=xsl)

    assert json_text == xml_text
- # COMPRESSION
# Expected default output for ``geom_df``; compared against the decompressed
# file content in the compression tests.
geom_xml = """\
<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <index>0</index>
    <shape>square</shape>
    <degrees>360</degrees>
    <sides>4.0</sides>
  </row>
  <row>
    <index>1</index>
    <shape>circle</shape>
    <degrees>360</degrees>
    <sides/>
  </row>
  <row>
    <index>2</index>
    <shape>triangle</shape>
    <degrees>180</degrees>
    <sides>3.0</sides>
  </row>
</data>"""
def test_compression_output(parser, compression_only):
    """Compressed file output decompresses back to the expected document."""
    with tm.ensure_clean() as path:
        geom_df.to_xml(path, parser=parser, compression=compression_only)

        with get_handle(path, "r", compression=compression_only) as handles:
            decompressed = handles.handle.read()

    result = equalize_decl(decompressed)

    assert geom_xml == result.strip()
def test_filename_and_suffix_comp(parser, compression_only):
    """Same as the compression test, with a matching file-name suffix."""
    suffix = _compression_to_extension[compression_only]
    compfile = "xml." + suffix

    with tm.ensure_clean(filename=compfile) as path:
        geom_df.to_xml(path, parser=parser, compression=compression_only)

        with get_handle(path, "r", compression=compression_only) as handles:
            decompressed = handles.handle.read()

    result = equalize_decl(decompressed)

    assert geom_xml == result.strip()
def test_ea_dtypes(any_numeric_ea_dtype, parser):
    """``NA`` in a numeric extension dtype is written as an empty element."""
    # GH#43903
    expected = """<?xml version='1.0' encoding='utf-8'?>
<data>
  <row>
    <index>0</index>
    <a/>
  </row>
</data>"""

    na_frame = DataFrame({"a": [NA]}).astype(any_numeric_ea_dtype)

    xml_text = na_frame.to_xml(parser=parser)

    assert equalize_decl(xml_text).strip() == expected
def test_unsuported_compression(parser):
    """An unrecognized compression name raises ``ValueError``."""
    err_msg = "Unrecognized compression type"

    with pytest.raises(ValueError, match=err_msg), tm.ensure_clean() as path:
        geom_df.to_xml(path, parser=parser, compression="7z")
- # STORAGE OPTIONS
@pytest.mark.single_cpu
@td.skip_if_no("s3fs")
@td.skip_if_no("lxml")
def test_s3_permission_output(parser, s3_resource):
    """Anonymous access to the test bucket raises ``PermissionError``."""
    # s3_resource hosts pandas-test
    import s3fs

    bucket = "pandas-test"

    # NOTE(review): all three statements sit inside one ``pytest.raises``
    # block, so whichever raises first ends the block and the remaining
    # statements (possibly including ``to_xml``) never execute.
    with pytest.raises(PermissionError, match="Access Denied"):
        anon_fs = s3fs.S3FileSystem(anon=True)
        anon_fs.ls(bucket)

        geom_df.to_xml("s3://pandas-test/geom.xml", compression="zip", parser=parser)
|