# test_xml.py
from __future__ import annotations
from io import (
    BytesIO,
    StringIO,
)
from lzma import LZMAError
import os
from tarfile import ReadError
from urllib.error import HTTPError
from xml.etree.ElementTree import ParseError
from zipfile import BadZipFile
import numpy as np
import pytest
from pandas.compat import is_ci_environment
from pandas.compat._optional import import_optional_dependency
from pandas.errors import (
    EmptyDataError,
    ParserError,
)
import pandas.util._test_decorators as td
import pandas as pd
from pandas import (
    NA,
    DataFrame,
    Series,
)
import pandas._testing as tm
from pandas.core.arrays import (
    ArrowStringArray,
    StringArray,
)
from pandas.io.common import get_handle
from pandas.io.xml import read_xml
# CHECK LIST
# [x] - ValueError: "Values for parser can only be lxml or etree."
# etree
# [X] - ImportError: "lxml not found, please install or use the etree parser."
# [X] - TypeError: "expected str, bytes or os.PathLike object, not NoneType"
# [X] - ValueError: "Either element or attributes can be parsed not both."
# [X] - ValueError: "xpath does not return any nodes..."
# [X] - SyntaxError: "You have used an incorrect or unsupported XPath"
# [X] - ValueError: "names does not match length of child elements in xpath."
# [X] - TypeError: "...is not a valid type for names"
# [X] - ValueError: "To use stylesheet, you need lxml installed..."
# [] - URLError: (GENERAL ERROR WITH HTTPError AS SUBCLASS)
# [X] - HTTPError: "HTTP Error 404: Not Found"
# [] - OSError: (GENERAL ERROR WITH FileNotFoundError AS SUBCLASS)
# [X] - FileNotFoundError: "No such file or directory"
# [] - ParseError (FAILSAFE CATCH ALL FOR VERY COMPLEX XML)
# [X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..."
# [X] - UnicodeError: "UTF-16 stream does not start with BOM"
# [X] - BadZipFile: "File is not a zip file"
# [X] - OSError: "Invalid data stream"
# [X] - LZMAError: "Input format not supported by decoder"
# [X] - ValueError: "Unrecognized compression type"
# [X] - PermissionError: "Forbidden"
# lxml
# [X] - ValueError: "Either element or attributes can be parsed not both."
# [X] - AttributeError: "__enter__"
# [X] - XSLTApplyError: "Cannot resolve URI"
# [X] - XSLTParseError: "document is not a stylesheet"
# [X] - ValueError: "xpath does not return any nodes."
# [X] - XPathEvalError: "Invalid expression"
# [] - XPathSyntaxError: (OLD VERSION IN lxml FOR XPATH ERRORS)
# [X] - TypeError: "empty namespace prefix is not supported in XPath"
# [X] - ValueError: "names does not match length of child elements in xpath."
# [X] - TypeError: "...is not a valid type for names"
# [X] - LookupError: "unknown encoding"
# [] - URLError: (USUALLY DUE TO NETWORKING)
# [X] - HTTPError: "HTTP Error 404: Not Found"
# [X] - OSError: "failed to load external entity"
# [X] - XMLSyntaxError: "Start tag expected, '<' not found"
# [] - ParserError: (FAILSAFE CATCH ALL FOR VERY COMPLEX XML)
# [X] - ValueError: "Values for parser can only be lxml or etree."
# [X] - UnicodeDecodeError: "'utf-8' codec can't decode byte 0xe9..."
# [X] - UnicodeError: "UTF-16 stream does not start with BOM"
# [X] - BadZipFile: "File is not a zip file"
# [X] - OSError: "Invalid data stream"
# [X] - LZMAError: "Input format not supported by decoder"
# [X] - ValueError: "Unrecognized compression type"
# [X] - PermissionError: "Forbidden"
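# A minimal sketch of the pytest.raises pattern the checklist items above map
# to, assuming `filename` is any XML path on disk (mirrors test_wrong_parser
# further below):
#
#     with pytest.raises(
#         ValueError, match="Values for parser can only be lxml or etree."
#     ):
#         read_xml(filename, parser="bs4")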
geom_df = DataFrame(
    {
        "shape": ["square", "circle", "triangle"],
        "degrees": [360, 360, 180],
        "sides": [4, np.nan, 3],
    }
)
xml_default_nmsp = """\
<?xml version='1.0' encoding='utf-8'?>
<data xmlns="http://example.com">
<row>
<shape>square</shape>
<degrees>360</degrees>
<sides>4</sides>
</row>
<row>
<shape>circle</shape>
<degrees>360</degrees>
<sides/>
</row>
<row>
<shape>triangle</shape>
<degrees>180</degrees>
<sides>3</sides>
</row>
</data>"""
xml_prefix_nmsp = """\
<?xml version='1.0' encoding='utf-8'?>
<doc:data xmlns:doc="http://example.com">
<doc:row>
<doc:shape>square</doc:shape>
<doc:degrees>360</doc:degrees>
<doc:sides>4.0</doc:sides>
</doc:row>
<doc:row>
<doc:shape>circle</doc:shape>
<doc:degrees>360</doc:degrees>
<doc:sides/>
</doc:row>
<doc:row>
<doc:shape>triangle</doc:shape>
<doc:degrees>180</doc:degrees>
<doc:sides>3.0</doc:sides>
</doc:row>
</doc:data>"""
df_kml = DataFrame(
    {
        "id": {
            0: "ID_00001",
            1: "ID_00002",
            2: "ID_00003",
            3: "ID_00004",
            4: "ID_00005",
        },
        "name": {
            0: "Blue Line (Forest Park)",
            1: "Red, Purple Line",
            2: "Red, Purple Line",
            3: "Red, Purple Line",
            4: "Red, Purple Line",
        },
        "styleUrl": {
            0: "#LineStyle01",
            1: "#LineStyle01",
            2: "#LineStyle01",
            3: "#LineStyle01",
            4: "#LineStyle01",
        },
        "extrude": {0: 0, 1: 0, 2: 0, 3: 0, 4: 0},
        "altitudeMode": {
            0: "clampedToGround",
            1: "clampedToGround",
            2: "clampedToGround",
            3: "clampedToGround",
            4: "clampedToGround",
        },
        "coordinates": {
            0: (
                "-87.77678526964958,41.8708863930319,0 "
                "-87.77826234150609,41.87097820122218,0 "
                "-87.78251583439344,41.87130129991005,0 "
                "-87.78418294588424,41.87145055520308,0 "
                "-87.7872369165933,41.8717239119163,0 "
                "-87.79160214925886,41.87210797280065,0"
            ),
            1: (
                "-87.65758750947528,41.96427269188822,0 "
                "-87.65802133507393,41.96581929055245,0 "
                "-87.65819033925305,41.96621846093642,0 "
                "-87.6583189819129,41.96650362897086,0 "
                "-87.65835858701473,41.96669002089185,0 "
                "-87.65838428411853,41.96688150295095,0 "
                "-87.65842208882658,41.96745896091846,0 "
                "-87.65846556843937,41.9683761425439,0 "
                "-87.65849296214573,41.96913893870342,0"
            ),
            2: (
                "-87.65492939166126,41.95377494531437,0 "
                "-87.65557043199591,41.95376544118533,0 "
                "-87.65606302030132,41.95376391658746,0 "
                "-87.65623502146268,41.95377379126367,0 "
                "-87.65634748981634,41.95380103566435,0 "
                "-87.65646537904269,41.95387703994676,0 "
                "-87.65656532461145,41.95396622645799,0 "
                "-87.65664760856414,41.95404201996044,0 "
                "-87.65671750555913,41.95416647054043,0 "
                "-87.65673983607117,41.95429949810849,0 "
                "-87.65673866475777,41.95441024240925,0 "
                "-87.6567690255541,41.95490657227902,0 "
                "-87.65683672482363,41.95692259283837,0 "
                "-87.6568900886376,41.95861070983142,0 "
                "-87.65699865558875,41.96181418669004,0 "
                "-87.65756347177603,41.96397045777844,0 "
                "-87.65758750947528,41.96427269188822,0"
            ),
            3: (
                "-87.65362593118043,41.94742799535678,0 "
                "-87.65363554415794,41.94819886386848,0 "
                "-87.6536456393239,41.95059994675451,0 "
                "-87.65365831235026,41.95108288489359,0 "
                "-87.6536604873874,41.9519954657554,0 "
                "-87.65362592053201,41.95245597302328,0 "
                "-87.65367158496069,41.95311153649393,0 "
                "-87.65368468595476,41.9533202828916,0 "
                "-87.65369271253692,41.95343095587119,0 "
                "-87.65373335834569,41.95351536301472,0 "
                "-87.65378605844126,41.95358212680591,0 "
                "-87.65385067928185,41.95364452823767,0 "
                "-87.6539390793817,41.95370263886964,0 "
                "-87.6540786298351,41.95373403675265,0 "
                "-87.65430648647626,41.9537535411832,0 "
                "-87.65492939166126,41.95377494531437,0"
            ),
            4: (
                "-87.65345391792157,41.94217681262115,0 "
                "-87.65342448305786,41.94237224420864,0 "
                "-87.65339745703922,41.94268217746244,0 "
                "-87.65337753982941,41.94288140770284,0 "
                "-87.65336256753105,41.94317369618263,0 "
                "-87.65338799707138,41.94357253961736,0 "
                "-87.65340240886648,41.94389158188269,0 "
                "-87.65341837392448,41.94406444407721,0 "
                "-87.65342275247338,41.94421065714904,0 "
                "-87.65347469646018,41.94434829382345,0 "
                "-87.65351486483024,41.94447699917548,0 "
                "-87.65353483605053,41.9453896864472,0 "
                "-87.65361975532807,41.94689193720703,0 "
                "-87.65362593118043,41.94742799535678,0"
            ),
        },
    }
)
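# Shared fixtures: `mode` opens file-based fixtures in both text ("r") and
# binary ("rb") mode; `parser` runs each test against the lxml backend
# (skipped when lxml is not installed) as well as the stdlib etree backend.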
@pytest.fixture(params=["rb", "r"])
def mode(request):
    return request.param
@pytest.fixture(params=[pytest.param("lxml", marks=td.skip_if_no("lxml")), "etree"])
def parser(request):
    return request.param
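# read_xml with iterparse only works against files on disk, so this helper
# round-trips an XML string through a temporary file before parsing it.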
def read_xml_iterparse(data, **kwargs):
    with tm.ensure_clean() as path:
        with open(path, "w") as f:
            f.write(data)
        return read_xml(path, **kwargs)
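# Same idea for compressed fixtures: decompress the file to a temporary
# plain-text copy first, then parse that copy with iterparse.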
def read_xml_iterparse_comp(comp_path, compression_only, **kwargs):
    with get_handle(comp_path, "r", compression=compression_only) as handles:
        with tm.ensure_clean() as path:
            with open(path, "w") as f:
                f.write(handles.handle.read())
            return read_xml(path, **kwargs)
# FILE / URL
@td.skip_if_no("lxml")
def test_parser_consistency_file(datapath):
    filename = datapath("io", "data", "xml", "books.xml")
    df_file_lxml = read_xml(filename, parser="lxml")
    df_file_etree = read_xml(filename, parser="etree")
    df_iter_lxml = read_xml(
        filename,
        parser="lxml",
        iterparse={"book": ["category", "title", "year", "author", "price"]},
    )
    df_iter_etree = read_xml(
        filename,
        parser="etree",
        iterparse={"book": ["category", "title", "year", "author", "price"]},
    )
    tm.assert_frame_equal(df_file_lxml, df_file_etree)
    tm.assert_frame_equal(df_file_lxml, df_iter_lxml)
    tm.assert_frame_equal(df_iter_lxml, df_iter_etree)
@pytest.mark.network
@pytest.mark.slow
@tm.network(
    url=(
        "https://data.cityofchicago.org/api/views/"
        "8pix-ypme/rows.xml?accessType=DOWNLOAD"
    ),
    check_before_test=True,
)
def test_parser_consistency_url(parser):
    url = (
        "https://data.cityofchicago.org/api/views/"
        "8pix-ypme/rows.xml?accessType=DOWNLOAD"
    )
    with tm.ensure_clean(filename="cta.xml") as path:
        (read_xml(url, xpath=".//row/row", parser=parser).to_xml(path, index=False))
        df_xpath = read_xml(path, parser=parser)
        df_iter = read_xml(
            path,
            parser=parser,
            iterparse={
                "row": [
                    "_id",
                    "_uuid",
                    "_position",
                    "_address",
                    "stop_id",
                    "direction_id",
                    "stop_name",
                    "station_name",
                    "station_descriptive_name",
                    "map_id",
                    "ada",
                    "red",
                    "blue",
                    "g",
                    "brn",
                    "p",
                    "pexp",
                    "y",
                    "pnk",
                    "o",
                    "location",
                ]
            },
        )
    tm.assert_frame_equal(df_xpath, df_iter)
def test_file_like(datapath, parser, mode):
    filename = datapath("io", "data", "xml", "books.xml")
    with open(filename, mode) as f:
        df_file = read_xml(f, parser=parser)
    df_expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(df_file, df_expected)
def test_file_io(datapath, parser, mode):
    filename = datapath("io", "data", "xml", "books.xml")
    with open(filename, mode) as f:
        xml_obj = f.read()
    df_io = read_xml(
        (BytesIO(xml_obj) if isinstance(xml_obj, bytes) else StringIO(xml_obj)),
        parser=parser,
    )
    df_expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(df_io, df_expected)
def test_file_buffered_reader_string(datapath, parser, mode):
    filename = datapath("io", "data", "xml", "books.xml")
    with open(filename, mode) as f:
        xml_obj = f.read()
    df_str = read_xml(xml_obj, parser=parser)
    df_expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(df_str, df_expected)
def test_file_buffered_reader_no_xml_declaration(datapath, parser, mode):
    filename = datapath("io", "data", "xml", "books.xml")
    with open(filename, mode) as f:
        next(f)
        xml_obj = f.read()
    df_str = read_xml(xml_obj, parser=parser)
    df_expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(df_str, df_expected)
def test_string_charset(parser):
    txt = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"
    df_str = read_xml(txt, parser=parser)
    df_expected = DataFrame({"c1": 1, "c2": 2}, index=[0])
    tm.assert_frame_equal(df_str, df_expected)
def test_file_charset(datapath, parser):
    xml_file = datapath("io", "data", "xml", "doc_ch_utf.xml")
    df_file = read_xml(datapath(xml_file), parser=parser)
    df_expected = DataFrame(
        {
            "問": [
                "問 若箇是邪而言破邪 何者是正而道(Sorry, this is Big5 only)申正",
                "問 既破有得申無得 亦應但破性執申假名以不",
                "問 既破性申假 亦應但破有申無 若有無兩洗 亦應性假雙破耶",
            ],
            "答": [
                "答 邪既無量 正亦多途 大略為言不出二種 謂有得與無得 有得是邪須破 無得是正須申\n\t\t故",
                None,
                "答 不例 有無皆是性 所以須雙破 既分性假異 故有破不破",
            ],
            "a": [None, "答 性執是有得 假名是無得 今破有得申無得 即是破性執申假名也", None],
        }
    )
    tm.assert_frame_equal(df_file, df_expected)
def test_file_handle_close(datapath, parser):
    xml_file = datapath("io", "data", "xml", "books.xml")
    with open(xml_file, "rb") as f:
        read_xml(BytesIO(f.read()), parser=parser)
        assert not f.closed
@td.skip_if_no("lxml")
@pytest.mark.parametrize("val", ["", b""])
def test_empty_string_lxml(val):
    from lxml.etree import XMLSyntaxError
    msg = "|".join(
        [
            "Document is empty",
            # Seen on Mac with lxml 4.91
            r"None \(line 0\)",
        ]
    )
    with pytest.raises(XMLSyntaxError, match=msg):
        read_xml(val, parser="lxml")
@pytest.mark.parametrize("val", ["", b""])
def test_empty_string_etree(val):
    with pytest.raises(ParseError, match="no element found"):
        read_xml(val, parser="etree")
@td.skip_if_no("lxml")
def test_wrong_file_path_lxml():
    from lxml.etree import XMLSyntaxError
    filename = os.path.join("data", "html", "books.xml")
    with pytest.raises(
        XMLSyntaxError,
        match=("Start tag expected, '<' not found"),
    ):
        read_xml(filename, parser="lxml")
def test_wrong_file_path_etree():
    filename = os.path.join("data", "html", "books.xml")
    with pytest.raises(
        ParseError,
        match=("not well-formed"),
    ):
        read_xml(filename, parser="etree")
@pytest.mark.network
@tm.network(
    url="https://www.w3schools.com/xml/books.xml",
    check_before_test=True,
)
@td.skip_if_no("lxml")
def test_url():
    url = "https://www.w3schools.com/xml/books.xml"
    df_url = read_xml(url, xpath=".//book[count(*)=4]")
    df_expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
            "cover": [None, None, "paperback"],
        }
    )
    tm.assert_frame_equal(df_url, df_expected)
@pytest.mark.network
@tm.network(url="https://www.w3schools.com/xml/python.xml", check_before_test=True)
def test_wrong_url(parser):
    with pytest.raises(HTTPError, match=("HTTP Error 404: Not Found")):
        url = "https://www.w3schools.com/xml/python.xml"
        read_xml(url, xpath=".//book[count(*)=4]", parser=parser)
# XPATH
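# etree supports only a limited XPath subset and raises SyntaxError on bad
# expressions, while lxml evaluates full XPath 1.0 and raises XPathEvalError.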
@td.skip_if_no("lxml")
def test_empty_xpath_lxml(datapath):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(ValueError, match=("xpath does not return any nodes")):
        read_xml(filename, xpath=".//python", parser="lxml")
def test_bad_xpath_etree(datapath):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(
        SyntaxError, match=("You have used an incorrect or unsupported XPath")
    ):
        read_xml(filename, xpath=".//[book]", parser="etree")
@td.skip_if_no("lxml")
def test_bad_xpath_lxml(datapath):
    from lxml.etree import XPathEvalError
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(XPathEvalError, match=("Invalid expression")):
        read_xml(filename, xpath=".//[book]", parser="lxml")
# NAMESPACE
def test_default_namespace(parser):
    df_nmsp = read_xml(
        xml_default_nmsp,
        xpath=".//ns:row",
        namespaces={"ns": "http://example.com"},
        parser=parser,
    )
    df_iter = read_xml_iterparse(
        xml_default_nmsp,
        parser=parser,
        iterparse={"row": ["shape", "degrees", "sides"]},
    )
    df_expected = DataFrame(
        {
            "shape": ["square", "circle", "triangle"],
            "degrees": [360, 360, 180],
            "sides": [4.0, float("nan"), 3.0],
        }
    )
    tm.assert_frame_equal(df_nmsp, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_prefix_namespace(parser):
    df_nmsp = read_xml(
        xml_prefix_nmsp,
        xpath=".//doc:row",
        namespaces={"doc": "http://example.com"},
        parser=parser,
    )
    df_iter = read_xml_iterparse(
        xml_prefix_nmsp, parser=parser, iterparse={"row": ["shape", "degrees", "sides"]}
    )
    df_expected = DataFrame(
        {
            "shape": ["square", "circle", "triangle"],
            "degrees": [360, 360, 180],
            "sides": [4.0, float("nan"), 3.0],
        }
    )
    tm.assert_frame_equal(df_nmsp, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
@td.skip_if_no("lxml")
def test_consistency_default_namespace():
    df_lxml = read_xml(
        xml_default_nmsp,
        xpath=".//ns:row",
        namespaces={"ns": "http://example.com"},
        parser="lxml",
    )
    df_etree = read_xml(
        xml_default_nmsp,
        xpath=".//doc:row",
        namespaces={"doc": "http://example.com"},
        parser="etree",
    )
    tm.assert_frame_equal(df_lxml, df_etree)
@td.skip_if_no("lxml")
def test_consistency_prefix_namespace():
    df_lxml = read_xml(
        xml_prefix_nmsp,
        xpath=".//doc:row",
        namespaces={"doc": "http://example.com"},
        parser="lxml",
    )
    df_etree = read_xml(
        xml_prefix_nmsp,
        xpath=".//doc:row",
        namespaces={"doc": "http://example.com"},
        parser="etree",
    )
    tm.assert_frame_equal(df_lxml, df_etree)
# PREFIX
def test_missing_prefix_with_default_namespace(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(ValueError, match=("xpath does not return any nodes")):
        read_xml(filename, xpath=".//Placemark", parser=parser)
def test_missing_prefix_definition_etree(datapath):
    filename = datapath("io", "data", "xml", "cta_rail_lines.kml")
    with pytest.raises(SyntaxError, match=("you used an undeclared namespace prefix")):
        read_xml(filename, xpath=".//kml:Placemark", parser="etree")
@td.skip_if_no("lxml")
def test_missing_prefix_definition_lxml(datapath):
    from lxml.etree import XPathEvalError
    filename = datapath("io", "data", "xml", "cta_rail_lines.kml")
    with pytest.raises(XPathEvalError, match=("Undefined namespace prefix")):
        read_xml(filename, xpath=".//kml:Placemark", parser="lxml")
@td.skip_if_no("lxml")
@pytest.mark.parametrize("key", ["", None])
def test_none_namespace_prefix(key):
    with pytest.raises(
        TypeError, match=("empty namespace prefix is not supported in XPath")
    ):
        read_xml(
            xml_default_nmsp,
            xpath=".//kml:Placemark",
            namespaces={key: "http://www.opengis.net/kml/2.2"},
            parser="lxml",
        )
# ELEMS AND ATTRS
def test_file_elems_and_attrs(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    df_file = read_xml(filename, parser=parser)
    df_iter = read_xml(
        filename,
        parser=parser,
        iterparse={"book": ["category", "title", "author", "year", "price"]},
    )
    df_expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(df_file, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_file_only_attrs(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    df_file = read_xml(filename, attrs_only=True, parser=parser)
    df_iter = read_xml(filename, parser=parser, iterparse={"book": ["category"]})
    df_expected = DataFrame({"category": ["cooking", "children", "web"]})
    tm.assert_frame_equal(df_file, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_file_only_elems(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    df_file = read_xml(filename, elems_only=True, parser=parser)
    df_iter = read_xml(
        filename,
        parser=parser,
        iterparse={"book": ["title", "author", "year", "price"]},
    )
    df_expected = DataFrame(
        {
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(df_file, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_elem_and_attrs_only(datapath, parser):
    filename = datapath("io", "data", "xml", "cta_rail_lines.kml")
    with pytest.raises(
        ValueError,
        match=("Either element or attributes can be parsed not both"),
    ):
        read_xml(filename, elems_only=True, attrs_only=True, parser=parser)
def test_empty_attrs_only(parser):
    xml = """
<data>
<row>
<shape sides="4">square</shape>
<degrees>360</degrees>
</row>
<row>
<shape sides="0">circle</shape>
<degrees>360</degrees>
</row>
<row>
<shape sides="3">triangle</shape>
<degrees>180</degrees>
</row>
</data>"""
    with pytest.raises(
        ValueError,
        match=("xpath does not return any nodes or attributes"),
    ):
        read_xml(xml, xpath="./row", attrs_only=True, parser=parser)
def test_empty_elems_only(parser):
    xml = """
<data>
<row sides="4" shape="square" degrees="360"/>
<row sides="0" shape="circle" degrees="360"/>
<row sides="3" shape="triangle" degrees="180"/>
</data>"""
    with pytest.raises(
        ValueError,
        match=("xpath does not return any nodes or attributes"),
    ):
        read_xml(xml, xpath="./row", elems_only=True, parser=parser)
@td.skip_if_no("lxml")
def test_attribute_centric_xml():
    xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<TrainSchedule>
<Stations>
<station Name="Manhattan" coords="31,460,195,498"/>
<station Name="Laraway Road" coords="63,409,194,455"/>
<station Name="179th St (Orland Park)" coords="0,364,110,395"/>
<station Name="153rd St (Orland Park)" coords="7,333,113,362"/>
<station Name="143rd St (Orland Park)" coords="17,297,115,330"/>
<station Name="Palos Park" coords="128,281,239,303"/>
<station Name="Palos Heights" coords="148,257,283,279"/>
<station Name="Worth" coords="170,230,248,255"/>
<station Name="Chicago Ridge" coords="70,187,208,214"/>
<station Name="Oak Lawn" coords="166,159,266,185"/>
<station Name="Ashburn" coords="197,133,336,157"/>
<station Name="Wrightwood" coords="219,106,340,133"/>
<station Name="Chicago Union Sta" coords="220,0,360,43"/>
</Stations>
</TrainSchedule>"""
    df_lxml = read_xml(xml, xpath=".//station")
    df_etree = read_xml(xml, xpath=".//station", parser="etree")
    df_iter_lx = read_xml_iterparse(xml, iterparse={"station": ["Name", "coords"]})
    df_iter_et = read_xml_iterparse(
        xml, parser="etree", iterparse={"station": ["Name", "coords"]}
    )
    tm.assert_frame_equal(df_lxml, df_etree)
    tm.assert_frame_equal(df_iter_lx, df_iter_et)
# NAMES
def test_names_option_output(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    df_file = read_xml(
        filename, names=["Col1", "Col2", "Col3", "Col4", "Col5"], parser=parser
    )
    df_iter = read_xml(
        filename,
        parser=parser,
        names=["Col1", "Col2", "Col3", "Col4", "Col5"],
        iterparse={"book": ["category", "title", "author", "year", "price"]},
    )
    df_expected = DataFrame(
        {
            "Col1": ["cooking", "children", "web"],
            "Col2": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "Col3": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "Col4": [2005, 2005, 2003],
            "Col5": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(df_file, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_repeat_names(parser):
    xml = """\
<shapes>
<shape type="2D">
<name>circle</name>
<type>curved</type>
</shape>
<shape type="3D">
<name>sphere</name>
<type>curved</type>
</shape>
</shapes>"""
    df_xpath = read_xml(
        xml, xpath=".//shape", parser=parser, names=["type_dim", "shape", "type_edge"]
    )
    df_iter = read_xml_iterparse(
        xml,
        parser=parser,
        iterparse={"shape": ["type", "name", "type"]},
        names=["type_dim", "shape", "type_edge"],
    )
    df_expected = DataFrame(
        {
            "type_dim": ["2D", "3D"],
            "shape": ["circle", "sphere"],
            "type_edge": ["curved", "curved"],
        }
    )
    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_repeat_values_new_names(parser):
    xml = """\
<shapes>
<shape>
<name>rectangle</name>
<family>rectangle</family>
</shape>
<shape>
<name>square</name>
<family>rectangle</family>
</shape>
<shape>
<name>ellipse</name>
<family>ellipse</family>
</shape>
<shape>
<name>circle</name>
<family>ellipse</family>
</shape>
</shapes>"""
    df_xpath = read_xml(xml, xpath=".//shape", parser=parser, names=["name", "group"])
    df_iter = read_xml_iterparse(
        xml,
        parser=parser,
        iterparse={"shape": ["name", "family"]},
        names=["name", "group"],
    )
    df_expected = DataFrame(
        {
            "name": ["rectangle", "square", "ellipse", "circle"],
            "group": ["rectangle", "rectangle", "ellipse", "ellipse"],
        }
    )
    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_repeat_elements(parser):
    xml = """\
<shapes>
<shape>
<value item="name">circle</value>
<value item="family">ellipse</value>
<value item="degrees">360</value>
<value item="sides">0</value>
</shape>
<shape>
<value item="name">triangle</value>
<value item="family">polygon</value>
<value item="degrees">180</value>
<value item="sides">3</value>
</shape>
<shape>
<value item="name">square</value>
<value item="family">polygon</value>
<value item="degrees">360</value>
<value item="sides">4</value>
</shape>
</shapes>"""
    df_xpath = read_xml(
        xml,
        xpath=".//shape",
        parser=parser,
        names=["name", "family", "degrees", "sides"],
    )
    df_iter = read_xml_iterparse(
        xml,
        parser=parser,
        iterparse={"shape": ["value", "value", "value", "value"]},
        names=["name", "family", "degrees", "sides"],
    )
    df_expected = DataFrame(
        {
            "name": ["circle", "triangle", "square"],
            "family": ["ellipse", "polygon", "polygon"],
            "degrees": [360, 180, 360],
            "sides": [0, 3, 4],
        }
    )
    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_names_option_wrong_length(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(ValueError, match=("names does not match length")):
        read_xml(filename, names=["Col1", "Col2", "Col3"], parser=parser)
def test_names_option_wrong_type(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(TypeError, match=("is not a valid type for names")):
        read_xml(filename, names="Col1, Col2, Col3", parser=parser)
# ENCODING
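# baby_names.xml is encoded as ISO-8859-1, so reading it with the default
# utf-8 (or a wrong/unknown codec) should raise the errors asserted below.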
def test_wrong_encoding(datapath, parser):
    filename = datapath("io", "data", "xml", "baby_names.xml")
    with pytest.raises(UnicodeDecodeError, match=("'utf-8' codec can't decode")):
        read_xml(filename, parser=parser)
def test_utf16_encoding(datapath, parser):
    filename = datapath("io", "data", "xml", "baby_names.xml")
    with pytest.raises(
        UnicodeError,
        match=(
            "UTF-16 stream does not start with BOM|"
            "'utf-16-le' codec can't decode byte"
        ),
    ):
        read_xml(filename, encoding="UTF-16", parser=parser)
def test_unknown_encoding(datapath, parser):
    filename = datapath("io", "data", "xml", "baby_names.xml")
    with pytest.raises(LookupError, match=("unknown encoding: UFT-8")):
        read_xml(filename, encoding="UFT-8", parser=parser)
def test_ascii_encoding(datapath, parser):
    filename = datapath("io", "data", "xml", "baby_names.xml")
    with pytest.raises(UnicodeDecodeError, match=("'ascii' codec can't decode byte")):
        read_xml(filename, encoding="ascii", parser=parser)
@td.skip_if_no("lxml")
def test_parser_consistency_with_encoding(datapath):
    filename = datapath("io", "data", "xml", "baby_names.xml")
    df_xpath_lxml = read_xml(filename, parser="lxml", encoding="ISO-8859-1")
    df_xpath_etree = read_xml(filename, parser="etree", encoding="iso-8859-1")
    df_iter_lxml = read_xml(
        filename,
        parser="lxml",
        encoding="ISO-8859-1",
        iterparse={"row": ["rank", "malename", "femalename"]},
    )
    df_iter_etree = read_xml(
        filename,
        parser="etree",
        encoding="ISO-8859-1",
        iterparse={"row": ["rank", "malename", "femalename"]},
    )
    tm.assert_frame_equal(df_xpath_lxml, df_xpath_etree)
    tm.assert_frame_equal(df_xpath_etree, df_iter_etree)
    tm.assert_frame_equal(df_iter_lxml, df_iter_etree)
@td.skip_if_no("lxml")
def test_wrong_encoding_for_lxml():
    # GH#45133
    data = """<data>
<row>
<a>c</a>
</row>
</data>
"""
    with pytest.raises(TypeError, match="encoding None"):
        read_xml(StringIO(data), parser="lxml", encoding=None)
def test_none_encoding_etree():
    # GH#45133
    data = """<data>
<row>
<a>c</a>
</row>
</data>
"""
    result = read_xml(StringIO(data), parser="etree", encoding=None)
    expected = DataFrame({"a": ["c"]})
    tm.assert_frame_equal(result, expected)
# PARSER
@td.skip_if_installed("lxml")
def test_default_parser_no_lxml(datapath):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(
        ImportError, match=("lxml not found, please install or use the etree parser.")
    ):
        read_xml(filename)
def test_wrong_parser(datapath):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(
        ValueError, match=("Values for parser can only be lxml or etree.")
    ):
        read_xml(filename, parser="bs4")
# STYLESHEET
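# stylesheet is an lxml-only feature: the flatten_doc.xsl fixture flattens the
# nested KML so the result can be compared against df_kml, while passing
# stylesheet with parser="etree" raises ValueError (test_stylesheet_with_etree).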
@td.skip_if_no("lxml")
def test_stylesheet_file(datapath):
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")
    df_style = read_xml(
        kml,
        xpath=".//k:Placemark",
        namespaces={"k": "http://www.opengis.net/kml/2.2"},
        stylesheet=xsl,
    )
    df_iter = read_xml(
        kml,
        iterparse={
            "Placemark": [
                "id",
                "name",
                "styleUrl",
                "extrude",
                "altitudeMode",
                "coordinates",
            ]
        },
    )
    tm.assert_frame_equal(df_kml, df_style)
    tm.assert_frame_equal(df_kml, df_iter)
@td.skip_if_no("lxml")
def test_stylesheet_file_like(datapath, mode):
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")
    with open(xsl, mode) as f:
        df_style = read_xml(
            kml,
            xpath=".//k:Placemark",
            namespaces={"k": "http://www.opengis.net/kml/2.2"},
            stylesheet=f,
        )
    tm.assert_frame_equal(df_kml, df_style)
@td.skip_if_no("lxml")
def test_stylesheet_io(datapath, mode):
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")
    xsl_obj: BytesIO | StringIO
    with open(xsl, mode) as f:
        if mode == "rb":
            xsl_obj = BytesIO(f.read())
        else:
            xsl_obj = StringIO(f.read())
    df_style = read_xml(
        kml,
        xpath=".//k:Placemark",
        namespaces={"k": "http://www.opengis.net/kml/2.2"},
        stylesheet=xsl_obj,
    )
    tm.assert_frame_equal(df_kml, df_style)
@td.skip_if_no("lxml")
def test_stylesheet_buffered_reader(datapath, mode):
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")
    with open(xsl, mode) as f:
        xsl_obj = f.read()
    df_style = read_xml(
        kml,
        xpath=".//k:Placemark",
        namespaces={"k": "http://www.opengis.net/kml/2.2"},
        stylesheet=xsl_obj,
    )
    tm.assert_frame_equal(df_kml, df_style)
@td.skip_if_no("lxml")
def test_style_charset():
    xml = "<中文標籤><row><c1>1</c1><c2>2</c2></row></中文標籤>"
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output omit-xml-declaration="yes" indent="yes"/>
<xsl:strip-space elements="*"/>
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
<xsl:template match="中文標籤">
<根>
<xsl:apply-templates />
</根>
</xsl:template>
</xsl:stylesheet>"""
    df_orig = read_xml(xml)
    df_style = read_xml(xml, stylesheet=xsl)
    tm.assert_frame_equal(df_orig, df_style)
@td.skip_if_no("lxml")
def test_not_stylesheet(datapath):
    from lxml.etree import XSLTParseError
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(XSLTParseError, match=("document is not a stylesheet")):
        read_xml(kml, stylesheet=xsl)
@td.skip_if_no("lxml")
def test_incorrect_xsl_syntax(datapath):
    from lxml.etree import XMLSyntaxError
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:k="http://www.opengis.net/kml/2.2"/>
<xsl:output method="xml" omit-xml-declaration="yes"
cdata-section-elements="k:description" indent="yes"/>
<xsl:strip-space elements="*"/>
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
<xsl:template match="k:MultiGeometry|k:LineString">
<xsl:apply-templates select='*'/>
</xsl:template>
<xsl:template match="k:description|k:Snippet|k:Style"/>
</xsl:stylesheet>"""
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    with pytest.raises(
        XMLSyntaxError, match=("Extra content at the end of the document")
    ):
        read_xml(kml, stylesheet=xsl)
@td.skip_if_no("lxml")
def test_incorrect_xsl_eval(datapath):
    from lxml.etree import XSLTParseError
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:k="http://www.opengis.net/kml/2.2">
<xsl:output method="xml" omit-xml-declaration="yes"
cdata-section-elements="k:description" indent="yes"/>
<xsl:strip-space elements="*"/>
<xsl:template match="node(*)|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>
<xsl:template match="k:MultiGeometry|k:LineString">
<xsl:apply-templates select='*'/>
</xsl:template>
<xsl:template match="k:description|k:Snippet|k:Style"/>
</xsl:stylesheet>"""
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    with pytest.raises(XSLTParseError, match=("failed to compile")):
        read_xml(kml, stylesheet=xsl)
@td.skip_if_no("lxml")
def test_incorrect_xsl_apply(datapath):
    from lxml.etree import XSLTApplyError
    xsl = """\
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" encoding="utf-8" indent="yes" />
<xsl:strip-space elements="*"/>
<xsl:template match="@*|node()">
<xsl:copy>
<xsl:copy-of select="document('non_existent.xml')/*"/>
</xsl:copy>
</xsl:template>
</xsl:stylesheet>"""
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    with pytest.raises(XSLTApplyError, match=("Cannot resolve URI")):
        read_xml(kml, stylesheet=xsl)
@td.skip_if_no("lxml")
def test_wrong_stylesheet():
    from lxml.etree import XMLSyntaxError
    kml = os.path.join("data", "xml", "cta_rail_lines.kml")
    xsl = os.path.join("data", "xml", "flatten.xsl")
    with pytest.raises(
        XMLSyntaxError,
        match=("Start tag expected, '<' not found"),
    ):
        read_xml(kml, stylesheet=xsl)
@td.skip_if_no("lxml")
def test_stylesheet_file_close(datapath, mode):
    kml = datapath("io", "data", "xml", "cta_rail_lines.kml")
    xsl = datapath("io", "data", "xml", "flatten_doc.xsl")
    xsl_obj: BytesIO | StringIO
    with open(xsl, mode) as f:
        if mode == "rb":
            xsl_obj = BytesIO(f.read())
        else:
            xsl_obj = StringIO(f.read())
        read_xml(kml, stylesheet=xsl_obj)
        assert not f.closed
@td.skip_if_no("lxml")
def test_stylesheet_with_etree():
    kml = os.path.join("data", "xml", "cta_rail_lines.kml")
    xsl = os.path.join("data", "xml", "flatten_doc.xsl")
    with pytest.raises(
        ValueError, match=("To use stylesheet, you need lxml installed")
    ):
        read_xml(kml, parser="etree", stylesheet=xsl)
@td.skip_if_no("lxml")
@pytest.mark.parametrize("val", ["", b""])
def test_empty_stylesheet(val):
    from lxml.etree import XMLSyntaxError
    kml = os.path.join("data", "xml", "cta_rail_lines.kml")
    with pytest.raises(
        XMLSyntaxError, match=("Document is empty|Start tag expected, '<' not found")
    ):
        read_xml(kml, stylesheet=val)
# ITERPARSE
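# iterparse rejects XML given as a literal string, a URL, or a compressed
# file ("iterparse is designed for large XML files"); lxml additionally
# requires file-like input to be opened in binary mode.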
def test_string_error(parser):
    with pytest.raises(
        ParserError, match=("iterparse is designed for large XML files")
    ):
        read_xml(
            xml_default_nmsp,
            parser=parser,
            iterparse={"row": ["shape", "degrees", "sides", "date"]},
        )
def test_file_like_iterparse(datapath, parser, mode):
    filename = datapath("io", "data", "xml", "books.xml")
    with open(filename, mode) as f:
        if mode == "r" and parser == "lxml":
            with pytest.raises(
                TypeError, match=("reading file objects must return bytes objects")
            ):
                read_xml(
                    f,
                    parser=parser,
                    iterparse={
                        "book": ["category", "title", "year", "author", "price"]
                    },
                )
            return None
        else:
            df_filelike = read_xml(
                f,
                parser=parser,
                iterparse={"book": ["category", "title", "year", "author", "price"]},
            )
    df_expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(df_filelike, df_expected)
def test_file_io_iterparse(datapath, parser, mode):
    filename = datapath("io", "data", "xml", "books.xml")
    funcIO = StringIO if mode == "r" else BytesIO
    with open(filename, mode) as f:
        with funcIO(f.read()) as b:
            if mode == "r" and parser == "lxml":
                with pytest.raises(
                    TypeError, match=("reading file objects must return bytes objects")
                ):
                    read_xml(
                        b,
                        parser=parser,
                        iterparse={
                            "book": ["category", "title", "year", "author", "price"]
                        },
                    )
                return None
            else:
                df_fileio = read_xml(
                    b,
                    parser=parser,
                    iterparse={
                        "book": ["category", "title", "year", "author", "price"]
                    },
                )
    df_expected = DataFrame(
        {
            "category": ["cooking", "children", "web"],
            "title": ["Everyday Italian", "Harry Potter", "Learning XML"],
            "author": ["Giada De Laurentiis", "J K. Rowling", "Erik T. Ray"],
            "year": [2005, 2005, 2003],
            "price": [30.00, 29.99, 39.95],
        }
    )
    tm.assert_frame_equal(df_fileio, df_expected)
@pytest.mark.network
@tm.network(url="https://www.w3schools.com/xml/books.xml", check_before_test=True)
def test_url_path_error(parser):
    url = "https://www.w3schools.com/xml/books.xml"
    with pytest.raises(
        ParserError, match=("iterparse is designed for large XML files")
    ):
        read_xml(
            url,
            parser=parser,
            iterparse={"row": ["shape", "degrees", "sides", "date"]},
        )
def test_compression_error(parser, compression_only):
    with tm.ensure_clean(filename="geom_xml.zip") as path:
        geom_df.to_xml(path, parser=parser, compression=compression_only)
        with pytest.raises(
            ParserError, match=("iterparse is designed for large XML files")
        ):
            read_xml(
                path,
                parser=parser,
                iterparse={"row": ["shape", "degrees", "sides", "date"]},
                compression=compression_only,
            )
def test_wrong_dict_type(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(TypeError, match="list is not a valid type for iterparse"):
        read_xml(
            filename,
            parser=parser,
            iterparse=["category", "title", "year", "author", "price"],
        )
def test_wrong_dict_value(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(
        TypeError, match="<class 'str'> is not a valid type for value in iterparse"
    ):
        read_xml(filename, parser=parser, iterparse={"book": "category"})
def test_bad_xml(parser):
    bad_xml = """\
<?xml version='1.0' encoding='utf-8'?>
<row>
<shape>square</shape>
<degrees>00360</degrees>
<sides>4.0</sides>
<date>2020-01-01</date>
</row>
<row>
<shape>circle</shape>
<degrees>00360</degrees>
<sides/>
<date>2021-01-01</date>
</row>
<row>
<shape>triangle</shape>
<degrees>00180</degrees>
<sides>3.0</sides>
<date>2022-01-01</date>
</row>
"""
    with tm.ensure_clean(filename="bad.xml") as path:
        with open(path, "w") as f:
            f.write(bad_xml)
        with pytest.raises(
            SyntaxError,
            match=(
                "Extra content at the end of the document|"
                "junk after document element"
            ),
        ):
            read_xml(
                path,
                parser=parser,
                parse_dates=["date"],
                iterparse={"row": ["shape", "degrees", "sides", "date"]},
            )
def test_comment(parser):
    xml = """\
<!-- comment before root -->
<shapes>
<!-- comment within root -->
<shape>
<name>circle</name>
<type>2D</type>
</shape>
<shape>
<name>sphere</name>
<type>3D</type>
<!-- comment within child -->
</shape>
<!-- comment within root -->
</shapes>
<!-- comment after root -->"""
    df_xpath = read_xml(xml, xpath=".//shape", parser=parser)
    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )
    df_expected = DataFrame(
        {
            "name": ["circle", "sphere"],
            "type": ["2D", "3D"],
        }
    )
    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_dtd(parser):
    xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE non-profits [
<!ELEMENT shapes (shape*) >
<!ELEMENT shape ( name, type )>
<!ELEMENT name (#PCDATA)>
]>
<shapes>
<shape>
<name>circle</name>
<type>2D</type>
</shape>
<shape>
<name>sphere</name>
<type>3D</type>
</shape>
</shapes>"""
    df_xpath = read_xml(xml, xpath=".//shape", parser=parser)
    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )
    df_expected = DataFrame(
        {
            "name": ["circle", "sphere"],
            "type": ["2D", "3D"],
        }
    )
    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_processing_instruction(parser):
    xml = """\
<?xml version="1.0" encoding="UTF-8"?>
<?xml-stylesheet type="text/xsl" href="style.xsl"?>
<?display table-view?>
<?sort alpha-ascending?>
<?textinfo whitespace is allowed ?>
<?elementnames <shape>, <name>, <type> ?>
<shapes>
<shape>
<name>circle</name>
<type>2D</type>
</shape>
<shape>
<name>sphere</name>
<type>3D</type>
</shape>
</shapes>"""
    df_xpath = read_xml(xml, xpath=".//shape", parser=parser)
    df_iter = read_xml_iterparse(
        xml, parser=parser, iterparse={"shape": ["name", "type"]}
    )
    df_expected = DataFrame(
        {
            "name": ["circle", "sphere"],
            "type": ["2D", "3D"],
        }
    )
    tm.assert_frame_equal(df_xpath, df_expected)
    tm.assert_frame_equal(df_iter, df_expected)
def test_no_result(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(
        ParserError, match="No result from selected items in iterparse."
    ):
        read_xml(
            filename,
            parser=parser,
            iterparse={"node": ["attr1", "elem1", "elem2", "elem3"]},
        )
def test_empty_data(datapath, parser):
    filename = datapath("io", "data", "xml", "books.xml")
    with pytest.raises(EmptyDataError, match="No columns to parse from file"):
        read_xml(
            filename,
            parser=parser,
            iterparse={"book": ["attr1", "elem1", "elem2", "elem3"]},
        )
@pytest.mark.network
@td.skip_if_no("lxml")
@tm.network(
    url="https://www.w3schools.com/xml/cdcatalog_with_xsl.xml", check_before_test=True
)
def test_online_stylesheet():
    xml = "https://www.w3schools.com/xml/cdcatalog_with_xsl.xml"
    xsl = "https://www.w3schools.com/xml/cdcatalog.xsl"
    df_xsl = read_xml(
        xml,
        xpath=".//tr[td and position() <= 6]",
        names=["title", "artist"],
        stylesheet=xsl,
    )
    df_expected = DataFrame(
        {
            "title": {
                0: "Empire Burlesque",
                1: "Hide your heart",
                2: "Greatest Hits",
                3: "Still got the blues",
                4: "Eros",
            },
            "artist": {
                0: "Bob Dylan",
                1: "Bonnie Tyler",
                2: "Dolly Parton",
                3: "Gary Moore",
                4: "Eros Ramazzotti",
            },
        }
    )
    tm.assert_frame_equal(df_expected, df_xsl)
# COMPRESSION
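# Round-trips geom_df through to_xml/read_xml for every codec provided by the
# compression_only fixture.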
def test_compression_read(parser, compression_only):
    with tm.ensure_clean() as comp_path:
        geom_df.to_xml(
            comp_path, index=False, parser=parser, compression=compression_only
        )
        df_xpath = read_xml(comp_path, parser=parser, compression=compression_only)
        df_iter = read_xml_iterparse_comp(
            comp_path,
            compression_only,
            parser=parser,
            iterparse={"row": ["shape", "degrees", "sides"]},
            compression=compression_only,
        )
    tm.assert_frame_equal(df_xpath, geom_df)
    tm.assert_frame_equal(df_iter, geom_df)
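# Mismatched codecs: writing with one compression and reading with another
# should surface the decompressor's own exception type and message.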
def test_wrong_compression(parser, compression, compression_only):
    actual_compression = compression
    attempted_compression = compression_only
    if actual_compression == attempted_compression:
        return
    errors = {
        "bz2": (OSError, "Invalid data stream"),
        "gzip": (OSError, "Not a gzipped file"),
        "zip": (BadZipFile, "File is not a zip file"),
        "tar": (ReadError, "file could not be opened successfully"),
    }
    zstd = import_optional_dependency("zstandard", errors="ignore")
    if zstd is not None:
        errors["zstd"] = (zstd.ZstdError, "Unknown frame descriptor")
    lzma = import_optional_dependency("lzma", errors="ignore")
    if lzma is not None:
        errors["xz"] = (LZMAError, "Input format not supported by decoder")
    error_cls, error_str = errors[attempted_compression]
    with tm.ensure_clean() as path:
        geom_df.to_xml(path, parser=parser, compression=actual_compression)
        with pytest.raises(error_cls, match=error_str):
            read_xml(path, parser=parser, compression=attempted_compression)
def test_unsupported_compression(parser):
    with pytest.raises(ValueError, match="Unrecognized compression type"):
        with tm.ensure_clean() as path:
            read_xml(path, parser=parser, compression="7z")
# STORAGE OPTIONS
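# storage_options (here anon=True for a public S3 bucket) are passed through
# to the s3fs/fsspec layer that read_xml uses for s3:// paths.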
@pytest.mark.network
@td.skip_if_no("s3fs")
@td.skip_if_no("lxml")
@pytest.mark.skipif(
    is_ci_environment(),
    reason="2022.1.17: Hanging on the CI min versions build.",
)
@tm.network
def test_s3_parser_consistency():
    # Python Software Foundation (2019 IRS-990 RETURN)
    s3 = "s3://irs-form-990/201923199349319487_public.xml"
    df_lxml = read_xml(
        s3,
        xpath=".//irs:Form990PartVIISectionAGrp",
        namespaces={"irs": "http://www.irs.gov/efile"},
        parser="lxml",
        storage_options={"anon": True},
    )
    df_etree = read_xml(
        s3,
        xpath=".//irs:Form990PartVIISectionAGrp",
        namespaces={"irs": "http://www.irs.gov/efile"},
        parser="etree",
        storage_options={"anon": True},
    )
    tm.assert_frame_equal(df_lxml, df_etree)
def test_read_xml_nullable_dtypes(parser, string_storage, dtype_backend):
    # GH#50500
    data = """<?xml version='1.0' encoding='utf-8'?>
<data xmlns="http://example.com">
<row>
<a>x</a>
<b>1</b>
<c>4.0</c>
<d>x</d>
<e>2</e>
<f>4.0</f>
<g></g>
<h>True</h>
<i>False</i>
</row>
<row>
<a>y</a>
<b>2</b>
<c>5.0</c>
<d></d>
<e></e>
<f></f>
<g></g>
<h>False</h>
<i></i>
</row>
</data>"""
    if string_storage == "python":
        string_array = StringArray(np.array(["x", "y"], dtype=np.object_))
        string_array_na = StringArray(np.array(["x", NA], dtype=np.object_))
    else:
        pa = pytest.importorskip("pyarrow")
        string_array = ArrowStringArray(pa.array(["x", "y"]))
        string_array_na = ArrowStringArray(pa.array(["x", None]))
    with pd.option_context("mode.string_storage", string_storage):
        result = read_xml(data, parser=parser, dtype_backend=dtype_backend)
    expected = DataFrame(
        {
            "a": string_array,
            "b": Series([1, 2], dtype="Int64"),
            "c": Series([4.0, 5.0], dtype="Float64"),
            "d": string_array_na,
            "e": Series([2, NA], dtype="Int64"),
            "f": Series([4.0, NA], dtype="Float64"),
            "g": Series([NA, NA], dtype="Int64"),
            "h": Series([True, False], dtype="boolean"),
            "i": Series([False, NA], dtype="boolean"),
        }
    )
    if dtype_backend == "pyarrow":
        pa = pytest.importorskip("pyarrow")
        from pandas.arrays import ArrowExtensionArray
        expected = DataFrame(
            {
                col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
                for col in expected.columns
            }
        )
        expected["g"] = ArrowExtensionArray(pa.array([None, None]))
    tm.assert_frame_equal(result, expected)
def test_invalid_dtype_backend():
    msg = (
        "dtype_backend numpy is invalid, only 'numpy_nullable' and "
        "'pyarrow' are allowed."
    )
    with pytest.raises(ValueError, match=msg):
        read_xml("test", dtype_backend="numpy")