xmlparser.h 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. *
  6. * Copyright (C) 2004-2005, International Business Machines
  7. * Corporation and others. All Rights Reserved.
  8. *
  9. *******************************************************************************
  10. * file name: xmlparser.h
  11. * encoding: UTF-8
  12. * tab size: 8 (not used)
  13. * indentation:4
  14. *
  15. * created on: 2004jul21
  16. * created by: Andy Heninger
  17. *
  18. * Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
  19. * Not suitable for production use. Not supported.
  20. * Not conformant. Not efficient.
  21. * But very small.
  22. */
  23. #ifndef __XMLPARSER_H__
  24. #define __XMLPARSER_H__
  25. #include "unicode/uobject.h"
  26. #include "unicode/unistr.h"
  27. #include "unicode/regex.h"
  28. #include "uvector.h"
  29. #include "hash.h"
  30. #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
  31. enum UXMLNodeType { // Kind of a child node; reported via the "type" output of UXMLElement::getChild().
  32. /** Node type string (text contents), stored as a UnicodeString. */
  33. UXML_NODE_TYPE_STRING,
  34. /** Node type element, stored as a UXMLElement. */
  35. UXML_NODE_TYPE_ELEMENT,
      /** Evaluates to 2, the number of node types declared above; presumably a count/limit value rather than a real node type. */
  36. UXML_NODE_TYPE_COUNT
  37. };
  38. U_NAMESPACE_BEGIN
  39. class UXMLParser;
  40. /**
  41. * This class represents an element node in a parsed XML tree.
      *
      * All constructors are private and UXMLParser is a friend, so client code
      * cannot construct or copy elements itself; it receives them from
      * UXMLParser::parse() / UXMLParser::parseFile() and navigates the tree
      * through the const accessors below.
  42. */
  43. class U_TOOLUTIL_API UXMLElement : public UObject {
  44. public:
  45. /**
  46. * Destructor.
  47. */
  48. virtual ~UXMLElement();
  49. /**
  50. * Get the tag name of this element.
      * @return A reference to the tag name string.
  51. */
  52. const UnicodeString &getTagName() const;
  53. /**
  54. * Get the text contents of the element.
  55. * Append the contents of all text child nodes.
  56. * @param recurse If TRUE, also recursively appends the contents of all
  57. * text child nodes of element children.
  58. * @return The text contents.
  59. */
  60. UnicodeString getText(UBool recurse) const;
  61. /**
  62. * Get the number of attributes.
      * @return The attribute count.
  63. */
  64. int32_t countAttributes() const;
  65. /**
  66. * Get the i-th attribute.
  67. * @param i Index of the attribute.
  68. * @param name Output parameter, receives the attribute name.
  69. * @param value Output parameter, receives the attribute value.
  70. * @return A pointer to the attribute value (may be &value or a pointer to an
  71. * internal string object), or NULL if i is out of bounds.
  72. */
  73. const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
  74. /**
  75. * Get the value of the attribute with the given name.
  76. * @param name Attribute name to be looked up.
  77. * @return A pointer to the attribute value, or NULL if this element
  78. * does not have this attribute.
  79. */
  80. const UnicodeString *getAttribute(const UnicodeString &name) const;
  81. /**
  82. * Get the number of child nodes.
      * @return The child node count.
  83. */
  84. int32_t countChildren() const;
  85. /**
  86. * Get the i-th child node.
  87. * @param i Index of the child node.
  88. * @param type The child node type.
      * Output parameter (passed by non-const reference); per UXMLNodeType,
      * a string node is stored as a UnicodeString and an element node
      * as a UXMLElement.
  89. * @return A pointer to the child node object, or NULL if i is out of bounds.
  90. */
  91. const UObject *getChild(int32_t i, UXMLNodeType &type) const;
  92. /**
  93. * Get the next child element node, skipping non-element child nodes.
  94. * @param i Enumeration index; initialize to 0 before getting the first child element.
      * Passed by non-const reference, so the call can update it;
      * pass the same variable again to continue the enumeration.
  95. * @return A pointer to the next child element, or NULL if there is none.
  96. */
  97. const UXMLElement *nextChildElement(int32_t &i) const;
  98. /**
  99. * Get the immediate child element with the given name.
  100. * If there are multiple child elements with this name, then return
  101. * the first one.
  102. * @param name Element name to be looked up.
  103. * @return A pointer to the element node, or NULL if this element
  104. * does not have this immediate child element.
  105. */
  106. const UXMLElement *getChildElement(const UnicodeString &name) const;
  107. /**
  108. * ICU "poor man's RTTI", returns a UClassID for the actual class.
  109. */
  110. virtual UClassID getDynamicClassID() const;
  111. /**
  112. * ICU "poor man's RTTI", returns a UClassID for this class.
  113. */
  114. static UClassID U_EXPORT2 getStaticClassID();
  115. private:
  116. // prevent default construction etc.
       // (default constructor, copy constructor and copy assignment are all private)
  117. UXMLElement();
  118. UXMLElement(const UXMLElement &other);
  119. UXMLElement &operator=(const UXMLElement &other);
       // Text-collection helper that writes into the caller-supplied string.
       // NOTE(review): presumably the worker behind getText() -- confirm in the implementation file.
  120. void appendText(UnicodeString &text, UBool recurse) const;
       // UXMLParser is a friend so that it can call the private constructor below.
  121. friend class UXMLParser;
       // The only constructor that takes arguments: the parser, the tag name, and an ICU error code.
  122. UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
  123. const UXMLParser *fParser; // The parser passed to the constructor -- presumably; confirm in the implementation file.
  124. const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser)
  125. UnicodeString fContent; // The text content of this node. All element content is
  126. // concatenated even when there are intervening nested elements
  127. // (which doesn't happen with most xml files we care about)
  128. // Sections of content containing only white space are dropped,
  129. // which gets rid of the bogus white space content from
  130. // elements which are primarily containers for nested elements.
  131. UVector fAttNames; // A vector containing the names of this element's attributes
  132. // The names are UnicodeString objects, owned by the UXMLParser.
  133. UVector fAttValues; // A vector containing the attribute values for
  134. // this element's attributes. The order is the same
  135. // as that of the attribute name vector.
  136. UVector fChildren; // The child nodes of this element (a Vector)
  137. UXMLElement *fParent; // A pointer to the parent element of this element.
  138. };
  139. /**
  140. * A simple XML parser; it is neither efficient nor conformant and only useful for
  141. * restricted types of XML documents.
  142. *
  143. * The parse methods parse whole documents and return the parse trees via their
  144. * root elements.
       *
       * All constructors are private; instances are obtained through the static
       * createParser() factory.
  145. */
  146. class U_TOOLUTIL_API UXMLParser : public UObject {
  147. public:
  148. /**
  149. * Create an XML parser.
       * @param errorCode ICU error code, passed by reference so the call can report failure.
       * @return A pointer to the new parser.
       * NOTE(review): presumably NULL on failure and owned by the caller -- confirm
       * in the implementation file.
  150. */
  151. static UXMLParser *createParser(UErrorCode &errorCode);
  152. /**
  153. * Destructor.
  154. */
  155. virtual ~UXMLParser();
  156. /**
  157. * Parse an XML document, create the entire document tree, and
  158. * return a pointer to the root element of the parsed tree.
  159. * The caller must delete the element.
       * @param src The XML document text.
       * @param errorCode ICU error code, passed by reference so the call can report failure.
       * @return The root element of the parsed tree.
  160. */
  161. UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
  162. /**
  163. * Parse an XML file, create the entire document tree, and
  164. * return a pointer to the root element of the parsed tree.
  165. * The caller must delete the element.
       * @param filename Name of the XML file to parse.
       * @param errorCode ICU error code, passed by reference so the call can report failure.
       * @return The root element of the parsed tree.
  166. */
  167. UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
  168. /**
  169. * ICU "poor man's RTTI", returns a UClassID for the actual class.
  170. */
  171. virtual UClassID getDynamicClassID() const;
  172. /**
  173. * ICU "poor man's RTTI", returns a UClassID for this class.
  174. */
  175. static UClassID U_EXPORT2 getStaticClassID();
  176. private:
  177. // prevent default construction etc.
       // (default constructor, copy constructor and copy assignment are all private)
  178. UXMLParser();
  179. UXMLParser(const UXMLParser &other);
  180. UXMLParser &operator=(const UXMLParser &other);
  181. // constructor
       // (private; NOTE(review): presumably the one createParser() uses -- confirm)
  182. UXMLParser(UErrorCode &status);
       // Private parsing helpers. Their bodies are not visible in this header;
       // the notes below are inferred from the signatures and should be
       // confirmed against the implementation file.
  183. void parseMisc(UErrorCode &status);
  184. UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status); // presumably builds an element from a matched start/empty tag
  185. void error(const char *message, UErrorCode &status); // presumably reports a parse error and sets status
  186. UnicodeString scanContent(UErrorCode &status);
  187. void replaceCharRefs(UnicodeString &s, UErrorCode &status); // modifies s in place (non-const reference)
  188. const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode); // presumably returns the interned copy kept in fNames
  189. public:
  190. // public for UXMLElement only
  191. const UnicodeString *findName(const UnicodeString &s) const;
  192. private:
  193. // There is one ICU regex matcher for each of the major XML syntax items
  194. // that are recognized.
  195. RegexMatcher mXMLDecl;
  196. RegexMatcher mXMLComment;
  197. RegexMatcher mXMLSP;
  198. RegexMatcher mXMLDoctype;
  199. RegexMatcher mXMLPI;
  200. RegexMatcher mXMLElemStart;
  201. RegexMatcher mXMLElemEnd;
  202. RegexMatcher mXMLElemEmpty;
  203. RegexMatcher mXMLCharData;
  204. RegexMatcher mAttrValue;
  205. RegexMatcher mAttrNormalizer;
  206. RegexMatcher mNewLineNormalizer;
  207. RegexMatcher mAmps;
  208. Hashtable fNames; // interned element/attribute name strings
  209. UStack fElementStack; // Stack holds the parent elements when nested
  210. // elements are being parsed. All items on this
  211. // stack are of type UXMLElement.
  212. int32_t fPos; // String index of the current scan position in
  213. // xml source (in fSrc).
       // NOTE(review): no fSrc member is declared in this class, so the
       // reference above looks stale -- confirm what fPos indexes into.
  214. UnicodeString fOneLF; // NOTE(review): presumably a string holding a single line feed -- confirm.
  215. };
  216. U_NAMESPACE_END
  217. #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
  218. #endif