123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /*
- *******************************************************************************
- *
- * Copyright (C) 2004-2005, International Business Machines
- * Corporation and others. All Rights Reserved.
- *
- *******************************************************************************
- * file name: xmlparser.h
- * encoding: UTF-8
- * tab size: 8 (not used)
- * indentation:4
- *
- * created on: 2004jul21
- * created by: Andy Heninger
- *
- * Tiny XML parser using ICU and intended for use in ICU tests and in build tools.
- * Not suitable for production use. Not supported.
- * Not conformant. Not efficient.
- * But very small.
- */
- #ifndef __XMLPARSER_H__
- #define __XMLPARSER_H__
- #include "unicode/uobject.h"
- #include "unicode/unistr.h"
- #include "unicode/regex.h"
- #include "uvector.h"
- #include "hash.h"
- #if !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION
/**
 * Type of a child node in a parsed XML tree, as reported through the
 * output parameter of UXMLElement::getChild().
 */
enum UXMLNodeType {
    /** Node type string (text contents), stored as a UnicodeString. */
    UXML_NODE_TYPE_STRING,
    /** Node type element, stored as a UXMLElement. */
    UXML_NODE_TYPE_ELEMENT,
    /** Number of node types defined above; not itself a node type. */
    UXML_NODE_TYPE_COUNT
};
- U_NAMESPACE_BEGIN
- class UXMLParser;
- /**
- * This class represents an element node in a parsed XML tree.
- */
- class U_TOOLUTIL_API UXMLElement : public UObject {
- public:
- /**
- * Destructor.
- */
- virtual ~UXMLElement();
- /**
- * Get the tag name of this element.
- */
- const UnicodeString &getTagName() const;
- /**
- * Get the text contents of the element.
- * Append the contents of all text child nodes.
- * @param recurse If TRUE, also recursively appends the contents of all
- * text child nodes of element children.
- * @return The text contents.
- */
- UnicodeString getText(UBool recurse) const;
- /**
- * Get the number of attributes.
- */
- int32_t countAttributes() const;
- /**
- * Get the i-th attribute.
- * @param i Index of the attribute.
- * @param name Output parameter, receives the attribute name.
- * @param value Output parameter, receives the attribute value.
- * @return A pointer to the attribute value (may be &value or a pointer to an
- * internal string object), or NULL if i is out of bounds.
- */
- const UnicodeString *getAttribute(int32_t i, UnicodeString &name, UnicodeString &value) const;
- /**
- * Get the value of the attribute with the given name.
- * @param name Attribute name to be looked up.
- * @return A pointer to the attribute value, or NULL if this element
- * does not have this attribute.
- */
- const UnicodeString *getAttribute(const UnicodeString &name) const;
- /**
- * Get the number of child nodes.
- */
- int32_t countChildren() const;
- /**
- * Get the i-th child node.
- * @param i Index of the child node.
- * @param type The child node type.
- * @return A pointer to the child node object, or NULL if i is out of bounds.
- */
- const UObject *getChild(int32_t i, UXMLNodeType &type) const;
- /**
- * Get the next child element node, skipping non-element child nodes.
- * @param i Enumeration index; initialize to 0 before getting the first child element.
- * @return A pointer to the next child element, or NULL if there is none.
- */
- const UXMLElement *nextChildElement(int32_t &i) const;
- /**
- * Get the immediate child element with the given name.
- * If there are multiple child elements with this name, then return
- * the first one.
- * @param name Element name to be looked up.
- * @return A pointer to the element node, or NULL if this element
- * does not have this immediate child element.
- */
- const UXMLElement *getChildElement(const UnicodeString &name) const;
- /**
- * ICU "poor man's RTTI", returns a UClassID for the actual class.
- */
- virtual UClassID getDynamicClassID() const;
- /**
- * ICU "poor man's RTTI", returns a UClassID for this class.
- */
- static UClassID U_EXPORT2 getStaticClassID();
- private:
- // prevent default construction etc.
- UXMLElement();
- UXMLElement(const UXMLElement &other);
- UXMLElement &operator=(const UXMLElement &other);
- void appendText(UnicodeString &text, UBool recurse) const;
- friend class UXMLParser;
- UXMLElement(const UXMLParser *parser, const UnicodeString *name, UErrorCode &errorCode);
- const UXMLParser *fParser;
- const UnicodeString *fName; // The tag name of this element (owned by the UXMLParser)
- UnicodeString fContent; // The text content of this node. All element content is
- // concatenated even when there are intervening nested elements
- // (which doesn't happen with most xml files we care about)
- // Sections of content containing only white space are dropped,
- // which gets rid the bogus white space content from
- // elements which are primarily containers for nested elements.
- UVector fAttNames; // A vector containing the names of this element's attributes
- // The names are UnicodeString objects, owned by the UXMLParser.
- UVector fAttValues; // A vector containing the attribute values for
- // this element's attributes. The order is the same
- // as that of the attribute name vector.
- UVector fChildren; // The child nodes of this element (a Vector)
- UXMLElement *fParent; // A pointer to the parent element of this element.
- };
- /**
- * A simple XML parser; it is neither efficient nor conformant and only useful for
- * restricted types of XML documents.
- *
- * The parse methods parse whole documents and return the parse trees via their
- * root elements.
- */
- class U_TOOLUTIL_API UXMLParser : public UObject {
- public:
- /**
- * Create an XML parser.
- */
- static UXMLParser *createParser(UErrorCode &errorCode);
- /**
- * Destructor.
- */
- virtual ~UXMLParser();
- /**
- * Parse an XML document, create the entire document tree, and
- * return a pointer to the root element of the parsed tree.
- * The caller must delete the element.
- */
- UXMLElement *parse(const UnicodeString &src, UErrorCode &errorCode);
- /**
- * Parse an XML file, create the entire document tree, and
- * return a pointer to the root element of the parsed tree.
- * The caller must delete the element.
- */
- UXMLElement *parseFile(const char *filename, UErrorCode &errorCode);
- /**
- * ICU "poor man's RTTI", returns a UClassID for the actual class.
- */
- virtual UClassID getDynamicClassID() const;
- /**
- * ICU "poor man's RTTI", returns a UClassID for this class.
- */
- static UClassID U_EXPORT2 getStaticClassID();
- private:
- // prevent default construction etc.
- UXMLParser();
- UXMLParser(const UXMLParser &other);
- UXMLParser &operator=(const UXMLParser &other);
- // constructor
- UXMLParser(UErrorCode &status);
- void parseMisc(UErrorCode &status);
- UXMLElement *createElement(RegexMatcher &mEl, UErrorCode &status);
- void error(const char *message, UErrorCode &status);
- UnicodeString scanContent(UErrorCode &status);
- void replaceCharRefs(UnicodeString &s, UErrorCode &status);
- const UnicodeString *intern(const UnicodeString &s, UErrorCode &errorCode);
- public:
- // public for UXMLElement only
- const UnicodeString *findName(const UnicodeString &s) const;
- private:
- // There is one ICU regex matcher for each of the major XML syntax items
- // that are recognized.
- RegexMatcher mXMLDecl;
- RegexMatcher mXMLComment;
- RegexMatcher mXMLSP;
- RegexMatcher mXMLDoctype;
- RegexMatcher mXMLPI;
- RegexMatcher mXMLElemStart;
- RegexMatcher mXMLElemEnd;
- RegexMatcher mXMLElemEmpty;
- RegexMatcher mXMLCharData;
- RegexMatcher mAttrValue;
- RegexMatcher mAttrNormalizer;
- RegexMatcher mNewLineNormalizer;
- RegexMatcher mAmps;
- Hashtable fNames; // interned element/attribute name strings
- UStack fElementStack; // Stack holds the parent elements when nested
- // elements are being parsed. All items on this
- // stack are of type UXMLElement.
- int32_t fPos; // String index of the current scan position in
- // xml source (in fSrc).
- UnicodeString fOneLF;
- };
- U_NAMESPACE_END
- #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS && !UCONFIG_NO_CONVERSION */
- #endif /* __XMLPARSER_H__ */
|