collationruleparser.h 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /*
  4. *******************************************************************************
  5. * Copyright (C) 2013-2014, International Business Machines
  6. * Corporation and others. All Rights Reserved.
  7. *******************************************************************************
  8. * collationruleparser.h
  9. *
  10. * created on: 2013apr10
  11. * created by: Markus W. Scherer
  12. */
  13. #ifndef __COLLATIONRULEPARSER_H__
  14. #define __COLLATIONRULEPARSER_H__
  15. #include "unicode/utypes.h"
  16. #if !UCONFIG_NO_COLLATION
  17. #include "unicode/ucol.h"
  18. #include "unicode/uniset.h"
  19. #include "unicode/unistr.h"
  20. struct UParseError;
  21. U_NAMESPACE_BEGIN
  22. struct CollationData;
  23. struct CollationTailoring;
  24. class Locale;
  25. class Normalizer2;
  26. struct CollationSettings;
  27. class U_I18N_API CollationRuleParser : public UMemory {
  28. public:
  29. /** Special reset positions. */
  30. enum Position {
  31. FIRST_TERTIARY_IGNORABLE,
  32. LAST_TERTIARY_IGNORABLE,
  33. FIRST_SECONDARY_IGNORABLE,
  34. LAST_SECONDARY_IGNORABLE,
  35. FIRST_PRIMARY_IGNORABLE,
  36. LAST_PRIMARY_IGNORABLE,
  37. FIRST_VARIABLE,
  38. LAST_VARIABLE,
  39. FIRST_REGULAR,
  40. LAST_REGULAR,
  41. FIRST_IMPLICIT,
  42. LAST_IMPLICIT,
  43. FIRST_TRAILING,
  44. LAST_TRAILING
  45. };
  46. /**
  47. * First character of contractions that encode special reset positions.
  48. * U+FFFE cannot be tailored via rule syntax.
  49. *
  50. * The second contraction character is POS_BASE + Position.
  51. */
  52. static const UChar POS_LEAD = 0xfffe;
  53. /**
  54. * Base for the second character of contractions that encode special reset positions.
  55. * Braille characters U+28xx are printable and normalization-inert.
  56. * @see POS_LEAD
  57. */
  58. static const UChar POS_BASE = 0x2800;
  59. class U_I18N_API Sink : public UObject {
  60. public:
  61. virtual ~Sink();
  62. /**
  63. * Adds a reset.
  64. * strength=UCOL_IDENTICAL for &str.
  65. * strength=UCOL_PRIMARY/UCOL_SECONDARY/UCOL_TERTIARY for &[before n]str where n=1/2/3.
  66. */
  67. virtual void addReset(int32_t strength, const UnicodeString &str,
  68. const char *&errorReason, UErrorCode &errorCode) = 0;
  69. /**
  70. * Adds a relation with strength and prefix | str / extension.
  71. */
  72. virtual void addRelation(int32_t strength, const UnicodeString &prefix,
  73. const UnicodeString &str, const UnicodeString &extension,
  74. const char *&errorReason, UErrorCode &errorCode) = 0;
  75. virtual void suppressContractions(const UnicodeSet &set, const char *&errorReason,
  76. UErrorCode &errorCode);
  77. virtual void optimize(const UnicodeSet &set, const char *&errorReason,
  78. UErrorCode &errorCode);
  79. };
  80. class U_I18N_API Importer : public UObject {
  81. public:
  82. virtual ~Importer();
  83. virtual void getRules(
  84. const char *localeID, const char *collationType,
  85. UnicodeString &rules,
  86. const char *&errorReason, UErrorCode &errorCode) = 0;
  87. };
  88. /**
  89. * Constructor.
  90. * The Sink must be set before parsing.
  91. * The Importer can be set, otherwise [import locale] syntax is not supported.
  92. */
  93. CollationRuleParser(const CollationData *base, UErrorCode &errorCode);
  94. ~CollationRuleParser();
  95. /**
  96. * Sets the pointer to a Sink object.
  97. * The pointer is aliased: Pointer copy without cloning or taking ownership.
  98. */
  99. void setSink(Sink *sinkAlias) {
  100. sink = sinkAlias;
  101. }
  102. /**
  103. * Sets the pointer to an Importer object.
  104. * The pointer is aliased: Pointer copy without cloning or taking ownership.
  105. */
  106. void setImporter(Importer *importerAlias) {
  107. importer = importerAlias;
  108. }
  109. void parse(const UnicodeString &ruleString,
  110. CollationSettings &outSettings,
  111. UParseError *outParseError,
  112. UErrorCode &errorCode);
  113. const char *getErrorReason() const { return errorReason; }
  114. /**
  115. * Gets a script or reorder code from its string representation.
  116. * @return the script/reorder code, or
  117. * -1 if not recognized
  118. */
  119. static int32_t getReorderCode(const char *word);
  120. private:
  121. /** UCOL_PRIMARY=0 .. UCOL_IDENTICAL=15 */
  122. static const int32_t STRENGTH_MASK = 0xf;
  123. static const int32_t STARRED_FLAG = 0x10;
  124. static const int32_t OFFSET_SHIFT = 8;
  125. void parse(const UnicodeString &ruleString, UErrorCode &errorCode);
  126. void parseRuleChain(UErrorCode &errorCode);
  127. int32_t parseResetAndPosition(UErrorCode &errorCode);
  128. int32_t parseRelationOperator(UErrorCode &errorCode);
  129. void parseRelationStrings(int32_t strength, int32_t i, UErrorCode &errorCode);
  130. void parseStarredCharacters(int32_t strength, int32_t i, UErrorCode &errorCode);
  131. int32_t parseTailoringString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
  132. int32_t parseString(int32_t i, UnicodeString &raw, UErrorCode &errorCode);
  133. /**
  134. * Sets str to a contraction of U+FFFE and (U+2800 + Position).
  135. * @return rule index after the special reset position
  136. */
  137. int32_t parseSpecialPosition(int32_t i, UnicodeString &str, UErrorCode &errorCode);
  138. void parseSetting(UErrorCode &errorCode);
  139. void parseReordering(const UnicodeString &raw, UErrorCode &errorCode);
  140. static UColAttributeValue getOnOffValue(const UnicodeString &s);
  141. int32_t parseUnicodeSet(int32_t i, UnicodeSet &set, UErrorCode &errorCode);
  142. int32_t readWords(int32_t i, UnicodeString &raw) const;
  143. int32_t skipComment(int32_t i) const;
  144. void setParseError(const char *reason, UErrorCode &errorCode);
  145. void setErrorContext();
  146. /**
  147. * ASCII [:P:] and [:S:]:
  148. * [\u0021-\u002F \u003A-\u0040 \u005B-\u0060 \u007B-\u007E]
  149. */
  150. static UBool isSyntaxChar(UChar32 c);
  151. int32_t skipWhiteSpace(int32_t i) const;
  152. const Normalizer2 &nfd, &nfc;
  153. const UnicodeString *rules;
  154. const CollationData *const baseData;
  155. CollationSettings *settings;
  156. UParseError *parseError;
  157. const char *errorReason;
  158. Sink *sink;
  159. Importer *importer;
  160. int32_t ruleIndex;
  161. };
  162. U_NAMESPACE_END
  163. #endif // !UCONFIG_NO_COLLATION
  164. #endif // __COLLATIONRULEPARSER_H__