rbbirb.h 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // rbbirb.h
  5. //
  6. // Copyright (C) 2002-2008, International Business Machines Corporation and others.
  7. // All Rights Reserved.
  8. //
  9. // This file contains declarations for several classes from the
  10. // Rule Based Break Iterator rule builder.
  11. //
  12. #ifndef RBBIRB_H
  13. #define RBBIRB_H
  14. #include "unicode/utypes.h"
  15. #if !UCONFIG_NO_BREAK_ITERATION
  16. #include <utility>
  17. #include "unicode/uobject.h"
  18. #include "unicode/rbbi.h"
  19. #include "unicode/uniset.h"
  20. #include "unicode/parseerr.h"
  21. #include "uhash.h"
  22. #include "uvector.h"
  23. #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
  24. // looks up references to $variables within a set.
  25. U_NAMESPACE_BEGIN
  26. class RBBIRuleScanner;
  27. struct RBBIRuleTableEl;
  28. class RBBISetBuilder;
  29. class RBBINode;
  30. class RBBITableBuilder;
  31. //--------------------------------------------------------------------------------
  32. //
  33. // RBBISymbolTable. Implements SymbolTable interface that is used by the
  34. // UnicodeSet parser to resolve references to $variables.
  35. //
  36. //--------------------------------------------------------------------------------
  37. class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one
  38. public: // of these structs for each entry.
  39. RBBISymbolTableEntry();
  40. UnicodeString key;
  41. RBBINode *val;
  42. ~RBBISymbolTableEntry();
  43. private:
  44. RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class
  45. RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class
  46. };
  47. class RBBISymbolTable : public UMemory, public SymbolTable {
  48. private:
  49. const UnicodeString &fRules;
  50. UHashtable *fHashTable;
  51. RBBIRuleScanner *fRuleScanner;
  52. // These next two fields are part of the mechanism for passing references to
  53. // already-constructed UnicodeSets back to the UnicodeSet constructor
  54. // when the pattern includes $variable references.
  55. const UnicodeString ffffString; // = "/uffff"
  56. UnicodeSet *fCachedSetLookup;
  57. public:
  58. // API inherited from class SymbolTable
  59. virtual const UnicodeString* lookup(const UnicodeString& s) const;
  60. virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
  61. virtual UnicodeString parseReference(const UnicodeString& text,
  62. ParsePosition& pos, int32_t limit) const;
  63. // Additional Functions
  64. RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
  65. virtual ~RBBISymbolTable();
  66. virtual RBBINode *lookupNode(const UnicodeString &key) const;
  67. virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err);
  68. #ifdef RBBI_DEBUG
  69. virtual void rbbiSymtablePrint() const;
  70. #else
  71. // A do-nothing inline function for non-debug builds. Member funcs can't be empty
  72. // or the call sites won't compile.
  73. int32_t fFakeField;
  74. #define rbbiSymtablePrint() fFakeField=0;
  75. #endif
  76. private:
  77. RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
  78. RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class
  79. };
  80. //--------------------------------------------------------------------------------
  81. //
  82. // class RBBIRuleBuilder The top-level class handling RBBI rule compiling.
  83. //
  84. //--------------------------------------------------------------------------------
  85. class RBBIRuleBuilder : public UMemory {
  86. public:
  87. // Create a rule based break iterator from a set of rules.
  88. // This function is the main entry point into the rule builder. The
  89. // public ICU API for creating RBBIs uses this function to do the actual work.
  90. //
  91. static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules,
  92. UParseError *parseError,
  93. UErrorCode &status);
  94. public:
  95. // The "public" functions and data members that appear below are accessed
  96. // (and shared) by the various parts that make up the rule builder. They
  97. // are NOT intended to be accessed by anything outside of the
  98. // rule builder implementation.
  99. RBBIRuleBuilder(const UnicodeString &rules,
  100. UParseError *parseErr,
  101. UErrorCode &status
  102. );
  103. virtual ~RBBIRuleBuilder();
  104. /**
  105. * Build the state tables and char class Trie from the source rules.
  106. */
  107. RBBIDataHeader *build(UErrorCode &status);
  108. /**
  109. * Fold together redundant character classes (table columns) and
  110. * redundant states (table rows). Done after initial table generation,
  111. * before serializing the result.
  112. */
  113. void optimizeTables();
  114. char *fDebugEnv; // controls debug trace output
  115. UErrorCode *fStatus; // Error reporting. Keeping status
  116. UParseError *fParseError; // here avoids passing it everywhere.
  117. const UnicodeString &fRules; // The rule string that we are compiling
  118. UnicodeString fStrippedRules; // The rule string, with comments stripped.
  119. RBBIRuleScanner *fScanner; // The scanner.
  120. RBBINode *fForwardTree; // The parse trees, generated by the scanner,
  121. RBBINode *fReverseTree; // then manipulated by subsequent steps.
  122. RBBINode *fSafeFwdTree;
  123. RBBINode *fSafeRevTree;
  124. RBBINode **fDefaultTree; // For rules not qualified with a !
  125. // the tree to which they belong to.
  126. UBool fChainRules; // True for chained Unicode TR style rules.
  127. // False for traditional regexp rules.
  128. UBool fLBCMNoChain; // True: suppress chaining of rules on
  129. // chars with LineBreak property == CM.
  130. UBool fLookAheadHardBreak; // True: Look ahead matches cause an
  131. // immediate break, no continuing for the
  132. // longest match.
  133. RBBISetBuilder *fSetBuilder; // Set and Character Category builder.
  134. UVector *fUSetNodes; // Vector of all uset nodes.
  135. RBBITableBuilder *fForwardTable; // State transition table, build time form.
  136. UVector *fRuleStatusVals; // The values that can be returned
  137. // from getRuleStatus().
  138. RBBIDataHeader *flattenData(); // Create the flattened (runtime format)
  139. // data tables..
  140. private:
  141. RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class
  142. RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class
  143. };
  144. //----------------------------------------------------------------------------
  145. //
  146. // RBBISetTableEl is an entry in the hash table of UnicodeSets that have
  147. // been encountered. The val Node will be of nodetype uset
  148. // and contain pointers to the actual UnicodeSets.
  149. // The Key is the source string for initializing the set.
  150. //
  151. // The hash table is used to avoid creating duplicate
  152. // unnamed (not $var references) UnicodeSets.
  153. //
  154. // Memory Management:
  155. // The Hash Table owns these RBBISetTableEl structs and
  156. // the key strings. It does NOT own the val nodes.
  157. //
  158. //----------------------------------------------------------------------------
  159. struct RBBISetTableEl {
  160. UnicodeString *key;
  161. RBBINode *val;
  162. };
  163. /**
  164. * A pair of ints, used to bundle pairs of states or pairs of character classes.
  165. */
  166. typedef std::pair<int32_t, int32_t> IntPair;
  167. //----------------------------------------------------------------------------
  168. //
  169. // RBBIDebugPrintf Printf equivalent, for debugging output.
  170. // Conditional compilation of the implementation lets us
  171. // get rid of the stdio dependency in environments where it
  172. // is unavailable.
  173. //
  174. //----------------------------------------------------------------------------
  175. #ifdef RBBI_DEBUG
  176. #include <stdio.h>
  177. #define RBBIDebugPrintf printf
  178. #define RBBIDebugPuts puts
  179. #else
  180. #undef RBBIDebugPrintf
  181. #define RBBIDebugPuts(arg)
  182. #endif
  183. U_NAMESPACE_END
  184. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  185. #endif