rbbiscan.h 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // rbbiscan.h
  5. //
  6. // Copyright (C) 2002-2016, International Business Machines Corporation and others.
  7. // All Rights Reserved.
  8. //
  9. // This file contains declarations for class RBBIRuleScanner
  10. //
  11. #ifndef RBBISCAN_H
  12. #define RBBISCAN_H
  13. #include "unicode/utypes.h"
  14. #include "unicode/uobject.h"
  15. #include "unicode/rbbi.h"
  16. #include "unicode/uniset.h"
  17. #include "unicode/parseerr.h"
  18. #include "uhash.h"
  19. #include "uvector.h"
  20. #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
  21. // looks up references to $variables within a set.
  22. #include "rbbinode.h"
  23. #include "rbbirpt.h"
  24. U_NAMESPACE_BEGIN
  25. class RBBIRuleBuilder;
  26. class RBBISymbolTable;
  27. //--------------------------------------------------------------------------------
  28. //
  29. // class RBBIRuleScanner does the lowest level, character-at-a-time
  30. // scanning of break iterator rules.
  31. //
  32. // The output of the scanner is parse trees for
  33. // the rule expressions and a list of all Unicode Sets
  34. // encountered.
  35. //
  36. //--------------------------------------------------------------------------------
  37. class RBBIRuleScanner : public UMemory {
  38. public:
  39. enum {
  40. kStackSize = 100 // The size of the state stack for
  41. }; // rules parsing. Corresponds roughly
  42. // to the depth of parentheses nesting
  43. // that is allowed in the rules.
  44. struct RBBIRuleChar {
  45. UChar32 fChar;
  46. UBool fEscaped;
  47. RBBIRuleChar() : fChar(0), fEscaped(FALSE) {}
  48. };
  49. RBBIRuleScanner(RBBIRuleBuilder *rb);
  50. virtual ~RBBIRuleScanner();
  51. void nextChar(RBBIRuleChar &c); // Get the next char from the input stream.
  52. // Return false if at end.
  53. UBool push(const RBBIRuleChar &c); // Push (unget) one character.
  54. // Only a single character may be pushed.
  55. void parse(); // Parse the rules, generating two parse
  56. // trees, one each for the forward and
  57. // reverse rules,
  58. // and a list of UnicodeSets encountered.
  59. int32_t numRules(); // Return the number of rules that have been seen.
  60. /**
  61. * Return a rules string without unnecessary
  62. * characters.
  63. */
  64. static UnicodeString stripRules(const UnicodeString &rules);
  65. private:
  66. UBool doParseActions(int32_t a);
  67. void error(UErrorCode e); // error reporting convenience function.
  68. void fixOpStack(RBBINode::OpPrecedence p);
  69. // a character.
  70. void findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt = NULL);
  71. UChar32 nextCharLL();
  72. #ifdef RBBI_DEBUG
  73. void printNodeStack(const char *title);
  74. #endif
  75. RBBINode *pushNewNode(RBBINode::NodeType t);
  76. void scanSet();
  77. RBBIRuleBuilder *fRB; // The rule builder that we are part of.
  78. int32_t fScanIndex; // Index of current character being processed
  79. // in the rule input string.
  80. int32_t fNextIndex; // Index of the next character, which
  81. // is the first character not yet scanned.
  82. UBool fQuoteMode; // Scan is in a 'quoted region'
  83. int32_t fLineNum; // Line number in input file.
  84. int32_t fCharNum; // Char position within the line.
  85. UChar32 fLastChar; // Previous char, needed to count CR-LF
  86. // as a single line, not two.
  87. RBBIRuleChar fC; // Current char for parse state machine
  88. // processing.
  89. UnicodeString fVarName; // $variableName, valid when we've just
  90. // scanned one.
  91. RBBIRuleTableEl **fStateTable; // State Transition Table for RBBI Rule
  92. // parsing. index by p[state][char-class]
  93. uint16_t fStack[kStackSize]; // State stack, holds state pushes
  94. int32_t fStackPtr; // and pops as specified in the state
  95. // transition rules.
  96. RBBINode *fNodeStack[kStackSize]; // Node stack, holds nodes created
  97. // during the parse of a rule
  98. int32_t fNodeStackPtr;
  99. UBool fReverseRule; // True if the rule currently being scanned
  100. // is a reverse direction rule (if it
  101. // starts with a '!')
  102. UBool fLookAheadRule; // True if the rule includes a '/'
  103. // somewhere within it.
  104. UBool fNoChainInRule; // True if the current rule starts with a '^'.
  105. RBBISymbolTable *fSymbolTable; // symbol table, holds definitions of
  106. // $variable symbols.
  107. UHashtable *fSetTable; // UnicocodeSet hash table, holds indexes to
  108. // the sets created while parsing rules.
  109. // The key is the string used for creating
  110. // the set.
  111. UnicodeSet fRuleSets[10]; // Unicode Sets that are needed during
  112. // the scanning of RBBI rules. The
  113. // indicies for these are assigned by the
  114. // perl script that builds the state tables.
  115. // See rbbirpt.h.
  116. int32_t fRuleNum; // Counts each rule as it is scanned.
  117. int32_t fOptionStart; // Input index of start of a !!option
  118. // keyword, while being scanned.
  119. UnicodeSet *gRuleSet_rule_char;
  120. UnicodeSet *gRuleSet_white_space;
  121. UnicodeSet *gRuleSet_name_char;
  122. UnicodeSet *gRuleSet_name_start_char;
  123. RBBIRuleScanner(const RBBIRuleScanner &other); // forbid copying of this class
  124. RBBIRuleScanner &operator=(const RBBIRuleScanner &other); // forbid copying of this class
  125. };
  126. U_NAMESPACE_END
  127. #endif