regexcmp.h 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // regexcmp.h
  5. //
  6. // Copyright (C) 2002-2016, International Business Machines Corporation and others.
  7. // All Rights Reserved.
  8. //
  9. // This file contains declarations for the class RegexCompile
  10. //
  11. // This class is internal to the regular expression implementation.
  12. // For the public Regular Expression API, see the file "unicode/regex.h"
  13. //
  // Include guard, named after this header (regexcmp.h). The previous macro,
  // RBBISCAN_H, did not match this file; a guard name shared with another
  // header would cause whichever header is included second to be skipped.
  14. #ifndef REGEXCMP_H
  15. #define REGEXCMP_H
  16. #include "unicode/utypes.h"
  17. #if !UCONFIG_NO_REGULAR_EXPRESSIONS
  18. #include "unicode/parseerr.h"
  19. #include "unicode/uniset.h"
  20. #include "unicode/uobject.h"
  21. #include "unicode/utext.h"
  22. #include "uhash.h"
  23. #include "uvector.h"
  24. #include "uvectr32.h"
  25. U_NAMESPACE_BEGIN
  26. //--------------------------------------------------------------------------------
  27. //
  28. // class RegexCompile Contains the regular expression compiler.
  29. //
  30. //--------------------------------------------------------------------------------
  // Forward declarations. This header only uses these types through pointers
  // (RegexTableEl **fStateTable, RegexPattern *fRXPat), so their full
  // definitions are not needed here.
  31. struct RegexTableEl;
  32. class RegexPattern;
  // RegexCompile holds the state of one regular expression compilation:
  // low-level character scanning, the parse state machine, and the data used
  // while generating pcode for the match engine.
  // NOTE(review): presumably the compiled result is stored in the RegexPattern
  // given to the constructor (see fRXPat) -- confirm in the implementation.
  33. class U_I18N_API RegexCompile : public UMemory {
  34. public:
  35. enum {
  36. kStackSize = 100 // The size of the state stack (fStack) for
  37. }; // pattern parsing. Corresponds roughly
  38. // to the depth of parentheses nesting
  39. // that is allowed in a pattern.
  // One pattern character, as handed back by nextChar().
  40. struct RegexPatternChar {
  41. UChar32 fChar; // The code point.
  42. UBool fQuoted; // NOTE(review): presumably set for escaped / \Q...\E quoted chars -- confirm.
  43. };
  // rp: the RegexPattern this compiler works on; e: ICU error status.
  44. RegexCompile(RegexPattern *rp, UErrorCode &e);
  // Compile a pattern supplied either as a UnicodeString or as a UText.
  // NOTE(review): pp presumably receives parse error position details and
  // e the resulting status -- confirm against the implementation.
  45. void compile(const UnicodeString &pat, UParseError &pp, UErrorCode &e);
  46. void compile(UText *pat, UParseError &pp, UErrorCode &e);
  47. virtual ~RegexCompile();
  48. void nextChar(RegexPatternChar &c); // Get the next char from the input stream; result is written to c.
  49. static void cleanup(); // Memory cleanup
  50. // Categories of parentheses in pattern.
  51. // The category is saved in the compile-time parentheses stack frame, and
  52. // determines the code to be generated when the matching close ) is encountered.
  // All values are negative: per the fParenStack description below, a negative
  // entry marks the kind of paren that opened a frame.
  53. enum EParenClass {
  54. plain = -1, // No special handling
  55. capturing = -2,
  56. atomic = -3,
  57. lookAhead = -4,
  58. negLookAhead = -5,
  59. flags = -6,
  60. lookBehind = -7,
  61. lookBehindN = -8
  62. };
  63. private:
  64. UBool doParseActions(int32_t a); // NOTE(review): presumably runs parse action 'a'; meaning of the result is not visible here.
  65. void error(UErrorCode e); // Error reporting convenience function.
  // Low-level character access. NOTE(review): "LL" presumably means low level,
  // i.e. below the quoting/escape handling done by nextChar() -- confirm.
  66. UChar32 nextCharLL();
  67. UChar32 peekCharLL();
  // NOTE(review): ownership of the UnicodeSet returned by the two scan
  // functions below is not visible in this header -- confirm before use.
  68. UnicodeSet *scanProp();
  69. UnicodeSet *scanPosixProp();
  70. void handleCloseParen();
  71. int32_t blockTopLoc(UBool reserve); // Locate a position in the compiled pattern
  72. // at the top of the just completed block
  73. // or operation, and optionally ensure that
  74. // there is space to add an opcode there.
  75. void compileSet(UnicodeSet *theSet); // Generate the compiled pattern for
  76. // a reference to a UnicodeSet.
  77. void compileInterval(int32_t InitOp, // Generate the code for a {min,max} quantifier.
  78. int32_t LoopOp);
  79. UBool compileInlineInterval(); // Generate inline code for a {min,max} quantifier
  80. void literalChar(UChar32 c); // Compile a literal char
  81. void fixLiterals(UBool split=FALSE); // Generate code for pending literal characters.
  82. void insertOp(int32_t where); // Open up a slot for a new op in the
  83. // generated code at the specified location.
  84. void appendOp(int32_t op); // Append a new op to the compiled pattern.
  85. void appendOp(int32_t type, int32_t val); // Build & append a new op to the compiled pattern.
  86. int32_t buildOp(int32_t type, int32_t val); // Construct a new pcode instruction.
  87. int32_t allocateData(int32_t size); // Allocate space in the matcher data area.
  88. // Return index of the newly allocated data.
  89. int32_t allocateStackData(int32_t size); // Allocate space in the match back-track stack frame.
  90. // Return offset index in the frame.
  // NOTE(review): start/end are presumably positions in the compiled pattern
  // bounding the range to measure -- confirm.
  91. int32_t minMatchLength(int32_t start,
  92. int32_t end);
  93. int32_t maxMatchLength(int32_t start,
  94. int32_t end);
  95. void matchStartType();
  96. void stripNOPs();
  97. void setEval(int32_t op);
  98. void setPushOp(int32_t op);
  99. UChar32 scanNamedChar();
  100. UnicodeSet *createSetForProperty(const UnicodeString &propName, UBool negated);
  101. public: // Public for testing only.
  102. static void U_EXPORT2 findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars);
  103. private:
  // NOTE(review): the three pointers below presumably refer to objects owned
  // by the caller (status, pattern being built, parse error) -- confirm.
  104. UErrorCode *fStatus;
  105. RegexPattern *fRXPat;
  106. UParseError *fParseErr;
  107. //
  108. // Data associated with low level character scanning
  109. //
  110. int64_t fScanIndex; // Index of current character being processed
  111. // in the input pattern string.
  112. UBool fQuoteMode; // Scan is in a \Q...\E quoted region
  113. UBool fInBackslashQuote; // Scan is between a '\' and the following char.
  114. UBool fEOLComments; // When scan is just after '(?', inhibit #... to
  115. // end of line comments, in favor of (?#...) comments.
  116. int64_t fLineNum; // Line number in input file.
  117. int64_t fCharNum; // Char position within the line.
  118. UChar32 fLastChar; // Previous char, needed to count CR-LF
  119. // as a single line, not two.
  120. UChar32 fPeekChar; // Saved char, if we've scanned ahead.
  121. RegexPatternChar fC; // Current char for parse state machine
  122. // processing.
  123. //
  124. // Data for the state machine that parses the regular expression.
  125. //
  126. RegexTableEl **fStateTable; // State Transition Table for regex Rule
  127. // parsing. index by p[state][char-class]
  128. uint16_t fStack[kStackSize]; // State stack, holds state pushes
  129. int32_t fStackPtr; // and pops as specified in the state
  130. // transition rules.
  131. //
  132. // Data associated with the generation of the pcode for the match engine
  133. //
  134. int32_t fModeFlags; // Match Flags. (Case Insensitive, etc.)
  135. // Always has high bit (31) set so that flag values
  136. // on the paren stack are distinguished from relocatable
  137. // pcode addresses.
  138. int32_t fNewModeFlags; // New flags, while compiling (?i, holds state
  139. // until last flag is scanned.
  140. UBool fSetModeFlag; // true for (?ismx, false for (?-ismx
  141. UnicodeString fLiteralChars; // Literal chars or strings from the pattern are accumulated here.
  142. // Once completed, meaning that some non-literal pattern
  143. // construct is encountered, the appropriate opcodes
  144. // to match the literal will be generated, and this
  145. // string will be cleared.
  146. int64_t fPatternLength; // Length of the input pattern string.
  147. UVector32 fParenStack; // parentheses stack. Each frame consists of
  148. // the positions of compiled pattern operations
  149. // needing fixup, followed by negative value. The
  150. // first entry in each frame is the position of the
  151. // spot reserved for use when a quantifier
  152. // needs to add a SAVE at the start of a (block)
  153. // The negative value (-1, -2,...) indicates
  154. // the kind of paren that opened the frame. Some
  155. // need special handling on close.
  156. int32_t fMatchOpenParen; // The position in the compiled pattern
  157. // of the slot reserved for a state save
  158. // at the start of the most recently processed
  159. // parenthesized block. Updated when processing
  160. // a close to the location for the corresponding open.
  161. int32_t fMatchCloseParen; // The position in the pattern of the first
  162. // location after the most recently processed
  163. // parenthesized block.
  164. int32_t fIntervalLow; // {lower, upper} interval quantifier values.
  165. int32_t fIntervalUpper; // Placed here temporarily, when pattern is
  166. // initially scanned. Each new interval
  167. // encountered overwrites these values.
  168. // -1 for the upper interval value means none
  169. // was specified (unlimited occurrences.)
  170. int64_t fNameStartPos; // Starting position of a \N{NAME} name in a
  171. // pattern, valid while remainder of name is
  172. // scanned.
  173. UStack fSetStack; // Stack of UnicodeSets, used while evaluating
  174. // (at compile time) set expressions within
  175. // the pattern.
  176. UStack fSetOpStack; // Stack of pending set operators (&&, --, union)
  177. UChar32 fLastSetLiteral; // The last single code point added to a set.
  178. // Needed when "-y" is scanned, and we need
  179. // to turn "x-y" into a range.
  180. UnicodeString *fCaptureName; // Named Capture, the group name is built up
  181. // in this string while being scanned.
  182. };
  183. // Constant values to be pushed onto fSetOpStack while scanning & evaluating [set expressions]
  184. // The high 16 bits are the operator precedence, and the low 16 are a code for the operation itself.
  // Set-expression operator codes. Each value is (precedence << 16) | opcode:
  // '<<' binds tighter than '|', so the unparenthesized initializers below
  // evaluate as written here. Operators sharing a precedence differ only in
  // the low 16 bits.
  185. enum SetOperations {
  186. setStart = 0 << 16 | 1, // precedence 0, code 1
  187. setEnd = 1 << 16 | 2, // precedence 1, code 2
  188. setNegation = 2 << 16 | 3, // precedence 2, code 3
  189. setCaseClose = 2 << 16 | 9, // precedence 2, code 9
  190. setDifference2 = 3 << 16 | 4, // '--' set difference operator
  191. setIntersection2 = 3 << 16 | 5, // '&&' set intersection operator
  192. setUnion = 4 << 16 | 6, // implicit union of adjacent items
  193. setDifference1 = 4 << 16 | 7, // '-', single dash difference op, for compatibility with old UnicodeSet.
  194. setIntersection1 = 4 << 16 | 8 // '&', single amp intersection op, for compatibility with old UnicodeSet.
  195. };
  196. U_NAMESPACE_END
  197. #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS
  198. #endif // end of include guard opened at the top of this header