rbbitblb.h 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. //
  4. // rbbitblb.h
  5. //
  6. /*
  7. **********************************************************************
  8. * Copyright (c) 2002-2016, International Business Machines
  9. * Corporation and others. All Rights Reserved.
  10. **********************************************************************
  11. */
  12. #ifndef RBBITBLB_H
  13. #define RBBITBLB_H
  14. #include "unicode/utypes.h"
  15. #if !UCONFIG_NO_BREAK_ITERATION
  16. #include "unicode/uobject.h"
  17. #include "unicode/rbbi.h"
  18. #include "rbbirb.h"
  19. #include "rbbinode.h"
  20. U_NAMESPACE_BEGIN
  21. class RBBIRuleScanner;
  22. class RBBIRuleBuilder;
  23. class UVector32;
  24. //
  25. // class RBBITableBuilder is part of the RBBI rule compiler.
  26. // It builds the state transition table used by the RBBI runtime
  27. // from the expression syntax tree generated by the rule scanner.
  28. //
  29. // This class is part of the RBBI implementation only.
  30. // There is no user-visible public API here.
  31. //
  32. class RBBITableBuilder : public UMemory {
  33. public:
  34. RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
  35. ~RBBITableBuilder();
  36. void buildForwardTable();
  37. /** Return the runtime size in bytes of the built state table. */
  38. int32_t getTableSize() const;
  39. /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
  40. */
  41. void exportTable(void *where);
  42. /**
  43. * Find duplicate (redundant) character classes. Begin looking with categories.first.
  44. * Duplicate, if found are returned in the categories parameter.
  45. * This is an iterator-like function, used to identify character classes
  46. * (state table columns) that can be eliminated.
  47. * @param categories in/out parameter, specifies where to start looking for duplicates,
  48. * and returns the first pair of duplicates found, if any.
  49. * @return true if duplicate char classes were found, false otherwise.
  50. */
  51. bool findDuplCharClassFrom(IntPair *categories);
  52. /** Remove a column from the state table. Used when two character categories
  53. * have been found equivalent, and merged together, to eliminate the uneeded table column.
  54. */
  55. void removeColumn(int32_t column);
  56. /**
  57. * Check for, and remove dupicate states (table rows).
  58. * @return the number of states removed.
  59. */
  60. int32_t removeDuplicateStates();
  61. /** Build the safe reverse table from the already-constructed forward table. */
  62. void buildSafeReverseTable(UErrorCode &status);
  63. /** Return the runtime size in bytes of the built safe reverse state table. */
  64. int32_t getSafeTableSize() const;
  65. /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
  66. */
  67. void exportSafeTable(void *where);
  68. private:
  69. void calcNullable(RBBINode *n);
  70. void calcFirstPos(RBBINode *n);
  71. void calcLastPos(RBBINode *n);
  72. void calcFollowPos(RBBINode *n);
  73. void calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
  74. void bofFixup();
  75. void buildStateTable();
  76. void mapLookAheadRules();
  77. void flagAcceptingStates();
  78. void flagLookAheadStates();
  79. void flagTaggedStates();
  80. void mergeRuleStatusVals();
  81. /**
  82. * Merge redundant state table columns, eliminating character classes with identical behavior.
  83. * Done after the state tables are generated, just before converting to their run-time format.
  84. */
  85. int32_t mergeColumns();
  86. void addRuleRootNodes(UVector *dest, RBBINode *node);
  87. /**
  88. * Find duplicate (redundant) states, beginning at the specified pair,
  89. * within this state table. This is an iterator-like function, used to
  90. * identify states (state table rows) that can be eliminated.
  91. * @param states in/out parameter, specifies where to start looking for duplicates,
  92. * and returns the first pair of duplicates found, if any.
  93. * @return true if duplicate states were found, false otherwise.
  94. */
  95. bool findDuplicateState(IntPair *states);
  96. /** Remove a duplicate state.
  97. * @param duplStates The duplicate states. The first is kept, the second is removed.
  98. * All references to the second in the state table are retargeted
  99. * to the first.
  100. */
  101. void removeState(IntPair duplStates);
  102. /** Find the next duplicate state in the safe reverse table. An iterator function.
  103. * @param states in/out parameter, specifies where to start looking for duplicates,
  104. * and returns the first pair of duplicates found, if any.
  105. * @return true if a duplicate pair of states was found.
  106. */
  107. bool findDuplicateSafeState(IntPair *states);
  108. /** Remove a duplicate state from the safe table.
  109. * @param duplStates The duplicate states. The first is kept, the second is removed.
  110. * All references to the second in the state table are retargeted
  111. * to the first.
  112. */
  113. void removeSafeState(IntPair duplStates);
  114. // Set functions for UVector.
  115. // TODO: make a USet subclass of UVector
  116. void setAdd(UVector *dest, UVector *source);
  117. UBool setEquals(UVector *a, UVector *b);
  118. void sortedAdd(UVector **dest, int32_t val);
  119. public:
  120. #ifdef RBBI_DEBUG
  121. void printSet(UVector *s);
  122. void printPosSets(RBBINode *n /* = NULL*/);
  123. void printStates();
  124. void printRuleStatusTable();
  125. void printReverseTable();
  126. #else
  127. #define printSet(s)
  128. #define printPosSets(n)
  129. #define printStates()
  130. #define printRuleStatusTable()
  131. #define printReverseTable()
  132. #endif
  133. private:
  134. RBBIRuleBuilder *fRB;
  135. RBBINode *&fTree; // The root node of the parse tree to build a
  136. // table for.
  137. UErrorCode *fStatus;
  138. /** State Descriptors, UVector<RBBIStateDescriptor> */
  139. UVector *fDStates; // D states (Aho's terminology)
  140. // Index is state number
  141. // Contents are RBBIStateDescriptor pointers.
  142. /** Synthesized safe table, UVector of UnicodeString, one string per table row. */
  143. UVector *fSafeTable;
  144. /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
  145. UVector32 *fLookAheadRuleMap = nullptr;
  146. RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
  147. RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
  148. };
  149. //
  150. // RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
  151. // one for each state.
  152. class RBBIStateDescriptor : public UMemory {
  153. public:
  154. UBool fMarked;
  155. int32_t fAccepting;
  156. int32_t fLookAhead;
  157. UVector *fTagVals;
  158. int32_t fTagsIdx;
  159. UVector *fPositions; // Set of parse tree positions associated
  160. // with this state. Unordered (it's a set).
  161. // UVector contents are RBBINode *
  162. UVector32 *fDtran; // Transitions out of this state.
  163. // indexed by input character
  164. // contents is int index of dest state
  165. // in RBBITableBuilder.fDStates
  166. RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus);
  167. ~RBBIStateDescriptor();
  168. private:
  169. RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class
  170. RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class
  171. };
  172. U_NAMESPACE_END
  173. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  174. #endif