// © 2016 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html
//
//  rbbisetb.h
/*
**********************************************************************
*   Copyright (c) 2001-2005, International Business Machines
*   Corporation and others.  All Rights Reserved.
**********************************************************************
*/
  11. #ifndef RBBISETB_H
  12. #define RBBISETB_H
  13. #include "unicode/utypes.h"
  14. #if !UCONFIG_NO_BREAK_ITERATION
  15. #include "unicode/uobject.h"
  16. #include "rbbirb.h"
  17. #include "utrie2.h"
  18. #include "uvector.h"
  19. U_NAMESPACE_BEGIN
  //
  //  RBBISetBuilder   Derives the character categories used by the runtime RBBI engine
  //                   from the Unicode Sets appearing in the source RBBI rules, and
  //                   creates the TRIE table used to map from Unicode to the
  //                   character categories.
  //
  26. //
  27. // RangeDescriptor
  28. //
  29. // Each of the non-overlapping character ranges gets one of these descriptors.
  30. // All of them are strung together in a linked list, which is kept in order
  31. // (by character)
  32. //
  33. class RangeDescriptor : public UMemory {
  34. public:
  35. UChar32 fStartChar; // Start of range, unicode 32 bit value.
  36. UChar32 fEndChar; // End of range, unicode 32 bit value.
  37. int32_t fNum; // runtime-mapped input value for this range.
  38. UVector *fIncludesSets; // vector of the the original
  39. // Unicode sets that include this range.
  40. // (Contains ptrs to uset nodes)
  41. RangeDescriptor *fNext; // Next RangeDescriptor in the linked list.
  42. RangeDescriptor(UErrorCode &status);
  43. RangeDescriptor(const RangeDescriptor &other, UErrorCode &status);
  44. ~RangeDescriptor();
  45. void split(UChar32 where, UErrorCode &status); // Spit this range in two at "where", with
  46. // where appearing in the second (higher) part.
  47. void setDictionaryFlag(); // Check whether this range appears as part of
  48. // the Unicode set named "dictionary"
  49. private:
  50. RangeDescriptor(const RangeDescriptor &other); // forbid copying of this class
  51. RangeDescriptor &operator=(const RangeDescriptor &other); // forbid copying of this class
  52. };
  53. //
  54. // RBBISetBuilder Handles processing of Unicode Sets from RBBI rules.
  55. //
  56. // Starting with the rules parse tree from the scanner,
  57. //
  58. // - Enumerate the set of UnicodeSets that are referenced
  59. // by the RBBI rules.
  60. // - compute a derived set of non-overlapping UnicodeSets
  61. // that will correspond to columns in the state table for
  62. // the RBBI execution engine.
  63. // - construct the trie table that maps input characters
  64. // to set numbers in the non-overlapping set of sets.
  65. //
  66. class RBBISetBuilder : public UMemory {
  67. public:
  68. RBBISetBuilder(RBBIRuleBuilder *rb);
  69. ~RBBISetBuilder();
  70. void buildRanges();
  71. void buildTrie();
  72. void addValToSets(UVector *sets, uint32_t val);
  73. void addValToSet (RBBINode *usetNode, uint32_t val);
  74. int32_t getNumCharCategories() const; // CharCategories are the same as input symbol set to the
  75. // runtime state machine, which are the same as
  76. // columns in the DFA state table
  77. int32_t getTrieSize() /*const*/; // Size in bytes of the serialized Trie.
  78. void serializeTrie(uint8_t *where); // write out the serialized Trie.
  79. UChar32 getFirstChar(int32_t val) const;
  80. UBool sawBOF() const; // Indicate whether any references to the {bof} pseudo
  81. // character were encountered.
  82. /**
  83. * Merge two character categories that have been identified as having equivalent behavior.
  84. * The ranges belonging to the second category (table column) will be added to the first.
  85. * @param categories the pair of categories to be merged.
  86. */
  87. void mergeCategories(IntPair categories);
  88. static constexpr int32_t DICT_BIT = 0x4000;
  89. #ifdef RBBI_DEBUG
  90. void printSets();
  91. void printRanges();
  92. void printRangeGroups();
  93. #else
  94. #define printSets()
  95. #define printRanges()
  96. #define printRangeGroups()
  97. #endif
  98. private:
  99. void numberSets();
  100. RBBIRuleBuilder *fRB; // The RBBI Rule Compiler that owns us.
  101. UErrorCode *fStatus;
  102. RangeDescriptor *fRangeList; // Head of the linked list of RangeDescriptors
  103. UTrie2 *fTrie; // The mapping TRIE that is the end result of processing
  104. uint32_t fTrieSize; // the Unicode Sets.
  105. // Groups correspond to character categories -
  106. // groups of ranges that are in the same original UnicodeSets.
  107. // fGroupCount is the index of the last used group.
  108. // fGroupCount+1 is also the number of columns in the RBBI state table being compiled.
  109. // State table column 0 is not used. Column 1 is for end-of-input.
  110. // column 2 is for group 0. Funny counting.
  111. int32_t fGroupCount;
  112. UBool fSawBOF;
  113. RBBISetBuilder(const RBBISetBuilder &other); // forbid copying of this class
  114. RBBISetBuilder &operator=(const RBBISetBuilder &other); // forbid copying of this class
  115. };
  116. U_NAMESPACE_END
  117. #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
  118. #endif