123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- //
- // rbbirb.h
- //
- // Copyright (C) 2002-2008, International Business Machines Corporation and others.
- // All Rights Reserved.
- //
- // This file contains declarations for several classes from the
- // Rule Based Break Iterator rule builder.
- //
- #ifndef RBBIRB_H
- #define RBBIRB_H
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_BREAK_ITERATION
- #include <utility>
- #include "unicode/uobject.h"
- #include "unicode/rbbi.h"
- #include "unicode/uniset.h"
- #include "unicode/parseerr.h"
- #include "uhash.h"
- #include "uvector.h"
- #include "unicode/symtable.h"// For UnicodeSet parsing, is the interface that
- // looks up references to $variables within a set.
- U_NAMESPACE_BEGIN
// Forward declarations of rule-builder types that this header refers to
// only through pointers.
class   RBBINode;
class   RBBIRuleScanner;
struct  RBBIRuleTableEl;
class   RBBISetBuilder;
class   RBBITableBuilder;
- //--------------------------------------------------------------------------------
- //
- // RBBISymbolTable. Implements SymbolTable interface that is used by the
- // UnicodeSet parser to resolve references to $variables.
- //
- //--------------------------------------------------------------------------------
- class RBBISymbolTableEntry : public UMemory { // The symbol table hash table contains one
- public: // of these structs for each entry.
- RBBISymbolTableEntry();
- UnicodeString key;
- RBBINode *val;
- ~RBBISymbolTableEntry();
- private:
- RBBISymbolTableEntry(const RBBISymbolTableEntry &other); // forbid copying of this class
- RBBISymbolTableEntry &operator=(const RBBISymbolTableEntry &other); // forbid copying of this class
- };
- class RBBISymbolTable : public UMemory, public SymbolTable {
- private:
- const UnicodeString &fRules;
- UHashtable *fHashTable;
- RBBIRuleScanner *fRuleScanner;
- // These next two fields are part of the mechanism for passing references to
- // already-constructed UnicodeSets back to the UnicodeSet constructor
- // when the pattern includes $variable references.
- const UnicodeString ffffString; // = "/uffff"
- UnicodeSet *fCachedSetLookup;
- public:
- // API inherited from class SymbolTable
- virtual const UnicodeString* lookup(const UnicodeString& s) const;
- virtual const UnicodeFunctor* lookupMatcher(UChar32 ch) const;
- virtual UnicodeString parseReference(const UnicodeString& text,
- ParsePosition& pos, int32_t limit) const;
- // Additional Functions
- RBBISymbolTable(RBBIRuleScanner *, const UnicodeString &fRules, UErrorCode &status);
- virtual ~RBBISymbolTable();
- virtual RBBINode *lookupNode(const UnicodeString &key) const;
- virtual void addEntry (const UnicodeString &key, RBBINode *val, UErrorCode &err);
- #ifdef RBBI_DEBUG
- virtual void rbbiSymtablePrint() const;
- #else
- // A do-nothing inline function for non-debug builds. Member funcs can't be empty
- // or the call sites won't compile.
- int32_t fFakeField;
- #define rbbiSymtablePrint() fFakeField=0;
- #endif
- private:
- RBBISymbolTable(const RBBISymbolTable &other); // forbid copying of this class
- RBBISymbolTable &operator=(const RBBISymbolTable &other); // forbid copying of this class
- };
- //--------------------------------------------------------------------------------
- //
- // class RBBIRuleBuilder The top-level class handling RBBI rule compiling.
- //
- //--------------------------------------------------------------------------------
- class RBBIRuleBuilder : public UMemory {
- public:
- // Create a rule based break iterator from a set of rules.
- // This function is the main entry point into the rule builder. The
- // public ICU API for creating RBBIs uses this function to do the actual work.
- //
- static BreakIterator * createRuleBasedBreakIterator( const UnicodeString &rules,
- UParseError *parseError,
- UErrorCode &status);
- public:
- // The "public" functions and data members that appear below are accessed
- // (and shared) by the various parts that make up the rule builder. They
- // are NOT intended to be accessed by anything outside of the
- // rule builder implementation.
- RBBIRuleBuilder(const UnicodeString &rules,
- UParseError *parseErr,
- UErrorCode &status
- );
- virtual ~RBBIRuleBuilder();
- /**
- * Build the state tables and char class Trie from the source rules.
- */
- RBBIDataHeader *build(UErrorCode &status);
- /**
- * Fold together redundant character classes (table columns) and
- * redundant states (table rows). Done after initial table generation,
- * before serializing the result.
- */
- void optimizeTables();
- char *fDebugEnv; // controls debug trace output
- UErrorCode *fStatus; // Error reporting. Keeping status
- UParseError *fParseError; // here avoids passing it everywhere.
- const UnicodeString &fRules; // The rule string that we are compiling
- UnicodeString fStrippedRules; // The rule string, with comments stripped.
- RBBIRuleScanner *fScanner; // The scanner.
- RBBINode *fForwardTree; // The parse trees, generated by the scanner,
- RBBINode *fReverseTree; // then manipulated by subsequent steps.
- RBBINode *fSafeFwdTree;
- RBBINode *fSafeRevTree;
- RBBINode **fDefaultTree; // For rules not qualified with a !
- // the tree to which they belong to.
- UBool fChainRules; // True for chained Unicode TR style rules.
- // False for traditional regexp rules.
- UBool fLBCMNoChain; // True: suppress chaining of rules on
- // chars with LineBreak property == CM.
- UBool fLookAheadHardBreak; // True: Look ahead matches cause an
- // immediate break, no continuing for the
- // longest match.
- RBBISetBuilder *fSetBuilder; // Set and Character Category builder.
- UVector *fUSetNodes; // Vector of all uset nodes.
- RBBITableBuilder *fForwardTable; // State transition table, build time form.
- UVector *fRuleStatusVals; // The values that can be returned
- // from getRuleStatus().
- RBBIDataHeader *flattenData(); // Create the flattened (runtime format)
- // data tables..
- private:
- RBBIRuleBuilder(const RBBIRuleBuilder &other); // forbid copying of this class
- RBBIRuleBuilder &operator=(const RBBIRuleBuilder &other); // forbid copying of this class
- };
- //----------------------------------------------------------------------------
- //
- // RBBISetTableEl is an entry in the hash table of UnicodeSets that have
- // been encountered. The val Node will be of nodetype uset
- // and contain pointers to the actual UnicodeSets.
- // The Key is the source string for initializing the set.
- //
- // The hash table is used to avoid creating duplicate
- // unnamed (not $var references) UnicodeSets.
- //
- // Memory Management:
- // The Hash Table owns these RBBISetTableEl structs and
- // the key strings. It does NOT own the val nodes.
- //
- //----------------------------------------------------------------------------
- struct RBBISetTableEl {
- UnicodeString *key;
- RBBINode *val;
- };
/**
 * Two int32_t values bundled together. Used for pairs of states
 * or pairs of character classes.
 */
using IntPair = std::pair<int32_t, int32_t>;
- //----------------------------------------------------------------------------
- //
- // RBBIDebugPrintf Printf equivalent, for debugging output.
- // Conditional compilation of the implementation lets us
- // get rid of the stdio dependency in environments where it
- // is unavailable.
- //
- //----------------------------------------------------------------------------
- #ifdef RBBI_DEBUG
- #include <stdio.h>
- #define RBBIDebugPrintf printf
- #define RBBIDebugPuts puts
- #else
- #undef RBBIDebugPrintf
- #define RBBIDebugPuts(arg)
- #endif
- U_NAMESPACE_END
- #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
- #endif
|