123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- //
- // rbbitblb.h
- //
- /*
- **********************************************************************
- * Copyright (c) 2002-2016, International Business Machines
- * Corporation and others. All Rights Reserved.
- **********************************************************************
- */
- #ifndef RBBITBLB_H
- #define RBBITBLB_H
- #include "unicode/utypes.h"
- #if !UCONFIG_NO_BREAK_ITERATION
- #include "unicode/uobject.h"
- #include "unicode/rbbi.h"
- #include "rbbirb.h"
- #include "rbbinode.h"
- U_NAMESPACE_BEGIN
- class RBBIRuleScanner;
- class RBBIRuleBuilder;
- class UVector32;
- //
- // class RBBITableBuilder is part of the RBBI rule compiler.
- // It builds the state transition table used by the RBBI runtime
- // from the expression syntax tree generated by the rule scanner.
- //
- // This class is part of the RBBI implementation only.
- // There is no user-visible public API here.
- //
- class RBBITableBuilder : public UMemory {
- public:
- RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status);
- ~RBBITableBuilder();
- void buildForwardTable();
- /** Return the runtime size in bytes of the built state table. */
- int32_t getTableSize() const;
- /** Fill in the runtime state table. Sufficient memory must exist at the specified location.
- */
- void exportTable(void *where);
- /**
- * Find duplicate (redundant) character classes. Begin looking with categories.first.
- * Duplicate, if found are returned in the categories parameter.
- * This is an iterator-like function, used to identify character classes
- * (state table columns) that can be eliminated.
- * @param categories in/out parameter, specifies where to start looking for duplicates,
- * and returns the first pair of duplicates found, if any.
- * @return true if duplicate char classes were found, false otherwise.
- */
- bool findDuplCharClassFrom(IntPair *categories);
- /** Remove a column from the state table. Used when two character categories
- * have been found equivalent, and merged together, to eliminate the uneeded table column.
- */
- void removeColumn(int32_t column);
- /**
- * Check for, and remove dupicate states (table rows).
- * @return the number of states removed.
- */
- int32_t removeDuplicateStates();
- /** Build the safe reverse table from the already-constructed forward table. */
- void buildSafeReverseTable(UErrorCode &status);
- /** Return the runtime size in bytes of the built safe reverse state table. */
- int32_t getSafeTableSize() const;
- /** Fill in the runtime safe state table. Sufficient memory must exist at the specified location.
- */
- void exportSafeTable(void *where);
- private:
- void calcNullable(RBBINode *n);
- void calcFirstPos(RBBINode *n);
- void calcLastPos(RBBINode *n);
- void calcFollowPos(RBBINode *n);
- void calcChainedFollowPos(RBBINode *n, RBBINode *endMarkNode);
- void bofFixup();
- void buildStateTable();
- void mapLookAheadRules();
- void flagAcceptingStates();
- void flagLookAheadStates();
- void flagTaggedStates();
- void mergeRuleStatusVals();
- /**
- * Merge redundant state table columns, eliminating character classes with identical behavior.
- * Done after the state tables are generated, just before converting to their run-time format.
- */
- int32_t mergeColumns();
- void addRuleRootNodes(UVector *dest, RBBINode *node);
- /**
- * Find duplicate (redundant) states, beginning at the specified pair,
- * within this state table. This is an iterator-like function, used to
- * identify states (state table rows) that can be eliminated.
- * @param states in/out parameter, specifies where to start looking for duplicates,
- * and returns the first pair of duplicates found, if any.
- * @return true if duplicate states were found, false otherwise.
- */
- bool findDuplicateState(IntPair *states);
- /** Remove a duplicate state.
- * @param duplStates The duplicate states. The first is kept, the second is removed.
- * All references to the second in the state table are retargeted
- * to the first.
- */
- void removeState(IntPair duplStates);
- /** Find the next duplicate state in the safe reverse table. An iterator function.
- * @param states in/out parameter, specifies where to start looking for duplicates,
- * and returns the first pair of duplicates found, if any.
- * @return true if a duplicate pair of states was found.
- */
- bool findDuplicateSafeState(IntPair *states);
- /** Remove a duplicate state from the safe table.
- * @param duplStates The duplicate states. The first is kept, the second is removed.
- * All references to the second in the state table are retargeted
- * to the first.
- */
- void removeSafeState(IntPair duplStates);
- // Set functions for UVector.
- // TODO: make a USet subclass of UVector
- void setAdd(UVector *dest, UVector *source);
- UBool setEquals(UVector *a, UVector *b);
- void sortedAdd(UVector **dest, int32_t val);
- public:
- #ifdef RBBI_DEBUG
- void printSet(UVector *s);
- void printPosSets(RBBINode *n /* = NULL*/);
- void printStates();
- void printRuleStatusTable();
- void printReverseTable();
- #else
- #define printSet(s)
- #define printPosSets(n)
- #define printStates()
- #define printRuleStatusTable()
- #define printReverseTable()
- #endif
- private:
- RBBIRuleBuilder *fRB;
- RBBINode *&fTree; // The root node of the parse tree to build a
- // table for.
- UErrorCode *fStatus;
- /** State Descriptors, UVector<RBBIStateDescriptor> */
- UVector *fDStates; // D states (Aho's terminology)
- // Index is state number
- // Contents are RBBIStateDescriptor pointers.
- /** Synthesized safe table, UVector of UnicodeString, one string per table row. */
- UVector *fSafeTable;
- /** Map from rule number (fVal in look ahead nodes) to sequential lookahead index. */
- UVector32 *fLookAheadRuleMap = nullptr;
- RBBITableBuilder(const RBBITableBuilder &other); // forbid copying of this class
- RBBITableBuilder &operator=(const RBBITableBuilder &other); // forbid copying of this class
- };
- //
- // RBBIStateDescriptor - The DFA is constructed as a set of these descriptors,
- // one for each state.
- class RBBIStateDescriptor : public UMemory {
- public:
- UBool fMarked;
- int32_t fAccepting;
- int32_t fLookAhead;
- UVector *fTagVals;
- int32_t fTagsIdx;
- UVector *fPositions; // Set of parse tree positions associated
- // with this state. Unordered (it's a set).
- // UVector contents are RBBINode *
- UVector32 *fDtran; // Transitions out of this state.
- // indexed by input character
- // contents is int index of dest state
- // in RBBITableBuilder.fDStates
- RBBIStateDescriptor(int maxInputSymbol, UErrorCode *fStatus);
- ~RBBIStateDescriptor();
- private:
- RBBIStateDescriptor(const RBBIStateDescriptor &other); // forbid copying of this class
- RBBIStateDescriptor &operator=(const RBBIStateDescriptor &other); // forbid copying of this class
- };
- U_NAMESPACE_END
- #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
- #endif
|