123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402 |
- // © 2016 and later: Unicode, Inc. and others.
- // License & terms of use: http://www.unicode.org/copyright.html
- /**
- *******************************************************************************
- * Copyright (C) 2006-2014, International Business Machines Corporation *
- * and others. All Rights Reserved. *
- *******************************************************************************
- */
- #ifndef DICTBE_H
- #define DICTBE_H
- #include "unicode/utypes.h"
- #include "unicode/uniset.h"
- #include "unicode/utext.h"
- #include "brkeng.h"
- #include "uvectr32.h"
- U_NAMESPACE_BEGIN
- class DictionaryMatcher;
- class Normalizer2;
- /*******************************************************************
- * DictionaryBreakEngine
- */
- /**
- * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
- * dictionary to determine language-specific breaks.</p>
- *
- * <p>After it is constructed a DictionaryBreakEngine may be shared between
- * threads without synchronization.</p>
- */
- class DictionaryBreakEngine : public LanguageBreakEngine {
- private:
- /**
- * The set of characters handled by this engine.
- * NOTE(review): presumably assigned through setCharacters() and consulted
- * by handles(); neither definition is visible in this header, so confirm
- * against the implementation file.
- * @internal
- */
- UnicodeSet fSet;
- public:
- /**
- * <p>Default constructor. Takes no arguments.</p>
- */
- DictionaryBreakEngine();
- /**
- * <p>Virtual destructor, so that concrete engines can be destroyed through
- * a pointer to this base class.</p>
- */
- virtual ~DictionaryBreakEngine();
- /**
- * <p>Indicate whether this engine handles a particular character.</p>
- *
- * <p>The signature takes only the character; there is no separate
- * break-type argument.</p>
- *
- * @param c A character which begins a run that the engine might handle
- * @return TRUE if this engine handles the particular character.
- */
- virtual UBool handles(UChar32 c) const;
- /**
- * <p>Find any breaks within a run in the supplied text.</p>
- *
- * <p>NOTE(review): presumably delegates the actual segmentation of the run
- * to divideUpDictionaryRange(); the definition is not in this header, so
- * confirm against the implementation file.</p>
- *
- * @param text A UText representing the text. The iterator is left at
- * the end of the run of characters which the engine is capable of handling
- * that starts from the first character in the range.
- * @param startPos The start of the run within the supplied text.
- * @param endPos The end of the run within the supplied text.
- * @param foundBreaks A UVector32 (vector of int32_t) that receives the
- * break positions
- * @return The number of breaks found.
- */
- virtual int32_t findBreaks( UText *text,
- int32_t startPos,
- int32_t endPos,
- UVector32 &foundBreaks ) const;
- protected:
- /**
- * <p>Set the character set handled by this engine.</p>
- *
- * <p>Declared protected, so only this class and classes derived from it
- * can call it.</p>
- *
- * @param set A UnicodeSet of the set of characters handled by the engine
- */
- virtual void setCharacters( const UnicodeSet &set );
- /**
- * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
- *
- * <p>Pure virtual: this class provides no implementation, so every concrete
- * engine has to supply its own.</p>
- *
- * @param text A UText representing the text
- * @param rangeStart The start of the range of dictionary characters
- * @param rangeEnd The end of the range of dictionary characters
- * @param foundBreaks A UVector32 that receives the break positions. It is
- * passed by reference, so a null/0 value is not possible.
- * @return The number of breaks found
- */
- virtual int32_t divideUpDictionaryRange( UText *text,
- int32_t rangeStart,
- int32_t rangeEnd,
- UVector32 &foundBreaks ) const = 0;
- };
- /*******************************************************************
- * ThaiBreakEngine
- */
- /**
- * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
- * dictionary and heuristics to determine Thai-specific breaks.</p>
- *
- * <p>After it is constructed a ThaiBreakEngine may be shared between
- * threads without synchronization.</p>
- */
- class ThaiBreakEngine : public DictionaryBreakEngine {
- private:
- /**
- * Character sets used by this engine, followed by the adopted dictionary.
- * NOTE(review): the role of each set (Thai word characters, end-of-word,
- * begin-of-word, suffix and mark characters) is inferred from the member
- * names only; they are presumably filled in by the constructor. Confirm
- * against the implementation file.
- * fDictionary holds the DictionaryMatcher handed to the constructor.
- * @internal
- */
- UnicodeSet fThaiWordSet;
- UnicodeSet fEndWordSet;
- UnicodeSet fBeginWordSet;
- UnicodeSet fSuffixSet;
- UnicodeSet fMarkSet;
- DictionaryMatcher *fDictionary;
- public:
- /**
- * <p>Constructor. (Not a default constructor: both arguments are required.)</p>
- *
- * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
- * engine is deleted.
- * @param status A UErrorCode passed by reference.
- * NOTE(review): presumably set to a failure code when construction fails,
- * following the usual ICU convention; confirm against the implementation.
- */
- ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
- /**
- * <p>Virtual destructor.</p>
- */
- virtual ~ThaiBreakEngine();
- protected:
- /**
- * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
- *
- * <p>Declared with the same signature as the pure virtual
- * DictionaryBreakEngine::divideUpDictionaryRange(), which it implements
- * for Thai.</p>
- *
- * @param text A UText representing the text
- * @param rangeStart The start of the range of dictionary characters
- * @param rangeEnd The end of the range of dictionary characters
- * @param foundBreaks A UVector32 that receives the break positions. It is
- * passed by reference, so a null/0 value is not possible.
- * @return The number of breaks found
- */
- virtual int32_t divideUpDictionaryRange( UText *text,
- int32_t rangeStart,
- int32_t rangeEnd,
- UVector32 &foundBreaks ) const;
- };
- /*******************************************************************
- * LaoBreakEngine
- */
- /**
- * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
- * dictionary and heuristics to determine Lao-specific breaks.</p>
- *
- * <p>After it is constructed a LaoBreakEngine may be shared between
- * threads without synchronization.</p>
- */
- class LaoBreakEngine : public DictionaryBreakEngine {
- private:
- /**
- * Character sets used by this engine, followed by the adopted dictionary.
- * NOTE(review): the role of each set (Lao word characters, end-of-word,
- * begin-of-word and mark characters) is inferred from the member names
- * only; they are presumably filled in by the constructor. Confirm against
- * the implementation file.
- * fDictionary holds the DictionaryMatcher handed to the constructor.
- * @internal
- */
- UnicodeSet fLaoWordSet;
- UnicodeSet fEndWordSet;
- UnicodeSet fBeginWordSet;
- UnicodeSet fMarkSet;
- DictionaryMatcher *fDictionary;
- public:
- /**
- * <p>Constructor. (Not a default constructor: both arguments are required.)</p>
- *
- * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
- * engine is deleted.
- * @param status A UErrorCode passed by reference.
- * NOTE(review): presumably set to a failure code when construction fails,
- * following the usual ICU convention; confirm against the implementation.
- */
- LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
- /**
- * <p>Virtual destructor.</p>
- */
- virtual ~LaoBreakEngine();
- protected:
- /**
- * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
- *
- * <p>Declared with the same signature as the pure virtual
- * DictionaryBreakEngine::divideUpDictionaryRange(), which it implements
- * for Lao.</p>
- *
- * @param text A UText representing the text
- * @param rangeStart The start of the range of dictionary characters
- * @param rangeEnd The end of the range of dictionary characters
- * @param foundBreaks A UVector32 that receives the break positions. It is
- * passed by reference, so a null/0 value is not possible.
- * @return The number of breaks found
- */
- virtual int32_t divideUpDictionaryRange( UText *text,
- int32_t rangeStart,
- int32_t rangeEnd,
- UVector32 &foundBreaks ) const;
- };
- /*******************************************************************
- * BurmeseBreakEngine
- */
-
- /**
- * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
- * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
- *
- * <p>After it is constructed a BurmeseBreakEngine may be shared between
- * threads without synchronization.</p>
- */
- class BurmeseBreakEngine : public DictionaryBreakEngine {
- private:
- /**
- * Character sets used by this engine, followed by the adopted dictionary.
- * NOTE(review): the role of each set (Burmese word characters, end-of-word,
- * begin-of-word and mark characters) is inferred from the member names
- * only; they are presumably filled in by the constructor. Confirm against
- * the implementation file.
- * fDictionary holds the DictionaryMatcher handed to the constructor.
- * @internal
- */
- 
- UnicodeSet fBurmeseWordSet;
- UnicodeSet fEndWordSet;
- UnicodeSet fBeginWordSet;
- UnicodeSet fMarkSet;
- DictionaryMatcher *fDictionary;
- 
- public:
- 
- /**
- * <p>Constructor. (Not a default constructor: both arguments are required.)</p>
- *
- * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
- * engine is deleted.
- * @param status A UErrorCode passed by reference.
- * NOTE(review): presumably set to a failure code when construction fails,
- * following the usual ICU convention; confirm against the implementation.
- */
- BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
- 
- /**
- * <p>Virtual destructor.</p>
- */
- virtual ~BurmeseBreakEngine();
- 
- protected:
- /**
- * <p>Divide up a range of known dictionary characters.</p>
- *
- * <p>Declared with the same signature as the pure virtual
- * DictionaryBreakEngine::divideUpDictionaryRange(), which it implements
- * for Burmese.</p>
- *
- * @param text A UText representing the text
- * @param rangeStart The start of the range of dictionary characters
- * @param rangeEnd The end of the range of dictionary characters
- * @param foundBreaks A UVector32 that receives the break positions. It is
- * passed by reference, so a null/0 value is not possible.
- * @return The number of breaks found
- */
- virtual int32_t divideUpDictionaryRange( UText *text,
- int32_t rangeStart,
- int32_t rangeEnd,
- UVector32 &foundBreaks ) const;
- 
- };
-
- /*******************************************************************
- * KhmerBreakEngine
- */
-
- /**
- * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
- * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
- *
- * <p>After it is constructed a KhmerBreakEngine may be shared between
- * threads without synchronization.</p>
- */
- class KhmerBreakEngine : public DictionaryBreakEngine {
- private:
- /**
- * Character sets used by this engine, followed by the adopted dictionary.
- * NOTE(review): the role of each set (Khmer word characters, end-of-word,
- * begin-of-word and mark characters) is inferred from the member names
- * only; they are presumably filled in by the constructor. Confirm against
- * the implementation file.
- * fDictionary holds the DictionaryMatcher handed to the constructor.
- * @internal
- */
- 
- UnicodeSet fKhmerWordSet;
- UnicodeSet fEndWordSet;
- UnicodeSet fBeginWordSet;
- UnicodeSet fMarkSet;
- DictionaryMatcher *fDictionary;
- 
- public:
- 
- /**
- * <p>Constructor. (Not a default constructor: both arguments are required.)</p>
- *
- * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
- * engine is deleted.
- * @param status A UErrorCode passed by reference.
- * NOTE(review): presumably set to a failure code when construction fails,
- * following the usual ICU convention; confirm against the implementation.
- */
- KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
- 
- /**
- * <p>Virtual destructor.</p>
- */
- virtual ~KhmerBreakEngine();
- 
- protected:
- /**
- * <p>Divide up a range of known dictionary characters.</p>
- *
- * <p>Declared with the same signature as the pure virtual
- * DictionaryBreakEngine::divideUpDictionaryRange(), which it implements
- * for Khmer.</p>
- *
- * @param text A UText representing the text
- * @param rangeStart The start of the range of dictionary characters
- * @param rangeEnd The end of the range of dictionary characters
- * @param foundBreaks A UVector32 that receives the break positions. It is
- * passed by reference, so a null/0 value is not possible.
- * @return The number of breaks found
- */
- virtual int32_t divideUpDictionaryRange( UText *text,
- int32_t rangeStart,
- int32_t rangeEnd,
- UVector32 &foundBreaks ) const;
- 
- };
-
- #if !UCONFIG_NO_NORMALIZATION
- /*******************************************************************
- * CjkBreakEngine
- */
- // Indicates the language/script that a CjkBreakEngine will handle; passed as the `type` argument of the CjkBreakEngine constructor.
- enum LanguageType {
- kKorean, // NOTE(review): presumably selects Korean (Hangul) handling; confirm in the CjkBreakEngine constructor
- kChineseJapanese // NOTE(review): presumably selects Chinese/Japanese handling; confirm in the CjkBreakEngine constructor
- };
- /**
- * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
- * dictionary with costs associated with each word and
- * Viterbi decoding to determine CJK-specific breaks.</p>
- */
- class CjkBreakEngine : public DictionaryBreakEngine {
- protected:
- /**
- * Character sets used by this engine, followed by the adopted dictionary
- * and a normalizer pointer. Unlike the other engines in this header these
- * members are declared protected rather than private.
- * NOTE(review): the role of each set (Hangul, Han, Katakana and Hiragana
- * characters) is inferred from the member names only, and nfkcNorm2 is
- * presumably an NFKC Normalizer2 instance; how these are initialized and
- * who owns nfkcNorm2 is not visible here. Confirm against the
- * implementation file.
- * fDictionary holds the DictionaryMatcher handed to the constructor.
- * @internal
- */
- UnicodeSet fHangulWordSet;
- UnicodeSet fHanWordSet;
- UnicodeSet fKatakanaWordSet;
- UnicodeSet fHiraganaWordSet;
- DictionaryMatcher *fDictionary;
- const Normalizer2 *nfkcNorm2;
- public:
- /**
- * <p>Constructor. (Not a default constructor: all three arguments are required.)</p>
- *
- * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the
- * engine is deleted. The DictionaryMatcher must contain costs for each word
- * in order for the dictionary to work properly.
- * @param type The language/script this engine will handle: one of the
- * LanguageType values kKorean or kChineseJapanese.
- * @param status A UErrorCode passed by reference.
- * NOTE(review): presumably set to a failure code when construction fails,
- * following the usual ICU convention; confirm against the implementation.
- */
- CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
- /**
- * <p>Virtual destructor.</p>
- */
- virtual ~CjkBreakEngine();
- protected:
- /**
- * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
- *
- * <p>Declared with the same signature as the pure virtual
- * DictionaryBreakEngine::divideUpDictionaryRange(), which it implements
- * for CJK text.</p>
- *
- * @param text A UText representing the text
- * @param rangeStart The start of the range of dictionary characters
- * @param rangeEnd The end of the range of dictionary characters
- * @param foundBreaks A UVector32 that receives the break positions. It is
- * passed by reference, so a null/0 value is not possible.
- * @return The number of breaks found
- */
- virtual int32_t divideUpDictionaryRange( UText *text,
- int32_t rangeStart,
- int32_t rangeEnd,
- UVector32 &foundBreaks ) const;
- };
- #endif
- U_NAMESPACE_END
- /* DICTBE_H */
- #endif
|