dictbe.h 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /**
  4. *******************************************************************************
  5. * Copyright (C) 2006-2014, International Business Machines Corporation *
  6. * and others. All Rights Reserved. *
  7. *******************************************************************************
  8. */
  9. #ifndef DICTBE_H
  10. #define DICTBE_H
  11. #include "unicode/utypes.h"
  12. #include "unicode/uniset.h"
  13. #include "unicode/utext.h"
  14. #include "brkeng.h"
  15. #include "uvectr32.h"
  16. U_NAMESPACE_BEGIN
  17. class DictionaryMatcher;
  18. class Normalizer2;
  19. /*******************************************************************
  20. * DictionaryBreakEngine
  21. */
  22. /**
  23. * <p>DictionaryBreakEngine is a kind of LanguageBreakEngine that uses a
  24. * dictionary to determine language-specific breaks.</p>
  25. *
  26. * <p>After it is constructed a DictionaryBreakEngine may be shared between
  27. * threads without synchronization.</p>
  28. */
  29. class DictionaryBreakEngine : public LanguageBreakEngine {
  30. private:
  31. /**
  32. * The set of characters handled by this engine
  33. * @internal
  34. */
  35. UnicodeSet fSet;
  36. public:
  37. /**
  38. * <p>Constructor. Takes no arguments.</p>
  39. */
  40. DictionaryBreakEngine();
  41. /**
  42. * <p>Virtual destructor.</p>
  43. */
  44. virtual ~DictionaryBreakEngine();
  45. /**
  46. * <p>Indicate whether this engine handles a particular character.
  47. * Only the character is passed in; there is no break-type parameter.</p>
  48. *
  49. * @param c A character which begins a run that the engine might handle
  50. * @return TRUE if this engine handles the particular character,
  51. * FALSE otherwise.
  52. */
  53. virtual UBool handles(UChar32 c) const;
  54. /**
  55. * <p>Find any breaks within a run in the supplied text.</p>
  56. *
  57. * @param text A UText representing the text. The iterator is left at
  58. * the end of the run of characters which the engine is capable of handling
  59. * that starts from the first character in the range.
  60. * @param startPos The start of the run within the supplied text.
  61. * @param endPos The end of the run within the supplied text.
  62. * @param foundBreaks A UVector32 (vector of int32_t) that receives the break positions
  63. * @return The number of breaks found.
  64. */
  65. virtual int32_t findBreaks( UText *text,
  66. int32_t startPos,
  67. int32_t endPos,
  68. UVector32 &foundBreaks ) const;
  69. protected:
  70. /**
  71. * <p>Set the character set handled by this engine.</p>
  72. *
  73. * @param set A UnicodeSet of the set of characters handled by the engine
  74. */
  75. virtual void setCharacters( const UnicodeSet &set );
  76. /**
  77. * <p>Divide up a range of known dictionary characters handled by this break engine.
  78. * Pure virtual: each concrete subclass must provide the implementation.</p>
  79. * @param text A UText representing the text
  80. * @param rangeStart The start of the range of dictionary characters
  81. * @param rangeEnd The end of the range of dictionary characters
  82. * @param foundBreaks A UVector32 that receives the break positions
  83. * @return The number of breaks found
  84. */
  85. virtual int32_t divideUpDictionaryRange( UText *text,
  86. int32_t rangeStart,
  87. int32_t rangeEnd,
  88. UVector32 &foundBreaks ) const = 0;
  89. };
  90. /*******************************************************************
  91. * ThaiBreakEngine
  92. */
  93. /**
  94. * <p>ThaiBreakEngine is a kind of DictionaryBreakEngine that uses a
  95. * dictionary and heuristics to determine Thai-specific breaks.</p>
  96. *
  97. * <p>After it is constructed a ThaiBreakEngine may be shared between
  98. * threads without synchronization.</p>
  99. */
  100. class ThaiBreakEngine : public DictionaryBreakEngine {
  101. private:
  102. /**
  103. * Character sets and the dictionary matcher held by this engine
  104. * @internal
  105. */
  106. UnicodeSet fThaiWordSet;
  107. UnicodeSet fEndWordSet;
  108. UnicodeSet fBeginWordSet;
  109. UnicodeSet fSuffixSet;
  110. UnicodeSet fMarkSet;
  111. DictionaryMatcher *fDictionary; // NOTE(review): presumably the adopted dictionary; no copy constructor/assignment is declared here - confirm engines are never copied
  112. public:
  113. /**
  114. * <p>Constructor (not a default constructor: both arguments are required).</p>
  115. *
  116. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
  117. * @param status A UErrorCode passed by reference; presumably reports construction failure (TODO confirm in the implementation)
  118. */
  119. ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  120. /**
  121. * <p>Virtual destructor.</p>
  122. */
  123. virtual ~ThaiBreakEngine();
  124. protected:
  125. /**
  126. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  127. * Implements the pure virtual function declared in DictionaryBreakEngine.
  128. * @param text A UText representing the text
  129. * @param rangeStart The start of the range of dictionary characters
  130. * @param rangeEnd The end of the range of dictionary characters
  131. * @param foundBreaks A UVector32 that receives the break positions
  132. * @return The number of breaks found
  133. */
  134. virtual int32_t divideUpDictionaryRange( UText *text,
  135. int32_t rangeStart,
  136. int32_t rangeEnd,
  137. UVector32 &foundBreaks ) const;
  138. };
  139. /*******************************************************************
  140. * LaoBreakEngine
  141. */
  142. /**
  143. * <p>LaoBreakEngine is a kind of DictionaryBreakEngine that uses a
  144. * dictionary and heuristics to determine Lao-specific breaks.</p>
  145. *
  146. * <p>After it is constructed a LaoBreakEngine may be shared between
  147. * threads without synchronization.</p>
  148. */
  149. class LaoBreakEngine : public DictionaryBreakEngine {
  150. private:
  151. /**
  152. * Character sets and the dictionary matcher held by this engine
  153. * @internal
  154. */
  155. UnicodeSet fLaoWordSet;
  156. UnicodeSet fEndWordSet;
  157. UnicodeSet fBeginWordSet;
  158. UnicodeSet fMarkSet;
  159. DictionaryMatcher *fDictionary; // NOTE(review): presumably the adopted dictionary; no copy constructor/assignment is declared here - confirm engines are never copied
  160. public:
  161. /**
  162. * <p>Constructor (not a default constructor: both arguments are required).</p>
  163. *
  164. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
  165. * @param status A UErrorCode passed by reference; presumably reports construction failure (TODO confirm in the implementation)
  166. */
  167. LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  168. /**
  169. * <p>Virtual destructor.</p>
  170. */
  171. virtual ~LaoBreakEngine();
  172. protected:
  173. /**
  174. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  175. * Implements the pure virtual function declared in DictionaryBreakEngine.
  176. * @param text A UText representing the text
  177. * @param rangeStart The start of the range of dictionary characters
  178. * @param rangeEnd The end of the range of dictionary characters
  179. * @param foundBreaks A UVector32 that receives the break positions
  180. * @return The number of breaks found
  181. */
  182. virtual int32_t divideUpDictionaryRange( UText *text,
  183. int32_t rangeStart,
  184. int32_t rangeEnd,
  185. UVector32 &foundBreaks ) const;
  186. };
  187. /*******************************************************************
  188. * BurmeseBreakEngine
  189. */
  190. /**
  191. * <p>BurmeseBreakEngine is a kind of DictionaryBreakEngine that uses a
  192. * DictionaryMatcher and heuristics to determine Burmese-specific breaks.</p>
  193. *
  194. * <p>After it is constructed a BurmeseBreakEngine may be shared between
  195. * threads without synchronization.</p>
  196. */
  197. class BurmeseBreakEngine : public DictionaryBreakEngine {
  198. private:
  199. /**
  200. * Character sets and the dictionary matcher held by this engine
  201. * @internal
  202. */
  203. UnicodeSet fBurmeseWordSet;
  204. UnicodeSet fEndWordSet;
  205. UnicodeSet fBeginWordSet;
  206. UnicodeSet fMarkSet;
  207. DictionaryMatcher *fDictionary; // NOTE(review): presumably the adopted dictionary; no copy constructor/assignment is declared here - confirm engines are never copied
  208. public:
  209. /**
  210. * <p>Constructor (not a default constructor: both arguments are required).</p>
  211. *
  212. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
  213. * @param status A UErrorCode passed by reference; presumably reports construction failure (TODO confirm in the implementation)
  214. */
  215. BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  216. /**
  217. * <p>Virtual destructor.</p>
  218. */
  219. virtual ~BurmeseBreakEngine();
  220. protected:
  221. /**
  222. * <p>Divide up a range of known dictionary characters.</p>
  223. * Implements the pure virtual function declared in DictionaryBreakEngine.
  224. * @param text A UText representing the text
  225. * @param rangeStart The start of the range of dictionary characters
  226. * @param rangeEnd The end of the range of dictionary characters
  227. * @param foundBreaks A UVector32 that receives the break positions
  228. * @return The number of breaks found
  229. */
  230. virtual int32_t divideUpDictionaryRange( UText *text,
  231. int32_t rangeStart,
  232. int32_t rangeEnd,
  233. UVector32 &foundBreaks ) const;
  234. };
  235. /*******************************************************************
  236. * KhmerBreakEngine
  237. */
  238. /**
  239. * <p>KhmerBreakEngine is a kind of DictionaryBreakEngine that uses a
  240. * DictionaryMatcher and heuristics to determine Khmer-specific breaks.</p>
  241. *
  242. * <p>After it is constructed a KhmerBreakEngine may be shared between
  243. * threads without synchronization.</p>
  244. */
  245. class KhmerBreakEngine : public DictionaryBreakEngine {
  246. private:
  247. /**
  248. * Character sets and the dictionary matcher held by this engine
  249. * @internal
  250. */
  251. UnicodeSet fKhmerWordSet;
  252. UnicodeSet fEndWordSet;
  253. UnicodeSet fBeginWordSet;
  254. UnicodeSet fMarkSet;
  255. DictionaryMatcher *fDictionary; // NOTE(review): presumably the adopted dictionary; no copy constructor/assignment is declared here - confirm engines are never copied
  256. public:
  257. /**
  258. * <p>Constructor (not a default constructor: both arguments are required).</p>
  259. *
  260. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted.
  261. * @param status A UErrorCode passed by reference; presumably reports construction failure (TODO confirm in the implementation)
  262. */
  263. KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status);
  264. /**
  265. * <p>Virtual destructor.</p>
  266. */
  267. virtual ~KhmerBreakEngine();
  268. protected:
  269. /**
  270. * <p>Divide up a range of known dictionary characters.</p>
  271. * Implements the pure virtual function declared in DictionaryBreakEngine.
  272. * @param text A UText representing the text
  273. * @param rangeStart The start of the range of dictionary characters
  274. * @param rangeEnd The end of the range of dictionary characters
  275. * @param foundBreaks A UVector32 that receives the break positions
  276. * @return The number of breaks found
  277. */
  278. virtual int32_t divideUpDictionaryRange( UText *text,
  279. int32_t rangeStart,
  280. int32_t rangeEnd,
  281. UVector32 &foundBreaks ) const;
  282. };
  283. #if !UCONFIG_NO_NORMALIZATION
  284. /*******************************************************************
  285. * CjkBreakEngine
  286. */
  287. // Indicates the language/script that a CjkBreakEngine will handle; passed as the 'type' argument of the CjkBreakEngine constructor.
  288. enum LanguageType {
  289. kKorean,
  290. kChineseJapanese
  291. };
  292. /**
  293. * <p>CjkBreakEngine is a kind of DictionaryBreakEngine that uses a
  294. * dictionary with costs associated with each word and
  295. * Viterbi decoding to determine CJK-specific breaks.</p>
  296. */
  297. class CjkBreakEngine : public DictionaryBreakEngine {
  298. protected:
  299. /**
  300. * Character sets, dictionary matcher and normalizer held by this engine
  301. * @internal
  302. */
  303. UnicodeSet fHangulWordSet;
  304. UnicodeSet fHanWordSet;
  305. UnicodeSet fKatakanaWordSet;
  306. UnicodeSet fHiraganaWordSet;
  307. DictionaryMatcher *fDictionary; // NOTE(review): presumably the adopted dictionary; no copy constructor/assignment is declared here - confirm engines are never copied
  308. const Normalizer2 *nfkcNorm2;
  309. public:
  310. /**
  311. * <p>Constructor (not a default constructor: all three arguments are required).</p>
  312. *
  313. * @param adoptDictionary A DictionaryMatcher to adopt. Deleted when the engine is deleted. The DictionaryMatcher must contain costs for each word in order for the dictionary to work properly.
  314. * @param type The LanguageType (kKorean or kChineseJapanese) indicating the language/script this engine will handle
  315. * @param status A UErrorCode passed by reference; presumably reports construction failure (TODO confirm in the implementation)
  316. */
  317. CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status);
  318. /**
  319. * <p>Virtual destructor.</p>
  320. */
  321. virtual ~CjkBreakEngine();
  322. protected:
  323. /**
  324. * <p>Divide up a range of known dictionary characters handled by this break engine.</p>
  325. * Implements the pure virtual function declared in DictionaryBreakEngine.
  326. * @param text A UText representing the text
  327. * @param rangeStart The start of the range of dictionary characters
  328. * @param rangeEnd The end of the range of dictionary characters
  329. * @param foundBreaks A UVector32 that receives the break positions
  330. * @return The number of breaks found
  331. */
  332. virtual int32_t divideUpDictionaryRange( UText *text,
  333. int32_t rangeStart,
  334. int32_t rangeEnd,
  335. UVector32 &foundBreaks ) const;
  336. };
  337. #endif
  338. U_NAMESPACE_END
  339. /* DICTBE_H */
  340. #endif