brkeng.h 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. // © 2016 and later: Unicode, Inc. and others.
  2. // License & terms of use: http://www.unicode.org/copyright.html
  3. /**
  4. ************************************************************************************
  5. * Copyright (C) 2006-2012, International Business Machines Corporation and others. *
  6. * All Rights Reserved. *
  7. ************************************************************************************
  8. */
  9. #ifndef BRKENG_H
  10. #define BRKENG_H
  11. #include "unicode/utypes.h"
  12. #include "unicode/uobject.h"
  13. #include "unicode/utext.h"
  14. #include "unicode/uscript.h"
  15. U_NAMESPACE_BEGIN
  16. class UnicodeSet;
  17. class UStack;
  18. class UVector32;
  19. class DictionaryMatcher;
  20. /*******************************************************************
  21. * LanguageBreakEngine
  22. */
  23. /**
  24. * <p>LanguageBreakEngines implement language-specific knowledge for
  25. * finding text boundaries within a run of characters belonging to a
  26. * specific set. The boundaries will be of a specific kind, e.g. word,
  27. * line, etc.</p>
  28. *
  29. * <p>LanguageBreakEngines should normally be implemented so as to
  30. * be shared between threads without locking.</p>
  31. */
  32. class LanguageBreakEngine : public UMemory {
  33. public:
  34. /**
  35. * <p>Default constructor.</p>
  36. *
  37. */
  38. LanguageBreakEngine();
  39. /**
  40. * <p>Virtual destructor.</p>
  41. */
  42. virtual ~LanguageBreakEngine();
  43. /**
  44. * <p>Indicate whether this engine handles a particular character for
  45. * a particular kind of break.</p>
  46. *
  47. * @param c A character which begins a run that the engine might handle
  48. * @return TRUE if this engine handles the particular character and break
  49. * type.
  50. */
  51. virtual UBool handles(UChar32 c) const = 0;
  52. /**
  53. * <p>Find any breaks within a run in the supplied text.</p>
  54. *
  55. * @param text A UText representing the text. The
  56. * iterator is left at the end of the run of characters which the engine
  57. * is capable of handling.
  58. * @param startPos The start of the run within the supplied text.
  59. * @param endPos The end of the run within the supplied text.
  60. * @param foundBreaks A Vector of int32_t to receive the breaks.
  61. * @return The number of breaks found.
  62. */
  63. virtual int32_t findBreaks( UText *text,
  64. int32_t startPos,
  65. int32_t endPos,
  66. UVector32 &foundBreaks ) const = 0;
  67. };
  68. /*******************************************************************
  69. * LanguageBreakFactory
  70. */
  71. /**
  72. * <p>LanguageBreakFactorys find and return a LanguageBreakEngine
  73. * that can determine breaks for characters in a specific set, if
  74. * such an object can be found.</p>
  75. *
  76. * <p>If a LanguageBreakFactory is to be shared between threads,
  77. * appropriate synchronization must be used; there is none internal
  78. * to the factory.</p>
  79. *
  80. * <p>A LanguageBreakEngine returned by a LanguageBreakFactory can
  81. * normally be shared between threads without synchronization, unless
  82. * the specific subclass of LanguageBreakFactory indicates otherwise.</p>
  83. *
  84. * <p>A LanguageBreakFactory is responsible for deleting any LanguageBreakEngine
  85. * it returns when it itself is deleted, unless the specific subclass of
  86. * LanguageBreakFactory indicates otherwise. Naturally, the factory should
  87. * not be deleted until the LanguageBreakEngines it has returned are no
  88. * longer needed.</p>
  89. */
  90. class LanguageBreakFactory : public UMemory {
  91. public:
  92. /**
  93. * <p>Default constructor.</p>
  94. *
  95. */
  96. LanguageBreakFactory();
  97. /**
  98. * <p>Virtual destructor.</p>
  99. */
  100. virtual ~LanguageBreakFactory();
  101. /**
  102. * <p>Find and return a LanguageBreakEngine that can find the desired
  103. * kind of break for the set of characters to which the supplied
  104. * character belongs. It is up to the set of available engines to
  105. * determine what the sets of characters are.</p>
  106. *
  107. * @param c A character that begins a run for which a LanguageBreakEngine is
  108. * sought.
  109. * @return A LanguageBreakEngine with the desired characteristics, or 0.
  110. */
  111. virtual const LanguageBreakEngine *getEngineFor(UChar32 c) = 0;
  112. };
  113. /*******************************************************************
  114. * UnhandledEngine
  115. */
  116. /**
  117. * <p>UnhandledEngine is a special subclass of LanguageBreakEngine that
  118. * handles characters that no other LanguageBreakEngine is available to
  119. * handle. It is told the character and the type of break; at its
  120. * discretion it may handle more than the specified character (e.g.,
  121. * the entire script to which that character belongs.</p>
  122. *
  123. * <p>UnhandledEngines may not be shared between threads without
  124. * external synchronization.</p>
  125. */
  126. class UnhandledEngine : public LanguageBreakEngine {
  127. private:
  128. /**
  129. * The sets of characters handled.
  130. * @internal
  131. */
  132. UnicodeSet *fHandled;
  133. public:
  134. /**
  135. * <p>Default constructor.</p>
  136. *
  137. */
  138. UnhandledEngine(UErrorCode &status);
  139. /**
  140. * <p>Virtual destructor.</p>
  141. */
  142. virtual ~UnhandledEngine();
  143. /**
  144. * <p>Indicate whether this engine handles a particular character for
  145. * a particular kind of break.</p>
  146. *
  147. * @param c A character which begins a run that the engine might handle
  148. * @return TRUE if this engine handles the particular character and break
  149. * type.
  150. */
  151. virtual UBool handles(UChar32 c) const;
  152. /**
  153. * <p>Find any breaks within a run in the supplied text.</p>
  154. *
  155. * @param text A UText representing the text (TODO: UText). The
  156. * iterator is left at the end of the run of characters which the engine
  157. * is capable of handling.
  158. * @param startPos The start of the run within the supplied text.
  159. * @param endPos The end of the run within the supplied text.
  160. * @param foundBreaks An allocated C array of the breaks found, if any
  161. * @return The number of breaks found.
  162. */
  163. virtual int32_t findBreaks( UText *text,
  164. int32_t startPos,
  165. int32_t endPos,
  166. UVector32 &foundBreaks ) const;
  167. /**
  168. * <p>Tell the engine to handle a particular character and break type.</p>
  169. *
  170. * @param c A character which the engine should handle
  171. */
  172. virtual void handleCharacter(UChar32 c);
  173. };
  174. /*******************************************************************
  175. * ICULanguageBreakFactory
  176. */
  177. /**
  178. * <p>ICULanguageBreakFactory is the default LanguageBreakFactory for
  179. * ICU. It creates dictionary-based LanguageBreakEngines from dictionary
  180. * data in the ICU data file.</p>
  181. */
  182. class ICULanguageBreakFactory : public LanguageBreakFactory {
  183. private:
  184. /**
  185. * The stack of break engines created by this factory
  186. * @internal
  187. */
  188. UStack *fEngines;
  189. public:
  190. /**
  191. * <p>Standard constructor.</p>
  192. *
  193. */
  194. ICULanguageBreakFactory(UErrorCode &status);
  195. /**
  196. * <p>Virtual destructor.</p>
  197. */
  198. virtual ~ICULanguageBreakFactory();
  199. /**
  200. * <p>Find and return a LanguageBreakEngine that can find the desired
  201. * kind of break for the set of characters to which the supplied
  202. * character belongs. It is up to the set of available engines to
  203. * determine what the sets of characters are.</p>
  204. *
  205. * @param c A character that begins a run for which a LanguageBreakEngine is
  206. * sought.
  207. * @return A LanguageBreakEngine with the desired characteristics, or 0.
  208. */
  209. virtual const LanguageBreakEngine *getEngineFor(UChar32 c);
  210. protected:
  211. /**
  212. * <p>Create a LanguageBreakEngine for the set of characters to which
  213. * the supplied character belongs, for the specified break type.</p>
  214. *
  215. * @param c A character that begins a run for which a LanguageBreakEngine is
  216. * sought.
  217. * @return A LanguageBreakEngine with the desired characteristics, or 0.
  218. */
  219. virtual const LanguageBreakEngine *loadEngineFor(UChar32 c);
  220. /**
  221. * <p>Create a DictionaryMatcher for the specified script and break type.</p>
  222. * @param script An ISO 15924 script code that identifies the dictionary to be
  223. * created.
  224. * @return A DictionaryMatcher with the desired characteristics, or NULL.
  225. */
  226. virtual DictionaryMatcher *loadDictionaryMatcherFor(UScriptCode script);
  227. };
  228. U_NAMESPACE_END
  229. /* BRKENG_H */
  230. #endif