break_iterator.h 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. // Copyright (c) 2011 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #ifndef BASE_I18N_BREAK_ITERATOR_H_
  5. #define BASE_I18N_BREAK_ITERATOR_H_
  6. #include <stddef.h>
  7. #include "base/i18n/base_i18n_export.h"
  8. #include "base/macros.h"
  9. #include "base/strings/string16.h"
  10. #include "base/strings/string_piece.h"
  11. // The BreakIterator class iterates through the words, word breaks, and
  12. // line breaks in a UTF-16 string.
  13. //
  14. // It provides several modes, BREAK_WORD, BREAK_LINE, BREAK_NEWLINE, and
  15. // BREAK_SENTENCE which modify how characters are aggregated into the returned
  16. // string.
  17. //
  18. // Under BREAK_WORD mode, once a word is encountered any non-word
  19. // characters are not included in the returned string (e.g. in the
  20. // UTF-16 equivalent of the string " foo bar! ", the word breaks are at
  21. // the periods in ". .foo. .bar.!. .").
  22. // Note that Chinese/Japanese/Thai do not use spaces between words so that
  23. // boundaries can fall in the middle of a continuous run of non-space /
  24. // non-punctuation characters.
  25. //
  26. // Under BREAK_LINE mode, once a line breaking opportunity is encountered,
  27. // any non-word characters are included in the returned string, breaking
  28. // only when a space-equivalent character or a line breaking opportunity
  29. // is encountered (e.g. in the UTF16-equivalent of the string " foo bar! ",
  30. // the breaks are at the periods in ". .foo .bar! .").
  31. //
// Note that lines can be broken at any character/syllable/grapheme cluster
// boundary in Chinese/Japanese/Korean and at word boundaries in Thai
// (Thai does not use spaces between words). Therefore, this is NOT the same
// as breaking only at space-equivalent characters, as its former
// name (BREAK_SPACE) implied.
  37. //
// Under BREAK_NEWLINE mode, all characters are included in the returned
// string, breaking only when a newline-equivalent character is encountered
// (e.g. in the UTF-16 equivalent of the string "foo\nbar!\n\n", the line
// breaks are at the periods in ".foo\n.bar!\n.\n.").
  42. //
// Under BREAK_SENTENCE mode, all characters are included in the returned
// string, breaking only on sentence boundaries defined in "Unicode Standard
// Annex #29: Text Segmentation." Whitespace immediately following the sentence
// is also included. For example, in the UTF-16 equivalent of the string
// "foo bar! baz qux?" the breaks are at the periods in ".foo bar! .baz qux?."
  48. //
// To extract the words from a string, move a BREAK_WORD BreakIterator
// through the string and test whether IsWord() is true. E.g.,
//   BreakIterator iter(str, BreakIterator::BREAK_WORD);
//   if (!iter.Init())
//     return false;
//   while (iter.Advance()) {
//     if (iter.IsWord()) {
//       // Region [iter.prev(), iter.pos()) contains a word.
//       VLOG(1) << "word: " << iter.GetString();
//     }
//   }
  60. namespace base {
  61. namespace i18n {
  62. class BASE_I18N_EXPORT BreakIterator {
  63. public:
  64. enum BreakType {
  65. BREAK_WORD,
  66. BREAK_LINE,
  67. // TODO(jshin): Remove this after reviewing call sites.
  68. // If call sites really need break only on space-like characters
  69. // implement it separately.
  70. BREAK_SPACE = BREAK_LINE,
  71. BREAK_NEWLINE,
  72. BREAK_CHARACTER,
  73. // But don't remove this one!
  74. RULE_BASED,
  75. BREAK_SENTENCE,
  76. };
  77. enum WordBreakStatus {
  78. // The end of text that the iterator recognizes as word characters.
  79. // Non-word characters are things like punctuation and spaces.
  80. IS_WORD_BREAK,
  81. // Characters that the iterator can skip past, such as punctuation,
  82. // whitespace, and, if using RULE_BASED mode, characters from another
  83. // character set.
  84. IS_SKIPPABLE_WORD,
  85. // Only used if not in BREAK_WORD or RULE_BASED mode. This is returned for
  86. // newlines, line breaks, and character breaks.
  87. IS_LINE_OR_CHAR_BREAK
  88. };
  89. // Requires |str| to live as long as the BreakIterator does.
  90. BreakIterator(const StringPiece16& str, BreakType break_type);
  91. // Make a rule-based iterator. BreakType == RULE_BASED is implied.
  92. // TODO(andrewhayden): This signature could easily be misinterpreted as
  93. // "(const string16& str, const string16& locale)". We should do something
  94. // better.
  95. BreakIterator(const StringPiece16& str, const string16& rules);
  96. ~BreakIterator();
  97. // Init() must be called before any of the iterators are valid.
  98. // Returns false if ICU failed to initialize.
  99. bool Init();
  100. // Advance to the next break. Returns false if we've run past the end of
  101. // the string. (Note that the very last "break" is after the final
  102. // character in the string, and when we advance to that position it's the
  103. // last time Advance() returns true.)
  104. bool Advance();
  105. // Updates the text used by the iterator, resetting the iterator as if
  106. // if Init() had been called again. Any old state is lost. Returns true
  107. // unless there is an error setting the text.
  108. bool SetText(const base::char16* text, const size_t length);
  109. // Under BREAK_WORD mode, returns true if the break we just hit is the
  110. // end of a word. (Otherwise, the break iterator just skipped over e.g.
  111. // whitespace or punctuation.) Under BREAK_LINE and BREAK_NEWLINE modes,
  112. // this distinction doesn't apply and it always returns false.
  113. bool IsWord() const;
  114. // Under BREAK_WORD mode:
  115. // - Returns IS_SKIPPABLE_WORD if non-word characters, such as punctuation or
  116. // spaces, are found.
  117. // - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence
  118. // of word characters.
  119. // Under RULE_BASED mode:
  120. // - Returns IS_SKIPPABLE_WORD if characters outside the rules' character set
  121. // or non-word characters, such as punctuation or spaces, are found.
  122. // - Returns IS_WORD_BREAK if the break we just hit is the end of a sequence
  123. // of word characters that are in the rules' character set.
  124. // Not under BREAK_WORD or RULE_BASED mode:
  125. // - Returns IS_LINE_OR_CHAR_BREAK.
  126. BreakIterator::WordBreakStatus GetWordBreakStatus() const;
  127. // Under BREAK_WORD mode, returns true if |position| is at the end of word or
  128. // at the start of word. It always returns false under modes that are not
  129. // BREAK_WORD or RULE_BASED.
  130. bool IsEndOfWord(size_t position) const;
  131. bool IsStartOfWord(size_t position) const;
  132. // Under BREAK_SENTENCE mode, returns true if |position| is at a sentence
  133. // boundary. It always returns false under modes that are not BREAK_SENTENCE
  134. // or RULE_BASED.
  135. bool IsSentenceBoundary(size_t position) const;
  136. // Under BREAK_CHARACTER mode, returns whether |position| is a Unicode
  137. // grapheme boundary.
  138. bool IsGraphemeBoundary(size_t position) const;
  139. // Returns the string between prev() and pos().
  140. // Advance() must have been called successfully at least once for pos() to
  141. // have advanced to somewhere useful.
  142. string16 GetString() const;
  143. StringPiece16 GetStringPiece() const;
  144. // Returns the value of pos() returned before Advance() was last called.
  145. size_t prev() const { return prev_; }
  146. // Returns the current break position within the string,
  147. // or BreakIterator::npos when done.
  148. size_t pos() const { return pos_; }
  149. private:
  150. // ICU iterator, avoiding ICU ubrk.h dependence.
  151. // This is actually an ICU UBreakiterator* type, which turns out to be
  152. // a typedef for a void* in the ICU headers. Using void* directly prevents
  153. // callers from needing access to the ICU public headers directory.
  154. void* iter_;
  155. // The string we're iterating over. Can be changed with SetText(...)
  156. StringPiece16 string_;
  157. // Rules for our iterator. Mutually exclusive with break_type_.
  158. const string16 rules_;
  159. // The breaking style (word/space/newline). Mutually exclusive with rules_
  160. BreakType break_type_;
  161. // Previous and current iterator positions.
  162. size_t prev_, pos_;
  163. DISALLOW_COPY_AND_ASSIGN(BreakIterator);
  164. };
  165. } // namespace i18n
  166. } // namespace base
  167. #endif // BASE_I18N_BREAK_ITERATOR_H_