// char_iterator.h
  1. // Copyright (c) 2011 The Chromium Authors. All rights reserved.
  2. // Use of this source code is governed by a BSD-style license that can be
  3. // found in the LICENSE file.
  4. #ifndef BASE_I18N_CHAR_ITERATOR_H_
  5. #define BASE_I18N_CHAR_ITERATOR_H_
  6. #include <stddef.h>
  7. #include <stdint.h>
  8. #include <string>
  9. #include "base/gtest_prod_util.h"
  10. #include "base/i18n/base_i18n_export.h"
  11. #include "base/macros.h"
  12. #include "base/strings/string16.h"
  13. #include "build/build_config.h"
  // The CharIterator classes iterate through the characters in UTF8 and
  // UTF16 strings.  Example usage:
  //
  //   UTF8CharIterator iter(&str);
  //   while (!iter.end()) {
  //     VLOG(1) << iter.get();
  //     iter.Advance();
  //   }
  // On Windows builds (OS_WIN defined), declare uint8_t as an alias for
  // unsigned char so the byte pointer used by UTF8CharIterator is available.
  // NOTE(review): <stdint.h> is already included above and normally provides
  // uint8_t; this typedef looks like a leftover for older Windows toolchains.
  // Confirm that before removing it.
  #if defined(OS_WIN)
  typedef unsigned char uint8_t;
  #endif
  25. namespace base {
  26. namespace i18n {
  27. class BASE_I18N_EXPORT UTF8CharIterator {
  28. public:
  29. // Requires |str| to live as long as the UTF8CharIterator does.
  30. explicit UTF8CharIterator(const std::string* str);
  31. ~UTF8CharIterator();
  32. // Return the starting array index of the current character within the
  33. // string.
  34. int32_t array_pos() const { return array_pos_; }
  35. // Return the logical index of the current character, independent of the
  36. // number of bytes each character takes.
  37. int32_t char_pos() const { return char_pos_; }
  38. // Return the current char.
  39. int32_t get() const { return char_; }
  40. // Returns true if we're at the end of the string.
  41. bool end() const { return array_pos_ == len_; }
  42. // Advance to the next actual character. Returns false if we're at the
  43. // end of the string.
  44. bool Advance();
  45. private:
  46. // The string we're iterating over.
  47. const uint8_t* str_;
  48. // The length of the encoded string.
  49. int32_t len_;
  50. // Array index.
  51. int32_t array_pos_;
  52. // The next array index.
  53. int32_t next_pos_;
  54. // Character index.
  55. int32_t char_pos_;
  56. // The current character.
  57. int32_t char_;
  58. DISALLOW_COPY_AND_ASSIGN(UTF8CharIterator);
  59. };
  60. class BASE_I18N_EXPORT UTF16CharIterator {
  61. public:
  62. // Requires |str| to live as long as the UTF16CharIterator does.
  63. explicit UTF16CharIterator(const string16* str);
  64. UTF16CharIterator(const char16* str, size_t str_len);
  65. UTF16CharIterator(UTF16CharIterator&& to_move);
  66. ~UTF16CharIterator();
  67. UTF16CharIterator& operator=(UTF16CharIterator&& to_move);
  68. // Returns an iterator starting on the unicode character at offset
  69. // |array_index| into the string, or the previous array offset if
  70. // |array_index| is the second half of a surrogate pair.
  71. static UTF16CharIterator LowerBound(const string16* str, size_t array_index);
  72. static UTF16CharIterator LowerBound(const char16* str,
  73. size_t str_len,
  74. size_t array_index);
  75. // Returns an iterator starting on the unicode character at offset
  76. // |array_index| into the string, or the next offset if |array_index| is the
  77. // second half of a surrogate pair.
  78. static UTF16CharIterator UpperBound(const string16* str, size_t array_index);
  79. static UTF16CharIterator UpperBound(const char16* str,
  80. size_t str_len,
  81. size_t array_index);
  82. // Return the starting array index of the current character within the
  83. // string.
  84. int32_t array_pos() const { return array_pos_; }
  85. // Returns the offset in code points from the initial iterator position, which
  86. // could be negative if Rewind() is called. The initial value is always zero,
  87. // regardless of how the iterator is constructed.
  88. int32_t char_offset() const { return char_offset_; }
  89. // Returns the code point at the current position.
  90. int32_t get() const { return char_; }
  91. // Returns the code point (i.e. the full Unicode character, not half of a
  92. // surrogate pair) following the current one. Should not be called if end() is
  93. // true. If the current code point is the last one in the string, returns
  94. // zero.
  95. int32_t NextCodePoint() const;
  96. // Returns the code point (i.e. the full Unicode character, not half of a
  97. // surrogate pair) preceding the current one. Should not be called if start()
  98. // is true.
  99. int32_t PreviousCodePoint() const;
  100. // Returns true if we're at the start of the string.
  101. bool start() const { return array_pos_ == 0; }
  102. // Returns true if we're at the end of the string.
  103. bool end() const { return array_pos_ == len_; }
  104. // Advances to the next actual character. Returns false if we're at the
  105. // end of the string.
  106. bool Advance();
  107. // Moves to the previous actual character. Returns false if we're at the start
  108. // of the string.
  109. bool Rewind();
  110. private:
  111. UTF16CharIterator(const string16* str, int32_t initial_pos);
  112. UTF16CharIterator(const char16* str, size_t str_len, int32_t initial_pos);
  113. // Fills in the current character we found and advances to the next
  114. // character, updating all flags as necessary.
  115. void ReadChar();
  116. // The string we're iterating over.
  117. const char16* str_;
  118. // The length of the encoded string.
  119. int32_t len_;
  120. // Array index.
  121. int32_t array_pos_;
  122. // The next array index.
  123. int32_t next_pos_;
  124. // Character offset from the initial position of the iterator.
  125. int32_t char_offset_;
  126. // The current character.
  127. int32_t char_;
  128. DISALLOW_COPY_AND_ASSIGN(UTF16CharIterator);
  129. };
  130. } // namespace i18n
  131. } // namespace base
  132. #endif // BASE_I18N_CHAR_ITERATOR_H_