utf8.hpp 4.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. //
  2. // Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. //
  7. // Official repository: https://github.com/boostorg/json
  8. //
  9. #ifndef BOOST_JSON_DETAIL_UTF8_HPP
  10. #define BOOST_JSON_DETAIL_UTF8_HPP
  11. #include <cstddef>
  12. #include <cstring>
  13. #include <cstdint>
  14. BOOST_JSON_NS_BEGIN
  15. namespace detail {
  // Loads the first N bytes at `p` as an unsigned little-endian integer.
  //
  // p[0] ends up in bits 0-7 of the result, p[1] in bits 8-15, and so on.
  // Bytes of the result beyond the N that were loaded are zero.
  //
  // N must not exceed sizeof(std::uint32_t) and `p` must point to at least
  // N readable bytes. The bytes are copied with memcpy, so `p` does not
  // have to be aligned.
  template<int N>
  std::uint32_t
  load_little_endian(void const* p)
  {
      // A larger (or negative) N would make memcpy write past `v`.
      static_assert(
          static_cast<std::size_t>(N) <= sizeof(std::uint32_t),
          "N bytes must fit in the 32-bit result");

      // Start from zero: for N < 4 memcpy leaves the remaining bytes of
      // `v` untouched, and returning them uninitialized would hand the
      // caller an indeterminate value.
      std::uint32_t v = 0;
      std::memcpy(&v, p, N);
#ifdef BOOST_JSON_BIG_ENDIAN
      // memcpy stored p[0] in the most significant byte; reverse the
      // byte order so that p[0] becomes the least significant byte.
      v = ((v & 0xFF000000) >> 24) |
          ((v & 0x00FF0000) >>  8) |
          ((v & 0x0000FF00) <<  8) |
          ((v & 0x000000FF) << 24);
#endif
      return v;
  }
  // Classifies the lead byte of a multi-byte UTF-8 sequence.
  //
  // Only the low 7 bits of `c` select the table entry, so the byte may be
  // passed unmodified or already masked with 0x7F. Entry i describes the
  // lead byte 0x80 + i. A byte below 0x80 maps to the same entry as that
  // byte plus 0x80, so this function cannot tell ASCII apart from a
  // non-ASCII lead byte.
  //
  // Returns 0 for a byte that cannot start a sequence. Otherwise the low
  // byte of the result is the total length of the sequence in bytes and
  // the high byte identifies the range required of the second byte:
  //
  // 0x000 = invalid
  // 0x102 = 2 bytes, second byte [80, BF]
  // 0x203 = 3 bytes, second byte [A0, BF]
  // 0x303 = 3 bytes, second byte [80, BF]
  // 0x403 = 3 bytes, second byte [80, 9F]
  // 0x504 = 4 bytes, second byte [90, BF]
  // 0x604 = 4 bytes, second byte [80, BF]
  // 0x704 = 4 bytes, second byte [80, 8F]
  inline
  std::uint16_t
  classify_utf8(char c)
  {
      static constexpr std::uint16_t first[128]
      {
          // 0x80..0xBF: continuation bytes, never a lead byte
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,

          // 0xC0..0xDF: C0 and C1 are invalid, the rest start 2 bytes
          0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
          0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
          0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,
          0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102,

          // 0xE0..0xEF: 3 bytes; E0 and ED restrict the second byte
          0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303,
          0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303,

          // 0xF0..0xFF: F0..F4 start 4 bytes, F5 and above are invalid
          0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000,
          0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000,
      };
      // Mask before indexing: the table has only 128 entries, so an
      // unmasked byte >= 0x80 would otherwise read past its end.
      return first[static_cast<unsigned char>(c & 0x7F)];
  }
  64. inline
  65. bool
  66. is_valid_utf8(const char* p, uint16_t first)
  67. {
  68. uint32_t v;
  69. switch(first >> 8)
  70. {
  71. default:
  72. return false;
  73. // 2 bytes, second byte [80, BF]
  74. case 1:
  75. v = load_little_endian<2>(p);
  76. return (v & 0xC000) == 0x8000;
  77. // 3 bytes, second byte [A0, BF]
  78. case 2:
  79. v = load_little_endian<3>(p);
  80. std::memcpy(&v, p, 3);
  81. return (v & 0xC0E000) == 0x80A000;
  82. // 3 bytes, second byte [80, BF]
  83. case 3:
  84. v = load_little_endian<3>(p);
  85. return (v & 0xC0C000) == 0x808000;
  86. // 3 bytes, second byte [80, 9F]
  87. case 4:
  88. v = load_little_endian<3>(p);
  89. return (v & 0xC0E000) == 0x808000;
  90. // 4 bytes, second byte [90, BF]
  91. case 5:
  92. v = load_little_endian<4>(p);
  93. return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00;
  94. // 4 bytes, second byte [80, BF]
  95. case 6:
  96. v = load_little_endian<4>(p);
  97. return (v & 0xC0C0C000) == 0x80808000;
  98. // 4 bytes, second byte [80, 8F]
  99. case 7:
  100. v = load_little_endian<4>(p);
  101. return (v & 0xC0C0F000) == 0x80808000;
  102. }
  103. }
  104. class utf8_sequence
  105. {
  106. char seq_[4];
  107. uint16_t first_;
  108. uint8_t size_;
  109. public:
  110. void
  111. save(
  112. const char* p,
  113. std::size_t remain) noexcept
  114. {
  115. first_ = classify_utf8(*p & 0x7F);
  116. if(remain >= length())
  117. size_ = length();
  118. else
  119. size_ = static_cast<uint8_t>(remain);
  120. std::memcpy(seq_, p, size_);
  121. }
  122. uint8_t
  123. length() const noexcept
  124. {
  125. return first_ & 0xFF;
  126. }
  127. bool
  128. complete() const noexcept
  129. {
  130. return size_ >= length();
  131. }
  132. // returns true if complete
  133. bool
  134. append(
  135. const char* p,
  136. std::size_t remain) noexcept
  137. {
  138. if(BOOST_JSON_UNLIKELY(needed() == 0))
  139. return true;
  140. if(BOOST_JSON_LIKELY(remain >= needed()))
  141. {
  142. std::memcpy(
  143. seq_ + size_, p, needed());
  144. size_ = length();
  145. return true;
  146. }
  147. if(BOOST_JSON_LIKELY(remain > 0))
  148. {
  149. std::memcpy(seq_ + size_, p, remain);
  150. size_ += static_cast<uint8_t>(remain);
  151. }
  152. return false;
  153. }
  154. const char*
  155. data() const noexcept
  156. {
  157. return seq_;
  158. }
  159. uint8_t
  160. needed() const noexcept
  161. {
  162. return length() - size_;
  163. }
  164. bool
  165. valid() const noexcept
  166. {
  167. BOOST_ASSERT(size_ >= length());
  168. return is_valid_utf8(seq_, first_);
  169. }
  170. };
  171. } // detail
  172. BOOST_JSON_NS_END
  173. #endif