utf.hpp 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452
  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. // Copyright (c) 2020 Alexander Grund
  4. //
  5. // Distributed under the Boost Software License, Version 1.0. (See
  6. // accompanying file LICENSE or copy at
  7. // http://www.boost.org/LICENSE_1_0.txt)
  8. //
  9. #ifndef BOOST_NOWIDE_UTF_HPP_INCLUDED
  10. #define BOOST_NOWIDE_UTF_HPP_INCLUDED
  11. #include <boost/nowide/config.hpp>
  12. #include <cstdint>
  13. namespace boost {
  14. namespace nowide {
  15. ///
  16. /// \brief Namespace that holds basic operations on UTF encoded sequences
  17. ///
  18. /// All functions defined in this namespace do not require linking with Boost.Nowide library.
  19. /// Extracted from Boost.Locale
  20. ///
  21. namespace utf {
  22. ///
  23. /// \brief The integral type that can hold a Unicode code point
  24. ///
  25. using code_point = uint32_t;
  26. ///
  27. /// \brief Special constant that defines illegal code point
  28. ///
  29. static const code_point illegal = 0xFFFFFFFFu;
  30. ///
  31. /// \brief Special constant that defines incomplete code point
  32. ///
  33. static const code_point incomplete = 0xFFFFFFFEu;
  34. ///
  35. /// \brief the function checks if \a v is a valid code point
  36. ///
  37. inline bool is_valid_codepoint(code_point v)
  38. {
  39. if(v > 0x10FFFF)
  40. return false;
  41. if(0xD800 <= v && v <= 0xDFFF) // surrogates
  42. return false;
  43. return true;
  44. }
  45. #ifdef BOOST_NOWIDE_DOXYGEN
  46. ///
  47. /// \brief UTF Traits class - functions to convert UTF sequences to and from Unicode code points
  48. ///
  49. template<typename CharType, int size = sizeof(CharType)>
  50. struct utf_traits
  51. {
  52. ///
  53. /// The type of the character
  54. ///
  55. using char_type = CharType;
  56. ///
  57. /// Read one code point from the range [p,e) and return it.
  58. ///
  59. /// - If the sequence that was read is incomplete sequence returns \ref incomplete,
  60. /// - If illegal sequence detected returns \ref illegal
  61. ///
  62. /// Requirements
  63. ///
  64. /// - Iterator is valid input iterator
  65. ///
  66. /// Postconditions
  67. ///
  68. /// - p points to the last consumed character
  69. ///
  70. template<typename Iterator>
  71. static code_point decode(Iterator& p, Iterator e);
  72. ///
  73. /// Maximal width of valid sequence in the code units:
  74. ///
  75. /// - UTF-8 - 4
  76. /// - UTF-16 - 2
  77. /// - UTF-32 - 1
  78. ///
  79. static const int max_width;
  80. ///
  81. /// The width of specific code point in the code units.
  82. ///
  83. /// Requirement: value is a valid Unicode code point
  84. /// Returns value in range [1..max_width]
  85. ///
  86. static int width(code_point value);
  87. ///
  88. /// Get the size of the trail part of variable length encoded sequence.
  89. ///
  90. /// Returns -1 if C is not valid lead character
  91. ///
  92. static int trail_length(char_type c);
  93. ///
  94. /// Returns true if c is trail code unit, always false for UTF-32
  95. ///
  96. static bool is_trail(char_type c);
  97. ///
  98. /// Returns true if c is lead code unit, always true of UTF-32
  99. ///
  100. static bool is_lead(char_type c);
  101. ///
  102. /// Convert valid Unicode code point \a value to the UTF sequence.
  103. ///
  104. /// Requirements:
  105. ///
  106. /// - \a value is valid code point
  107. /// - \a out is an output iterator should be able to accept at least width(value) units
  108. ///
  109. /// Returns the iterator past the last written code unit.
  110. ///
  111. template<typename Iterator>
  112. static Iterator encode(code_point value, Iterator out);
  113. ///
  114. /// Decodes valid UTF sequence that is pointed by p into code point.
  115. ///
  116. /// If the sequence is invalid or points to end the behavior is undefined
  117. ///
  118. template<typename Iterator>
  119. static code_point decode_valid(Iterator& p);
  120. };
  121. #else
  122. template<typename CharType, int size = sizeof(CharType)>
  123. struct utf_traits;
  124. template<typename CharType>
  125. struct utf_traits<CharType, 1>
  126. {
  127. using char_type = CharType;
  128. static int trail_length(char_type ci)
  129. {
  130. unsigned char c = ci;
  131. if(c < 128)
  132. return 0;
  133. if(BOOST_UNLIKELY(c < 194))
  134. return -1;
  135. if(c < 224)
  136. return 1;
  137. if(c < 240)
  138. return 2;
  139. if(BOOST_LIKELY(c <= 244))
  140. return 3;
  141. return -1;
  142. }
  143. static const int max_width = 4;
  144. static int width(code_point value)
  145. {
  146. if(value <= 0x7F)
  147. {
  148. return 1;
  149. } else if(value <= 0x7FF)
  150. {
  151. return 2;
  152. } else if(BOOST_LIKELY(value <= 0xFFFF))
  153. {
  154. return 3;
  155. } else
  156. {
  157. return 4;
  158. }
  159. }
  160. static bool is_trail(char_type ci)
  161. {
  162. unsigned char c = ci;
  163. return (c & 0xC0) == 0x80;
  164. }
  165. static bool is_lead(char_type ci)
  166. {
  167. return !is_trail(ci);
  168. }
  169. template<typename Iterator>
  170. static code_point decode(Iterator& p, Iterator e)
  171. {
  172. if(BOOST_UNLIKELY(p == e))
  173. return incomplete;
  174. unsigned char lead = *p++;
  175. // First byte is fully validated here
  176. int trail_size = trail_length(lead);
  177. if(BOOST_UNLIKELY(trail_size < 0))
  178. return illegal;
  179. // OK as only ASCII may be of size = 0
  180. // also optimize for ASCII text
  181. if(trail_size == 0)
  182. return lead;
  183. code_point c = lead & ((1 << (6 - trail_size)) - 1);
  184. // Read the rest
  185. unsigned char tmp;
  186. switch(trail_size)
  187. {
  188. case 3:
  189. if(BOOST_UNLIKELY(p == e))
  190. return incomplete;
  191. tmp = *p++;
  192. if(!is_trail(tmp))
  193. return illegal;
  194. c = (c << 6) | (tmp & 0x3F);
  195. BOOST_NOWIDE_FALLTHROUGH;
  196. case 2:
  197. if(BOOST_UNLIKELY(p == e))
  198. return incomplete;
  199. tmp = *p++;
  200. if(!is_trail(tmp))
  201. return illegal;
  202. c = (c << 6) | (tmp & 0x3F);
  203. BOOST_NOWIDE_FALLTHROUGH;
  204. case 1:
  205. if(BOOST_UNLIKELY(p == e))
  206. return incomplete;
  207. tmp = *p++;
  208. if(!is_trail(tmp))
  209. return illegal;
  210. c = (c << 6) | (tmp & 0x3F);
  211. }
  212. // Check code point validity:
  213. // - no surrogates and valid range
  214. // - most compact representation
  215. if(BOOST_UNLIKELY(!is_valid_codepoint(c)) || BOOST_UNLIKELY(width(c) != trail_size + 1))
  216. {
  217. p -= trail_size;
  218. return illegal;
  219. }
  220. return c;
  221. }
  222. template<typename Iterator>
  223. static code_point decode_valid(Iterator& p)
  224. {
  225. unsigned char lead = *p++;
  226. if(lead < 192)
  227. return lead;
  228. int trail_size;
  229. if(lead < 224)
  230. trail_size = 1;
  231. else if(BOOST_LIKELY(lead < 240)) // non-BMP rare
  232. trail_size = 2;
  233. else
  234. trail_size = 3;
  235. code_point c = lead & ((1 << (6 - trail_size)) - 1);
  236. switch(trail_size)
  237. {
  238. case 3: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
  239. case 2: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F); BOOST_NOWIDE_FALLTHROUGH;
  240. case 1: c = (c << 6) | (static_cast<unsigned char>(*p++) & 0x3F);
  241. }
  242. return c;
  243. }
  244. template<typename Iterator>
  245. static Iterator encode(code_point value, Iterator out)
  246. {
  247. if(value <= 0x7F)
  248. {
  249. *out++ = static_cast<char_type>(value);
  250. } else if(value <= 0x7FF)
  251. {
  252. *out++ = static_cast<char_type>((value >> 6) | 0xC0);
  253. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  254. } else if(BOOST_LIKELY(value <= 0xFFFF))
  255. {
  256. *out++ = static_cast<char_type>((value >> 12) | 0xE0);
  257. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  258. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  259. } else
  260. {
  261. *out++ = static_cast<char_type>((value >> 18) | 0xF0);
  262. *out++ = static_cast<char_type>(((value >> 12) & 0x3F) | 0x80);
  263. *out++ = static_cast<char_type>(((value >> 6) & 0x3F) | 0x80);
  264. *out++ = static_cast<char_type>((value & 0x3F) | 0x80);
  265. }
  266. return out;
  267. }
  268. }; // utf8
  269. template<typename CharType>
  270. struct utf_traits<CharType, 2>
  271. {
  272. using char_type = CharType;
  273. // See RFC 2781
  274. static bool is_first_surrogate(uint16_t x)
  275. {
  276. return 0xD800 <= x && x <= 0xDBFF;
  277. }
  278. static bool is_second_surrogate(uint16_t x)
  279. {
  280. return 0xDC00 <= x && x <= 0xDFFF;
  281. }
  282. static code_point combine_surrogate(uint16_t w1, uint16_t w2)
  283. {
  284. return ((code_point(w1 & 0x3FF) << 10) | (w2 & 0x3FF)) + 0x10000;
  285. }
  286. static int trail_length(char_type c)
  287. {
  288. if(is_first_surrogate(c))
  289. return 1;
  290. if(is_second_surrogate(c))
  291. return -1;
  292. return 0;
  293. }
  294. ///
  295. /// Returns true if c is trail code unit, always false for UTF-32
  296. ///
  297. static bool is_trail(char_type c)
  298. {
  299. return is_second_surrogate(c);
  300. }
  301. ///
  302. /// Returns true if c is lead code unit, always true of UTF-32
  303. ///
  304. static bool is_lead(char_type c)
  305. {
  306. return !is_second_surrogate(c);
  307. }
  308. template<typename It>
  309. static code_point decode(It& current, It last)
  310. {
  311. if(BOOST_UNLIKELY(current == last))
  312. return incomplete;
  313. uint16_t w1 = *current++;
  314. if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
  315. {
  316. return w1;
  317. }
  318. if(w1 > 0xDBFF)
  319. return illegal;
  320. if(current == last)
  321. return incomplete;
  322. uint16_t w2 = *current++;
  323. if(w2 < 0xDC00 || 0xDFFF < w2)
  324. return illegal;
  325. return combine_surrogate(w1, w2);
  326. }
  327. template<typename It>
  328. static code_point decode_valid(It& current)
  329. {
  330. uint16_t w1 = *current++;
  331. if(BOOST_LIKELY(w1 < 0xD800 || 0xDFFF < w1))
  332. {
  333. return w1;
  334. }
  335. uint16_t w2 = *current++;
  336. return combine_surrogate(w1, w2);
  337. }
  338. static const int max_width = 2;
  339. static int width(code_point u)
  340. {
  341. return u >= 0x10000 ? 2 : 1;
  342. }
  343. template<typename It>
  344. static It encode(code_point u, It out)
  345. {
  346. if(BOOST_LIKELY(u <= 0xFFFF))
  347. {
  348. *out++ = static_cast<char_type>(u);
  349. } else
  350. {
  351. u -= 0x10000;
  352. *out++ = static_cast<char_type>(0xD800 | (u >> 10));
  353. *out++ = static_cast<char_type>(0xDC00 | (u & 0x3FF));
  354. }
  355. return out;
  356. }
  357. }; // utf16;
  358. template<typename CharType>
  359. struct utf_traits<CharType, 4>
  360. {
  361. using char_type = CharType;
  362. static int trail_length(char_type c)
  363. {
  364. if(is_valid_codepoint(c))
  365. return 0;
  366. return -1;
  367. }
  368. static bool is_trail(char_type /*c*/)
  369. {
  370. return false;
  371. }
  372. static bool is_lead(char_type /*c*/)
  373. {
  374. return true;
  375. }
  376. template<typename It>
  377. static code_point decode_valid(It& current)
  378. {
  379. return *current++;
  380. }
  381. template<typename It>
  382. static code_point decode(It& current, It last)
  383. {
  384. if(BOOST_UNLIKELY(current == last))
  385. return incomplete;
  386. code_point c = *current++;
  387. if(BOOST_UNLIKELY(!is_valid_codepoint(c)))
  388. return illegal;
  389. return c;
  390. }
  391. static const int max_width = 1;
  392. static int width(code_point /*u*/)
  393. {
  394. return 1;
  395. }
  396. template<typename It>
  397. static It encode(code_point u, It out)
  398. {
  399. *out++ = static_cast<char_type>(u);
  400. return out;
  401. }
  402. }; // utf32
  403. #endif
  404. } // namespace utf
  405. } // namespace nowide
  406. } // namespace boost
  407. #endif