unicode_iterator.hpp 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864
  1. /*
  2. *
  3. * Copyright (c) 2004
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  11. /*
  12. * LOCATION: see http://www.boost.org for most recent version.
  13. * FILE unicode_iterator.hpp
  14. * VERSION see <boost/version.hpp>
  15. * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
  16. */
  17. /****************************************************************************
  18. Contents:
  19. ~~~~~~~~~
  20. 1) Read Only, Input Adapters:
  21. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  22. template <class BaseIterator, class U8Type = std::uint8_t>
  23. class u32_to_u8_iterator;
  24. Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
  25. template <class BaseIterator, class U32Type = std::uint32_t>
  26. class u8_to_u32_iterator;
  27. Adapts sequence of UTF-8 code units to "look like" a sequence of UTF-32.
  28. template <class BaseIterator, class U16Type = std::uint16_t>
  29. class u32_to_u16_iterator;
  30. Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
  31. template <class BaseIterator, class U32Type = std::uint32_t>
  32. class u16_to_u32_iterator;
  33. Adapts sequence of UTF-16 code units to "look like" a sequence of UTF-32.
  34. 2) Single pass output iterator adapters:
  35. template <class BaseIterator>
  36. class utf8_output_iterator;
  37. Accepts UTF-32 code points and forwards them on as UTF-8 code units.
  38. template <class BaseIterator>
  39. class utf16_output_iterator;
  40. Accepts UTF-32 code points and forwards them on as UTF-16 code units.
  41. ****************************************************************************/
  42. #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
  43. #define BOOST_REGEX_UNICODE_ITERATOR_HPP
  44. #include <cstdint>
  45. #include <boost/regex/config.hpp>
  46. #include <stdexcept>
  47. #include <sstream>
  48. #include <ios>
  49. #include <limits.h> // CHAR_BIT
  50. #include <iostream>
  51. #ifndef BOOST_REGEX_STANDALONE
  52. #include <boost/throw_exception.hpp>
  53. #endif
  54. namespace boost{
  55. namespace detail{
  // Added to (code point >> 10) to form a high surrogate: for code points in
  // 0x10000..0x10FFFF the sum lies in 0xD800..0xDBFF.
  static constexpr std::uint16_t high_surrogate_base = 0xD7C0u;
  // Added to the low ten bits of a code point to form a low surrogate.
  static constexpr std::uint16_t low_surrogate_base = 0xDC00u;
  // Selects the low ten bits of a value.
  static constexpr std::uint32_t ten_bit_mask = 0x3FFu;
  // True when v is a UTF-16 high (leading) surrogate, i.e. 0xD800..0xDBFF.
  inline bool is_high_surrogate(std::uint16_t v)
  {
    // Dropping the low ten bits of any high surrogate leaves exactly 0xD800.
    const unsigned top_six_bits = v & 0xFC00u;
    return top_six_bits == 0xD800u;
  }
  // True when v is a UTF-16 low (trailing) surrogate, i.e. 0xDC00..0xDFFF.
  inline bool is_low_surrogate(std::uint16_t v)
  {
    // Dropping the low ten bits of any low surrogate leaves exactly 0xDC00.
    const unsigned top_six_bits = v & 0xFC00u;
    return top_six_bits == 0xDC00u;
  }
  // True when v lies in the surrogate range 0xD800..0xDFFF (high or low).
  // The mask keeps every bit above the low eleven, so wider values that
  // merely share the low bits of a surrogate are not matched.
  template <class T>
  inline bool is_surrogate(T v)
  {
    const auto upper_bits = v & 0xFFFFF800u;
    return upper_bits == 0xd800;
  }
  // Returns the length in bytes (1..4) of the UTF-8 sequence introduced by
  // the byte c.  The length is the number of leading one bits in c, with
  // zero leading ones counted as a single byte and anything above four
  // clamped to four.
  inline unsigned utf8_byte_count(std::uint8_t c)
  {
    unsigned leading_ones = 0;
    // Walk from the most significant bit down until the first zero bit
    // (or until all eight bits have been examined).
    for(unsigned bit = 0x80u; (bit != 0u) && ((c & bit) != 0u); bit >>= 1)
      ++leading_ones;
    if(leading_ones == 0)
      return 1;
    if(leading_ones > 4)
      return 4;
    return leading_ones;
  }
  85. inline unsigned utf8_trailing_byte_count(std::uint8_t c)
  86. {
  87. return utf8_byte_count(c) - 1;
  88. }
  89. #ifdef BOOST_REGEX_MSVC
  90. #pragma warning(push)
  91. #pragma warning(disable:4100)
  92. #endif
  93. #ifndef BOOST_NO_EXCEPTIONS
  94. BOOST_REGEX_NORETURN
  95. #endif
  96. inline void invalid_utf32_code_point(std::uint32_t val)
  97. {
  98. std::stringstream ss;
  99. ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
  100. std::out_of_range e(ss.str());
  101. #ifndef BOOST_REGEX_STANDALONE
  102. boost::throw_exception(e);
  103. #else
  104. throw e;
  105. #endif
  106. }
  107. #ifdef BOOST_REGEX_MSVC
  108. #pragma warning(pop)
  109. #endif
  110. } // namespace detail
  111. template <class BaseIterator, class U16Type = std::uint16_t>
  112. class u32_to_u16_iterator
  113. {
  114. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  115. static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  116. static_assert(sizeof(U16Type)*CHAR_BIT == 16, "Incorrectly sized template argument");
  117. public:
  118. typedef std::ptrdiff_t difference_type;
  119. typedef U16Type value_type;
  120. typedef value_type const* pointer;
  121. typedef value_type const reference;
  122. typedef std::bidirectional_iterator_tag iterator_category;
  123. reference operator*()const
  124. {
  125. if(m_current == 2)
  126. extract_current();
  127. return m_values[m_current];
  128. }
  129. bool operator==(const u32_to_u16_iterator& that)const
  130. {
  131. if(m_position == that.m_position)
  132. {
  133. // Both m_currents must be equal, or both even
  134. // this is the same as saying their sum must be even:
  135. return (m_current + that.m_current) & 1u ? false : true;
  136. }
  137. return false;
  138. }
  139. bool operator!=(const u32_to_u16_iterator& that)const
  140. {
  141. return !(*this == that);
  142. }
  143. u32_to_u16_iterator& operator++()
  144. {
  145. // if we have a pending read then read now, so that we know whether
  146. // to skip a position, or move to a low-surrogate:
  147. if(m_current == 2)
  148. {
  149. // pending read:
  150. extract_current();
  151. }
  152. // move to the next surrogate position:
  153. ++m_current;
  154. // if we've reached the end skip a position:
  155. if(m_values[m_current] == 0)
  156. {
  157. m_current = 2;
  158. ++m_position;
  159. }
  160. return *this;
  161. }
  162. u32_to_u16_iterator operator++(int)
  163. {
  164. u32_to_u16_iterator r(*this);
  165. ++(*this);
  166. return r;
  167. }
  168. u32_to_u16_iterator& operator--()
  169. {
  170. if(m_current != 1)
  171. {
  172. // decrementing an iterator always leads to a valid position:
  173. --m_position;
  174. extract_current();
  175. m_current = m_values[1] ? 1 : 0;
  176. }
  177. else
  178. {
  179. m_current = 0;
  180. }
  181. return *this;
  182. }
  183. u32_to_u16_iterator operator--(int)
  184. {
  185. u32_to_u16_iterator r(*this);
  186. --(*this);
  187. return r;
  188. }
  189. BaseIterator base()const
  190. {
  191. return m_position;
  192. }
  193. // construct:
  194. u32_to_u16_iterator() : m_position(), m_current(0)
  195. {
  196. m_values[0] = 0;
  197. m_values[1] = 0;
  198. m_values[2] = 0;
  199. }
  200. u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
  201. {
  202. m_values[0] = 0;
  203. m_values[1] = 0;
  204. m_values[2] = 0;
  205. }
  206. private:
  207. void extract_current()const
  208. {
  209. // begin by checking for a code point out of range:
  210. std::uint32_t v = *m_position;
  211. if(v >= 0x10000u)
  212. {
  213. if(v > 0x10FFFFu)
  214. detail::invalid_utf32_code_point(*m_position);
  215. // split into two surrogates:
  216. m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
  217. m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
  218. m_current = 0;
  219. BOOST_REGEX_ASSERT(detail::is_high_surrogate(m_values[0]));
  220. BOOST_REGEX_ASSERT(detail::is_low_surrogate(m_values[1]));
  221. }
  222. else
  223. {
  224. // 16-bit code point:
  225. m_values[0] = static_cast<U16Type>(*m_position);
  226. m_values[1] = 0;
  227. m_current = 0;
  228. // value must not be a surrogate:
  229. if(detail::is_surrogate(m_values[0]))
  230. detail::invalid_utf32_code_point(*m_position);
  231. }
  232. }
  233. BaseIterator m_position;
  234. mutable U16Type m_values[3];
  235. mutable unsigned m_current;
  236. };
  237. template <class BaseIterator, class U32Type = std::uint32_t>
  238. class u16_to_u32_iterator
  239. {
  240. // special values for pending iterator reads:
  241. static const U32Type pending_read = 0xffffffffu;
  242. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  243. static_assert(sizeof(base_value_type)*CHAR_BIT == 16, "Incorrectly sized template argument");
  244. static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  245. public:
  246. typedef std::ptrdiff_t difference_type;
  247. typedef U32Type value_type;
  248. typedef value_type const* pointer;
  249. typedef value_type const reference;
  250. typedef std::bidirectional_iterator_tag iterator_category;
  251. reference operator*()const
  252. {
  253. if(m_value == pending_read)
  254. extract_current();
  255. return m_value;
  256. }
  257. bool operator==(const u16_to_u32_iterator& that)const
  258. {
  259. return m_position == that.m_position;
  260. }
  261. bool operator!=(const u16_to_u32_iterator& that)const
  262. {
  263. return !(*this == that);
  264. }
  265. u16_to_u32_iterator& operator++()
  266. {
  267. // skip high surrogate first if there is one:
  268. if(detail::is_high_surrogate(*m_position)) ++m_position;
  269. ++m_position;
  270. m_value = pending_read;
  271. return *this;
  272. }
  273. u16_to_u32_iterator operator++(int)
  274. {
  275. u16_to_u32_iterator r(*this);
  276. ++(*this);
  277. return r;
  278. }
  279. u16_to_u32_iterator& operator--()
  280. {
  281. --m_position;
  282. // if we have a low surrogate then go back one more:
  283. if(detail::is_low_surrogate(*m_position))
  284. --m_position;
  285. m_value = pending_read;
  286. return *this;
  287. }
  288. u16_to_u32_iterator operator--(int)
  289. {
  290. u16_to_u32_iterator r(*this);
  291. --(*this);
  292. return r;
  293. }
  294. BaseIterator base()const
  295. {
  296. return m_position;
  297. }
  298. // construct:
  299. u16_to_u32_iterator() : m_position()
  300. {
  301. m_value = pending_read;
  302. }
  303. u16_to_u32_iterator(BaseIterator b) : m_position(b)
  304. {
  305. m_value = pending_read;
  306. }
  307. //
  308. // Range checked version:
  309. //
  310. u16_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
  311. {
  312. m_value = pending_read;
  313. //
  314. // The range must not start with a low surrogate, or end in a high surrogate,
  315. // otherwise we run the risk of running outside the underlying input range.
  316. // Likewise b must not be located at a low surrogate.
  317. //
  318. std::uint16_t val;
  319. if(start != end)
  320. {
  321. if((b != start) && (b != end))
  322. {
  323. val = *b;
  324. if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
  325. invalid_code_point(val);
  326. }
  327. val = *start;
  328. if(detail::is_surrogate(val) && ((val & 0xFC00u) == 0xDC00u))
  329. invalid_code_point(val);
  330. val = *--end;
  331. if(detail::is_high_surrogate(val))
  332. invalid_code_point(val);
  333. }
  334. }
  335. private:
  336. static void invalid_code_point(std::uint16_t val)
  337. {
  338. std::stringstream ss;
  339. ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
  340. std::out_of_range e(ss.str());
  341. #ifndef BOOST_REGEX_STANDALONE
  342. boost::throw_exception(e);
  343. #else
  344. throw e;
  345. #endif
  346. }
  347. void extract_current()const
  348. {
  349. m_value = static_cast<U32Type>(static_cast< std::uint16_t>(*m_position));
  350. // if the last value is a high surrogate then adjust m_position and m_value as needed:
  351. if(detail::is_high_surrogate(*m_position))
  352. {
  353. // precondition; next value must have be a low-surrogate:
  354. BaseIterator next(m_position);
  355. std::uint16_t t = *++next;
  356. if((t & 0xFC00u) != 0xDC00u)
  357. invalid_code_point(t);
  358. m_value = (m_value - detail::high_surrogate_base) << 10;
  359. m_value |= (static_cast<U32Type>(static_cast< std::uint16_t>(t)) & detail::ten_bit_mask);
  360. }
  361. // postcondition; result must not be a surrogate:
  362. if(detail::is_surrogate(m_value))
  363. invalid_code_point(static_cast< std::uint16_t>(m_value));
  364. }
  365. BaseIterator m_position;
  366. mutable U32Type m_value;
  367. };
  368. template <class BaseIterator, class U8Type = std::uint8_t>
  369. class u32_to_u8_iterator
  370. {
  371. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  372. static_assert(sizeof(base_value_type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  373. static_assert(sizeof(U8Type)*CHAR_BIT == 8, "Incorrectly sized template argument");
  374. public:
  375. typedef std::ptrdiff_t difference_type;
  376. typedef U8Type value_type;
  377. typedef value_type const* pointer;
  378. typedef value_type const reference;
  379. typedef std::bidirectional_iterator_tag iterator_category;
  380. reference operator*()const
  381. {
  382. if(m_current == 4)
  383. extract_current();
  384. return m_values[m_current];
  385. }
  386. bool operator==(const u32_to_u8_iterator& that)const
  387. {
  388. if(m_position == that.m_position)
  389. {
  390. // either the m_current's must be equal, or one must be 0 and
  391. // the other 4: which means neither must have bits 1 or 2 set:
  392. return (m_current == that.m_current)
  393. || (((m_current | that.m_current) & 3) == 0);
  394. }
  395. return false;
  396. }
  397. bool operator!=(const u32_to_u8_iterator& that)const
  398. {
  399. return !(*this == that);
  400. }
  401. u32_to_u8_iterator& operator++()
  402. {
  403. // if we have a pending read then read now, so that we know whether
  404. // to skip a position, or move to a low-surrogate:
  405. if(m_current == 4)
  406. {
  407. // pending read:
  408. extract_current();
  409. }
  410. // move to the next surrogate position:
  411. ++m_current;
  412. // if we've reached the end skip a position:
  413. if(m_values[m_current] == 0)
  414. {
  415. m_current = 4;
  416. ++m_position;
  417. }
  418. return *this;
  419. }
  420. u32_to_u8_iterator operator++(int)
  421. {
  422. u32_to_u8_iterator r(*this);
  423. ++(*this);
  424. return r;
  425. }
  426. u32_to_u8_iterator& operator--()
  427. {
  428. if((m_current & 3) == 0)
  429. {
  430. --m_position;
  431. extract_current();
  432. m_current = 3;
  433. while(m_current && (m_values[m_current] == 0))
  434. --m_current;
  435. }
  436. else
  437. --m_current;
  438. return *this;
  439. }
  440. u32_to_u8_iterator operator--(int)
  441. {
  442. u32_to_u8_iterator r(*this);
  443. --(*this);
  444. return r;
  445. }
  446. BaseIterator base()const
  447. {
  448. return m_position;
  449. }
  450. // construct:
  451. u32_to_u8_iterator() : m_position(), m_current(0)
  452. {
  453. m_values[0] = 0;
  454. m_values[1] = 0;
  455. m_values[2] = 0;
  456. m_values[3] = 0;
  457. m_values[4] = 0;
  458. }
  459. u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
  460. {
  461. m_values[0] = 0;
  462. m_values[1] = 0;
  463. m_values[2] = 0;
  464. m_values[3] = 0;
  465. m_values[4] = 0;
  466. }
  467. private:
  468. void extract_current()const
  469. {
  470. std::uint32_t c = *m_position;
  471. if(c > 0x10FFFFu)
  472. detail::invalid_utf32_code_point(c);
  473. if(c < 0x80u)
  474. {
  475. m_values[0] = static_cast<unsigned char>(c);
  476. m_values[1] = static_cast<unsigned char>(0u);
  477. m_values[2] = static_cast<unsigned char>(0u);
  478. m_values[3] = static_cast<unsigned char>(0u);
  479. }
  480. else if(c < 0x800u)
  481. {
  482. m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
  483. m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  484. m_values[2] = static_cast<unsigned char>(0u);
  485. m_values[3] = static_cast<unsigned char>(0u);
  486. }
  487. else if(c < 0x10000u)
  488. {
  489. m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
  490. m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  491. m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  492. m_values[3] = static_cast<unsigned char>(0u);
  493. }
  494. else
  495. {
  496. m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
  497. m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
  498. m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  499. m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  500. }
  501. m_current= 0;
  502. }
  503. BaseIterator m_position;
  504. mutable U8Type m_values[5];
  505. mutable unsigned m_current;
  506. };
  507. template <class BaseIterator, class U32Type = std::uint32_t>
  508. class u8_to_u32_iterator
  509. {
  510. // special values for pending iterator reads:
  511. static const U32Type pending_read = 0xffffffffu;
  512. typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
  513. static_assert(sizeof(base_value_type)*CHAR_BIT == 8, "Incorrectly sized template argument");
  514. static_assert(sizeof(U32Type)*CHAR_BIT == 32, "Incorrectly sized template argument");
  515. public:
  516. typedef std::ptrdiff_t difference_type;
  517. typedef U32Type value_type;
  518. typedef value_type const* pointer;
  519. typedef value_type const reference;
  520. typedef std::bidirectional_iterator_tag iterator_category;
  521. reference operator*()const
  522. {
  523. if(m_value == pending_read)
  524. extract_current();
  525. return m_value;
  526. }
  527. bool operator==(const u8_to_u32_iterator& that)const
  528. {
  529. return m_position == that.m_position;
  530. }
  531. bool operator!=(const u8_to_u32_iterator& that)const
  532. {
  533. return !(*this == that);
  534. }
  535. u8_to_u32_iterator& operator++()
  536. {
  537. // We must not start with a continuation character:
  538. if((static_cast<std::uint8_t>(*m_position) & 0xC0) == 0x80)
  539. invalid_sequence();
  540. // skip high surrogate first if there is one:
  541. unsigned c = detail::utf8_byte_count(*m_position);
  542. if(m_value == pending_read)
  543. {
  544. // Since we haven't read in a value, we need to validate the code points:
  545. for(unsigned i = 0; i < c; ++i)
  546. {
  547. ++m_position;
  548. // We must have a continuation byte:
  549. if((i != c - 1) && ((static_cast<std::uint8_t>(*m_position) & 0xC0) != 0x80))
  550. invalid_sequence();
  551. }
  552. }
  553. else
  554. {
  555. std::advance(m_position, c);
  556. }
  557. m_value = pending_read;
  558. return *this;
  559. }
  560. u8_to_u32_iterator operator++(int)
  561. {
  562. u8_to_u32_iterator r(*this);
  563. ++(*this);
  564. return r;
  565. }
  566. u8_to_u32_iterator& operator--()
  567. {
  568. // Keep backtracking until we don't have a trailing character:
  569. unsigned count = 0;
  570. while((*--m_position & 0xC0u) == 0x80u) ++count;
  571. // now check that the sequence was valid:
  572. if(count != detail::utf8_trailing_byte_count(*m_position))
  573. invalid_sequence();
  574. m_value = pending_read;
  575. return *this;
  576. }
  577. u8_to_u32_iterator operator--(int)
  578. {
  579. u8_to_u32_iterator r(*this);
  580. --(*this);
  581. return r;
  582. }
  583. BaseIterator base()const
  584. {
  585. return m_position;
  586. }
  587. // construct:
  588. u8_to_u32_iterator() : m_position()
  589. {
  590. m_value = pending_read;
  591. }
  592. u8_to_u32_iterator(BaseIterator b) : m_position(b)
  593. {
  594. m_value = pending_read;
  595. }
  596. //
  597. // Checked constructor:
  598. //
  599. u8_to_u32_iterator(BaseIterator b, BaseIterator start, BaseIterator end) : m_position(b)
  600. {
  601. m_value = pending_read;
  602. //
  603. // We must not start with a continuation character, or end with a
  604. // truncated UTF-8 sequence otherwise we run the risk of going past
  605. // the start/end of the underlying sequence:
  606. //
  607. if(start != end)
  608. {
  609. unsigned char v = *start;
  610. if((v & 0xC0u) == 0x80u)
  611. invalid_sequence();
  612. if((b != start) && (b != end) && ((*b & 0xC0u) == 0x80u))
  613. invalid_sequence();
  614. BaseIterator pos = end;
  615. do
  616. {
  617. v = *--pos;
  618. }
  619. while((start != pos) && ((v & 0xC0u) == 0x80u));
  620. std::ptrdiff_t extra = detail::utf8_byte_count(v);
  621. if(std::distance(pos, end) < extra)
  622. invalid_sequence();
  623. }
  624. }
  625. private:
  626. static void invalid_sequence()
  627. {
  628. std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
  629. #ifndef BOOST_REGEX_STANDALONE
  630. boost::throw_exception(e);
  631. #else
  632. throw e;
  633. #endif
  634. }
  635. void extract_current()const
  636. {
  637. m_value = static_cast<U32Type>(static_cast< std::uint8_t>(*m_position));
  638. // we must not have a continuation character:
  639. if((m_value & 0xC0u) == 0x80u)
  640. invalid_sequence();
  641. // see how many extra bytes we have:
  642. unsigned extra = detail::utf8_trailing_byte_count(*m_position);
  643. // extract the extra bits, 6 from each extra byte:
  644. BaseIterator next(m_position);
  645. for(unsigned c = 0; c < extra; ++c)
  646. {
  647. ++next;
  648. m_value <<= 6;
  649. // We must have a continuation byte:
  650. if((static_cast<std::uint8_t>(*next) & 0xC0) != 0x80)
  651. invalid_sequence();
  652. m_value += static_cast<std::uint8_t>(*next) & 0x3Fu;
  653. }
  654. // we now need to remove a few of the leftmost bits, but how many depends
  655. // upon how many extra bytes we've extracted:
  656. static const std::uint32_t masks[4] =
  657. {
  658. 0x7Fu,
  659. 0x7FFu,
  660. 0xFFFFu,
  661. 0x1FFFFFu,
  662. };
  663. m_value &= masks[extra];
  664. // check the result is in range:
  665. if(m_value > static_cast<U32Type>(0x10FFFFu))
  666. invalid_sequence();
  667. // The result must not be a surrogate:
  668. if((m_value >= static_cast<U32Type>(0xD800)) && (m_value <= static_cast<U32Type>(0xDFFF)))
  669. invalid_sequence();
  670. // We should not have had an invalidly encoded UTF8 sequence:
  671. if((extra > 0) && (m_value <= static_cast<U32Type>(masks[extra - 1])))
  672. invalid_sequence();
  673. }
  674. BaseIterator m_position;
  675. mutable U32Type m_value;
  676. };
  677. template <class BaseIterator>
  678. class utf16_output_iterator
  679. {
  680. public:
  681. typedef void difference_type;
  682. typedef void value_type;
  683. typedef std::uint32_t* pointer;
  684. typedef std::uint32_t& reference;
  685. typedef std::output_iterator_tag iterator_category;
  686. utf16_output_iterator(const BaseIterator& b)
  687. : m_position(b){}
  688. utf16_output_iterator(const utf16_output_iterator& that)
  689. : m_position(that.m_position){}
  690. utf16_output_iterator& operator=(const utf16_output_iterator& that)
  691. {
  692. m_position = that.m_position;
  693. return *this;
  694. }
  695. const utf16_output_iterator& operator*()const
  696. {
  697. return *this;
  698. }
  699. void operator=(std::uint32_t val)const
  700. {
  701. push(val);
  702. }
  703. utf16_output_iterator& operator++()
  704. {
  705. return *this;
  706. }
  707. utf16_output_iterator& operator++(int)
  708. {
  709. return *this;
  710. }
  711. BaseIterator base()const
  712. {
  713. return m_position;
  714. }
  715. private:
  716. void push(std::uint32_t v)const
  717. {
  718. if(v >= 0x10000u)
  719. {
  720. // begin by checking for a code point out of range:
  721. if(v > 0x10FFFFu)
  722. detail::invalid_utf32_code_point(v);
  723. // split into two surrogates:
  724. *m_position++ = static_cast<std::uint16_t>(v >> 10) + detail::high_surrogate_base;
  725. *m_position++ = static_cast<std::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
  726. }
  727. else
  728. {
  729. // 16-bit code point:
  730. // value must not be a surrogate:
  731. if(detail::is_surrogate(v))
  732. detail::invalid_utf32_code_point(v);
  733. *m_position++ = static_cast<std::uint16_t>(v);
  734. }
  735. }
  736. mutable BaseIterator m_position;
  737. };
  738. template <class BaseIterator>
  739. class utf8_output_iterator
  740. {
  741. public:
  742. typedef void difference_type;
  743. typedef void value_type;
  744. typedef std::uint32_t* pointer;
  745. typedef std::uint32_t& reference;
  746. typedef std::output_iterator_tag iterator_category;
  747. utf8_output_iterator(const BaseIterator& b)
  748. : m_position(b){}
  749. utf8_output_iterator(const utf8_output_iterator& that)
  750. : m_position(that.m_position){}
  751. utf8_output_iterator& operator=(const utf8_output_iterator& that)
  752. {
  753. m_position = that.m_position;
  754. return *this;
  755. }
  756. const utf8_output_iterator& operator*()const
  757. {
  758. return *this;
  759. }
  760. void operator=(std::uint32_t val)const
  761. {
  762. push(val);
  763. }
  764. utf8_output_iterator& operator++()
  765. {
  766. return *this;
  767. }
  768. utf8_output_iterator& operator++(int)
  769. {
  770. return *this;
  771. }
  772. BaseIterator base()const
  773. {
  774. return m_position;
  775. }
  776. private:
  777. void push(std::uint32_t c)const
  778. {
  779. if(c > 0x10FFFFu)
  780. detail::invalid_utf32_code_point(c);
  781. if(c < 0x80u)
  782. {
  783. *m_position++ = static_cast<unsigned char>(c);
  784. }
  785. else if(c < 0x800u)
  786. {
  787. *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
  788. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  789. }
  790. else if(c < 0x10000u)
  791. {
  792. *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
  793. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  794. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  795. }
  796. else
  797. {
  798. *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
  799. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
  800. *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
  801. *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
  802. }
  803. }
  804. mutable BaseIterator m_position;
  805. };
  806. } // namespace boost
  807. #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP