// // Copyright (c) 2020 Krystian Stasiowski (sdkrystian@gmail.com) // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) // // Official repository: https://github.com/boostorg/json // #ifndef BOOST_JSON_DETAIL_UTF8_HPP #define BOOST_JSON_DETAIL_UTF8_HPP #include #include #include BOOST_JSON_NS_BEGIN namespace detail { template std::uint32_t load_little_endian(void const* p) { // VFALCO do we need to initialize this to 0? std::uint32_t v; std::memcpy(&v, p, N); #ifdef BOOST_JSON_BIG_ENDIAN v = ((v & 0xFF000000) >> 24) | ((v & 0x00FF0000) >> 8) | ((v & 0x0000FF00) << 8) | ((v & 0x000000FF) << 24); #endif return v; } inline uint16_t classify_utf8(char c) { // 0x000 = invalid // 0x102 = 2 bytes, second byte [80, BF] // 0x203 = 3 bytes, second byte [A0, BF] // 0x303 = 3 bytes, second byte [80, BF] // 0x403 = 3 bytes, second byte [80, 9F] // 0x504 = 4 bytes, second byte [90, BF] // 0x604 = 4 bytes, second byte [80, BF] // 0x704 = 4 bytes, second byte [80, 8F] static constexpr uint16_t first[128] { 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x102, 0x203, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x303, 0x403, 0x303, 0x303, 0x504, 0x604, 0x604, 0x604, 0x704, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, 0x000, }; return first[static_cast(c)]; } inline bool is_valid_utf8(const char* p, uint16_t first) { uint32_t v; switch(first >> 8) { default: return false; // 2 bytes, second byte [80, BF] case 1: v = load_little_endian<2>(p); return (v & 0xC000) == 0x8000; // 3 bytes, second byte [A0, BF] case 2: v = load_little_endian<3>(p); std::memcpy(&v, p, 3); return (v & 0xC0E000) == 0x80A000; // 3 bytes, second byte [80, BF] case 3: v = load_little_endian<3>(p); return (v & 0xC0C000) == 0x808000; // 3 bytes, second byte [80, 9F] case 4: v = load_little_endian<3>(p); return (v & 0xC0E000) == 0x808000; // 4 bytes, second byte [90, BF] case 5: v = load_little_endian<4>(p); return (v & 0xC0C0FF00) + 0x7F7F7000 <= 0x2F00; // 4 bytes, second byte [80, BF] case 6: v = load_little_endian<4>(p); return (v & 0xC0C0C000) == 0x80808000; // 4 bytes, second byte [80, 8F] case 7: v = load_little_endian<4>(p); return (v & 0xC0C0F000) == 0x80808000; } } class utf8_sequence { char seq_[4]; uint16_t first_; uint8_t size_; public: void save( const char* p, std::size_t remain) noexcept { first_ = classify_utf8(*p & 0x7F); if(remain >= length()) size_ = length(); else size_ = static_cast(remain); std::memcpy(seq_, p, size_); } uint8_t length() const noexcept { return first_ & 0xFF; } bool complete() const noexcept { return size_ >= length(); } // returns true if complete bool append( const char* p, std::size_t remain) noexcept { if(BOOST_JSON_UNLIKELY(needed() == 0)) return true; if(BOOST_JSON_LIKELY(remain >= needed())) { std::memcpy( seq_ + size_, p, needed()); size_ = length(); return true; } if(BOOST_JSON_LIKELY(remain > 0)) { std::memcpy(seq_ + size_, p, remain); size_ += static_cast(remain); } return false; } const char* data() const noexcept { return seq_; } uint8_t needed() const noexcept { return length() - size_; } bool valid() const noexcept { BOOST_ASSERT(size_ >= length()); return is_valid_utf8(seq_, first_); } }; } // detail BOOST_JSON_NS_END #endif