123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368 |
- ///////////////////////////////////////////////////////////////////////////////
- // parse_charset.hpp
- //
- // Copyright 2008 Eric Niebler. Distributed under the Boost
- // Software License, Version 1.0. (See accompanying file
- // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
- #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
- #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
- // MS compatible compilers support #pragma once
- #if defined(_MSC_VER)
- # pragma once
- #endif
- #include <boost/config.hpp>
- #include <boost/integer.hpp>
- #include <boost/mpl/bool.hpp>
- #include <boost/throw_exception.hpp>
- #include <boost/numeric/conversion/converter.hpp>
- #include <boost/xpressive/detail/detail_fwd.hpp>
- #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
- #include <boost/xpressive/detail/utility/literals.hpp>
- #include <boost/xpressive/detail/utility/chset/chset.hpp>
- #include <boost/xpressive/regex_constants.hpp>
- namespace boost { namespace xpressive { namespace detail
- {
///////////////////////////////////////////////////////////////////////////////
// escape_type
//
//  Tags the kind of result carried by an escape_value.
enum escape_type
{
    // the escape denotes a single character (stored in escape_value::ch_)
    escape_char = 0,
    // NOTE(review): never produced by the code in this header; presumably
    // pairs with escape_value::mark_nbr_ -- confirm against the other users.
    escape_mark = 1,
    // the escape denotes a character class (stored in escape_value::class_)
    escape_class = 2
};
- ///////////////////////////////////////////////////////////////////////////////
- // escape_value
- //
- template<typename Char, typename Class>
- struct escape_value
- {
- Char ch_;
- int mark_nbr_;
- Class class_;
- escape_type type_;
- };
- ///////////////////////////////////////////////////////////////////////////////
- // char_overflow_handler
- //
- struct char_overflow_handler
- {
- void operator ()(numeric::range_check_result result) const // throw(regex_error)
- {
- if(numeric::cInRange != result)
- {
- BOOST_THROW_EXCEPTION(
- regex_error(
- regex_constants::error_escape
- , "character escape too large to fit in target character type"
- )
- );
- }
- }
- };
- ///////////////////////////////////////////////////////////////////////////////
- // parse_escape
- //
- template<typename FwdIter, typename CompilerTraits>
- escape_value<typename iterator_value<FwdIter>::type, typename CompilerTraits::regex_traits::char_class_type>
- parse_escape(FwdIter &begin, FwdIter end, CompilerTraits &tr)
- {
- using namespace regex_constants;
- typedef typename iterator_value<FwdIter>::type char_type;
- typedef typename CompilerTraits::regex_traits regex_traits;
- typedef typename regex_traits::char_class_type char_class_type;
- // define an unsigned type the same size as char_type
- typedef typename boost::uint_t<CHAR_BIT * sizeof(char_type)>::least uchar_t;
- BOOST_MPL_ASSERT_RELATION(sizeof(uchar_t), ==, sizeof(char_type));
- typedef numeric::conversion_traits<uchar_t, int> converstion_traits;
- BOOST_XPR_ENSURE_(begin != end, error_escape, "unexpected end of pattern found");
- numeric::converter<int, uchar_t, converstion_traits, char_overflow_handler> converter;
- escape_value<char_type,char_class_type> esc = { 0, 0, 0, escape_char };
- bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
- regex_traits const &rxtraits = tr.traits();
- FwdIter tmp;
- esc.class_ = rxtraits.lookup_classname(begin, begin + 1, icase);
- if(0 != esc.class_)
- {
- esc.type_ = escape_class;
- return esc;
- }
- if(-1 != rxtraits.value(*begin, 8))
- {
- esc.ch_ = converter(toi(begin, end, rxtraits, 8, 0777));
- return esc;
- }
- switch(*begin)
- {
- // bell character
- case BOOST_XPR_CHAR_(char_type, 'a'):
- esc.ch_ = BOOST_XPR_CHAR_(char_type, '\a');
- ++begin;
- break;
- // escape character
- case BOOST_XPR_CHAR_(char_type, 'e'):
- esc.ch_ = converter(27);
- ++begin;
- break;
- // control character
- case BOOST_XPR_CHAR_(char_type, 'c'):
- BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
- BOOST_XPR_ENSURE_
- (
- rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'a'), BOOST_XPR_CHAR_(char_type, 'z'), *begin)
- || rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'A'), BOOST_XPR_CHAR_(char_type, 'Z'), *begin)
- , error_escape
- , "invalid escape control letter; must be one of a-z or A-Z"
- );
- // Convert to character according to ECMA-262, section 15.10.2.10:
- esc.ch_ = converter(*begin % 32);
- ++begin;
- break;
- // formfeed character
- case BOOST_XPR_CHAR_(char_type, 'f'):
- esc.ch_ = BOOST_XPR_CHAR_(char_type, '\f');
- ++begin;
- break;
- // newline
- case BOOST_XPR_CHAR_(char_type, 'n'):
- esc.ch_ = BOOST_XPR_CHAR_(char_type, '\n');
- ++begin;
- break;
- // return
- case BOOST_XPR_CHAR_(char_type, 'r'):
- esc.ch_ = BOOST_XPR_CHAR_(char_type, '\r');
- ++begin;
- break;
- // horizontal tab
- case BOOST_XPR_CHAR_(char_type, 't'):
- esc.ch_ = BOOST_XPR_CHAR_(char_type, '\t');
- ++begin;
- break;
- // vertical tab
- case BOOST_XPR_CHAR_(char_type, 'v'):
- esc.ch_ = BOOST_XPR_CHAR_(char_type, '\v');
- ++begin;
- break;
- // hex escape sequence
- case BOOST_XPR_CHAR_(char_type, 'x'):
- BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
- tmp = begin;
- esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xff));
- BOOST_XPR_ENSURE_(2 == std::distance(tmp, begin), error_escape, "invalid hex escape : "
- "must be \\x HexDigit HexDigit");
- break;
- // Unicode escape sequence
- case BOOST_XPR_CHAR_(char_type, 'u'):
- BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
- tmp = begin;
- esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xffff));
- BOOST_XPR_ENSURE_(4 == std::distance(tmp, begin), error_escape, "invalid Unicode escape : "
- "must be \\u HexDigit HexDigit HexDigit HexDigit");
- break;
- // backslash
- case BOOST_XPR_CHAR_(char_type, '\\'):
- //esc.ch_ = BOOST_XPR_CHAR_(char_type, '\\');
- //++begin;
- //break;
- // all other escaped characters represent themselves
- default:
- esc.ch_ = *begin;
- ++begin;
- break;
- }
- return esc;
- }
- //////////////////////////////////////////////////////////////////////////
- // parse_charset
- //
- template<typename FwdIter, typename RegexTraits, typename CompilerTraits>
- inline void parse_charset
- (
- FwdIter &begin
- , FwdIter end
- , compound_charset<RegexTraits> &chset
- , CompilerTraits &tr
- )
- {
- using namespace regex_constants;
- typedef typename RegexTraits::char_type char_type;
- typedef typename RegexTraits::char_class_type char_class_type;
- BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
- RegexTraits const &rxtraits = tr.traits();
- bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
- FwdIter iprev = FwdIter();
- escape_value<char_type, char_class_type> esc = {0, 0, 0, escape_char};
- bool invert = false;
- // check to see if we have an inverse charset
- if(begin != end && token_charset_invert == tr.get_charset_token(iprev = begin, end))
- {
- begin = iprev;
- invert = true;
- }
- // skip the end token if-and-only-if it is the first token in the charset
- if(begin != end && token_charset_end == tr.get_charset_token(iprev = begin, end))
- {
- for(; begin != iprev; ++begin)
- {
- chset.set_char(*begin, rxtraits, icase);
- }
- }
- compiler_token_type tok;
- char_type ch_prev = char_type(), ch_next = char_type();
- bool have_prev = false;
- BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
- // remember the current position and grab the next token
- iprev = begin;
- tok = tr.get_charset_token(begin, end);
- do
- {
- BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
- if(token_charset_hyphen == tok && have_prev)
- {
- // remember the current position
- FwdIter iprev2 = begin;
- have_prev = false;
- // ch_prev is lower bound of a range
- switch(tr.get_charset_token(begin, end))
- {
- case token_charset_hyphen:
- case token_charset_invert:
- begin = iprev2; // un-get these tokens and fall through
- BOOST_FALLTHROUGH;
- case token_literal:
- ch_next = *begin++;
- BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
- chset.set_range(ch_prev, ch_next, rxtraits, icase);
- continue;
- case token_charset_backspace:
- ch_next = char_type(8); // backspace
- BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
- chset.set_range(ch_prev, ch_next, rxtraits, icase);
- continue;
- case token_escape:
- esc = parse_escape(begin, end, tr);
- if(escape_char == esc.type_)
- {
- BOOST_XPR_ENSURE_(ch_prev <= esc.ch_, error_range, "invalid charset range");
- chset.set_range(ch_prev, esc.ch_, rxtraits, icase);
- continue;
- }
- BOOST_FALLTHROUGH;
- case token_charset_end:
- default: // not a range.
- begin = iprev; // backup to hyphen token
- chset.set_char(ch_prev, rxtraits, icase);
- chset.set_char(*begin++, rxtraits, icase);
- continue;
- }
- }
- if(have_prev)
- {
- chset.set_char(ch_prev, rxtraits, icase);
- have_prev = false;
- }
- switch(tok)
- {
- case token_charset_hyphen:
- case token_charset_invert:
- case token_charset_end:
- case token_posix_charset_end:
- begin = iprev; // un-get these tokens
- ch_prev = *begin++;
- have_prev = true;
- continue;
- case token_charset_backspace:
- ch_prev = char_type(8); // backspace
- have_prev = true;
- continue;
- case token_posix_charset_begin:
- {
- FwdIter tmp = begin, start = begin;
- bool invert = (token_charset_invert == tr.get_charset_token(tmp, end));
- if(invert)
- {
- begin = start = tmp;
- }
- while(token_literal == (tok = tr.get_charset_token(begin, end)))
- {
- tmp = ++begin;
- BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
- }
- if(token_posix_charset_end == tok)
- {
- char_class_type chclass = rxtraits.lookup_classname(start, tmp, icase);
- BOOST_XPR_ENSURE_(0 != chclass, error_ctype, "unknown class name");
- chset.set_class(chclass, invert);
- continue;
- }
- begin = iprev; // un-get this token
- ch_prev = *begin++;
- have_prev = true;
- }
- continue;
- case token_escape:
- esc = parse_escape(begin, end, tr);
- if(escape_char == esc.type_)
- {
- ch_prev = esc.ch_;
- have_prev = true;
- }
- else if(escape_class == esc.type_)
- {
- char_class_type upper_ = lookup_classname(rxtraits, "upper");
- BOOST_ASSERT(0 != upper_);
- chset.set_class(esc.class_, rxtraits.isctype(*begin++, upper_));
- }
- else
- {
- BOOST_ASSERT(false);
- }
- continue;
- default:
- ch_prev = *begin++;
- have_prev = true;
- continue;
- }
- }
- while(BOOST_XPR_ENSURE_((iprev = begin) != end, error_brack, "unexpected end of pattern found"),
- token_charset_end != (tok = tr.get_charset_token(begin, end)));
- if(have_prev)
- {
- chset.set_char(ch_prev, rxtraits, icase);
- }
- if(invert)
- {
- chset.inverse();
- }
- }
- }}} // namespace boost::xpressive::detail
- #endif
|