lexer.hpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. // Copyright (c) 2001-2011 Hartmut Kaiser
  2. //
  3. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4. // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. #if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_13_2007_0145PM)
  6. #define BOOST_SPIRIT_LEX_LEXER_MAR_13_2007_0145PM
  7. #if defined(_MSC_VER)
  8. #pragma once
  9. #endif
  10. #include <boost/spirit/home/support/info.hpp>
  11. #include <boost/spirit/home/qi/skip_over.hpp>
  12. #include <boost/spirit/home/qi/parser.hpp>
  13. #include <boost/spirit/home/qi/detail/assign_to.hpp>
  14. #include <boost/spirit/home/lex/reference.hpp>
  15. #include <boost/spirit/home/lex/meta_compiler.hpp>
  16. #include <boost/spirit/home/lex/lexer_type.hpp>
  17. #include <boost/spirit/home/lex/lexer/token_def.hpp>
  18. #include <boost/assert.hpp>
  19. #include <boost/noncopyable.hpp>
  20. #include <boost/fusion/include/vector.hpp>
  21. #include <boost/mpl/assert.hpp>
  22. #include <boost/proto/extends.hpp>
  23. #include <boost/proto/traits.hpp>
  24. #include <boost/range/iterator_range_core.hpp>
  25. #include <iterator> // for std::iterator_traits
  26. #include <string>
  27. namespace boost { namespace spirit { namespace lex
  28. {
  29. ///////////////////////////////////////////////////////////////////////////
  30. namespace detail
  31. {
  32. ///////////////////////////////////////////////////////////////////////
  33. template <typename LexerDef>
  34. struct lexer_def_
  35. : proto::extends<
  36. typename proto::terminal<
  37. lex::reference<lexer_def_<LexerDef> const>
  38. >::type
  39. , lexer_def_<LexerDef> >
  40. , qi::parser<lexer_def_<LexerDef> >
  41. , lex::lexer_type<lexer_def_<LexerDef> >
  42. {
  43. private:
  44. // avoid warnings about using 'this' in constructor
  45. lexer_def_& this_() { return *this; }
  46. typedef typename LexerDef::char_type char_type;
  47. typedef typename LexerDef::string_type string_type;
  48. typedef typename LexerDef::id_type id_type;
  49. typedef lex::reference<lexer_def_ const> reference_;
  50. typedef typename proto::terminal<reference_>::type terminal_type;
  51. typedef proto::extends<terminal_type, lexer_def_> proto_base_type;
  52. reference_ alias() const
  53. {
  54. return reference_(*this);
  55. }
  56. public:
  57. // Qi interface: metafunction calculating parser attribute type
  58. template <typename Context, typename Iterator>
  59. struct attribute
  60. {
  61. // the return value of a token set contains the matched token
  62. // id, and the corresponding pair of iterators
  63. typedef typename Iterator::base_iterator_type iterator_type;
  64. typedef
  65. fusion::vector2<id_type, iterator_range<iterator_type> >
  66. type;
  67. };
  68. // Qi interface: parse functionality
  69. template <typename Iterator, typename Context
  70. , typename Skipper, typename Attribute>
  71. bool parse(Iterator& first, Iterator const& last
  72. , Context& /*context*/, Skipper const& skipper
  73. , Attribute& attr) const
  74. {
  75. qi::skip_over(first, last, skipper); // always do a pre-skip
  76. if (first != last) {
  77. typedef typename
  78. std::iterator_traits<Iterator>::value_type
  79. token_type;
  80. token_type const& t = *first;
  81. if (token_is_valid(t) && t.state() == first.get_state()) {
  82. // any of the token definitions matched
  83. spirit::traits::assign_to(t, attr);
  84. ++first;
  85. return true;
  86. }
  87. }
  88. return false;
  89. }
  90. // Qi interface: 'what' functionality
  91. template <typename Context>
  92. info what(Context& /*context*/) const
  93. {
  94. return info("lexer");
  95. }
  96. private:
  97. // allow to use the lexer.self.add("regex1", id1)("regex2", id2);
  98. // syntax
  99. struct adder
  100. {
  101. adder(lexer_def_& def_)
  102. : def(def_) {}
  103. // Add a token definition based on a single character as given
  104. // by the first parameter, the second parameter allows to
  105. // specify the token id to use for the new token. If no token
  106. // id is given the character code is used.
  107. adder const& operator()(char_type c
  108. , id_type token_id = id_type()) const
  109. {
  110. if (id_type() == token_id)
  111. token_id = static_cast<id_type>(c);
  112. def.def.add_token (def.state.c_str(), c, token_id
  113. , def.targetstate.empty() ? 0 : def.targetstate.c_str());
  114. return *this;
  115. }
  116. // Add a token definition based on a character sequence as
  117. // given by the first parameter, the second parameter allows to
  118. // specify the token id to use for the new token. If no token
  119. // id is given this function will generate a unique id to be
  120. // used as the token's id.
  121. adder const& operator()(string_type const& s
  122. , id_type token_id = id_type()) const
  123. {
  124. if (id_type() == token_id)
  125. token_id = def.def.get_next_id();
  126. def.def.add_token (def.state.c_str(), s, token_id
  127. , def.targetstate.empty() ? 0 : def.targetstate.c_str());
  128. return *this;
  129. }
  130. template <typename Attribute>
  131. adder const& operator()(
  132. token_def<Attribute, char_type, id_type>& tokdef
  133. , id_type token_id = id_type()) const
  134. {
  135. // make sure we have a token id
  136. if (id_type() == token_id) {
  137. if (id_type() == tokdef.id()) {
  138. token_id = def.def.get_next_id();
  139. tokdef.id(token_id);
  140. }
  141. else {
  142. token_id = tokdef.id();
  143. }
  144. }
  145. else {
  146. // the following assertion makes sure that the token_def
  147. // instance has not been assigned a different id earlier
  148. BOOST_ASSERT(id_type() == tokdef.id()
  149. || token_id == tokdef.id());
  150. tokdef.id(token_id);
  151. }
  152. def.define(tokdef);
  153. return *this;
  154. }
  155. // template <typename F>
  156. // adder const& operator()(char_type c, id_type token_id, F act) const
  157. // {
  158. // if (id_type() == token_id)
  159. // token_id = def.def.get_next_id();
  160. // std::size_t unique_id =
  161. // def.def.add_token (def.state.c_str(), s, token_id);
  162. // def.def.add_action(unique_id, def.state.c_str(), act);
  163. // return *this;
  164. // }
  165. lexer_def_& def;
  166. // silence MSVC warning C4512: assignment operator could not be generated
  167. BOOST_DELETED_FUNCTION(adder& operator= (adder const&))
  168. };
  169. friend struct adder;
  170. // allow to use lexer.self.add_pattern("pattern1", "regex1")(...);
  171. // syntax
  172. struct pattern_adder
  173. {
  174. pattern_adder(lexer_def_& def_)
  175. : def(def_) {}
  176. pattern_adder const& operator()(string_type const& p
  177. , string_type const& s) const
  178. {
  179. def.def.add_pattern (def.state.c_str(), p, s);
  180. return *this;
  181. }
  182. lexer_def_& def;
  183. // silence MSVC warning C4512: assignment operator could not be generated
  184. BOOST_DELETED_FUNCTION(pattern_adder& operator= (pattern_adder const&))
  185. };
  186. friend struct pattern_adder;
  187. private:
  188. // Helper function to invoke the necessary 2 step compilation
  189. // process on token definition expressions
  190. template <typename TokenExpr>
  191. void compile2pass(TokenExpr const& expr)
  192. {
  193. expr.collect(def, state, targetstate);
  194. expr.add_actions(def);
  195. }
  196. public:
  197. ///////////////////////////////////////////////////////////////////
  198. template <typename Expr>
  199. void define(Expr const& expr)
  200. {
  201. compile2pass(compile<lex::domain>(expr));
  202. }
  203. lexer_def_(LexerDef& def_, string_type const& state_
  204. , string_type const& targetstate_ = string_type())
  205. : proto_base_type(terminal_type::make(alias()))
  206. , add(this_()), add_pattern(this_()), def(def_)
  207. , state(state_), targetstate(targetstate_)
  208. {}
  209. // allow to switch states
  210. lexer_def_ operator()(char_type const* state) const
  211. {
  212. return lexer_def_(def, state);
  213. }
  214. lexer_def_ operator()(char_type const* state
  215. , char_type const* targetstate) const
  216. {
  217. return lexer_def_(def, state, targetstate);
  218. }
  219. lexer_def_ operator()(string_type const& state
  220. , string_type const& targetstate = string_type()) const
  221. {
  222. return lexer_def_(def, state, targetstate);
  223. }
  224. // allow to assign a token definition expression
  225. template <typename Expr>
  226. lexer_def_& operator= (Expr const& xpr)
  227. {
  228. // Report invalid expression error as early as possible.
  229. // If you got an error_invalid_expression error message here,
  230. // then the expression (expr) is not a valid spirit lex
  231. // expression.
  232. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  233. def.clear(state.c_str());
  234. define(xpr);
  235. return *this;
  236. }
  237. // explicitly tell the lexer that the given state will be defined
  238. // (useful in conjunction with "*")
  239. std::size_t add_state(char_type const* state = 0)
  240. {
  241. return def.add_state(state ? state : def.initial_state().c_str());
  242. }
  243. adder add;
  244. pattern_adder add_pattern;
  245. private:
  246. LexerDef& def;
  247. string_type state;
  248. string_type targetstate;
  249. // silence MSVC warning C4512: assignment operator could not be generated
  250. BOOST_DELETED_FUNCTION(lexer_def_& operator= (lexer_def_ const&))
  251. };
  252. #if defined(BOOST_NO_CXX11_RVALUE_REFERENCES)
  253. // allow to assign a token definition expression
  254. template <typename LexerDef, typename Expr>
  255. inline lexer_def_<LexerDef>&
  256. operator+= (lexer_def_<LexerDef>& lexdef, Expr& xpr)
  257. {
  258. // Report invalid expression error as early as possible.
  259. // If you got an error_invalid_expression error message here,
  260. // then the expression (expr) is not a valid spirit lex
  261. // expression.
  262. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  263. lexdef.define(xpr);
  264. return lexdef;
  265. }
  266. #else
  267. // allow to assign a token definition expression
  268. template <typename LexerDef, typename Expr>
  269. inline lexer_def_<LexerDef>&
  270. operator+= (lexer_def_<LexerDef>& lexdef, Expr&& xpr)
  271. {
  272. // Report invalid expression error as early as possible.
  273. // If you got an error_invalid_expression error message here,
  274. // then the expression (expr) is not a valid spirit lex
  275. // expression.
  276. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  277. lexdef.define(xpr);
  278. return lexdef;
  279. }
  280. #endif
  281. template <typename LexerDef, typename Expr>
  282. inline lexer_def_<LexerDef>&
  283. operator+= (lexer_def_<LexerDef>& lexdef, Expr const& xpr)
  284. {
  285. // Report invalid expression error as early as possible.
  286. // If you got an error_invalid_expression error message here,
  287. // then the expression (expr) is not a valid spirit lex
  288. // expression.
  289. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  290. lexdef.define(xpr);
  291. return lexdef;
  292. }
  293. }
  294. ///////////////////////////////////////////////////////////////////////////
  // match_flags groups the flag values used to influence the matching
  // mode of the lexer. The non-zero values are distinct single bits.
  struct match_flags
  {
      enum enum_type
      {
          // no flags
          match_default = 0,
          // the regex '.' doesn't match newlines
          match_not_dot_newline = 1 << 0,
          // all matching operations are case insensitive
          match_icase = 1 << 1
      };
  };
  306. ///////////////////////////////////////////////////////////////////////////
  307. // This represents a lexer object
  308. ///////////////////////////////////////////////////////////////////////////
  309. ///////////////////////////////////////////////////////////////////////////
  // min_token_id is the first token id the library assigns
  // automatically when an id is needed and none was supplied.
  enum tokenids
  {
      min_token_id = 1 << 16      // 0x10000
  };
  316. template <typename Lexer>
  317. class lexer : public Lexer
  318. {
  319. private:
  320. // avoid warnings about using 'this' in constructor
  321. lexer& this_() { return *this; }
  322. std::size_t next_token_id; // has to be an integral type
  323. public:
  324. typedef Lexer lexer_type;
  325. typedef typename Lexer::id_type id_type;
  326. typedef typename Lexer::char_type char_type;
  327. typedef typename Lexer::iterator_type iterator_type;
  328. typedef lexer base_type;
  329. typedef detail::lexer_def_<lexer> lexer_def;
  330. typedef std::basic_string<char_type> string_type;
  331. // if `id_type` was specified but `first_id` is not provided
  332. // the `min_token_id` value may be out of range for `id_type`,
  333. // but it will be a problem only if unique ids feature is in use.
  334. lexer(unsigned int flags = match_flags::match_default)
  335. : lexer_type(flags)
  336. , next_token_id(min_token_id)
  337. , self(this_(), lexer_type::initial_state())
  338. {}
  339. lexer(unsigned int flags, id_type first_id)
  340. : lexer_type(flags)
  341. , next_token_id(first_id)
  342. , self(this_(), lexer_type::initial_state())
  343. {}
  344. // access iterator interface
  345. template <typename Iterator>
  346. iterator_type begin(Iterator& first, Iterator const& last
  347. , char_type const* initial_state = 0) const
  348. { return this->lexer_type::begin(first, last, initial_state); }
  349. iterator_type end() const
  350. { return this->lexer_type::end(); }
  351. std::size_t map_state(char_type const* state)
  352. { return this->lexer_type::add_state(state); }
  353. // create a unique token id
  354. id_type get_next_id() { return id_type(next_token_id++); }
  355. lexer_def self; // allow for easy token definition
  356. };
  357. }}}
  358. #endif