parser_traits.hpp 18 KB


  1. ///////////////////////////////////////////////////////////////////////////////
  2. // detail/dynamic/parser_traits.hpp
  3. //
  4. // Copyright 2008 Eric Niebler. Distributed under the Boost
  5. // Software License, Version 1.0. (See accompanying file
  6. // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  7. #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSER_TRAITS_HPP_EAN_10_04_2005
  8. #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSER_TRAITS_HPP_EAN_10_04_2005
  9. // MS compatible compilers support #pragma once
  10. #if defined(_MSC_VER)
  11. # pragma once
  12. #endif
#include <climits>
#include <limits>
#include <string>
#include <boost/config.hpp>
#include <boost/assert.hpp>
#include <boost/throw_exception.hpp>
#include <boost/xpressive/regex_error.hpp>
#include <boost/xpressive/regex_traits.hpp>
#include <boost/xpressive/detail/detail_fwd.hpp>
#include <boost/xpressive/detail/dynamic/matchable.hpp>
#include <boost/xpressive/detail/dynamic/parser_enum.hpp>
#include <boost/xpressive/detail/utility/literals.hpp>
#include <boost/xpressive/detail/utility/algorithm.hpp>
  25. namespace boost { namespace xpressive
  26. {
  27. ///////////////////////////////////////////////////////////////////////////////
  28. // compiler_traits
  29. // this works for char and wchar_t. it must be specialized for anything else.
  30. //
  31. template<typename RegexTraits>
  32. struct compiler_traits
  33. {
  34. typedef RegexTraits regex_traits;
  35. typedef typename regex_traits::char_type char_type;
  36. typedef typename regex_traits::string_type string_type;
  37. typedef typename regex_traits::locale_type locale_type;
  38. ///////////////////////////////////////////////////////////////////////////////
  39. // constructor
  40. explicit compiler_traits(RegexTraits const &traits = RegexTraits())
  41. : traits_(traits)
  42. , flags_(regex_constants::ECMAScript)
  43. , space_(lookup_classname(traits_, "space"))
  44. , alnum_(lookup_classname(traits_, "alnum"))
  45. {
  46. }
  47. ///////////////////////////////////////////////////////////////////////////////
  48. // flags
  49. regex_constants::syntax_option_type flags() const
  50. {
  51. return this->flags_;
  52. }
  53. ///////////////////////////////////////////////////////////////////////////////
  54. // flags
  55. void flags(regex_constants::syntax_option_type flags)
  56. {
  57. this->flags_ = flags;
  58. }
  59. ///////////////////////////////////////////////////////////////////////////////
  60. // traits
  61. regex_traits &traits()
  62. {
  63. return this->traits_;
  64. }
  65. regex_traits const &traits() const
  66. {
  67. return this->traits_;
  68. }
  69. ///////////////////////////////////////////////////////////////////////////////
  70. // imbue
  71. locale_type imbue(locale_type const &loc)
  72. {
  73. locale_type oldloc = this->traits().imbue(loc);
  74. this->space_ = lookup_classname(this->traits(), "space");
  75. this->alnum_ = lookup_classname(this->traits(), "alnum");
  76. return oldloc;
  77. }
  78. ///////////////////////////////////////////////////////////////////////////////
  79. // getloc
  80. locale_type getloc() const
  81. {
  82. return this->traits().getloc();
  83. }
  84. ///////////////////////////////////////////////////////////////////////////////
  85. // get_token
  86. // get a token and advance the iterator
  87. template<typename FwdIter>
  88. regex_constants::compiler_token_type get_token(FwdIter &begin, FwdIter end)
  89. {
  90. using namespace regex_constants;
  91. if(this->eat_ws_(begin, end) == end)
  92. {
  93. return regex_constants::token_end_of_pattern;
  94. }
  95. switch(*begin)
  96. {
  97. case BOOST_XPR_CHAR_(char_type, '\\'): return this->get_escape_token(++begin, end);
  98. case BOOST_XPR_CHAR_(char_type, '.'): ++begin; return token_any;
  99. case BOOST_XPR_CHAR_(char_type, '^'): ++begin; return token_assert_begin_line;
  100. case BOOST_XPR_CHAR_(char_type, '$'): ++begin; return token_assert_end_line;
  101. case BOOST_XPR_CHAR_(char_type, '('): ++begin; return token_group_begin;
  102. case BOOST_XPR_CHAR_(char_type, ')'): ++begin; return token_group_end;
  103. case BOOST_XPR_CHAR_(char_type, '|'): ++begin; return token_alternate;
  104. case BOOST_XPR_CHAR_(char_type, '['): ++begin; return token_charset_begin;
  105. case BOOST_XPR_CHAR_(char_type, '*'):
  106. case BOOST_XPR_CHAR_(char_type, '+'):
  107. case BOOST_XPR_CHAR_(char_type, '?'):
  108. return token_invalid_quantifier;
  109. case BOOST_XPR_CHAR_(char_type, ']'):
  110. case BOOST_XPR_CHAR_(char_type, '{'):
  111. default:
  112. return token_literal;
  113. }
  114. }
  115. ///////////////////////////////////////////////////////////////////////////////
  116. // get_quant_spec
  117. template<typename FwdIter>
  118. bool get_quant_spec(FwdIter &begin, FwdIter end, detail::quant_spec &spec)
  119. {
  120. using namespace regex_constants;
  121. FwdIter old_begin;
  122. if(this->eat_ws_(begin, end) == end)
  123. {
  124. return false;
  125. }
  126. switch(*begin)
  127. {
  128. case BOOST_XPR_CHAR_(char_type, '*'):
  129. spec.min_ = 0;
  130. spec.max_ = (std::numeric_limits<unsigned int>::max)();
  131. break;
  132. case BOOST_XPR_CHAR_(char_type, '+'):
  133. spec.min_ = 1;
  134. spec.max_ = (std::numeric_limits<unsigned int>::max)();
  135. break;
  136. case BOOST_XPR_CHAR_(char_type, '?'):
  137. spec.min_ = 0;
  138. spec.max_ = 1;
  139. break;
  140. case BOOST_XPR_CHAR_(char_type, '{'):
  141. old_begin = this->eat_ws_(++begin, end);
  142. spec.min_ = spec.max_ = detail::toi(begin, end, this->traits());
  143. BOOST_XPR_ENSURE_
  144. (
  145. begin != old_begin && begin != end, error_brace, "invalid quantifier"
  146. );
  147. if(*begin == BOOST_XPR_CHAR_(char_type, ','))
  148. {
  149. old_begin = this->eat_ws_(++begin, end);
  150. spec.max_ = detail::toi(begin, end, this->traits());
  151. BOOST_XPR_ENSURE_
  152. (
  153. begin != end && BOOST_XPR_CHAR_(char_type, '}') == *begin
  154. , error_brace, "invalid quantifier"
  155. );
  156. if(begin == old_begin)
  157. {
  158. spec.max_ = (std::numeric_limits<unsigned int>::max)();
  159. }
  160. else
  161. {
  162. BOOST_XPR_ENSURE_
  163. (
  164. spec.min_ <= spec.max_, error_badbrace, "invalid quantification range"
  165. );
  166. }
  167. }
  168. else
  169. {
  170. BOOST_XPR_ENSURE_
  171. (
  172. BOOST_XPR_CHAR_(char_type, '}') == *begin, error_brace, "invalid quantifier"
  173. );
  174. }
  175. break;
  176. default:
  177. return false;
  178. }
  179. spec.greedy_ = true;
  180. if(this->eat_ws_(++begin, end) != end && BOOST_XPR_CHAR_(char_type, '?') == *begin)
  181. {
  182. ++begin;
  183. spec.greedy_ = false;
  184. }
  185. return true;
  186. }
  187. ///////////////////////////////////////////////////////////////////////////
  188. // get_group_type
  189. template<typename FwdIter>
  190. regex_constants::compiler_token_type get_group_type(FwdIter &begin, FwdIter end, string_type &name)
  191. {
  192. using namespace regex_constants;
  193. if(this->eat_ws_(begin, end) != end && BOOST_XPR_CHAR_(char_type, '?') == *begin)
  194. {
  195. this->eat_ws_(++begin, end);
  196. BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
  197. switch(*begin)
  198. {
  199. case BOOST_XPR_CHAR_(char_type, ':'): ++begin; return token_no_mark;
  200. case BOOST_XPR_CHAR_(char_type, '>'): ++begin; return token_independent_sub_expression;
  201. case BOOST_XPR_CHAR_(char_type, '#'): ++begin; return token_comment;
  202. case BOOST_XPR_CHAR_(char_type, '='): ++begin; return token_positive_lookahead;
  203. case BOOST_XPR_CHAR_(char_type, '!'): ++begin; return token_negative_lookahead;
  204. case BOOST_XPR_CHAR_(char_type, 'R'): ++begin; return token_recurse;
  205. case BOOST_XPR_CHAR_(char_type, '$'):
  206. this->get_name_(++begin, end, name);
  207. BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
  208. if(BOOST_XPR_CHAR_(char_type, '=') == *begin)
  209. {
  210. ++begin;
  211. return token_rule_assign;
  212. }
  213. return token_rule_ref;
  214. case BOOST_XPR_CHAR_(char_type, '<'):
  215. this->eat_ws_(++begin, end);
  216. BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
  217. switch(*begin)
  218. {
  219. case BOOST_XPR_CHAR_(char_type, '='): ++begin; return token_positive_lookbehind;
  220. case BOOST_XPR_CHAR_(char_type, '!'): ++begin; return token_negative_lookbehind;
  221. default:
  222. BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension"));
  223. }
  224. case BOOST_XPR_CHAR_(char_type, 'P'):
  225. this->eat_ws_(++begin, end);
  226. BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
  227. switch(*begin)
  228. {
  229. case BOOST_XPR_CHAR_(char_type, '<'):
  230. this->get_name_(++begin, end, name);
  231. BOOST_XPR_ENSURE_(begin != end && BOOST_XPR_CHAR_(char_type, '>') == *begin++, error_paren, "incomplete extension");
  232. return token_named_mark;
  233. case BOOST_XPR_CHAR_(char_type, '='):
  234. this->get_name_(++begin, end, name);
  235. BOOST_XPR_ENSURE_(begin != end, error_paren, "incomplete extension");
  236. return token_named_mark_ref;
  237. default:
  238. BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension"));
  239. }
  240. case BOOST_XPR_CHAR_(char_type, 'i'):
  241. case BOOST_XPR_CHAR_(char_type, 'm'):
  242. case BOOST_XPR_CHAR_(char_type, 's'):
  243. case BOOST_XPR_CHAR_(char_type, 'x'):
  244. case BOOST_XPR_CHAR_(char_type, '-'):
  245. return this->parse_mods_(begin, end);
  246. default:
  247. BOOST_THROW_EXCEPTION(regex_error(error_badbrace, "unrecognized extension"));
  248. }
  249. }
  250. return token_literal;
  251. }
  252. //////////////////////////////////////////////////////////////////////////
  253. // get_charset_token
  254. // NOTE: white-space is *never* ignored in a charset.
  255. template<typename FwdIter>
  256. regex_constants::compiler_token_type get_charset_token(FwdIter &begin, FwdIter end)
  257. {
  258. using namespace regex_constants;
  259. BOOST_ASSERT(begin != end);
  260. switch(*begin)
  261. {
  262. case BOOST_XPR_CHAR_(char_type, '^'): ++begin; return token_charset_invert;
  263. case BOOST_XPR_CHAR_(char_type, '-'): ++begin; return token_charset_hyphen;
  264. case BOOST_XPR_CHAR_(char_type, ']'): ++begin; return token_charset_end;
  265. case BOOST_XPR_CHAR_(char_type, '['):
  266. {
  267. FwdIter next = begin; ++next;
  268. if(next != end)
  269. {
  270. BOOST_XPR_ENSURE_(
  271. *next != BOOST_XPR_CHAR_(char_type, '=')
  272. , error_collate
  273. , "equivalence classes are not yet supported"
  274. );
  275. BOOST_XPR_ENSURE_(
  276. *next != BOOST_XPR_CHAR_(char_type, '.')
  277. , error_collate
  278. , "collation sequences are not yet supported"
  279. );
  280. if(*next == BOOST_XPR_CHAR_(char_type, ':'))
  281. {
  282. begin = ++next;
  283. return token_posix_charset_begin;
  284. }
  285. }
  286. }
  287. break;
  288. case BOOST_XPR_CHAR_(char_type, ':'):
  289. {
  290. FwdIter next = begin; ++next;
  291. if(next != end && *next == BOOST_XPR_CHAR_(char_type, ']'))
  292. {
  293. begin = ++next;
  294. return token_posix_charset_end;
  295. }
  296. }
  297. break;
  298. case BOOST_XPR_CHAR_(char_type, '\\'):
  299. if(++begin != end)
  300. {
  301. switch(*begin)
  302. {
  303. case BOOST_XPR_CHAR_(char_type, 'b'): ++begin; return token_charset_backspace;
  304. default:;
  305. }
  306. }
  307. return token_escape;
  308. default:;
  309. }
  310. return token_literal;
  311. }
  312. //////////////////////////////////////////////////////////////////////////
  313. // get_escape_token
  314. template<typename FwdIter>
  315. regex_constants::compiler_token_type get_escape_token(FwdIter &begin, FwdIter end)
  316. {
  317. using namespace regex_constants;
  318. if(begin != end)
  319. {
  320. switch(*begin)
  321. {
  322. //case BOOST_XPR_CHAR_(char_type, 'a'): ++begin; return token_escape_bell;
  323. //case BOOST_XPR_CHAR_(char_type, 'c'): ++begin; return token_escape_control;
  324. //case BOOST_XPR_CHAR_(char_type, 'e'): ++begin; return token_escape_escape;
  325. //case BOOST_XPR_CHAR_(char_type, 'f'): ++begin; return token_escape_formfeed;
  326. //case BOOST_XPR_CHAR_(char_type, 'n'): ++begin; return token_escape_newline;
  327. //case BOOST_XPR_CHAR_(char_type, 't'): ++begin; return token_escape_horizontal_tab;
  328. //case BOOST_XPR_CHAR_(char_type, 'v'): ++begin; return token_escape_vertical_tab;
  329. case BOOST_XPR_CHAR_(char_type, 'A'): ++begin; return token_assert_begin_sequence;
  330. case BOOST_XPR_CHAR_(char_type, 'b'): ++begin; return token_assert_word_boundary;
  331. case BOOST_XPR_CHAR_(char_type, 'B'): ++begin; return token_assert_not_word_boundary;
  332. case BOOST_XPR_CHAR_(char_type, 'E'): ++begin; return token_quote_meta_end;
  333. case BOOST_XPR_CHAR_(char_type, 'Q'): ++begin; return token_quote_meta_begin;
  334. case BOOST_XPR_CHAR_(char_type, 'Z'): ++begin; return token_assert_end_sequence;
  335. // Non-standard extension to ECMAScript syntax
  336. case BOOST_XPR_CHAR_(char_type, '<'): ++begin; return token_assert_word_begin;
  337. case BOOST_XPR_CHAR_(char_type, '>'): ++begin; return token_assert_word_end;
  338. default:; // fall-through
  339. }
  340. }
  341. return token_escape;
  342. }
  343. private:
  344. //////////////////////////////////////////////////////////////////////////
  345. // parse_mods_
  346. template<typename FwdIter>
  347. regex_constants::compiler_token_type parse_mods_(FwdIter &begin, FwdIter end)
  348. {
  349. using namespace regex_constants;
  350. bool set = true;
  351. do switch(*begin)
  352. {
  353. case BOOST_XPR_CHAR_(char_type, 'i'): this->flag_(set, icase_); break;
  354. case BOOST_XPR_CHAR_(char_type, 'm'): this->flag_(!set, single_line); break;
  355. case BOOST_XPR_CHAR_(char_type, 's'): this->flag_(!set, not_dot_newline); break;
  356. case BOOST_XPR_CHAR_(char_type, 'x'): this->flag_(set, ignore_white_space); break;
  357. case BOOST_XPR_CHAR_(char_type, ':'): ++begin; BOOST_FALLTHROUGH;
  358. case BOOST_XPR_CHAR_(char_type, ')'): return token_no_mark;
  359. case BOOST_XPR_CHAR_(char_type, '-'): if(false == (set = !set)) break; BOOST_FALLTHROUGH;
  360. default: BOOST_THROW_EXCEPTION(regex_error(error_paren, "unknown pattern modifier"));
  361. }
  362. while(BOOST_XPR_ENSURE_(++begin != end, error_paren, "incomplete extension"));
  363. // this return is technically unreachable, but this must
  364. // be here to work around a bug in gcc 4.0
  365. return token_no_mark;
  366. }
  367. ///////////////////////////////////////////////////////////////////////////////
  368. // flag_
  369. void flag_(bool set, regex_constants::syntax_option_type flag)
  370. {
  371. this->flags_ = set ? (this->flags_ | flag) : (this->flags_ & ~flag);
  372. }
  373. ///////////////////////////////////////////////////////////////////////////
  374. // is_space_
  375. bool is_space_(char_type ch) const
  376. {
  377. return 0 != this->space_ && this->traits().isctype(ch, this->space_);
  378. }
  379. ///////////////////////////////////////////////////////////////////////////
  380. // is_alnum_
  381. bool is_alnum_(char_type ch) const
  382. {
  383. return 0 != this->alnum_ && this->traits().isctype(ch, this->alnum_);
  384. }
  385. ///////////////////////////////////////////////////////////////////////////
  386. // get_name_
  387. template<typename FwdIter>
  388. void get_name_(FwdIter &begin, FwdIter end, string_type &name)
  389. {
  390. this->eat_ws_(begin, end);
  391. for(name.clear(); begin != end && this->is_alnum_(*begin); ++begin)
  392. {
  393. name.push_back(*begin);
  394. }
  395. this->eat_ws_(begin, end);
  396. BOOST_XPR_ENSURE_(!name.empty(), regex_constants::error_paren, "incomplete extension");
  397. }
  398. ///////////////////////////////////////////////////////////////////////////////
  399. // eat_ws_
  400. template<typename FwdIter>
  401. FwdIter &eat_ws_(FwdIter &begin, FwdIter end)
  402. {
  403. if(0 != (regex_constants::ignore_white_space & this->flags()))
  404. {
  405. while(end != begin && (BOOST_XPR_CHAR_(char_type, '#') == *begin || this->is_space_(*begin)))
  406. {
  407. if(BOOST_XPR_CHAR_(char_type, '#') == *begin++)
  408. {
  409. while(end != begin && BOOST_XPR_CHAR_(char_type, '\n') != *begin++) {}
  410. }
  411. else
  412. {
  413. for(; end != begin && this->is_space_(*begin); ++begin) {}
  414. }
  415. }
  416. }
  417. return begin;
  418. }
  419. regex_traits traits_;
  420. regex_constants::syntax_option_type flags_;
  421. typename regex_traits::char_class_type space_;
  422. typename regex_traits::char_class_type alnum_;
  423. };
  424. }} // namespace boost::xpressive
  425. #endif