Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

basic_regex_parser.hpp @ 857

Revision 857, 65.9 KB checked in by igarcia, 18 years ago (diff)

Rev	Line
[857]	1	/*
	2	*
	3	* Copyright (c) 2004
	4	* John Maddock
	5	*
	6	* Use, modification and distribution are subject to the
	7	* Boost Software License, Version 1.0. (See accompanying file
	8	* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
	9	*
	10	*/
	11
	12	/*
	13	* LOCATION: see http://www.boost.org for most recent version.
	14	* FILE basic_regex_parser.hpp
	15	* VERSION see <boost/version.hpp>
	16	* DESCRIPTION: Declares template class basic_regex_parser.
	17	*/
	18
	19	#ifndef BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
	20	#define BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
	21
	22	#ifdef BOOST_HAS_ABI_HEADERS
	23	# include BOOST_ABI_PREFIX
	24	#endif
	25
	26	namespace boost{
	27	namespace re_detail{
	28
	29	#ifdef BOOST_MSVC
	30	#pragma warning(push)
	31	#pragma warning(disable:4244)
	32	#endif
	33
	34	template <class charT, class traits>
	35	class basic_regex_parser : public basic_regex_creator<charT, traits>
	36	{
	37	public:
	38	basic_regex_parser(regex_data<charT, traits>* data);
	39	void parse(const charT* p1, const charT* p2, unsigned flags);
	40	void fail(regex_constants::error_type error_code, std::ptrdiff_t position);
	41
	42	bool parse_all();
	43	bool parse_basic();
	44	bool parse_extended();
	45	bool parse_literal();
	46	bool parse_open_paren();
	47	bool parse_basic_escape();
	48	bool parse_extended_escape();
	49	bool parse_match_any();
	50	bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits<std::size_t>::max)());
	51	bool parse_repeat_range(bool isbasic);
	52	bool parse_alt();
	53	bool parse_set();
	54	bool parse_backref();
	55	void parse_set_literal(basic_char_set<charT, traits>& char_set);
	56	bool parse_inner_set(basic_char_set<charT, traits>& char_set);
	57	bool parse_QE();
	58	bool parse_perl_extension();
	59	bool add_emacs_code(bool negate);
	60	bool unwind_alts(std::ptrdiff_t last_paren_start);
	61	digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set);
	62	charT unescape_character();
	63	regex_constants::syntax_option_type parse_options();
	64
	65	private:
	66	typedef bool (basic_regex_parser::*parser_proc_type)();
	67	typedef typename traits::string_type string_type;
	68	typedef typename traits::char_class_type char_class_type;
	69	parser_proc_type m_parser_proc; // the main parser to use
	70	const charT* m_base; // the start of the string being parsed
	71	const charT* m_end; // the end of the string being parsed
	72	const charT* m_position; // our current parser position
	73	unsigned m_mark_count; // how many sub-expressions we have
	74	std::ptrdiff_t m_paren_start; // where the last seen ')' began (where repeats are inserted).
	75	std::ptrdiff_t m_alt_insert_point; // where to insert the next alternative
	76	bool m_has_case_change; // true if somewhere in the current block the case has changed
	77	#if defined(BOOST_MSVC) && defined(_M_IX86)
	78	// This is an ugly warning suppression workaround (for warnings inside std::vector
	79	// that can not otherwise be suppressed)...
	80	BOOST_STATIC_ASSERT(sizeof(long) >= sizeof(void*));
	81	std::vector<long> m_alt_jumps; // list of alternative in the current scope.
	82	#else
	83	std::vector<std::ptrdiff_t> m_alt_jumps; // list of alternative in the current scope.
	84	#endif
	85
	86	basic_regex_parser& operator=(const basic_regex_parser&);
	87	basic_regex_parser(const basic_regex_parser&);
	88	};
	89
	90	template <class charT, class traits>
	91	basic_regex_parser<charT, traits>::basic_regex_parser(regex_data<charT, traits>* data)
	92	: basic_regex_creator<charT, traits>(data), m_mark_count(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false)
	93	{
	94	}
	95
	96	template <class charT, class traits>
	97	void basic_regex_parser<charT, traits>::parse(const charT* p1, const charT* p2, unsigned flags)
	98	{
	99	// pass flags on to base class:
	100	this->init(flags);
	101	// set up pointers:
	102	m_position = m_base = p1;
	103	m_end = p2;
	104	// empty strings are errors:
	105	if(p1 == p2)
	106	{
	107	fail(regex_constants::error_empty, 0);
	108	return;
	109	}
	110	// select which parser to use:
	111	switch(flags & regbase::main_option_type)
	112	{
	113	case regbase::perl_syntax_group:
	114	m_parser_proc = &basic_regex_parser<charT, traits>::parse_extended;
	115	break;
	116	case regbase::basic_syntax_group:
	117	m_parser_proc = &basic_regex_parser<charT, traits>::parse_basic;
	118	break;
	119	case regbase::literal:
	120	m_parser_proc = &basic_regex_parser<charT, traits>::parse_literal;
	121	break;
	122	}
	123
	124	// parse all our characters:
	125	bool result = parse_all();
	126	//
	127	// Unwind our alternatives:
	128	//
	129	unwind_alts(-1);
	130	// reset flags as a global scope (?imsx) may have altered them:
	131	this->flags(flags);
	132	// if we haven't gobbled up all the characters then we must
	133	// have had an unexpected ')' :
	134	if(!result)
	135	{
	136	fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_position));
	137	return;
	138	}
	139	// if an error has been set then give up now:
	140	if(this->m_pdata->m_status)
	141	return;
	142	// fill in our sub-expression count:
	143	this->m_pdata->m_mark_count = 1 + m_mark_count;
	144	this->finalize(p1, p2);
	145	}
	146
	147	template <class charT, class traits>
	148	void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position)
	149	{
	150	if(0 == this->m_pdata->m_status) // update the error code if not already set
	151	this->m_pdata->m_status = error_code;
	152	m_position = m_end; // don't bother parsing anything else
	153	// get the error message:
	154	std::string message = this->m_pdata->m_ptraits->error_string(error_code);
	155	// and raise the exception, this will do nothing if exceptions are disabled:
	156	#ifndef BOOST_NO_EXCEPTIONS
	157	if(0 == (this->flags() & regex_constants::no_except))
	158	{
	159	boost::regex_error e(message, error_code, position);
	160	e.raise();
	161	}
	162	#else
	163	(void)position; // suppress warnings.
	164	#endif
	165	}
	166
	167	template <class charT, class traits>
	168	bool basic_regex_parser<charT, traits>::parse_all()
	169	{
	170	bool result = true;
	171	while(result && (m_position != m_end))
	172	{
	173	result = (this->*m_parser_proc)();
	174	}
	175	return result;
	176	}
	177
	178	#ifdef BOOST_MSVC
	179	#pragma warning(push)
	180	#pragma warning(disable:4702)
	181	#endif
	182	template <class charT, class traits>
	183	bool basic_regex_parser<charT, traits>::parse_basic()
	184	{
	185	switch(this->m_traits.syntax_type(*m_position))
	186	{
	187	case regex_constants::syntax_escape:
	188	return parse_basic_escape();
	189	case regex_constants::syntax_dot:
	190	return parse_match_any();
	191	case regex_constants::syntax_caret:
	192	++m_position;
	193	this->append_state(syntax_element_start_line);
	194	break;
	195	case regex_constants::syntax_dollar:
	196	++m_position;
	197	this->append_state(syntax_element_end_line);
	198	break;
	199	case regex_constants::syntax_star:
	200	if(!(this->m_last_state) \|\| (this->m_last_state->type == syntax_element_start_line))
	201	return parse_literal();
	202	else
	203	{
	204	++m_position;
	205	return parse_repeat();
	206	}
	207	case regex_constants::syntax_plus:
	208	if(!(this->m_last_state) \|\| (this->m_last_state->type == syntax_element_start_line) \|\| !(this->flags() & regbase::emacs_ex))
	209	return parse_literal();
	210	else
	211	{
	212	++m_position;
	213	return parse_repeat(1);
	214	}
	215	case regex_constants::syntax_question:
	216	if(!(this->m_last_state) \|\| (this->m_last_state->type == syntax_element_start_line) \|\| !(this->flags() & regbase::emacs_ex))
	217	return parse_literal();
	218	else
	219	{
	220	++m_position;
	221	return parse_repeat(0, 1);
	222	}
	223	case regex_constants::syntax_open_set:
	224	return parse_set();
	225	default:
	226	return parse_literal();
	227	}
	228	return true;
	229	}
	230
	231	template <class charT, class traits>
	232	bool basic_regex_parser<charT, traits>::parse_extended()
	233	{
	234	bool result = true;
	235	switch(this->m_traits.syntax_type(*m_position))
	236	{
	237	case regex_constants::syntax_open_mark:
	238	return parse_open_paren();
	239	case regex_constants::syntax_close_mark:
	240	return false;
	241	case regex_constants::syntax_escape:
	242	return parse_extended_escape();
	243	case regex_constants::syntax_dot:
	244	return parse_match_any();
	245	case regex_constants::syntax_caret:
	246	++m_position;
	247	this->append_state(
	248	(this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_start : syntax_element_start_line));
	249	break;
	250	case regex_constants::syntax_dollar:
	251	++m_position;
	252	this->append_state(
	253	(this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_end : syntax_element_end_line));
	254	break;
	255	case regex_constants::syntax_star:
	256	if(m_position == this->m_base)
	257	{
	258	fail(regex_constants::error_badrepeat, 0);
	259	return false;
	260	}
	261	++m_position;
	262	return parse_repeat();
	263	case regex_constants::syntax_question:
	264	if(m_position == this->m_base)
	265	{
	266	fail(regex_constants::error_badrepeat, 0);
	267	return false;
	268	}
	269	++m_position;
	270	return parse_repeat(0,1);
	271	case regex_constants::syntax_plus:
	272	if(m_position == this->m_base)
	273	{
	274	fail(regex_constants::error_badrepeat, 0);
	275	return false;
	276	}
	277	++m_position;
	278	return parse_repeat(1);
	279	case regex_constants::syntax_open_brace:
	280	++m_position;
	281	return parse_repeat_range(false);
	282	case regex_constants::syntax_close_brace:
	283	fail(regex_constants::error_brace, this->m_position - this->m_end);
	284	return false;
	285	case regex_constants::syntax_or:
	286	return parse_alt();
	287	case regex_constants::syntax_open_set:
	288	return parse_set();
	289	case regex_constants::syntax_hash:
	290	//
	291	// If we have a mod_x flag set, then skip until
	292	// we get to a newline character:
	293	//
	294	if((this->flags()
	295	& (regbase::no_perl_ex\|regbase::mod_x))
	296	== regbase::mod_x)
	297	{
	298	while((m_position != m_end) && !is_separator(*m_position++)){}
	299	return true;
	300	}
	301	// Otherwise fall through:
	302	default:
	303	result = parse_literal();
	304	break;
	305	}
	306	return result;
	307	}
	308	#ifdef BOOST_MSVC
	309	#pragma warning(pop)
	310	#endif
	311
	312	template <class charT, class traits>
	313	bool basic_regex_parser<charT, traits>::parse_literal()
	314	{
	315	// append this as a literal provided it's not a space character
	316	// or the perl option regbase::mod_x is not set:
	317	if(
	318	((this->flags()
	319	& (regbase::main_option_type\|regbase::mod_x\|regbase::no_perl_ex))
	320	!= regbase::mod_x)
	321	\|\| !this->m_traits.isctype(*m_position, this->m_mask_space))
	322	this->append_literal(*m_position);
	323	++m_position;
	324	return true;
	325	}
	326
	327	template <class charT, class traits>
	328	bool basic_regex_parser<charT, traits>::parse_open_paren()
	329	{
	330	//
	331	// skip the '(' and error check:
	332	//
	333	if(++m_position == m_end)
	334	{
	335	fail(regex_constants::error_paren, m_position - m_base);
	336	return false;
	337	}
	338	//
	339	// begin by checking for a perl-style (?...) extension:
	340	//
	341	if(
	342	((this->flags() & (regbase::main_option_type \| regbase::no_perl_ex)) == 0)
	343	\|\| ((this->flags() & (regbase::main_option_type \| regbase::emacs_ex)) == (regbase::basic_syntax_group\|regbase::emacs_ex))
	344	)
	345	{
	346	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
	347	return parse_perl_extension();
	348	}
	349	//
	350	// update our mark count, and append the required state:
	351	//
	352	unsigned markid = 0;
	353	if(0 == (this->flags() & regbase::nosubs))
	354	markid = ++m_mark_count;
	355	re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
	356	pb->index = markid;
	357	std::ptrdiff_t last_paren_start = this->getoffset(pb);
	358	// back up insertion point for alternations, and set new point:
	359	std::ptrdiff_t last_alt_point = m_alt_insert_point;
	360	this->m_pdata->m_data.align();
	361	m_alt_insert_point = this->m_pdata->m_data.size();
	362	//
	363	// back up the current flags in case we have a nested (?imsx) group:
	364	//
	365	regex_constants::syntax_option_type opts = this->flags();
	366	bool old_case_change = m_has_case_change;
	367	m_has_case_change = false; // no changes to this scope as yet...
	368	//
	369	// now recursively add more states, this will terminate when we get to a
	370	// matching ')' :
	371	//
	372	parse_all();
	373	//
	374	// Unwind pushed alternatives:
	375	//
	376	if(0 == unwind_alts(last_paren_start))
	377	return false;
	378	//
	379	// restore flags:
	380	//
	381	if(m_has_case_change)
	382	{
	383	// the case has changed in one or more of the alternatives
	384	// within the scoped (...) block: we have to add a state
	385	// to reset the case sensitivity:
	386	static_cast<re_case*>(
	387	this->append_state(syntax_element_toggle_case, sizeof(re_case))
	388	)->icase = opts & regbase::icase;
	389	}
	390	this->flags(opts);
	391	m_has_case_change = old_case_change;
	392	//
	393	// we either have a ')' or we have run out of characters prematurely:
	394	//
	395	if(m_position == m_end)
	396	{
	397	this->fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_end));
	398	return false;
	399	}
	400	BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
	401	++m_position;
	402	//
	403	// append closing parenthesis state:
	404	//
	405	pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
	406	pb->index = markid;
	407	this->m_paren_start = last_paren_start;
	408	//
	409	// restore the alternate insertion point:
	410	//
	411	this->m_alt_insert_point = last_alt_point;
	412	//
	413	// allow backrefs to this mark:
	414	//
	415	if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT))
	416	this->m_backrefs \|= 1u << (markid - 1);
	417
	418	return true;
	419	}
	420
	421	template <class charT, class traits>
	422	bool basic_regex_parser<charT, traits>::parse_basic_escape()
	423	{
	424	++m_position;
	425	bool result = true;
	426	switch(this->m_traits.escape_syntax_type(*m_position))
	427	{
	428	case regex_constants::syntax_open_mark:
	429	return parse_open_paren();
	430	case regex_constants::syntax_close_mark:
	431	return false;
	432	case regex_constants::syntax_plus:
	433	if(this->flags() & regex_constants::bk_plus_qm)
	434	{
	435	++m_position;
	436	return parse_repeat(1);
	437	}
	438	else
	439	return parse_literal();
	440	case regex_constants::syntax_question:
	441	if(this->flags() & regex_constants::bk_plus_qm)
	442	{
	443	++m_position;
	444	return parse_repeat(0, 1);
	445	}
	446	else
	447	return parse_literal();
	448	case regex_constants::syntax_open_brace:
	449	if(this->flags() & regbase::no_intervals)
	450	return parse_literal();
	451	++m_position;
	452	return parse_repeat_range(true);
	453	case regex_constants::syntax_close_brace:
	454	if(this->flags() & regbase::no_intervals)
	455	return parse_literal();
	456	fail(regex_constants::error_brace, this->m_position - this->m_base);
	457	return false;
	458	case regex_constants::syntax_or:
	459	if(this->flags() & regbase::bk_vbar)
	460	return parse_alt();
	461	else
	462	result = parse_literal();
	463	break;
	464	case regex_constants::syntax_digit:
	465	return parse_backref();
	466	case regex_constants::escape_type_start_buffer:
	467	if(this->flags() & regbase::emacs_ex)
	468	{
	469	++m_position;
	470	this->append_state(syntax_element_buffer_start);
	471	}
	472	else
	473	result = parse_literal();
	474	break;
	475	case regex_constants::escape_type_end_buffer:
	476	if(this->flags() & regbase::emacs_ex)
	477	{
	478	++m_position;
	479	this->append_state(syntax_element_buffer_end);
	480	}
	481	else
	482	result = parse_literal();
	483	break;
	484	case regex_constants::escape_type_word_assert:
	485	if(this->flags() & regbase::emacs_ex)
	486	{
	487	++m_position;
	488	this->append_state(syntax_element_word_boundary);
	489	}
	490	else
	491	result = parse_literal();
	492	break;
	493	case regex_constants::escape_type_not_word_assert:
	494	if(this->flags() & regbase::emacs_ex)
	495	{
	496	++m_position;
	497	this->append_state(syntax_element_within_word);
	498	}
	499	else
	500	result = parse_literal();
	501	break;
	502	case regex_constants::escape_type_left_word:
	503	if(this->flags() & regbase::emacs_ex)
	504	{
	505	++m_position;
	506	this->append_state(syntax_element_word_start);
	507	}
	508	else
	509	result = parse_literal();
	510	break;
	511	case regex_constants::escape_type_right_word:
	512	if(this->flags() & regbase::emacs_ex)
	513	{
	514	++m_position;
	515	this->append_state(syntax_element_word_end);
	516	}
	517	else
	518	result = parse_literal();
	519	break;
	520	default:
	521	if(this->flags() & regbase::emacs_ex)
	522	{
	523	bool negate = true;
	524	switch(*m_position)
	525	{
	526	case 'w':
	527	negate = false;
	528	// fall through:
	529	case 'W':
	530	{
	531	basic_char_set<charT, traits> char_set;
	532	if(negate)
	533	char_set.negate();
	534	char_set.add_class(this->m_word_mask);
	535	if(0 == this->append_set(char_set))
	536	{
	537	fail(regex_constants::error_ctype, m_position - m_base);
	538	return false;
	539	}
	540	++m_position;
	541	return true;
	542	}
	543	case 's':
	544	negate = false;
	545	// fall through:
	546	case 'S':
	547	return add_emacs_code(negate);
	548	case 'c':
	549	case 'C':
	550	// not supported yet:
	551	fail(regex_constants::error_escape, m_position - m_base);
	552	return false;
	553	default:
	554	break;
	555	}
	556	}
	557	result = parse_literal();
	558	break;
	559	}
	560	return result;
	561	}
	562
	563	template <class charT, class traits>
	564	bool basic_regex_parser<charT, traits>::parse_extended_escape()
	565	{
	566	++m_position;
	567	bool negate = false; // in case this is a character class escape: \w \d etc
	568	switch(this->m_traits.escape_syntax_type(*m_position))
	569	{
	570	case regex_constants::escape_type_not_class:
	571	negate = true;
	572	// fall through:
	573	case regex_constants::escape_type_class:
	574	{
	575	typedef typename traits::char_class_type mask_type;
	576	mask_type m = this->m_traits.lookup_classname(m_position, m_position+1);
	577	if(m != 0)
	578	{
	579	basic_char_set<charT, traits> char_set;
	580	if(negate)
	581	char_set.negate();
	582	char_set.add_class(m);
	583	if(0 == this->append_set(char_set))
	584	{
	585	fail(regex_constants::error_ctype, m_position - m_base);
	586	return false;
	587	}
	588	++m_position;
	589	return true;
	590	}
	591	//
	592	// not a class, just a regular unknown escape:
	593	//
	594	this->append_literal(unescape_character());
	595	break;
	596	}
	597	case regex_constants::syntax_digit:
	598	return parse_backref();
	599	case regex_constants::escape_type_left_word:
	600	++m_position;
	601	this->append_state(syntax_element_word_start);
	602	break;
	603	case regex_constants::escape_type_right_word:
	604	++m_position;
	605	this->append_state(syntax_element_word_end);
	606	break;
	607	case regex_constants::escape_type_start_buffer:
	608	++m_position;
	609	this->append_state(syntax_element_buffer_start);
	610	break;
	611	case regex_constants::escape_type_end_buffer:
	612	++m_position;
	613	this->append_state(syntax_element_buffer_end);
	614	break;
	615	case regex_constants::escape_type_word_assert:
	616	++m_position;
	617	this->append_state(syntax_element_word_boundary);
	618	break;
	619	case regex_constants::escape_type_not_word_assert:
	620	++m_position;
	621	this->append_state(syntax_element_within_word);
	622	break;
	623	case regex_constants::escape_type_Z:
	624	++m_position;
	625	this->append_state(syntax_element_soft_buffer_end);
	626	break;
	627	case regex_constants::escape_type_Q:
	628	return parse_QE();
	629	case regex_constants::escape_type_C:
	630	return parse_match_any();
	631	case regex_constants::escape_type_X:
	632	++m_position;
	633	this->append_state(syntax_element_combining);
	634	break;
	635	case regex_constants::escape_type_G:
	636	++m_position;
	637	this->append_state(syntax_element_restart_continue);
	638	break;
	639	case regex_constants::escape_type_not_property:
	640	negate = true;
	641	// fall through:
	642	case regex_constants::escape_type_property:
	643	{
	644	++m_position;
	645	char_class_type m;
	646	if(m_position == m_end)
	647	{
	648	fail(regex_constants::error_escape, m_position - m_base);
	649	return false;
	650	}
	651	// maybe have \p{ddd}
	652	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
	653	{
	654	const charT* base = m_position;
	655	// skip forward until we find enclosing brace:
	656	while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
	657	++m_position;
	658	if(m_position == m_end)
	659	{
	660	fail(regex_constants::error_escape, m_position - m_base);
	661	return false;
	662	}
	663	m = this->m_traits.lookup_classname(++base, m_position++);
	664	}
	665	else
	666	{
	667	m = this->m_traits.lookup_classname(m_position, m_position+1);
	668	++m_position;
	669	}
	670	if(m != 0)
	671	{
	672	basic_char_set<charT, traits> char_set;
	673	if(negate)
	674	char_set.negate();
	675	char_set.add_class(m);
	676	if(0 == this->append_set(char_set))
	677	{
	678	fail(regex_constants::error_ctype, m_position - m_base);
	679	return false;
	680	}
	681	return true;
	682	}
	683	fail(regex_constants::error_ctype, m_position - m_base);
	684	}
	685	default:
	686	this->append_literal(unescape_character());
	687	break;
	688	}
	689	return true;
	690	}
	691
	692	template <class charT, class traits>
	693	bool basic_regex_parser<charT, traits>::parse_match_any()
	694	{
	695	//
	696	// we have a '.' that can match any character:
	697	//
	698	++m_position;
	699	static_cast<re_dot*>(
	700	this->append_state(syntax_element_wild, sizeof(re_dot))
	701	)->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s
	702	? re_detail::force_not_newline
	703	: this->flags() & regbase::mod_s ?
	704	re_detail::force_newline : re_detail::dont_care);
	705	return true;
	706	}
	707
	708	template <class charT, class traits>
	709	bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
	710	{
	711	bool greedy = true;
	712	std::size_t insert_point;
	713	//
	714	// when we get to here we may have a non-greedy ? mark still to come:
	715	//
	716	if((m_position != m_end)
	717	&& (
	718	(0 == (this->flags() & (regbase::main_option_type \| regbase::no_perl_ex)))
	719	\|\| ((regbase::basic_syntax_group\|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type \| regbase::emacs_ex)))
	720	)
	721	)
	722	{
	723	// OK we have a perl regex, check for a '?':
	724	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
	725	{
	726	greedy = false;
	727	++m_position;
	728	}
	729	}
	730	if(0 == this->m_last_state)
	731	{
	732	fail(regex_constants::error_badrepeat, ::boost::re_detail::distance(m_base, m_position));
	733	return false;
	734	}
	735	if(this->m_last_state->type == syntax_element_endmark)
	736	{
	737	// insert a repeat before the '(' matching the last ')':
	738	insert_point = this->m_paren_start;
	739	}
	740	else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1))
	741	{
	742	// the last state was a literal with more than one character, split it in two:
	743	re_literal* lit = static_cast<re_literal*>(this->m_last_state);
	744	charT c = (static_cast<charT>(static_cast<void>(lit+1)))[lit->length - 1];
	745	--(lit->length);
	746	// now append new state:
	747	lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
	748	lit->length = 1;
	749	(static_cast<charT>(static_cast<void>(lit+1)))[0] = c;
	750	insert_point = this->getoffset(this->m_last_state);
	751	}
	752	else
	753	{
	754	// repeat the last state whatever it was, need to add some error checking here:
	755	switch(this->m_last_state->type)
	756	{
	757	case syntax_element_start_line:
	758	case syntax_element_end_line:
	759	case syntax_element_word_boundary:
	760	case syntax_element_within_word:
	761	case syntax_element_word_start:
	762	case syntax_element_word_end:
	763	case syntax_element_buffer_start:
	764	case syntax_element_buffer_end:
	765	case syntax_element_alt:
	766	case syntax_element_soft_buffer_end:
	767	case syntax_element_restart_continue:
	768	case syntax_element_jump:
	769	case syntax_element_startmark:
	770	// can't legally repeat any of the above:
	771	fail(regex_constants::error_badrepeat, m_position - m_base);
	772	return false;
	773	default:
	774	// do nothing...
	775	break;
	776	}
	777	insert_point = this->getoffset(this->m_last_state);
	778	}
	779	//
	780	// OK we now know what to repeat, so insert the repeat around it:
	781	//
	782	re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size));
	783	rep->min = low;
	784	rep->max = high;
	785	rep->greedy = greedy;
	786	rep->leading = false;
	787	// store our repeater position for later:
	788	std::ptrdiff_t rep_off = this->getoffset(rep);
	789	// and append a back jump to the repeat:
	790	re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump)));
	791	jmp->alt.i = rep_off - this->getoffset(jmp);
	792	this->m_pdata->m_data.align();
	793	// now fill in the alt jump for the repeat:
	794	rep = static_cast<re_repeat*>(this->getaddress(rep_off));
	795	rep->alt.i = this->m_pdata->m_data.size() - rep_off;
	796	return true;
	797	}
	798
	799	template <class charT, class traits>
	800	bool basic_regex_parser<charT, traits>::parse_repeat_range(bool isbasic)
	801	{
	802	//
	803	// parse a repeat-range:
	804	//
	805	std::size_t min, max;
	806	int v;
	807	// skip whitespace:
	808	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
	809	++m_position;
	810	// fail if at end:
	811	if(this->m_position == this->m_end)
	812	{
	813	fail(regex_constants::error_brace, this->m_position - this->m_base);
	814	return false;
	815	}
	816	// get min:
	817	v = this->m_traits.toi(m_position, m_end, 10);
	818	// skip whitespace:
	819	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
	820	++m_position;
	821	if(v < 0)
	822	{
	823	fail(regex_constants::error_badbrace, this->m_position - this->m_base);
	824	return false;
	825	}
	826	else if(this->m_position == this->m_end)
	827	{
	828	fail(regex_constants::error_brace, this->m_position - this->m_base);
	829	return false;
	830	}
	831	min = v;
	832	// see if we have a comma:
	833	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_comma)
	834	{
	835	// move on and error check:
	836	++m_position;
	837	// skip whitespace:
	838	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
	839	++m_position;
	840	if(this->m_position == this->m_end)
	841	{
	842	fail(regex_constants::error_brace, this->m_position - this->m_base);
	843	return false;
	844	}
	845	// get the value if any:
	846	v = this->m_traits.toi(m_position, m_end, 10);
	847	max = (v >= 0) ? v : (std::numeric_limits<std::size_t>::max)();
	848	}
	849	else
	850	{
	851	// no comma, max = min:
	852	max = min;
	853	}
	854	// skip whitespace:
	855	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
	856	++m_position;
	857	// OK now check trailing }:
	858	if(this->m_position == this->m_end)
	859	{
	860	fail(regex_constants::error_brace, this->m_position - this->m_base);
	861	return false;
	862	}
	863	if(isbasic)
	864	{
	865	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_escape)
	866	{
	867	++m_position;
	868	if(this->m_position == this->m_end)
	869	{
	870	fail(regex_constants::error_brace, this->m_position - this->m_base);
	871	return false;
	872	}
	873	}
	874	else
	875	{
	876	fail(regex_constants::error_badbrace, this->m_position - this->m_base);
	877	return false;
	878	}
	879	}
	880	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_brace)
	881	++m_position;
	882	else
	883	{
	884	fail(regex_constants::error_badbrace, this->m_position - this->m_base);
	885	return false;
	886	}
	887	//
	888	// finally go and add the repeat, unless error:
	889	//
	890	if(min > max)
	891	{
	892	fail(regex_constants::error_range, this->m_position - this->m_base);
	893	return false;
	894	}
	895	return parse_repeat(min, max);
	896	}
	897
	898	template <class charT, class traits>
	899	bool basic_regex_parser<charT, traits>::parse_alt()
	900	{
	901	//
	902	// error check: if there have been no previous states,
	903	// or if the last state was a '(' then error:
	904	//
	905	if((this->m_last_state == 0) \|\| (this->m_last_state->type == syntax_element_startmark))
	906	{
	907	fail(regex_constants::error_empty, this->m_position - this->m_base);
	908	return false;
	909	}
	910	++m_position;
	911	//
	912	// we need to append a trailing jump:
	913	//
	914	re_syntax_base* pj = this->append_state(re_detail::syntax_element_jump, sizeof(re_jump));
	915	std::ptrdiff_t jump_offset = this->getoffset(pj);
	916	//
	917	// now insert the alternative:
	918	//
	919	re_alt* palt = static_cast<re_alt*>(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size));
	920	jump_offset += re_alt_size;
	921	this->m_pdata->m_data.align();
	922	palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt);
	923	//
	924	// update m_alt_insert_point so that the next alternate gets
	925	// inserted at the start of the second of the two we've just created:
	926	//
	927	this->m_alt_insert_point = this->m_pdata->m_data.size();
	928	//
	929	// the start of this alternative must have a case changes state
	930	// if the current block has messed around with case changes:
	931	//
	932	if(m_has_case_change)
	933	{
	934	static_cast<re_case*>(
	935	this->append_state(syntax_element_toggle_case, sizeof(re_case))
	936	)->icase = this->m_icase;
	937	}
	938	//
	939	// push the alternative onto our stack, a recursive
	940	// implementation here is easier to understand (and faster
	941	// as it happens), but causes all kinds of stack overflow problems
	942	// on programs with small stacks (COM+).
	943	//
	944	m_alt_jumps.push_back(jump_offset);
	945	return true;
	946	}
	947
	948	template <class charT, class traits>
	949	bool basic_regex_parser<charT, traits>::parse_set()
	950	{
	951	++m_position;
	952	if(m_position == m_end)
	953	{
	954	fail(regex_constants::error_brack, m_position - m_base);
	955	return false;
	956	}
	957	basic_char_set<charT, traits> char_set;
	958
	959	const charT* base = m_position; // where the '[' was
	960	const charT* item_base = m_position; // where the '[' or '^' was
	961
	962	while(m_position != m_end)
	963	{
	964	switch(this->m_traits.syntax_type(*m_position))
	965	{
	966	case regex_constants::syntax_caret:
	967	if(m_position == base)
	968	{
	969	char_set.negate();
	970	++m_position;
	971	item_base = m_position;
	972	}
	973	else
	974	parse_set_literal(char_set);
	975	break;
	976	case regex_constants::syntax_close_set:
	977	if(m_position == item_base)
	978	{
	979	parse_set_literal(char_set);
	980	break;
	981	}
	982	else
	983	{
	984	++m_position;
	985	if(0 == this->append_set(char_set))
	986	{
	987	fail(regex_constants::error_range, m_position - m_base);
	988	return false;
	989	}
	990	}
	991	return true;
	992	case regex_constants::syntax_open_set:
	993	if(parse_inner_set(char_set))
	994	break;
	995	return true;
	996	case regex_constants::syntax_escape:
	997	{
	998	//
	999	// look ahead and see if this is a character class shortcut
	1000	// \d \w \s etc...
	1001	//
	1002	++m_position;
	1003	if(this->m_traits.escape_syntax_type(*m_position)
	1004	== regex_constants::escape_type_class)
	1005	{
	1006	char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
	1007	if(m != 0)
	1008	{
	1009	char_set.add_class(m);
	1010	++m_position;
	1011	break;
	1012	}
	1013	}
	1014	else if(this->m_traits.escape_syntax_type(*m_position)
	1015	== regex_constants::escape_type_not_class)
	1016	{
	1017	// negated character class:
	1018	char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
	1019	if(m != 0)
	1020	{
	1021	char_set.add_negated_class(m);
	1022	++m_position;
	1023	break;
	1024	}
	1025	}
	1026	// not a character class, just a regular escape:
	1027	--m_position;
	1028	parse_set_literal(char_set);
	1029	break;
	1030	}
	1031	default:
	1032	parse_set_literal(char_set);
	1033	break;
	1034	}
	1035	}
	1036	return m_position != m_end;
	1037	}
	1038
	1039	template <class charT, class traits>
	1040	bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, traits>& char_set)
	1041	{
	1042	//
	1043	// we have either a character class [:name:]
	1044	// a collating element [.name.]
	1045	// or an equivalence class [=name=]
	1046	//
	1047	if(m_end == ++m_position)
	1048	{
	1049	fail(regex_constants::error_brack, m_position - m_base);
	1050	return false;
	1051	}
	1052	switch(this->m_traits.syntax_type(*m_position))
	1053	{
	1054	case regex_constants::syntax_dot:
	1055	//
	1056	// a collating element is treated as a literal:
	1057	//
	1058	--m_position;
	1059	parse_set_literal(char_set);
	1060	return true;
	1061	case regex_constants::syntax_colon:
	1062	{
	1063	// check that character classes are actually enabled:
	1064	if((this->flags() & (regbase::main_option_type \| regbase::no_char_classes))
	1065	== (regbase::basic_syntax_group \| regbase::no_char_classes))
	1066	{
	1067	--m_position;
	1068	parse_set_literal(char_set);
	1069	return true;
	1070	}
	1071	// skip the ':'
	1072	if(m_end == ++m_position)
	1073	{
	1074	fail(regex_constants::error_brack, m_position - m_base);
	1075	return false;
	1076	}
	1077	const charT* name_first = m_position;
	1078	// skip at least one character, then find the matching ':]'
	1079	if(m_end == ++m_position)
	1080	{
	1081	fail(regex_constants::error_brack, m_position - m_base);
	1082	return false;
	1083	}
	1084	while((m_position != m_end)
	1085	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_colon))
	1086	++m_position;
	1087	const charT* name_last = m_position;
	1088	if(m_end == m_position)
	1089	{
	1090	fail(regex_constants::error_brack, m_position - m_base);
	1091	return false;
	1092	}
	1093	if((m_end == ++m_position)
	1094	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
	1095	{
	1096	fail(regex_constants::error_brack, m_position - m_base);
	1097	return false;
	1098	}
	1099	//
	1100	// check for negated class:
	1101	//
	1102	bool negated = false;
	1103	if(this->m_traits.syntax_type(*name_first) == regex_constants::syntax_caret)
	1104	{
	1105	++name_first;
	1106	negated = true;
	1107	}
	1108	typedef typename traits::char_class_type mask_type;
	1109	mask_type m = this->m_traits.lookup_classname(name_first, name_last);
	1110	if(m == 0)
	1111	{
	1112	if(char_set.empty() && (name_last - name_first == 1))
	1113	{
	1114	// maybe a special case:
	1115	++m_position;
	1116	if( (m_position != m_end)
	1117	&& (this->m_traits.syntax_type(*m_position)
	1118	== regex_constants::syntax_close_set))
	1119	{
	1120	if(this->m_traits.escape_syntax_type(*name_first)
	1121	== regex_constants::escape_type_left_word)
	1122	{
	1123	++m_position;
	1124	this->append_state(syntax_element_word_start);
	1125	return false;
	1126	}
	1127	if(this->m_traits.escape_syntax_type(*name_first)
	1128	== regex_constants::escape_type_right_word)
	1129	{
	1130	++m_position;
	1131	this->append_state(syntax_element_word_end);
	1132	return false;
	1133	}
	1134	}
	1135	}
	1136	fail(regex_constants::error_ctype, name_first - m_base);
	1137	return false;
	1138	}
	1139	if(negated == false)
	1140	char_set.add_class(m);
	1141	else
	1142	char_set.add_negated_class(m);
	1143	++m_position;
	1144	break;
	1145	}
	1146	case regex_constants::syntax_equal:
	1147	{
	1148	// skip the '='
	1149	if(m_end == ++m_position)
	1150	{
	1151	fail(regex_constants::error_brack, m_position - m_base);
	1152	return false;
	1153	}
	1154	const charT* name_first = m_position;
	1155	// skip at least one character, then find the matching '=]'
	1156	if(m_end == ++m_position)
	1157	{
	1158	fail(regex_constants::error_brack, m_position - m_base);
	1159	return false;
	1160	}
	1161	while((m_position != m_end)
	1162	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal))
	1163	++m_position;
	1164	const charT* name_last = m_position;
	1165	if(m_end == m_position)
	1166	{
	1167	fail(regex_constants::error_brack, m_position - m_base);
	1168	return false;
	1169	}
	1170	if((m_end == ++m_position)
	1171	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
	1172	{
	1173	fail(regex_constants::error_brack, m_position - m_base);
	1174	return false;
	1175	}
	1176	string_type m = this->m_traits.lookup_collatename(name_first, name_last);
	1177	if((0 == m.size()) \|\| (m.size() > 2))
	1178	{
	1179	fail(regex_constants::error_collate, name_first - m_base);
	1180	return false;
	1181	}
	1182	digraph<charT> d;
	1183	d.first = m[0];
	1184	if(m.size() > 1)
	1185	d.second = m[1];
	1186	else
	1187	d.second = 0;
	1188	char_set.add_equivalent(d);
	1189	++m_position;
	1190	break;
	1191	}
	1192	default:
	1193	--m_position;
	1194	parse_set_literal(char_set);
	1195	break;
	1196	}
	1197	return true;
	1198	}
	1199
	1200	template <class charT, class traits>
	1201	void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set)
	1202	{
	1203	digraph<charT> start_range(get_next_set_literal(char_set));
	1204	if(m_end == m_position)
	1205	{
	1206	fail(regex_constants::error_brack, m_position - m_base);
	1207	return;
	1208	}
	1209	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
	1210	{
	1211	// we have a range:
	1212	if(m_end == ++m_position)
	1213	{
	1214	fail(regex_constants::error_brack, m_position - m_base);
	1215	return;
	1216	}
	1217	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)
	1218	{
	1219	digraph<charT> end_range = get_next_set_literal(char_set);
	1220	char_set.add_range(start_range, end_range);
	1221	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
	1222	{
	1223	if(m_end == ++m_position)
	1224	{
	1225	fail(regex_constants::error_brack, m_position - m_base);
	1226	return;
	1227	}
	1228	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_set)
	1229	{
	1230	// trailing - :
	1231	--m_position;
	1232	return;
	1233	}
	1234	fail(regex_constants::error_range, m_position - m_base);
	1235	return;
	1236	}
	1237	return;
	1238	}
	1239	--m_position;
	1240	}
	1241	char_set.add_single(start_range);
	1242	}
	1243
	1244	template <class charT, class traits>
	1245	digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set)
	1246	{
	1247	typedef typename traits::string_type string_type;
	1248	digraph<charT> result;
	1249	switch(this->m_traits.syntax_type(*m_position))
	1250	{
	1251	case regex_constants::syntax_dash:
	1252	if(!char_set.empty())
	1253	{
	1254	// see if we are at the end of the set:
	1255	if((++m_position == m_end) \|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
	1256	{
	1257	fail(regex_constants::error_range, m_position - m_base);
	1258	return result;
	1259	}
	1260	--m_position;
	1261	}
	1262	result.first = *m_position++;
	1263	return result;
	1264	case regex_constants::syntax_escape:
	1265	// check to see if escapes are supported first:
	1266	if(this->flags() & regex_constants::no_escape_in_lists)
	1267	{
	1268	result = *m_position++;
	1269	break;
	1270	}
	1271	++m_position;
	1272	result = unescape_character();
	1273	break;
	1274	case regex_constants::syntax_open_set:
	1275	{
	1276	if(m_end == ++m_position)
	1277	{
	1278	fail(regex_constants::error_collate, m_position - m_base);
	1279	return result;
	1280	}
	1281	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)
	1282	{
	1283	--m_position;
	1284	result.first = *m_position;
	1285	++m_position;
	1286	return result;
	1287	}
	1288	if(m_end == ++m_position)
	1289	{
	1290	fail(regex_constants::error_collate, m_position - m_base);
	1291	return result;
	1292	}
	1293	const charT* name_first = m_position;
	1294	// skip at least one character, then find the matching ':]'
	1295	if(m_end == ++m_position)
	1296	{
	1297	fail(regex_constants::error_collate, name_first - m_base);
	1298	return result;
	1299	}
	1300	while((m_position != m_end)
	1301	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot))
	1302	++m_position;
	1303	const charT* name_last = m_position;
	1304	if(m_end == m_position)
	1305	{
	1306	fail(regex_constants::error_collate, name_first - m_base);
	1307	return result;
	1308	}
	1309	if((m_end == ++m_position)
	1310	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
	1311	{
	1312	fail(regex_constants::error_collate, name_first - m_base);
	1313	return result;
	1314	}
	1315	++m_position;
	1316	string_type s = this->m_traits.lookup_collatename(name_first, name_last);
	1317	if(s.empty() \|\| (s.size() > 2))
	1318	{
	1319	fail(regex_constants::error_collate, name_first - m_base);
	1320	return result;
	1321	}
	1322	result.first = s[0];
	1323	if(s.size() > 1)
	1324	result.second = s[1];
	1325	else
	1326	result.second = 0;
	1327	return result;
	1328	}
	1329	default:
	1330	result = *m_position++;
	1331	}
	1332	return result;
	1333	}
	1334
	1335	//
	1336	// does a value fit in the specified charT type?
	1337	//
	1338	template <class charT>
	1339	bool valid_value(charT, int v, const mpl::true_&)
	1340	{
	1341	return (v >> (sizeof(charT) * CHAR_BIT)) == 0;
	1342	}
	1343	template <class charT>
	1344	bool valid_value(charT, int, const mpl::false_&)
	1345	{
	1346	return true; // v will alsways fit in a charT
	1347	}
	1348	template <class charT>
	1349	bool valid_value(charT c, int v)
	1350	{
	1351	return valid_value(c, v, mpl::bool_<(sizeof(charT) < sizeof(int))>());
	1352	}
	1353
	1354	template <class charT, class traits>
	1355	charT basic_regex_parser<charT, traits>::unescape_character()
	1356	{
	1357	#ifdef BOOST_MSVC
	1358	#pragma warning(push)
	1359	#pragma warning(disable:4127)
	1360	#endif
	1361	charT result(0);
	1362	if(m_position == m_end)
	1363	{
	1364	fail(regex_constants::error_escape, m_position - m_base);
	1365	return false;
	1366	}
	1367	switch(this->m_traits.escape_syntax_type(*m_position))
	1368	{
	1369	case regex_constants::escape_type_control_a:
	1370	result = charT('\a');
	1371	break;
	1372	case regex_constants::escape_type_e:
	1373	result = charT(27);
	1374	break;
	1375	case regex_constants::escape_type_control_f:
	1376	result = charT('\f');
	1377	break;
	1378	case regex_constants::escape_type_control_n:
	1379	result = charT('\n');
	1380	break;
	1381	case regex_constants::escape_type_control_r:
	1382	result = charT('\r');
	1383	break;
	1384	case regex_constants::escape_type_control_t:
	1385	result = charT('\t');
	1386	break;
	1387	case regex_constants::escape_type_control_v:
	1388	result = charT('\v');
	1389	break;
	1390	case regex_constants::escape_type_word_assert:
	1391	result = charT('\b');
	1392	break;
	1393	case regex_constants::escape_type_ascii_control:
	1394	++m_position;
	1395	if(m_position == m_end)
	1396	{
	1397	fail(regex_constants::error_escape, m_position - m_base);
	1398	return result;
	1399	}
	1400	/*
	1401	if((*m_position < charT('@'))
	1402	\|\| (*m_position > charT(125)) )
	1403	{
	1404	fail(regex_constants::error_escape, m_position - m_base);
	1405	return result;
	1406	}
	1407	*/
	1408	result = static_cast<charT>(*m_position % 32);
	1409	break;
	1410	case regex_constants::escape_type_hex:
	1411	++m_position;
	1412	if(m_position == m_end)
	1413	{
	1414	fail(regex_constants::error_escape, m_position - m_base);
	1415	return result;
	1416	}
	1417	// maybe have \x{ddd}
	1418	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
	1419	{
	1420	++m_position;
	1421	if(m_position == m_end)
	1422	{
	1423	fail(regex_constants::error_escape, m_position - m_base);
	1424	return result;
	1425	}
	1426	int i = this->m_traits.toi(m_position, m_end, 16);
	1427	if((m_position == m_end)
	1428	\|\| (i < 0)
	1429	\|\| ((std::numeric_limits<charT>::is_specialized) && (charT(i) > (std::numeric_limits<charT>::max)()))
	1430	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
	1431	{
	1432	fail(regex_constants::error_badbrace, m_position - m_base);
	1433	return result;
	1434	}
	1435	++m_position;
	1436	result = charT(i);
	1437	}
	1438	else
	1439	{
	1440	std::ptrdiff_t len = (std::min)(static_cast<std::ptrdiff_t>(2), m_end - m_position);
	1441	int i = this->m_traits.toi(m_position, m_position + len, 16);
	1442	if((i < 0)
	1443	\|\| !valid_value(charT(0), i))
	1444	{
	1445	fail(regex_constants::error_escape, m_position - m_base);
	1446	return result;
	1447	}
	1448	result = charT(i);
	1449	}
	1450	return result;
	1451	case regex_constants::syntax_digit:
	1452	{
	1453	// an octal escape sequence, the first character must be a zero
	1454	// followed by up to 3 octal digits:
	1455	std::ptrdiff_t len = (std::min)(::boost::re_detail::distance(m_position, m_end), static_cast<std::ptrdiff_t>(4));
	1456	const charT* bp = m_position;
	1457	int val = this->m_traits.toi(bp, bp + 1, 8);
	1458	if(val != 0)
	1459	{
	1460	// Oops not an octal escape after all:
	1461	fail(regex_constants::error_escape, m_position - m_base);
	1462	return result;
	1463	}
	1464	val = this->m_traits.toi(m_position, m_position + len, 8);
	1465	if(val < 0)
	1466	{
	1467	fail(regex_constants::error_escape, m_position - m_base);
	1468	return result;
	1469	}
	1470	return static_cast<charT>(val);
	1471	}
	1472	case regex_constants::escape_type_named_char:
	1473	{
	1474	++m_position;
	1475	if(m_position == m_end)
	1476	{
	1477	fail(regex_constants::error_escape, m_position - m_base);
	1478	return false;
	1479	}
	1480	// maybe have \N{name}
	1481	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
	1482	{
	1483	const charT* base = m_position;
	1484	// skip forward until we find enclosing brace:
	1485	while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
	1486	++m_position;
	1487	if(m_position == m_end)
	1488	{
	1489	fail(regex_constants::error_escape, m_position - m_base);
	1490	return false;
	1491	}
	1492	string_type s = this->m_traits.lookup_collatename(++base, m_position++);
	1493	if(s.empty())
	1494	{
	1495	fail(regex_constants::error_collate, m_position - m_base);
	1496	return false;
	1497	}
	1498	if(s.size() == 1)
	1499	{
	1500	return s[0];
	1501	}
	1502	}
	1503	// fall through is a failure:
	1504	fail(regex_constants::error_escape, m_position - m_base);
	1505	return false;
	1506	}
	1507	default:
	1508	result = *m_position;
	1509	break;
	1510	}
	1511	++m_position;
	1512	return result;
	1513	#ifdef BOOST_MSVC
	1514	#pragma warning(pop)
	1515	#endif
	1516	}
	1517
	1518	template <class charT, class traits>
	1519	bool basic_regex_parser<charT, traits>::parse_backref()
	1520	{
	1521	BOOST_ASSERT(m_position != m_end);
	1522	const charT* pc = m_position;
	1523	int i = this->m_traits.toi(pc, pc + 1, 10);
	1524	if((i == 0) \|\| (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs)))
	1525	{
	1526	// not a backref at all but an octal escape sequence:
	1527	charT c = unescape_character();
	1528	this->append_literal(c);
	1529	}
	1530	else if((i > 0) && (this->m_backrefs & (1u << (i-1))))
	1531	{
	1532	m_position = pc;
	1533	re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
	1534	pb->index = i;
	1535	}
	1536	else
	1537	{
	1538	fail(regex_constants::error_backref, m_position - m_end);
	1539	return false;
	1540	}
	1541	return true;
	1542	}
	1543
	1544	template <class charT, class traits>
	1545	bool basic_regex_parser<charT, traits>::parse_QE()
	1546	{
	1547	#ifdef BOOST_MSVC
	1548	#pragma warning(push)
	1549	#pragma warning(disable:4127)
	1550	#endif
	1551	//
	1552	// parse a \Q...\E sequence:
	1553	//
	1554	++m_position; // skip the Q
	1555	const charT* start = m_position;
	1556	const charT* end;
	1557	do
	1558	{
	1559	while((m_position != m_end)
	1560	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape))
	1561	++m_position;
	1562	if(m_position == m_end)
	1563	{
	1564	// a \Q...\E sequence may terminate with the end of the expression:
	1565	end = m_position;
	1566	break;
	1567	}
	1568	if(++m_position == m_end) // skip the escape
	1569	{
	1570	fail(regex_constants::error_escape, m_position - m_base);
	1571	return false;
	1572	}
	1573	// check to see if it's a \E:
	1574	if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E)
	1575	{
	1576	++m_position;
	1577	end = m_position - 2;
	1578	break;
	1579	}
	1580	// otherwise go round again:
	1581	}while(true);
	1582	//
	1583	// now add all the character between the two escapes as literals:
	1584	//
	1585	while(start != end)
	1586	{
	1587	this->append_literal(*start);
	1588	++start;
	1589	}
	1590	return true;
	1591	#ifdef BOOST_MSVC
	1592	#pragma warning(pop)
	1593	#endif
	1594	}
	1595
	1596	template <class charT, class traits>
	1597	bool basic_regex_parser<charT, traits>::parse_perl_extension()
	1598	{
	1599	if(++m_position == m_end)
	1600	{
	1601	fail(regex_constants::error_badrepeat, m_position - m_base);
	1602	return false;
	1603	}
	1604	//
	1605	// treat comments as a special case, as these
	1606	// are the only ones that don't start with a leading
	1607	// startmark state:
	1608	//
	1609	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash)
	1610	{
	1611	while((m_position != m_end)
	1612	&& (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark))
	1613	{}
	1614	return true;
	1615	}
	1616	//
	1617	// backup some state, and prepare the way:
	1618	//
	1619	int markid = 0;
	1620	std::ptrdiff_t jump_offset = 0;
	1621	re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
	1622	std::ptrdiff_t last_paren_start = this->getoffset(pb);
	1623	// back up insertion point for alternations, and set new point:
	1624	std::ptrdiff_t last_alt_point = m_alt_insert_point;
	1625	this->m_pdata->m_data.align();
	1626	m_alt_insert_point = this->m_pdata->m_data.size();
	1627	std::ptrdiff_t expected_alt_point = m_alt_insert_point;
	1628	bool restore_flags = true;
	1629	regex_constants::syntax_option_type old_flags = this->flags();
	1630	bool old_case_change = m_has_case_change;
	1631	m_has_case_change = false;
	1632	//
	1633	// select the actual extension used:
	1634	//
	1635	switch(this->m_traits.syntax_type(*m_position))
	1636	{
	1637	case regex_constants::syntax_colon:
	1638	//
	1639	// a non-capturing mark:
	1640	//
	1641	pb->index = markid = 0;
	1642	++m_position;
	1643	break;
	1644	case regex_constants::syntax_equal:
	1645	pb->index = markid = -1;
	1646	++m_position;
	1647	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
	1648	this->m_pdata->m_data.align();
	1649	m_alt_insert_point = this->m_pdata->m_data.size();
	1650	break;
	1651	case regex_constants::syntax_not:
	1652	pb->index = markid = -2;
	1653	++m_position;
	1654	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
	1655	this->m_pdata->m_data.align();
	1656	m_alt_insert_point = this->m_pdata->m_data.size();
	1657	break;
	1658	case regex_constants::escape_type_left_word:
	1659	{
	1660	// a lookbehind assertion:
	1661	if(++m_position == m_end)
	1662	{
	1663	fail(regex_constants::error_badrepeat, m_position - m_base);
	1664	return false;
	1665	}
	1666	regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position);
	1667	if(t == regex_constants::syntax_not)
	1668	pb->index = markid = -2;
	1669	else if(t == regex_constants::syntax_equal)
	1670	pb->index = markid = -1;
	1671	else
	1672	{
	1673	fail(regex_constants::error_badrepeat, m_position - m_base);
	1674	return false;
	1675	}
	1676	++m_position;
	1677	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
	1678	this->append_state(syntax_element_backstep, sizeof(re_brace));
	1679	this->m_pdata->m_data.align();
	1680	m_alt_insert_point = this->m_pdata->m_data.size();
	1681	break;
	1682	}
	1683	case regex_constants::escape_type_right_word:
	1684	//
	1685	// an independent sub-expression:
	1686	//
	1687	pb->index = markid = -3;
	1688	++m_position;
	1689	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
	1690	this->m_pdata->m_data.align();
	1691	m_alt_insert_point = this->m_pdata->m_data.size();
	1692	break;
	1693	case regex_constants::syntax_open_mark:
	1694	{
	1695	// a conditional expression:
	1696	pb->index = markid = -4;
	1697	if(++m_position == m_end)
	1698	{
	1699	fail(regex_constants::error_badrepeat, m_position - m_base);
	1700	return false;
	1701	}
	1702	int v = this->m_traits.toi(m_position, m_end, 10);
	1703	if(v > 0)
	1704	{
	1705	re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
	1706	br->index = v;
	1707	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
	1708	{
	1709	fail(regex_constants::error_badrepeat, m_position - m_base);
	1710	return false;
	1711	}
	1712	if(++m_position == m_end)
	1713	{
	1714	fail(regex_constants::error_badrepeat, m_position - m_base);
	1715	return false;
	1716	}
	1717	}
	1718	else
	1719	{
	1720	// verify that we have a lookahead or lookbehind assert:
	1721	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question)
	1722	{
	1723	fail(regex_constants::error_badrepeat, m_position - m_base);
	1724	return false;
	1725	}
	1726	if(++m_position == m_end)
	1727	{
	1728	fail(regex_constants::error_badrepeat, m_position - m_base);
	1729	return false;
	1730	}
	1731	if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word)
	1732	{
	1733	if(++m_position == m_end)
	1734	{
	1735	fail(regex_constants::error_badrepeat, m_position - m_base);
	1736	return false;
	1737	}
	1738	if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
	1739	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
	1740	{
	1741	fail(regex_constants::error_badrepeat, m_position - m_base);
	1742	return false;
	1743	}
	1744	m_position -= 3;
	1745	}
	1746	else
	1747	{
	1748	if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
	1749	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
	1750	{
	1751	fail(regex_constants::error_badrepeat, m_position - m_base);
	1752	return false;
	1753	}
	1754	m_position -= 2;
	1755	}
	1756	}
	1757	break;
	1758	}
	1759	case regex_constants::syntax_close_mark:
	1760	fail(regex_constants::error_badrepeat, m_position - m_base);
	1761	return false;
	1762	default:
	1763	//
	1764	// lets assume that we have a (?imsx) group and try and parse it:
	1765	//
	1766	regex_constants::syntax_option_type opts = parse_options();
	1767	if(m_position == m_end)
	1768	return false;
	1769	// make a note of whether we have a case change:
	1770	m_has_case_change = ((opts & regbase::icase) != (this->flags() & regbase::icase));
	1771	pb->index = markid = 0;
	1772	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark)
	1773	{
	1774	// update flags and carry on as normal:
	1775	this->flags(opts);
	1776	restore_flags = false;
	1777	old_case_change \|= m_has_case_change; // defer end of scope by one ')'
	1778	}
	1779	else if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_colon)
	1780	{
	1781	// update flags and carry on until the matching ')' is found:
	1782	this->flags(opts);
	1783	++m_position;
	1784	}
	1785	else
	1786	{
	1787	fail(regex_constants::error_badrepeat, m_position - m_base);
	1788	return false;
	1789	}
	1790
	1791	// finally append a case change state if we need it:
	1792	if(m_has_case_change)
	1793	{
	1794	static_cast<re_case*>(
	1795	this->append_state(syntax_element_toggle_case, sizeof(re_case))
	1796	)->icase = opts & regbase::icase;
	1797	}
	1798
	1799	}
	1800	//
	1801	// now recursively add more states, this will terminate when we get to a
	1802	// matching ')' :
	1803	//
	1804	parse_all();
	1805	//
	1806	// Unwind alternatives:
	1807	//
	1808	if(0 == unwind_alts(last_paren_start))
	1809	return false;
	1810	//
	1811	// we either have a ')' or we have run out of characters prematurely:
	1812	//
	1813	if(m_position == m_end)
	1814	{
	1815	this->fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_end));
	1816	return false;
	1817	}
	1818	BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
	1819	++m_position;
	1820	//
	1821	// restore the flags:
	1822	//
	1823	if(restore_flags)
	1824	{
	1825	// append a case change state if we need it:
	1826	if(m_has_case_change)
	1827	{
	1828	static_cast<re_case*>(
	1829	this->append_state(syntax_element_toggle_case, sizeof(re_case))
	1830	)->icase = old_flags & regbase::icase;
	1831	}
	1832	this->flags(old_flags);
	1833	}
	1834	//
	1835	// set up the jump pointer if we have one:
	1836	//
	1837	if(jump_offset)
	1838	{
	1839	this->m_pdata->m_data.align();
	1840	re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
	1841	jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
	1842	if(this->m_last_state == jmp)
	1843	{
	1844	// Oops... we didn't have anything inside the assertion:
	1845	fail(regex_constants::error_empty, m_position - m_base);
	1846	return false;
	1847	}
	1848	}
	1849	//
	1850	// verify that if this is conditional expression, that we do have
	1851	// an alternative, if not add one:
	1852	//
	1853	if(markid == -4)
	1854	{
	1855	re_syntax_base* b = this->getaddress(expected_alt_point);
	1856	if(b->type != syntax_element_alt)
	1857	{
	1858	re_alt* alt = static_cast<re_alt*>(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt)));
	1859	alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt);
	1860	}
	1861	else if(this->getaddress(static_cast<re_alt*>(b)->alt.i, b)->type == syntax_element_alt)
	1862	{
	1863	fail(regex_constants::error_bad_pattern, m_position - m_base);
	1864	return false;
	1865	}
	1866	}
	1867	//
	1868	// append closing parenthesis state:
	1869	//
	1870	pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
	1871	pb->index = markid;
	1872	this->m_paren_start = last_paren_start;
	1873	//
	1874	// restore the alternate insertion point:
	1875	//
	1876	this->m_alt_insert_point = last_alt_point;
	1877	//
	1878	// and the case change data:
	1879	//
	1880	m_has_case_change = old_case_change;
	1881	return true;
	1882	}
	1883
	1884	template <class charT, class traits>
	1885	bool basic_regex_parser<charT, traits>::add_emacs_code(bool negate)
	1886	{
	1887	//
	1888	// parses an emacs style \sx or \Sx construct.
	1889	//
	1890	if(++m_position == m_end)
	1891	{
	1892	fail(regex_constants::error_escape, m_position - m_base);
	1893	return false;
	1894	}
	1895	basic_char_set<charT, traits> char_set;
	1896	if(negate)
	1897	char_set.negate();
	1898
	1899	static const charT s_punct[5] = { 'p', 'u', 'n', 'c', 't', };
	1900
	1901	switch(*m_position)
	1902	{
	1903	case 's':
	1904	case ' ':
	1905	char_set.add_class(this->m_mask_space);
	1906	break;
	1907	case 'w':
	1908	char_set.add_class(this->m_word_mask);
	1909	break;
	1910	case '_':
	1911	char_set.add_single(digraph<charT>(charT('$')));
	1912	char_set.add_single(digraph<charT>(charT('&')));
	1913	char_set.add_single(digraph<charT>(charT('*')));
	1914	char_set.add_single(digraph<charT>(charT('+')));
	1915	char_set.add_single(digraph<charT>(charT('-')));
	1916	char_set.add_single(digraph<charT>(charT('_')));
	1917	char_set.add_single(digraph<charT>(charT('<')));
	1918	char_set.add_single(digraph<charT>(charT('>')));
	1919	break;
	1920	case '.':
	1921	char_set.add_class(this->m_traits.lookup_classname(s_punct, s_punct+5));
	1922	break;
	1923	case '(':
	1924	char_set.add_single(digraph<charT>(charT('(')));
	1925	char_set.add_single(digraph<charT>(charT('[')));
	1926	char_set.add_single(digraph<charT>(charT('{')));
	1927	break;
	1928	case ')':
	1929	char_set.add_single(digraph<charT>(charT(')')));
	1930	char_set.add_single(digraph<charT>(charT(']')));
	1931	char_set.add_single(digraph<charT>(charT('}')));
	1932	break;
	1933	case '"':
	1934	char_set.add_single(digraph<charT>(charT('"')));
	1935	char_set.add_single(digraph<charT>(charT('\'')));
	1936	char_set.add_single(digraph<charT>(charT('`')));
	1937	break;
	1938	case '\'':
	1939	char_set.add_single(digraph<charT>(charT('\'')));
	1940	char_set.add_single(digraph<charT>(charT(',')));
	1941	char_set.add_single(digraph<charT>(charT('#')));
	1942	break;
	1943	case '<':
	1944	char_set.add_single(digraph<charT>(charT(';')));
	1945	break;
	1946	case '>':
	1947	char_set.add_single(digraph<charT>(charT('\n')));
	1948	char_set.add_single(digraph<charT>(charT('\f')));
	1949	break;
	1950	default:
	1951	fail(regex_constants::error_ctype, m_position - m_base);
	1952	return false;
	1953	}
	1954	if(0 == this->append_set(char_set))
	1955	{
	1956	fail(regex_constants::error_ctype, m_position - m_base);
	1957	return false;
	1958	}
	1959	++m_position;
	1960	return true;
	1961	}
	1962
	1963	template <class charT, class traits>
	1964	regex_constants::syntax_option_type basic_regex_parser<charT, traits>::parse_options()
	1965	{
	1966	// we have a (?imsx-imsx) group, convert it into a set of flags:
	1967	regex_constants::syntax_option_type f = this->flags();
	1968	bool breakout = false;
	1969	do
	1970	{
	1971	switch(*m_position)
	1972	{
	1973	case 's':
	1974	f \|= regex_constants::mod_s;
	1975	f &= ~regex_constants::no_mod_s;
	1976	break;
	1977	case 'm':
	1978	f &= ~regex_constants::no_mod_m;
	1979	break;
	1980	case 'i':
	1981	f \|= regex_constants::icase;
	1982	break;
	1983	case 'x':
	1984	f \|= regex_constants::mod_x;
	1985	break;
	1986	default:
	1987	breakout = true;
	1988	continue;
	1989	}
	1990	if(++m_position == m_end)
	1991	{
	1992	fail(regex_constants::error_paren, m_position - m_base);
	1993	return false;
	1994	}
	1995	}
	1996	while(!breakout);
	1997
	1998	if(*m_position == static_cast<charT>('-'))
	1999	{
	2000	if(++m_position == m_end)
	2001	{
	2002	fail(regex_constants::error_paren, m_position - m_base);
	2003	return false;
	2004	}
	2005	do
	2006	{
	2007	switch(*m_position)
	2008	{
	2009	case 's':
	2010	f &= ~regex_constants::mod_s;
	2011	f \|= regex_constants::no_mod_s;
	2012	break;
	2013	case 'm':
	2014	f \|= regex_constants::no_mod_m;
	2015	break;
	2016	case 'i':
	2017	f &= ~regex_constants::icase;
	2018	break;
	2019	case 'x':
	2020	f &= ~regex_constants::mod_x;
	2021	break;
	2022	default:
	2023	breakout = true;
	2024	continue;
	2025	}
	2026	if(++m_position == m_end)
	2027	{
	2028	fail(regex_constants::error_paren, m_position - m_base);
	2029	return false;
	2030	}
	2031	}
	2032	while(!breakout);
	2033	}
	2034	return f;
	2035	}
	2036
	2037	template <class charT, class traits>
	2038	bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_start)
	2039	{
	2040	//
	2041	// If we didn't actually add any states after the last
	2042	// alternative then that's an error:
	2043	//
	2044	if((this->m_alt_insert_point == static_cast<std::ptrdiff_t>(this->m_pdata->m_data.size()))
	2045	&& m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start))
	2046	{
	2047	fail(regex_constants::error_empty, this->m_position - this->m_base);
	2048	return false;
	2049	}
	2050	//
	2051	// Fix up our alternatives:
	2052	//
	2053	while(m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start))
	2054	{
	2055	//
	2056	// fix up the jump to point to the end of the states
	2057	// that we've just added:
	2058	//
	2059	std::ptrdiff_t jump_offset = m_alt_jumps.back();
	2060	m_alt_jumps.pop_back();
	2061	this->m_pdata->m_data.align();
	2062	re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
	2063	BOOST_ASSERT(jmp->type == syntax_element_jump);
	2064	jmp->alt.i = this->m_pdata->m_data.size() - jump_offset;
	2065	}
	2066	return true;
	2067	}
	2068
	2069	#ifdef BOOST_MSVC
	2070	#pragma warning(pop)
	2071	#endif
	2072
	2073	} // namespace re_detail
	2074	} // namespace boost
	2075
	2076	#ifdef BOOST_HAS_ABI_HEADERS
	2077	# include BOOST_ABI_SUFFIX
	2078	#endif
	2079
	2080	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: NonGTP/Boost/boost/regex/v4/basic_regex_parser.hpp @ 857

Download in other formats: