Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

basic_regex_parser.hpp @ 857

Revision 857, 65.9 KB checked in by igarcia, 19 years ago (diff)

Line
1	/*
2	*
3	* Copyright (c) 2004
4	* John Maddock
5	*
6	* Use, modification and distribution are subject to the
7	* Boost Software License, Version 1.0. (See accompanying file
8	* LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9	*
10	*/
11
12	/*
13	* LOCATION: see http://www.boost.org for most recent version.
14	* FILE basic_regex_parser.cpp
15	* VERSION see <boost/version.hpp>
16	* DESCRIPTION: Declares template class basic_regex_parser.
17	*/
18
19	#ifndef BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
20	#define BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
21
22	#ifdef BOOST_HAS_ABI_HEADERS
23	# include BOOST_ABI_PREFIX
24	#endif
25
26	namespace boost{
27	namespace re_detail{
28
29	#ifdef BOOST_MSVC
30	#pragma warning(push)
31	#pragma warning(disable:4244)
32	#endif
33
34	template <class charT, class traits>
35	class basic_regex_parser : public basic_regex_creator<charT, traits>
36	{
37	public:
38	basic_regex_parser(regex_data<charT, traits>* data);
39	void parse(const charT* p1, const charT* p2, unsigned flags);
40	void fail(regex_constants::error_type error_code, std::ptrdiff_t position);
41
42	bool parse_all();
43	bool parse_basic();
44	bool parse_extended();
45	bool parse_literal();
46	bool parse_open_paren();
47	bool parse_basic_escape();
48	bool parse_extended_escape();
49	bool parse_match_any();
50	bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits<std::size_t>::max)());
51	bool parse_repeat_range(bool isbasic);
52	bool parse_alt();
53	bool parse_set();
54	bool parse_backref();
55	void parse_set_literal(basic_char_set<charT, traits>& char_set);
56	bool parse_inner_set(basic_char_set<charT, traits>& char_set);
57	bool parse_QE();
58	bool parse_perl_extension();
59	bool add_emacs_code(bool negate);
60	bool unwind_alts(std::ptrdiff_t last_paren_start);
61	digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set);
62	charT unescape_character();
63	regex_constants::syntax_option_type parse_options();
64
65	private:
66	typedef bool (basic_regex_parser::*parser_proc_type)();
67	typedef typename traits::string_type string_type;
68	typedef typename traits::char_class_type char_class_type;
69	parser_proc_type m_parser_proc; // the main parser to use
70	const charT* m_base; // the start of the string being parsed
71	const charT* m_end; // the end of the string being parsed
72	const charT* m_position; // our current parser position
73	unsigned m_mark_count; // how many sub-expressions we have
74	std::ptrdiff_t m_paren_start; // where the last seen ')' began (where repeats are inserted).
75	std::ptrdiff_t m_alt_insert_point; // where to insert the next alternative
76	bool m_has_case_change; // true if somewhere in the current block the case has changed
77	#if defined(BOOST_MSVC) && defined(_M_IX86)
78	// This is an ugly warning suppression workaround (for warnings inside std::vector
79	// that can not otherwise be suppressed)...
80	BOOST_STATIC_ASSERT(sizeof(long) >= sizeof(void*));
81	std::vector<long> m_alt_jumps; // list of alternative in the current scope.
82	#else
83	std::vector<std::ptrdiff_t> m_alt_jumps; // list of alternative in the current scope.
84	#endif
85
86	basic_regex_parser& operator=(const basic_regex_parser&);
87	basic_regex_parser(const basic_regex_parser&);
88	};
89
90	template <class charT, class traits>
91	basic_regex_parser<charT, traits>::basic_regex_parser(regex_data<charT, traits>* data)
92	: basic_regex_creator<charT, traits>(data), m_mark_count(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false)
93	{
94	}
95
96	template <class charT, class traits>
97	void basic_regex_parser<charT, traits>::parse(const charT* p1, const charT* p2, unsigned flags)
98	{
99	// pass flags on to base class:
100	this->init(flags);
101	// set up pointers:
102	m_position = m_base = p1;
103	m_end = p2;
104	// empty strings are errors:
105	if(p1 == p2)
106	{
107	fail(regex_constants::error_empty, 0);
108	return;
109	}
110	// select which parser to use:
111	switch(flags & regbase::main_option_type)
112	{
113	case regbase::perl_syntax_group:
114	m_parser_proc = &basic_regex_parser<charT, traits>::parse_extended;
115	break;
116	case regbase::basic_syntax_group:
117	m_parser_proc = &basic_regex_parser<charT, traits>::parse_basic;
118	break;
119	case regbase::literal:
120	m_parser_proc = &basic_regex_parser<charT, traits>::parse_literal;
121	break;
122	}
123
124	// parse all our characters:
125	bool result = parse_all();
126	//
127	// Unwind our alternatives:
128	//
129	unwind_alts(-1);
130	// reset flags as a global scope (?imsx) may have altered them:
131	this->flags(flags);
132	// if we haven't gobbled up all the characters then we must
133	// have had an unexpected ')' :
134	if(!result)
135	{
136	fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_position));
137	return;
138	}
139	// if an error has been set then give up now:
140	if(this->m_pdata->m_status)
141	return;
142	// fill in our sub-expression count:
143	this->m_pdata->m_mark_count = 1 + m_mark_count;
144	this->finalize(p1, p2);
145	}
146
147	template <class charT, class traits>
148	void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position)
149	{
150	if(0 == this->m_pdata->m_status) // update the error code if not already set
151	this->m_pdata->m_status = error_code;
152	m_position = m_end; // don't bother parsing anything else
153	// get the error message:
154	std::string message = this->m_pdata->m_ptraits->error_string(error_code);
155	// and raise the exception, this will do nothing if exceptions are disabled:
156	#ifndef BOOST_NO_EXCEPTIONS
157	if(0 == (this->flags() & regex_constants::no_except))
158	{
159	boost::regex_error e(message, error_code, position);
160	e.raise();
161	}
162	#else
163	(void)position; // suppress warnings.
164	#endif
165	}
166
167	template <class charT, class traits>
168	bool basic_regex_parser<charT, traits>::parse_all()
169	{
170	bool result = true;
171	while(result && (m_position != m_end))
172	{
173	result = (this->*m_parser_proc)();
174	}
175	return result;
176	}
177
178	#ifdef BOOST_MSVC
179	#pragma warning(push)
180	#pragma warning(disable:4702)
181	#endif
182	template <class charT, class traits>
183	bool basic_regex_parser<charT, traits>::parse_basic()
184	{
185	switch(this->m_traits.syntax_type(*m_position))
186	{
187	case regex_constants::syntax_escape:
188	return parse_basic_escape();
189	case regex_constants::syntax_dot:
190	return parse_match_any();
191	case regex_constants::syntax_caret:
192	++m_position;
193	this->append_state(syntax_element_start_line);
194	break;
195	case regex_constants::syntax_dollar:
196	++m_position;
197	this->append_state(syntax_element_end_line);
198	break;
199	case regex_constants::syntax_star:
200	if(!(this->m_last_state) \|\| (this->m_last_state->type == syntax_element_start_line))
201	return parse_literal();
202	else
203	{
204	++m_position;
205	return parse_repeat();
206	}
207	case regex_constants::syntax_plus:
208	if(!(this->m_last_state) \|\| (this->m_last_state->type == syntax_element_start_line) \|\| !(this->flags() & regbase::emacs_ex))
209	return parse_literal();
210	else
211	{
212	++m_position;
213	return parse_repeat(1);
214	}
215	case regex_constants::syntax_question:
216	if(!(this->m_last_state) \|\| (this->m_last_state->type == syntax_element_start_line) \|\| !(this->flags() & regbase::emacs_ex))
217	return parse_literal();
218	else
219	{
220	++m_position;
221	return parse_repeat(0, 1);
222	}
223	case regex_constants::syntax_open_set:
224	return parse_set();
225	default:
226	return parse_literal();
227	}
228	return true;
229	}
230
231	template <class charT, class traits>
232	bool basic_regex_parser<charT, traits>::parse_extended()
233	{
234	bool result = true;
235	switch(this->m_traits.syntax_type(*m_position))
236	{
237	case regex_constants::syntax_open_mark:
238	return parse_open_paren();
239	case regex_constants::syntax_close_mark:
240	return false;
241	case regex_constants::syntax_escape:
242	return parse_extended_escape();
243	case regex_constants::syntax_dot:
244	return parse_match_any();
245	case regex_constants::syntax_caret:
246	++m_position;
247	this->append_state(
248	(this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_start : syntax_element_start_line));
249	break;
250	case regex_constants::syntax_dollar:
251	++m_position;
252	this->append_state(
253	(this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_end : syntax_element_end_line));
254	break;
255	case regex_constants::syntax_star:
256	if(m_position == this->m_base)
257	{
258	fail(regex_constants::error_badrepeat, 0);
259	return false;
260	}
261	++m_position;
262	return parse_repeat();
263	case regex_constants::syntax_question:
264	if(m_position == this->m_base)
265	{
266	fail(regex_constants::error_badrepeat, 0);
267	return false;
268	}
269	++m_position;
270	return parse_repeat(0,1);
271	case regex_constants::syntax_plus:
272	if(m_position == this->m_base)
273	{
274	fail(regex_constants::error_badrepeat, 0);
275	return false;
276	}
277	++m_position;
278	return parse_repeat(1);
279	case regex_constants::syntax_open_brace:
280	++m_position;
281	return parse_repeat_range(false);
282	case regex_constants::syntax_close_brace:
283	fail(regex_constants::error_brace, this->m_position - this->m_end);
284	return false;
285	case regex_constants::syntax_or:
286	return parse_alt();
287	case regex_constants::syntax_open_set:
288	return parse_set();
289	case regex_constants::syntax_hash:
290	//
291	// If we have a mod_x flag set, then skip until
292	// we get to a newline character:
293	//
294	if((this->flags()
295	& (regbase::no_perl_ex\|regbase::mod_x))
296	== regbase::mod_x)
297	{
298	while((m_position != m_end) && !is_separator(*m_position++)){}
299	return true;
300	}
301	// Otherwise fall through:
302	default:
303	result = parse_literal();
304	break;
305	}
306	return result;
307	}
308	#ifdef BOOST_MSVC
309	#pragma warning(pop)
310	#endif
311
312	template <class charT, class traits>
313	bool basic_regex_parser<charT, traits>::parse_literal()
314	{
315	// append this as a literal provided it's not a space character
316	// or the perl option regbase::mod_x is not set:
317	if(
318	((this->flags()
319	& (regbase::main_option_type\|regbase::mod_x\|regbase::no_perl_ex))
320	!= regbase::mod_x)
321	\|\| !this->m_traits.isctype(*m_position, this->m_mask_space))
322	this->append_literal(*m_position);
323	++m_position;
324	return true;
325	}
326
327	template <class charT, class traits>
328	bool basic_regex_parser<charT, traits>::parse_open_paren()
329	{
330	//
331	// skip the '(' and error check:
332	//
333	if(++m_position == m_end)
334	{
335	fail(regex_constants::error_paren, m_position - m_base);
336	return false;
337	}
338	//
339	// begin by checking for a perl-style (?...) extension:
340	//
341	if(
342	((this->flags() & (regbase::main_option_type \| regbase::no_perl_ex)) == 0)
343	\|\| ((this->flags() & (regbase::main_option_type \| regbase::emacs_ex)) == (regbase::basic_syntax_group\|regbase::emacs_ex))
344	)
345	{
346	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
347	return parse_perl_extension();
348	}
349	//
350	// update our mark count, and append the required state:
351	//
352	unsigned markid = 0;
353	if(0 == (this->flags() & regbase::nosubs))
354	markid = ++m_mark_count;
355	re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
356	pb->index = markid;
357	std::ptrdiff_t last_paren_start = this->getoffset(pb);
358	// back up insertion point for alternations, and set new point:
359	std::ptrdiff_t last_alt_point = m_alt_insert_point;
360	this->m_pdata->m_data.align();
361	m_alt_insert_point = this->m_pdata->m_data.size();
362	//
363	// back up the current flags in case we have a nested (?imsx) group:
364	//
365	regex_constants::syntax_option_type opts = this->flags();
366	bool old_case_change = m_has_case_change;
367	m_has_case_change = false; // no changes to this scope as yet...
368	//
369	// now recursively add more states, this will terminate when we get to a
370	// matching ')' :
371	//
372	parse_all();
373	//
374	// Unwind pushed alternatives:
375	//
376	if(0 == unwind_alts(last_paren_start))
377	return false;
378	//
379	// restore flags:
380	//
381	if(m_has_case_change)
382	{
383	// the case has changed in one or more of the alternatives
384	// within the scoped (...) block: we have to add a state
385	// to reset the case sensitivity:
386	static_cast<re_case*>(
387	this->append_state(syntax_element_toggle_case, sizeof(re_case))
388	)->icase = opts & regbase::icase;
389	}
390	this->flags(opts);
391	m_has_case_change = old_case_change;
392	//
393	// we either have a ')' or we have run out of characters prematurely:
394	//
395	if(m_position == m_end)
396	{
397	this->fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_end));
398	return false;
399	}
400	BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
401	++m_position;
402	//
403	// append closing parenthesis state:
404	//
405	pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
406	pb->index = markid;
407	this->m_paren_start = last_paren_start;
408	//
409	// restore the alternate insertion point:
410	//
411	this->m_alt_insert_point = last_alt_point;
412	//
413	// allow backrefs to this mark:
414	//
415	if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT))
416	this->m_backrefs \|= 1u << (markid - 1);
417
418	return true;
419	}
420
421	template <class charT, class traits>
422	bool basic_regex_parser<charT, traits>::parse_basic_escape()
423	{
424	++m_position;
425	bool result = true;
426	switch(this->m_traits.escape_syntax_type(*m_position))
427	{
428	case regex_constants::syntax_open_mark:
429	return parse_open_paren();
430	case regex_constants::syntax_close_mark:
431	return false;
432	case regex_constants::syntax_plus:
433	if(this->flags() & regex_constants::bk_plus_qm)
434	{
435	++m_position;
436	return parse_repeat(1);
437	}
438	else
439	return parse_literal();
440	case regex_constants::syntax_question:
441	if(this->flags() & regex_constants::bk_plus_qm)
442	{
443	++m_position;
444	return parse_repeat(0, 1);
445	}
446	else
447	return parse_literal();
448	case regex_constants::syntax_open_brace:
449	if(this->flags() & regbase::no_intervals)
450	return parse_literal();
451	++m_position;
452	return parse_repeat_range(true);
453	case regex_constants::syntax_close_brace:
454	if(this->flags() & regbase::no_intervals)
455	return parse_literal();
456	fail(regex_constants::error_brace, this->m_position - this->m_base);
457	return false;
458	case regex_constants::syntax_or:
459	if(this->flags() & regbase::bk_vbar)
460	return parse_alt();
461	else
462	result = parse_literal();
463	break;
464	case regex_constants::syntax_digit:
465	return parse_backref();
466	case regex_constants::escape_type_start_buffer:
467	if(this->flags() & regbase::emacs_ex)
468	{
469	++m_position;
470	this->append_state(syntax_element_buffer_start);
471	}
472	else
473	result = parse_literal();
474	break;
475	case regex_constants::escape_type_end_buffer:
476	if(this->flags() & regbase::emacs_ex)
477	{
478	++m_position;
479	this->append_state(syntax_element_buffer_end);
480	}
481	else
482	result = parse_literal();
483	break;
484	case regex_constants::escape_type_word_assert:
485	if(this->flags() & regbase::emacs_ex)
486	{
487	++m_position;
488	this->append_state(syntax_element_word_boundary);
489	}
490	else
491	result = parse_literal();
492	break;
493	case regex_constants::escape_type_not_word_assert:
494	if(this->flags() & regbase::emacs_ex)
495	{
496	++m_position;
497	this->append_state(syntax_element_within_word);
498	}
499	else
500	result = parse_literal();
501	break;
502	case regex_constants::escape_type_left_word:
503	if(this->flags() & regbase::emacs_ex)
504	{
505	++m_position;
506	this->append_state(syntax_element_word_start);
507	}
508	else
509	result = parse_literal();
510	break;
511	case regex_constants::escape_type_right_word:
512	if(this->flags() & regbase::emacs_ex)
513	{
514	++m_position;
515	this->append_state(syntax_element_word_end);
516	}
517	else
518	result = parse_literal();
519	break;
520	default:
521	if(this->flags() & regbase::emacs_ex)
522	{
523	bool negate = true;
524	switch(*m_position)
525	{
526	case 'w':
527	negate = false;
528	// fall through:
529	case 'W':
530	{
531	basic_char_set<charT, traits> char_set;
532	if(negate)
533	char_set.negate();
534	char_set.add_class(this->m_word_mask);
535	if(0 == this->append_set(char_set))
536	{
537	fail(regex_constants::error_ctype, m_position - m_base);
538	return false;
539	}
540	++m_position;
541	return true;
542	}
543	case 's':
544	negate = false;
545	// fall through:
546	case 'S':
547	return add_emacs_code(negate);
548	case 'c':
549	case 'C':
550	// not supported yet:
551	fail(regex_constants::error_escape, m_position - m_base);
552	return false;
553	default:
554	break;
555	}
556	}
557	result = parse_literal();
558	break;
559	}
560	return result;
561	}
562
563	template <class charT, class traits>
564	bool basic_regex_parser<charT, traits>::parse_extended_escape()
565	{
566	++m_position;
567	bool negate = false; // in case this is a character class escape: \w \d etc
568	switch(this->m_traits.escape_syntax_type(*m_position))
569	{
570	case regex_constants::escape_type_not_class:
571	negate = true;
572	// fall through:
573	case regex_constants::escape_type_class:
574	{
575	typedef typename traits::char_class_type mask_type;
576	mask_type m = this->m_traits.lookup_classname(m_position, m_position+1);
577	if(m != 0)
578	{
579	basic_char_set<charT, traits> char_set;
580	if(negate)
581	char_set.negate();
582	char_set.add_class(m);
583	if(0 == this->append_set(char_set))
584	{
585	fail(regex_constants::error_ctype, m_position - m_base);
586	return false;
587	}
588	++m_position;
589	return true;
590	}
591	//
592	// not a class, just a regular unknown escape:
593	//
594	this->append_literal(unescape_character());
595	break;
596	}
597	case regex_constants::syntax_digit:
598	return parse_backref();
599	case regex_constants::escape_type_left_word:
600	++m_position;
601	this->append_state(syntax_element_word_start);
602	break;
603	case regex_constants::escape_type_right_word:
604	++m_position;
605	this->append_state(syntax_element_word_end);
606	break;
607	case regex_constants::escape_type_start_buffer:
608	++m_position;
609	this->append_state(syntax_element_buffer_start);
610	break;
611	case regex_constants::escape_type_end_buffer:
612	++m_position;
613	this->append_state(syntax_element_buffer_end);
614	break;
615	case regex_constants::escape_type_word_assert:
616	++m_position;
617	this->append_state(syntax_element_word_boundary);
618	break;
619	case regex_constants::escape_type_not_word_assert:
620	++m_position;
621	this->append_state(syntax_element_within_word);
622	break;
623	case regex_constants::escape_type_Z:
624	++m_position;
625	this->append_state(syntax_element_soft_buffer_end);
626	break;
627	case regex_constants::escape_type_Q:
628	return parse_QE();
629	case regex_constants::escape_type_C:
630	return parse_match_any();
631	case regex_constants::escape_type_X:
632	++m_position;
633	this->append_state(syntax_element_combining);
634	break;
635	case regex_constants::escape_type_G:
636	++m_position;
637	this->append_state(syntax_element_restart_continue);
638	break;
639	case regex_constants::escape_type_not_property:
640	negate = true;
641	// fall through:
642	case regex_constants::escape_type_property:
643	{
644	++m_position;
645	char_class_type m;
646	if(m_position == m_end)
647	{
648	fail(regex_constants::error_escape, m_position - m_base);
649	return false;
650	}
651	// maybe have \p{ddd}
652	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
653	{
654	const charT* base = m_position;
655	// skip forward until we find enclosing brace:
656	while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
657	++m_position;
658	if(m_position == m_end)
659	{
660	fail(regex_constants::error_escape, m_position - m_base);
661	return false;
662	}
663	m = this->m_traits.lookup_classname(++base, m_position++);
664	}
665	else
666	{
667	m = this->m_traits.lookup_classname(m_position, m_position+1);
668	++m_position;
669	}
670	if(m != 0)
671	{
672	basic_char_set<charT, traits> char_set;
673	if(negate)
674	char_set.negate();
675	char_set.add_class(m);
676	if(0 == this->append_set(char_set))
677	{
678	fail(regex_constants::error_ctype, m_position - m_base);
679	return false;
680	}
681	return true;
682	}
683	fail(regex_constants::error_ctype, m_position - m_base);
684	}
685	default:
686	this->append_literal(unescape_character());
687	break;
688	}
689	return true;
690	}
691
692	template <class charT, class traits>
693	bool basic_regex_parser<charT, traits>::parse_match_any()
694	{
695	//
696	// we have a '.' that can match any character:
697	//
698	++m_position;
699	static_cast<re_dot*>(
700	this->append_state(syntax_element_wild, sizeof(re_dot))
701	)->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s
702	? re_detail::force_not_newline
703	: this->flags() & regbase::mod_s ?
704	re_detail::force_newline : re_detail::dont_care);
705	return true;
706	}
707
708	template <class charT, class traits>
709	bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
710	{
711	bool greedy = true;
712	std::size_t insert_point;
713	//
714	// when we get to here we may have a non-greedy ? mark still to come:
715	//
716	if((m_position != m_end)
717	&& (
718	(0 == (this->flags() & (regbase::main_option_type \| regbase::no_perl_ex)))
719	\|\| ((regbase::basic_syntax_group\|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type \| regbase::emacs_ex)))
720	)
721	)
722	{
723	// OK we have a perl regex, check for a '?':
724	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
725	{
726	greedy = false;
727	++m_position;
728	}
729	}
730	if(0 == this->m_last_state)
731	{
732	fail(regex_constants::error_badrepeat, ::boost::re_detail::distance(m_base, m_position));
733	return false;
734	}
735	if(this->m_last_state->type == syntax_element_endmark)
736	{
737	// insert a repeat before the '(' matching the last ')':
738	insert_point = this->m_paren_start;
739	}
740	else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1))
741	{
742	// the last state was a literal with more than one character, split it in two:
743	re_literal* lit = static_cast<re_literal*>(this->m_last_state);
744	charT c = (static_cast<charT>(static_cast<void>(lit+1)))[lit->length - 1];
745	--(lit->length);
746	// now append new state:
747	lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
748	lit->length = 1;
749	(static_cast<charT>(static_cast<void>(lit+1)))[0] = c;
750	insert_point = this->getoffset(this->m_last_state);
751	}
752	else
753	{
754	// repeat the last state whatever it was, need to add some error checking here:
755	switch(this->m_last_state->type)
756	{
757	case syntax_element_start_line:
758	case syntax_element_end_line:
759	case syntax_element_word_boundary:
760	case syntax_element_within_word:
761	case syntax_element_word_start:
762	case syntax_element_word_end:
763	case syntax_element_buffer_start:
764	case syntax_element_buffer_end:
765	case syntax_element_alt:
766	case syntax_element_soft_buffer_end:
767	case syntax_element_restart_continue:
768	case syntax_element_jump:
769	case syntax_element_startmark:
770	// can't legally repeat any of the above:
771	fail(regex_constants::error_badrepeat, m_position - m_base);
772	return false;
773	default:
774	// do nothing...
775	break;
776	}
777	insert_point = this->getoffset(this->m_last_state);
778	}
779	//
780	// OK we now know what to repeat, so insert the repeat around it:
781	//
782	re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size));
783	rep->min = low;
784	rep->max = high;
785	rep->greedy = greedy;
786	rep->leading = false;
787	// store our repeater position for later:
788	std::ptrdiff_t rep_off = this->getoffset(rep);
789	// and append a back jump to the repeat:
790	re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump)));
791	jmp->alt.i = rep_off - this->getoffset(jmp);
792	this->m_pdata->m_data.align();
793	// now fill in the alt jump for the repeat:
794	rep = static_cast<re_repeat*>(this->getaddress(rep_off));
795	rep->alt.i = this->m_pdata->m_data.size() - rep_off;
796	return true;
797	}
798
799	template <class charT, class traits>
800	bool basic_regex_parser<charT, traits>::parse_repeat_range(bool isbasic)
801	{
802	//
803	// parse a repeat-range:
804	//
805	std::size_t min, max;
806	int v;
807	// skip whitespace:
808	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
809	++m_position;
810	// fail if at end:
811	if(this->m_position == this->m_end)
812	{
813	fail(regex_constants::error_brace, this->m_position - this->m_base);
814	return false;
815	}
816	// get min:
817	v = this->m_traits.toi(m_position, m_end, 10);
818	// skip whitespace:
819	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
820	++m_position;
821	if(v < 0)
822	{
823	fail(regex_constants::error_badbrace, this->m_position - this->m_base);
824	return false;
825	}
826	else if(this->m_position == this->m_end)
827	{
828	fail(regex_constants::error_brace, this->m_position - this->m_base);
829	return false;
830	}
831	min = v;
832	// see if we have a comma:
833	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_comma)
834	{
835	// move on and error check:
836	++m_position;
837	// skip whitespace:
838	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
839	++m_position;
840	if(this->m_position == this->m_end)
841	{
842	fail(regex_constants::error_brace, this->m_position - this->m_base);
843	return false;
844	}
845	// get the value if any:
846	v = this->m_traits.toi(m_position, m_end, 10);
847	max = (v >= 0) ? v : (std::numeric_limits<std::size_t>::max)();
848	}
849	else
850	{
851	// no comma, max = min:
852	max = min;
853	}
854	// skip whitespace:
855	while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
856	++m_position;
857	// OK now check trailing }:
858	if(this->m_position == this->m_end)
859	{
860	fail(regex_constants::error_brace, this->m_position - this->m_base);
861	return false;
862	}
863	if(isbasic)
864	{
865	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_escape)
866	{
867	++m_position;
868	if(this->m_position == this->m_end)
869	{
870	fail(regex_constants::error_brace, this->m_position - this->m_base);
871	return false;
872	}
873	}
874	else
875	{
876	fail(regex_constants::error_badbrace, this->m_position - this->m_base);
877	return false;
878	}
879	}
880	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_brace)
881	++m_position;
882	else
883	{
884	fail(regex_constants::error_badbrace, this->m_position - this->m_base);
885	return false;
886	}
887	//
888	// finally go and add the repeat, unless error:
889	//
890	if(min > max)
891	{
892	fail(regex_constants::error_range, this->m_position - this->m_base);
893	return false;
894	}
895	return parse_repeat(min, max);
896	}
897
898	template <class charT, class traits>
899	bool basic_regex_parser<charT, traits>::parse_alt()
900	{
901	//
902	// error check: if there have been no previous states,
903	// or if the last state was a '(' then error:
904	//
905	if((this->m_last_state == 0) \|\| (this->m_last_state->type == syntax_element_startmark))
906	{
907	fail(regex_constants::error_empty, this->m_position - this->m_base);
908	return false;
909	}
910	++m_position;
911	//
912	// we need to append a trailing jump:
913	//
914	re_syntax_base* pj = this->append_state(re_detail::syntax_element_jump, sizeof(re_jump));
915	std::ptrdiff_t jump_offset = this->getoffset(pj);
916	//
917	// now insert the alternative:
918	//
919	re_alt* palt = static_cast<re_alt*>(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size));
920	jump_offset += re_alt_size;
921	this->m_pdata->m_data.align();
922	palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt);
923	//
924	// update m_alt_insert_point so that the next alternate gets
925	// inserted at the start of the second of the two we've just created:
926	//
927	this->m_alt_insert_point = this->m_pdata->m_data.size();
928	//
929	// the start of this alternative must have a case changes state
930	// if the current block has messed around with case changes:
931	//
932	if(m_has_case_change)
933	{
934	static_cast<re_case*>(
935	this->append_state(syntax_element_toggle_case, sizeof(re_case))
936	)->icase = this->m_icase;
937	}
938	//
939	// push the alternative onto our stack, a recursive
940	// implementation here is easier to understand (and faster
941	// as it happens), but causes all kinds of stack overflow problems
942	// on programs with small stacks (COM+).
943	//
944	m_alt_jumps.push_back(jump_offset);
945	return true;
946	}
947
948	template <class charT, class traits>
949	bool basic_regex_parser<charT, traits>::parse_set()
950	{
951	++m_position;
952	if(m_position == m_end)
953	{
954	fail(regex_constants::error_brack, m_position - m_base);
955	return false;
956	}
957	basic_char_set<charT, traits> char_set;
958
959	const charT* base = m_position; // where the '[' was
960	const charT* item_base = m_position; // where the '[' or '^' was
961
962	while(m_position != m_end)
963	{
964	switch(this->m_traits.syntax_type(*m_position))
965	{
966	case regex_constants::syntax_caret:
967	if(m_position == base)
968	{
969	char_set.negate();
970	++m_position;
971	item_base = m_position;
972	}
973	else
974	parse_set_literal(char_set);
975	break;
976	case regex_constants::syntax_close_set:
977	if(m_position == item_base)
978	{
979	parse_set_literal(char_set);
980	break;
981	}
982	else
983	{
984	++m_position;
985	if(0 == this->append_set(char_set))
986	{
987	fail(regex_constants::error_range, m_position - m_base);
988	return false;
989	}
990	}
991	return true;
992	case regex_constants::syntax_open_set:
993	if(parse_inner_set(char_set))
994	break;
995	return true;
996	case regex_constants::syntax_escape:
997	{
998	//
999	// look ahead and see if this is a character class shortcut
1000	// \d \w \s etc...
1001	//
1002	++m_position;
1003	if(this->m_traits.escape_syntax_type(*m_position)
1004	== regex_constants::escape_type_class)
1005	{
1006	char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1007	if(m != 0)
1008	{
1009	char_set.add_class(m);
1010	++m_position;
1011	break;
1012	}
1013	}
1014	else if(this->m_traits.escape_syntax_type(*m_position)
1015	== regex_constants::escape_type_not_class)
1016	{
1017	// negated character class:
1018	char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
1019	if(m != 0)
1020	{
1021	char_set.add_negated_class(m);
1022	++m_position;
1023	break;
1024	}
1025	}
1026	// not a character class, just a regular escape:
1027	--m_position;
1028	parse_set_literal(char_set);
1029	break;
1030	}
1031	default:
1032	parse_set_literal(char_set);
1033	break;
1034	}
1035	}
1036	return m_position != m_end;
1037	}
1038
1039	template <class charT, class traits>
1040	bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, traits>& char_set)
1041	{
1042	//
1043	// we have either a character class [:name:]
1044	// a collating element [.name.]
1045	// or an equivalence class [=name=]
1046	//
1047	if(m_end == ++m_position)
1048	{
1049	fail(regex_constants::error_brack, m_position - m_base);
1050	return false;
1051	}
1052	switch(this->m_traits.syntax_type(*m_position))
1053	{
1054	case regex_constants::syntax_dot:
1055	//
1056	// a collating element is treated as a literal:
1057	//
1058	--m_position;
1059	parse_set_literal(char_set);
1060	return true;
1061	case regex_constants::syntax_colon:
1062	{
1063	// check that character classes are actually enabled:
1064	if((this->flags() & (regbase::main_option_type \| regbase::no_char_classes))
1065	== (regbase::basic_syntax_group \| regbase::no_char_classes))
1066	{
1067	--m_position;
1068	parse_set_literal(char_set);
1069	return true;
1070	}
1071	// skip the ':'
1072	if(m_end == ++m_position)
1073	{
1074	fail(regex_constants::error_brack, m_position - m_base);
1075	return false;
1076	}
1077	const charT* name_first = m_position;
1078	// skip at least one character, then find the matching ':]'
1079	if(m_end == ++m_position)
1080	{
1081	fail(regex_constants::error_brack, m_position - m_base);
1082	return false;
1083	}
1084	while((m_position != m_end)
1085	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_colon))
1086	++m_position;
1087	const charT* name_last = m_position;
1088	if(m_end == m_position)
1089	{
1090	fail(regex_constants::error_brack, m_position - m_base);
1091	return false;
1092	}
1093	if((m_end == ++m_position)
1094	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1095	{
1096	fail(regex_constants::error_brack, m_position - m_base);
1097	return false;
1098	}
1099	//
1100	// check for negated class:
1101	//
1102	bool negated = false;
1103	if(this->m_traits.syntax_type(*name_first) == regex_constants::syntax_caret)
1104	{
1105	++name_first;
1106	negated = true;
1107	}
1108	typedef typename traits::char_class_type mask_type;
1109	mask_type m = this->m_traits.lookup_classname(name_first, name_last);
1110	if(m == 0)
1111	{
1112	if(char_set.empty() && (name_last - name_first == 1))
1113	{
1114	// maybe a special case:
1115	++m_position;
1116	if( (m_position != m_end)
1117	&& (this->m_traits.syntax_type(*m_position)
1118	== regex_constants::syntax_close_set))
1119	{
1120	if(this->m_traits.escape_syntax_type(*name_first)
1121	== regex_constants::escape_type_left_word)
1122	{
1123	++m_position;
1124	this->append_state(syntax_element_word_start);
1125	return false;
1126	}
1127	if(this->m_traits.escape_syntax_type(*name_first)
1128	== regex_constants::escape_type_right_word)
1129	{
1130	++m_position;
1131	this->append_state(syntax_element_word_end);
1132	return false;
1133	}
1134	}
1135	}
1136	fail(regex_constants::error_ctype, name_first - m_base);
1137	return false;
1138	}
1139	if(negated == false)
1140	char_set.add_class(m);
1141	else
1142	char_set.add_negated_class(m);
1143	++m_position;
1144	break;
1145	}
1146	case regex_constants::syntax_equal:
1147	{
1148	// skip the '='
1149	if(m_end == ++m_position)
1150	{
1151	fail(regex_constants::error_brack, m_position - m_base);
1152	return false;
1153	}
1154	const charT* name_first = m_position;
1155	// skip at least one character, then find the matching '=]'
1156	if(m_end == ++m_position)
1157	{
1158	fail(regex_constants::error_brack, m_position - m_base);
1159	return false;
1160	}
1161	while((m_position != m_end)
1162	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal))
1163	++m_position;
1164	const charT* name_last = m_position;
1165	if(m_end == m_position)
1166	{
1167	fail(regex_constants::error_brack, m_position - m_base);
1168	return false;
1169	}
1170	if((m_end == ++m_position)
1171	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1172	{
1173	fail(regex_constants::error_brack, m_position - m_base);
1174	return false;
1175	}
1176	string_type m = this->m_traits.lookup_collatename(name_first, name_last);
1177	if((0 == m.size()) \|\| (m.size() > 2))
1178	{
1179	fail(regex_constants::error_collate, name_first - m_base);
1180	return false;
1181	}
1182	digraph<charT> d;
1183	d.first = m[0];
1184	if(m.size() > 1)
1185	d.second = m[1];
1186	else
1187	d.second = 0;
1188	char_set.add_equivalent(d);
1189	++m_position;
1190	break;
1191	}
1192	default:
1193	--m_position;
1194	parse_set_literal(char_set);
1195	break;
1196	}
1197	return true;
1198	}
1199
1200	template <class charT, class traits>
1201	void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set)
1202	{
1203	digraph<charT> start_range(get_next_set_literal(char_set));
1204	if(m_end == m_position)
1205	{
1206	fail(regex_constants::error_brack, m_position - m_base);
1207	return;
1208	}
1209	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1210	{
1211	// we have a range:
1212	if(m_end == ++m_position)
1213	{
1214	fail(regex_constants::error_brack, m_position - m_base);
1215	return;
1216	}
1217	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)
1218	{
1219	digraph<charT> end_range = get_next_set_literal(char_set);
1220	char_set.add_range(start_range, end_range);
1221	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
1222	{
1223	if(m_end == ++m_position)
1224	{
1225	fail(regex_constants::error_brack, m_position - m_base);
1226	return;
1227	}
1228	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_set)
1229	{
1230	// trailing - :
1231	--m_position;
1232	return;
1233	}
1234	fail(regex_constants::error_range, m_position - m_base);
1235	return;
1236	}
1237	return;
1238	}
1239	--m_position;
1240	}
1241	char_set.add_single(start_range);
1242	}
1243
1244	template <class charT, class traits>
1245	digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set)
1246	{
1247	typedef typename traits::string_type string_type;
1248	digraph<charT> result;
1249	switch(this->m_traits.syntax_type(*m_position))
1250	{
1251	case regex_constants::syntax_dash:
1252	if(!char_set.empty())
1253	{
1254	// see if we are at the end of the set:
1255	if((++m_position == m_end) \|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1256	{
1257	fail(regex_constants::error_range, m_position - m_base);
1258	return result;
1259	}
1260	--m_position;
1261	}
1262	result.first = *m_position++;
1263	return result;
1264	case regex_constants::syntax_escape:
1265	// check to see if escapes are supported first:
1266	if(this->flags() & regex_constants::no_escape_in_lists)
1267	{
1268	result = *m_position++;
1269	break;
1270	}
1271	++m_position;
1272	result = unescape_character();
1273	break;
1274	case regex_constants::syntax_open_set:
1275	{
1276	if(m_end == ++m_position)
1277	{
1278	fail(regex_constants::error_collate, m_position - m_base);
1279	return result;
1280	}
1281	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)
1282	{
1283	--m_position;
1284	result.first = *m_position;
1285	++m_position;
1286	return result;
1287	}
1288	if(m_end == ++m_position)
1289	{
1290	fail(regex_constants::error_collate, m_position - m_base);
1291	return result;
1292	}
1293	const charT* name_first = m_position;
1294	// skip at least one character, then find the matching ':]'
1295	if(m_end == ++m_position)
1296	{
1297	fail(regex_constants::error_collate, name_first - m_base);
1298	return result;
1299	}
1300	while((m_position != m_end)
1301	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot))
1302	++m_position;
1303	const charT* name_last = m_position;
1304	if(m_end == m_position)
1305	{
1306	fail(regex_constants::error_collate, name_first - m_base);
1307	return result;
1308	}
1309	if((m_end == ++m_position)
1310	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
1311	{
1312	fail(regex_constants::error_collate, name_first - m_base);
1313	return result;
1314	}
1315	++m_position;
1316	string_type s = this->m_traits.lookup_collatename(name_first, name_last);
1317	if(s.empty() \|\| (s.size() > 2))
1318	{
1319	fail(regex_constants::error_collate, name_first - m_base);
1320	return result;
1321	}
1322	result.first = s[0];
1323	if(s.size() > 1)
1324	result.second = s[1];
1325	else
1326	result.second = 0;
1327	return result;
1328	}
1329	default:
1330	result = *m_position++;
1331	}
1332	return result;
1333	}
1334
1335	//
1336	// does a value fit in the specified charT type?
1337	//
1338	template <class charT>
1339	bool valid_value(charT, int v, const mpl::true_&)
1340	{
1341	return (v >> (sizeof(charT) * CHAR_BIT)) == 0;
1342	}
1343	template <class charT>
1344	bool valid_value(charT, int, const mpl::false_&)
1345	{
1346	return true; // v will alsways fit in a charT
1347	}
1348	template <class charT>
1349	bool valid_value(charT c, int v)
1350	{
1351	return valid_value(c, v, mpl::bool_<(sizeof(charT) < sizeof(int))>());
1352	}
1353
1354	template <class charT, class traits>
1355	charT basic_regex_parser<charT, traits>::unescape_character()
1356	{
1357	#ifdef BOOST_MSVC
1358	#pragma warning(push)
1359	#pragma warning(disable:4127)
1360	#endif
1361	charT result(0);
1362	if(m_position == m_end)
1363	{
1364	fail(regex_constants::error_escape, m_position - m_base);
1365	return false;
1366	}
1367	switch(this->m_traits.escape_syntax_type(*m_position))
1368	{
1369	case regex_constants::escape_type_control_a:
1370	result = charT('\a');
1371	break;
1372	case regex_constants::escape_type_e:
1373	result = charT(27);
1374	break;
1375	case regex_constants::escape_type_control_f:
1376	result = charT('\f');
1377	break;
1378	case regex_constants::escape_type_control_n:
1379	result = charT('\n');
1380	break;
1381	case regex_constants::escape_type_control_r:
1382	result = charT('\r');
1383	break;
1384	case regex_constants::escape_type_control_t:
1385	result = charT('\t');
1386	break;
1387	case regex_constants::escape_type_control_v:
1388	result = charT('\v');
1389	break;
1390	case regex_constants::escape_type_word_assert:
1391	result = charT('\b');
1392	break;
1393	case regex_constants::escape_type_ascii_control:
1394	++m_position;
1395	if(m_position == m_end)
1396	{
1397	fail(regex_constants::error_escape, m_position - m_base);
1398	return result;
1399	}
1400	/*
1401	if((*m_position < charT('@'))
1402	\|\| (*m_position > charT(125)) )
1403	{
1404	fail(regex_constants::error_escape, m_position - m_base);
1405	return result;
1406	}
1407	*/
1408	result = static_cast<charT>(*m_position % 32);
1409	break;
1410	case regex_constants::escape_type_hex:
1411	++m_position;
1412	if(m_position == m_end)
1413	{
1414	fail(regex_constants::error_escape, m_position - m_base);
1415	return result;
1416	}
1417	// maybe have \x{ddd}
1418	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1419	{
1420	++m_position;
1421	if(m_position == m_end)
1422	{
1423	fail(regex_constants::error_escape, m_position - m_base);
1424	return result;
1425	}
1426	int i = this->m_traits.toi(m_position, m_end, 16);
1427	if((m_position == m_end)
1428	\|\| (i < 0)
1429	\|\| ((std::numeric_limits<charT>::is_specialized) && (charT(i) > (std::numeric_limits<charT>::max)()))
1430	\|\| (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1431	{
1432	fail(regex_constants::error_badbrace, m_position - m_base);
1433	return result;
1434	}
1435	++m_position;
1436	result = charT(i);
1437	}
1438	else
1439	{
1440	std::ptrdiff_t len = (std::min)(static_cast<std::ptrdiff_t>(2), m_end - m_position);
1441	int i = this->m_traits.toi(m_position, m_position + len, 16);
1442	if((i < 0)
1443	\|\| !valid_value(charT(0), i))
1444	{
1445	fail(regex_constants::error_escape, m_position - m_base);
1446	return result;
1447	}
1448	result = charT(i);
1449	}
1450	return result;
1451	case regex_constants::syntax_digit:
1452	{
1453	// an octal escape sequence, the first character must be a zero
1454	// followed by up to 3 octal digits:
1455	std::ptrdiff_t len = (std::min)(::boost::re_detail::distance(m_position, m_end), static_cast<std::ptrdiff_t>(4));
1456	const charT* bp = m_position;
1457	int val = this->m_traits.toi(bp, bp + 1, 8);
1458	if(val != 0)
1459	{
1460	// Oops not an octal escape after all:
1461	fail(regex_constants::error_escape, m_position - m_base);
1462	return result;
1463	}
1464	val = this->m_traits.toi(m_position, m_position + len, 8);
1465	if(val < 0)
1466	{
1467	fail(regex_constants::error_escape, m_position - m_base);
1468	return result;
1469	}
1470	return static_cast<charT>(val);
1471	}
1472	case regex_constants::escape_type_named_char:
1473	{
1474	++m_position;
1475	if(m_position == m_end)
1476	{
1477	fail(regex_constants::error_escape, m_position - m_base);
1478	return false;
1479	}
1480	// maybe have \N{name}
1481	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
1482	{
1483	const charT* base = m_position;
1484	// skip forward until we find enclosing brace:
1485	while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
1486	++m_position;
1487	if(m_position == m_end)
1488	{
1489	fail(regex_constants::error_escape, m_position - m_base);
1490	return false;
1491	}
1492	string_type s = this->m_traits.lookup_collatename(++base, m_position++);
1493	if(s.empty())
1494	{
1495	fail(regex_constants::error_collate, m_position - m_base);
1496	return false;
1497	}
1498	if(s.size() == 1)
1499	{
1500	return s[0];
1501	}
1502	}
1503	// fall through is a failure:
1504	fail(regex_constants::error_escape, m_position - m_base);
1505	return false;
1506	}
1507	default:
1508	result = *m_position;
1509	break;
1510	}
1511	++m_position;
1512	return result;
1513	#ifdef BOOST_MSVC
1514	#pragma warning(pop)
1515	#endif
1516	}
1517
1518	template <class charT, class traits>
1519	bool basic_regex_parser<charT, traits>::parse_backref()
1520	{
1521	BOOST_ASSERT(m_position != m_end);
1522	const charT* pc = m_position;
1523	int i = this->m_traits.toi(pc, pc + 1, 10);
1524	if((i == 0) \|\| (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs)))
1525	{
1526	// not a backref at all but an octal escape sequence:
1527	charT c = unescape_character();
1528	this->append_literal(c);
1529	}
1530	else if((i > 0) && (this->m_backrefs & (1u << (i-1))))
1531	{
1532	m_position = pc;
1533	re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
1534	pb->index = i;
1535	}
1536	else
1537	{
1538	fail(regex_constants::error_backref, m_position - m_end);
1539	return false;
1540	}
1541	return true;
1542	}
1543
1544	template <class charT, class traits>
1545	bool basic_regex_parser<charT, traits>::parse_QE()
1546	{
1547	#ifdef BOOST_MSVC
1548	#pragma warning(push)
1549	#pragma warning(disable:4127)
1550	#endif
1551	//
1552	// parse a \Q...\E sequence:
1553	//
1554	++m_position; // skip the Q
1555	const charT* start = m_position;
1556	const charT* end;
1557	do
1558	{
1559	while((m_position != m_end)
1560	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape))
1561	++m_position;
1562	if(m_position == m_end)
1563	{
1564	// a \Q...\E sequence may terminate with the end of the expression:
1565	end = m_position;
1566	break;
1567	}
1568	if(++m_position == m_end) // skip the escape
1569	{
1570	fail(regex_constants::error_escape, m_position - m_base);
1571	return false;
1572	}
1573	// check to see if it's a \E:
1574	if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E)
1575	{
1576	++m_position;
1577	end = m_position - 2;
1578	break;
1579	}
1580	// otherwise go round again:
1581	}while(true);
1582	//
1583	// now add all the character between the two escapes as literals:
1584	//
1585	while(start != end)
1586	{
1587	this->append_literal(*start);
1588	++start;
1589	}
1590	return true;
1591	#ifdef BOOST_MSVC
1592	#pragma warning(pop)
1593	#endif
1594	}
1595
1596	template <class charT, class traits>
1597	bool basic_regex_parser<charT, traits>::parse_perl_extension()
1598	{
1599	if(++m_position == m_end)
1600	{
1601	fail(regex_constants::error_badrepeat, m_position - m_base);
1602	return false;
1603	}
1604	//
1605	// treat comments as a special case, as these
1606	// are the only ones that don't start with a leading
1607	// startmark state:
1608	//
1609	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash)
1610	{
1611	while((m_position != m_end)
1612	&& (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark))
1613	{}
1614	return true;
1615	}
1616	//
1617	// backup some state, and prepare the way:
1618	//
1619	int markid = 0;
1620	std::ptrdiff_t jump_offset = 0;
1621	re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
1622	std::ptrdiff_t last_paren_start = this->getoffset(pb);
1623	// back up insertion point for alternations, and set new point:
1624	std::ptrdiff_t last_alt_point = m_alt_insert_point;
1625	this->m_pdata->m_data.align();
1626	m_alt_insert_point = this->m_pdata->m_data.size();
1627	std::ptrdiff_t expected_alt_point = m_alt_insert_point;
1628	bool restore_flags = true;
1629	regex_constants::syntax_option_type old_flags = this->flags();
1630	bool old_case_change = m_has_case_change;
1631	m_has_case_change = false;
1632	//
1633	// select the actual extension used:
1634	//
1635	switch(this->m_traits.syntax_type(*m_position))
1636	{
1637	case regex_constants::syntax_colon:
1638	//
1639	// a non-capturing mark:
1640	//
1641	pb->index = markid = 0;
1642	++m_position;
1643	break;
1644	case regex_constants::syntax_equal:
1645	pb->index = markid = -1;
1646	++m_position;
1647	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
1648	this->m_pdata->m_data.align();
1649	m_alt_insert_point = this->m_pdata->m_data.size();
1650	break;
1651	case regex_constants::syntax_not:
1652	pb->index = markid = -2;
1653	++m_position;
1654	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
1655	this->m_pdata->m_data.align();
1656	m_alt_insert_point = this->m_pdata->m_data.size();
1657	break;
1658	case regex_constants::escape_type_left_word:
1659	{
1660	// a lookbehind assertion:
1661	if(++m_position == m_end)
1662	{
1663	fail(regex_constants::error_badrepeat, m_position - m_base);
1664	return false;
1665	}
1666	regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position);
1667	if(t == regex_constants::syntax_not)
1668	pb->index = markid = -2;
1669	else if(t == regex_constants::syntax_equal)
1670	pb->index = markid = -1;
1671	else
1672	{
1673	fail(regex_constants::error_badrepeat, m_position - m_base);
1674	return false;
1675	}
1676	++m_position;
1677	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
1678	this->append_state(syntax_element_backstep, sizeof(re_brace));
1679	this->m_pdata->m_data.align();
1680	m_alt_insert_point = this->m_pdata->m_data.size();
1681	break;
1682	}
1683	case regex_constants::escape_type_right_word:
1684	//
1685	// an independent sub-expression:
1686	//
1687	pb->index = markid = -3;
1688	++m_position;
1689	jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
1690	this->m_pdata->m_data.align();
1691	m_alt_insert_point = this->m_pdata->m_data.size();
1692	break;
1693	case regex_constants::syntax_open_mark:
1694	{
1695	// a conditional expression:
1696	pb->index = markid = -4;
1697	if(++m_position == m_end)
1698	{
1699	fail(regex_constants::error_badrepeat, m_position - m_base);
1700	return false;
1701	}
1702	int v = this->m_traits.toi(m_position, m_end, 10);
1703	if(v > 0)
1704	{
1705	re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
1706	br->index = v;
1707	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
1708	{
1709	fail(regex_constants::error_badrepeat, m_position - m_base);
1710	return false;
1711	}
1712	if(++m_position == m_end)
1713	{
1714	fail(regex_constants::error_badrepeat, m_position - m_base);
1715	return false;
1716	}
1717	}
1718	else
1719	{
1720	// verify that we have a lookahead or lookbehind assert:
1721	if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question)
1722	{
1723	fail(regex_constants::error_badrepeat, m_position - m_base);
1724	return false;
1725	}
1726	if(++m_position == m_end)
1727	{
1728	fail(regex_constants::error_badrepeat, m_position - m_base);
1729	return false;
1730	}
1731	if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word)
1732	{
1733	if(++m_position == m_end)
1734	{
1735	fail(regex_constants::error_badrepeat, m_position - m_base);
1736	return false;
1737	}
1738	if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
1739	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
1740	{
1741	fail(regex_constants::error_badrepeat, m_position - m_base);
1742	return false;
1743	}
1744	m_position -= 3;
1745	}
1746	else
1747	{
1748	if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
1749	&& (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
1750	{
1751	fail(regex_constants::error_badrepeat, m_position - m_base);
1752	return false;
1753	}
1754	m_position -= 2;
1755	}
1756	}
1757	break;
1758	}
1759	case regex_constants::syntax_close_mark:
1760	fail(regex_constants::error_badrepeat, m_position - m_base);
1761	return false;
1762	default:
1763	//
1764	// lets assume that we have a (?imsx) group and try and parse it:
1765	//
1766	regex_constants::syntax_option_type opts = parse_options();
1767	if(m_position == m_end)
1768	return false;
1769	// make a note of whether we have a case change:
1770	m_has_case_change = ((opts & regbase::icase) != (this->flags() & regbase::icase));
1771	pb->index = markid = 0;
1772	if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark)
1773	{
1774	// update flags and carry on as normal:
1775	this->flags(opts);
1776	restore_flags = false;
1777	old_case_change \|= m_has_case_change; // defer end of scope by one ')'
1778	}
1779	else if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_colon)
1780	{
1781	// update flags and carry on until the matching ')' is found:
1782	this->flags(opts);
1783	++m_position;
1784	}
1785	else
1786	{
1787	fail(regex_constants::error_badrepeat, m_position - m_base);
1788	return false;
1789	}
1790
1791	// finally append a case change state if we need it:
1792	if(m_has_case_change)
1793	{
1794	static_cast<re_case*>(
1795	this->append_state(syntax_element_toggle_case, sizeof(re_case))
1796	)->icase = opts & regbase::icase;
1797	}
1798
1799	}
1800	//
1801	// now recursively add more states, this will terminate when we get to a
1802	// matching ')' :
1803	//
1804	parse_all();
1805	//
1806	// Unwind alternatives:
1807	//
1808	if(0 == unwind_alts(last_paren_start))
1809	return false;
1810	//
1811	// we either have a ')' or we have run out of characters prematurely:
1812	//
1813	if(m_position == m_end)
1814	{
1815	this->fail(regex_constants::error_paren, ::boost::re_detail::distance(m_base, m_end));
1816	return false;
1817	}
1818	BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
1819	++m_position;
1820	//
1821	// restore the flags:
1822	//
1823	if(restore_flags)
1824	{
1825	// append a case change state if we need it:
1826	if(m_has_case_change)
1827	{
1828	static_cast<re_case*>(
1829	this->append_state(syntax_element_toggle_case, sizeof(re_case))
1830	)->icase = old_flags & regbase::icase;
1831	}
1832	this->flags(old_flags);
1833	}
1834	//
1835	// set up the jump pointer if we have one:
1836	//
1837	if(jump_offset)
1838	{
1839	this->m_pdata->m_data.align();
1840	re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
1841	jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
1842	if(this->m_last_state == jmp)
1843	{
1844	// Oops... we didn't have anything inside the assertion:
1845	fail(regex_constants::error_empty, m_position - m_base);
1846	return false;
1847	}
1848	}
1849	//
1850	// verify that if this is conditional expression, that we do have
1851	// an alternative, if not add one:
1852	//
1853	if(markid == -4)
1854	{
1855	re_syntax_base* b = this->getaddress(expected_alt_point);
1856	if(b->type != syntax_element_alt)
1857	{
1858	re_alt* alt = static_cast<re_alt*>(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt)));
1859	alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt);
1860	}
1861	else if(this->getaddress(static_cast<re_alt*>(b)->alt.i, b)->type == syntax_element_alt)
1862	{
1863	fail(regex_constants::error_bad_pattern, m_position - m_base);
1864	return false;
1865	}
1866	}
1867	//
1868	// append closing parenthesis state:
1869	//
1870	pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
1871	pb->index = markid;
1872	this->m_paren_start = last_paren_start;
1873	//
1874	// restore the alternate insertion point:
1875	//
1876	this->m_alt_insert_point = last_alt_point;
1877	//
1878	// and the case change data:
1879	//
1880	m_has_case_change = old_case_change;
1881	return true;
1882	}
1883
1884	template <class charT, class traits>
1885	bool basic_regex_parser<charT, traits>::add_emacs_code(bool negate)
1886	{
1887	//
1888	// parses an emacs style \sx or \Sx construct.
1889	//
1890	if(++m_position == m_end)
1891	{
1892	fail(regex_constants::error_escape, m_position - m_base);
1893	return false;
1894	}
1895	basic_char_set<charT, traits> char_set;
1896	if(negate)
1897	char_set.negate();
1898
1899	static const charT s_punct[5] = { 'p', 'u', 'n', 'c', 't', };
1900
1901	switch(*m_position)
1902	{
1903	case 's':
1904	case ' ':
1905	char_set.add_class(this->m_mask_space);
1906	break;
1907	case 'w':
1908	char_set.add_class(this->m_word_mask);
1909	break;
1910	case '_':
1911	char_set.add_single(digraph<charT>(charT('$')));
1912	char_set.add_single(digraph<charT>(charT('&')));
1913	char_set.add_single(digraph<charT>(charT('*')));
1914	char_set.add_single(digraph<charT>(charT('+')));
1915	char_set.add_single(digraph<charT>(charT('-')));
1916	char_set.add_single(digraph<charT>(charT('_')));
1917	char_set.add_single(digraph<charT>(charT('<')));
1918	char_set.add_single(digraph<charT>(charT('>')));
1919	break;
1920	case '.':
1921	char_set.add_class(this->m_traits.lookup_classname(s_punct, s_punct+5));
1922	break;
1923	case '(':
1924	char_set.add_single(digraph<charT>(charT('(')));
1925	char_set.add_single(digraph<charT>(charT('[')));
1926	char_set.add_single(digraph<charT>(charT('{')));
1927	break;
1928	case ')':
1929	char_set.add_single(digraph<charT>(charT(')')));
1930	char_set.add_single(digraph<charT>(charT(']')));
1931	char_set.add_single(digraph<charT>(charT('}')));
1932	break;
1933	case '"':
1934	char_set.add_single(digraph<charT>(charT('"')));
1935	char_set.add_single(digraph<charT>(charT('\'')));
1936	char_set.add_single(digraph<charT>(charT('`')));
1937	break;
1938	case '\'':
1939	char_set.add_single(digraph<charT>(charT('\'')));
1940	char_set.add_single(digraph<charT>(charT(',')));
1941	char_set.add_single(digraph<charT>(charT('#')));
1942	break;
1943	case '<':
1944	char_set.add_single(digraph<charT>(charT(';')));
1945	break;
1946	case '>':
1947	char_set.add_single(digraph<charT>(charT('\n')));
1948	char_set.add_single(digraph<charT>(charT('\f')));
1949	break;
1950	default:
1951	fail(regex_constants::error_ctype, m_position - m_base);
1952	return false;
1953	}
1954	if(0 == this->append_set(char_set))
1955	{
1956	fail(regex_constants::error_ctype, m_position - m_base);
1957	return false;
1958	}
1959	++m_position;
1960	return true;
1961	}
1962
1963	template <class charT, class traits>
1964	regex_constants::syntax_option_type basic_regex_parser<charT, traits>::parse_options()
1965	{
1966	// we have a (?imsx-imsx) group, convert it into a set of flags:
1967	regex_constants::syntax_option_type f = this->flags();
1968	bool breakout = false;
1969	do
1970	{
1971	switch(*m_position)
1972	{
1973	case 's':
1974	f \|= regex_constants::mod_s;
1975	f &= ~regex_constants::no_mod_s;
1976	break;
1977	case 'm':
1978	f &= ~regex_constants::no_mod_m;
1979	break;
1980	case 'i':
1981	f \|= regex_constants::icase;
1982	break;
1983	case 'x':
1984	f \|= regex_constants::mod_x;
1985	break;
1986	default:
1987	breakout = true;
1988	continue;
1989	}
1990	if(++m_position == m_end)
1991	{
1992	fail(regex_constants::error_paren, m_position - m_base);
1993	return false;
1994	}
1995	}
1996	while(!breakout);
1997
1998	if(*m_position == static_cast<charT>('-'))
1999	{
2000	if(++m_position == m_end)
2001	{
2002	fail(regex_constants::error_paren, m_position - m_base);
2003	return false;
2004	}
2005	do
2006	{
2007	switch(*m_position)
2008	{
2009	case 's':
2010	f &= ~regex_constants::mod_s;
2011	f \|= regex_constants::no_mod_s;
2012	break;
2013	case 'm':
2014	f \|= regex_constants::no_mod_m;
2015	break;
2016	case 'i':
2017	f &= ~regex_constants::icase;
2018	break;
2019	case 'x':
2020	f &= ~regex_constants::mod_x;
2021	break;
2022	default:
2023	breakout = true;
2024	continue;
2025	}
2026	if(++m_position == m_end)
2027	{
2028	fail(regex_constants::error_paren, m_position - m_base);
2029	return false;
2030	}
2031	}
2032	while(!breakout);
2033	}
2034	return f;
2035	}
2036
2037	template <class charT, class traits>
2038	bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_start)
2039	{
2040	//
2041	// If we didn't actually add any states after the last
2042	// alternative then that's an error:
2043	//
2044	if((this->m_alt_insert_point == static_cast<std::ptrdiff_t>(this->m_pdata->m_data.size()))
2045	&& m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start))
2046	{
2047	fail(regex_constants::error_empty, this->m_position - this->m_base);
2048	return false;
2049	}
2050	//
2051	// Fix up our alternatives:
2052	//
2053	while(m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start))
2054	{
2055	//
2056	// fix up the jump to point to the end of the states
2057	// that we've just added:
2058	//
2059	std::ptrdiff_t jump_offset = m_alt_jumps.back();
2060	m_alt_jumps.pop_back();
2061	this->m_pdata->m_data.align();
2062	re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
2063	BOOST_ASSERT(jmp->type == syntax_element_jump);
2064	jmp->alt.i = this->m_pdata->m_data.size() - jump_offset;
2065	}
2066	return true;
2067	}
2068
2069	#ifdef BOOST_MSVC
2070	#pragma warning(pop)
2071	#endif
2072
2073	} // namespace re_detail
2074	} // namespace boost
2075
2076	#ifdef BOOST_HAS_ABI_HEADERS
2077	# include BOOST_ABI_SUFFIX
2078	#endif
2079
2080	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: NonGTP/Boost/boost/regex/v4/basic_regex_parser.hpp @ 857

Download in other formats: