source: NonGTP/Boost/boost/regex/v4/states.hpp @ 857

Revision 857, 10.6 KB checked in by igarcia, 18 years ago (diff)
Line 
1/*
2 *
3 * Copyright (c) 1998-2002
4 * John Maddock
5 *
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 *
10 */
11
 /*
  *   LOCATION:    see http://www.boost.org for most recent version.
  *   FILE         states.hpp
  *   VERSION      see <boost/version.hpp>
  *   DESCRIPTION: Declares internal state machine structures.
  */
18
19#ifndef BOOST_REGEX_V4_STATES_HPP
20#define BOOST_REGEX_V4_STATES_HPP
21
22#ifdef BOOST_HAS_ABI_HEADERS
23#  include BOOST_ABI_PREFIX
24#endif
25
26namespace boost{
27namespace re_detail{
28
29/*** mask_type *******************************************************
30Whenever we have a choice of two alternatives, we use an array of bytes
31to indicate which of the two alternatives it is possible to take for any
32given input character.  If mask_take is set, then we can take the next
33state, and if mask_skip is set then we can take the alternative.
34***********************************************************************/
enum mask_type
{
   mask_take = 1 << 0,                 // the next state can be taken
   mask_skip = 1 << 1,                 // the alternative can be taken
   mask_init = 1 << 2,
   mask_any  = mask_take | mask_skip,  // either of the two routes
   mask_all  = mask_any                // same value as mask_any
};
43
44/*** helpers **********************************************************
45These helpers let us use function overload resolution to detect whether
46we have narrow or wide character strings:
47***********************************************************************/
// Tag types: is_byte<charT>::width_type names one of these, so overloads
// can be selected on character width.
struct _narrow_type{};
struct _wide_type{};

// Primary template: any character type not specialised below is wide.
template <class charT>
struct is_byte
{
   typedef _wide_type width_type;
};

// char, unsigned char and signed char are tagged as narrow.
template <>
struct is_byte<char>
{
   typedef _narrow_type width_type;
};

template <>
struct is_byte<unsigned char>
{
   typedef _narrow_type width_type;
};

template <>
struct is_byte<signed char>
{
   typedef _narrow_type width_type;
};
55
56/*** enum syntax_element_type ******************************************
57Every record in the state machine falls into one of the following types:
58***********************************************************************/
enum syntax_element_type
{
   // Values are consecutive, starting at zero; each enumerator is one
   // greater than the one before it.
   syntax_element_startmark = 0,     // start of a marked sub-expression, or perl-style (?...) extension
   syntax_element_endmark,           // end of a marked sub-expression, or perl-style (?...) extension
   syntax_element_literal,           // any sequence of literal characters
   syntax_element_start_line,        // start of line assertion: ^
   syntax_element_end_line,          // end of line assertion: $
   syntax_element_wild,              // match any character: .
   syntax_element_match,             // end of expression: we have a match when we get here
   syntax_element_word_boundary,     // perl style word boundary: \b
   syntax_element_within_word,       // perl style within word boundary: \B
   syntax_element_word_start,        // start of word assertion: \<
   syntax_element_word_end,          // end of word assertion: \>
   syntax_element_buffer_start,      // start of buffer assertion: \`
   syntax_element_buffer_end,        // end of buffer assertion: \'
   syntax_element_backref,           // backreference to previously matched sub-expression
   syntax_element_long_set,          // wide character set [..], or one with multicharacter collating elements
   syntax_element_set,               // narrow character set: [...]
   syntax_element_jump,              // jump to a new state in the machine
   syntax_element_alt,               // choose between two production states
   syntax_element_rep,               // a repeat
   syntax_element_combining,         // match a combining character sequence
   syntax_element_soft_buffer_end,   // perl style soft buffer end: \z
   syntax_element_restart_continue,  // perl style continuation: \G
   // single character repeats:
   syntax_element_dot_rep,
   syntax_element_char_rep,
   syntax_element_short_set_rep,
   syntax_element_long_set_rep,
   syntax_element_backstep,          // a backstep for lookbehind repeats
   syntax_element_assert_backref,    // an assertion that a mark was matched
   syntax_element_toggle_case        // presumably the state type used by re_case -- confirm
};
116
#ifdef BOOST_REGEX_DEBUG
// Stream inserter for syntax_element_type; only declared when
// BOOST_REGEX_DEBUG is defined.
// dwa 09/26/00 - This is needed to suppress warnings about an ambiguous conversion
std::ostream& operator<<(std::ostream& os, syntax_element_type type);
#endif
121
// Declared ahead of its definition because offset_type refers to it.
struct re_syntax_base;

/*** union offset_type ************************************************
A reference to another state in the machine.  While the machine is being
constructed the integral offset is used; the offsets are converted to
pointers before the machine is executed.
***********************************************************************/
union offset_type
{
   // pointer form, used once offsets have been converted
   re_syntax_base* p;
   // integral offset form, used during construction
   std::ptrdiff_t i;
};
134
135/*** struct re_syntax_base ********************************************
136Base class for all states in the machine.
137***********************************************************************/
138struct re_syntax_base
139{
140   syntax_element_type   type;         // what kind of state this is
141   offset_type           next;         // next state in the machine
142};
143
144/*** struct re_brace **************************************************
145A marked parenthesis.
146***********************************************************************/
147struct re_brace : public re_syntax_base
148{
149   // The index to match, can be zero (don't mark the sub-expression)
150   // or negative (for perl style (?...) extentions):
151   int index;
152};
153
154/*** struct re_dot **************************************************
155Match anything.
156***********************************************************************/
// Unnamed constants; presumably the values carried by re_dot::mask -- confirm.
// Note that force_newline and test_not_newline share the value 2.
enum
{
   force_not_newline = 0,
   dont_care = 1,
   force_newline = 2,

   test_not_newline = 2,
   test_newline = 3
};
166struct re_dot : public re_syntax_base
167{
168   unsigned char mask;
169};
170
171/*** struct re_literal ************************************************
172A string of literals, following this structure will be an
173array of characters: charT[length]
174***********************************************************************/
175struct re_literal : public re_syntax_base
176{
177   unsigned int length;
178};
179
/*** struct re_case ************************************************
Indicates whether we are moving to a case insensitive block or not
***********************************************************************/
183struct re_case : public re_syntax_base
184{
185   bool icase;
186};
187
/*** struct re_set_long ***********************************************
A wide character set; following this structure will be
an array of type charT:
First csingles null-terminated strings
Then 2 * cranges null-terminated strings
Then cequivalents null-terminated strings
***********************************************************************/
195template <class mask_type>
196struct re_set_long : public re_syntax_base
197{
198   unsigned int            csingles, cranges, cequivalents;
199   mask_type               cclasses;
200   mask_type               cnclasses;
201   bool                    isnot;
202   bool                    singleton;
203};
204
/*** struct re_set ****************************************************
A set of narrow characters: matches any character whose _map entry is non-zero
***********************************************************************/
208struct re_set : public re_syntax_base
209{
210   unsigned char _map[1 << CHAR_BIT];
211};
212
213/*** struct re_jump ***************************************************
214Jump to a new location in the machine (not next).
215***********************************************************************/
216struct re_jump : public re_syntax_base
217{
218   offset_type     alt;                 // location to jump to
219};
220
221/*** struct re_alt ***************************************************
222Jump to a new location in the machine (possibly next).
223***********************************************************************/
224struct re_alt : public re_jump
225{
226   unsigned char   _map[1 << CHAR_BIT]; // which characters can take the jump
227   unsigned int    can_be_null;         // true if we match a NULL string
228};
229
230/*** struct re_repeat *************************************************
231Repeat a section of the machine
232***********************************************************************/
233struct re_repeat : public re_alt
234{
235   std::size_t   min, max;  // min and max allowable repeats
236   int           id;        // Unique identifier for this repeat
237   bool          leading;   // True if this repeat is at the start of the machine (lets us optimize some searches)
238   bool          greedy;    // True if this is a greedy repeat
239};
240
/*** enum re_jump_size_type *******************************************
Provides compiled size of re_jump structure (allowing for trailing alignment).
We provide this so we know how many bytes to insert when constructing the machine
(The value of padding_mask is defined in regex_raw_buffer.hpp).
***********************************************************************/
246enum re_jump_size_type
247{
248   re_jump_size = (sizeof(re_jump) + padding_mask) & ~(padding_mask),
249   re_repeater_size = (sizeof(re_repeat) + padding_mask) & ~(padding_mask),
250   re_alt_size = (sizeof(re_alt) + padding_mask) & ~(padding_mask)
251};
252
253/*** proc re_is_set_member *********************************************
254Forward declaration: we'll need this one later...
255***********************************************************************/
256
257template<class charT, class traits>
258struct regex_data;
259
260template <class iterator, class charT, class traits_type, class char_classT>
261iterator BOOST_REGEX_CALL re_is_set_member(iterator next,
262                          iterator last,
263                          const re_set_long<char_classT>* set_,
264                          const regex_data<charT, traits_type>& e, bool icase);
265
266} // namespace re_detail
267
268} // namespace boost
269
270#ifdef BOOST_HAS_ABI_HEADERS
271#  include BOOST_ABI_SUFFIX
272#endif
273
274#endif
275
276
Note: See TracBrowser for help on using the repository browser.