1 | /*=============================================================================
|
---|
2 | Boost.Wave: A Standard compliant C++ preprocessor library
|
---|
3 |
|
---|
4 | Re2C based C++ lexer
|
---|
5 |
|
---|
6 | http://www.boost.org/
|
---|
7 |
|
---|
8 | Copyright (c) 2001-2005 Hartmut Kaiser. Distributed under the Boost
|
---|
9 | Software License, Version 1.0. (See accompanying file
|
---|
10 | LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
---|
11 | =============================================================================*/
|
---|
12 |
|
---|
13 | #if !defined(CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED)
|
---|
14 | #define CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED
|
---|
15 |
|
---|
16 | #include <string>
|
---|
17 | #include <cstdio>
|
---|
18 | #include <cstdarg>
|
---|
19 | #if defined(BOOST_SPIRIT_DEBUG)
|
---|
20 | #include <iostream>
|
---|
21 | #endif // defined(BOOST_SPIRIT_DEBUG)
|
---|
22 |
|
---|
23 | #include <boost/concept_check.hpp>
|
---|
24 | #include <boost/assert.hpp>
|
---|
25 | #include <boost/spirit/core.hpp>
|
---|
26 |
|
---|
27 | #include <boost/wave/wave_config.hpp>
|
---|
28 | #include <boost/wave/language_support.hpp>
|
---|
29 | #include <boost/wave/token_ids.hpp>
|
---|
30 | #include <boost/wave/util/file_position.hpp>
|
---|
31 | #include <boost/wave/cpplexer/validate_universal_char.hpp>
|
---|
32 | #include <boost/wave/cpplexer/cpplexer_exceptions.hpp>
|
---|
33 | #include <boost/wave/cpplexer/token_cache.hpp>
|
---|
34 | #include <boost/wave/cpplexer/convert_trigraphs.hpp>
|
---|
35 |
|
---|
36 | #include <boost/wave/cpplexer/cpp_lex_token.hpp>
|
---|
37 | #include <boost/wave/cpplexer/cpp_lex_interface.hpp>
|
---|
38 | #include <boost/wave/cpplexer/re2clex/scanner.hpp>
|
---|
39 | #include <boost/wave/cpplexer/re2clex/cpp_re.hpp>
|
---|
40 |
|
---|
41 | ///////////////////////////////////////////////////////////////////////////////
|
---|
42 | namespace boost {
|
---|
43 | namespace wave {
|
---|
44 | namespace cpplexer {
|
---|
45 | namespace re2clex {
|
---|
46 |
|
---|
47 | ///////////////////////////////////////////////////////////////////////////////
|
---|
48 | //
|
---|
49 | // encapsulation of the re2c based cpp lexer
|
---|
50 | //
|
---|
51 | ///////////////////////////////////////////////////////////////////////////////
|
---|
52 |
|
---|
53 | template <typename IteratorT, typename PositionT = boost::wave::util::file_position_type>
|
---|
54 | class lexer
|
---|
55 | {
|
---|
56 | public:
|
---|
57 |
|
---|
58 | typedef char char_t;
|
---|
59 | typedef Scanner base_t;
|
---|
60 | typedef lex_token<PositionT> token_type;
|
---|
61 | typedef typename token_type::string_type string_type;
|
---|
62 |
|
---|
63 | lexer(IteratorT const &first, IteratorT const &last,
|
---|
64 | PositionT const &pos, boost::wave::language_support language);
|
---|
65 | ~lexer();
|
---|
66 |
|
---|
67 | lex_token<PositionT> get();
|
---|
68 | void set_position(PositionT const &pos)
|
---|
69 | {
|
---|
70 | // set position has to change the file name and line number only
|
---|
71 | filename = pos.get_file();
|
---|
72 | scanner.line = pos.get_line();
|
---|
73 | // scanner.column = scanner.curr_column = pos.get_column();
|
---|
74 | scanner.file_name = filename.c_str();
|
---|
75 | }
|
---|
76 |
|
---|
77 | // error reporting from the re2c generated lexer
|
---|
78 | static int report_error(Scanner const* s, char const *, ...);
|
---|
79 |
|
---|
80 | private:
|
---|
81 | static char const *tok_names[];
|
---|
82 |
|
---|
83 | Scanner scanner;
|
---|
84 | string_type filename;
|
---|
85 | string_type value;
|
---|
86 | bool at_eof;
|
---|
87 | boost::wave::language_support language;
|
---|
88 |
|
---|
89 | static token_cache<string_type> const cache;
|
---|
90 | };
|
---|
91 |
|
---|
92 | ///////////////////////////////////////////////////////////////////////////////
|
---|
93 | // initialize cpp lexer
|
---|
94 | template <typename IteratorT, typename PositionT>
|
---|
95 | inline
|
---|
96 | lexer<IteratorT, PositionT>::lexer(IteratorT const &first,
|
---|
97 | IteratorT const &last, PositionT const &pos,
|
---|
98 | boost::wave::language_support language)
|
---|
99 | : filename(pos.get_file()), at_eof(false), language(language)
|
---|
100 | {
|
---|
101 | using namespace std; // some systems have memset in std
|
---|
102 | memset(&scanner, '\0', sizeof(Scanner));
|
---|
103 | scanner.fd = -1;
|
---|
104 | scanner.eol_offsets = aq_create();
|
---|
105 | scanner.first = scanner.act = (uchar *)&(*first);
|
---|
106 | scanner.last = scanner.first + std::distance(first, last);
|
---|
107 | scanner.line = pos.get_line();
|
---|
108 | scanner.column = scanner.curr_column = pos.get_column();
|
---|
109 | scanner.error_proc = report_error;
|
---|
110 | scanner.file_name = filename.c_str();
|
---|
111 |
|
---|
112 | #if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
|
---|
113 | scanner.enable_ms_extensions = 1;
|
---|
114 | #else
|
---|
115 | scanner.enable_ms_extensions = 0;
|
---|
116 | #endif
|
---|
117 |
|
---|
118 | #if BOOST_WAVE_SUPPORT_VARIADICS_PLACEMARKERS != 0
|
---|
119 | scanner.act_in_c99_mode = boost::wave::need_c99(language);
|
---|
120 | #endif
|
---|
121 | }
|
---|
122 |
|
---|
123 | template <typename IteratorT, typename PositionT>
|
---|
124 | inline
|
---|
125 | lexer<IteratorT, PositionT>::~lexer()
|
---|
126 | {
|
---|
127 | using namespace std; // some systems have memset in std
|
---|
128 | aq_terminate(scanner.eol_offsets);
|
---|
129 | free(scanner.bot);
|
---|
130 | }
|
---|
131 |
|
---|
132 | ///////////////////////////////////////////////////////////////////////////////
|
---|
133 | // get the next token from the input stream
|
---|
134 | template <typename IteratorT, typename PositionT>
|
---|
135 | inline lex_token<PositionT>
|
---|
136 | lexer<IteratorT, PositionT>::get()
|
---|
137 | {
|
---|
138 | if (at_eof)
|
---|
139 | return lex_token<PositionT>(); // return T_EOI
|
---|
140 |
|
---|
141 | unsigned int actline = scanner.line;
|
---|
142 | token_id id = token_id(scan(&scanner));
|
---|
143 |
|
---|
144 | switch (static_cast<unsigned int>(id)) {
|
---|
145 | case T_IDENTIFIER:
|
---|
146 | // test identifier characters for validity (throws if invalid chars found)
|
---|
147 | value = string_type((char const *)scanner.tok,
|
---|
148 | scanner.cur-scanner.tok);
|
---|
149 | if (!(language & support_option_no_character_validation))
|
---|
150 | impl::validate_identifier_name(value, actline, scanner.column, filename);
|
---|
151 | break;
|
---|
152 |
|
---|
153 | case T_STRINGLIT:
|
---|
154 | case T_CHARLIT:
|
---|
155 | // test literal characters for validity (throws if invalid chars found)
|
---|
156 | value = string_type((char const *)scanner.tok,
|
---|
157 | scanner.cur-scanner.tok);
|
---|
158 | if (language & support_option_convert_trigraphs)
|
---|
159 | value = impl::convert_trigraphs(value, actline, scanner.column, filename);
|
---|
160 | if (!(language & support_option_no_character_validation))
|
---|
161 | impl::validate_literal(value, actline, scanner.column, filename);
|
---|
162 | break;
|
---|
163 |
|
---|
164 | #if BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
|
---|
165 | case T_PP_HHEADER:
|
---|
166 | case T_PP_QHEADER:
|
---|
167 | case T_PP_INCLUDE:
|
---|
168 | // convert to the corresponding ..._next token, if appropriate
|
---|
169 | {
|
---|
170 | value = string_type((char const *)scanner.tok,
|
---|
171 | scanner.cur-scanner.tok);
|
---|
172 |
|
---|
173 | // Skip '#' and whitespace and see whether we find an 'include_next' here.
|
---|
174 | typename string_type::size_type start = value.find("include");
|
---|
175 | if (value.compare(start, 12, "include_next", 12) == 0)
|
---|
176 | id = token_id(id | AltTokenType);
|
---|
177 | break;
|
---|
178 | }
|
---|
179 | #endif
|
---|
180 |
|
---|
181 | case T_LONGINTLIT: // supported in C99 and long_long mode
|
---|
182 | value = string_type((char const *)scanner.tok,
|
---|
183 | scanner.cur-scanner.tok);
|
---|
184 | if (!boost::wave::need_long_long(language)) {
|
---|
185 | // syntax error: not allowed in C++ mode
|
---|
186 | BOOST_WAVE_LEXER_THROW(lexing_exception, invalid_long_long_literal,
|
---|
187 | value.c_str(), actline, scanner.column, filename.c_str());
|
---|
188 | }
|
---|
189 | break;
|
---|
190 |
|
---|
191 | case T_OCTALINT:
|
---|
192 | case T_DECIMALINT:
|
---|
193 | case T_HEXAINT:
|
---|
194 | case T_INTLIT:
|
---|
195 | case T_FLOATLIT:
|
---|
196 | case T_FIXEDPOINTLIT:
|
---|
197 | case T_CCOMMENT:
|
---|
198 | case T_CPPCOMMENT:
|
---|
199 | case T_SPACE:
|
---|
200 | case T_SPACE2:
|
---|
201 | case T_ANY:
|
---|
202 | value = string_type((char const *)scanner.tok,
|
---|
203 | scanner.cur-scanner.tok);
|
---|
204 | break;
|
---|
205 |
|
---|
206 | case T_EOF:
|
---|
207 | // T_EOF is returned as a valid token, the next call will return T_EOI,
|
---|
208 | // i.e. the actual end of input
|
---|
209 | at_eof = true;
|
---|
210 | value.clear();
|
---|
211 | break;
|
---|
212 |
|
---|
213 | case T_OR_TRIGRAPH:
|
---|
214 | case T_XOR_TRIGRAPH:
|
---|
215 | case T_LEFTBRACE_TRIGRAPH:
|
---|
216 | case T_RIGHTBRACE_TRIGRAPH:
|
---|
217 | case T_LEFTBRACKET_TRIGRAPH:
|
---|
218 | case T_RIGHTBRACKET_TRIGRAPH:
|
---|
219 | case T_COMPL_TRIGRAPH:
|
---|
220 | case T_POUND_TRIGRAPH:
|
---|
221 | if (language & support_option_convert_trigraphs) {
|
---|
222 | value = cache.get_token_value(BASEID_FROM_TOKEN(id));
|
---|
223 | }
|
---|
224 | else {
|
---|
225 | value = string_type((char const *)scanner.tok,
|
---|
226 | scanner.cur-scanner.tok);
|
---|
227 | }
|
---|
228 | break;
|
---|
229 |
|
---|
230 | case T_ANY_TRIGRAPH:
|
---|
231 | if (language & support_option_convert_trigraphs) {
|
---|
232 | value = impl::convert_trigraph(
|
---|
233 | string_type((char const *)scanner.tok,
|
---|
234 | scanner.cur-scanner.tok),
|
---|
235 | actline, scanner.column, filename);
|
---|
236 | }
|
---|
237 | else {
|
---|
238 | value = string_type((char const *)scanner.tok,
|
---|
239 | scanner.cur-scanner.tok);
|
---|
240 | }
|
---|
241 | break;
|
---|
242 |
|
---|
243 | default:
|
---|
244 | if (CATEGORY_FROM_TOKEN(id) != EXTCATEGORY_FROM_TOKEN(id) ||
|
---|
245 | IS_CATEGORY(id, UnknownTokenType))
|
---|
246 | {
|
---|
247 | value = string_type((char const *)scanner.tok,
|
---|
248 | scanner.cur-scanner.tok);
|
---|
249 | }
|
---|
250 | else {
|
---|
251 | value = cache.get_token_value(id);
|
---|
252 | }
|
---|
253 | break;
|
---|
254 | }
|
---|
255 |
|
---|
256 | // the re2c lexer reports the new line number for newline tokens
|
---|
257 | return lex_token<PositionT>(id, value,
|
---|
258 | PositionT(filename, actline, scanner.column));
|
---|
259 | }
|
---|
260 |
|
---|
261 | template <typename IteratorT, typename PositionT>
|
---|
262 | inline int
|
---|
263 | lexer<IteratorT, PositionT>::report_error(Scanner const *s, char const *msg, ...)
|
---|
264 | {
|
---|
265 | BOOST_ASSERT(0 != s);
|
---|
266 | BOOST_ASSERT(0 != msg);
|
---|
267 |
|
---|
268 | using namespace std; // some system have vsprintf in namespace std
|
---|
269 |
|
---|
270 | char buffer[200]; // should be large enough
|
---|
271 | va_list params;
|
---|
272 | va_start(params, msg);
|
---|
273 | vsprintf(buffer, msg, params);
|
---|
274 | va_end(params);
|
---|
275 |
|
---|
276 | BOOST_WAVE_LEXER_THROW(lexing_exception, generic_lexing_error, buffer,
|
---|
277 | s->line, s->column, s->file_name);
|
---|
278 | BOOST_UNREACHABLE_RETURN(0);
|
---|
279 | }
|
---|
280 |
|
---|
281 | ///////////////////////////////////////////////////////////////////////////////
|
---|
282 | //
|
---|
283 | // lex_functor
|
---|
284 | //
|
---|
285 | ///////////////////////////////////////////////////////////////////////////////
|
---|
286 |
|
---|
287 | template <typename IteratorT, typename PositionT = boost::wave::util::file_position_type>
|
---|
288 | class lex_functor
|
---|
289 | : public lex_input_interface<typename lexer<IteratorT, PositionT>::token_type>
|
---|
290 | {
|
---|
291 | public:
|
---|
292 |
|
---|
293 | typedef typename lexer<IteratorT, PositionT>::token_type token_type;
|
---|
294 |
|
---|
295 | lex_functor(IteratorT const &first, IteratorT const &last,
|
---|
296 | PositionT const &pos, boost::wave::language_support language)
|
---|
297 | : lexer(first, last, pos, language)
|
---|
298 | {}
|
---|
299 | virtual ~lex_functor() {}
|
---|
300 |
|
---|
301 | // get the next token from the input stream
|
---|
302 | token_type get() { return lexer.get(); }
|
---|
303 | void set_position(PositionT const &pos)
|
---|
304 | { lexer.set_position(pos); }
|
---|
305 |
|
---|
306 | private:
|
---|
307 | lexer<IteratorT, PositionT> lexer;
|
---|
308 | };
|
---|
309 |
|
---|
310 | ///////////////////////////////////////////////////////////////////////////////
|
---|
311 | template <typename IteratorT, typename PositionT>
|
---|
312 | token_cache<typename lexer<IteratorT, PositionT>::string_type> const
|
---|
313 | lexer<IteratorT, PositionT>::cache =
|
---|
314 | token_cache<typename lexer<IteratorT, PositionT>::string_type>();
|
---|
315 |
|
---|
316 | } // namespace re2clex
|
---|
317 |
|
---|
318 | ///////////////////////////////////////////////////////////////////////////////
|
---|
319 | //
|
---|
320 | // The new_lexer_gen<>::new_lexer function (declared in cpp_lex_interface.hpp)
|
---|
321 | // should be defined inline, if the lex_functor shouldn't be instantiated
|
---|
322 | // separately from the lex_iterator.
|
---|
323 | //
|
---|
324 | // Separate (explicit) instantiation helps to reduce compilation time.
|
---|
325 | //
|
---|
326 | ///////////////////////////////////////////////////////////////////////////////
|
---|
327 |
|
---|
// new_lexer is defined 'inline' unless the lexer is instantiated separately
#if BOOST_WAVE_SEPARATE_LEXER_INSTANTIATION == 0
#define BOOST_WAVE_RE2C_NEW_LEXER_INLINE inline
#else
#define BOOST_WAVE_RE2C_NEW_LEXER_INLINE
#endif
|
---|
333 |
|
---|
334 | ///////////////////////////////////////////////////////////////////////////////
|
---|
335 | //
|
---|
336 | // The 'new_lexer' function allows the opaque generation of a new lexer object.
|
---|
337 | // It is coupled to the iterator type to allow to decouple the lexer/iterator
|
---|
338 | // configurations at compile time.
|
---|
339 | //
|
---|
340 | // This function is declared inside the cpp_slex_token.hpp file, which is
|
---|
341 | // referenced by the source file calling the lexer and the source file, which
|
---|
342 | // instantiates the lex_functor. But it is defined here, so it will be
|
---|
343 | // instantiated only while compiling the source file, which instantiates the
|
---|
344 | // lex_functor. While the cpp_re2c_token.hpp file may be included everywhere,
|
---|
345 | // this file (cpp_re2c_lexer.hpp) should be included only once. This allows
|
---|
346 | // to decouple the lexer interface from the lexer implementation and reduces
|
---|
347 | // compilation time.
|
---|
348 | //
|
---|
349 | ///////////////////////////////////////////////////////////////////////////////
|
---|
350 |
|
---|
351 | template <typename IteratorT, typename PositionT>
|
---|
352 | BOOST_WAVE_RE2C_NEW_LEXER_INLINE
|
---|
353 | lex_input_interface<lex_token<PositionT> > *
|
---|
354 | new_lexer_gen<IteratorT, PositionT>::new_lexer(IteratorT const &first,
|
---|
355 | IteratorT const &last, PositionT const &pos,
|
---|
356 | boost::wave::language_support language)
|
---|
357 | {
|
---|
358 | return new re2clex::lex_functor<IteratorT, PositionT>(first, last, pos,
|
---|
359 | language);
|
---|
360 | }
|
---|
361 |
|
---|
362 | #undef BOOST_WAVE_RE2C_NEW_LEXER_INLINE
|
---|
363 |
|
---|
364 | ///////////////////////////////////////////////////////////////////////////////
|
---|
365 | } // namespace cpplexer
|
---|
366 | } // namespace wave
|
---|
367 | } // namespace boost
|
---|
368 |
|
---|
369 | #endif // !defined(CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED)
|
---|