source: NonGTP/Boost/boost/wave/cpplexer/re2clex/cpp_re2c_lexer.hpp @ 857

Revision 857, 12.9 KB checked in by igarcia, 18 years ago (diff)
Line 
1/*=============================================================================
2    Boost.Wave: A Standard compliant C++ preprocessor library
3
4    Re2C based C++ lexer
5   
6    http://www.boost.org/
7
8    Copyright (c) 2001-2005 Hartmut Kaiser. Distributed under the Boost
9    Software License, Version 1.0. (See accompanying file
10    LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
11=============================================================================*/
12
13#if !defined(CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED)
14#define CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED
15
16#include <string>
17#include <cstdio>
18#include <cstdarg>
19#if defined(BOOST_SPIRIT_DEBUG)
20#include <iostream>
21#endif // defined(BOOST_SPIRIT_DEBUG)
22
23#include <boost/concept_check.hpp>
24#include <boost/assert.hpp>
25#include <boost/spirit/core.hpp>
26
27#include <boost/wave/wave_config.hpp>
28#include <boost/wave/language_support.hpp>
29#include <boost/wave/token_ids.hpp>
30#include <boost/wave/util/file_position.hpp>
31#include <boost/wave/cpplexer/validate_universal_char.hpp>
32#include <boost/wave/cpplexer/cpplexer_exceptions.hpp>
33#include <boost/wave/cpplexer/token_cache.hpp>
34#include <boost/wave/cpplexer/convert_trigraphs.hpp>
35
36#include <boost/wave/cpplexer/cpp_lex_token.hpp>
37#include <boost/wave/cpplexer/cpp_lex_interface.hpp>
38#include <boost/wave/cpplexer/re2clex/scanner.hpp>
39#include <boost/wave/cpplexer/re2clex/cpp_re.hpp>
40
41///////////////////////////////////////////////////////////////////////////////
42namespace boost {
43namespace wave {
44namespace cpplexer {
45namespace re2clex {
46
47///////////////////////////////////////////////////////////////////////////////
48//
49//  encapsulation of the re2c based cpp lexer
50//
51///////////////////////////////////////////////////////////////////////////////
52
53template <typename IteratorT, typename PositionT = boost::wave::util::file_position_type>
54class lexer
55{
56public:
57
58    typedef char                        char_t;
59    typedef Scanner                     base_t;
60    typedef lex_token<PositionT>        token_type;
61    typedef typename token_type::string_type  string_type;
62   
63    lexer(IteratorT const &first, IteratorT const &last,
64        PositionT const &pos, boost::wave::language_support language);
65    ~lexer();
66
67    lex_token<PositionT> get();
68    void set_position(PositionT const &pos)
69    {
70        // set position has to change the file name and line number only
71        filename = pos.get_file();
72        scanner.line = pos.get_line();
73//        scanner.column = scanner.curr_column = pos.get_column();
74        scanner.file_name = filename.c_str();
75    }
76
77// error reporting from the re2c generated lexer
78    static int report_error(Scanner const* s, char const *, ...);
79
80private:
81    static char const *tok_names[];
82   
83    Scanner scanner;
84    string_type filename;
85    string_type value;
86    bool at_eof;
87    boost::wave::language_support language;
88   
89    static token_cache<string_type> const cache;
90};
91
92///////////////////////////////////////////////////////////////////////////////
93// initialize cpp lexer
94template <typename IteratorT, typename PositionT>
95inline
96lexer<IteratorT, PositionT>::lexer(IteratorT const &first,
97        IteratorT const &last, PositionT const &pos,
98        boost::wave::language_support language)
99:   filename(pos.get_file()), at_eof(false), language(language)
100{
101    using namespace std;        // some systems have memset in std
102    memset(&scanner, '\0', sizeof(Scanner));
103    scanner.fd = -1;
104    scanner.eol_offsets = aq_create();
105    scanner.first = scanner.act = (uchar *)&(*first);
106    scanner.last = scanner.first + std::distance(first, last); 
107    scanner.line = pos.get_line();
108    scanner.column = scanner.curr_column = pos.get_column();
109    scanner.error_proc = report_error;
110    scanner.file_name = filename.c_str();
111   
112#if BOOST_WAVE_SUPPORT_MS_EXTENSIONS != 0
113    scanner.enable_ms_extensions = 1;
114#else
115    scanner.enable_ms_extensions = 0;
116#endif
117
118#if BOOST_WAVE_SUPPORT_VARIADICS_PLACEMARKERS != 0
119    scanner.act_in_c99_mode = boost::wave::need_c99(language);
120#endif
121}
122
123template <typename IteratorT, typename PositionT>
124inline
125lexer<IteratorT, PositionT>::~lexer()
126{
127    using namespace std;        // some systems have memset in std
128    aq_terminate(scanner.eol_offsets);
129    free(scanner.bot);
130}
131
132///////////////////////////////////////////////////////////////////////////////
133//  get the next token from the input stream
134template <typename IteratorT, typename PositionT>
135inline lex_token<PositionT>
136lexer<IteratorT, PositionT>::get()
137{
138    if (at_eof)
139        return lex_token<PositionT>();  // return T_EOI
140
141    unsigned int actline = scanner.line;
142    token_id id = token_id(scan(&scanner));
143   
144    switch (static_cast<unsigned int>(id)) {
145    case T_IDENTIFIER:
146    // test identifier characters for validity (throws if invalid chars found)
147        value = string_type((char const *)scanner.tok,
148            scanner.cur-scanner.tok);
149        if (!(language & support_option_no_character_validation))
150            impl::validate_identifier_name(value, actline, scanner.column, filename);
151        break;
152 
153    case T_STRINGLIT:
154    case T_CHARLIT:
155    // test literal characters for validity (throws if invalid chars found)
156        value = string_type((char const *)scanner.tok,
157            scanner.cur-scanner.tok);
158        if (language & support_option_convert_trigraphs)
159            value = impl::convert_trigraphs(value, actline, scanner.column, filename);
160        if (!(language & support_option_no_character_validation))
161            impl::validate_literal(value, actline, scanner.column, filename);
162        break;
163
164#if BOOST_WAVE_SUPPORT_INCLUDE_NEXT != 0
165    case T_PP_HHEADER:
166    case T_PP_QHEADER:
167    case T_PP_INCLUDE:
168    // convert to the corresponding ..._next token, if appropriate
169      {
170          value = string_type((char const *)scanner.tok,
171              scanner.cur-scanner.tok);
172
173      // Skip '#' and whitespace and see whether we find an 'include_next' here.
174          typename string_type::size_type start = value.find("include");
175          if (value.compare(start, 12, "include_next", 12) == 0)
176              id = token_id(id | AltTokenType);
177          break;
178      }
179#endif
180
181    case T_LONGINTLIT:  // supported in C99 and long_long mode
182        value = string_type((char const *)scanner.tok,
183            scanner.cur-scanner.tok);
184        if (!boost::wave::need_long_long(language)) {
185        // syntax error: not allowed in C++ mode
186            BOOST_WAVE_LEXER_THROW(lexing_exception, invalid_long_long_literal,
187                value.c_str(), actline, scanner.column, filename.c_str());
188        }
189        break;
190
191    case T_OCTALINT:
192    case T_DECIMALINT:
193    case T_HEXAINT:
194    case T_INTLIT:
195    case T_FLOATLIT:
196    case T_FIXEDPOINTLIT:
197    case T_CCOMMENT:
198    case T_CPPCOMMENT:
199    case T_SPACE:
200    case T_SPACE2:
201    case T_ANY:
202        value = string_type((char const *)scanner.tok,
203            scanner.cur-scanner.tok);
204        break;
205       
206    case T_EOF:
207    // T_EOF is returned as a valid token, the next call will return T_EOI,
208    // i.e. the actual end of input
209        at_eof = true;
210        value.clear();
211        break;
212       
213    case T_OR_TRIGRAPH:
214    case T_XOR_TRIGRAPH:
215    case T_LEFTBRACE_TRIGRAPH:
216    case T_RIGHTBRACE_TRIGRAPH:
217    case T_LEFTBRACKET_TRIGRAPH:
218    case T_RIGHTBRACKET_TRIGRAPH:
219    case T_COMPL_TRIGRAPH:
220    case T_POUND_TRIGRAPH:
221        if (language & support_option_convert_trigraphs) {
222            value = cache.get_token_value(BASEID_FROM_TOKEN(id));
223        }
224        else {
225            value = string_type((char const *)scanner.tok,
226                scanner.cur-scanner.tok);
227        }
228        break;
229       
230    case T_ANY_TRIGRAPH:
231        if (language & support_option_convert_trigraphs) {
232            value = impl::convert_trigraph(
233                string_type((char const *)scanner.tok,
234                    scanner.cur-scanner.tok),
235                actline, scanner.column, filename);
236        }
237        else {
238            value = string_type((char const *)scanner.tok,
239                scanner.cur-scanner.tok);
240        }
241        break;
242       
243    default:
244        if (CATEGORY_FROM_TOKEN(id) != EXTCATEGORY_FROM_TOKEN(id) ||
245            IS_CATEGORY(id, UnknownTokenType))
246        {
247            value = string_type((char const *)scanner.tok,
248                scanner.cur-scanner.tok);
249        }
250        else {
251            value = cache.get_token_value(id);
252        }
253        break;
254    }
255   
256    // the re2c lexer reports the new line number for newline tokens
257    return lex_token<PositionT>(id, value,
258        PositionT(filename, actline, scanner.column));
259}
260
261template <typename IteratorT, typename PositionT>
262inline int
263lexer<IteratorT, PositionT>::report_error(Scanner const *s, char const *msg, ...)
264{
265    BOOST_ASSERT(0 != s);
266    BOOST_ASSERT(0 != msg);
267
268    using namespace std;    // some system have vsprintf in namespace std
269   
270    char buffer[200];           // should be large enough
271    va_list params;
272    va_start(params, msg);
273    vsprintf(buffer, msg, params);
274    va_end(params);
275   
276    BOOST_WAVE_LEXER_THROW(lexing_exception, generic_lexing_error, buffer,
277        s->line, s->column, s->file_name);
278    BOOST_UNREACHABLE_RETURN(0);
279}
280
281///////////////////////////////////////////////////////////////////////////////
282//   
283//  lex_functor
284//   
285///////////////////////////////////////////////////////////////////////////////
286     
287template <typename IteratorT, typename PositionT = boost::wave::util::file_position_type>
288class lex_functor
289:   public lex_input_interface<typename lexer<IteratorT, PositionT>::token_type>
290{   
291public:
292
293    typedef typename lexer<IteratorT, PositionT>::token_type   token_type;
294   
295    lex_functor(IteratorT const &first, IteratorT const &last,
296            PositionT const &pos, boost::wave::language_support language)
297    :   lexer(first, last, pos, language)
298    {}
299    virtual ~lex_functor() {}
300   
301// get the next token from the input stream
302    token_type get() { return lexer.get(); }
303    void set_position(PositionT const &pos)
304    { lexer.set_position(pos); }
305
306private:
307    lexer<IteratorT, PositionT> lexer;
308};
309
310///////////////////////////////////////////////////////////////////////////////
311template <typename IteratorT, typename PositionT>
312token_cache<typename lexer<IteratorT, PositionT>::string_type> const
313    lexer<IteratorT, PositionT>::cache =
314        token_cache<typename lexer<IteratorT, PositionT>::string_type>();
315   
316}   // namespace re2clex
317
318///////////////////////////////////////////////////////////////////////////////
319// 
320//  The new_lexer_gen<>::new_lexer function (declared in cpp_lex_interface.hpp)
321//  should be defined inline, if the lex_functor shouldn't be instantiated
322//  separately from the lex_iterator.
323//
324//  Separate (explicit) instantiation helps to reduce compilation time.
325//
326///////////////////////////////////////////////////////////////////////////////
327
328#if BOOST_WAVE_SEPARATE_LEXER_INSTANTIATION != 0
329#define BOOST_WAVE_RE2C_NEW_LEXER_INLINE
330#else
331#define BOOST_WAVE_RE2C_NEW_LEXER_INLINE inline
332#endif
333
334///////////////////////////////////////////////////////////////////////////////
335//
336//  The 'new_lexer' function allows the opaque generation of a new lexer object.
337//  It is coupled to the iterator type to allow to decouple the lexer/iterator
338//  configurations at compile time.
339//
//  This function is declared inside the cpp_lex_interface.hpp file, which is
//  referenced by the source file calling the lexer and the source file, which
//  instantiates the lex_functor. But it is defined here, so it will be
343//  instantiated only while compiling the source file, which instantiates the
344//  lex_functor. While the cpp_re2c_token.hpp file may be included everywhere,
345//  this file (cpp_re2c_lexer.hpp) should be included only once. This allows
346//  to decouple the lexer interface from the lexer implementation and reduces
347//  compilation time.
348//
349///////////////////////////////////////////////////////////////////////////////
350
351template <typename IteratorT, typename PositionT>
352BOOST_WAVE_RE2C_NEW_LEXER_INLINE
353lex_input_interface<lex_token<PositionT> > *
354new_lexer_gen<IteratorT, PositionT>::new_lexer(IteratorT const &first,
355    IteratorT const &last, PositionT const &pos,
356    boost::wave::language_support language)
357{
358    return new re2clex::lex_functor<IteratorT, PositionT>(first, last, pos,
359        language);
360}
361
362#undef BOOST_WAVE_RE2C_NEW_LEXER_INLINE
363
364///////////////////////////////////////////////////////////////////////////////
365}   // namespace cpplexer
366}   // namespace wave
367}   // namespace boost
368     
369#endif // !defined(CPP_RE2C_LEXER_HPP_B81A2629_D5B1_4944_A97D_60254182B9A8_INCLUDED)
Note: See TracBrowser for help on using the repository browser.