source: NonGTP/Boost/boost/token_functions.hpp @ 857

Revision 857, 18.1 KB checked in by igarcia, 19 years ago (diff)
Line 
1// Boost token_functions.hpp  ------------------------------------------------//
2
3// Copyright John R. Bandela 2001.
4
5// Distributed under the Boost Software License, Version 1.0. (See
6// accompanying file LICENSE_1_0.txt or copy at
7// http://www.boost.org/LICENSE_1_0.txt)
8
9// See http://www.boost.org/libs/tokenizer for documentation.
10
11// Revision History:
12// 01 Oct 2004   Joaquín M López Muñoz
13//      Workaround for a problem with string::assign in msvc-stlport
14// 06 Apr 2004   John Bandela
15//      Fixed a bug involving using char_delimiter with a true input iterator
16// 28 Nov 2003   Robert Zeh and John Bandela
17//      Converted into "fast" functions that avoid using += when
18//      the supplied iterator isn't an input_iterator; based on
19//      some work done at Archelon and a version that was checked into
20//      the boost CVS for a short period of time.
21// 20 Feb 2002   John Maddock
22//      Removed using namespace std declarations and added
23//      workaround for BOOST_NO_STDC_NAMESPACE (the library
24//      can be safely mixed with regex).
25// 06 Feb 2002   Jeremy Siek
26//      Added char_separator.
27// 02 Feb 2002   Jeremy Siek
28//      Removed tabs and a little cleanup.
29
30
31#ifndef BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
32#define BOOST_TOKEN_FUNCTIONS_JRB120303_HPP_
33
34#include <vector>
35#include <stdexcept>
36#include <cassert>
37#include <string>
38#include <cctype>
39#include <algorithm> // for find_if
40#include <boost/config.hpp>
41#include <boost/detail/workaround.hpp>
42#include <boost/mpl/if.hpp>
43
44//
45// the following must not be macros if we are to prefix them
46// with std:: (they shouldn't be macros anyway...)
47//
48#ifdef ispunct
49#  undef ispunct
50#endif
51#ifdef isspace
52#  undef isspace
53#endif
54//
55// fix namespace problems:
56//
57#ifdef BOOST_NO_STDC_NAMESPACE
58namespace std{
59 using ::ispunct;
60 using ::isspace;
61}
62#endif
63
64namespace boost{
65
66  //===========================================================================
67  // The escaped_list_separator class. Which is a model of TokenizerFunction
68  // An escaped list is a super-set of what is commonly known as a comma
  // separated value (csv) list. It is separated into fields by a comma or
  // other character. If the delimiting character is inside quotes, then it is
  // counted as a regular character. To allow for embedded quotes in a field,
72  // there can be escape sequences using the \ much like C.
73  // The role of the comma, the quotation mark, and the escape
74  // character (backslash \), can be assigned to other characters.
75
76  struct escaped_list_error : public std::runtime_error{
77    escaped_list_error(const std::string& what):std::runtime_error(what) { }
78  };
79 
80
81// The out of the box GCC 2.95 on cygwin does not have a char_traits class.
82// MSVC does not like the following typename
83#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
84  template <class Char,
85    class Traits = typename std::basic_string<Char>::traits_type >
86#else
87  template <class Char,
88    class Traits = std::basic_string<Char>::traits_type >
89#endif
90  class escaped_list_separator {
91
92  private:
93    typedef std::basic_string<Char,Traits> string_type;
94    struct char_eq {
95      Char e_;
96      char_eq(Char e):e_(e) { }
97      bool operator()(Char c) {
98        return Traits::eq(e_,c);
99      }
100    };
101    string_type  escape_;
102    string_type  c_;
103    string_type  quote_;     
104    bool last_;
105
106    bool is_escape(Char e) {
107      char_eq f(e);
108      return std::find_if(escape_.begin(),escape_.end(),f)!=escape_.end();
109    }
110    bool is_c(Char e) {
111      char_eq f(e);
112      return std::find_if(c_.begin(),c_.end(),f)!=c_.end();
113    }
114    bool is_quote(Char e) {
115      char_eq f(e);
116      return std::find_if(quote_.begin(),quote_.end(),f)!=quote_.end();
117    }
118    template <typename iterator, typename Token>
119    void do_escape(iterator& next,iterator end,Token& tok) {
120      if (++next == end)
121        throw escaped_list_error(std::string("cannot end with escape"));
122      if (Traits::eq(*next,'n')) {
123        tok+='\n';
124        return;
125      }
126      else if (is_quote(*next)) {
127        tok+=*next;
128        return;
129      }
130      else if (is_c(*next)) {
131        tok+=*next;
132        return;
133      }
134      else if (is_escape(*next)) {
135        tok+=*next;
136        return;
137      }
138      else
139        throw escaped_list_error(std::string("unknown escape sequence"));
140    }
141
142    public:
143   
144    explicit escaped_list_separator(Char  e = '\\',
145                                    Char c = ',',Char  q = '\"')
146      : escape_(1,e), c_(1,c), quote_(1,q), last_(false) { }
147   
148    escaped_list_separator(string_type e, string_type c, string_type q)
149      : escape_(e), c_(c), quote_(q), last_(false) { }
150   
151    void reset() {last_=false;}
152
153    template <typename InputIterator, typename Token>
154    bool operator()(InputIterator& next,InputIterator end,Token& tok) {
155      bool bInQuote = false;
156      tok = Token();
157     
158      if (next == end) {
159        if (last_) {
160          last_ = false;
161          return true;
162        }
163        else
164          return false;
165      }
166      last_ = false;
167      for (;next != end;++next) {
168        if (is_escape(*next)) {
169          do_escape(next,end,tok);
170        }
171        else if (is_c(*next)) {
172          if (!bInQuote) {
173            // If we are not in quote, then we are done
174            ++next;
175            // The last character was a c, that means there is
176            // 1 more blank field
177            last_ = true;
178            return true;
179          }
180          else tok+=*next;
181        }
182        else if (is_quote(*next)) {
183          bInQuote=!bInQuote;
184        }
185        else {
186          tok += *next;
187        }
188      }
189      return true;
190    }
191  };
192
193  //===========================================================================
194  // The classes here are used by offset_separator and char_separator to implement
195  // faster assigning of tokens using assign instead of +=
196 
197  namespace tokenizer_detail {
198
199  // The assign_or_plus_equal struct contains functions that implement
200  // assign, +=, and clearing based on the iterator type.  The
201  // generic case does nothing for plus_equal and clearing, while
202  // passing through the call for assign.
203  //
204  // When an input iterator is being used, the situation is reversed.
205  // The assign method does nothing, plus_equal invokes operator +=,
206  // and the clearing method sets the supplied token to the default
207  // token constructor's result.
208  //
209
210  template<class IteratorTag>
211  struct assign_or_plus_equal {
212    template<class Iterator, class Token>
213    static void assign(Iterator b, Iterator e, Token &t) {
214
215#if BOOST_WORKAROUND(BOOST_MSVC, == 1200) &&\
216    BOOST_WORKAROUND(__SGI_STL_PORT, < 0x500) &&\
217    defined(_STLP_DEBUG) &&\
218    (defined(_STLP_USE_DYNAMIC_LIB) || defined(_DLL))
219    // Problem with string::assign for msvc-stlport in debug mode: the
220    // linker tries to import the templatized version of this memfun,
221    // which is obviously not exported.
222    // See http://www.stlport.com/dcforum/DCForumID6/1763.html for details.
223
224      t = Token();
225      while(b != e) t += *b++;
226#else
227      t.assign(b, e);
228#endif
229
230    }
231
232    template<class Token, class Value>
233    static void plus_equal(Token &, const Value &) {
234   
235  }
236
237    // If we are doing an assign, there is no need for the
238    // the clear.
239    //
240    template<class Token>
241    static void clear(Token &) {
242
243    }
244  };
245
246  template <>
247  struct assign_or_plus_equal<std::input_iterator_tag> {
248    template<class Iterator, class Token>
249    static void assign(Iterator b, Iterator e, Token &t) {
250
251    }
252    template<class Token, class Value>
253    static void plus_equal(Token &t, const Value &v) {
254      t += v;
255    }
256    template<class Token>
257    static void clear(Token &t) {
258      t = Token();
259    }
260  };
261
262
263  template<class Iterator>
264  struct pointer_iterator_category{
265    typedef std::random_access_iterator_tag type;
266  };
267
268
269  template<class Iterator>
270  struct class_iterator_category{
271    typedef typename Iterator::iterator_category type;
272  };
273
274
275
276  // This portably gets the iterator_tag without partial template specialization
277  template<class Iterator>
278    struct get_iterator_category{
279    typedef typename mpl::if_<is_pointer<Iterator>,
280      pointer_iterator_category<Iterator>,
281      class_iterator_category<Iterator>
282    >::type cat;
283
284    typedef typename cat::type iterator_category;
285  };
286
287 
288}
289
290   
291  //===========================================================================
292  // The offset_separator class, which is a model of TokenizerFunction.
293  // Offset breaks a string into tokens based on a range of offsets
294
295  class offset_separator {
296  private:
297
298    std::vector<int> offsets_;
299    unsigned int current_offset_;
300    bool wrap_offsets_;
301    bool return_partial_last_;
302   
303  public:
304    template <typename Iter>
305    offset_separator(Iter begin, Iter end, bool wrap_offsets = true,
306                     bool return_partial_last = true)
307      : offsets_(begin,end), current_offset_(0),
308        wrap_offsets_(wrap_offsets),
309        return_partial_last_(return_partial_last) { }
310   
311    offset_separator()
312      : offsets_(1,1), current_offset_(),
313        wrap_offsets_(true), return_partial_last_(true) { }
314
315    void reset() {
316      current_offset_ = 0;
317    }
318
319    template <typename InputIterator, typename Token>
320    bool operator()(InputIterator& next, InputIterator end, Token& tok)
321    {
322      typedef tokenizer_detail::assign_or_plus_equal<
323#if     !defined(BOOST_MSVC) || BOOST_MSVC > 1300
324        typename
325#endif
326        tokenizer_detail::get_iterator_category<
327        InputIterator>::iterator_category> assigner;
328
329
330      assert(!offsets_.empty());
331   
332      assigner::clear(tok);
333      InputIterator start(next);
334     
335      if (next == end)
336        return false;
337
338      if (current_offset_ == offsets_.size())
339        if (wrap_offsets_)
340          current_offset_=0;
341        else
342          return false;
343     
344      int c = offsets_[current_offset_];
345      int i = 0;
346      for (; i < c; ++i) {
347        if (next == end)break;
348        assigner::plus_equal(tok,*next++);
349      }
350      assigner::assign(start,next,tok);
351   
352      if (!return_partial_last_)
353        if (i < (c-1) )
354          return false;
355     
356      ++current_offset_;
357      return true;
358    }
359  };
360
361
362  //===========================================================================
363  // The char_separator class breaks a sequence of characters into
364  // tokens based on the character delimiters (very much like bad old
365  // strtok). A delimiter character can either be kept or dropped. A
366  // kept delimiter shows up as an output token, whereas a dropped
367  // delimiter does not.
368
369  // This class replaces the char_delimiters_separator class. The
370  // constructor for the char_delimiters_separator class was too
371  // confusing and needed to be deprecated. However, because of the
372  // default arguments to the constructor, adding the new constructor
373  // would cause ambiguity, so instead I deprecated the whole class.
374  // The implementation of the class was also simplified considerably.
375
376  enum empty_token_policy { drop_empty_tokens, keep_empty_tokens };
377
378  // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
379#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
380  template <typename Char,
381    typename Traits = typename std::basic_string<Char>::traits_type >
382#else
383  template <typename Char,
384    typename Traits = std::basic_string<Char>::traits_type >
385#endif
386  class char_separator
387  {
388    typedef std::basic_string<Char,Traits> string_type;
389  public:
390    explicit
391    char_separator(const Char* dropped_delims,
392                   const Char* kept_delims = 0,
393                   empty_token_policy empty_tokens = drop_empty_tokens)
394      : m_dropped_delims(dropped_delims),
395        m_use_ispunct(false),
396        m_use_isspace(false),
397        m_empty_tokens(empty_tokens),
398        m_output_done(false)
399    {
400      // Borland workaround
401      if (kept_delims)
402        m_kept_delims = kept_delims;
403    }
404
405                // use ispunct() for kept delimiters and isspace for dropped.
406    explicit
407    char_separator()
408      : m_use_ispunct(true),
409        m_use_isspace(true),
410        m_empty_tokens(drop_empty_tokens) { }
411
412    void reset() { }
413
414    template <typename InputIterator, typename Token>
415    bool operator()(InputIterator& next, InputIterator end, Token& tok)
416    {
417      typedef tokenizer_detail::assign_or_plus_equal<
418#if     !defined(BOOST_MSVC) || BOOST_MSVC > 1300
419        typename
420#endif
421        tokenizer_detail::get_iterator_category<
422        InputIterator>::iterator_category> assigner;
423
424      assigner::clear(tok);
425
426      // skip past all dropped_delims
427      if (m_empty_tokens == drop_empty_tokens)
428        for (; next != end  && is_dropped(*next); ++next)
429          { }
430     
431      InputIterator start(next);
432
433      if (m_empty_tokens == drop_empty_tokens) {
434
435        if (next == end)
436          return false;
437
438
439        // if we are on a kept_delims move past it and stop
440        if (is_kept(*next)) {
441          assigner::plus_equal(tok,*next);
442          ++next;
443        } else
444          // append all the non delim characters
445          for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
446            assigner::plus_equal(tok,*next);
447      }
448      else { // m_empty_tokens == keep_empty_tokens
449       
450        // Handle empty token at the end
451        if (next == end)
452          if (m_output_done == false) {
453            m_output_done = true;
454            assigner::assign(start,next,tok);
455            return true;
456          } else
457            return false;
458       
459        if (is_kept(*next)) {
460          if (m_output_done == false)
461            m_output_done = true;
462          else {
463            assigner::plus_equal(tok,*next);
464            ++next;
465            m_output_done = false;
466          }
467        }
468        else if (m_output_done == false && is_dropped(*next)) {
469          m_output_done = true;
470        }
471        else {
472          if (is_dropped(*next))
473            start=++next;
474          for (; next != end && !is_dropped(*next) && !is_kept(*next); ++next)
475            assigner::plus_equal(tok,*next);
476          m_output_done = true;
477        }
478      }
479      assigner::assign(start,next,tok);
480      return true;
481    }
482
483  private:
484    string_type m_kept_delims;
485    string_type m_dropped_delims;
486    bool m_use_ispunct;
487    bool m_use_isspace;
488    empty_token_policy m_empty_tokens;
489    bool m_output_done;
490   
491    bool is_kept(Char E) const
492    { 
493      if (m_kept_delims.length())
494        return m_kept_delims.find(E) != string_type::npos;
495      else if (m_use_ispunct) {
496        return std::ispunct(E) != 0;
497      } else
498        return false;
499    }
500    bool is_dropped(Char E) const
501    {
502      if (m_dropped_delims.length())
503        return m_dropped_delims.find(E) != string_type::npos;
504      else if (m_use_isspace) {
505        return std::isspace(E) != 0;
506      } else
507        return false;
508    }
509  };
510
511  //===========================================================================
  // The following class is DEPRECATED, use class char_separator instead.
513  //
514  // The char_delimiters_separator class, which is a model of
515  // TokenizerFunction.  char_delimiters_separator breaks a string
516  // into tokens based on character delimiters. There are 2 types of
517  // delimiters. returnable delimiters can be returned as
518  // tokens. These are often punctuation. nonreturnable delimiters
519  // cannot be returned as tokens. These are often whitespace
520
521  // The out of the box GCC 2.95 on cygwin does not have a char_traits class.
#if !defined(BOOST_MSVC) || BOOST_MSVC > 1300
  template <class Char,
    class Traits = typename std::basic_string<Char>::traits_type >
#else
  template <class Char,
    class Traits = std::basic_string<Char>::traits_type >
#endif
  class char_delimiters_separator {
  private:

    typedef std::basic_string<Char,Traits> string_type;
    string_type returnable_;     // delimiters that may be reported as tokens
    string_type nonreturnable_;  // delimiters that are always discarded
    bool return_delims_;         // report returnable delimiters as tokens
    bool no_ispunct_;            // a returnable set was supplied explicitly
    bool no_isspace_;            // a nonreturnable set was supplied explicitly

    // The <cctype> classifiers are only defined for arguments that are
    // representable as unsigned char (or EOF). One-byte characters are
    // therefore converted to unsigned char first; wider characters are
    // classified only when their value is in 0..255 and are otherwise
    // reported as "not in the class".
    static bool ctype_is_punct(Char c)
    {
      if (sizeof(Char) == 1)
        return std::ispunct(static_cast<unsigned char>(c)) != 0;
      return static_cast<unsigned long>(c) <= 255
          && std::ispunct(static_cast<int>(c)) != 0;
    }
    static bool ctype_is_space(Char c)
    {
      if (sizeof(Char) == 1)
        return std::isspace(static_cast<unsigned char>(c)) != 0;
      return static_cast<unsigned long>(c) <= 255
          && std::isspace(static_cast<int>(c)) != 0;
    }

    // True if E is a returnable delimiter: a member of returnable_ when
    // that set is non-empty; otherwise punctuation, unless an (empty) set
    // was passed to the constructor.
    bool is_ret(Char E)const
    {
      if (returnable_.length())
        return  returnable_.find(E) != string_type::npos;
      if (no_ispunct_)
        return false;
      return ctype_is_punct(E);
    }
    // True if E is a nonreturnable delimiter: a member of nonreturnable_
    // when that set is non-empty; otherwise whitespace, unless an (empty)
    // set was passed to the constructor.
    bool is_nonret(Char E)const
    {
      if (nonreturnable_.length())
        return  nonreturnable_.find(E) != string_type::npos;
      if (no_isspace_)
        return false;
      return ctype_is_space(E);
    }

  public:
    // return_delims: report returnable delimiters as tokens.
    // returnable:    null-terminated set of returnable delimiters; when
    //                null, punctuation characters are used.
    // nonreturnable: null-terminated set of nonreturnable delimiters; when
    //                null, whitespace characters are used.
    explicit char_delimiters_separator(bool return_delims = false,
                                       const Char* returnable = 0,
                                       const Char* nonreturnable = 0)
      : returnable_(returnable ? string_type(returnable) : string_type()),
        nonreturnable_(nonreturnable ? string_type(nonreturnable)
                                     : string_type()),
        return_delims_(return_delims), no_ispunct_(returnable!=0),
        no_isspace_(nonreturnable!=0) { }

    // This separator keeps no state between calls.
    void reset() { }

  public:

    // Extracts the next token from [next, end) into tok and advances next.
    // Returns false when only delimiters that are not reported (or
    // nothing) remain.
    template <typename InputIterator, typename Token>
    bool operator()(InputIterator& next, InputIterator end,Token& tok) {
      tok = Token();

      // skip past all nonreturnable delims
      // skip past the returnable only if we are not returning delims
      for (;next!=end && ( is_nonret(*next) || (is_ret(*next)
        && !return_delims_ ) );++next) { }

      if (next == end) {
        return false;
      }

      // if we are to return delims and we are on a returnable one
      // move past it and stop
      if (is_ret(*next) && return_delims_) {
        tok+=*next;
        ++next;
      }
      else {
        // append all the non delim characters
        for (;next!=end && !is_nonret(*next) && !is_ret(*next);++next)
          tok+=*next;
      }

      return true;
    }
  };
605
606
607} //namespace boost
608
609
610#endif
611
612
613
614
615
Note: See TracBrowser for help on using the repository browser.