source: NonGTP/Boost/boost/regex/v4/basic_regex_creator.hpp @ 857

Revision 857, 42.8 KB checked in by igarcia, 19 years ago (diff)
Line 
1/*
2 *
3 * Copyright (c) 2004
4 * John Maddock
5 *
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 *
10 */
11
12 /*
13  *   LOCATION:    see http://www.boost.org for most recent version.
14  *   FILE         basic_regex_creator.cpp
15  *   VERSION      see <boost/version.hpp>
16  *   DESCRIPTION: Declares template class basic_regex_creator which fills in
17  *                the data members of a regex_data object.
18  */
19
20#ifndef BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
21#define BOOST_REGEX_V4_BASIC_REGEX_CREATOR_HPP
22
23#ifdef BOOST_HAS_ABI_HEADERS
24#  include BOOST_ABI_PREFIX
25#endif
26
27namespace boost{
28
29namespace re_detail{
30
31template <class charT>
32struct digraph : public std::pair<charT, charT>
33{
34   digraph() : std::pair<charT, charT>(0, 0){}
35   digraph(charT c1) : std::pair<charT, charT>(c1, 0){}
36   digraph(charT c1, charT c2) : std::pair<charT, charT>(c1, c2)
37   {}
38#if !BOOST_WORKAROUND(BOOST_MSVC, < 1300)
39   digraph(const digraph<charT>& d) : std::pair<charT, charT>(d.first, d.second){}
40#endif
41   template <class Seq>
42   digraph(const Seq& s) : std::pair<charT, charT>()
43   {
44      BOOST_ASSERT(s.size() <= 2);
45      BOOST_ASSERT(s.size());
46      this->first = s[0];
47      this->second = (s.size() > 1) ? s[1] : 0;
48   }
49};
50
51template <class charT, class traits>
52class basic_char_set
53{
54public:
55   typedef digraph<charT>                   digraph_type;
56   typedef typename traits::string_type     string_type;
57   typedef typename traits::char_class_type mask_type;
58
59   basic_char_set()
60   {
61      m_negate = false;
62      m_has_digraphs = false;
63      m_classes = 0;
64      m_negated_classes = 0;
65      m_empty = true;
66   }
67
68   void add_single(const digraph_type& s)
69   {
70      m_singles.insert(m_singles.end(), s);
71      if(s.second)
72         m_has_digraphs = true;
73      m_empty = false;
74   }
75   void add_range(const digraph_type& first, const digraph_type& end)
76   {
77      m_ranges.insert(m_ranges.end(), first);
78      m_ranges.insert(m_ranges.end(), end);
79      if(first.second)
80      {
81         m_has_digraphs = true;
82         add_single(first);
83      }
84      if(end.second)
85      {
86         m_has_digraphs = true;
87         add_single(end);
88      }
89      m_empty = false;
90   }
91   void add_class(mask_type m)
92   {
93      m_classes |= m;
94      m_empty = false;
95   }
96   void add_negated_class(mask_type m)
97   {
98      m_negated_classes |= m;
99      m_empty = false;
100   }
101   void add_equivalent(const digraph_type& s)
102   {
103      m_equivalents.insert(m_equivalents.end(), s);
104      if(s.second)
105      {
106         m_has_digraphs = true;
107         add_single(s);
108      }
109      m_empty = false;
110   }
111   void negate()
112   {
113      m_negate = true;
114      //m_empty = false;
115   }
116
117   //
118   // accessor functions:
119   //
120   bool has_digraphs()const
121   {
122      return m_has_digraphs;
123   }
124   bool is_negated()const
125   {
126      return m_negate;
127   }
128   typedef typename std::vector<digraph_type>::const_iterator  list_iterator;
129   list_iterator singles_begin()const
130   {
131      return m_singles.begin();
132   }
133   list_iterator singles_end()const
134   {
135      return m_singles.end();
136   }
137   list_iterator ranges_begin()const
138   {
139      return m_ranges.begin();
140   }
141   list_iterator ranges_end()const
142   {
143      return m_ranges.end();
144   }
145   list_iterator equivalents_begin()const
146   {
147      return m_equivalents.begin();
148   }
149   list_iterator equivalents_end()const
150   {
151      return m_equivalents.end();
152   }
153   mask_type classes()const
154   {
155      return m_classes;
156   }
157   mask_type negated_classes()const
158   {
159      return m_negated_classes;
160   }
161   bool empty()const
162   {
163      return m_empty;
164   }
165private:
166   std::vector<digraph_type> m_singles;         // a list of single characters to match
167   std::vector<digraph_type> m_ranges;          // a list of end points of our ranges
168   bool                      m_negate;          // true if the set is to be negated
169   bool                      m_has_digraphs;    // true if we have digraphs present
170   mask_type                 m_classes;         // character classes to match
171   mask_type                 m_negated_classes; // negated character classes to match
172   bool                      m_empty;           // whether we've added anything yet
173   std::vector<digraph_type> m_equivalents;     // a list of equivalence classes
174};
175   
176template <class charT, class traits>
177class basic_regex_creator
178{
179public:
180   basic_regex_creator(regex_data<charT, traits>* data);
181   std::ptrdiff_t getoffset(void* addr)
182   {
183      return getoffset(addr, m_pdata->m_data.data());
184   }
185   std::ptrdiff_t getoffset(const void* addr, const void* base)
186   {
187      return static_cast<const char*>(addr) - static_cast<const char*>(base);
188   }
189   re_syntax_base* getaddress(std::ptrdiff_t off)
190   {
191      return getaddress(off, m_pdata->m_data.data());
192   }
193   re_syntax_base* getaddress(std::ptrdiff_t off, void* base)
194   {
195      return static_cast<re_syntax_base*>(static_cast<void*>(static_cast<char*>(base) + off));
196   }
197   void init(unsigned flags)
198   {
199      m_pdata->m_flags = flags;
200      m_icase = flags & regex_constants::icase;
201   }
202   regbase::flag_type flags()
203   {
204      return m_pdata->m_flags;
205   }
206   void flags(regbase::flag_type f)
207   {
208      m_pdata->m_flags = f;
209      if(m_icase != static_cast<bool>(f & regbase::icase))
210      {
211         m_icase = static_cast<bool>(f & regbase::icase);
212      }
213   }
214   re_syntax_base* append_state(syntax_element_type t, std::size_t s = sizeof(re_syntax_base));
215   re_syntax_base* insert_state(std::ptrdiff_t pos, syntax_element_type t, std::size_t s = sizeof(re_syntax_base));
216   re_literal* append_literal(charT c);
217   re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set);
218   re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set, mpl::false_*);
219   re_syntax_base* append_set(const basic_char_set<charT, traits>& char_set, mpl::true_*);
220   void finalize(const charT* p1, const charT* p2);
221protected:
222   regex_data<charT, traits>*    m_pdata;              // pointer to the basic_regex_data struct we are filling in
223   const ::boost::regex_traits_wrapper<traits>& 
224                                 m_traits;             // convenience reference to traits class
225   re_syntax_base*               m_last_state;         // the last state we added
226   bool                          m_icase;              // true for case insensitive matches
227   unsigned                      m_repeater_id;        // the id of the next repeater
228   bool                          m_has_backrefs;       // true if there are actually any backrefs
229   unsigned                      m_backrefs;           // bitmask of permitted backrefs
230   boost::uintmax_t              m_bad_repeats;        // bitmask of repeats we can't deduce a startmap for;
231   typename traits::char_class_type m_word_mask;       // mask used to determine if a character is a word character
232   typename traits::char_class_type m_mask_space;      // mask used to determine if a character is a word character
233   typename traits::char_class_type m_lower_mask;       // mask used to determine if a character is a lowercase character
234   typename traits::char_class_type m_upper_mask;      // mask used to determine if a character is an uppercase character
235   typename traits::char_class_type m_alpha_mask;      // mask used to determine if a character is an alphabetic character
236private:
237   basic_regex_creator& operator=(const basic_regex_creator&);
238   basic_regex_creator(const basic_regex_creator&);
239
240   void fixup_pointers(re_syntax_base* state);
241   void create_startmaps(re_syntax_base* state);
242   int calculate_backstep(re_syntax_base* state);
243   void create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask);
244   unsigned get_restart_type(re_syntax_base* state);
245   void set_all_masks(unsigned char* bits, unsigned char);
246   bool is_bad_repeat(re_syntax_base* pt);
247   void set_bad_repeat(re_syntax_base* pt);
248   syntax_element_type get_repeat_type(re_syntax_base* state);
249   void probe_leading_repeat(re_syntax_base* state);
250};
251
252template <class charT, class traits>
253basic_regex_creator<charT, traits>::basic_regex_creator(regex_data<charT, traits>* data)
254   : m_pdata(data), m_traits(*(data->m_ptraits)), m_last_state(0), m_repeater_id(0), m_has_backrefs(false), m_backrefs(0)
255{
256   m_pdata->m_data.clear();
257   m_pdata->m_status = ::boost::regex_constants::error_ok;
258   static const charT w = 'w';
259   static const charT s = 's';
260   static const charT l[5] = { 'l', 'o', 'w', 'e', 'r', };
261   static const charT u[5] = { 'u', 'p', 'p', 'e', 'r', };
262   static const charT a[5] = { 'a', 'l', 'p', 'h', 'a', };
263   m_word_mask = m_traits.lookup_classname(&w, &w +1);
264   m_mask_space = m_traits.lookup_classname(&s, &s +1);
265   m_lower_mask = m_traits.lookup_classname(l, l + 5);
266   m_upper_mask = m_traits.lookup_classname(u, u + 5);
267   m_alpha_mask = m_traits.lookup_classname(a, a + 5);
268   BOOST_ASSERT(m_word_mask != 0);
269   BOOST_ASSERT(m_mask_space != 0);
270   BOOST_ASSERT(m_lower_mask != 0);
271   BOOST_ASSERT(m_upper_mask != 0);
272   BOOST_ASSERT(m_alpha_mask != 0);
273}
274
275template <class charT, class traits>
276re_syntax_base* basic_regex_creator<charT, traits>::append_state(syntax_element_type t, std::size_t s)
277{
278   // if the state is a backref then make a note of it:
279   if(t == syntax_element_backref)
280      this->m_has_backrefs = true;
281   // append a new state, start by aligning our last one:
282   m_pdata->m_data.align();
283   // set the offset to the next state in our last one:
284   if(m_last_state)
285      m_last_state->next.i = m_pdata->m_data.size() - getoffset(m_last_state);
286   // now actually extent our data:
287   m_last_state = static_cast<re_syntax_base*>(m_pdata->m_data.extend(s));
288   // fill in boilerplate options in the new state:
289   m_last_state->next.i = 0;
290   m_last_state->type = t;
291   return m_last_state;
292}
293
294template <class charT, class traits>
295re_syntax_base* basic_regex_creator<charT, traits>::insert_state(std::ptrdiff_t pos, syntax_element_type t, std::size_t s)
296{
297   // append a new state, start by aligning our last one:
298   m_pdata->m_data.align();
299   // set the offset to the next state in our last one:
300   if(m_last_state)
301      m_last_state->next.i = m_pdata->m_data.size() - getoffset(m_last_state);
302   // remember the last state position:
303   std::ptrdiff_t off = getoffset(m_last_state) + s;
304   // now actually insert our data:
305   re_syntax_base* new_state = static_cast<re_syntax_base*>(m_pdata->m_data.insert(pos, s));
306   // fill in boilerplate options in the new state:
307   new_state->next.i = s;
308   new_state->type = t;
309   m_last_state = getaddress(off);
310   return new_state;
311}
312
313template <class charT, class traits>
314re_literal* basic_regex_creator<charT, traits>::append_literal(charT c)
315{
316   re_literal* result;
317   // start by seeing if we have an existing re_literal we can extend:
318   if((0 == m_last_state) || (m_last_state->type != syntax_element_literal))
319   {
320      // no existing re_literal, create a new one:
321      result = static_cast<re_literal*>(append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
322      result->length = 1;
323      *static_cast<charT*>(static_cast<void*>(result+1)) = m_traits.translate(c, m_icase);
324   }
325   else
326   {
327      // we have an existing re_literal, extend it:
328      std::ptrdiff_t off = getoffset(m_last_state);
329      m_pdata->m_data.extend(sizeof(charT));
330      m_last_state = result = static_cast<re_literal*>(getaddress(off));
331      charT* characters = static_cast<charT*>(static_cast<void*>(result+1));
332      characters[result->length] = m_traits.translate(c, m_icase);
333      ++(result->length);
334   }
335   return result;
336}
337
338template <class charT, class traits>
339inline re_syntax_base* basic_regex_creator<charT, traits>::append_set(
340   const basic_char_set<charT, traits>& char_set)
341{
342   typedef mpl::bool_< (sizeof(charT) == 1) > truth_type;
343   return char_set.has_digraphs()
344      ? append_set(char_set, static_cast<mpl::false_*>(0))
345      : append_set(char_set, static_cast<truth_type*>(0));
346}
347
348template <class charT, class traits>
349re_syntax_base* basic_regex_creator<charT, traits>::append_set(
350   const basic_char_set<charT, traits>& char_set, mpl::false_*)
351{
352   typedef typename traits::string_type string_type;
353   typedef typename basic_char_set<charT, traits>::list_iterator item_iterator;
354   typedef typename traits::char_class_type mask_type;
355   
356   re_set_long<mask_type>* result = static_cast<re_set_long<mask_type>*>(append_state(syntax_element_long_set, sizeof(re_set_long<mask_type>)));
357   //
358   // fill in the basics:
359   //
360   result->csingles = static_cast<unsigned int>(::boost::re_detail::distance(char_set.singles_begin(), char_set.singles_end()));
361   result->cranges = static_cast<unsigned int>(::boost::re_detail::distance(char_set.ranges_begin(), char_set.ranges_end())) / 2;
362   result->cequivalents = static_cast<unsigned int>(::boost::re_detail::distance(char_set.equivalents_begin(), char_set.equivalents_end()));
363   result->cclasses = char_set.classes();
364   result->cnclasses = char_set.negated_classes();
365   if(flags() & regbase::icase)
366   {
367      // adjust classes as needed:
368      if(((result->cclasses & m_lower_mask) == m_lower_mask) || ((result->cclasses & m_upper_mask) == m_upper_mask))
369         result->cclasses |= m_alpha_mask;
370      if(((result->cnclasses & m_lower_mask) == m_lower_mask) || ((result->cnclasses & m_upper_mask) == m_upper_mask))
371         result->cnclasses |= m_alpha_mask;
372   }
373
374   result->isnot = char_set.is_negated();
375   result->singleton = !char_set.has_digraphs();
376   //
377   // remember where the state is for later:
378   //
379   std::ptrdiff_t offset = getoffset(result);
380   //
381   // now extend with all the singles:
382   //
383   item_iterator first, last;
384   first = char_set.singles_begin();
385   last = char_set.singles_end();
386   while(first != last)
387   {
388      charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (first->second ? 3 : 2)));
389      p[0] = m_traits.translate(first->first, m_icase);
390      if(first->second)
391      {
392         p[1] = m_traits.translate(first->second, m_icase);
393         p[2] = 0;
394      }
395      else
396         p[1] = 0;
397      ++first;
398   }
399   //
400   // now extend with all the ranges:
401   //
402   first = char_set.ranges_begin();
403   last = char_set.ranges_end();
404   while(first != last)
405   {
406      // first grab the endpoints of the range:
407      digraph<charT> c1 = *first;
408      c1.first = this->m_traits.translate(c1.first, this->m_icase);
409      c1.second = this->m_traits.translate(c1.second, this->m_icase);
410      ++first;
411      digraph<charT> c2 = *first;
412      c2.first = this->m_traits.translate(c2.first, this->m_icase);
413      c2.second = this->m_traits.translate(c2.second, this->m_icase);
414      ++first;
415      string_type s1, s2;
416      // different actions now depending upon whether collation is turned on:
417      if(flags() & regex_constants::collate)
418      {
419         // we need to transform our range into sort keys:
420#if BOOST_WORKAROUND(__GNUC__, < 3)
421         string_type in(3, charT(0));
422         in[0] = c1.first;
423         in[1] = c1.second;
424         s1 = this->m_traits.transform(in.c_str(), (in[1] ? in.c_str()+2 : in.c_str()+1));
425         in[0] = c2.first;
426         in[1] = c2.second;
427         s2 = this->m_traits.transform(in.c_str(), (in[1] ? in.c_str()+2 : in.c_str()+1));
428#else
429         charT a1[3] = { c1.first, c1.second, charT(0), };
430         charT a2[3] = { c2.first, c2.second, charT(0), };
431         s1 = this->m_traits.transform(a1, (a1[1] ? a1+2 : a1+1));
432         s2 = this->m_traits.transform(a2, (a2[1] ? a2+2 : a2+1));
433#endif
434         if(s1.size() == 0)
435            s1 = string_type(1, charT(0));
436         if(s2.size() == 0)
437            s2 = string_type(1, charT(0));
438      }
439      else
440      {
441         if(c1.second)
442         {
443            s1.insert(s1.end(), c1.first);
444            s1.insert(s1.end(), c1.second);
445         }
446         else
447            s1 = string_type(1, c1.first);
448         if(c2.second)
449         {
450            s2.insert(s2.end(), c2.first);
451            s2.insert(s2.end(), c2.second);
452         }
453         else
454            s2.insert(s2.end(), c2.first);
455      }
456      if(s1 > s2)
457      {
458         // Oops error:
459         return 0;
460      }
461      charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (s1.size() + s2.size() + 2) ) );
462      re_detail::copy(s1.begin(), s1.end(), p);
463      p[s1.size()] = charT(0);
464      p += s1.size() + 1;
465      re_detail::copy(s2.begin(), s2.end(), p);
466      p[s2.size()] = charT(0);
467   }
468   //
469   // now process the equivalence classes:
470   //
471   first = char_set.equivalents_begin();
472   last = char_set.equivalents_end();
473   while(first != last)
474   {
475      string_type s;
476      if(first->second)
477      {
478#if BOOST_WORKAROUND(__GNUC__, < 3)
479         string_type in(3, charT(0));
480         in[0] = first->first;
481         in[1] = first->second;
482         s = m_traits.transform_primary(in.c_str(), in.c_str()+2);
483#else
484         charT cs[3] = { first->first, first->second, charT(0), };
485         s = m_traits.transform_primary(cs, cs+2);
486#endif
487      }
488      else
489         s = m_traits.transform_primary(&first->first, &first->first+1);
490      if(s.empty())
491         return 0;  // invalid or unsupported equivalence class
492      charT* p = static_cast<charT*>(this->m_pdata->m_data.extend(sizeof(charT) * (s.size()+1) ) );
493      re_detail::copy(s.begin(), s.end(), p);
494      p[s.size()] = charT(0);
495      ++first;
496   }
497   //
498   // finally reset the address of our last state:
499   //
500   m_last_state = result = static_cast<re_set_long<mask_type>*>(getaddress(offset));
501   return result;
502}
503
504namespace{
505
506template<class T>
507inline bool char_less(T t1, T t2)
508{
509   return t1 < t2;
510}
511template<>
512inline bool char_less<char>(char t1, char t2)
513{
514   return static_cast<unsigned char>(t1) < static_cast<unsigned char>(t2);
515}
516template<>
517inline bool char_less<signed char>(signed char t1, signed char t2)
518{
519   return static_cast<unsigned char>(t1) < static_cast<unsigned char>(t2);
520}
521}
522
523template <class charT, class traits>
524re_syntax_base* basic_regex_creator<charT, traits>::append_set(
525   const basic_char_set<charT, traits>& char_set, mpl::true_*)
526{
527   typedef typename traits::string_type string_type;
528   typedef typename basic_char_set<charT, traits>::list_iterator item_iterator;
529   
530   re_set* result = static_cast<re_set*>(append_state(syntax_element_set, sizeof(re_set)));
531   bool negate = char_set.is_negated();
532   std::memset(result->_map, 0, sizeof(result->_map));
533   //
534   // handle singles first:
535   //
536   item_iterator first, last;
537   first = char_set.singles_begin();
538   last = char_set.singles_end();
539   while(first != last)
540   {
541      for(unsigned int i = 0; i < (1 << CHAR_BIT); ++i)
542      {
543         if(this->m_traits.translate(static_cast<charT>(i), this->m_icase)
544            == this->m_traits.translate(first->first, this->m_icase))
545            result->_map[i] = true;
546      }
547      ++first;
548   }
549   //
550   // OK now handle ranges:
551   //
552   first = char_set.ranges_begin();
553   last = char_set.ranges_end();
554   while(first != last)
555   {
556      // first grab the endpoints of the range:
557      charT c1 = this->m_traits.translate(first->first, this->m_icase);
558      ++first;
559      charT c2 = this->m_traits.translate(first->first, this->m_icase);
560      ++first;
561      // different actions now depending upon whether collation is turned on:
562      if(flags() & regex_constants::collate)
563      {
564         // we need to transform our range into sort keys:
565         charT c3[2] = { c1, charT(0), };
566         string_type s1 = this->m_traits.transform(c3, c3+1);
567         c3[0] = c2;
568         string_type s2 = this->m_traits.transform(c3, c3+1);
569         if(s1 > s2)
570         {
571            // Oops error:
572            return 0;
573         }
574         for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
575         {
576            charT c3[2] = { static_cast<charT>(i), charT(0), };
577            string_type s3 = this->m_traits.transform(c3, c3 +1);
578            if((s1 <= s3) && (s3 <= s2))
579               result->_map[i] = true;
580         }
581      }
582      else
583      {
584         if(char_less<charT>(c2, c1))
585         {
586            // Oops error:
587            return 0;
588         }
589         // everything in range matches:
590         std::memset(result->_map + static_cast<unsigned char>(c1), true, 1 + static_cast<unsigned char>(c2) - static_cast<unsigned char>(c1));
591      }
592   }
593   //
594   // and now the classes:
595   //
596   typedef typename traits::char_class_type mask_type;
597   mask_type m = char_set.classes();
598   if(flags() & regbase::icase)
599   {
600      // adjust m as needed:
601      if(((m & m_lower_mask) == m_lower_mask) || ((m & m_upper_mask) == m_upper_mask))
602         m |= m_alpha_mask;
603   }
604   if(m != 0)
605   {
606      for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
607      {
608         if(this->m_traits.isctype(static_cast<charT>(i), m))
609            result->_map[i] = true;
610      }
611   }
612   //
613   // and now the negated classes:
614   //
615   m = char_set.negated_classes();
616   if(flags() & regbase::icase)
617   {
618      // adjust m as needed:
619      if(((m & m_lower_mask) == m_lower_mask) || ((m & m_upper_mask) == m_upper_mask))
620         m |= m_alpha_mask;
621   }
622   if(m != 0)
623   {
624      for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
625      {
626         if(0 == this->m_traits.isctype(static_cast<charT>(i), m))
627            result->_map[i] = true;
628      }
629   }
630   //
631   // now process the equivalence classes:
632   //
633   first = char_set.equivalents_begin();
634   last = char_set.equivalents_end();
635   while(first != last)
636   {
637      string_type s;
638      BOOST_ASSERT(static_cast<charT>(0) == first->second);
639      s = m_traits.transform_primary(&first->first, &first->first+1);
640      if(s.empty())
641         return 0;  // invalid or unsupported equivalence class
642      for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
643      {
644         charT c[2] = { (static_cast<charT>(i)), charT(0), };
645         string_type s2 = this->m_traits.transform_primary(c, c+1);
646         if(s == s2)
647            result->_map[i] = true;
648      }
649      ++first;
650   }
651   if(negate)
652   {
653      for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
654      {
655         result->_map[i] = !(result->_map[i]);
656      }
657   }
658   return result;
659}
660
661template <class charT, class traits>
662void basic_regex_creator<charT, traits>::finalize(const charT* p1, const charT* p2)
663{
664   // we've added all the states we need, now finish things off.
665   // start by adding a terminating state:
666   append_state(syntax_element_match);
667   // extend storage to store original expression:
668   std::ptrdiff_t len = p2 - p1;
669   m_pdata->m_expression_len = len;
670   charT* ps = static_cast<charT*>(m_pdata->m_data.extend(sizeof(charT) * (1 + (p2 - p1))));
671   m_pdata->m_expression = ps;
672   re_detail::copy(p1, p2, ps);
673   ps[p2 - p1] = 0;
674   // fill in our other data...
675   // successful parsing implies a zero status:
676   m_pdata->m_status = 0;
677   // get the first state of the machine:
678   m_pdata->m_first_state = static_cast<re_syntax_base*>(m_pdata->m_data.data());
679   // fixup pointers in the machine:
680   fixup_pointers(m_pdata->m_first_state);
681   // create nested startmaps:
682   create_startmaps(m_pdata->m_first_state);
683   // create main startmap:
684   std::memset(m_pdata->m_startmap, 0, sizeof(m_pdata->m_startmap));
685   m_pdata->m_can_be_null = 0;
686
687   m_bad_repeats = 0;
688   create_startmap(m_pdata->m_first_state, m_pdata->m_startmap, &(m_pdata->m_can_be_null), mask_all);
689   // get the restart type:
690   m_pdata->m_restart_type = get_restart_type(m_pdata->m_first_state);
691   // optimise a leading repeat if there is one:
692   probe_leading_repeat(m_pdata->m_first_state);
693}
694
695template <class charT, class traits>
696void basic_regex_creator<charT, traits>::fixup_pointers(re_syntax_base* state)
697{
698   while(state)
699   {
700      switch(state->type)
701      {
702      case syntax_element_rep:
703      case syntax_element_dot_rep:
704      case syntax_element_char_rep:
705      case syntax_element_short_set_rep:
706      case syntax_element_long_set_rep:
707         // set the id of this repeat:
708         static_cast<re_repeat*>(state)->id = m_repeater_id++;
709         // fall through:
710      case syntax_element_alt:
711         std::memset(static_cast<re_alt*>(state)->_map, 0, sizeof(static_cast<re_alt*>(state)->_map));
712         static_cast<re_alt*>(state)->can_be_null = 0;
713         // fall through:
714      case syntax_element_jump:
715         static_cast<re_jump*>(state)->alt.p = getaddress(static_cast<re_jump*>(state)->alt.i, state);
716         // fall through again:
717      default:
718         if(state->next.i)
719            state->next.p = getaddress(state->next.i, state);
720         else
721            state->next.p = 0;
722      }
723      state = state->next.p;
724   }
725}
726
727template <class charT, class traits>
728void basic_regex_creator<charT, traits>::create_startmaps(re_syntax_base* state)
729{
730   // non-recursive implementation:
731   // create the last map in the machine first, so that earlier maps
732   // can make use of the result...
733   //
734   // This was originally a recursive implementation, but that caused stack
735   // overflows with complex expressions on small stacks (think COM+).
736
737   // start by saving the case setting:
738   bool l_icase = m_icase;
739   std::vector<std::pair<bool, re_syntax_base*> > v;
740
741   while(state)
742   {
743      switch(state->type)
744      {
745      case syntax_element_toggle_case:
746         // we need to track case changes here:
747         m_icase = static_cast<re_case*>(state)->icase;
748         state = state->next.p;
749         continue;
750      case syntax_element_alt:
751      case syntax_element_rep:
752      case syntax_element_dot_rep:
753      case syntax_element_char_rep:
754      case syntax_element_short_set_rep:
755      case syntax_element_long_set_rep:
756         // just push the state onto our stack for now:
757         v.push_back(std::pair<bool, re_syntax_base*>(m_icase, state));
758         state = state->next.p;
759         break;
760      case syntax_element_backstep:
761         // we need to calculate how big the backstep is:
762         static_cast<re_brace*>(state)->index
763            = this->calculate_backstep(state->next.p);
764         if(static_cast<re_brace*>(state)->index < 0)
765         {
766            // Oops error:
767            if(0 == this->m_pdata->m_status) // update the error code if not already set
768               this->m_pdata->m_status = boost::regex_constants::error_bad_pattern;
769            //
770            // clear the expression, we should be empty:
771            //
772            this->m_pdata->m_expression = 0;
773            this->m_pdata->m_expression_len = 0;
774            //
775            // and throw if required:
776            //
777            if(0 == (this->flags() & regex_constants::no_except))
778            {
779               std::string message = this->m_pdata->m_ptraits->error_string(boost::regex_constants::error_bad_pattern);
780               boost::regex_error e(message, boost::regex_constants::error_bad_pattern, 0);
781               e.raise();
782            }
783         }
784         // fall through:
785      default:
786         state = state->next.p;
787      }
788   }
789   // now work through our list, building all the maps as we go:
790   while(v.size())
791   {
792      const std::pair<bool, re_syntax_base*>& p = v.back();
793      m_icase = p.first;
794      state = p.second;
795      v.pop_back();
796
797      // Build maps:
798      create_startmap(state->next.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_take);
799      m_bad_repeats = 0;
800      create_startmap(static_cast<re_alt*>(state)->alt.p, static_cast<re_alt*>(state)->_map, &static_cast<re_alt*>(state)->can_be_null, mask_skip);
801      // adjust the type of the state to allow for faster matching:
802      state->type = this->get_repeat_type(state);
803   }
804   // restore case sensitivity:
805   m_icase = l_icase;
806}
807
808template <class charT, class traits>
809int basic_regex_creator<charT, traits>::calculate_backstep(re_syntax_base* state)
810{
811   typedef typename traits::char_class_type mask_type;
812   int result = 0;
813   while(state)
814   {
815      switch(state->type)
816      {
817      case syntax_element_startmark:
818         if((static_cast<re_brace*>(state)->index == -1)
819            || (static_cast<re_brace*>(state)->index == -2))
820         {
821            state = static_cast<re_jump*>(state->next.p)->alt.p->next.p;
822            continue;
823         }
824         else if(static_cast<re_brace*>(state)->index == -3)
825         {
826            state = state->next.p->next.p;
827            continue;
828         }
829         break;
830      case syntax_element_endmark:
831         if((static_cast<re_brace*>(state)->index == -1)
832            || (static_cast<re_brace*>(state)->index == -2))
833            return result;
834         break;
835      case syntax_element_literal:
836         result += static_cast<re_literal*>(state)->length;
837         break;
838      case syntax_element_wild:
839      case syntax_element_set:
840         result += 1;
841         break;
842      case syntax_element_dot_rep:
843      case syntax_element_char_rep:
844      case syntax_element_short_set_rep:
845      case syntax_element_backref:
846      case syntax_element_rep:
847      case syntax_element_combining:
848      case syntax_element_long_set_rep:
849      case syntax_element_backstep:
850         {
851            re_repeat* rep = static_cast<re_repeat *>(state);
852            // adjust the type of the state to allow for faster matching:
853            state->type = this->get_repeat_type(state);
854            if((state->type == syntax_element_dot_rep)
855               || (state->type == syntax_element_char_rep)
856               || (state->type == syntax_element_short_set_rep))
857            {
858               if(rep->max != rep->min)
859                  return -1;
860               result += static_cast<int>(rep->min);
861               state = rep->alt.p;
862               continue;
863            }
864            else if((state->type == syntax_element_long_set_rep))
865            {
866               BOOST_ASSERT(rep->next.p->type == syntax_element_long_set);
867               if(static_cast<re_set_long<mask_type>*>(rep->next.p)->singleton == 0)
868                  return -1;
869               if(rep->max != rep->min)
870                  return -1;
871               result += static_cast<int>(rep->min);
872               state = rep->alt.p;
873               continue;
874            }
875         }
876         return -1;
877      case syntax_element_long_set:
878         if(static_cast<re_set_long<mask_type>*>(state)->singleton == 0)
879            return -1;
880         result += 1;
881         break;
882      case syntax_element_jump:
883         state = static_cast<re_jump*>(state)->alt.p;
884         continue;
885      default:
886         break;
887      }
888      state = state->next.p;
889   }
890   return -1;
891}
892
893template <class charT, class traits>
894void basic_regex_creator<charT, traits>::create_startmap(re_syntax_base* state, unsigned char* l_map, unsigned int* pnull, unsigned char mask)
895{
896   int not_last_jump = 1;
897
898   // track case sensitivity:
899   bool l_icase = m_icase;
900
901   while(state)
902   {
903      switch(state->type)
904      {
905      case syntax_element_toggle_case:
906         l_icase = static_cast<re_case*>(state)->icase;
907         state = state->next.p;
908         break;
909      case syntax_element_literal:
910      {
911         // don't set anything in *pnull, set each element in l_map
912         // that could match the first character in the literal:
913         if(l_map)
914         {
915            l_map[0] |= mask_init;
916            charT first_char = *static_cast<charT*>(static_cast<void*>(static_cast<re_literal*>(state) + 1));
917            for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
918            {
919               if(m_traits.translate(static_cast<charT>(i), l_icase) == first_char)
920                  l_map[i] |= mask;
921            }
922         }
923         return;
924      }
925      case syntax_element_end_line:
926      {
927         // next character must be a line separator (if there is one):
928         if(l_map)
929         {
930            l_map[0] |= mask_init;
931            l_map['\n'] |= mask;
932            l_map['\r'] |= mask;
933            l_map['\f'] |= mask;
934            l_map[0x85] |= mask;
935         }
936         // now figure out if we can match a NULL string at this point:
937         if(pnull)
938            create_startmap(state->next.p, 0, pnull, mask);
939         return;
940      }
941      case syntax_element_backref:
942         // can be null, and any character can match:
943         if(pnull)
944            *pnull |= mask;
945         // fall through:
946      case syntax_element_wild:
947      {
948         // can't be null, any character can match:
949         set_all_masks(l_map, mask);
950         return;
951      }
952      case syntax_element_match:
953      {
954         // must be null, any character can match:
955         set_all_masks(l_map, mask);
956         if(pnull)
957            *pnull |= mask;
958         return;
959      }
960      case syntax_element_word_start:
961      {
962         // recurse, then AND with all the word characters:
963         create_startmap(state->next.p, l_map, pnull, mask);
964         if(l_map)
965         {
966            l_map[0] |= mask_init;
967            for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
968            {
969               if(!m_traits.isctype(static_cast<charT>(i), m_word_mask))
970                  l_map[i] &= static_cast<unsigned char>(~mask);
971            }
972         }
973         return;
974      }
975      case syntax_element_word_end:
976      {
977         // recurse, then AND with all the word characters:
978         create_startmap(state->next.p, l_map, pnull, mask);
979         if(l_map)
980         {
981            l_map[0] |= mask_init;
982            for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
983            {
984               if(m_traits.isctype(static_cast<charT>(i), m_word_mask))
985                  l_map[i] &= static_cast<unsigned char>(~mask);
986            }
987         }
988         return;
989      }
990      case syntax_element_buffer_end:
991      {
992         // we *must be null* :
993         if(pnull)
994            *pnull |= mask;
995         return;
996      }
997      case syntax_element_long_set:
998         if(l_map)
999         {
1000            typedef typename traits::char_class_type mask_type;
1001            if(static_cast<re_set_long<mask_type>*>(state)->singleton)
1002            {
1003               l_map[0] |= mask_init;
1004               for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1005               {
1006                  charT c = static_cast<charT>(i);
1007                  if(&c != re_is_set_member(&c, &c + 1, static_cast<re_set_long<mask_type>*>(state), *m_pdata, m_icase))
1008                     l_map[i] |= mask;
1009               }
1010            }
1011            else
1012               set_all_masks(l_map, mask);
1013         }
1014         return;
1015      case syntax_element_set:
1016         if(l_map)
1017         {
1018            l_map[0] |= mask_init;
1019            for(unsigned int i = 0; i < (1u << CHAR_BIT); ++i)
1020            {
1021               if(static_cast<re_set*>(state)->_map[
1022                  static_cast<unsigned char>(m_traits.translate(static_cast<charT>(i), l_icase))])
1023                  l_map[i] |= mask;
1024            }
1025         }
1026         return;
1027      case syntax_element_jump:
1028         // take the jump:
1029         state = static_cast<re_alt*>(state)->alt.p;
1030         not_last_jump = -1;
1031         break;
1032      case syntax_element_alt:
1033      case syntax_element_rep:
1034      case syntax_element_dot_rep:
1035      case syntax_element_char_rep:
1036      case syntax_element_short_set_rep:
1037      case syntax_element_long_set_rep:
1038         {
1039            re_alt* rep = static_cast<re_alt*>(state);
1040            if(rep->_map[0] & mask_init)
1041            {
1042               if(l_map)
1043               {
1044                  // copy previous results:
1045                  l_map[0] |= mask_init;
1046                  for(unsigned int i = 0; i <= UCHAR_MAX; ++i)
1047                  {
1048                     if(rep->_map[i] & mask_any)
1049                        l_map[i] |= mask;
1050                  }
1051               }
1052               if(pnull)
1053               {
1054                  if(rep->can_be_null & mask_any)
1055                     *pnull |= mask;
1056               }
1057            }
1058            else
1059            {
1060               // we haven't created a startmap for this alternative yet
1061               // so take the union of the two options:
1062               if(is_bad_repeat(state))
1063               {
1064                  set_all_masks(l_map, mask);
1065                  return;
1066               }
1067               set_bad_repeat(state);
1068               create_startmap(state->next.p, l_map, pnull, mask);
1069               if((state->type == syntax_element_alt)
1070                  || (static_cast<re_repeat*>(state)->min == 0)
1071                  || (not_last_jump == 0))
1072                  create_startmap(rep->alt.p, l_map, pnull, mask);
1073            }
1074         }
1075         return;
1076      case syntax_element_soft_buffer_end:
1077         // match newline or null:
1078         if(l_map)
1079         {
1080            l_map[0] |= mask_init;
1081            l_map['\n'] |= mask;
1082            l_map['\r'] |= mask;
1083         }
1084         if(pnull)
1085            *pnull |= mask;
1086         return;
1087      case syntax_element_endmark:
1088         // need to handle independent subs as a special case:
1089         if(static_cast<re_brace*>(state)->index < 0)
1090         {
1091            // can be null, any character can match:
1092            set_all_masks(l_map, mask);
1093            if(pnull)
1094               *pnull |= mask;
1095            return;
1096         }
1097         else
1098         {
1099            state = state->next.p;
1100            break;
1101         }
1102
1103      case syntax_element_startmark:
1104         // need to handle independent subs as a special case:
1105         if(static_cast<re_brace*>(state)->index == -3)
1106         {
1107            state = state->next.p->next.p;
1108            break;
1109         }
1110         // otherwise fall through:
1111      default:
1112         state = state->next.p;
1113      }
1114      ++not_last_jump;
1115   }
1116}
1117
1118template <class charT, class traits>
1119unsigned basic_regex_creator<charT, traits>::get_restart_type(re_syntax_base* state)
1120{
1121   //
1122   // find out how the machine starts, so we can optimise the search:
1123   //
1124   while(state)
1125   {
1126      switch(state->type)
1127      {
1128      case syntax_element_startmark:
1129      case syntax_element_endmark:
1130         state = state->next.p;
1131         continue;
1132      case syntax_element_start_line:
1133         return regbase::restart_line;
1134      case syntax_element_word_start:
1135         return regbase::restart_word;
1136      case syntax_element_buffer_start:
1137         return regbase::restart_buf;
1138      case syntax_element_restart_continue:
1139         return regbase::restart_continue;
1140      default:
1141         state = 0;
1142         continue;
1143      }
1144   }
1145   return regbase::restart_any;
1146}
1147
1148template <class charT, class traits>
1149void basic_regex_creator<charT, traits>::set_all_masks(unsigned char* bits, unsigned char mask)
1150{
1151   //
1152   // set mask in all of bits elements,
1153   // if bits[0] has mask_init not set then we can
1154   // optimise this to a call to memset:
1155   //
1156   if(bits)
1157   {
1158      if(bits[0] == 0)
1159         (std::memset)(bits, mask, 1u << CHAR_BIT);
1160      else
1161      {
1162         for(unsigned i = 0; i < (1u << CHAR_BIT); ++i)
1163            bits[i] |= mask;
1164      }
1165      bits[0] |= mask_init;
1166   }
1167}
1168
1169template <class charT, class traits>
1170bool basic_regex_creator<charT, traits>::is_bad_repeat(re_syntax_base* pt)
1171{
1172   switch(pt->type)
1173   {
1174   case syntax_element_rep:
1175   case syntax_element_dot_rep:
1176   case syntax_element_char_rep:
1177   case syntax_element_short_set_rep:
1178   case syntax_element_long_set_rep:
1179      {
1180         unsigned id = static_cast<re_repeat*>(pt)->id;
1181         if(id > sizeof(m_bad_repeats) * CHAR_BIT)
1182            return true;  // run out of bits, assume we can't traverse this one.
1183         return m_bad_repeats & static_cast<boost::uintmax_t>(1uL << id);
1184      }
1185   default:
1186      return false;
1187   }
1188}
1189
1190template <class charT, class traits>
1191void basic_regex_creator<charT, traits>::set_bad_repeat(re_syntax_base* pt)
1192{
1193   switch(pt->type)
1194   {
1195   case syntax_element_rep:
1196   case syntax_element_dot_rep:
1197   case syntax_element_char_rep:
1198   case syntax_element_short_set_rep:
1199   case syntax_element_long_set_rep:
1200      {
1201         unsigned id = static_cast<re_repeat*>(pt)->id;
1202         if(id <= sizeof(m_bad_repeats) * CHAR_BIT)
1203            m_bad_repeats |= static_cast<boost::uintmax_t>(1uL << id);
1204      }
1205   default:
1206      break;
1207   }
1208}
1209
1210template <class charT, class traits>
1211syntax_element_type basic_regex_creator<charT, traits>::get_repeat_type(re_syntax_base* state)
1212{
1213   typedef typename traits::char_class_type mask_type;
1214   if(state->type == syntax_element_rep)
1215   {
1216      // check to see if we are repeating a single state:
1217      if(state->next.p->next.p->next.p == static_cast<re_alt*>(state)->alt.p)
1218      {
1219         switch(state->next.p->type)
1220         {
1221         case re_detail::syntax_element_wild:
1222            return re_detail::syntax_element_dot_rep;
1223         case re_detail::syntax_element_literal:
1224            return re_detail::syntax_element_char_rep;
1225         case re_detail::syntax_element_set:
1226            return re_detail::syntax_element_short_set_rep;
1227         case re_detail::syntax_element_long_set:
1228            if(static_cast<re_detail::re_set_long<mask_type>*>(state->next.p)->singleton)
1229               return re_detail::syntax_element_long_set_rep;
1230            break;
1231         default:
1232            break;
1233         }
1234      }
1235   }
1236   return state->type;
1237}
1238
1239template <class charT, class traits>
1240void basic_regex_creator<charT, traits>::probe_leading_repeat(re_syntax_base* state)
1241{
1242   // enumerate our states, and see if we have a leading repeat
1243   // for which failed search restarts can be optimised;
1244   do
1245   {
1246      switch(state->type)
1247      {
1248      case syntax_element_startmark:
1249         if(static_cast<re_brace*>(state)->index >= 0)
1250         {
1251            state = state->next.p;
1252            continue;
1253         }
1254         return;
1255      case syntax_element_endmark:
1256      case syntax_element_start_line:
1257      case syntax_element_end_line:
1258      case syntax_element_word_boundary:
1259      case syntax_element_within_word:
1260      case syntax_element_word_start:
1261      case syntax_element_word_end:
1262      case syntax_element_buffer_start:
1263      case syntax_element_buffer_end:
1264      case syntax_element_restart_continue:
1265         state = state->next.p;
1266         break;
1267      case syntax_element_dot_rep:
1268      case syntax_element_char_rep:
1269      case syntax_element_short_set_rep:
1270      case syntax_element_long_set_rep:
1271         if(this->m_has_backrefs == 0)
1272            static_cast<re_repeat*>(state)->leading = true;
1273         // fall through:
1274      default:
1275         return;
1276      }
1277   }while(state);
1278}
1279
1280
1281} // namespace re_detail
1282
1283} // namespace boost
1284
1285#ifdef BOOST_HAS_ABI_HEADERS
1286#  include BOOST_ABI_SUFFIX
1287#endif
1288
1289#endif
Note: See TracBrowser for help on using the repository browser.