source: NonGTP/Boost/boost/regex/pending/unicode_iterator.hpp @ 857

Revision 857, 21.0 KB checked in by igarcia, 18 years ago (diff)
Line 
1/*
2 *
3 * Copyright (c) 2004
4 * John Maddock
5 *
6 * Use, modification and distribution are subject to the
7 * Boost Software License, Version 1.0. (See accompanying file
8 * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
9 *
10 */
11 
12 /*
13  *   LOCATION:    see http://www.boost.org for most recent version.
14  *   FILE         unicode_iterator.hpp
15  *   VERSION      see <boost/version.hpp>
16  *   DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
17  */
18
19/****************************************************************************
20
21Contents:
22~~~~~~~~~
23
241) Read Only, Input Adapters:
25~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26
27template <class BaseIterator, class U8Type = ::boost::uint8_t>
28class u32_to_u8_iterator;
29
30Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
31
32template <class BaseIterator, class U32Type = ::boost::uint32_t>
33class u8_to_u32_iterator;
34
35Adapts sequence of UTF-8 code units to "look like" a sequence of UTF-32 code points.
36
37template <class BaseIterator, class U16Type = ::boost::uint16_t>
38class u32_to_u16_iterator;
39
40Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
41
42template <class BaseIterator, class U32Type = ::boost::uint32_t>
43class u16_to_u32_iterator;
44
45Adapts sequence of UTF-16 code units to "look like" a sequence of UTF-32 code points.
46
472) Single pass output iterator adapters:
48
49template <class BaseIterator>
50class utf8_output_iterator;
51
52Accepts UTF-32 code points and forwards them on as UTF-8 code units.
53
54template <class BaseIterator>
55class utf16_output_iterator;
56
57Accepts UTF-32 code points and forwards them on as UTF-16 code units.
58
59****************************************************************************/
60
61#ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
62#define BOOST_REGEX_UNICODE_ITERATOR_HPP
63#include <boost/cstdint.hpp>
64#include <boost/assert.hpp>
65#include <boost/iterator/iterator_facade.hpp>
66#include <boost/static_assert.hpp>
67#include <boost/throw_exception.hpp>
68#include <stdexcept>
69#ifndef BOOST_NO_STD_LOCALE
70#include <sstream>
71#endif
72
73namespace boost{
74
75namespace detail{
76
77static const ::boost::uint16_t high_surrogate_base = 0xD7C0u;
78static const ::boost::uint16_t low_surrogate_base = 0xDC00u;
79static const ::boost::uint32_t ten_bit_mask = 0x3FFu;
80
81inline bool is_high_surrogate(::boost::uint16_t v)
82{
83   return (v & 0xFC00u) == 0xd800u;
84}
85inline bool is_low_surrogate(::boost::uint16_t v)
86{
87   return (v & 0xFC00u) == 0xdc00u;
88}
89template <class T>
90inline bool is_surrogate(T v)
91{
92   return (v & 0xF800u) == 0xd800;
93}
94
95inline unsigned utf8_byte_count(boost::uint8_t c)
96{
97   // if the most significant bit with a zero in it is in position
98   // 8-N then there are N bytes in this UTF-8 sequence:
99   boost::uint8_t mask = 0x80u;
100   unsigned result = 0;
101   while(c & mask)
102   {
103      ++result;
104      mask >>= 1;
105   }
106   return (result == 0) ? 1 : ((result > 4) ? 4 : result);
107}
108
109inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
110{
111   return utf8_byte_count(c) - 1;
112}
113
114inline void invalid_utf32_code_point(::boost::uint32_t val)
115{
116#ifndef BOOST_NO_STD_LOCALE
117   std::stringstream ss;
118   ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
119   std::out_of_range e(ss.str());
120#else
121   std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence");
122#endif
123   boost::throw_exception(e);
124}
125
126
127} // namespace detail
128
129template <class BaseIterator, class U16Type = ::boost::uint16_t>
130class u32_to_u16_iterator
131   : public boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type>
132{
133   typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
134
135#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
136   typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
137
138   BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
139   BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
140#endif
141
142public:
143   typename base_type::reference
144      dereference()const
145   {
146      if(m_current == 2)
147         extract_current();
148      return m_values[m_current];
149   }
150   bool equal(const u32_to_u16_iterator& that)const
151   {
152      if(m_position == that.m_position)
153      {
154         // Both m_currents must be equal, or both even
155         // this is the same as saying their sum must be even:
156         return (m_current + that.m_current) & 1u ? false : true;
157      }
158      return false;
159   }
160   void increment()
161   {
162      // if we have a pending read then read now, so that we know whether
163      // to skip a position, or move to a low-surrogate:
164      if(m_current == 2)
165      {
166         // pending read:
167         extract_current();
168      }
169      // move to the next surrogate position:
170      ++m_current;
171      // if we've reached the end skip a position:
172      if(m_values[m_current] == 0)
173      {
174         m_current = 2;
175         ++m_position;
176      }
177   }
178   void decrement()
179   {
180      if(m_current != 1)
181      {
182         // decrementing an iterator always leads to a valid position:
183         --m_position;
184         extract_current();
185         m_current = m_values[1] ? 1 : 0;
186      }
187      else
188      {
189         m_current = 0;
190      }
191   }
192   BaseIterator base()const
193   {
194      return m_position;
195   }
196   // construct:
197   u32_to_u16_iterator() : m_position(), m_current(0)
198   {
199      m_values[0] = 0;
200      m_values[1] = 0;
201      m_values[2] = 0;
202   }
203   u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
204   {
205      m_values[0] = 0;
206      m_values[1] = 0;
207      m_values[2] = 0;
208   }
209private:
210
211   void extract_current()const
212   {
213      // begin by checking for a code point out of range:
214      ::boost::uint32_t v = *m_position;
215      if(v >= 0x10000u)
216      {
217         if(v > 0x10FFFFu)
218            detail::invalid_utf32_code_point(*m_position);
219         // split into two surrogates:
220         m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
221         m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
222         m_current = 0;
223         BOOST_ASSERT(detail::is_high_surrogate(m_values[0]));
224         BOOST_ASSERT(detail::is_low_surrogate(m_values[1]));
225      }
226      else
227      {
228         // 16-bit code point:
229         m_values[0] = static_cast<U16Type>(*m_position);
230         m_values[1] = 0;
231         m_current = 0;
232         // value must not be a surrogate:
233         if(detail::is_surrogate(m_values[0]))
234            detail::invalid_utf32_code_point(*m_position);
235      }
236   }
237   BaseIterator m_position;
238   mutable U16Type m_values[3];
239   mutable unsigned m_current;
240};
241
242template <class BaseIterator, class U32Type = ::boost::uint32_t>
243class u16_to_u32_iterator
244   : public boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
245{
246   typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
247   // special values for pending iterator reads:
248   BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
249
250#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
251   typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
252
253   BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
254   BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
255#endif
256
257public:
258   typename base_type::reference
259      dereference()const
260   {
261      if(m_value == pending_read)
262         extract_current();
263      return m_value;
264   }
265   bool equal(const u16_to_u32_iterator& that)const
266   {
267      return m_position == that.m_position;
268   }
269   void increment()
270   {
271      // skip high surrogate first if there is one:
272      if(detail::is_high_surrogate(*m_position)) ++m_position;
273      ++m_position;
274      m_value = pending_read;
275   }
276   void decrement()
277   {
278      --m_position;
279      // if we have a low surrogate then go back one more:
280      if(detail::is_low_surrogate(*m_position))
281         --m_position;
282      m_value = pending_read;
283   }
284   BaseIterator base()const
285   {
286      return m_position;
287   }
288   // construct:
289   u16_to_u32_iterator() : m_position()
290   {
291      m_value = pending_read;
292   }
293   u16_to_u32_iterator(BaseIterator b) : m_position(b)
294   {
295      m_value = pending_read;
296   }
297private:
298   static void invalid_code_point(::boost::uint16_t val)
299   {
300#ifndef BOOST_NO_STD_LOCALE
301      std::stringstream ss;
302      ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
303      std::out_of_range e(ss.str());
304#else
305      std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence");
306#endif
307      boost::throw_exception(e);
308   }
309   void extract_current()const
310   {
311      m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
312      // if the last value is a high surrogate then adjust m_position and m_value as needed:
313      if(detail::is_high_surrogate(*m_position))
314      {
315         // precondition; next value must have be a low-surrogate:
316         BaseIterator next(m_position);
317         ::boost::uint16_t t = *++next;
318         if((t & 0xFC00u) != 0xDC00u)
319            invalid_code_point(t);
320         m_value = (m_value - detail::high_surrogate_base) << 10;
321         m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
322      }
323      // postcondition; result must not be a surrogate:
324      if(detail::is_surrogate(m_value))
325         invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
326   }
327   BaseIterator m_position;
328   mutable U32Type m_value;
329};
330
331template <class BaseIterator, class U8Type = ::boost::uint8_t>
332class u32_to_u8_iterator
333   : public boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type>
334{
335   typedef boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type;
336   
337#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
338   typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
339
340   BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
341   BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
342#endif
343
344public:
345   typename base_type::reference
346      dereference()const
347   {
348      if(m_current == 4)
349         extract_current();
350      return m_values[m_current];
351   }
352   bool equal(const u32_to_u8_iterator& that)const
353   {
354      if(m_position == that.m_position)
355      {
356         // either the m_current's must be equal, or one must be 0 and
357         // the other 4: which means neither must have bits 1 or 2 set:
358         return (m_current == that.m_current)
359            || (((m_current | that.m_current) & 3) == 0);
360      }
361      return false;
362   }
363   void increment()
364   {
365      // if we have a pending read then read now, so that we know whether
366      // to skip a position, or move to a low-surrogate:
367      if(m_current == 4)
368      {
369         // pending read:
370         extract_current();
371      }
372      // move to the next surrogate position:
373      ++m_current;
374      // if we've reached the end skip a position:
375      if(m_values[m_current] == 0)
376      {
377         m_current = 4;
378         ++m_position;
379      }
380   }
381   void decrement()
382   {
383      if((m_current & 3) == 0)
384      {
385         --m_position;
386         extract_current();
387         m_current = 3;
388         while(m_current && (m_values[m_current] == 0))
389            --m_current;
390      }
391      else
392         --m_current;
393   }
394   BaseIterator base()const
395   {
396      return m_position;
397   }
398   // construct:
399   u32_to_u8_iterator() : m_position(), m_current(0)
400   {
401      m_values[0] = 0;
402      m_values[1] = 0;
403      m_values[2] = 0;
404      m_values[3] = 0;
405      m_values[4] = 0;
406   }
407   u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
408   {
409      m_values[0] = 0;
410      m_values[1] = 0;
411      m_values[2] = 0;
412      m_values[3] = 0;
413      m_values[4] = 0;
414   }
415private:
416
417   void extract_current()const
418   {
419      boost::uint32_t c = *m_position;
420      if(c > 0x10FFFFu)
421         detail::invalid_utf32_code_point(c);
422      if(c < 0x80u)
423      {
424         m_values[0] = static_cast<unsigned char>(c);
425         m_values[1] = static_cast<unsigned char>(0u);
426         m_values[2] = static_cast<unsigned char>(0u);
427         m_values[3] = static_cast<unsigned char>(0u);
428      }
429      else if(c < 0x800u)
430      {
431         m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
432         m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
433         m_values[2] = static_cast<unsigned char>(0u);
434         m_values[3] = static_cast<unsigned char>(0u);
435      }
436      else if(c < 0x10000u)
437      {
438         m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
439         m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
440         m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
441         m_values[3] = static_cast<unsigned char>(0u);
442      }
443      else
444      {
445         m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
446         m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
447         m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
448         m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
449      }
450      m_current= 0;
451   }
452   BaseIterator m_position;
453   mutable U8Type m_values[5];
454   mutable unsigned m_current;
455};
456
457template <class BaseIterator, class U32Type = ::boost::uint32_t>
458class u8_to_u32_iterator
459   : public boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
460{
461   typedef boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
462   // special values for pending iterator reads:
463   BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
464
465#if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
466   typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
467
468   BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
469   BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
470#endif
471
472public:
473   typename base_type::reference
474      dereference()const
475   {
476      if(m_value == pending_read)
477         extract_current();
478      return m_value;
479   }
480   bool equal(const u8_to_u32_iterator& that)const
481   {
482      return m_position == that.m_position;
483   }
484   void increment()
485   {
486      // skip high surrogate first if there is one:
487      unsigned c = detail::utf8_byte_count(*m_position);
488      std::advance(m_position, c);
489      m_value = pending_read;
490   }
491   void decrement()
492   {
493      // Keep backtracking until we don't have a trailing character:
494      unsigned count = 0;
495      while((*--m_position & 0xC0u) == 0x80u) ++count;
496      // now check that the sequence was valid:
497      if(count != detail::utf8_trailing_byte_count(*m_position))
498         invalid_sequnce();
499      m_value = pending_read;
500   }
501   BaseIterator base()const
502   {
503      return m_position;
504   }
505   // construct:
506   u8_to_u32_iterator() : m_position()
507   {
508      m_value = pending_read;
509   }
510   u8_to_u32_iterator(BaseIterator b) : m_position(b)
511   {
512      m_value = pending_read;
513   }
514private:
515   static void invalid_sequnce()
516   {
517      std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
518      boost::throw_exception(e);
519   }
520   void extract_current()const
521   {
522      m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
523      // we must not have a continuation character:
524      if((m_value & 0xC0u) == 0x80u)
525         invalid_sequnce();
526      // see how many extra byts we have:
527      unsigned extra = detail::utf8_trailing_byte_count(*m_position);
528      // extract the extra bits, 6 from each extra byte:
529      BaseIterator next(m_position);
530      for(unsigned c = 0; c < extra; ++c)
531      {
532         ++next;
533         m_value <<= 6;
534         m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
535      }
536      // we now need to remove a few of the leftmost bits, but how many depends
537      // upon how many extra bytes we've extracted:
538      static const boost::uint32_t masks[4] =
539      {
540         0x7Fu,
541         0x7FFu,
542         0xFFFFu,
543         0x1FFFFFu,
544      };
545      m_value &= masks[extra];
546      // check the result:
547      if(m_value > static_cast<U32Type>(0x10FFFFu))
548         invalid_sequnce();
549   }
550   BaseIterator m_position;
551   mutable U32Type m_value;
552};
553
554template <class BaseIterator>
555class utf16_output_iterator
556{
557public:
558   typedef void                                   difference_type;
559   typedef void                                   value_type;
560   typedef boost::uint32_t*                       pointer;
561   typedef boost::uint32_t&                       reference;
562   typedef std::output_iterator_tag               iterator_category;
563
564   utf16_output_iterator(const BaseIterator& b)
565      : m_position(b){}
566   utf16_output_iterator(const utf16_output_iterator& that)
567      : m_position(that.m_position){}
568   utf16_output_iterator& operator=(const utf16_output_iterator& that)
569   {
570      m_position = that.m_position;
571      return *this;
572   }
573   const utf16_output_iterator& operator*()const
574   {
575      return *this;
576   }
577   void operator=(boost::uint32_t val)const
578   {
579      push(val);
580   }
581   utf16_output_iterator& operator++()
582   {
583      return *this;
584   }
585   utf16_output_iterator& operator++(int)
586   {
587      return *this;
588   }
589   BaseIterator base()const
590   {
591      return m_position;
592   }
593private:
594   void push(boost::uint32_t v)const
595   {
596      if(v >= 0x10000u)
597      {
598         // begin by checking for a code point out of range:
599         if(v > 0x10FFFFu)
600            detail::invalid_utf32_code_point(v);
601         // split into two surrogates:
602         *m_position++ = static_cast<boost::uint16_t>(v >> 10) + detail::high_surrogate_base;
603         *m_position++ = static_cast<boost::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
604      }
605      else
606      {
607         // 16-bit code point:
608         // value must not be a surrogate:
609         if(detail::is_surrogate(v))
610            detail::invalid_utf32_code_point(v);
611         *m_position++ = static_cast<boost::uint16_t>(v);
612      }
613   }
614   mutable BaseIterator m_position;
615};
616
617template <class BaseIterator>
618class utf8_output_iterator
619{
620public:
621   typedef void                                   difference_type;
622   typedef void                                   value_type;
623   typedef boost::uint32_t*                       pointer;
624   typedef boost::uint32_t&                       reference;
625   typedef std::output_iterator_tag               iterator_category;
626
627   utf8_output_iterator(const BaseIterator& b)
628      : m_position(b){}
629   utf8_output_iterator(const utf8_output_iterator& that)
630      : m_position(that.m_position){}
631   utf8_output_iterator& operator=(const utf8_output_iterator& that)
632   {
633      m_position = that.m_position;
634      return *this;
635   }
636   const utf8_output_iterator& operator*()const
637   {
638      return *this;
639   }
640   void operator=(boost::uint32_t val)const
641   {
642      push(val);
643   }
644   utf8_output_iterator& operator++()
645   {
646      return *this;
647   }
648   utf8_output_iterator& operator++(int)
649   {
650      return *this;
651   }
652   BaseIterator base()const
653   {
654      return m_position;
655   }
656private:
657   void push(boost::uint32_t c)const
658   {
659      if(c > 0x10FFFFu)
660         detail::invalid_utf32_code_point(c);
661      if(c < 0x80u)
662      {
663         *m_position++ = static_cast<unsigned char>(c);
664      }
665      else if(c < 0x800u)
666      {
667         *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
668         *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
669      }
670      else if(c < 0x10000u)
671      {
672         *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
673         *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
674         *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
675      }
676      else
677      {
678         *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
679         *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
680         *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
681         *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
682      }
683   }
684   mutable BaseIterator m_position;
685};
686
687} // namespace boost
688
689#endif // BOOST_REGEX_UNICODE_ITERATOR_HPP
Note: See TracBrowser for help on using the repository browser.