[857] | 1 | /*
|
---|
| 2 | *
|
---|
| 3 | * Copyright (c) 2004
|
---|
| 4 | * John Maddock
|
---|
| 5 | *
|
---|
| 6 | * Use, modification and distribution are subject to the
|
---|
| 7 | * Boost Software License, Version 1.0. (See accompanying file
|
---|
| 8 | * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
|
---|
| 9 | *
|
---|
| 10 | */
|
---|
| 11 |
|
---|
| 12 | /*
|
---|
| 13 | * LOCATION: see http://www.boost.org for most recent version.
|
---|
| 14 | * FILE unicode_iterator.hpp
|
---|
| 15 | * VERSION see <boost/version.hpp>
|
---|
| 16 | * DESCRIPTION: Iterator adapters for converting between different Unicode encodings.
|
---|
| 17 | */
|
---|
| 18 |
|
---|
| 19 | /****************************************************************************
|
---|
| 20 |
|
---|
| 21 | Contents:
|
---|
| 22 | ~~~~~~~~~
|
---|
| 23 |
|
---|
| 24 | 1) Read Only, Input Adapters:
|
---|
| 25 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
---|
| 26 |
|
---|
| 27 | template <class BaseIterator, class U8Type = ::boost::uint8_t>
|
---|
| 28 | class u32_to_u8_iterator;
|
---|
| 29 |
|
---|
| 30 | Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-8.
|
---|
| 31 |
|
---|
| 32 | template <class BaseIterator, class U32Type = ::boost::uint32_t>
|
---|
| 33 | class u8_to_u32_iterator;
|
---|
| 34 |
|
---|
| 35 | Adapts sequence of UTF-8 code points to "look like" a sequence of UTF-32.
|
---|
| 36 |
|
---|
| 37 | template <class BaseIterator, class U16Type = ::boost::uint16_t>
|
---|
| 38 | class u32_to_u16_iterator;
|
---|
| 39 |
|
---|
| 40 | Adapts sequence of UTF-32 code points to "look like" a sequence of UTF-16.
|
---|
| 41 |
|
---|
| 42 | template <class BaseIterator, class U32Type = ::boost::uint32_t>
|
---|
| 43 | class u16_to_u32_iterator;
|
---|
| 44 |
|
---|
| 45 | Adapts sequence of UTF-16 code points to "look like" a sequence of UTF-32.
|
---|
| 46 |
|
---|
| 47 | 2) Single pass output iterator adapters:
|
---|
| 48 |
|
---|
| 49 | template <class BaseIterator>
|
---|
| 50 | class utf8_output_iterator;
|
---|
| 51 |
|
---|
| 52 | Accepts UTF-32 code points and forwards them on as UTF-8 code points.
|
---|
| 53 |
|
---|
| 54 | template <class BaseIterator>
|
---|
| 55 | class utf16_output_iterator;
|
---|
| 56 |
|
---|
| 57 | Accepts UTF-32 code points and forwards them on as UTF-16 code points.
|
---|
| 58 |
|
---|
| 59 | ****************************************************************************/
|
---|
| 60 |
|
---|
| 61 | #ifndef BOOST_REGEX_UNICODE_ITERATOR_HPP
|
---|
| 62 | #define BOOST_REGEX_UNICODE_ITERATOR_HPP
|
---|
| 63 | #include <boost/cstdint.hpp>
|
---|
| 64 | #include <boost/assert.hpp>
|
---|
| 65 | #include <boost/iterator/iterator_facade.hpp>
|
---|
| 66 | #include <boost/static_assert.hpp>
|
---|
| 67 | #include <boost/throw_exception.hpp>
|
---|
| 68 | #include <stdexcept>
|
---|
| 69 | #ifndef BOOST_NO_STD_LOCALE
|
---|
| 70 | #include <sstream>
|
---|
| 71 | #endif
|
---|
| 72 |
|
---|
| 73 | namespace boost{
|
---|
| 74 |
|
---|
| 75 | namespace detail{
|
---|
| 76 |
|
---|
| 77 | static const ::boost::uint16_t high_surrogate_base = 0xD7C0u;
|
---|
| 78 | static const ::boost::uint16_t low_surrogate_base = 0xDC00u;
|
---|
| 79 | static const ::boost::uint32_t ten_bit_mask = 0x3FFu;
|
---|
| 80 |
|
---|
| 81 | inline bool is_high_surrogate(::boost::uint16_t v)
|
---|
| 82 | {
|
---|
| 83 | return (v & 0xFC00u) == 0xd800u;
|
---|
| 84 | }
|
---|
| 85 | inline bool is_low_surrogate(::boost::uint16_t v)
|
---|
| 86 | {
|
---|
| 87 | return (v & 0xFC00u) == 0xdc00u;
|
---|
| 88 | }
|
---|
// Returns true when v is any UTF-16 surrogate value (0xD800-0xDFFF).
//
// T may be wider than 16 bits: callers in this file pass 32-bit code points
// as well as 16-bit code units.  Every bit above the low eleven therefore
// has to take part in the comparison; masking with only 0xF800 would treat
// valid code points such as U+1D800 as surrogates.
template <class T>
inline bool is_surrogate(T v)
{
   return (v & 0xFFFFF800u) == 0xd800;
}
|
---|
| 94 |
|
---|
| 95 | inline unsigned utf8_byte_count(boost::uint8_t c)
|
---|
| 96 | {
|
---|
| 97 | // if the most significant bit with a zero in it is in position
|
---|
| 98 | // 8-N then there are N bytes in this UTF-8 sequence:
|
---|
| 99 | boost::uint8_t mask = 0x80u;
|
---|
| 100 | unsigned result = 0;
|
---|
| 101 | while(c & mask)
|
---|
| 102 | {
|
---|
| 103 | ++result;
|
---|
| 104 | mask >>= 1;
|
---|
| 105 | }
|
---|
| 106 | return (result == 0) ? 1 : ((result > 4) ? 4 : result);
|
---|
| 107 | }
|
---|
| 108 |
|
---|
| 109 | inline unsigned utf8_trailing_byte_count(boost::uint8_t c)
|
---|
| 110 | {
|
---|
| 111 | return utf8_byte_count(c) - 1;
|
---|
| 112 | }
|
---|
| 113 |
|
---|
| 114 | inline void invalid_utf32_code_point(::boost::uint32_t val)
|
---|
| 115 | {
|
---|
| 116 | #ifndef BOOST_NO_STD_LOCALE
|
---|
| 117 | std::stringstream ss;
|
---|
| 118 | ss << "Invalid UTF-32 code point U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-16 sequence";
|
---|
| 119 | std::out_of_range e(ss.str());
|
---|
| 120 | #else
|
---|
| 121 | std::out_of_range e("Invalid UTF-32 code point encountered while trying to encode UTF-16 sequence");
|
---|
| 122 | #endif
|
---|
| 123 | boost::throw_exception(e);
|
---|
| 124 | }
|
---|
| 125 |
|
---|
| 126 |
|
---|
| 127 | } // namespace detail
|
---|
| 128 |
|
---|
| 129 | template <class BaseIterator, class U16Type = ::boost::uint16_t>
|
---|
| 130 | class u32_to_u16_iterator
|
---|
| 131 | : public boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type>
|
---|
| 132 | {
|
---|
| 133 | typedef boost::iterator_facade<u32_to_u16_iterator<BaseIterator, U16Type>, U16Type, std::bidirectional_iterator_tag, const U16Type> base_type;
|
---|
| 134 |
|
---|
| 135 | #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
---|
| 136 | typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
---|
| 137 |
|
---|
| 138 | BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
|
---|
| 139 | BOOST_STATIC_ASSERT(sizeof(U16Type)*CHAR_BIT == 16);
|
---|
| 140 | #endif
|
---|
| 141 |
|
---|
| 142 | public:
|
---|
| 143 | typename base_type::reference
|
---|
| 144 | dereference()const
|
---|
| 145 | {
|
---|
| 146 | if(m_current == 2)
|
---|
| 147 | extract_current();
|
---|
| 148 | return m_values[m_current];
|
---|
| 149 | }
|
---|
| 150 | bool equal(const u32_to_u16_iterator& that)const
|
---|
| 151 | {
|
---|
| 152 | if(m_position == that.m_position)
|
---|
| 153 | {
|
---|
| 154 | // Both m_currents must be equal, or both even
|
---|
| 155 | // this is the same as saying their sum must be even:
|
---|
| 156 | return (m_current + that.m_current) & 1u ? false : true;
|
---|
| 157 | }
|
---|
| 158 | return false;
|
---|
| 159 | }
|
---|
| 160 | void increment()
|
---|
| 161 | {
|
---|
| 162 | // if we have a pending read then read now, so that we know whether
|
---|
| 163 | // to skip a position, or move to a low-surrogate:
|
---|
| 164 | if(m_current == 2)
|
---|
| 165 | {
|
---|
| 166 | // pending read:
|
---|
| 167 | extract_current();
|
---|
| 168 | }
|
---|
| 169 | // move to the next surrogate position:
|
---|
| 170 | ++m_current;
|
---|
| 171 | // if we've reached the end skip a position:
|
---|
| 172 | if(m_values[m_current] == 0)
|
---|
| 173 | {
|
---|
| 174 | m_current = 2;
|
---|
| 175 | ++m_position;
|
---|
| 176 | }
|
---|
| 177 | }
|
---|
| 178 | void decrement()
|
---|
| 179 | {
|
---|
| 180 | if(m_current != 1)
|
---|
| 181 | {
|
---|
| 182 | // decrementing an iterator always leads to a valid position:
|
---|
| 183 | --m_position;
|
---|
| 184 | extract_current();
|
---|
| 185 | m_current = m_values[1] ? 1 : 0;
|
---|
| 186 | }
|
---|
| 187 | else
|
---|
| 188 | {
|
---|
| 189 | m_current = 0;
|
---|
| 190 | }
|
---|
| 191 | }
|
---|
| 192 | BaseIterator base()const
|
---|
| 193 | {
|
---|
| 194 | return m_position;
|
---|
| 195 | }
|
---|
| 196 | // construct:
|
---|
| 197 | u32_to_u16_iterator() : m_position(), m_current(0)
|
---|
| 198 | {
|
---|
| 199 | m_values[0] = 0;
|
---|
| 200 | m_values[1] = 0;
|
---|
| 201 | m_values[2] = 0;
|
---|
| 202 | }
|
---|
| 203 | u32_to_u16_iterator(BaseIterator b) : m_position(b), m_current(2)
|
---|
| 204 | {
|
---|
| 205 | m_values[0] = 0;
|
---|
| 206 | m_values[1] = 0;
|
---|
| 207 | m_values[2] = 0;
|
---|
| 208 | }
|
---|
| 209 | private:
|
---|
| 210 |
|
---|
| 211 | void extract_current()const
|
---|
| 212 | {
|
---|
| 213 | // begin by checking for a code point out of range:
|
---|
| 214 | ::boost::uint32_t v = *m_position;
|
---|
| 215 | if(v >= 0x10000u)
|
---|
| 216 | {
|
---|
| 217 | if(v > 0x10FFFFu)
|
---|
| 218 | detail::invalid_utf32_code_point(*m_position);
|
---|
| 219 | // split into two surrogates:
|
---|
| 220 | m_values[0] = static_cast<U16Type>(v >> 10) + detail::high_surrogate_base;
|
---|
| 221 | m_values[1] = static_cast<U16Type>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
|
---|
| 222 | m_current = 0;
|
---|
| 223 | BOOST_ASSERT(detail::is_high_surrogate(m_values[0]));
|
---|
| 224 | BOOST_ASSERT(detail::is_low_surrogate(m_values[1]));
|
---|
| 225 | }
|
---|
| 226 | else
|
---|
| 227 | {
|
---|
| 228 | // 16-bit code point:
|
---|
| 229 | m_values[0] = static_cast<U16Type>(*m_position);
|
---|
| 230 | m_values[1] = 0;
|
---|
| 231 | m_current = 0;
|
---|
| 232 | // value must not be a surrogate:
|
---|
| 233 | if(detail::is_surrogate(m_values[0]))
|
---|
| 234 | detail::invalid_utf32_code_point(*m_position);
|
---|
| 235 | }
|
---|
| 236 | }
|
---|
| 237 | BaseIterator m_position;
|
---|
| 238 | mutable U16Type m_values[3];
|
---|
| 239 | mutable unsigned m_current;
|
---|
| 240 | };
|
---|
| 241 |
|
---|
| 242 | template <class BaseIterator, class U32Type = ::boost::uint32_t>
|
---|
| 243 | class u16_to_u32_iterator
|
---|
| 244 | : public boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
|
---|
| 245 | {
|
---|
| 246 | typedef boost::iterator_facade<u16_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
|
---|
| 247 | // special values for pending iterator reads:
|
---|
| 248 | BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
|
---|
| 249 |
|
---|
| 250 | #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
---|
| 251 | typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
---|
| 252 |
|
---|
| 253 | BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 16);
|
---|
| 254 | BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
|
---|
| 255 | #endif
|
---|
| 256 |
|
---|
| 257 | public:
|
---|
| 258 | typename base_type::reference
|
---|
| 259 | dereference()const
|
---|
| 260 | {
|
---|
| 261 | if(m_value == pending_read)
|
---|
| 262 | extract_current();
|
---|
| 263 | return m_value;
|
---|
| 264 | }
|
---|
| 265 | bool equal(const u16_to_u32_iterator& that)const
|
---|
| 266 | {
|
---|
| 267 | return m_position == that.m_position;
|
---|
| 268 | }
|
---|
| 269 | void increment()
|
---|
| 270 | {
|
---|
| 271 | // skip high surrogate first if there is one:
|
---|
| 272 | if(detail::is_high_surrogate(*m_position)) ++m_position;
|
---|
| 273 | ++m_position;
|
---|
| 274 | m_value = pending_read;
|
---|
| 275 | }
|
---|
| 276 | void decrement()
|
---|
| 277 | {
|
---|
| 278 | --m_position;
|
---|
| 279 | // if we have a low surrogate then go back one more:
|
---|
| 280 | if(detail::is_low_surrogate(*m_position))
|
---|
| 281 | --m_position;
|
---|
| 282 | m_value = pending_read;
|
---|
| 283 | }
|
---|
| 284 | BaseIterator base()const
|
---|
| 285 | {
|
---|
| 286 | return m_position;
|
---|
| 287 | }
|
---|
| 288 | // construct:
|
---|
| 289 | u16_to_u32_iterator() : m_position()
|
---|
| 290 | {
|
---|
| 291 | m_value = pending_read;
|
---|
| 292 | }
|
---|
| 293 | u16_to_u32_iterator(BaseIterator b) : m_position(b)
|
---|
| 294 | {
|
---|
| 295 | m_value = pending_read;
|
---|
| 296 | }
|
---|
| 297 | private:
|
---|
| 298 | static void invalid_code_point(::boost::uint16_t val)
|
---|
| 299 | {
|
---|
| 300 | #ifndef BOOST_NO_STD_LOCALE
|
---|
| 301 | std::stringstream ss;
|
---|
| 302 | ss << "Misplaced UTF-16 surrogate U+" << std::showbase << std::hex << val << " encountered while trying to encode UTF-32 sequence";
|
---|
| 303 | std::out_of_range e(ss.str());
|
---|
| 304 | #else
|
---|
| 305 | std::out_of_range e("Misplaced UTF-16 surrogate encountered while trying to encode UTF-32 sequence");
|
---|
| 306 | #endif
|
---|
| 307 | boost::throw_exception(e);
|
---|
| 308 | }
|
---|
| 309 | void extract_current()const
|
---|
| 310 | {
|
---|
| 311 | m_value = static_cast<U32Type>(static_cast< ::boost::uint16_t>(*m_position));
|
---|
| 312 | // if the last value is a high surrogate then adjust m_position and m_value as needed:
|
---|
| 313 | if(detail::is_high_surrogate(*m_position))
|
---|
| 314 | {
|
---|
| 315 | // precondition; next value must have be a low-surrogate:
|
---|
| 316 | BaseIterator next(m_position);
|
---|
| 317 | ::boost::uint16_t t = *++next;
|
---|
| 318 | if((t & 0xFC00u) != 0xDC00u)
|
---|
| 319 | invalid_code_point(t);
|
---|
| 320 | m_value = (m_value - detail::high_surrogate_base) << 10;
|
---|
| 321 | m_value |= (static_cast<U32Type>(static_cast< ::boost::uint16_t>(t)) & detail::ten_bit_mask);
|
---|
| 322 | }
|
---|
| 323 | // postcondition; result must not be a surrogate:
|
---|
| 324 | if(detail::is_surrogate(m_value))
|
---|
| 325 | invalid_code_point(static_cast< ::boost::uint16_t>(m_value));
|
---|
| 326 | }
|
---|
| 327 | BaseIterator m_position;
|
---|
| 328 | mutable U32Type m_value;
|
---|
| 329 | };
|
---|
| 330 |
|
---|
| 331 | template <class BaseIterator, class U8Type = ::boost::uint8_t>
|
---|
| 332 | class u32_to_u8_iterator
|
---|
| 333 | : public boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type>
|
---|
| 334 | {
|
---|
| 335 | typedef boost::iterator_facade<u32_to_u8_iterator<BaseIterator, U8Type>, U8Type, std::bidirectional_iterator_tag, const U8Type> base_type;
|
---|
| 336 |
|
---|
| 337 | #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
---|
| 338 | typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
---|
| 339 |
|
---|
| 340 | BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 32);
|
---|
| 341 | BOOST_STATIC_ASSERT(sizeof(U8Type)*CHAR_BIT == 8);
|
---|
| 342 | #endif
|
---|
| 343 |
|
---|
| 344 | public:
|
---|
| 345 | typename base_type::reference
|
---|
| 346 | dereference()const
|
---|
| 347 | {
|
---|
| 348 | if(m_current == 4)
|
---|
| 349 | extract_current();
|
---|
| 350 | return m_values[m_current];
|
---|
| 351 | }
|
---|
| 352 | bool equal(const u32_to_u8_iterator& that)const
|
---|
| 353 | {
|
---|
| 354 | if(m_position == that.m_position)
|
---|
| 355 | {
|
---|
| 356 | // either the m_current's must be equal, or one must be 0 and
|
---|
| 357 | // the other 4: which means neither must have bits 1 or 2 set:
|
---|
| 358 | return (m_current == that.m_current)
|
---|
| 359 | || (((m_current | that.m_current) & 3) == 0);
|
---|
| 360 | }
|
---|
| 361 | return false;
|
---|
| 362 | }
|
---|
| 363 | void increment()
|
---|
| 364 | {
|
---|
| 365 | // if we have a pending read then read now, so that we know whether
|
---|
| 366 | // to skip a position, or move to a low-surrogate:
|
---|
| 367 | if(m_current == 4)
|
---|
| 368 | {
|
---|
| 369 | // pending read:
|
---|
| 370 | extract_current();
|
---|
| 371 | }
|
---|
| 372 | // move to the next surrogate position:
|
---|
| 373 | ++m_current;
|
---|
| 374 | // if we've reached the end skip a position:
|
---|
| 375 | if(m_values[m_current] == 0)
|
---|
| 376 | {
|
---|
| 377 | m_current = 4;
|
---|
| 378 | ++m_position;
|
---|
| 379 | }
|
---|
| 380 | }
|
---|
| 381 | void decrement()
|
---|
| 382 | {
|
---|
| 383 | if((m_current & 3) == 0)
|
---|
| 384 | {
|
---|
| 385 | --m_position;
|
---|
| 386 | extract_current();
|
---|
| 387 | m_current = 3;
|
---|
| 388 | while(m_current && (m_values[m_current] == 0))
|
---|
| 389 | --m_current;
|
---|
| 390 | }
|
---|
| 391 | else
|
---|
| 392 | --m_current;
|
---|
| 393 | }
|
---|
| 394 | BaseIterator base()const
|
---|
| 395 | {
|
---|
| 396 | return m_position;
|
---|
| 397 | }
|
---|
| 398 | // construct:
|
---|
| 399 | u32_to_u8_iterator() : m_position(), m_current(0)
|
---|
| 400 | {
|
---|
| 401 | m_values[0] = 0;
|
---|
| 402 | m_values[1] = 0;
|
---|
| 403 | m_values[2] = 0;
|
---|
| 404 | m_values[3] = 0;
|
---|
| 405 | m_values[4] = 0;
|
---|
| 406 | }
|
---|
| 407 | u32_to_u8_iterator(BaseIterator b) : m_position(b), m_current(4)
|
---|
| 408 | {
|
---|
| 409 | m_values[0] = 0;
|
---|
| 410 | m_values[1] = 0;
|
---|
| 411 | m_values[2] = 0;
|
---|
| 412 | m_values[3] = 0;
|
---|
| 413 | m_values[4] = 0;
|
---|
| 414 | }
|
---|
| 415 | private:
|
---|
| 416 |
|
---|
| 417 | void extract_current()const
|
---|
| 418 | {
|
---|
| 419 | boost::uint32_t c = *m_position;
|
---|
| 420 | if(c > 0x10FFFFu)
|
---|
| 421 | detail::invalid_utf32_code_point(c);
|
---|
| 422 | if(c < 0x80u)
|
---|
| 423 | {
|
---|
| 424 | m_values[0] = static_cast<unsigned char>(c);
|
---|
| 425 | m_values[1] = static_cast<unsigned char>(0u);
|
---|
| 426 | m_values[2] = static_cast<unsigned char>(0u);
|
---|
| 427 | m_values[3] = static_cast<unsigned char>(0u);
|
---|
| 428 | }
|
---|
| 429 | else if(c < 0x800u)
|
---|
| 430 | {
|
---|
| 431 | m_values[0] = static_cast<unsigned char>(0xC0u + (c >> 6));
|
---|
| 432 | m_values[1] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
---|
| 433 | m_values[2] = static_cast<unsigned char>(0u);
|
---|
| 434 | m_values[3] = static_cast<unsigned char>(0u);
|
---|
| 435 | }
|
---|
| 436 | else if(c < 0x10000u)
|
---|
| 437 | {
|
---|
| 438 | m_values[0] = static_cast<unsigned char>(0xE0u + (c >> 12));
|
---|
| 439 | m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
|
---|
| 440 | m_values[2] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
---|
| 441 | m_values[3] = static_cast<unsigned char>(0u);
|
---|
| 442 | }
|
---|
| 443 | else
|
---|
| 444 | {
|
---|
| 445 | m_values[0] = static_cast<unsigned char>(0xF0u + (c >> 18));
|
---|
| 446 | m_values[1] = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
|
---|
| 447 | m_values[2] = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
|
---|
| 448 | m_values[3] = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
---|
| 449 | }
|
---|
| 450 | m_current= 0;
|
---|
| 451 | }
|
---|
| 452 | BaseIterator m_position;
|
---|
| 453 | mutable U8Type m_values[5];
|
---|
| 454 | mutable unsigned m_current;
|
---|
| 455 | };
|
---|
| 456 |
|
---|
| 457 | template <class BaseIterator, class U32Type = ::boost::uint32_t>
|
---|
| 458 | class u8_to_u32_iterator
|
---|
| 459 | : public boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type>
|
---|
| 460 | {
|
---|
| 461 | typedef boost::iterator_facade<u8_to_u32_iterator<BaseIterator, U32Type>, U32Type, std::bidirectional_iterator_tag, const U32Type> base_type;
|
---|
| 462 | // special values for pending iterator reads:
|
---|
| 463 | BOOST_STATIC_CONSTANT(U32Type, pending_read = 0xffffffffu);
|
---|
| 464 |
|
---|
| 465 | #if !defined(BOOST_NO_STD_ITERATOR_TRAITS) && !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION)
|
---|
| 466 | typedef typename std::iterator_traits<BaseIterator>::value_type base_value_type;
|
---|
| 467 |
|
---|
| 468 | BOOST_STATIC_ASSERT(sizeof(base_value_type)*CHAR_BIT == 8);
|
---|
| 469 | BOOST_STATIC_ASSERT(sizeof(U32Type)*CHAR_BIT == 32);
|
---|
| 470 | #endif
|
---|
| 471 |
|
---|
| 472 | public:
|
---|
| 473 | typename base_type::reference
|
---|
| 474 | dereference()const
|
---|
| 475 | {
|
---|
| 476 | if(m_value == pending_read)
|
---|
| 477 | extract_current();
|
---|
| 478 | return m_value;
|
---|
| 479 | }
|
---|
| 480 | bool equal(const u8_to_u32_iterator& that)const
|
---|
| 481 | {
|
---|
| 482 | return m_position == that.m_position;
|
---|
| 483 | }
|
---|
| 484 | void increment()
|
---|
| 485 | {
|
---|
| 486 | // skip high surrogate first if there is one:
|
---|
| 487 | unsigned c = detail::utf8_byte_count(*m_position);
|
---|
| 488 | std::advance(m_position, c);
|
---|
| 489 | m_value = pending_read;
|
---|
| 490 | }
|
---|
| 491 | void decrement()
|
---|
| 492 | {
|
---|
| 493 | // Keep backtracking until we don't have a trailing character:
|
---|
| 494 | unsigned count = 0;
|
---|
| 495 | while((*--m_position & 0xC0u) == 0x80u) ++count;
|
---|
| 496 | // now check that the sequence was valid:
|
---|
| 497 | if(count != detail::utf8_trailing_byte_count(*m_position))
|
---|
| 498 | invalid_sequnce();
|
---|
| 499 | m_value = pending_read;
|
---|
| 500 | }
|
---|
| 501 | BaseIterator base()const
|
---|
| 502 | {
|
---|
| 503 | return m_position;
|
---|
| 504 | }
|
---|
| 505 | // construct:
|
---|
| 506 | u8_to_u32_iterator() : m_position()
|
---|
| 507 | {
|
---|
| 508 | m_value = pending_read;
|
---|
| 509 | }
|
---|
| 510 | u8_to_u32_iterator(BaseIterator b) : m_position(b)
|
---|
| 511 | {
|
---|
| 512 | m_value = pending_read;
|
---|
| 513 | }
|
---|
| 514 | private:
|
---|
| 515 | static void invalid_sequnce()
|
---|
| 516 | {
|
---|
| 517 | std::out_of_range e("Invalid UTF-8 sequence encountered while trying to encode UTF-32 character");
|
---|
| 518 | boost::throw_exception(e);
|
---|
| 519 | }
|
---|
| 520 | void extract_current()const
|
---|
| 521 | {
|
---|
| 522 | m_value = static_cast<U32Type>(static_cast< ::boost::uint8_t>(*m_position));
|
---|
| 523 | // we must not have a continuation character:
|
---|
| 524 | if((m_value & 0xC0u) == 0x80u)
|
---|
| 525 | invalid_sequnce();
|
---|
| 526 | // see how many extra byts we have:
|
---|
| 527 | unsigned extra = detail::utf8_trailing_byte_count(*m_position);
|
---|
| 528 | // extract the extra bits, 6 from each extra byte:
|
---|
| 529 | BaseIterator next(m_position);
|
---|
| 530 | for(unsigned c = 0; c < extra; ++c)
|
---|
| 531 | {
|
---|
| 532 | ++next;
|
---|
| 533 | m_value <<= 6;
|
---|
| 534 | m_value += static_cast<boost::uint8_t>(*next) & 0x3Fu;
|
---|
| 535 | }
|
---|
| 536 | // we now need to remove a few of the leftmost bits, but how many depends
|
---|
| 537 | // upon how many extra bytes we've extracted:
|
---|
| 538 | static const boost::uint32_t masks[4] =
|
---|
| 539 | {
|
---|
| 540 | 0x7Fu,
|
---|
| 541 | 0x7FFu,
|
---|
| 542 | 0xFFFFu,
|
---|
| 543 | 0x1FFFFFu,
|
---|
| 544 | };
|
---|
| 545 | m_value &= masks[extra];
|
---|
| 546 | // check the result:
|
---|
| 547 | if(m_value > static_cast<U32Type>(0x10FFFFu))
|
---|
| 548 | invalid_sequnce();
|
---|
| 549 | }
|
---|
| 550 | BaseIterator m_position;
|
---|
| 551 | mutable U32Type m_value;
|
---|
| 552 | };
|
---|
| 553 |
|
---|
| 554 | template <class BaseIterator>
|
---|
| 555 | class utf16_output_iterator
|
---|
| 556 | {
|
---|
| 557 | public:
|
---|
| 558 | typedef void difference_type;
|
---|
| 559 | typedef void value_type;
|
---|
| 560 | typedef boost::uint32_t* pointer;
|
---|
| 561 | typedef boost::uint32_t& reference;
|
---|
| 562 | typedef std::output_iterator_tag iterator_category;
|
---|
| 563 |
|
---|
| 564 | utf16_output_iterator(const BaseIterator& b)
|
---|
| 565 | : m_position(b){}
|
---|
| 566 | utf16_output_iterator(const utf16_output_iterator& that)
|
---|
| 567 | : m_position(that.m_position){}
|
---|
| 568 | utf16_output_iterator& operator=(const utf16_output_iterator& that)
|
---|
| 569 | {
|
---|
| 570 | m_position = that.m_position;
|
---|
| 571 | return *this;
|
---|
| 572 | }
|
---|
| 573 | const utf16_output_iterator& operator*()const
|
---|
| 574 | {
|
---|
| 575 | return *this;
|
---|
| 576 | }
|
---|
| 577 | void operator=(boost::uint32_t val)const
|
---|
| 578 | {
|
---|
| 579 | push(val);
|
---|
| 580 | }
|
---|
| 581 | utf16_output_iterator& operator++()
|
---|
| 582 | {
|
---|
| 583 | return *this;
|
---|
| 584 | }
|
---|
| 585 | utf16_output_iterator& operator++(int)
|
---|
| 586 | {
|
---|
| 587 | return *this;
|
---|
| 588 | }
|
---|
| 589 | BaseIterator base()const
|
---|
| 590 | {
|
---|
| 591 | return m_position;
|
---|
| 592 | }
|
---|
| 593 | private:
|
---|
| 594 | void push(boost::uint32_t v)const
|
---|
| 595 | {
|
---|
| 596 | if(v >= 0x10000u)
|
---|
| 597 | {
|
---|
| 598 | // begin by checking for a code point out of range:
|
---|
| 599 | if(v > 0x10FFFFu)
|
---|
| 600 | detail::invalid_utf32_code_point(v);
|
---|
| 601 | // split into two surrogates:
|
---|
| 602 | *m_position++ = static_cast<boost::uint16_t>(v >> 10) + detail::high_surrogate_base;
|
---|
| 603 | *m_position++ = static_cast<boost::uint16_t>(v & detail::ten_bit_mask) + detail::low_surrogate_base;
|
---|
| 604 | }
|
---|
| 605 | else
|
---|
| 606 | {
|
---|
| 607 | // 16-bit code point:
|
---|
| 608 | // value must not be a surrogate:
|
---|
| 609 | if(detail::is_surrogate(v))
|
---|
| 610 | detail::invalid_utf32_code_point(v);
|
---|
| 611 | *m_position++ = static_cast<boost::uint16_t>(v);
|
---|
| 612 | }
|
---|
| 613 | }
|
---|
| 614 | mutable BaseIterator m_position;
|
---|
| 615 | };
|
---|
| 616 |
|
---|
| 617 | template <class BaseIterator>
|
---|
| 618 | class utf8_output_iterator
|
---|
| 619 | {
|
---|
| 620 | public:
|
---|
| 621 | typedef void difference_type;
|
---|
| 622 | typedef void value_type;
|
---|
| 623 | typedef boost::uint32_t* pointer;
|
---|
| 624 | typedef boost::uint32_t& reference;
|
---|
| 625 | typedef std::output_iterator_tag iterator_category;
|
---|
| 626 |
|
---|
| 627 | utf8_output_iterator(const BaseIterator& b)
|
---|
| 628 | : m_position(b){}
|
---|
| 629 | utf8_output_iterator(const utf8_output_iterator& that)
|
---|
| 630 | : m_position(that.m_position){}
|
---|
| 631 | utf8_output_iterator& operator=(const utf8_output_iterator& that)
|
---|
| 632 | {
|
---|
| 633 | m_position = that.m_position;
|
---|
| 634 | return *this;
|
---|
| 635 | }
|
---|
| 636 | const utf8_output_iterator& operator*()const
|
---|
| 637 | {
|
---|
| 638 | return *this;
|
---|
| 639 | }
|
---|
| 640 | void operator=(boost::uint32_t val)const
|
---|
| 641 | {
|
---|
| 642 | push(val);
|
---|
| 643 | }
|
---|
| 644 | utf8_output_iterator& operator++()
|
---|
| 645 | {
|
---|
| 646 | return *this;
|
---|
| 647 | }
|
---|
| 648 | utf8_output_iterator& operator++(int)
|
---|
| 649 | {
|
---|
| 650 | return *this;
|
---|
| 651 | }
|
---|
| 652 | BaseIterator base()const
|
---|
| 653 | {
|
---|
| 654 | return m_position;
|
---|
| 655 | }
|
---|
| 656 | private:
|
---|
| 657 | void push(boost::uint32_t c)const
|
---|
| 658 | {
|
---|
| 659 | if(c > 0x10FFFFu)
|
---|
| 660 | detail::invalid_utf32_code_point(c);
|
---|
| 661 | if(c < 0x80u)
|
---|
| 662 | {
|
---|
| 663 | *m_position++ = static_cast<unsigned char>(c);
|
---|
| 664 | }
|
---|
| 665 | else if(c < 0x800u)
|
---|
| 666 | {
|
---|
| 667 | *m_position++ = static_cast<unsigned char>(0xC0u + (c >> 6));
|
---|
| 668 | *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
---|
| 669 | }
|
---|
| 670 | else if(c < 0x10000u)
|
---|
| 671 | {
|
---|
| 672 | *m_position++ = static_cast<unsigned char>(0xE0u + (c >> 12));
|
---|
| 673 | *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
|
---|
| 674 | *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
---|
| 675 | }
|
---|
| 676 | else
|
---|
| 677 | {
|
---|
| 678 | *m_position++ = static_cast<unsigned char>(0xF0u + (c >> 18));
|
---|
| 679 | *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 12) & 0x3Fu));
|
---|
| 680 | *m_position++ = static_cast<unsigned char>(0x80u + ((c >> 6) & 0x3Fu));
|
---|
| 681 | *m_position++ = static_cast<unsigned char>(0x80u + (c & 0x3Fu));
|
---|
| 682 | }
|
---|
| 683 | }
|
---|
| 684 | mutable BaseIterator m_position;
|
---|
| 685 | };
|
---|
| 686 |
|
---|
| 687 | } // namespace boost
|
---|
| 688 |
|
---|
| 689 | #endif // BOOST_REGEX_UNICODE_ITERATOR_HPP
|
---|