Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

utf8_codecvt_facet.hpp @ 857

Revision 857, 7.2 KB checked in by igarcia, 18 years ago (diff)

Line
1	// Copyright © 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
2	// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). Permission to copy,
3	// use, modify, sell and distribute this software is granted provided this
4	// copyright notice appears in all copies. This software is provided "as is"
5	// without express or implied warranty, and with no claim as to its suitability
6	// for any purpose.
7
8	#ifndef BOOST_UTF8_CODECVT_FACET_HPP
9	#define BOOST_UTF8_CODECVT_FACET_HPP
10
11	// MS compatible compilers support #pragma once
12	#if defined(_MSC_VER) && (_MSC_VER >= 1020)
13	# pragma once
14	#endif
15
16	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
17	// utf8_codecvt_facet.hpp
18
19	// This header defines class utf8_codecvt_facet, derived from
20	// std::codecvt<wchar_t, char>, which can be used to convert utf8 data in
21	// files into wchar_t strings in the application.
22	//
23	// The header is NOT STANDALONE, and is not to be included by the USER.
24	// There are at least two libraries which want to use this functionality, and
25	// we want to avoid code duplication. It would be possible to create utf8
26	// library, but:
27	// - this requires review process first
28	// - in the case, when linking a library which uses utf8
29	// (say 'program_options'), user should also link to the utf8 library.
30	// This seems inconvenient, and asking a user to link to an unreviewed
31	// library is strange.
32	// Until the above points are fixed, a library which wants to use utf8 must:
33	// - include this header from one of its headers or sources
34	// - include the corresponding .cpp file from one of the sources
35	// - before including either file, the library must define
36	// - BOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used
37	// - BOOST_UTF8_END_NAMESPACE to the code to close the previous namespace
38	// - declaration.
39	// - BOOST_UTF8_DECL -- to the code which must be used for all 'exportable'
40	// symbols.
41	//
42	// For example, program_options library might contain:
43	// #define BOOST_UTF8_BEGIN_NAMESPACE <backslash character>
44	// namespace boost { namespace program_options {
45	// #define BOOST_UTF8_END_NAMESPACE }}
46	// #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL
47	// #include "../../detail/utf8/utf8_codecvt.cpp"
48	//
49	// Essentially, each library will have its own copy of utf8 code, in
50	// different namespaces.
51
52	// Note:(Robert Ramey). I have made the following alterations in the original
53	// code.
54	// a) Rendered utf8_codecvt<wchar_t, char> with using templates
55	// b) Move longer functions outside class definition to prevent inlining
56	// and make code smaller
57	// c) added on a derived class to permit translation to/from current
58	// locale to utf8
59
60	// See http://www.boost.org for updates, documentation, and revision history.
61
62	// archives stored as text - note these are templated on the basic
63	// stream templates to accommodate wide (and other?) kind of characters
64	//
65	// note the fact that on libraries without wide characters, ostream is
66	// not a specialization of basic_ostream which in fact is not defined
67	// in such cases. So we can't use basic_ostream<OStream::char_type> but rather
68	// use two template parameters
69	//
70	// utf8_codecvt_facet
71	// This is an implementation of a std::codecvt facet for translating
72	// from UTF-8 externally to UCS-4. Note that this is not tied to
73	// any specific types in order to allow customization on platforms
74	// where wchar_t is not big enough.
75	//
76	// NOTES: The current implementation jumps through some unpleasant hoops in
77	// order to deal with signed character types. As a std::codecvt_base::result,
78	// it is necessary for the ExternType to be convertible to unsigned char.
79	// I chose not to tie the extern_type explicitly to char. But if any combination
80	// of types other than <wchar_t,char_t> is used, then std::codecvt must be
81	// specialized on those types for this to work.
82
83	#include <locale>
84	// for mbstate_t
85	#include <wchar.h>
86	// for std::size_t
87	#include <cstddef>
88
89	#include <boost/config.hpp>
90	#include <boost/detail/workaround.hpp>
91
92	namespace std {
93	#if defined(__LIBCOMO__)
94	using ::mbstate_t;
95	#elif defined(BOOST_DINKUMWARE_STDLIB)
96	using ::mbstate_t;
97	#elif defined(__SGI_STL_PORT)
98	#elif defined(BOOST_NO_STDC_NAMESPACE)
99	using ::mbstate_t;
100	using ::codecvt;
101	#endif
102	} // namespace std
103
104	#if !defined(__MSL_CPP__) && !defined(__LIBCOMO__)
105	#define BOOST_CODECVT_DO_LENGTH_CONST const
106	#else
107	#define BOOST_CODECVT_DO_LENGTH_CONST
108	#endif
109
110	// maximum length of a multibyte string
111	#define MB_LENGTH_MAX 8
112
113	BOOST_UTF8_BEGIN_NAMESPACE
114
115	struct BOOST_UTF8_DECL utf8_codecvt_facet :
116	public std::codecvt<wchar_t, char, std::mbstate_t>
117	{
118	public:
119	explicit utf8_codecvt_facet(std::size_t no_locale_manage=0)
120	: std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
121	{}
122	protected:
123	virtual std::codecvt_base::result do_in(
124	std::mbstate_t& state,
125	const char * from,
126	const char * from_end,
127	const char * & from_next,
128	wchar_t * to,
129	wchar_t * to_end,
130	wchar_t*& to_next
131	) const;
132
133	virtual std::codecvt_base::result do_out(
134	std::mbstate_t & state, const wchar_t * from,
135	const wchar_t * from_end, const wchar_t* & from_next,
136	char * to, char * to_end, char * & to_next
137	) const;
138
139	bool invalid_continuing_octet(unsigned char octet_1) const {
140	return (octet_1 < 0x80\|\| 0xbf< octet_1);
141	}
142
143	bool invalid_leading_octet(unsigned char octet_1) const {
144	return (0x7f < octet_1 && octet_1 < 0xc0) \|\|
145	(octet_1 > 0xfd);
146	}
147
148	// continuing octets = octets except for the leading octet
149	static unsigned int get_cont_octet_count(unsigned char lead_octet) {
150	return get_octet_count(lead_octet) - 1;
151	}
152
153	static unsigned int get_octet_count(unsigned char lead_octet);
154
155	// How many "continuing octets" will be needed for this word
156	// == total octets - 1.
157	int get_cont_octet_out_count(wchar_t word) const ;
158
159	virtual bool do_always_noconv() const throw() { return false; }
160
161	// UTF-8 isn't really stateful since we rewind on partial conversions
162	virtual std::codecvt_base::result do_unshift(
163	std::mbstate_t&,
164	char * from,
165	char * to,
166	char * & next
167	) const
168	{
169	next = from;
170	return ok;
171	}
172
173	virtual int do_encoding() const throw() {
174	const int variable_byte_external_encoding=0;
175	return variable_byte_external_encoding;
176	}
177
178	// How many char objects can I process to get <= max_limit
179	// wchar_t objects?
180	virtual int do_length(
181	BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &,
182	const char * from,
183	const char * from_end,
184	std::size_t max_limit
185	#if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
186	) const throw();
187	#else
188	) const;
189	#endif
190
191	// Largest possible value do_length(state,from,from_end,1) could return.
192	virtual int do_max_length() const throw () {
193	return 6; // largest UTF-8 encoding of a UCS-4 character
194	}
195	};
196
197	BOOST_UTF8_END_NAMESPACE
198
199	#endif // BOOST_UTF8_CODECVT_FACET_HPP

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: NonGTP/Boost/boost/detail/utf8_codecvt_facet.hpp @ 857

Download in other formats: