Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Normal
Revision Log

utf8_codecvt_facet.hpp @ 857

Revision 857, 7.2 KB checked in by igarcia, 19 years ago (diff)

Rev	Line
[857]	1	// Copyright © 2001 Ronald Garcia, Indiana University (garcia@osl.iu.edu)
	2	// Andrew Lumsdaine, Indiana University (lums@osl.iu.edu). Permission to copy,
	3	// use, modify, sell and distribute this software is granted provided this
	4	// copyright notice appears in all copies. This software is provided "as is"
	5	// without express or implied warranty, and with no claim as to its suitability
	6	// for any purpose.
	7
	8	#ifndef BOOST_UTF8_CODECVT_FACET_HPP
	9	#define BOOST_UTF8_CODECVT_FACET_HPP
	10
	11	// MS compatible compilers support #pragma once
	12	#if defined(_MSC_VER) && (_MSC_VER >= 1020)
	13	# pragma once
	14	#endif
	15
	16	/////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
	17	// utf8_codecvt_facet.hpp
	18
	19	// This header defines class utf8_codecvt_facet, derived from
	20	// std::codecvt<wchar_t, char>, which can be used to convert utf8 data in
	21	// files into wchar_t strings in the application.
	22	//
	23	// The header is NOT STANDALONE, and is not to be included by the USER.
	24	// There are at least two libraries which want to use this functionality, and
	25	// we want to avoid code duplication. It would be possible to create utf8
	26	// library, but:
	27	// - this requires review process first
	28	// - in the case when linking to a library which uses utf8
	29	// (say 'program_options'), the user should also link to the utf8 library.
	30	// This seems inconvenient, and asking a user to link to an unreviewed
	31	// library is strange.
	32	// Until the above points are fixed, a library which wants to use utf8 must:
	33	// - include this header from one of its headers or sources
	34	// - include the corresponding .cpp file from one of the sources
	35	// - before including either file, the library must define
	36	// - BOOST_UTF8_BEGIN_NAMESPACE to the namespace declaration that must be used
	37	// - BOOST_UTF8_END_NAMESPACE to the code to close the previous namespace
	38	//   declaration.
	39	// - BOOST_UTF8_DECL -- to the code which must be used for all 'exportable'
	40	// symbols.
	41	//
	42	// For example, program_options library might contain:
	43	// #define BOOST_UTF8_BEGIN_NAMESPACE <backslash character>
	44	// namespace boost { namespace program_options {
	45	// #define BOOST_UTF8_END_NAMESPACE }}
	46	// #define BOOST_UTF8_DECL BOOST_PROGRAM_OPTIONS_DECL
	47	// #include "../../detail/utf8/utf8_codecvt.cpp"
	48	//
	49	// Essentially, each library will have its own copy of utf8 code, in
	50	// different namespaces.
	51
	52	// Note:(Robert Ramey). I have made the following alterations in the original
	53	// code.
	54	// a) Rendered utf8_codecvt<wchar_t, char> with using templates
	55	// b) Move longer functions outside class definition to prevent inlining
	56	// and make code smaller
	57	// c) added on a derived class to permit translation to/from current
	58	// locale to utf8
	59
	60	// See http://www.boost.org for updates, documentation, and revision history.
	61
	62	// archives stored as text - note these are templated on the basic
	63	// stream templates to accommodate wide (and other?) kinds of characters
	64	//
	65	// note the fact that on libraries without wide characters, ostream
	66	// is not a specialization of basic_ostream which in fact is not defined
	67	// in such cases. So we can't use basic_ostream<OStream::char_type> but rather
	68	// use two template parameters
	69	//
	70	// utf8_codecvt_facet
	71	// This is an implementation of a std::codecvt facet for translating
	72	// from UTF-8 externally to UCS-4. Note that this is not tied to
	73	// any specific types in order to allow customization on platforms
	74	// where wchar_t is not big enough.
	75	//
	76	// NOTES: The current implementation jumps through some unpleasant hoops in
	77	// order to deal with signed character types. As a result,
	78	// it is necessary for the ExternType to be convertible to unsigned char.
	79	// I chose not to tie the extern_type explicitly to char. But if any combination
	80	// of types other than <wchar_t,char_t> is used, then std::codecvt must be
	81	// specialized on those types for this to work.
	82
	83	#include <locale>
	84	// for mbstate_t
	85	#include <wchar.h>
	86	// for std::size_t
	87	#include <cstddef>
	88
	89	#include <boost/config.hpp>
	90	#include <boost/detail/workaround.hpp>
	91
	// Imports global-namespace names into namespace std for selected standard
	// library configurations, so that the std::-qualified names used later in
	// this header resolve:
	//  - __LIBCOMO__ or BOOST_DINKUMWARE_STDLIB: import ::mbstate_t
	//  - __SGI_STL_PORT:                         import nothing
	//  - BOOST_NO_STDC_NAMESPACE:                import ::mbstate_t and ::codecvt
	// The branches are tested in that order; the first match wins.
	namespace std {
	#if defined(__LIBCOMO__) || defined(BOOST_DINKUMWARE_STDLIB)
		using ::mbstate_t;
	#elif defined(__SGI_STL_PORT)
		// nothing is imported for this configuration
	#elif defined(BOOST_NO_STDC_NAMESPACE)
		using ::mbstate_t;
		using ::codecvt;
	#endif
	} // namespace std
	103
	// Qualifier placed in front of the std::mbstate_t& parameter of
	// do_length(): expands to "const" unless __MSL_CPP__ or __LIBCOMO__ is
	// defined, in which case it expands to nothing.
	#if !defined(__MSL_CPP__) && !defined(__LIBCOMO__)
		#define BOOST_CODECVT_DO_LENGTH_CONST const
	#else
		#define BOOST_CODECVT_DO_LENGTH_CONST
	#endif

	// maximum length of a multibyte string
	#define MB_LENGTH_MAX 8
	112
	113	BOOST_UTF8_BEGIN_NAMESPACE
	114
	115	struct BOOST_UTF8_DECL utf8_codecvt_facet :
	116	public std::codecvt<wchar_t, char, std::mbstate_t>
	117	{
	118	public:
	119	explicit utf8_codecvt_facet(std::size_t no_locale_manage=0)
	120	: std::codecvt<wchar_t, char, std::mbstate_t>(no_locale_manage)
	121	{}
	122	protected:
	123	virtual std::codecvt_base::result do_in(
	124	std::mbstate_t& state,
	125	const char * from,
	126	const char * from_end,
	127	const char * & from_next,
	128	wchar_t * to,
	129	wchar_t * to_end,
	130	wchar_t*& to_next
	131	) const;
	132
	133	virtual std::codecvt_base::result do_out(
	134	std::mbstate_t & state, const wchar_t * from,
	135	const wchar_t * from_end, const wchar_t* & from_next,
	136	char * to, char * to_end, char * & to_next
	137	) const;
	138
	139	bool invalid_continuing_octet(unsigned char octet_1) const {
	140	return (octet_1 < 0x80\|\| 0xbf< octet_1);
	141	}
	142
	143	bool invalid_leading_octet(unsigned char octet_1) const {
	144	return (0x7f < octet_1 && octet_1 < 0xc0) \|\|
	145	(octet_1 > 0xfd);
	146	}
	147
	148	// continuing octets = octets except for the leading octet
	149	static unsigned int get_cont_octet_count(unsigned char lead_octet) {
	150	return get_octet_count(lead_octet) - 1;
	151	}
	152
	153	static unsigned int get_octet_count(unsigned char lead_octet);
	154
	155	// How many "continuing octets" will be needed for this word
	156	// == total octets - 1.
	157	int get_cont_octet_out_count(wchar_t word) const ;
	158
	159	virtual bool do_always_noconv() const throw() { return false; }
	160
	161	// UTF-8 isn't really stateful since we rewind on partial conversions
	162	virtual std::codecvt_base::result do_unshift(
	163	std::mbstate_t&,
	164	char * from,
	165	char * to,
	166	char * & next
	167	) const
	168	{
	169	next = from;
	170	return ok;
	171	}
	172
	173	virtual int do_encoding() const throw() {
	174	const int variable_byte_external_encoding=0;
	175	return variable_byte_external_encoding;
	176	}
	177
	178	// How many char objects can I process to get <= max_limit
	179	// wchar_t objects?
	180	virtual int do_length(
	181	BOOST_CODECVT_DO_LENGTH_CONST std::mbstate_t &,
	182	const char * from,
	183	const char * from_end,
	184	std::size_t max_limit
	185	#if BOOST_WORKAROUND(__IBMCPP__, BOOST_TESTED_AT(600))
	186	) const throw();
	187	#else
	188	) const;
	189	#endif
	190
	191	// Largest possible value do_length(state,from,from_end,1) could return.
	192	virtual int do_max_length() const throw () {
	193	return 6; // largest UTF-8 encoding of a UCS-4 character
	194	}
	195	};
	196
	197	BOOST_UTF8_END_NAMESPACE
	198
	199	#endif // BOOST_UTF8_CODECVT_FACET_HPP

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: NonGTP/Boost/boost/detail/utf8_codecvt_facet.hpp @ 857

Download in other formats: