1 | /* |
---|
2 | * Licensed to the Apache Software Foundation (ASF) under one or more |
---|
3 | * contributor license agreements. See the NOTICE file distributed with |
---|
4 | * this work for additional information regarding copyright ownership. |
---|
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 |
---|
6 | * (the "License"); you may not use this file except in compliance with |
---|
7 | * the License. You may obtain a copy of the License at |
---|
8 | * |
---|
9 | * http://www.apache.org/licenses/LICENSE-2.0 |
---|
10 | * |
---|
11 | * Unless required by applicable law or agreed to in writing, software |
---|
12 | * distributed under the License is distributed on an "AS IS" BASIS, |
---|
13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
14 | * See the License for the specific language governing permissions and |
---|
15 | * limitations under the License. |
---|
16 | */ |
---|
17 | |
---|
18 | /* |
---|
19 | * $Id: XMLRecognizer.hpp 568078 2007-08-21 11:43:25Z amassari $ |
---|
20 | */ |
---|
21 | |
---|
22 | #if !defined(XMLRECOGNIZER_HPP) |
---|
23 | #define XMLRECOGNIZER_HPP |
---|
24 | |
---|
25 | #include <xercesc/util/XercesDefs.hpp> |
---|
26 | #include <xercesc/util/PlatformUtils.hpp> |
---|
27 | |
---|
28 | XERCES_CPP_NAMESPACE_BEGIN |
---|
29 | |
---|
30 | /** |
---|
31 | * This class provides some simple code to recognize the encodings of |
---|
32 | * XML files. This recognition only does very basic sensing of the encoding |
---|
33 | * in a broad sense. Basically it's just enough to let us get started and |
---|
34 | * read the XMLDecl line. The scanner, once it reads the XMLDecl, will |
---|
35 | * tell the reader any actual encoding string it found and the reader can |
---|
36 | * update itself to be more specific at that point. |
---|
37 | */ |
---|
38 | class XMLPARSER_EXPORT XMLRecognizer |
---|
39 | { |
---|
40 | public : |
---|
41 | // ----------------------------------------------------------------------- |
---|
42 | // Class types |
---|
43 | // |
---|
44 | // This enum represents the various encoding families that we have to |
---|
45 | // deal with individually at the scanner level. This does not indicate |
---|
46 | // the exact encoding, just the rough family that would let us scan |
---|
47 | // the XML/TextDecl to find the encoding string. |
---|
48 | // |
---|
49 | // The 'L's and 'B's stand for little or big endian. We conditionally |
---|
50 | // create versions that will automatically map to the local UTF-16 and |
---|
51 | // UCS-4 endian modes. |
---|
52 | // |
---|
53 | // OtherEncoding means that its some transcoder based encoding, i.e. not |
---|
54 | // one of the ones that we do internally. Its a special case and should |
---|
55 | // never be used directly outside of the reader. |
---|
56 | // |
---|
57 | // NOTE: Keep this in sync with the name map array in the Cpp file!! |
---|
58 | // ----------------------------------------------------------------------- |
---|
59 | enum Encodings |
---|
60 | { |
---|
61 | EBCDIC = 0 |
---|
62 | , UCS_4B = 1 |
---|
63 | , UCS_4L = 2 |
---|
64 | , US_ASCII = 3 |
---|
65 | , UTF_8 = 4 |
---|
66 | , UTF_16B = 5 |
---|
67 | , UTF_16L = 6 |
---|
68 | , XERCES_XMLCH = 7 |
---|
69 | |
---|
70 | , Encodings_Count |
---|
71 | , Encodings_Min = EBCDIC |
---|
72 | , Encodings_Max = XERCES_XMLCH |
---|
73 | |
---|
74 | , OtherEncoding = 999 |
---|
75 | |
---|
76 | #if defined(ENDIANMODE_BIG) |
---|
77 | , Def_UTF16 = UTF_16B |
---|
78 | , Def_UCS4 = UCS_4B |
---|
79 | #else |
---|
80 | , Def_UTF16 = UTF_16L |
---|
81 | , Def_UCS4 = UCS_4L |
---|
82 | #endif |
---|
83 | }; |
---|
84 | |
---|
85 | |
---|
86 | // ----------------------------------------------------------------------- |
---|
87 | // Public, const static data |
---|
88 | // |
---|
89 | // These are the byte sequences for each of the encodings that we can |
---|
90 | // auto sense, and their lengths. |
---|
91 | // ----------------------------------------------------------------------- |
---|
92 | static const char fgASCIIPre[]; |
---|
93 | static const unsigned int fgASCIIPreLen; |
---|
94 | static const XMLByte fgEBCDICPre[]; |
---|
95 | static const unsigned int fgEBCDICPreLen; |
---|
96 | static const XMLByte fgUTF16BPre[]; |
---|
97 | static const XMLByte fgUTF16LPre[]; |
---|
98 | static const unsigned int fgUTF16PreLen; |
---|
99 | static const XMLByte fgUCS4BPre[]; |
---|
100 | static const XMLByte fgUCS4LPre[]; |
---|
101 | static const unsigned int fgUCS4PreLen; |
---|
102 | static const char fgUTF8BOM[]; |
---|
103 | static const unsigned int fgUTF8BOMLen; |
---|
104 | |
---|
105 | |
---|
106 | // ----------------------------------------------------------------------- |
---|
107 | // Encoding recognition methods |
---|
108 | // ----------------------------------------------------------------------- |
---|
109 | static Encodings basicEncodingProbe |
---|
110 | ( |
---|
111 | const XMLByte* const rawBuffer |
---|
112 | , const unsigned int rawByteCount |
---|
113 | ); |
---|
114 | |
---|
115 | static Encodings encodingForName |
---|
116 | ( |
---|
117 | const XMLCh* const theEncName |
---|
118 | ); |
---|
119 | |
---|
120 | static const XMLCh* nameForEncoding(const Encodings theEncoding |
---|
121 | , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
---|
122 | |
---|
123 | |
---|
124 | protected : |
---|
125 | // ----------------------------------------------------------------------- |
---|
126 | // Unimplemented constructors, operators, and destructor |
---|
127 | // |
---|
128 | // This class is effectively being used as a namespace for some static |
---|
129 | // methods. |
---|
130 | // |
---|
131 | // (these functions are protected rather than private only to get rid of |
---|
132 | // some annoying compiler warnings.) |
---|
133 | // |
---|
134 | // ----------------------------------------------------------------------- |
---|
135 | XMLRecognizer(); |
---|
136 | ~XMLRecognizer(); |
---|
137 | |
---|
138 | private: |
---|
139 | // ----------------------------------------------------------------------- |
---|
140 | // Unimplemented constructors and operators |
---|
141 | // ----------------------------------------------------------------------- |
---|
142 | XMLRecognizer(const XMLRecognizer&); |
---|
143 | XMLRecognizer& operator=(const XMLRecognizer&); |
---|
144 | }; |
---|
145 | |
---|
146 | XERCES_CPP_NAMESPACE_END |
---|
147 | |
---|
148 | #endif |
---|