[2674] | 1 | /* |
---|
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more |
---|
| 3 | * contributor license agreements. See the NOTICE file distributed with |
---|
| 4 | * this work for additional information regarding copyright ownership. |
---|
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 |
---|
| 6 | * (the "License"); you may not use this file except in compliance with |
---|
| 7 | * the License. You may obtain a copy of the License at |
---|
| 8 | * |
---|
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 |
---|
| 10 | * |
---|
| 11 | * Unless required by applicable law or agreed to in writing, software |
---|
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, |
---|
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
| 14 | * See the License for the specific language governing permissions and |
---|
| 15 | * limitations under the License. |
---|
| 16 | */ |
---|
| 17 | |
---|
| 18 | /* |
---|
| 19 | * $Id: XMLRecognizer.hpp 568078 2007-08-21 11:43:25Z amassari $ |
---|
| 20 | */ |
---|
| 21 | |
---|
| 22 | #if !defined(XMLRECOGNIZER_HPP) |
---|
| 23 | #define XMLRECOGNIZER_HPP |
---|
| 24 | |
---|
| 25 | #include <xercesc/util/XercesDefs.hpp> |
---|
| 26 | #include <xercesc/util/PlatformUtils.hpp> |
---|
| 27 | |
---|
| 28 | XERCES_CPP_NAMESPACE_BEGIN |
---|
| 29 | |
---|
| 30 | /** |
---|
| 31 | * This class provides some simple code to recognize the encodings of |
---|
| 32 | * XML files. This recognition only does very basic sensing of the encoding |
---|
| 33 | * in a broad sense. Basically its just enough to let us get started and |
---|
| 34 | * read the XMLDecl line. The scanner, once it reads the XMLDecl, will |
---|
| 35 | * tell the reader any actual encoding string it found and the reader can |
---|
| 36 | * update itself to be more specific at that point. |
---|
| 37 | */ |
---|
| 38 | class XMLPARSER_EXPORT XMLRecognizer |
---|
| 39 | { |
---|
| 40 | public : |
---|
| 41 | // ----------------------------------------------------------------------- |
---|
| 42 | // Class types |
---|
| 43 | // |
---|
| 44 | // This enum represents the various encoding families that we have to |
---|
| 45 | // deal with individually at the scanner level. This does not indicate |
---|
| 46 | // the exact encoding, just the rough family that would let us scan |
---|
| 47 | // the XML/TextDecl to find the encoding string. |
---|
| 48 | // |
---|
| 49 | // The 'L's and 'B's stand for little or big endian. We conditionally |
---|
| 50 | // create versions that will automatically map to the local UTF-16 and |
---|
| 51 | // UCS-4 endian modes. |
---|
| 52 | // |
---|
| 53 | // OtherEncoding means that its some transcoder based encoding, i.e. not |
---|
| 54 | // one of the ones that we do internally. Its a special case and should |
---|
| 55 | // never be used directly outside of the reader. |
---|
| 56 | // |
---|
| 57 | // NOTE: Keep this in sync with the name map array in the Cpp file!! |
---|
| 58 | // ----------------------------------------------------------------------- |
---|
| 59 | enum Encodings |
---|
| 60 | { |
---|
| 61 | EBCDIC = 0 |
---|
| 62 | , UCS_4B = 1 |
---|
| 63 | , UCS_4L = 2 |
---|
| 64 | , US_ASCII = 3 |
---|
| 65 | , UTF_8 = 4 |
---|
| 66 | , UTF_16B = 5 |
---|
| 67 | , UTF_16L = 6 |
---|
| 68 | , XERCES_XMLCH = 7 |
---|
| 69 | |
---|
| 70 | , Encodings_Count |
---|
| 71 | , Encodings_Min = EBCDIC |
---|
| 72 | , Encodings_Max = XERCES_XMLCH |
---|
| 73 | |
---|
| 74 | , OtherEncoding = 999 |
---|
| 75 | |
---|
| 76 | #if defined(ENDIANMODE_BIG) |
---|
| 77 | , Def_UTF16 = UTF_16B |
---|
| 78 | , Def_UCS4 = UCS_4B |
---|
| 79 | #else |
---|
| 80 | , Def_UTF16 = UTF_16L |
---|
| 81 | , Def_UCS4 = UCS_4L |
---|
| 82 | #endif |
---|
| 83 | }; |
---|
| 84 | |
---|
| 85 | |
---|
| 86 | // ----------------------------------------------------------------------- |
---|
| 87 | // Public, const static data |
---|
| 88 | // |
---|
| 89 | // These are the byte sequences for each of the encodings that we can |
---|
| 90 | // auto sense, and their lengths. |
---|
| 91 | // ----------------------------------------------------------------------- |
---|
| 92 | static const char fgASCIIPre[]; |
---|
| 93 | static const unsigned int fgASCIIPreLen; |
---|
| 94 | static const XMLByte fgEBCDICPre[]; |
---|
| 95 | static const unsigned int fgEBCDICPreLen; |
---|
| 96 | static const XMLByte fgUTF16BPre[]; |
---|
| 97 | static const XMLByte fgUTF16LPre[]; |
---|
| 98 | static const unsigned int fgUTF16PreLen; |
---|
| 99 | static const XMLByte fgUCS4BPre[]; |
---|
| 100 | static const XMLByte fgUCS4LPre[]; |
---|
| 101 | static const unsigned int fgUCS4PreLen; |
---|
| 102 | static const char fgUTF8BOM[]; |
---|
| 103 | static const unsigned int fgUTF8BOMLen; |
---|
| 104 | |
---|
| 105 | |
---|
| 106 | // ----------------------------------------------------------------------- |
---|
| 107 | // Encoding recognition methods |
---|
| 108 | // ----------------------------------------------------------------------- |
---|
| 109 | static Encodings basicEncodingProbe |
---|
| 110 | ( |
---|
| 111 | const XMLByte* const rawBuffer |
---|
| 112 | , const unsigned int rawByteCount |
---|
| 113 | ); |
---|
| 114 | |
---|
| 115 | static Encodings encodingForName |
---|
| 116 | ( |
---|
| 117 | const XMLCh* const theEncName |
---|
| 118 | ); |
---|
| 119 | |
---|
| 120 | static const XMLCh* nameForEncoding(const Encodings theEncoding |
---|
| 121 | , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
---|
| 122 | |
---|
| 123 | |
---|
| 124 | protected : |
---|
| 125 | // ----------------------------------------------------------------------- |
---|
| 126 | // Unimplemented constructors, operators, and destructor |
---|
| 127 | // |
---|
| 128 | // This class is effectively being used as a namespace for some static |
---|
| 129 | // methods. |
---|
| 130 | // |
---|
| 131 | // (these functions are protected rather than private only to get rid of |
---|
| 132 | // some annoying compiler warnings.) |
---|
| 133 | // |
---|
| 134 | // ----------------------------------------------------------------------- |
---|
| 135 | XMLRecognizer(); |
---|
| 136 | ~XMLRecognizer(); |
---|
| 137 | |
---|
| 138 | private: |
---|
| 139 | // ----------------------------------------------------------------------- |
---|
| 140 | // Unimplemented constructors and operators |
---|
| 141 | // ----------------------------------------------------------------------- |
---|
| 142 | XMLRecognizer(const XMLRecognizer&); |
---|
| 143 | XMLRecognizer& operator=(const XMLRecognizer&); |
---|
| 144 | }; |
---|
| 145 | |
---|
| 146 | XERCES_CPP_NAMESPACE_END |
---|
| 147 | |
---|
| 148 | #endif |
---|