source: NonGTP/Xerces/xerces/include/xercesc/framework/XMLRecognizer.hpp @ 358

Revision 358, 5.2 KB checked in by bittner, 19 years ago (diff)

xerces added

Line 
1/*
2 * Copyright 1999-2000,2004 The Apache Software Foundation.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 *  $Id: XMLRecognizer.hpp,v 1.7 2004/09/08 13:55:59 peiyongz Exp $
19 */
20
21#if !defined(XMLRECOGNIZER_HPP)
22#define XMLRECOGNIZER_HPP
23
24#include <xercesc/util/XercesDefs.hpp>
25#include <xercesc/util/PlatformUtils.hpp>
26
27XERCES_CPP_NAMESPACE_BEGIN
28
29/**
30 *  This class provides some simple code to recognize the encodings of
31 *  XML files. This recognition only does very basic sensing of the encoding
32 *  in a broad sense. Basically it's just enough to let us get started and
33 *  read the XMLDecl line. The scanner, once it reads the XMLDecl, will
34 *  tell the reader any actual encoding string it found and the reader can
35 *  update itself to be more specific at that point.
 *
 *  Everything public here is static (an enum, constant data and three
 *  functions), while the constructors, destructor and assignment operator
 *  are declared non-public, so the class serves as a namespace for those
 *  statics rather than as a type callers create instances of.
36 */
37class XMLPARSER_EXPORT XMLRecognizer
38{
39public :
40    // -----------------------------------------------------------------------
41    //  Class types
42    //
43    //  This enum represents the various encoding families that we have to
44    //  deal with individually at the scanner level. This does not indicate
45    //  the exact encoding, just the rough family that would let us scan
46    //  the XML/TextDecl to find the encoding string.
47    //
48    //  The 'L's and 'B's stand for little or big endian. We conditionally
49    //  create versions that will automatically map to the local UTF-16 and
50    //  UCS-4 endian modes.
51    //
52    //  OtherEncoding means that it's some transcoder based encoding, i.e. not
53    //  one of the ones that we do internally. It's a special case and should
54    //  never be used directly outside of the reader.
55    //
56    //  NOTE: Keep this in sync with the name map array in the Cpp file!!
      //
      //  The family values are contiguous from 0 to 7, so Encodings_Count,
      //  which has no explicit value, takes the next value (8).
57    // -----------------------------------------------------------------------
58    enum Encodings
59    {
60        EBCDIC          = 0
61        , UCS_4B        = 1
62        , UCS_4L        = 2
63        , US_ASCII      = 3
64        , UTF_8         = 4
65        , UTF_16B       = 5
66        , UTF_16L       = 6
67        , XERCES_XMLCH  = 7
68
69        , Encodings_Count                   // implicitly XERCES_XMLCH + 1, i.e. 8
70        , Encodings_Min = EBCDIC            // alias of the lowest family value (0)
71        , Encodings_Max = XERCES_XMLCH      // alias of the highest family value (7)
72
73        , OtherEncoding = 999               // outside the Encodings_Min..Encodings_Max range
74
          // Def_UTF16 / Def_UCS4 alias the big-endian members when
          // ENDIANMODE_BIG is defined and the little-endian members otherwise.
75        #if defined(ENDIANMODE_BIG)
76        , Def_UTF16     = UTF_16B
77        , Def_UCS4      = UCS_4B
78        #else
79        , Def_UTF16     = UTF_16L
80        , Def_UCS4      = UCS_4L
81        #endif
82    };
83
84
85    // -----------------------------------------------------------------------
86    //  Public, const static data
87    //
88    //  These are the byte sequences for each of the encodings that we can
89    //  auto sense, and their lengths.
      //
      //  Only the declarations appear in this header; the array contents and
      //  length values are defined elsewhere. The ASCII prefix and the UTF-8
      //  BOM are declared as char arrays, the others as XMLByte arrays.
      //
      //  NOTE(review): there is a single length member for the UTF-16 B/L pair
      //  and a single one for the UCS-4 B/L pair -- presumably each pair of
      //  sequences has the same length; confirm against the definitions.
90    // -----------------------------------------------------------------------
91    static const char           fgASCIIPre[];
92    static const unsigned int   fgASCIIPreLen;
93    static const XMLByte        fgEBCDICPre[];
94    static const unsigned int   fgEBCDICPreLen;
95    static const XMLByte        fgUTF16BPre[];
96    static const XMLByte        fgUTF16LPre[];
97    static const unsigned int   fgUTF16PreLen;
98    static const XMLByte        fgUCS4BPre[];
99    static const XMLByte        fgUCS4LPre[];
100    static const unsigned int   fgUCS4PreLen;
101    static const char           fgUTF8BOM[];
102    static const unsigned int   fgUTF8BOMLen;
103
104
105    // -----------------------------------------------------------------------
106    //  Encoding recognition methods
       //
       //  All three are static. Their implementations are not part of this
       //  header, so the notes below describe only what the signatures show.
107    // -----------------------------------------------------------------------

       //  Takes a pointer to raw bytes plus a byte count and returns one of the
       //  Encodings values.
       //  NOTE(review): presumably matches the start of rawBuffer against the
       //  fg*Pre / fgUTF8BOM sequences declared above -- confirm in the Cpp
       //  file, including what is returned when nothing matches.
108    static Encodings basicEncodingProbe
109    (
110        const   XMLByte* const      rawBuffer
111        , const unsigned int        rawByteCount
112    );
113
       //  Takes an encoding name as an XMLCh string and returns one of the
       //  Encodings values.
       //  NOTE(review): presumably unrecognized names yield OtherEncoding --
       //  confirm in the Cpp file.
114    static Encodings encodingForName
115    (
116        const   XMLCh* const    theEncName
117    );
118
       //  Takes an Encodings value and returns an XMLCh string for it. The
       //  manager argument is optional and defaults to
       //  XMLPlatformUtils::fgMemoryManager.
       //  NOTE(review): how manager is used, who owns the returned string and
       //  what happens for values outside the name map are not visible here --
       //  confirm in the Cpp file.
119    static const XMLCh* nameForEncoding(const Encodings theEncoding
120        , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
121
122
123protected :
124    // -----------------------------------------------------------------------
125    //  Unimplemented constructors, operators, and destructor
126    //
127    //  This class is effectively being used as a namespace for some static
128    //  methods.
129    //
130    //   (these functions are protected rather than private only to get rid of
131    //    some annoying compiler warnings.)
132    //
133    // -----------------------------------------------------------------------
134    XMLRecognizer();
135    ~XMLRecognizer();
136
137private:
138    // -----------------------------------------------------------------------
139    //  Unimplemented constructors and operators
       //
       //  Declaring the copy constructor and copy assignment operator private
       //  makes them inaccessible to code outside the class.
140    // -----------------------------------------------------------------------
141    XMLRecognizer(const XMLRecognizer&);   
142    XMLRecognizer& operator=(const XMLRecognizer&);
143};
144
145XERCES_CPP_NAMESPACE_END
146
147#endif
Note: See TracBrowser for help on using the repository browser.