source: NonGTP/Xerces/xerces/include/xercesc/internal/XMLReader.hpp @ 358

Revision 358, 32.2 KB checked in by bittner, 19 years ago (diff)

xerces added

Line 
1/*
2 * Copyright 1999-2001,2004 The Apache Software Foundation.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17/*
18 * $Log: XMLReader.hpp,v $
19 * Revision 1.20  2004/09/29 00:24:01  knoaman
20 * Performance: improve src offset calculation. Patch by Anthony O'Dowd.
21 *
22 * Revision 1.19  2004/09/08 13:56:14  peiyongz
23 * Apache License Version 2.0
24 *
25 * Revision 1.18  2004/06/15 15:51:21  peiyongz
26 * Compilation error on (AIX, Solaris) solved
27 *
28 * Revision 1.17  2004/06/14 15:18:53  peiyongz
29 * Consolidated End Of Line Handling
30 *
31 * Revision 1.16  2004/06/03 15:38:27  peiyongz
32 * XML1.1:  The characters #x85 and #x2028 cannot be reliably recognized
33 * and translated until an entity's encoding declaration (if present) has been
34 * read. Therefore, it is a fatal error to use them within the XML declaration or
35 * text declaration.
36 *
37 * Revision 1.15  2004/01/29 11:46:30  cargilld
38 * Code cleanup changes to get rid of various compiler diagnostic messages.
39 *
40 * Revision 1.14  2003/05/16 21:36:58  knoaman
41 * Memory manager implementation: Modify constructors to pass in the memory manager.
42 *
43 * Revision 1.13  2003/05/15 18:26:29  knoaman
44 * Partial implementation of the configurable memory manager.
45 *
46 * Revision 1.12  2003/01/27 16:50:27  knoaman
47 * some cleanup.
48 *
49 * Revision 1.11  2002/12/20 22:09:56  tng
50 * XML 1.1
51 *
52 * Revision 1.10  2002/12/11 22:09:08  knoaman
53 * Performance: reduce instructions count.
54 *
55 * Revision 1.9  2002/12/03 15:31:19  knoaman
56 * Enable/disable calculation of src offset.
57 *
58 * Revision 1.8  2002/12/02 17:20:05  knoaman
59 * Remove unused data member.
60 *
61 * Revision 1.7  2002/11/28 19:19:12  knoaman
62 * Performance: remove unnecessary if condition.
63 *
64 * Revision 1.6  2002/11/28 18:17:22  knoaman
65 * Performance: make getNextChar/peekNextChar inline.
66 *
67 * Revision 1.5  2002/11/25 21:31:08  tng
68 * Performance:
69 * 1. use XMLRecognizer::Encodings enum to make new transcode, faster than comparing the encoding string every time.
70 * 2. Pre uppercase the encodingString before calling encodingForName to avoid calling compareIString
71 *
72 * Revision 1.4  2002/11/04 14:58:19  tng
73 * C++ Namespace Support.
74 *
75 * Revision 1.3  2002/09/27 12:56:23  tng
76 * [Bug 12740] Extra include.  By Peter Volchek.
77 *
78 * Revision 1.2  2002/05/27 18:42:14  tng
79 * To get ready for 64 bit large file, use XMLSSize_t to represent line and column number.
80 *
81 * Revision 1.1.1.1  2002/02/01 22:22:02  peiyongz
82 * sane_include
83 *
84 * Revision 1.18  2001/12/06 17:47:04  tng
85 * Performance Enhancement.  Modify the handling of the fNEL option so that it results in fgCharCharsTable being modified, instead of having all of the low-level routines check the option.  This seemed acceptable because the code appears to only permit the option to be turned on and not turned off again.   By Henry Zongaro.
86 *
87 * Revision 1.17  2001/07/12 18:50:13  tng
88 * Some performance modification regarding standalone check and xml decl check.
89 *
90 * Revision 1.16  2001/05/11 13:26:17  tng
91 * Copyright update.
92 *
93 * Revision 1.15  2001/05/03 18:42:51  knoaman
94 * Added new option to the parsers so that the NEL (0x85) char can be treated as a newline character.
95 *
96 * Revision 1.14  2001/01/25 19:16:58  tng
97 * const should be used instead of static const.  Fixed by Khaled Noaman.
98 *
99 * Revision 1.13  2000/07/25 22:33:05  aruna1
100 * Char definitions in XMLUni moved to XMLUniDefs
101 *
102 * Revision 1.12  2000/07/08 00:17:13  andyh
103 * Cleanup of yesterday's speedup changes.  Merged new bit into the
104 * scanner character properties table.
105 *
106 * Revision 1.11  2000/07/07 01:08:44  andyh
107 * Parser speed up in scan of XML content.
108 *
109 * Revision 1.10  2000/07/06 21:00:52  jpolast
110 * inlined getNextCharIfNot() for better performance
111 *
112 * Revision 1.9  2000/05/11 23:11:33  andyh
113 * Add missing validity checks for stand-alone documents, character range
114 * and Well-formed parsed entities.  Changes contributed by Sean MacRoibeaird
115 * <sean.Macroibeaird@ireland.sun.com>
116 *
117 * Revision 1.8  2000/03/02 19:54:29  roddey
118 * This checkin includes many changes done while waiting for the
119 * 1.1.0 code to be finished. I can't list them all here, but a list is
120 * available elsewhere.
121 *
122 * Revision 1.7  2000/02/24 20:18:07  abagchi
123 * Swat for removing Log from API docs
124 *
125 * Revision 1.6  2000/02/06 07:47:53  rahulj
126 * Year 2K copyright swat.
127 *
128 * Revision 1.5  2000/01/25 01:04:21  roddey
129 * Fixes a bogus error about ]]> in char data.
130 *
131 * Revision 1.4  2000/01/22 00:01:08  roddey
132 * Simple change to get rid of two hard coded 'x' type characters, which won't
133 * work on EBCDIC systems.
134 *
135 * Revision 1.3  1999/12/18 00:20:00  roddey
136 * More changes to support the new, completely orthagonal, support for
137 * intrinsic encodings.
138 *
139 * Revision 1.2  1999/12/15 19:48:03  roddey
140 * Changed to use new split of transcoder interfaces into XML transcoders and
141 * LCP transcoders, and implementation of intrinsic transcoders as pluggable
142 * transcoders, and addition of Latin1 intrinsic support.
143 *
144 * Revision 1.1.1.1  1999/11/09 01:08:22  twl
145 * Initial checkin
146 *
147 * Revision 1.3  1999/11/08 20:44:47  rahul
148 * Swat for adding in Product name and CVS comment log variable.
149 *
150 */
151
152#if !defined(XMLREADER_HPP)
153#define XMLREADER_HPP
154
155#include <xercesc/util/XMLChar.hpp>
156#include <xercesc/framework/XMLRecognizer.hpp>
157#include <xercesc/framework/XMLBuffer.hpp>
158#include <xercesc/util/TranscodingException.hpp>
159
160XERCES_CPP_NAMESPACE_BEGIN
161
162class InputSource;
163class BinInputStream;
164class ReaderMgr;
165class XMLScanner;
166class XMLTranscoder;
167
168
// ---------------------------------------------------------------------------
//  Instances of this class are used to manage the content of entities. The
//  scanner maintains a stack of these, one for each entity (this means entity
//  in the sense of any parsed file or internal entity) currently being
//  scanned. This class, given a binary input stream, will handle reading in
//  the data and decoding it from its external encoding into the internal
//  Unicode format. Once internalized, this class provides the access
//  methods to read in the data in various ways, maintains line and column
//  information, and provides high performance character attribute checking
//  methods.
//
//  This is NOT to be derived from.
//
// ---------------------------------------------------------------------------
183class XMLPARSER_EXPORT XMLReader : public XMemory
184{
185public:
186    // -----------------------------------------------------------------------
187    //  Public types
188    // -----------------------------------------------------------------------
189    enum Types
190    {
191        Type_PE
192        , Type_General
193    };
194
195    enum Sources
196    {
197        Source_Internal
198        , Source_External
199    };
200
201    enum RefFrom
202    {
203        RefFrom_Literal
204        , RefFrom_NonLiteral
205    };
206
207    enum XMLVersion
208    {
209        XMLV1_0
210        , XMLV1_1
211        , XMLV_Unknown
212    };
213
214
215    // -----------------------------------------------------------------------
216    //  Public, query methods
217    // -----------------------------------------------------------------------
218    bool isAllSpaces
219    (
220        const   XMLCh* const    toCheck
221        , const unsigned int    count
222    );
223
224    bool containsWhiteSpace
225    (
226        const   XMLCh* const    toCheck
227        , const unsigned int    count
228    );
229
230
231    bool isXMLLetter(const XMLCh toCheck);
232    bool isFirstNameChar(const XMLCh toCheck);
233    bool isNameChar(const XMLCh toCheck);
234    bool isPlainContentChar(const XMLCh toCheck);
235    bool isSpecialStartTagChar(const XMLCh toCheck);
236    bool isXMLChar(const XMLCh toCheck);
237    bool isWhitespace(const XMLCh toCheck);
238    bool isControlChar(const XMLCh toCheck);
239    bool isPublicIdChar(const XMLCh toCheck);
240
241    // -----------------------------------------------------------------------
242    //  Constructors and Destructor
243    // -----------------------------------------------------------------------
244    XMLReader
245    (
246        const   XMLCh* const          pubId
247        , const XMLCh* const          sysId
248        ,       BinInputStream* const streamToAdopt
249        , const RefFrom               from
250        , const Types                 type
251        , const Sources               source
252        , const bool                  throwAtEnd = false
253        , const bool                  calculateSrcOfs = true
254        , const XMLVersion            xmlVersion = XMLV1_0
255        ,       MemoryManager* const  manager = XMLPlatformUtils::fgMemoryManager
256    );
257
258    XMLReader
259    (
260        const   XMLCh* const          pubId
261        , const XMLCh* const          sysId
262        ,       BinInputStream* const streamToAdopt
263        , const XMLCh* const          encodingStr
264        , const RefFrom               from
265        , const Types                 type
266        , const Sources               source
267        , const bool                  throwAtEnd = false
268        , const bool                  calculateSrcOfs = true
269        , const XMLVersion            xmlVersion = XMLV1_0
270        ,       MemoryManager* const  manager = XMLPlatformUtils::fgMemoryManager
271    );
272
273    XMLReader
274    (
275        const   XMLCh* const          pubId
276        , const XMLCh* const          sysId
277        ,       BinInputStream* const streamToAdopt
278        , XMLRecognizer::Encodings    encodingEnum
279        , const RefFrom               from
280        , const Types                 type
281        , const Sources               source
282        , const bool                  throwAtEnd = false
283        , const bool                  calculateSrcOfs = true
284        , const XMLVersion            xmlVersion = XMLV1_0
285        ,       MemoryManager* const  manager = XMLPlatformUtils::fgMemoryManager
286    );
287
288    ~XMLReader();
289
290
291    // -----------------------------------------------------------------------
292    //  Character buffer management methods
293    // -----------------------------------------------------------------------
294    unsigned long charsLeftInBuffer() const;
295    bool refreshCharBuffer();
296
297
298    // -----------------------------------------------------------------------
299    //  Scanning methods
300    // -----------------------------------------------------------------------
301    bool getName(XMLBuffer& toFill, const bool token);
302    bool getNextChar(XMLCh& chGotten);
303    bool getNextCharIfNot(const XMLCh chNotToGet, XMLCh& chGotten);
304    void movePlainContentChars(XMLBuffer &dest);
305    bool getSpaces(XMLBuffer& toFill);
306    bool getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck);
307    bool peekNextChar(XMLCh& chGotten);
308    bool skipIfQuote(XMLCh& chGotten);
309    bool skipSpaces(bool& skippedSomething, bool inDecl = false);
310    bool skippedChar(const XMLCh toSkip);
311    bool skippedSpace();
312    bool skippedString(const XMLCh* const toSkip);
313    bool peekString(const XMLCh* const toPeek);
314
315
316    // -----------------------------------------------------------------------
317    //  Getter methods
318    // -----------------------------------------------------------------------
319    XMLSSize_t getColumnNumber() const;
320    const XMLCh* getEncodingStr() const;
321    XMLSSize_t getLineNumber() const;
322    bool getNoMoreFlag() const;
323    const XMLCh* getPublicId() const;
324    unsigned int getReaderNum() const;
325    RefFrom getRefFrom() const;
326    Sources getSource() const;
327    unsigned int getSrcOffset() const;
328    const XMLCh* getSystemId() const;
329    bool getThrowAtEnd() const;
330    Types getType() const;
331
332
333    // -----------------------------------------------------------------------
334    //  Setter methods
335    // -----------------------------------------------------------------------
336    bool setEncoding
337    (
338        const   XMLCh* const    newEncoding
339    );
340    void setReaderNum(const unsigned int newNum);
341    void setThrowAtEnd(const bool newValue);
342    void setXMLVersion(const XMLVersion version);
343
344
345private:
346    // -----------------------------------------------------------------------
347    //  Unimplemented constructors and operators
348    // -----------------------------------------------------------------------
349    XMLReader(const XMLReader&);
350    XMLReader& operator=(const XMLReader&);
351
352    // ---------------------------------------------------------------------------
353    //  Class Constants
354    //
355    //  kCharBufSize
356    //      The size of the character spool buffer that we use. Its not terribly
357    //      large because its just getting filled with data from a raw byte
358    //      buffer as we go along. We don't want to decode all the text at
359    //      once before we find out that there is an error.
360    //
361    //      NOTE: This is a size in characters, not bytes.
362    //
363    //  kRawBufSize
364    //      The size of the raw buffer from which raw bytes are spooled out
365    //      as we transcode chunks of data. As it is emptied, it is filled back
366    //      in again from the source stream.
367    // ---------------------------------------------------------------------------
368    enum Constants
369    {
370        kCharBufSize        = 16 * 1024
371        , kRawBufSize       = 48 * 1024
372    };
373
374
375    // -----------------------------------------------------------------------
376    //  Private helper methods
377    // -----------------------------------------------------------------------
378    void checkForSwapped();
379
380    void doInitCharSizeChecks();
381
382    void doInitDecode();
383
384    XMLByte getNextRawByte
385    (
386        const   bool            eoiOk
387    );
388
389    void refreshRawBuffer();
390
391    void setTranscoder
392    (
393        const   XMLCh* const    newEncoding
394    );
395
396    unsigned int xcodeMoreChars
397    (
398                XMLCh* const            bufToFill
399        ,       unsigned char* const    charSizes
400        , const unsigned int            maxChars
401    );
402
403    inline void handleEOL
404    (
405              XMLCh&   curCh
406            , bool     inDecl = false
407    );
408
409    // -----------------------------------------------------------------------
410    //  Data members
411    //
412    //  fCharIndex
413    //      The index into the character buffer. When this hits fCharsAvail
414    //      then its time to refill.
415    //
416    //  fCharBuf
417    //      A buffer that the reader manager fills up with transcoded
418    //      characters a small amount at a time.
419    //
420    //  fCharsAvail
421    //      The characters currently available in the character buffer.
422    //
423    //  fCharSizeBuf
424    //      This buffer is an array that contains the number of source chars
425    //      eaten to create each char in the fCharBuf buffer. So the entry
426    //      fCharSizeBuf[x] is the number of source chars that were eaten
427    //      to make the internalized char fCharBuf[x]. This only contains
428    //      useful data if fSrcOfsSupported is true.
429    //
430    //  fCharOfsBuf
431    //      This buffer is an array that contains the offset in the
432    //      fRawByteBuf buffer of each char in the fCharBuf buffer. It
433    //      only contains useful data if fSrcOfsSupported is true.
434    //
435    //  fCurCol
436    //  fCurLine
437    //      The current line and column that we are in within this reader's
438    //      text.
439    //
440    //  fEncoding
441    //      This is the rough encoding setting. This enum is set during
442    //      construction and just tells us the rough family of encoding that
443    //      we are doing.
444    //
445    //  fEncodingStr
446    //      This is the name of the encoding we are using. It will be
447    //      provisionally set during construction, from the auto-sensed
448    //      encoding. But it might be overridden when the XMLDecl is finally
449    //      seen by the scanner. It can also be forced to a particular
450    //      encoding, in which case fForcedEncoding is set.
451    //
452    //  fForcedEncoding
453    //      If the encoding if forced then this is set and all other
454    //      information will be ignored. This encoding will be taken as
455    //      gospel. This is done by calling an alternate constructor.
456    //
457    //  fNoMore
458    //      This is set when the source text is exhausted. It lets us know
459    //      quickly that no more text is available.
460    //
461    //  fRawBufIndex
462    //      The current index into the raw byte buffer. When its equal to
463    //      fRawBytesAvail then we need to read another buffer.
464    //
465    //  fRawByteBuf
466    //      This is the raw byte buffer that is used to spool out bytes
467    //      from into the fCharBuf buffer, as we transcode in blocks.
468    //
469    //  fRawBytesAvail
470    //      The number of bytes currently available in the raw buffer. This
471    //      helps deal with the last buffer's worth, which will usually not
472    //      be a full one.
473    //
474    //  fReaderNum
475    //      Each reader from a particular reader manager (which means from a
476    //      particular document) is given a unique number. The reader manager
477    //      sets these numbers. They are used to catch things like partial
478    //      markup errors.
479    //
480    //  fRefFrom
481    //      This flag is provided in the ctor, and tells us if we represent
482    //      some entity being expanded inside a literal. Sometimes things
483    //      happen differently inside and outside literals.
484    //
485    //  fPublicId
486    //  fSystemId
487    //      These are the system and public ids of the source that this
488    //      reader is reading.
489    //
490    //  fSentTrailingSpace
491    //      If we are a PE entity being read and we not referenced from a
492    //      literal, then a leading and trailing space must be faked into the
493    //      data. This lets us know we've done the trailing space already (so
494    //      we don't just keep doing it again and again.)
495    //
496    //  fSource
497    //      Indicates whether the content this reader is spooling as already
498    //      been internalized. This will prevent multiple processing of
499    //      whitespace when an already internalized entity is being spooled
500    //      out.
501    //
502    //  fSpareChar
503    //      Some encodings can create two chars in an atomic way, e.g.
504    //      surrogate pairs. We might not be able to store both, so we store
505    //      it here until the next buffer transcoding operation.
506    //
507    //  fSrcOfsBase
508    //      This is the base offset within the source of this entity. Values
509    //      in the curent fCharSizeBuf array are relative to this value.
510    //
511    //  fSrcOfsSupported
512    //      This flag is set to indicate whether source byte offset info
513    //      is supported. For intrinsic encodings, its always set since we
514    //      can always support it. For transcoder based encodings, we ask
515    //      the transcoder if it supports it or not.
516    //
517    //  fStream
518    //      This is the input stream that provides the data for the reader.
519    //      Its always treated as a raw byte stream. The derived class will
520    //      ask for buffers of text from it and will handle making some
521    //      sense of it.
522    //
523    //  fSwapped
524    //      If the encoding is one of the ones we do intrinsically, and its
525    //      in a different byte order from our native order, then this is
526    //      set to remind us to byte swap it during transcoding.
527    //
528    //  fThrowAtEnd
529    //      Indicates whether the reader manager should throw an end of entity
530    //      exception at the end of this reader instance. This is usually
531    //      set for top level external entity references. It overrides the
532    //      reader manager's global flag that controls throwing at the end
533    //      of entities. Defaults to false.
534    //
535    //  fTranscoder
536    //      If the encoding is not one that we handle intrinsically, then
537    //      we use an an external transcoder to do it. This class is an
538    //      abstraction that allows us to use pluggable external transcoding
539    //      services (via XMLTransService in util.)
540    //
541    //  fType
542    //      Indicates whether this reader represents a PE or not. If this
543    //      flag is true and the fInLiteral flag is false, then we will put
544    //      out an extra space at the end.
545    //
546    //  fgCharCharsTable;
547    //      Pointer to XMLChar table, depends on XML version
548    //
549    //  fNEL
550    //      Boolean indicates if NEL and LSEP should be recognized as NEL
551    //
552    //  fXMLVersion
553    //      Enum to indicate if this Reader is conforming to XML 1.0 or XML 1.1
554    // -----------------------------------------------------------------------
555    unsigned int                fCharIndex;
556    XMLCh                       fCharBuf[kCharBufSize];
557    unsigned int                fCharsAvail;
558    unsigned char               fCharSizeBuf[kCharBufSize];
559    unsigned int                fCharOfsBuf[kCharBufSize];
560    XMLSSize_t                  fCurCol;
561    XMLSSize_t                  fCurLine;
562    XMLRecognizer::Encodings    fEncoding;
563    XMLCh*                      fEncodingStr;
564    bool                        fForcedEncoding;
565    bool                        fNoMore;
566    XMLCh*                      fPublicId;
567    unsigned int                fRawBufIndex;
568    XMLByte                     fRawByteBuf[kRawBufSize];
569    unsigned int                fRawBytesAvail;
570    unsigned int                fReaderNum;
571    RefFrom                     fRefFrom;
572    bool                        fSentTrailingSpace;
573    Sources                     fSource;
574    unsigned int                fSrcOfsBase;
575    bool                        fSrcOfsSupported;
576    bool                        fCalculateSrcOfs;
577    XMLCh*                      fSystemId;
578    BinInputStream*             fStream;
579    bool                        fSwapped;
580    bool                        fThrowAtEnd;
581    XMLTranscoder*              fTranscoder;
582    Types                       fType;
583    XMLByte*                    fgCharCharsTable;
584    bool                        fNEL;
585    XMLVersion                  fXMLVersion;
586    MemoryManager*              fMemoryManager;
587};
588
589
590// ---------------------------------------------------------------------------
591//  XMLReader: Public, query methods
592// ---------------------------------------------------------------------------
593inline bool XMLReader::isNameChar(const XMLCh toCheck)
594{
595    return ((fgCharCharsTable[toCheck] & gNameCharMask) != 0);
596}
597
598inline bool XMLReader::isPlainContentChar(const XMLCh toCheck)
599{
600    return ((fgCharCharsTable[toCheck] & gPlainContentCharMask) != 0);
601}
602
603
604inline bool XMLReader::isFirstNameChar(const XMLCh toCheck)
605{
606    return ((fgCharCharsTable[toCheck] & gFirstNameCharMask) != 0);
607}
608
609inline bool XMLReader::isSpecialStartTagChar(const XMLCh toCheck)
610{
611    return ((fgCharCharsTable[toCheck] & gSpecialStartTagCharMask) != 0);
612}
613
614inline bool XMLReader::isXMLChar(const XMLCh toCheck)
615{
616    return ((fgCharCharsTable[toCheck] & gXMLCharMask) != 0);
617}
618
619inline bool XMLReader::isXMLLetter(const XMLCh toCheck)
620{
621    return ((fgCharCharsTable[toCheck] & gLetterCharMask) != 0);
622}
623
624inline bool XMLReader::isWhitespace(const XMLCh toCheck)
625{
626    return ((fgCharCharsTable[toCheck] & gWhitespaceCharMask) != 0);
627}
628
629inline bool XMLReader::isControlChar(const XMLCh toCheck)
630{
631    return ((fgCharCharsTable[toCheck] & gControlCharMask) != 0);
632}
633
634// ---------------------------------------------------------------------------
635//  XMLReader: Buffer management methods
636// ---------------------------------------------------------------------------
637inline unsigned long XMLReader::charsLeftInBuffer() const
638{
639    return fCharsAvail - fCharIndex;
640}
641
642
643// ---------------------------------------------------------------------------
644//  XMLReader: Getter methods
645// ---------------------------------------------------------------------------
646inline XMLSSize_t XMLReader::getColumnNumber() const
647{
648    return fCurCol;
649}
650
651inline const XMLCh* XMLReader::getEncodingStr() const
652{
653    return fEncodingStr;
654}
655
656inline XMLSSize_t XMLReader::getLineNumber() const
657{
658    return fCurLine;
659}
660
661inline bool XMLReader::getNoMoreFlag() const
662{
663    return fNoMore;
664}
665
666inline const XMLCh* XMLReader::getPublicId() const
667{
668    return fPublicId;
669}
670
671inline unsigned int XMLReader::getReaderNum() const
672{
673    return fReaderNum;
674}
675
676inline XMLReader::RefFrom XMLReader::getRefFrom() const
677{
678    return fRefFrom;
679}
680
681inline XMLReader::Sources XMLReader::getSource() const
682{
683    return fSource;
684}
685
686inline const XMLCh* XMLReader::getSystemId() const
687{
688    return fSystemId;
689}
690
691inline bool XMLReader::getThrowAtEnd() const
692{
693    return fThrowAtEnd;
694}
695
696inline XMLReader::Types XMLReader::getType() const
697{
698    return fType;
699}
700
701// ---------------------------------------------------------------------------
702//  XMLReader: Setter methods
703// ---------------------------------------------------------------------------
704inline void XMLReader::setReaderNum(const unsigned int newNum)
705{
706    fReaderNum = newNum;
707}
708
709inline void XMLReader::setThrowAtEnd(const bool newValue)
710{
711    fThrowAtEnd = newValue;
712}
713
714inline void XMLReader::setXMLVersion(const XMLVersion version)
715{
716    fXMLVersion = version;
717    if (version == XMLV1_1) {
718        fNEL = true;
719        fgCharCharsTable = XMLChar1_1::fgCharCharsTable1_1;
720    }
721    else {
722        fNEL = XMLChar1_0::enableNEL;
723        fgCharCharsTable = XMLChar1_0::fgCharCharsTable1_0;
724    }
725
726}
727
728
729
730// ---------------------------------------------------------------------------
731//
732//  XMLReader: movePlainContentChars()
733//
734//       Move as many plain (no special handling of any sort required) content
735//       characters as possible from this reader to the supplied destination buffer.
736//
737//       This is THE hottest performance spot in the parser.
738//
739// ---------------------------------------------------------------------------
740inline void XMLReader::movePlainContentChars(XMLBuffer &dest)
741{
742    unsigned int count = fCharIndex;
743
744    while (fCharIndex < fCharsAvail)
745    {
746        if (!isPlainContentChar(fCharBuf[fCharIndex]))
747            break;
748        fCharIndex++;
749    }
750
751    if (count != fCharIndex)
752    {
753        fCurCol    += (fCharIndex - count);
754        dest.append(&fCharBuf[count], fCharIndex - count);
755    }
756}
757
758
759// ---------------------------------------------------------------------------
760//  XMLReader: getNextCharIfNot() method inlined for speed
761// ---------------------------------------------------------------------------
762inline bool XMLReader::getNextCharIfNot(const XMLCh chNotToGet, XMLCh& chGotten)
763{
764    //
765    //  See if there is at least a char in the buffer. Else, do the buffer
766    //  reload logic.
767    //
768    if (fCharIndex >= fCharsAvail)
769    {
770        // If fNoMore is set, then we have nothing else to give
771        if (fNoMore)
772            return false;
773
774        // Try to refresh
775        if (!refreshCharBuffer())
776            return false;
777    }
778
779    // Check the next char
780    if (fCharBuf[fCharIndex] == chNotToGet)
781        return false;
782
783    // Its not the one we want to skip so bump the index
784    chGotten = fCharBuf[fCharIndex++];
785
786    // Handle end of line normalization and line/col member maintenance.
787    handleEOL(chGotten, false);
788
789    return true;
790}
791
792// ---------------------------------------------------------------------------
793//  XMLReader: getNextChar() method inlined for speed
794// ---------------------------------------------------------------------------
795inline bool XMLReader::getNextChar(XMLCh& chGotten)
796{
797    //
798    //  See if there is at least a char in the buffer. Else, do the buffer
799    //  reload logic.
800    //
801    if (fCharIndex >= fCharsAvail)
802    {
803        // If fNoMore is set, then we have nothing else to give
804        if (fNoMore)
805            return false;
806
807        // Try to refresh
808        if (!refreshCharBuffer())
809            return false;
810    }
811
812    chGotten = fCharBuf[fCharIndex++];
813
814    // Handle end of line normalization and line/col member maintenance.
815    handleEOL(chGotten, false);
816
817    return true;
818}
819
820
821// ---------------------------------------------------------------------------
822//  XMLReader: peekNextChar() method inlined for speed
823// ---------------------------------------------------------------------------
824inline bool XMLReader::peekNextChar(XMLCh& chGotten)
825{
826    //
827    //  If there is something still in the buffer, get it. Else do the reload
828    //  scenario.
829    //
830    if (fCharIndex >= fCharsAvail)
831    {
832        // Try to refresh the buffer
833        if (!refreshCharBuffer())
834        {
835            chGotten = chNull;
836            return false;
837        }
838    }
839
840    chGotten = fCharBuf[fCharIndex];
841
842    //
843    //  Even though we are only peeking, we have to act the same as the
844    //  normal char get method in regards to newline normalization, though
845    //  its not as complicated as the actual character getting method's.
846    //
847    if ((chGotten == chCR || ((chGotten == chNEL || chGotten == chLineSeparator) && fNEL))
848        && (fSource == Source_External))
849        chGotten = chLF;
850
851    return true;
852}
853
854/***
855 *
856 * XML1.1
857 *
858 * 2.11 End-of-Line Handling
859 *
860 *    XML parsed entities are often stored in computer files which, for editing
861 *    convenience, are organized into lines. These lines are typically separated
862 *    by some combination of the characters CARRIAGE RETURN (#xD) and LINE FEED (#xA).
863 *
864 *    To simplify the tasks of applications, the XML processor MUST behave as if
865 *    it normalized all line breaks in external parsed entities (including the document
866 *    entity) on input, before parsing, by translating all of the following to a single
867 *    #xA character:
868 *
869 *  1. the two-character sequence #xD #xA
870 *  2. the two-character sequence #xD #x85
871 *  3. the single character #x85
872 *  4. the single character #x2028
873 *  5. any #xD character that is not immediately followed by #xA or #x85.
874 *
875 *
876 ***/
877inline void XMLReader::handleEOL(XMLCh& curCh, bool inDecl)
878{
879    // 1. the two-character sequence #xD #xA
880    // 2. the two-character sequence #xD #x85
881    // 5. any #xD character that is not immediately followed by #xA or #x85.
882    if (curCh == chCR)
883    {
884        fCurCol = 1;
885        fCurLine++;
886
887        //
888        //  If not already internalized, then convert it to an
889        //  LF and eat any following LF.
890        //
891        if (fSource == Source_External)
892        {
893            if ((fCharIndex < fCharsAvail) || refreshCharBuffer())
894            {
895                if ( fCharBuf[fCharIndex] == chLF              ||
896                    ((fCharBuf[fCharIndex] == chNEL) && fNEL)  )
897                {
898                    fCharIndex++;
899                }
900            }
901            curCh = chLF;
902        }
903    }
904    else if (curCh == chLF)                   
905    {
906        fCurCol = 1;
907        fCurLine++;
908    }
909    // 3. the single character #x85
910    // 4. the single character #x2028
911    else if (curCh == chNEL || curCh == chLineSeparator)
912    {
913        if (inDecl && fXMLVersion == XMLV1_1)
914        {
915
916        /***
917         * XML1.1
918         *
919         * 2.11 End-of-Line Handling
920         *  ...
921         *   The characters #x85 and #x2028 cannot be reliably recognized and translated
922         *   until an entity's encoding declaration (if present) has been read.
923         *   Therefore, it is a fatal error to use them within the XML declaration or
924         *   text declaration.
925         *
926         ***/
927            ThrowXMLwithMemMgr1
928                (
929                TranscodingException
930                , XMLExcepts::Reader_NelLsepinDecl
931                , fSystemId
932                , fMemoryManager
933                );
934        }
935
936        if (fNEL && fSource == Source_External)
937        {
938            fCurCol = 1;
939            fCurLine++;
940            curCh = chLF;
941        }
942    }
943    else
944    {
945        fCurCol++;
946    }
947
948    return;
949}
950
951XERCES_CPP_NAMESPACE_END
952
953#endif
Note: See TracBrowser for help on using the repository browser.