/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ /* * $Id: XMLReader.hpp 568078 2007-08-21 11:43:25Z amassari $ */ #if !defined(XMLREADER_HPP) #define XMLREADER_HPP #include #include #include #include XERCES_CPP_NAMESPACE_BEGIN class InputSource; class BinInputStream; class ReaderMgr; class XMLScanner; class XMLTranscoder; // --------------------------------------------------------------------------- // Instances of this class are used to manage the content of entities. The // scanner maintains a stack of these, one for each entity (this means entity // in the sense of any parsed file or internal entity) currently being // scanned. This class, given a binary input stream will handle reading in // the data and decoding it from its external decoding into the internal // Unicode format. Once internallized, this class provides the access // methods to read in the data in various ways, maintains line and column // information, and provides high performance character attribute checking // methods. // // This is NOT to be derived from. // // --------------------------------------------------------------------------- class XMLPARSER_EXPORT XMLReader : public XMemory { public: // ----------------------------------------------------------------------- // Public types // ----------------------------------------------------------------------- enum Types { Type_PE , Type_General }; enum Sources { Source_Internal , Source_External }; enum RefFrom { RefFrom_Literal , RefFrom_NonLiteral }; enum XMLVersion { XMLV1_0 , XMLV1_1 , XMLV_Unknown }; // ----------------------------------------------------------------------- // Public, query methods // ----------------------------------------------------------------------- bool isAllSpaces ( const XMLCh* const toCheck , const unsigned int count ) const; bool containsWhiteSpace ( const XMLCh* const toCheck , const unsigned int count ) const; bool isXMLLetter(const XMLCh toCheck) const; bool isFirstNameChar(const XMLCh toCheck) const; bool isNameChar(const XMLCh toCheck) const; bool isPlainContentChar(const XMLCh toCheck) const; bool isSpecialStartTagChar(const XMLCh toCheck) const; bool isXMLChar(const XMLCh toCheck) const; bool isWhitespace(const XMLCh toCheck) const; bool isControlChar(const XMLCh toCheck) const; bool isPublicIdChar(const XMLCh toCheck) const; bool isFirstNCNameChar(const XMLCh toCheck) const; bool isNCNameChar(const XMLCh toCheck) const; // ----------------------------------------------------------------------- // Constructors and Destructor // ----------------------------------------------------------------------- XMLReader ( const XMLCh* const pubId , const XMLCh* const sysId , BinInputStream* const streamToAdopt , const RefFrom from , const Types type , const Sources source , const bool throwAtEnd = false , const bool calculateSrcOfs = true , const XMLVersion xmlVersion = XMLV1_0 , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager ); XMLReader ( const XMLCh* const pubId , const XMLCh* const sysId , BinInputStream* const streamToAdopt , const XMLCh* const encodingStr , const RefFrom from , const Types type , const Sources source , const bool throwAtEnd = false , const bool calculateSrcOfs = true , const XMLVersion xmlVersion = XMLV1_0 , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager ); XMLReader ( const XMLCh* const pubId , const XMLCh* const sysId , BinInputStream* const streamToAdopt , XMLRecognizer::Encodings encodingEnum , const RefFrom from , const Types type , const Sources source , const bool throwAtEnd = false , const bool calculateSrcOfs = true , const XMLVersion xmlVersion = XMLV1_0 , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager ); ~XMLReader(); // ----------------------------------------------------------------------- // Character buffer management methods // ----------------------------------------------------------------------- unsigned long charsLeftInBuffer() const; bool refreshCharBuffer(); // ----------------------------------------------------------------------- // Scanning methods // ----------------------------------------------------------------------- bool getName(XMLBuffer& toFill, const bool token); bool getQName(XMLBuffer& toFill, int* colonPosition); bool getNextChar(XMLCh& chGotten); bool getNextCharIfNot(const XMLCh chNotToGet, XMLCh& chGotten); void movePlainContentChars(XMLBuffer &dest); bool getSpaces(XMLBuffer& toFill); bool getUpToCharOrWS(XMLBuffer& toFill, const XMLCh toCheck); bool peekNextChar(XMLCh& chGotten); bool skipIfQuote(XMLCh& chGotten); bool skipSpaces(bool& skippedSomething, bool inDecl = false); bool skippedChar(const XMLCh toSkip); bool skippedSpace(); bool skippedString(const XMLCh* const toSkip); bool peekString(const XMLCh* const toPeek); // ----------------------------------------------------------------------- // Getter methods // ----------------------------------------------------------------------- XMLSSize_t getColumnNumber() const; const XMLCh* getEncodingStr() const; XMLSSize_t getLineNumber() const; bool getNoMoreFlag() const; const XMLCh* getPublicId() const; unsigned int getReaderNum() const; RefFrom getRefFrom() const; Sources getSource() const; unsigned int getSrcOffset() const; const XMLCh* getSystemId() const; bool getThrowAtEnd() const; Types getType() const; // ----------------------------------------------------------------------- // Setter methods // ----------------------------------------------------------------------- bool setEncoding ( const XMLCh* const newEncoding ); void setReaderNum(const unsigned int newNum); void setThrowAtEnd(const bool newValue); void setXMLVersion(const XMLVersion version); private: // ----------------------------------------------------------------------- // Unimplemented constructors and operators // ----------------------------------------------------------------------- XMLReader(const XMLReader&); XMLReader& operator=(const XMLReader&); // --------------------------------------------------------------------------- // Class Constants // // kCharBufSize // The size of the character spool buffer that we use. Its not terribly // large because its just getting filled with data from a raw byte // buffer as we go along. We don't want to decode all the text at // once before we find out that there is an error. // // NOTE: This is a size in characters, not bytes. // // kRawBufSize // The size of the raw buffer from which raw bytes are spooled out // as we transcode chunks of data. As it is emptied, it is filled back // in again from the source stream. // --------------------------------------------------------------------------- enum Constants { kCharBufSize = 16 * 1024 , kRawBufSize = 48 * 1024 }; // ----------------------------------------------------------------------- // Private helper methods // ----------------------------------------------------------------------- void checkForSwapped(); void doInitCharSizeChecks(); void doInitDecode(); XMLByte getNextRawByte ( const bool eoiOk ); void refreshRawBuffer(); void setTranscoder ( const XMLCh* const newEncoding ); unsigned int xcodeMoreChars ( XMLCh* const bufToFill , unsigned char* const charSizes , const unsigned int maxChars ); void handleEOL ( XMLCh& curCh , bool inDecl = false ); // ----------------------------------------------------------------------- // Data members // // fCharIndex // The index into the character buffer. When this hits fCharsAvail // then its time to refill. // // fCharBuf // A buffer that the reader manager fills up with transcoded // characters a small amount at a time. // // fCharsAvail // The characters currently available in the character buffer. // // fCharSizeBuf // This buffer is an array that contains the number of source chars // eaten to create each char in the fCharBuf buffer. So the entry // fCharSizeBuf[x] is the number of source chars that were eaten // to make the internalized char fCharBuf[x]. This only contains // useful data if fSrcOfsSupported is true. // // fCharOfsBuf // This buffer is an array that contains the offset in the // fRawByteBuf buffer of each char in the fCharBuf buffer. It // only contains useful data if fSrcOfsSupported is true. // // fCurCol // fCurLine // The current line and column that we are in within this reader's // text. // // fEncoding // This is the rough encoding setting. This enum is set during // construction and just tells us the rough family of encoding that // we are doing. // // fEncodingStr // This is the name of the encoding we are using. It will be // provisionally set during construction, from the auto-sensed // encoding. But it might be overridden when the XMLDecl is finally // seen by the scanner. It can also be forced to a particular // encoding, in which case fForcedEncoding is set. // // fForcedEncoding // If the encoding if forced then this is set and all other // information will be ignored. This encoding will be taken as // gospel. This is done by calling an alternate constructor. // // fNoMore // This is set when the source text is exhausted. It lets us know // quickly that no more text is available. // // fRawBufIndex // The current index into the raw byte buffer. When its equal to // fRawBytesAvail then we need to read another buffer. // // fRawByteBuf // This is the raw byte buffer that is used to spool out bytes // from into the fCharBuf buffer, as we transcode in blocks. // // fRawBytesAvail // The number of bytes currently available in the raw buffer. This // helps deal with the last buffer's worth, which will usually not // be a full one. // // fReaderNum // Each reader from a particular reader manager (which means from a // particular document) is given a unique number. The reader manager // sets these numbers. They are used to catch things like partial // markup errors. // // fRefFrom // This flag is provided in the ctor, and tells us if we represent // some entity being expanded inside a literal. Sometimes things // happen differently inside and outside literals. // // fPublicId // fSystemId // These are the system and public ids of the source that this // reader is reading. // // fSentTrailingSpace // If we are a PE entity being read and we not referenced from a // literal, then a leading and trailing space must be faked into the // data. This lets us know we've done the trailing space already (so // we don't just keep doing it again and again.) // // fSource // Indicates whether the content this reader is spooling as already // been internalized. This will prevent multiple processing of // whitespace when an already internalized entity is being spooled // out. // // fSpareChar // Some encodings can create two chars in an atomic way, e.g. // surrogate pairs. We might not be able to store both, so we store // it here until the next buffer transcoding operation. // // fSrcOfsBase // This is the base offset within the source of this entity. Values // in the curent fCharSizeBuf array are relative to this value. // // fSrcOfsSupported // This flag is set to indicate whether source byte offset info // is supported. For intrinsic encodings, its always set since we // can always support it. For transcoder based encodings, we ask // the transcoder if it supports it or not. // // fStream // This is the input stream that provides the data for the reader. // Its always treated as a raw byte stream. The derived class will // ask for buffers of text from it and will handle making some // sense of it. // // fSwapped // If the encoding is one of the ones we do intrinsically, and its // in a different byte order from our native order, then this is // set to remind us to byte swap it during transcoding. // // fThrowAtEnd // Indicates whether the reader manager should throw an end of entity // exception at the end of this reader instance. This is usually // set for top level external entity references. It overrides the // reader manager's global flag that controls throwing at the end // of entities. Defaults to false. // // fTranscoder // If the encoding is not one that we handle intrinsically, then // we use an an external transcoder to do it. This class is an // abstraction that allows us to use pluggable external transcoding // services (via XMLTransService in util.) // // fType // Indicates whether this reader represents a PE or not. If this // flag is true and the fInLiteral flag is false, then we will put // out an extra space at the end. // // fgCharCharsTable; // Pointer to XMLChar table, depends on XML version // // fNEL // Boolean indicates if NEL and LSEP should be recognized as NEL // // fXMLVersion // Enum to indicate if this Reader is conforming to XML 1.0 or XML 1.1 // ----------------------------------------------------------------------- unsigned int fCharIndex; XMLCh fCharBuf[kCharBufSize]; unsigned int fCharsAvail; unsigned char fCharSizeBuf[kCharBufSize]; unsigned int fCharOfsBuf[kCharBufSize]; XMLSSize_t fCurCol; XMLSSize_t fCurLine; XMLRecognizer::Encodings fEncoding; XMLCh* fEncodingStr; bool fForcedEncoding; bool fNoMore; XMLCh* fPublicId; unsigned int fRawBufIndex; XMLByte fRawByteBuf[kRawBufSize]; unsigned int fRawBytesAvail; unsigned int fReaderNum; RefFrom fRefFrom; bool fSentTrailingSpace; Sources fSource; unsigned int fSrcOfsBase; bool fSrcOfsSupported; bool fCalculateSrcOfs; XMLCh* fSystemId; BinInputStream* fStream; bool fSwapped; bool fThrowAtEnd; XMLTranscoder* fTranscoder; Types fType; XMLByte* fgCharCharsTable; bool fNEL; XMLVersion fXMLVersion; MemoryManager* fMemoryManager; }; // --------------------------------------------------------------------------- // XMLReader: Public, query methods // --------------------------------------------------------------------------- inline bool XMLReader::isNameChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gNameCharMask) != 0); } inline bool XMLReader::isNCNameChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gNCNameCharMask) != 0); } inline bool XMLReader::isPlainContentChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gPlainContentCharMask) != 0); } inline bool XMLReader::isFirstNameChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gFirstNameCharMask) != 0); } inline bool XMLReader::isFirstNCNameChar(const XMLCh toCheck) const { return (((fgCharCharsTable[toCheck] & gFirstNameCharMask) != 0) && (toCheck != chColon)); } inline bool XMLReader::isSpecialStartTagChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gSpecialStartTagCharMask) != 0); } inline bool XMLReader::isXMLChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gXMLCharMask) != 0); } inline bool XMLReader::isXMLLetter(const XMLCh toCheck) const { return (((fgCharCharsTable[toCheck] & gFirstNameCharMask) != 0) && (toCheck != chColon) && (toCheck != chUnderscore)); } inline bool XMLReader::isWhitespace(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gWhitespaceCharMask) != 0); } inline bool XMLReader::isControlChar(const XMLCh toCheck) const { return ((fgCharCharsTable[toCheck] & gControlCharMask) != 0); } // --------------------------------------------------------------------------- // XMLReader: Buffer management methods // --------------------------------------------------------------------------- inline unsigned long XMLReader::charsLeftInBuffer() const { return fCharsAvail - fCharIndex; } // --------------------------------------------------------------------------- // XMLReader: Getter methods // --------------------------------------------------------------------------- inline XMLSSize_t XMLReader::getColumnNumber() const { return fCurCol; } inline const XMLCh* XMLReader::getEncodingStr() const { return fEncodingStr; } inline XMLSSize_t XMLReader::getLineNumber() const { return fCurLine; } inline bool XMLReader::getNoMoreFlag() const { return fNoMore; } inline const XMLCh* XMLReader::getPublicId() const { return fPublicId; } inline unsigned int XMLReader::getReaderNum() const { return fReaderNum; } inline XMLReader::RefFrom XMLReader::getRefFrom() const { return fRefFrom; } inline XMLReader::Sources XMLReader::getSource() const { return fSource; } inline const XMLCh* XMLReader::getSystemId() const { return fSystemId; } inline bool XMLReader::getThrowAtEnd() const { return fThrowAtEnd; } inline XMLReader::Types XMLReader::getType() const { return fType; } // --------------------------------------------------------------------------- // XMLReader: Setter methods // --------------------------------------------------------------------------- inline void XMLReader::setReaderNum(const unsigned int newNum) { fReaderNum = newNum; } inline void XMLReader::setThrowAtEnd(const bool newValue) { fThrowAtEnd = newValue; } inline void XMLReader::setXMLVersion(const XMLVersion version) { fXMLVersion = version; if (version == XMLV1_1) { fNEL = true; fgCharCharsTable = XMLChar1_1::fgCharCharsTable1_1; } else { fNEL = XMLChar1_0::enableNEL; fgCharCharsTable = XMLChar1_0::fgCharCharsTable1_0; } } // --------------------------------------------------------------------------- // // XMLReader: movePlainContentChars() // // Move as many plain (no special handling of any sort required) content // characters as possible from this reader to the supplied destination buffer. // // This is THE hottest performance spot in the parser. // // --------------------------------------------------------------------------- inline void XMLReader::movePlainContentChars(XMLBuffer &dest) { unsigned int count = fCharIndex; while (fCharIndex < fCharsAvail) { if (!isPlainContentChar(fCharBuf[fCharIndex])) break; fCharIndex++; } if (count != fCharIndex) { fCurCol += (fCharIndex - count); dest.append(&fCharBuf[count], fCharIndex - count); } } // --------------------------------------------------------------------------- // XMLReader: getNextCharIfNot() method inlined for speed // --------------------------------------------------------------------------- inline bool XMLReader::getNextCharIfNot(const XMLCh chNotToGet, XMLCh& chGotten) { // // See if there is at least a char in the buffer. Else, do the buffer // reload logic. // if (fCharIndex >= fCharsAvail) { // If fNoMore is set, then we have nothing else to give if (fNoMore) return false; // Try to refresh if (!refreshCharBuffer()) return false; } // Check the next char if (fCharBuf[fCharIndex] == chNotToGet) return false; // Its not the one we want to skip so bump the index chGotten = fCharBuf[fCharIndex++]; // Handle end of line normalization and line/col member maintenance. // // we can have end-of-line combinations with a leading // chCR(xD), chLF(xA), chNEL(x85), or chLineSeparator(x2028) // // 0000000000001101 chCR // 0000000000001010 chLF // 0000000010000101 chNEL // 0010000000101000 chLineSeparator // ----------------------- // 1101111101010000 == ~(chCR|chLF|chNEL|chLineSeparator) // // if the result of the logical-& operation is // true : 'curCh' can not be chCR, chLF, chNEL or chLineSeparator // false : 'curCh' can be chCR, chLF, chNEL or chLineSeparator // if ( chGotten & (XMLCh) ~(chCR|chLF|chNEL|chLineSeparator) ) { fCurCol++; } else { handleEOL(chGotten, false); } return true; } // --------------------------------------------------------------------------- // XMLReader: getNextChar() method inlined for speed // --------------------------------------------------------------------------- inline bool XMLReader::getNextChar(XMLCh& chGotten) { // // See if there is at least a char in the buffer. Else, do the buffer // reload logic. // if (fCharIndex >= fCharsAvail) { // If fNoMore is set, then we have nothing else to give if (fNoMore) return false; // Try to refresh if (!refreshCharBuffer()) return false; } chGotten = fCharBuf[fCharIndex++]; // Handle end of line normalization and line/col member maintenance. // // we can have end-of-line combinations with a leading // chCR(xD), chLF(xA), chNEL(x85), or chLineSeparator(x2028) // // 0000000000001101 chCR // 0000000000001010 chLF // 0000000010000101 chNEL // 0010000000101000 chLineSeparator // ----------------------- // 1101111101010000 == ~(chCR|chLF|chNEL|chLineSeparator) // // if the result of the logical-& operation is // true : 'curCh' can not be chCR, chLF, chNEL or chLineSeparator // false : 'curCh' can be chCR, chLF, chNEL or chLineSeparator // if ( chGotten & (XMLCh) ~(chCR|chLF|chNEL|chLineSeparator) ) { fCurCol++; } else { handleEOL(chGotten, false); } return true; } // --------------------------------------------------------------------------- // XMLReader: peekNextChar() method inlined for speed // --------------------------------------------------------------------------- inline bool XMLReader::peekNextChar(XMLCh& chGotten) { // // If there is something still in the buffer, get it. Else do the reload // scenario. // if (fCharIndex >= fCharsAvail) { // Try to refresh the buffer if (!refreshCharBuffer()) { chGotten = chNull; return false; } } chGotten = fCharBuf[fCharIndex]; // // Even though we are only peeking, we have to act the same as the // normal char get method in regards to newline normalization, though // its not as complicated as the actual character getting method's. // if ((chGotten == chCR || (fNEL && (chGotten == chNEL || chGotten == chLineSeparator))) && (fSource == Source_External)) chGotten = chLF; return true; } XERCES_CPP_NAMESPACE_END #endif