[2674] | 1 | /* |
---|
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more |
---|
| 3 | * contributor license agreements. See the NOTICE file distributed with |
---|
| 4 | * this work for additional information regarding copyright ownership. |
---|
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 |
---|
| 6 | * (the "License"); you may not use this file except in compliance with |
---|
| 7 | * the License. You may obtain a copy of the License at |
---|
| 8 | * |
---|
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 |
---|
| 10 | * |
---|
| 11 | * Unless required by applicable law or agreed to in writing, software |
---|
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, |
---|
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
---|
| 14 | * See the License for the specific language governing permissions and |
---|
| 15 | * limitations under the License. |
---|
| 16 | */ |
---|
| 17 | |
---|
| 18 | /* |
---|
| 19 | * $Id: RegularExpression.hpp 568078 2007-08-21 11:43:25Z amassari $ |
---|
| 20 | */ |
---|
| 21 | |
---|
| 22 | #if !defined(REGULAREXPRESSION_HPP) |
---|
| 23 | #define REGULAREXPRESSION_HPP |
---|
| 24 | |
---|
| 25 | // --------------------------------------------------------------------------- |
---|
| 26 | // Includes |
---|
| 27 | // --------------------------------------------------------------------------- |
---|
| 28 | #include <xercesc/util/RefArrayVectorOf.hpp> |
---|
| 29 | #include <xercesc/util/XMLString.hpp> |
---|
| 30 | #include <xercesc/util/Janitor.hpp> |
---|
| 31 | #include <xercesc/util/regx/Op.hpp> |
---|
| 32 | #include <xercesc/util/regx/TokenFactory.hpp> |
---|
| 33 | #include <xercesc/util/regx/BMPattern.hpp> |
---|
| 34 | #include <xercesc/util/regx/ModifierToken.hpp> |
---|
| 35 | #include <xercesc/util/regx/ConditionToken.hpp> |
---|
| 36 | #include <xercesc/util/regx/OpFactory.hpp> |
---|
| 37 | #include <xercesc/util/regx/RegxUtil.hpp> |
---|
| 38 | |
---|
| 39 | XERCES_CPP_NAMESPACE_BEGIN |
---|
| 40 | |
---|
| 41 | // --------------------------------------------------------------------------- |
---|
| 42 | // Forward Declaration |
---|
| 43 | // --------------------------------------------------------------------------- |
---|
| 44 | class RangeToken; |
---|
| 45 | class Match; |
---|
| 46 | |
---|
| 47 | class XMLUTIL_EXPORT RegularExpression : public XMemory |
---|
| 48 | { |
---|
| 49 | public: |
---|
| 50 | // ----------------------------------------------------------------------- |
---|
| 51 | // Public Constructors and Destructor |
---|
| 52 | // ----------------------------------------------------------------------- |
---|
| 53 | RegularExpression |
---|
| 54 | ( |
---|
| 55 | const char* const pattern |
---|
| 56 | , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager |
---|
| 57 | ); |
---|
| 58 | RegularExpression |
---|
| 59 | ( |
---|
| 60 | const char* const pattern |
---|
| 61 | , const char* const options |
---|
| 62 | , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager |
---|
| 63 | ); |
---|
| 64 | RegularExpression |
---|
| 65 | ( |
---|
| 66 | const XMLCh* const pattern |
---|
| 67 | , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager |
---|
| 68 | ); |
---|
| 69 | RegularExpression |
---|
| 70 | ( |
---|
| 71 | const XMLCh* const pattern |
---|
| 72 | , const XMLCh* const options |
---|
| 73 | , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager |
---|
| 74 | ); |
---|
| 75 | ~RegularExpression(); |
---|
| 76 | |
---|
| 77 | // ----------------------------------------------------------------------- |
---|
| 78 | // Public Constants |
---|
| 79 | // ----------------------------------------------------------------------- |
---|
| 80 | static const unsigned int MARK_PARENS; |
---|
| 81 | static const unsigned int IGNORE_CASE; |
---|
| 82 | static const unsigned int SINGLE_LINE; |
---|
| 83 | static const unsigned int MULTIPLE_LINE; |
---|
| 84 | static const unsigned int EXTENDED_COMMENT; |
---|
| 85 | static const unsigned int USE_UNICODE_CATEGORY; |
---|
| 86 | static const unsigned int UNICODE_WORD_BOUNDARY; |
---|
| 87 | static const unsigned int PROHIBIT_HEAD_CHARACTER_OPTIMIZATION; |
---|
| 88 | static const unsigned int PROHIBIT_FIXED_STRING_OPTIMIZATION; |
---|
| 89 | static const unsigned int XMLSCHEMA_MODE; |
---|
| 90 | static const unsigned int SPECIAL_COMMA; |
---|
| 91 | static const unsigned short WT_IGNORE; |
---|
| 92 | static const unsigned short WT_LETTER; |
---|
| 93 | static const unsigned short WT_OTHER; |
---|
| 94 | |
---|
| 95 | // ----------------------------------------------------------------------- |
---|
| 96 | // Public Helper methods |
---|
| 97 | // ----------------------------------------------------------------------- |
---|
| 98 | static int getOptionValue(const XMLCh ch); |
---|
| 99 | |
---|
| 100 | // ----------------------------------------------------------------------- |
---|
| 101 | // Matching methods |
---|
| 102 | // ----------------------------------------------------------------------- |
---|
| 103 | bool matches(const char* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
---|
| 104 | bool matches(const char* const matchString, const int start, |
---|
| 105 | const int end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
---|
| 106 | bool matches(const char* const matchString, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
---|
| 107 | bool matches(const char* const matchString, const int start, |
---|
| 108 | const int end, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
---|
| 109 | |
---|
| 110 | bool matches(const XMLCh* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
---|
| 111 | bool matches(const XMLCh* const matchString, const int start, |
---|
| 112 | const int end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
---|
| 113 | bool matches(const XMLCh* const matchString, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
---|
| 114 | bool matches(const XMLCh* const matchString, const int start, |
---|
| 115 | const int end, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
---|
| 116 | |
---|
| 117 | // ----------------------------------------------------------------------- |
---|
| 118 | // Tokenize methods |
---|
| 119 | // ----------------------------------------------------------------------- |
---|
| 120 | // Note: The caller owns the string vector that is returned, and is responsible |
---|
| 121 | // for deleting it. |
---|
| 122 | RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString); |
---|
| 123 | RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString, const int start, |
---|
| 124 | const int end); |
---|
| 125 | |
---|
| 126 | RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString); |
---|
| 127 | RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, |
---|
| 128 | const int start, const int end); |
---|
| 129 | |
---|
| 130 | // ----------------------------------------------------------------------- |
---|
| 131 | // Replace methods |
---|
| 132 | // ----------------------------------------------------------------------- |
---|
| 133 | // Note: The caller owns the XMLCh* that is returned, and is responsible for |
---|
| 134 | // deleting it. |
---|
| 135 | XMLCh *replace(const char* const matchString, const char* const replaceString); |
---|
| 136 | XMLCh *replace(const char* const matchString, const char* const replaceString, |
---|
| 137 | const int start, const int end); |
---|
| 138 | |
---|
| 139 | XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString); |
---|
| 140 | XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString, |
---|
| 141 | const int start, const int end); |
---|
| 142 | |
---|
| 143 | // ----------------------------------------------------------------------- |
---|
| 144 | // Static initialize and cleanup methods |
---|
| 145 | // ----------------------------------------------------------------------- |
---|
| 146 | static void |
---|
| 147 | staticInitialize(MemoryManager* memoryManager); |
---|
| 148 | |
---|
| 149 | static void |
---|
| 150 | staticCleanup(); |
---|
| 151 | |
---|
| 152 | static bool isSet(const int options, const int flag); |
---|
| 153 | |
---|
| 154 | private: |
---|
| 155 | // ----------------------------------------------------------------------- |
---|
| 156 | // Private data types |
---|
| 157 | // ----------------------------------------------------------------------- |
---|
| 158 | class XMLUTIL_EXPORT Context : public XMemory |
---|
| 159 | { |
---|
| 160 | public : |
---|
| 161 | Context(MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager); |
---|
| 162 | Context(Context* src); |
---|
| 163 | ~Context(); |
---|
| 164 | |
---|
| 165 | Context& operator= (const Context& other); |
---|
| 166 | inline const XMLCh* getString() const { return fString; } |
---|
| 167 | void reset(const XMLCh* const string, const int stringLen, |
---|
| 168 | const int start, const int limit, const int noClosures); |
---|
| 169 | bool nextCh(XMLInt32& ch, int& offset, const short direction); |
---|
| 170 | |
---|
| 171 | bool fAdoptMatch; |
---|
| 172 | int fStart; |
---|
| 173 | int fLimit; |
---|
| 174 | int fLength; // fLimit - fStart |
---|
| 175 | int fSize; |
---|
| 176 | int fStringMaxLen; |
---|
| 177 | int* fOffsets; |
---|
| 178 | Match* fMatch; |
---|
| 179 | const XMLCh* fString; |
---|
| 180 | MemoryManager* fMemoryManager; |
---|
| 181 | }; |
---|
| 182 | |
---|
| 183 | // ----------------------------------------------------------------------- |
---|
| 184 | // Unimplemented constructors and operators |
---|
| 185 | // ----------------------------------------------------------------------- |
---|
| 186 | RegularExpression(const RegularExpression&); |
---|
| 187 | RegularExpression& operator=(const RegularExpression&); |
---|
| 188 | |
---|
| 189 | // ----------------------------------------------------------------------- |
---|
| 190 | // Cleanup methods |
---|
| 191 | // ----------------------------------------------------------------------- |
---|
| 192 | void cleanUp(); |
---|
| 193 | |
---|
| 194 | // ----------------------------------------------------------------------- |
---|
| 195 | // Setter methods |
---|
| 196 | // ----------------------------------------------------------------------- |
---|
| 197 | void setPattern(const XMLCh* const pattern, const XMLCh* const options=0); |
---|
| 198 | |
---|
| 199 | // ----------------------------------------------------------------------- |
---|
| 200 | // Private Helper methods |
---|
| 201 | // ----------------------------------------------------------------------- |
---|
| 202 | void prepare(); |
---|
| 203 | int parseOptions(const XMLCh* const options); |
---|
| 204 | unsigned short getWordType(const XMLCh* const target, const int begin, |
---|
| 205 | const int end, const int offset); |
---|
| 206 | unsigned short getCharType(const XMLCh ch); |
---|
| 207 | unsigned short getPreviousWordType(const XMLCh* const target, |
---|
| 208 | const int start, const int end, |
---|
| 209 | int offset); |
---|
| 210 | |
---|
| 211 | /** |
---|
| 212 | * Matching helpers |
---|
| 213 | */ |
---|
| 214 | int match(Context* const context, const Op* const operations, int offset, |
---|
| 215 | const short direction); |
---|
| 216 | bool matchIgnoreCase(const XMLInt32 ch1, const XMLInt32 ch2); |
---|
| 217 | |
---|
| 218 | /** |
---|
| 219 | * Helper methods used by match(Context* ...) |
---|
| 220 | */ |
---|
| 221 | bool matchChar(Context* const context, const XMLInt32 ch, int& offset, |
---|
| 222 | const short direction, const bool ignoreCase); |
---|
| 223 | bool matchDot(Context* const context, int& offset, const short direction); |
---|
| 224 | bool matchRange(Context* const context, const Op* const op, |
---|
| 225 | int& offset, const short direction, const bool ignoreCase); |
---|
| 226 | bool matchAnchor(Context* const context, const XMLInt32 ch, |
---|
| 227 | const int offset); |
---|
| 228 | bool matchBackReference(Context* const context, const XMLInt32 ch, |
---|
| 229 | int& offset, const short direction, |
---|
| 230 | const bool ignoreCase); |
---|
| 231 | bool matchString(Context* const context, const XMLCh* const literal, |
---|
| 232 | int& offset, const short direction, const bool ignoreCase); |
---|
| 233 | int matchUnion(Context* const context, const Op* const op, int offset, |
---|
| 234 | const short direction); |
---|
| 235 | int matchCapture(Context* const context, const Op* const op, int offset, |
---|
| 236 | const short direction); |
---|
| 237 | bool matchCondition(Context* const context, const Op* const op, int offset, |
---|
| 238 | const short direction); |
---|
| 239 | int matchModifier(Context* const context, const Op* const op, int offset, |
---|
| 240 | const short direction); |
---|
| 241 | |
---|
| 242 | /** |
---|
| 243 | * Tokenize helper |
---|
| 244 | * |
---|
| 245 | * This overloaded tokenize is for internal use only. It provides a way to |
---|
| 246 | * keep track of the sub-expressions in each match of the pattern. |
---|
| 247 | * |
---|
| 248 | * It is called by the other tokenize methods, and by the replace method. |
---|
| 249 | * The caller is responsible for the deletion of the returned |
---|
| 250 | * RefArrayVectorOf<XMLCh*> |
---|
| 251 | */ |
---|
| 252 | RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString, |
---|
| 253 | const int start, const int end, |
---|
| 254 | RefVectorOf<Match> *subEx); |
---|
| 255 | /** |
---|
| 256 | * Replace helpers |
---|
| 257 | * |
---|
| 258 | * Note: the caller owns the XMLCh* that is returned |
---|
| 259 | */ |
---|
| 260 | const XMLCh *subInExp(const XMLCh* const repString, |
---|
| 261 | const XMLCh* const origString, |
---|
| 262 | const Match* subEx); |
---|
| 263 | /** |
---|
| 264 | * Converts a token tree into an operation tree |
---|
| 265 | */ |
---|
| 266 | void compile(const Token* const token); |
---|
| 267 | Op* compile(const Token* const token, Op* const next, |
---|
| 268 | const bool reverse); |
---|
| 269 | /** |
---|
| 270 | * Helper methods used by compile |
---|
| 271 | */ |
---|
| 272 | Op* compileSingle(const Token* const token, Op* const next, |
---|
| 273 | const unsigned short tokType); |
---|
| 274 | Op* compileUnion(const Token* const token, Op* const next, |
---|
| 275 | const bool reverse); |
---|
| 276 | Op* compileCondition(const Token* const token, Op* const next, |
---|
| 277 | const bool reverse); |
---|
| 278 | Op* compileParenthesis(const Token* const token, Op* const next, |
---|
| 279 | const bool reverse); |
---|
| 280 | Op* compileLook(const Token* const token, const Op* const next, |
---|
| 281 | const bool reverse, const unsigned short tokType); |
---|
| 282 | Op* compileConcat(const Token* const token, Op* const next, |
---|
| 283 | const bool reverse); |
---|
| 284 | Op* compileClosure(const Token* const token, Op* const next, |
---|
| 285 | const bool reverse, const unsigned short tokType); |
---|
| 286 | |
---|
| 287 | // ----------------------------------------------------------------------- |
---|
| 288 | // Private data members |
---|
| 289 | // ----------------------------------------------------------------------- |
---|
| 290 | bool fHasBackReferences; |
---|
| 291 | bool fFixedStringOnly; |
---|
| 292 | int fNoGroups; |
---|
| 293 | int fMinLength; |
---|
| 294 | int fNoClosures; |
---|
| 295 | unsigned int fOptions; |
---|
| 296 | BMPattern* fBMPattern; |
---|
| 297 | XMLCh* fPattern; |
---|
| 298 | XMLCh* fFixedString; |
---|
| 299 | Op* fOperations; |
---|
| 300 | Token* fTokenTree; |
---|
| 301 | RangeToken* fFirstChar; |
---|
| 302 | static RangeToken* fWordRange; |
---|
| 303 | OpFactory fOpFactory; |
---|
| 304 | TokenFactory* fTokenFactory; |
---|
| 305 | MemoryManager* fMemoryManager; |
---|
| 306 | }; |
---|
| 307 | |
---|
| 308 | |
---|
| 309 | |
---|
| 310 | // ----------------------------------------------------------------------- |
---|
| 311 | // RegularExpression: Static initialize and cleanup methods |
---|
| 312 | // ----------------------------------------------------------------------- |
---|
| 313 | inline void RegularExpression::staticCleanup() |
---|
| 314 | { |
---|
| 315 | fWordRange = 0; |
---|
| 316 | } |
---|
| 317 | |
---|
| 318 | // --------------------------------------------------------------------------- |
---|
| 319 | // RegularExpression: Cleanup methods |
---|
| 320 | // --------------------------------------------------------------------------- |
---|
| 321 | inline void RegularExpression::cleanUp() { |
---|
| 322 | |
---|
| 323 | fMemoryManager->deallocate(fPattern);//delete [] fPattern; |
---|
| 324 | fMemoryManager->deallocate(fFixedString);//delete [] fFixedString; |
---|
| 325 | delete fBMPattern; |
---|
| 326 | delete fTokenFactory; |
---|
| 327 | } |
---|
| 328 | |
---|
| 329 | // --------------------------------------------------------------------------- |
---|
| 330 | // RegularExpression: Helper methods |
---|
| 331 | // --------------------------------------------------------------------------- |
---|
| 332 | inline bool RegularExpression::isSet(const int options, const int flag) { |
---|
| 333 | |
---|
| 334 | return (options & flag) == flag; |
---|
| 335 | } |
---|
| 336 | |
---|
| 337 | inline Op* RegularExpression::compileLook(const Token* const token, |
---|
| 338 | const Op* const next, |
---|
| 339 | const bool reverse, |
---|
| 340 | const unsigned short tokType) { |
---|
| 341 | |
---|
| 342 | Op* ret = 0; |
---|
| 343 | Op* result = compile(token->getChild(0), 0, reverse); |
---|
| 344 | |
---|
| 345 | switch(tokType) { |
---|
| 346 | case Token::T_LOOKAHEAD: |
---|
| 347 | ret = fOpFactory.createLookOp(Op::O_LOOKAHEAD, next, result); |
---|
| 348 | break; |
---|
| 349 | case Token::T_NEGATIVELOOKAHEAD: |
---|
| 350 | ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKAHEAD, next, result); |
---|
| 351 | break; |
---|
| 352 | case Token::T_LOOKBEHIND: |
---|
| 353 | ret = fOpFactory.createLookOp(Op::O_LOOKBEHIND, next, result); |
---|
| 354 | break; |
---|
| 355 | case Token::T_NEGATIVELOOKBEHIND: |
---|
| 356 | ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKBEHIND, next, result); |
---|
| 357 | break; |
---|
| 358 | case Token::T_INDEPENDENT: |
---|
| 359 | ret = fOpFactory.createIndependentOp(next, result); |
---|
| 360 | break; |
---|
| 361 | case Token::T_MODIFIERGROUP: |
---|
| 362 | ret = fOpFactory.createModifierOp(next, result, |
---|
| 363 | ((const ModifierToken *) token)->getOptions(), |
---|
| 364 | ((const ModifierToken *) token)->getOptionsMask()); |
---|
| 365 | break; |
---|
| 366 | } |
---|
| 367 | |
---|
| 368 | |
---|
| 369 | return ret; |
---|
| 370 | } |
---|
| 371 | |
---|
| 372 | inline Op* RegularExpression::compileSingle(const Token* const token, |
---|
| 373 | Op* const next, |
---|
| 374 | const unsigned short tokType) { |
---|
| 375 | |
---|
| 376 | Op* ret = 0; |
---|
| 377 | |
---|
| 378 | switch (tokType) { |
---|
| 379 | case Token::T_DOT: |
---|
| 380 | ret = fOpFactory.createDotOp(); |
---|
| 381 | break; |
---|
| 382 | case Token::T_CHAR: |
---|
| 383 | ret = fOpFactory.createCharOp(token->getChar()); |
---|
| 384 | break; |
---|
| 385 | case Token::T_ANCHOR: |
---|
| 386 | ret = fOpFactory.createAnchorOp(token->getChar()); |
---|
| 387 | break; |
---|
| 388 | case Token::T_RANGE: |
---|
| 389 | case Token::T_NRANGE: |
---|
| 390 | ret = fOpFactory.createRangeOp(token); |
---|
| 391 | break; |
---|
| 392 | case Token::T_EMPTY: |
---|
| 393 | ret = next; |
---|
| 394 | break; |
---|
| 395 | case Token::T_STRING: |
---|
| 396 | ret = fOpFactory.createStringOp(token->getString()); |
---|
| 397 | break; |
---|
| 398 | case Token::T_BACKREFERENCE: |
---|
| 399 | ret = fOpFactory.createBackReferenceOp(token->getReferenceNo()); |
---|
| 400 | break; |
---|
| 401 | } |
---|
| 402 | |
---|
| 403 | if (tokType != Token::T_EMPTY) |
---|
| 404 | ret->setNextOp(next); |
---|
| 405 | |
---|
| 406 | return ret; |
---|
| 407 | } |
---|
| 408 | |
---|
| 409 | |
---|
| 410 | inline Op* RegularExpression::compileUnion(const Token* const token, |
---|
| 411 | Op* const next, |
---|
| 412 | const bool reverse) { |
---|
| 413 | |
---|
| 414 | int tokSize = token->size(); |
---|
| 415 | UnionOp* uniOp = fOpFactory.createUnionOp(tokSize); |
---|
| 416 | |
---|
| 417 | for (int i=0; i<tokSize; i++) { |
---|
| 418 | |
---|
| 419 | uniOp->addElement(compile(token->getChild(i), next, reverse)); |
---|
| 420 | } |
---|
| 421 | |
---|
| 422 | return uniOp; |
---|
| 423 | } |
---|
| 424 | |
---|
| 425 | |
---|
| 426 | inline Op* RegularExpression::compileCondition(const Token* const token, |
---|
| 427 | Op* const next, |
---|
| 428 | const bool reverse) { |
---|
| 429 | |
---|
| 430 | Token* condTok = ((const ConditionToken*) token)->getConditionToken(); |
---|
| 431 | Token* yesTok = token->getChild(0); |
---|
| 432 | Token* noTok = token->getChild(1); |
---|
| 433 | int refNo = token->getReferenceNo(); |
---|
| 434 | Op* condOp = (condTok == 0) ? 0 : compile(condTok, 0, reverse); |
---|
| 435 | Op* yesOp = compile(yesTok, next, reverse); |
---|
| 436 | Op* noOp = (noTok == 0) ? 0 : compile(noTok, next, reverse); |
---|
| 437 | |
---|
| 438 | return fOpFactory.createConditionOp(next, refNo, condOp, yesOp, noOp); |
---|
| 439 | } |
---|
| 440 | |
---|
| 441 | |
---|
| 442 | inline Op* RegularExpression::compileParenthesis(const Token* const token, |
---|
| 443 | Op* const next, |
---|
| 444 | const bool reverse) { |
---|
| 445 | |
---|
| 446 | if (token->getNoParen() == 0) |
---|
| 447 | return compile(token->getChild(0), next, reverse); |
---|
| 448 | |
---|
| 449 | Op* captureOp = 0; |
---|
| 450 | |
---|
| 451 | if (reverse) { |
---|
| 452 | |
---|
| 453 | captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next); |
---|
| 454 | captureOp = compile(token->getChild(0), captureOp, reverse); |
---|
| 455 | |
---|
| 456 | return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp); |
---|
| 457 | } |
---|
| 458 | |
---|
| 459 | captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next); |
---|
| 460 | captureOp = compile(token->getChild(0), captureOp, reverse); |
---|
| 461 | |
---|
| 462 | return fOpFactory.createCaptureOp(token->getNoParen(), captureOp); |
---|
| 463 | } |
---|
| 464 | |
---|
| 465 | inline Op* RegularExpression::compileConcat(const Token* const token, |
---|
| 466 | Op* const next, |
---|
| 467 | const bool reverse) { |
---|
| 468 | |
---|
| 469 | Op* ret = next; |
---|
| 470 | int tokSize = token->size(); |
---|
| 471 | |
---|
| 472 | if (!reverse) { |
---|
| 473 | |
---|
| 474 | for (int i= tokSize - 1; i>=0; i--) { |
---|
| 475 | ret = compile(token->getChild(i), ret, false); |
---|
| 476 | } |
---|
| 477 | } |
---|
| 478 | else { |
---|
| 479 | |
---|
| 480 | for (int i= 0; i< tokSize; i++) { |
---|
| 481 | ret = compile(token->getChild(i), ret, true); |
---|
| 482 | } |
---|
| 483 | } |
---|
| 484 | |
---|
| 485 | return ret; |
---|
| 486 | } |
---|
| 487 | |
---|
| 488 | inline Op* RegularExpression::compileClosure(const Token* const token, |
---|
| 489 | Op* const next, |
---|
| 490 | const bool reverse, |
---|
| 491 | const unsigned short tokType) { |
---|
| 492 | |
---|
| 493 | Op* ret = 0; |
---|
| 494 | Token* childTok = token->getChild(0); |
---|
| 495 | int min = token->getMin(); |
---|
| 496 | int max = token->getMax(); |
---|
| 497 | |
---|
| 498 | if (min >= 0 && min == max) { |
---|
| 499 | |
---|
| 500 | ret = next; |
---|
| 501 | for (int i=0; i< min; i++) { |
---|
| 502 | ret = compile(childTok, ret, reverse); |
---|
| 503 | } |
---|
| 504 | |
---|
| 505 | return ret; |
---|
| 506 | } |
---|
| 507 | |
---|
| 508 | if (min > 0 && max > 0) |
---|
| 509 | max -= min; |
---|
| 510 | |
---|
| 511 | if (max > 0) { |
---|
| 512 | |
---|
| 513 | ret = next; |
---|
| 514 | for (int i=0; i<max; i++) { |
---|
| 515 | |
---|
| 516 | ChildOp* childOp = fOpFactory.createQuestionOp( |
---|
| 517 | tokType == Token::T_NONGREEDYCLOSURE); |
---|
| 518 | |
---|
| 519 | childOp->setNextOp(next); |
---|
| 520 | childOp->setChild(compile(childTok, ret, reverse)); |
---|
| 521 | ret = childOp; |
---|
| 522 | } |
---|
| 523 | } |
---|
| 524 | else { |
---|
| 525 | |
---|
| 526 | ChildOp* childOp = 0; |
---|
| 527 | |
---|
| 528 | if (tokType == Token::T_NONGREEDYCLOSURE) { |
---|
| 529 | childOp = fOpFactory.createNonGreedyClosureOp(); |
---|
| 530 | } |
---|
| 531 | else { |
---|
| 532 | |
---|
| 533 | if (childTok->getMinLength() == 0) |
---|
| 534 | childOp = fOpFactory.createClosureOp(fNoClosures++); |
---|
| 535 | else |
---|
| 536 | childOp = fOpFactory.createClosureOp(-1); |
---|
| 537 | } |
---|
| 538 | |
---|
| 539 | childOp->setNextOp(next); |
---|
| 540 | childOp->setChild(compile(childTok, childOp, reverse)); |
---|
| 541 | ret = childOp; |
---|
| 542 | } |
---|
| 543 | |
---|
| 544 | if (min > 0) { |
---|
| 545 | |
---|
| 546 | for (int i=0; i< min; i++) { |
---|
| 547 | ret = compile(childTok, ret, reverse); |
---|
| 548 | } |
---|
| 549 | } |
---|
| 550 | |
---|
| 551 | return ret; |
---|
| 552 | } |
---|
| 553 | |
---|
| 554 | inline int RegularExpression::matchModifier(Context* const context, |
---|
| 555 | const Op* const op, int offset, |
---|
| 556 | const short direction) |
---|
| 557 | { |
---|
| 558 | int saveOptions = fOptions; |
---|
| 559 | fOptions |= (int) op->getData(); |
---|
| 560 | fOptions &= (int) ~op->getData2(); |
---|
| 561 | |
---|
| 562 | int ret = match(context, op->getChild(), offset, direction); |
---|
| 563 | |
---|
| 564 | fOptions = saveOptions; |
---|
| 565 | |
---|
| 566 | return ret; |
---|
| 567 | } |
---|
| 568 | |
---|
| 569 | inline unsigned short RegularExpression::getWordType(const XMLCh* const target |
---|
| 570 | , const int begin |
---|
| 571 | , const int end |
---|
| 572 | , const int offset) |
---|
| 573 | { |
---|
| 574 | if (offset < begin || offset >= end) |
---|
| 575 | return WT_OTHER; |
---|
| 576 | |
---|
| 577 | return getCharType(target[offset]); |
---|
| 578 | } |
---|
| 579 | |
---|
| 580 | inline |
---|
| 581 | unsigned short RegularExpression::getPreviousWordType(const XMLCh* const target |
---|
| 582 | , const int start |
---|
| 583 | , const int end |
---|
| 584 | , int offset) |
---|
| 585 | { |
---|
| 586 | unsigned short ret = getWordType(target, start, end, --offset); |
---|
| 587 | |
---|
| 588 | while (ret == WT_IGNORE) { |
---|
| 589 | ret = getWordType(target, start, end, --offset); |
---|
| 590 | } |
---|
| 591 | |
---|
| 592 | return ret; |
---|
| 593 | } |
---|
| 594 | |
---|
| 595 | XERCES_CPP_NAMESPACE_END |
---|
| 596 | |
---|
| 597 | #endif |
---|
| 598 | /** |
---|
| 599 | * End of file RegularExpression.hpp |
---|
| 600 | */ |
---|
| 601 | |
---|