source: NonGTP/Xerces/xercesc/util/regx/RegularExpression.hpp @ 188

Revision 188, 25.2 KB checked in by mattausch, 19 years ago (diff)

added xercesc to support

Line 
1/*
2 * The Apache Software License, Version 1.1
3 *
4 * Copyright (c) 2001-2003 The Apache Software Foundation.  All rights
5 * reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 *    notice, this list of conditions and the following disclaimer.
13 *
14 * 2. Redistributions in binary form must reproduce the above copyright
15 *    notice, this list of conditions and the following disclaimer in
16 *    the documentation and/or other materials provided with the
17 *    distribution.
18 *
19 * 3. The end-user documentation included with the redistribution,
20 *    if any, must include the following acknowledgment:
21 *       "This product includes software developed by the
22 *        Apache Software Foundation (http://www.apache.org/)."
23 *    Alternately, this acknowledgment may appear in the software itself,
24 *    if and wherever such third-party acknowledgments normally appear.
25 *
26 * 4. The names "Xerces" and "Apache Software Foundation" must
27 *    not be used to endorse or promote products derived from this
28 *    software without prior written permission. For written
29 *    permission, please contact apache\@apache.org.
30 *
31 * 5. Products derived from this software may not be called "Apache",
32 *    nor may "Apache" appear in their name, without prior written
33 *    permission of the Apache Software Foundation.
34 *
35 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
36 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
37 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
38 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
41 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
42 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
43 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
44 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
45 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
46 * SUCH DAMAGE.
47 * ====================================================================
48 *
49 * This software consists of voluntary contributions made by many
50 * individuals on behalf of the Apache Software Foundation, and was
51 * originally based on software copyright (c) 2001, International
52 * Business Machines, Inc., http://www.ibm.com .  For more information
53 * on the Apache Software Foundation, please see
54 * <http://www.apache.org/>.
55 */
56
57/*
58 * $Id: RegularExpression.hpp,v 1.17 2004/01/13 20:05:00 peiyongz Exp $
59 */
60
61#if !defined(REGULAREXPRESSION_HPP)
62#define REGULAREXPRESSION_HPP
63
64// ---------------------------------------------------------------------------
65//  Includes
66// ---------------------------------------------------------------------------
67#include <xercesc/util/RefArrayVectorOf.hpp>
68#include <xercesc/util/XMLString.hpp>
69#include <xercesc/util/Janitor.hpp>
70#include <xercesc/util/Mutexes.hpp>
71#include <xercesc/util/regx/Op.hpp>
72#include <xercesc/util/regx/TokenFactory.hpp>
73#include <xercesc/util/regx/BMPattern.hpp>
74#include <xercesc/util/regx/ModifierToken.hpp>
75#include <xercesc/util/regx/ConditionToken.hpp>
76#include <xercesc/util/regx/OpFactory.hpp>
77
78XERCES_CPP_NAMESPACE_BEGIN
79
80// ---------------------------------------------------------------------------
81//  Forward Declaration
82// ---------------------------------------------------------------------------
83class RangeToken;
84class Match;
85
86class XMLUTIL_EXPORT RegularExpression : public XMemory
87{
88public:
89    // -----------------------------------------------------------------------
90    //  Public Constructors and Destructor
91    // -----------------------------------------------------------------------
92    RegularExpression
93    (
94        const char* const pattern
95        , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager
96    );
97    RegularExpression
98    (
99        const char* const pattern
100        , const char* const options
101        , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager
102    );
103    RegularExpression
104    (
105        const XMLCh* const pattern
106        , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager
107    );
108    RegularExpression
109    (
110        const XMLCh* const pattern
111        , const XMLCh* const options
112        , MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager
113    );
114    ~RegularExpression();
115
116    // -----------------------------------------------------------------------
117    //  Public Constants
118    // -----------------------------------------------------------------------
119    static const unsigned int   MARK_PARENS;
120    static const unsigned int   IGNORE_CASE;
121    static const unsigned int   SINGLE_LINE;
122    static const unsigned int   MULTIPLE_LINE;
123    static const unsigned int   EXTENDED_COMMENT;
124    static const unsigned int   USE_UNICODE_CATEGORY;
125    static const unsigned int   UNICODE_WORD_BOUNDARY;
126    static const unsigned int   PROHIBIT_HEAD_CHARACTER_OPTIMIZATION;
127    static const unsigned int   PROHIBIT_FIXED_STRING_OPTIMIZATION;
128    static const unsigned int   XMLSCHEMA_MODE;
129    static const unsigned int   SPECIAL_COMMA;
130    static const unsigned short WT_IGNORE;
131    static const unsigned short WT_LETTER;
132    static const unsigned short WT_OTHER;
133
134    // -----------------------------------------------------------------------
135    //  Public Helper methods
136    // -----------------------------------------------------------------------
137    static int getOptionValue(const XMLCh ch);
138
139    // -----------------------------------------------------------------------
140    //  Matching methods
141    // -----------------------------------------------------------------------
142    bool matches(const char* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
143    bool matches(const char* const matchString, const int start,
144                 const int end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
145    bool matches(const char* const matchString, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
146    bool matches(const char* const matchString, const int start,
147                 const int end, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
148
149    bool matches(const XMLCh* const matchString, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
150    bool matches(const XMLCh* const matchString, const int start,
151                 const int end, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
152    bool matches(const XMLCh* const matchString, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
153    bool matches(const XMLCh* const matchString, const int start,
154                 const int end, Match* const pMatch, MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
155
156    // -----------------------------------------------------------------------
157    //  Tokenize methods
158    // -----------------------------------------------------------------------
159    // Note: The caller owns the string vector that is returned, and is responsible
160    //       for deleting it.
161    RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString);
162    RefArrayVectorOf<XMLCh> *tokenize(const char* const matchString, const int start,
163                                      const int end);
164
165    RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString);
166    RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString,
167                                      const int start, const int end);
168
169    // -----------------------------------------------------------------------
170    //  Replace methods
171    // -----------------------------------------------------------------------
172    // Note: The caller owns the XMLCh* that is returned, and is responsible for
173    //       deleting it.
174    XMLCh *replace(const char* const matchString, const char* const replaceString);
175    XMLCh *replace(const char* const matchString, const char* const replaceString,
176                   const int start, const int end);
177
178    XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString);
179    XMLCh *replace(const XMLCh* const matchString, const XMLCh* const replaceString,
180                   const int start, const int end);
181
182private:
183    // -----------------------------------------------------------------------
184    //  Private data types
185    // -----------------------------------------------------------------------
186    class XMLUTIL_EXPORT Context : public XMemory
187    {
188        public :
189            Context(MemoryManager* const manager = XMLPlatformUtils::fgMemoryManager);
190            ~Context();
191
192            inline const XMLCh* getString() const { return fString; }
193            void reset(const XMLCh* const string, const int stringLen,
194                       const int start, const int limit, const int noClosures);
195            bool nextCh(XMLInt32& ch, int& offset, const short direction);
196           
197            bool      fAdoptMatch;
198            int       fStart;
199            int       fLimit;
200            int       fLength;    // fLimit - fStart
201            int       fSize;
202            int       fStringMaxLen;
203            int*      fOffsets;
204            Match*    fMatch;
205            XMLCh*    fString;
206            MemoryManager* fMemoryManager;
207
208            friend class Janitor<Context>;
209    };
210
211    // -----------------------------------------------------------------------
212    //  Unimplemented constructors and operators
213    // -----------------------------------------------------------------------
214    RegularExpression(const RegularExpression&);
215    RegularExpression& operator=(const RegularExpression&);
216
217    // -----------------------------------------------------------------------
218    //  Cleanup methods
219    // -----------------------------------------------------------------------
220    void cleanUp();
221
222    // -----------------------------------------------------------------------
223    //  Setter methods
224    // -----------------------------------------------------------------------
225    void setPattern(const XMLCh* const pattern, const XMLCh* const options=0);
226
227    // -----------------------------------------------------------------------
228    //  Private Helper methods
229    // -----------------------------------------------------------------------
230    void prepare();
231    int parseOptions(const XMLCh* const options);
232    bool isSet(const int options, const int flag);
233    unsigned short getWordType(const XMLCh* const target, const int begin,
234                               const int end, const int offset);
235    unsigned short getCharType(const XMLCh ch);
236    unsigned short getPreviousWordType(const XMLCh* const target,
237                                       const int start, const int end,
238                                       int offset);
239
240    /**
241      *    Matching helpers
242      */
243    int match(Context* const context, const Op* const operations, int offset,
244              const short direction);
245    bool matchIgnoreCase(const XMLInt32 ch1, const XMLInt32 ch2);
246
247    /**
248      *    Helper methods used by match(Context* ...)
249      */
250    bool matchChar(Context* const context, const XMLInt32 ch, int& offset,
251                   const short direction, const bool ignoreCase);
252    bool matchDot(Context* const context, int& offset, const short direction);
253    bool matchRange(Context* const context, const Op* const op,
254                    int& offset, const short direction, const bool ignoreCase);
255    bool matchAnchor(Context* const context, const XMLInt32 ch,
256                     const int offset);
257    bool matchBackReference(Context* const context, const XMLInt32 ch,
258                            int& offset, const short direction,
259                            const bool ignoreCase);
260    bool matchString(Context* const context, const XMLCh* const literal,
261                     int& offset, const short direction, const bool ignoreCase);
262    int  matchUnion(Context* const context, const Op* const op, int offset,
263                    const short direction);
264    int matchCapture(Context* const context, const Op* const op, int offset,
265                     const short direction);
266    bool matchCondition(Context* const context, const Op* const op, int offset,
267                        const short direction);
268    int matchModifier(Context* const context, const Op* const op, int offset,
269                      const short direction);
270
271    /**
272     *    Tokenize helper
273     *
274     *    This overloaded tokenize is for internal use only. It provides a way to
275     *    keep track of the sub-expressions in each match of the pattern.
276     *
277     *    It is called by the other tokenize methods, and by the replace method.
278     *    The caller is responsible for the deletion of the returned
279     *    RefArrayVectorOf<XMLCh*>
280     */
281    RefArrayVectorOf<XMLCh> *tokenize(const XMLCh* const matchString,
282                                      const int start, const int end,
283                                      RefVectorOf<Match> *subEx);
284    /**
285     *    Replace helpers
286     *
287     *    Note: the caller owns the XMLCh* that is returned
288     */
289    const XMLCh *subInExp(const XMLCh* const repString,
290                          const XMLCh* const origString,
291                          const Match* subEx);
292    /**
293     *    Converts a token tree into an operation tree
294     */
295    void compile(const Token* const token);
296    Op*  compile(const Token* const token, Op* const next,
297                 const bool reverse);
298    /**
299      *    Helper methods used by compile
300      */
301    Op* compileSingle(const Token* const token, Op* const next,
302                      const unsigned short tokType);
303    Op* compileUnion(const Token* const token, Op* const next,
304                     const bool reverse);
305    Op* compileCondition(const Token* const token, Op* const next,
306                         const bool reverse);
307    Op* compileParenthesis(const Token* const token, Op* const next,
308                           const bool reverse);
309    Op* compileLook(const Token* const token, const Op* const next,
310                    const bool reverse, const unsigned short tokType);
311    Op* compileConcat(const Token* const token, Op* const next,
312                      const bool reverse);
313    Op* compileClosure(const Token* const token, Op* const next,
314                       const bool reverse, const unsigned short tokType);
315
316    // -----------------------------------------------------------------------
317    //  Private data members
318    // -----------------------------------------------------------------------
319    bool               fHasBackReferences;
320    bool               fFixedStringOnly;
321    int                fNoGroups;
322    int                fMinLength;
323    int                fNoClosures;
324    unsigned int       fOptions;
325    BMPattern*         fBMPattern;
326    XMLCh*             fPattern;
327    XMLCh*             fFixedString;
328    Op*                fOperations;
329    Token*             fTokenTree;
330    RangeToken*        fFirstChar;
331    static RangeToken* fWordRange;
332    OpFactory          fOpFactory;
333    XMLMutex           fMutex;
334    TokenFactory*      fTokenFactory;
335    MemoryManager*     fMemoryManager;
336};
337
338
339  // ---------------------------------------------------------------------------
340  //  RegularExpression: Cleanup methods
341  // ---------------------------------------------------------------------------
342  inline void RegularExpression::cleanUp() {
343
344      fMemoryManager->deallocate(fPattern);//delete [] fPattern;
345      fMemoryManager->deallocate(fFixedString);//delete [] fFixedString;     
346      delete fBMPattern;
347      delete fTokenFactory;
348  }
349
350  // ---------------------------------------------------------------------------
351  //  RegularExpression: Helper methods
352  // ---------------------------------------------------------------------------
353  inline bool RegularExpression::isSet(const int options, const int flag) {
354
355      return (options & flag) == flag;
356  }
357
358  inline Op* RegularExpression::compileLook(const Token* const token,
359                                            const Op* const next,
360                                            const bool reverse,
361                                            const unsigned short tokType) {
362
363      Op*    ret = 0;
364      Op*    result = compile(token->getChild(0), 0, reverse);
365
366      switch(tokType) {
367      case Token::T_LOOKAHEAD:
368          ret = fOpFactory.createLookOp(Op::O_LOOKAHEAD, next, result);
369          break;
370      case Token::T_NEGATIVELOOKAHEAD:
371          ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKAHEAD, next, result);
372          break;
373      case Token::T_LOOKBEHIND:
374          ret = fOpFactory.createLookOp(Op::O_LOOKBEHIND, next, result);
375          break;
376      case Token::T_NEGATIVELOOKBEHIND:
377          ret = fOpFactory.createLookOp(Op::O_NEGATIVELOOKBEHIND, next, result);
378          break;
379      case Token::T_INDEPENDENT:
380          ret = fOpFactory.createIndependentOp(next, result);
381          break;
382      case Token::T_MODIFIERGROUP:
383          ret = fOpFactory.createModifierOp(next, result,
384                                     ((ModifierToken *) token)->getOptions(),
385                                     ((ModifierToken *) token)->getOptionsMask());
386          break;
387      }
388
389
390      return ret;
391  }
392
393  inline Op* RegularExpression::compileSingle(const Token* const token,
394                                              Op* const next,
395                                              const unsigned short tokType) {
396
397      Op* ret = 0;
398
399      switch (tokType) {
400      case Token::T_DOT:
401          ret = fOpFactory.createDotOp();
402          break;
403      case Token::T_CHAR:
404          ret = fOpFactory.createCharOp(token->getChar());
405          break;
406      case Token::T_ANCHOR:
407          ret = fOpFactory.createAnchorOp(token->getChar());
408          break;
409      case Token::T_RANGE:
410      case Token::T_NRANGE:
411          ret = fOpFactory.createRangeOp(token);
412          break;
413      case Token::T_EMPTY:
414          ret = next;
415          break;
416      case Token::T_STRING:
417          ret = fOpFactory.createStringOp(token->getString());
418          break;
419      case Token::T_BACKREFERENCE:
420          ret = fOpFactory.createBackReferenceOp(token->getReferenceNo());
421          break;
422      }
423
424      if (tokType != Token::T_EMPTY)
425          ret->setNextOp(next);
426
427      return ret;
428  }
429
430
431  inline Op* RegularExpression::compileUnion(const Token* const token,
432                                             Op* const next,
433                                             const bool reverse) {
434
435      int tokSize = token->size();
436      UnionOp* uniOp = fOpFactory.createUnionOp(tokSize);
437
438      for (int i=0; i<tokSize; i++) {
439
440          uniOp->addElement(compile(token->getChild(i), next, reverse));
441      }
442
443      return uniOp;
444  }
445
446
447  inline Op* RegularExpression::compileCondition(const Token* const token,
448                                                 Op* const next,
449                                                 const bool reverse) {
450
451      Token* condTok = ((ConditionToken*) token)->getConditionToken();
452      Token* yesTok  = token->getChild(0);
453      Token* noTok   = token->getChild(1);
454      int    refNo   = token->getReferenceNo();
455      Op*    condOp  = (condTok == 0) ? 0 : compile(condTok, 0, reverse);
456      Op*    yesOp   = compile(yesTok, next, reverse);
457      Op*    noOp    = (noTok == 0) ? 0 : compile(noTok, next, reverse);
458
459      return fOpFactory.createConditionOp(next, refNo, condOp, yesOp, noOp);
460  }
461
462
463  inline Op* RegularExpression::compileParenthesis(const Token* const token,
464                                                   Op* const next,
465                                                   const bool reverse) {
466
467      if (token->getNoParen() == 0)
468          return compile(token->getChild(0), next, reverse);
469
470      Op* captureOp    = 0;
471
472      if (reverse) {
473
474          captureOp = fOpFactory.createCaptureOp(token->getNoParen(), next);
475          captureOp = compile(token->getChild(0), captureOp, reverse);
476
477          return fOpFactory.createCaptureOp(-token->getNoParen(), captureOp);
478      }
479
480      captureOp = fOpFactory.createCaptureOp(-token->getNoParen(), next);
481      captureOp = compile(token->getChild(0), captureOp, reverse);
482
483      return fOpFactory.createCaptureOp(token->getNoParen(), captureOp);
484  }
485
486  inline Op* RegularExpression::compileConcat(const Token* const token,
487                                              Op*  const next,
488                                              const bool reverse) {
489
490      Op* ret = next;
491      int tokSize = token->size();
492
493      if (!reverse) {
494
495          for (int i= tokSize - 1; i>=0; i--) {
496              ret = compile(token->getChild(i), ret, false);
497          }
498      }
499      else {
500
501          for (int i= 0; i< tokSize; i++) {
502              ret = compile(token->getChild(i), ret, true);
503          }
504      }
505
506      return ret;
507  }
508
509  inline Op* RegularExpression::compileClosure(const Token* const token,
510                                               Op* const next,
511                                               const bool reverse,
512                                               const unsigned short tokType) {
513
514      Op*    ret      = 0;
515      Token* childTok = token->getChild(0);
516      int    min      = token->getMin();
517      int    max      = token->getMax();
518
519      if (min >= 0 && min == max) {
520
521          ret = next;
522          for (int i=0; i< min; i++) {
523              ret = compile(childTok, ret, reverse);
524          }
525
526          return ret;
527      }
528
529      if (min > 0 && max > 0)
530          max -= min;
531
532      if (max > 0) {
533
534          ret = next;
535          for (int i=0; i<max; i++) {
536
537              ChildOp* childOp = fOpFactory.createQuestionOp(
538                  tokType == Token::T_NONGREEDYCLOSURE);
539
540              childOp->setNextOp(next);
541              childOp->setChild(compile(childTok, ret, reverse));
542              ret = childOp;
543          }
544      }
545      else {
546
547          ChildOp* childOp = 0;
548
549          if (tokType == Token::T_NONGREEDYCLOSURE) {
550              childOp = fOpFactory.createNonGreedyClosureOp();
551          }
552          else {
553
554              if (childTok->getMinLength() == 0)
555                  childOp = fOpFactory.createClosureOp(fNoClosures++);
556              else
557                  childOp = fOpFactory.createClosureOp(-1);
558          }
559
560          childOp->setNextOp(next);
561          childOp->setChild(compile(childTok, childOp, reverse));
562          ret = childOp;
563      }
564
565      if (min > 0) {
566
567          for (int i=0; i< min; i++) {
568              ret = compile(childTok, ret, reverse);
569          }
570      }
571
572      return ret;
573  }
574
575  inline int RegularExpression::matchUnion(Context* const context,
576                                           const Op* const op, int offset,
577                                           const short direction)
578  {
579      unsigned int opSize = op->getSize();
580      int ret = -1;
581
582      for (unsigned int i=0; i < opSize; i++) {
583
584          ret = match(context, op->elementAt(i), offset, direction);
585
586          if (ret == context->fLimit)
587              return ret;
588      }
589
590      return -1;
591  }
592
593  inline int RegularExpression::matchModifier(Context* const context,
594                                              const Op* const op, int offset,
595                                              const short direction)
596  {
597      int saveOptions = fOptions;
598      fOptions |= (int) op->getData();
599      fOptions &= (int) ~op->getData2();
600
601      int ret = match(context, op->getChild(), offset, direction);
602
603      fOptions = saveOptions;
604
605      return ret;
606  }
607
608  inline unsigned short RegularExpression::getWordType(const XMLCh* const target
609                                                       , const int begin
610                                                       , const int end
611                                                       , const int offset)
612  {
613      if (offset < begin || offset >= end)
614          return WT_OTHER;
615
616      return getCharType(target[offset]);
617  }
618
619  inline
620  unsigned short RegularExpression::getPreviousWordType(const XMLCh* const target
621                                                        , const int start
622                                                        , const int end
623                                                        , int offset)
624  {
625      unsigned short ret = getWordType(target, start, end, --offset);
626
627      while (ret == WT_IGNORE) {
628          ret = getWordType(target, start, end, --offset);
629      }
630
631      return ret;
632  }
633
634  inline bool RegularExpression::matchIgnoreCase(const XMLInt32 ch1,
635                                               const XMLInt32 ch2)
636{
637
638    return (0==XMLString::compareNIString((XMLCh*)&ch1,(XMLCh*)&ch2, 1));
639}
640
641
642XERCES_CPP_NAMESPACE_END
643
644#endif
645/**
646  * End of file RegularExpression.hpp
647  */
648
Note: See TracBrowser for help on using the repository browser.