regex.h

Go to the documentation of this file.
00001 /*
00002 **********************************************************************
00003 *   Copyright (C) 2002-2010, International Business Machines
00004 *   Corporation and others.  All Rights Reserved.
00005 **********************************************************************
00006 *   file name:  regex.h
00007 *   encoding:   US-ASCII
00008 *   indentation:4
00009 *
00010 *   created on: 2002oct22
00011 *   created by: Andy Heninger
00012 *
00013 *   ICU Regular Expressions, API for C++
00014 */
00015 
00016 #ifndef REGEX_H
00017 #define REGEX_H
00018 
00019 //#define REGEX_DEBUG
00020 
00045 #include "unicode/utypes.h"
00046 
00047 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
00048 
00049 #include "unicode/uobject.h"
00050 #include "unicode/unistr.h"
00051 #include "unicode/utext.h"
00052 #include "unicode/parseerr.h"
00053 
00054 #include "unicode/uregex.h"
00055 
00056 U_NAMESPACE_BEGIN
00057 
00058 
00059 // Forward Declarations...
00060 
00061 class RegexMatcher;
00062 class RegexPattern;
00063 class UVector;
00064 class UVector32;
00065 class UVector64;
00066 class UnicodeSet;
00067 struct REStackFrame;
00068 struct Regex8BitSet;
00069 class  RuleBasedBreakIterator;
00070 class  RegexCImpl;
00071 
00072 
00073 
00074 
00079 #ifdef REGEX_DEBUG
00080 U_INTERNAL void U_EXPORT2
00081     RegexPatternDump(const RegexPattern *pat);
00082 #else   // REGEX_DEBUG not defined: RegexPatternDump(pat) is an empty macro, so calls expand to nothing.
00083     #undef RegexPatternDump
00084     #define RegexPatternDump(pat)
00085 #endif  // REGEX_DEBUG
00086 
00087 
00088 
00100 class U_I18N_API RegexPattern: public UObject {
00101 public:
00102     // Construction, destruction, comparison, assignment and cloning.
00110     RegexPattern();
00111 
00118     RegexPattern(const RegexPattern &source);
00119 
00125     virtual ~RegexPattern();
00126 
00135     UBool           operator==(const RegexPattern& that) const;
00136 
00145     inline UBool    operator!=(const RegexPattern& that) const {return ! operator ==(that);};  // logical negation of operator==
00146 
00152     RegexPattern  &operator =(const RegexPattern &source);
00153 
00161     virtual RegexPattern  *clone() const;
00162 
00163     // Static compile() factories.  The overloads differ in the pattern type (UnicodeString or UText) and in whether flags and a UParseError are supplied.
00188     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00189         UParseError          &pe,
00190         UErrorCode           &status);
00191 
00192 
00219     static RegexPattern * U_EXPORT2 compile( UText *regex,
00220         UParseError          &pe,
00221         UErrorCode           &status);
00222 
00247     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00248         uint32_t             flags,
00249         UParseError          &pe,
00250         UErrorCode           &status);
00251         
00252         
00279     static RegexPattern * U_EXPORT2 compile( UText *regex,
00280         uint32_t             flags,
00281         UParseError          &pe,
00282         UErrorCode           &status);
00283     
00284 
00307     static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
00308         uint32_t             flags,
00309         UErrorCode           &status);
00310 
00311 
00336     static RegexPattern * U_EXPORT2 compile( UText *regex,
00337         uint32_t             flags,
00338         UErrorCode           &status);
00339     
00340 
00346     virtual uint32_t flags() const;
00347 
00365     virtual RegexMatcher *matcher(const UnicodeString &input,
00366         UErrorCode          &status) const;
00367         
00368         
00373     enum PatternIsUTextFlag { PATTERN_IS_UTEXT };
00374     // PatternIsUTextFlag is the type of the 'flag' argument of the UText matcher() overload below.  NOTE(review): presumably an overload-disambiguation tag - confirm.
00394     virtual RegexMatcher *matcher(UText *input,
00395         PatternIsUTextFlag      flag, 
00396         UErrorCode          &status) const;
00397 
00398 private:    // The UChar* overload below is private, i.e. not part of the public interface.  NOTE(review): presumably to keep a bare UChar* from being passed as input via implicit conversion - confirm.
00412     RegexMatcher *matcher(const UChar *input,
00413         UErrorCode          &status) const;
00414 public:
00415 
00416 
00428     virtual RegexMatcher *matcher(UErrorCode  &status) const;
00429 
00430 
00445     static UBool U_EXPORT2 matches(const UnicodeString   &regex,
00446         const UnicodeString   &input,
00447               UParseError     &pe,
00448               UErrorCode      &status);
00449 
00450 
00465     static UBool U_EXPORT2 matches(UText *regex,
00466         UText           *input,
00467         UParseError     &pe,
00468         UErrorCode      &status);
00469 
00470 
00479     virtual UnicodeString pattern() const;
00480     
00481     
00492     virtual UText *patternText() const;
00493 
00494 
00520     virtual int32_t  split(const UnicodeString &input,
00521         UnicodeString    dest[],
00522         int32_t          destCapacity,
00523         UErrorCode       &status) const;
00524 
00525 
00551     virtual int32_t  split(UText *input,
00552         UText            *dest[],
00553         int32_t          destCapacity,
00554         UErrorCode       &status) const;
00555 
00556 
00562     virtual UClassID getDynamicClassID() const;
00563 
00569     static UClassID U_EXPORT2 getStaticClassID();
00570 
00571 private:
00572     //
00573     //  Implementation Data
00574     //
00575     UText          *fPattern;      // The original pattern string.
00576     UnicodeString  *fPatternString; // The original pattern UnicodeString if relevant
00577     uint32_t        fFlags;        // The flags used when compiling the pattern.
00578                                    //
00579     UVector64       *fCompiledPat; // The compiled pattern p-code.
00580     UnicodeString   fLiteralText;  // Any literal string data from the pattern,
00581                                    //   after un-escaping, for use during the match.
00582 
00583     UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
00584     Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
00585 
00586 
00587     UErrorCode      fDeferredStatus; // status if some prior error has left this
00588                                    //  RegexPattern in an unusable state.
00589 
00590     int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
00591                                    //   >= this value.  For some patterns, this calculated
00592                                    //   value may be less than the true shortest
00593                                    //   possible match.
00594     
00595     int32_t         fFrameSize;    // Size of a state stack frame in the
00596                                    //   execution engine.
00597 
00598     int32_t         fDataSize;     // The size of the data needed by the pattern that
00599                                    //   does not go on the state stack, but has just
00600                                    //   a single copy per matcher.
00601 
00602     UVector32       *fGroupMap;    // Map from capture group number to position of
00603                                    //   the group's variables in the matcher stack frame.
00604 
00605     int32_t         fMaxCaptureDigits;
00606 
00607     UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
00608                                    //   regex character classes, e.g. Word.
00609 
00610     Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
00611                                    //  sets for predefined regex classes.
00612 
00613     int32_t         fStartType;    // Info on how a match must start.
00614     int32_t         fInitialStringIdx;     //
00615     int32_t         fInitialStringLen;
00616     UnicodeSet     *fInitialChars;
00617     UChar32         fInitialChar;
00618     Regex8BitSet   *fInitialChars8;
00619     UBool           fNeedsAltInput;
00620 
00621     friend class RegexCompile;
00622     friend class RegexMatcher;
00623     friend class RegexCImpl;
00624 
00625     //
00626     //  Implementation Methods
00627     //
00628     void        init();            // Common initialization, for use by constructors.
00629     void        zap();             // Common cleanup
00630 #ifdef REGEX_DEBUG
00631     void        dumpOp(int32_t index) const;
00632     friend     void U_EXPORT2 RegexPatternDump(const RegexPattern *);
00633 #endif
00634 
00635 };
00636 
00637 
00638 
00648 class U_I18N_API RegexMatcher: public UObject {
00649 public:
00650     // Public constructors: each takes the regexp (UnicodeString or UText) and flags; two overloads also take the input.
00665     RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
00666 
00682     RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
00683     
00705     RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
00706         uint32_t flags, UErrorCode &status);
00707 
00729     RegexMatcher(UText *regexp, UText *input,
00730         uint32_t flags, UErrorCode &status);
00731 
00732 private:    // The UChar* input overload below is private, i.e. not part of the public interface.  NOTE(review): presumably to block implicit UChar* -> UnicodeString conversion of the input - confirm.
00746     RegexMatcher(const UnicodeString &regexp, const UChar *input,
00747         uint32_t flags, UErrorCode &status);
00748 public:
00749 
00750 
00756     virtual ~RegexMatcher();
00757 
00758 
00765     virtual UBool matches(UErrorCode &status);
00766 
00767 
00778     virtual UBool matches(int32_t startIndex, UErrorCode &status);
00779 
00780 
00794     virtual UBool lookingAt(UErrorCode &status);
00795 
00796 
00810     virtual UBool lookingAt(int32_t startIndex, UErrorCode &status);
00811 
00812 
00825     virtual UBool find();
00826 
00827 
00837     virtual UBool find(int32_t start, UErrorCode &status);
00838 
00839 
00849     virtual UnicodeString group(UErrorCode &status) const;
00850 
00851 
00856     enum MatcherDestIsUTextFlag { MATCHER_DEST_IS_UTEXT };
00857     // MatcherDestIsUTextFlag is the type of the 'flag' argument of the group(UText *dest, ...) overload below.  NOTE(review): presumably an overload-disambiguation tag - confirm.
00873     virtual UText *group(UText *dest, MatcherDestIsUTextFlag flag, UErrorCode &status) const;
00874 
00875 
00888     virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
00889 
00890 
00906     virtual UText *group(int32_t groupNum, UText *dest, UErrorCode &status) const;
00907 
00908 
00914     virtual int32_t groupCount() const;
00915 
00916 
00924     virtual int32_t start(UErrorCode &status) const;
00925 
00926 
00940     virtual int32_t start(int32_t group, UErrorCode &status) const;
00941 
00942 
00952     virtual int32_t end(UErrorCode &status) const;
00953 
00954 
00968     virtual int32_t end(int32_t group, UErrorCode &status) const;
00969 
00970 
00979     virtual RegexMatcher &reset();
00980 
00981 
00997     virtual RegexMatcher &reset(int32_t index, UErrorCode &status);
00998 
00999 
01017     virtual RegexMatcher &reset(const UnicodeString &input);
01018 
01019 
01033     virtual RegexMatcher &reset(UText *input);
01034 
01035 private:    // The UChar* overload of reset() below is private, i.e. not part of the public interface.
01049     RegexMatcher &reset(const UChar *input);
01050 public:
01051 
01059     virtual const UnicodeString &input() const;
01060     
01069     virtual UText *inputText() const;
01070     
01080     virtual UText *getInput(UText *dest) const;
01081     
01082 
01101      virtual RegexMatcher &region(int32_t start, int32_t limit, UErrorCode &status);
01102 
01103 
01112      virtual int32_t regionStart() const;
01113 
01114 
01123       virtual int32_t regionEnd() const;
01124 
01133       virtual UBool hasTransparentBounds() const;
01134 
01153       virtual RegexMatcher &useTransparentBounds(UBool b);
01154 
01155      
01163       virtual UBool hasAnchoringBounds() const;
01164 
01165 
01178       virtual RegexMatcher &useAnchoringBounds(UBool b);
01179 
01180 
01193       virtual UBool hitEnd() const;
01194 
01204       virtual UBool requireEnd() const;
01205 
01206 
01212     virtual const RegexPattern &pattern() const;
01213 
01214 
01231     virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
01232 
01233 
01254     virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
01255     
01256 
01277     virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
01278     
01279 
01304     virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
01305     
01306     
01334     virtual RegexMatcher &appendReplacement(UnicodeString &dest,
01335         const UnicodeString &replacement, UErrorCode &status);
01336     
01337     
01365     virtual RegexMatcher &appendReplacement(UText *dest,
01366         UText *replacement, UErrorCode &status);
01367 
01368 
01379     virtual UnicodeString &appendTail(UnicodeString &dest);
01380 
01381 
01394     virtual UText *appendTail(UText *dest);
01395 
01396 
01420     virtual int32_t  split(const UnicodeString &input,
01421         UnicodeString    dest[],
01422         int32_t          destCapacity,
01423         UErrorCode       &status);
01424 
01425 
01449     virtual int32_t  split(UText *input,
01450         UText           *dest[],
01451         int32_t          destCapacity,
01452         UErrorCode       &status);
01453     
01475     virtual void setTimeLimit(int32_t limit, UErrorCode &status);
01476 
01483     virtual int32_t getTimeLimit() const;
01484 
01506     virtual void setStackLimit(int32_t  limit, UErrorCode &status);
01507     
01515     virtual int32_t  getStackLimit() const;
01516 
01517 
01531     virtual void setMatchCallback(URegexMatchCallback     *callback,
01532                                   const void              *context,
01533                                   UErrorCode              &status);
01534 
01535 
01546     virtual void getMatchCallback(URegexMatchCallback     *&callback,
01547                                   const void              *&context,
01548                                   UErrorCode              &status);
01549 
01550 
01556     void setTrace(UBool state);
01557 
01558 
01564     static UClassID U_EXPORT2 getStaticClassID();
01565 
01571     virtual UClassID getDynamicClassID() const;
01572 
01573 private:
01574     // Constructors and other object boilerplate are private.
01575     // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
01576     RegexMatcher();                  // default constructor not implemented
01577     RegexMatcher(const RegexPattern *pat);
01578     RegexMatcher(const RegexMatcher &other);
01579     RegexMatcher &operator =(const RegexMatcher &rhs);
01580     void init(UErrorCode &status);                      // Common initialization
01581     void init2(UText *t, UErrorCode &e);  // Common initialization, part 2.
01582 
01583     friend class RegexPattern;
01584     friend class RegexCImpl;
01585 public:     // resetPreserveRegion() is declared public even though it sits between two private sections.
01587     void resetPreserveRegion();  // Reset matcher state, but preserve any region.
01588 private:
01589 
01590     //
01591     //  MatchAt   This is the internal interface to the match engine itself.
01592     //            Match status comes back in matcher member variables.
01593     //
01594     void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
01595     inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);
01596     UBool                isWordBoundary(int64_t pos);         // perform Perl-like  \b test
01597     UBool                isUWordBoundary(int64_t pos);        // perform RBBI based \b test
01598     REStackFrame        *resetStack();
01599     inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
01600     void                 IncrementTime(UErrorCode &status);
01601     
01602     int64_t              appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
01603     
01604     UBool                findUsingChunk();
01605     void                 MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
01606     UBool                isChunkWordBoundary(int32_t pos);
01607 
01608     const RegexPattern  *fPattern;
01609     RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
01610                                            //   should delete it when through.
01611 
01612     const UnicodeString *fInput;           // The string being matched. Only used for input()
01613     UText               *fInputText;       // The text being matched. Is never NULL.
01614     UText               *fAltInputText;    // A shallow copy of the text being matched.
01615                                            //   Only created if the pattern contains backreferences.
01616     int64_t              fInputLength;     // Full length of the input text.
01617     int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
01618     
01619     int64_t              fRegionStart;     // Start of the input region, default = 0.
01620     int64_t              fRegionLimit;     // End of input region, default to input.length.
01621     
01622     int64_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
01623     int64_t              fAnchorLimit;     //   See useAnchoringBounds
01624     
01625     int64_t              fLookStart;       // Region bounds for look-ahead/behind and
01626     int64_t              fLookLimit;       //   other boundary tests.  See
01627                                            //   useTransparentBounds
01628 
01629     int64_t              fActiveStart;     // Currently active bounds for matching.
01630     int64_t              fActiveLimit;     //   Usually is the same as region, but
01631                                            //   is changed to fLookStart/Limit when
01632                                            //   entering look around regions.
01633 
01634     UBool                fTransparentBounds;  // True if using transparent bounds.
01635     UBool                fAnchoringBounds; // True if using anchoring bounds.
01636 
01637     UBool                fMatch;           // True if the last attempted match was successful.
01638     int64_t              fMatchStart;      // Position of the start of the most recent match
01639     int64_t              fMatchEnd;        // First position after the end of the most recent match
01640                                            //   Zero if no previous match, even when a region
01641                                            //   is active.
01642     int64_t              fLastMatchEnd;    // First position after the end of the previous match,
01643                                            //   or -1 if there was no previous match.
01644     int64_t              fAppendPosition;  // First position after the end of the previous
01645                                            //   appendReplacement().  As described by the
01646                                            //   JavaDoc for Java Matcher, where it is called 
01647                                            //   "append position"
01648     UBool                fHitEnd;          // True if the last match touched the end of input.
01649     UBool                fRequireEnd;      // True if the last match required end-of-input
01650                                            //    (matched $ or Z)
01651 
01652     UVector64           *fStack;           // NOTE(review): presumably the backtrack stack (cf. fFrameSize, fStackLimit) - confirm.
01653     REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
01654                                            //   which will contain the capture group results.
01655                                            //   NOT valid while match engine is running.
01656 
01657     int64_t             *fData;            // Data area for use by the compiled pattern.
01658     int64_t             fSmallData[8];     //   Use this for data if it's enough.
01659 
01660     int32_t             fTimeLimit;        // Max time (in arbitrary steps) to let the
01661                                            //   match engine run.  Zero for unlimited.
01662     
01663     int32_t             fTime;             // Match time, accumulates while matching.
01664     int32_t             fTickCounter;      // Low bits counter for time.  Counts down StateSaves.
01665                                            //   Kept separately from fTime to keep as much
01666                                            //   code as possible out of the inline
01667                                            //   StateSave function.
01668 
01669     int32_t             fStackLimit;       // Maximum memory size to use for the backtrack
01670                                            //   stack, in bytes.  Zero for unlimited.
01671 
01672     URegexMatchCallback *fCallbackFn;       // Pointer to match progress callback function.
01673                                            //   NULL if there is no callback.
01674     const void         *fCallbackContext;  // User Context ptr for callback function.
01675 
01676     UBool               fInputUniStrMaybeMutable;  // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
01677 
01678     UBool               fTraceDebug;       // Set true for debug tracing of match engine.
01679 
01680     UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
01681                                            //   reported, or that permanently disables this matcher.
01682 
01683     RuleBasedBreakIterator  *fWordBreakItr; // NOTE(review): presumably used by isUWordBoundary() for the RBBI based \b test - confirm.
01684 
01685 
01686 };
01687 
01688 U_NAMESPACE_END
01689 #endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
01690 #endif

Generated on Tue Apr 27 15:10:34 2010 for ICU 4.4.1 by  doxygen 1.4.7