ICU 64.2  64.2
regex.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 **********************************************************************
5 * Copyright (C) 2002-2016, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 **********************************************************************
8 * file name: regex.h
9 * encoding: UTF-8
10 * indentation:4
11 *
12 * created on: 2002oct22
13 * created by: Andy Heninger
14 *
15 * ICU Regular Expressions, API for C++
16 */
17 
18 #ifndef REGEX_H
19 #define REGEX_H
20 
21 //#define REGEX_DEBUG
22 
45 #include "unicode/utypes.h"
46 
47 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
48 
49 #include "unicode/uobject.h"
50 #include "unicode/unistr.h"
51 #include "unicode/utext.h"
52 #include "unicode/parseerr.h"
53 
54 #include "unicode/uregex.h"
55 
56 // Forward Declarations
57 
58 struct UHashtable;
59 
61 
62 struct Regex8BitSet;
63 class RegexCImpl;
64 class RegexMatcher;
65 class RegexPattern;
66 struct REStackFrame;
67 class RuleBasedBreakIterator;
68 class UnicodeSet;
69 class UVector;
70 class UVector32;
71 class UVector64;
72 
73 
86 public:
87 
95  RegexPattern();
96 
103  RegexPattern(const RegexPattern &source);
104 
110  virtual ~RegexPattern();
111 
120  UBool operator==(const RegexPattern& that) const;
121 
// Inequality comparison: returns the logical negation of operator==(that).
130  inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}
131 
137  RegexPattern &operator =(const RegexPattern &source);
138 
146  virtual RegexPattern *clone() const;
147 
148 
173  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
174  UParseError &pe,
175  UErrorCode &status);
176 
203  static RegexPattern * U_EXPORT2 compile( UText *regex,
204  UParseError &pe,
205  UErrorCode &status);
206 
231  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
232  uint32_t flags,
233  UParseError &pe,
234  UErrorCode &status);
235 
262  static RegexPattern * U_EXPORT2 compile( UText *regex,
263  uint32_t flags,
264  UParseError &pe,
265  UErrorCode &status);
266 
289  static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
290  uint32_t flags,
291  UErrorCode &status);
292 
317  static RegexPattern * U_EXPORT2 compile( UText *regex,
318  uint32_t flags,
319  UErrorCode &status);
320 
326  virtual uint32_t flags() const;
327 
345  virtual RegexMatcher *matcher(const UnicodeString &input,
346  UErrorCode &status) const;
347 
348 private:
361  RegexMatcher *matcher(const char16_t *input,
362  UErrorCode &status) const;
363 public:
364 
365 
377  virtual RegexMatcher *matcher(UErrorCode &status) const;
378 
379 
394  static UBool U_EXPORT2 matches(const UnicodeString &regex,
395  const UnicodeString &input,
396  UParseError &pe,
397  UErrorCode &status);
398 
413  static UBool U_EXPORT2 matches(UText *regex,
414  UText *input,
415  UParseError &pe,
416  UErrorCode &status);
417 
426  virtual UnicodeString pattern() const;
427 
428 
439  virtual UText *patternText(UErrorCode &status) const;
440 
441 
455  virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
456 
457 
474  virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
475 
476 
515  virtual int32_t split(const UnicodeString &input,
516  UnicodeString dest[],
517  int32_t destCapacity,
518  UErrorCode &status) const;
519 
520 
559  virtual int32_t split(UText *input,
560  UText *dest[],
561  int32_t destCapacity,
562  UErrorCode &status) const;
563 
564 
570  virtual UClassID getDynamicClassID() const;
571 
577  static UClassID U_EXPORT2 getStaticClassID();
578 
579 private:
580  //
581  // Implementation Data
582  //
583  UText *fPattern; // The original pattern string.
584  UnicodeString *fPatternString; // The original pattern UnicodeString, if relevant.
585  uint32_t fFlags; // The flags used when compiling the pattern.
586  //
587  UVector64 *fCompiledPat; // The compiled pattern p-code.
588  UnicodeString fLiteralText; // Any literal string data from the pattern,
589  // after un-escaping, for use during the match.
590 
591  UVector *fSets; // Any UnicodeSets referenced from the pattern.
592  Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
593 
594 
595  UErrorCode fDeferredStatus; // status if some prior error has left this
596  // RegexPattern in an unusable state.
597 
598  int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
599  // >= this value. For some patterns, this calculated
600  // value may be less than the true shortest
601  // possible match.
602 
603  int32_t fFrameSize; // Size of a state stack frame in the
604  // execution engine.
605 
606  int32_t fDataSize; // The size of the data needed by the pattern that
607  // does not go on the state stack, but has just
608  // a single copy per matcher.
609 
610  UVector32 *fGroupMap; // Map from capture group number to position of
611  // the group's variables in the matcher stack frame.
612 
613  UnicodeSet **fStaticSets; // Ptr to static (shared) sets for predefined
614  // regex character classes, e.g. Word.
615 
616  Regex8BitSet *fStaticSets8; // Ptr to the static (shared) latin-1 only
617  // sets for predefined regex classes.
618 
619  int32_t fStartType; // Info on how a match must start.
620  int32_t fInitialStringIdx; //
621  int32_t fInitialStringLen;
622  UnicodeSet *fInitialChars;
623  UChar32 fInitialChar;
624  Regex8BitSet *fInitialChars8;
625  UBool fNeedsAltInput;
626 
627  UHashtable *fNamedCaptureMap; // Map from capture group names to numbers.
628 
629  friend class RegexCompile;
630  friend class RegexMatcher;
631  friend class RegexCImpl;
632 
633  //
634  // Implementation Methods
635  //
636  void init(); // Common initialization, for use by constructors.
637  void zap(); // Common cleanup
638 
639  void dumpOp(int32_t index) const;
640 
641  public:
642 #ifndef U_HIDE_INTERNAL_API
643 
647  void dumpPattern() const;
648 #endif /* U_HIDE_INTERNAL_API */
649 };
650 
651 
652 
663 public:
664 
678  RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
679 
694  RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
695 
716  RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
717  uint32_t flags, UErrorCode &status);
718 
739  RegexMatcher(UText *regexp, UText *input,
740  uint32_t flags, UErrorCode &status);
741 
742 private:
754  RegexMatcher(const UnicodeString &regexp, const char16_t *input,
755  uint32_t flags, UErrorCode &status);
756 public:
757 
758 
764  virtual ~RegexMatcher();
765 
766 
773  virtual UBool matches(UErrorCode &status);
774 
775 
786  virtual UBool matches(int64_t startIndex, UErrorCode &status);
787 
788 
802  virtual UBool lookingAt(UErrorCode &status);
803 
804 
818  virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
819 
820 
833  virtual UBool find();
834 
835 
850  virtual UBool find(UErrorCode &status);
851 
861  virtual UBool find(int64_t start, UErrorCode &status);
862 
863 
873  virtual UnicodeString group(UErrorCode &status) const;
874 
875 
893  virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
894 
900  virtual int32_t groupCount() const;
901 
902 
917  virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
918 
939  virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
940 
948  virtual int32_t start(UErrorCode &status) const;
949 
957  virtual int64_t start64(UErrorCode &status) const;
958 
959 
973  virtual int32_t start(int32_t group, UErrorCode &status) const;
974 
988  virtual int64_t start64(int32_t group, UErrorCode &status) const;
989 
1003  virtual int32_t end(UErrorCode &status) const;
1004 
1018  virtual int64_t end64(UErrorCode &status) const;
1019 
1020 
1038  virtual int32_t end(int32_t group, UErrorCode &status) const;
1039 
1057  virtual int64_t end64(int32_t group, UErrorCode &status) const;
1058 
1067  virtual RegexMatcher &reset();
1068 
1069 
1085  virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1086 
1087 
1105  virtual RegexMatcher &reset(const UnicodeString &input);
1106 
1107 
1121  virtual RegexMatcher &reset(UText *input);
1122 
1123 
1148  virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1149 
1150 private:
1163  RegexMatcher &reset(const char16_t *input);
1164 public:
1165 
1173  virtual const UnicodeString &input() const;
1174 
1183  virtual UText *inputText() const;
1184 
1195  virtual UText *getInput(UText *dest, UErrorCode &status) const;
1196 
1197 
1216  virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1217 
1229  virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1230 
1239  virtual int32_t regionStart() const;
1240 
1249  virtual int64_t regionStart64() const;
1250 
1251 
1260  virtual int32_t regionEnd() const;
1261 
1270  virtual int64_t regionEnd64() const;
1271 
1280  virtual UBool hasTransparentBounds() const;
1281 
1300  virtual RegexMatcher &useTransparentBounds(UBool b);
1301 
1302 
1310  virtual UBool hasAnchoringBounds() const;
1311 
1312 
1325  virtual RegexMatcher &useAnchoringBounds(UBool b);
1326 
1327 
1340  virtual UBool hitEnd() const;
1341 
1351  virtual UBool requireEnd() const;
1352 
1353 
1359  virtual const RegexPattern &pattern() const;
1360 
1361 
1378  virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1379 
1380 
1401  virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1402 
1403 
1424  virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1425 
1426 
1451  virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1452 
1453 
1481  virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1482  const UnicodeString &replacement, UErrorCode &status);
1483 
1484 
1512  virtual RegexMatcher &appendReplacement(UText *dest,
1513  UText *replacement, UErrorCode &status);
1514 
1515 
1526  virtual UnicodeString &appendTail(UnicodeString &dest);
1527 
1528 
1542  virtual UText *appendTail(UText *dest, UErrorCode &status);
1543 
1544 
1568  virtual int32_t split(const UnicodeString &input,
1569  UnicodeString dest[],
1570  int32_t destCapacity,
1571  UErrorCode &status);
1572 
1573 
1597  virtual int32_t split(UText *input,
1598  UText *dest[],
1599  int32_t destCapacity,
1600  UErrorCode &status);
1601 
1623  virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1624 
1631  virtual int32_t getTimeLimit() const;
1632 
1654  virtual void setStackLimit(int32_t limit, UErrorCode &status);
1655 
1663  virtual int32_t getStackLimit() const;
1664 
1665 
1679  virtual void setMatchCallback(URegexMatchCallback *callback,
1680  const void *context,
1681  UErrorCode &status);
1682 
1683 
1694  virtual void getMatchCallback(URegexMatchCallback *&callback,
1695  const void *&context,
1696  UErrorCode &status);
1697 
1698 
1712  virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
1713  const void *context,
1714  UErrorCode &status);
1715 
1716 
1727  virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
1728  const void *&context,
1729  UErrorCode &status);
1730 
1731 #ifndef U_HIDE_INTERNAL_API
1732 
1737  void setTrace(UBool state);
1738 #endif /* U_HIDE_INTERNAL_API */
1739 
1745  static UClassID U_EXPORT2 getStaticClassID();
1746 
1752  virtual UClassID getDynamicClassID() const;
1753 
1754 private:
1755  // Constructors and other object boilerplate are private.
1756  // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1757  RegexMatcher(); // default constructor not implemented
1758  RegexMatcher(const RegexPattern *pat);
1759  RegexMatcher(const RegexMatcher &other);
1760  RegexMatcher &operator =(const RegexMatcher &rhs);
1761  void init(UErrorCode &status); // Common initialization
1762  void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
1763 
1764  friend class RegexPattern;
1765  friend class RegexCImpl;
1766 public:
1767 #ifndef U_HIDE_INTERNAL_API
1768 
1769  void resetPreserveRegion(); // Reset matcher state, but preserve any region.
1770 #endif /* U_HIDE_INTERNAL_API */
1771 private:
1772 
1773  //
1774  // MatchAt This is the internal interface to the match engine itself.
1775  // Match status comes back in matcher member variables.
1776  //
1777  void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1778  inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
1779  UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
1780  UBool isUWordBoundary(int64_t pos); // perform RBBI based \b test
1781  REStackFrame *resetStack();
1782  inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1783  void IncrementTime(UErrorCode &status);
1784 
1785  // Call user find callback function, if set. Return TRUE if operation should be interrupted.
1786  inline UBool findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
1787 
1788  int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1789 
1790  UBool findUsingChunk(UErrorCode &status);
1791  void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1792  UBool isChunkWordBoundary(int32_t pos);
1793 
1794  const RegexPattern *fPattern;
1795  RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
1796  // should delete it when through.
1797 
1798  const UnicodeString *fInput; // The string being matched. Only used for input()
1799  UText *fInputText; // The text being matched. Is never NULL.
1800  UText *fAltInputText; // A shallow copy of the text being matched.
1801  // Only created if the pattern contains backreferences.
1802  int64_t fInputLength; // Full length of the input text.
1803  int32_t fFrameSize; // The size of a frame in the backtrack stack.
1804 
1805  int64_t fRegionStart; // Start of the input region, default = 0.
1806  int64_t fRegionLimit; // End of input region, default to input.length.
1807 
1808  int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
1809  int64_t fAnchorLimit; // See useAnchoringBounds
1810 
1811  int64_t fLookStart; // Region bounds for look-ahead/behind and
1812  int64_t fLookLimit; // other boundary tests. See
1813  // useTransparentBounds
1814 
1815  int64_t fActiveStart; // Currently active bounds for matching.
1816  int64_t fActiveLimit; // Usually is the same as region, but
1817  // is changed to fLookStart/Limit when
1818  // entering look around regions.
1819 
1820  UBool fTransparentBounds; // True if using transparent bounds.
1821  UBool fAnchoringBounds; // True if using anchoring bounds.
1822 
1823  UBool fMatch; // True if the last attempted match was successful.
1824  int64_t fMatchStart; // Position of the start of the most recent match
1825  int64_t fMatchEnd; // First position after the end of the most recent match
1826  // Zero if no previous match, even when a region
1827  // is active.
1828  int64_t fLastMatchEnd; // First position after the end of the previous match,
1829  // or -1 if there was no previous match.
1830  int64_t fAppendPosition; // First position after the end of the previous
1831  // appendReplacement(). As described by the
1832  // JavaDoc for Java Matcher, where it is called
1833  // "append position"
1834  UBool fHitEnd; // True if the last match touched the end of input.
1835  UBool fRequireEnd; // True if the last match required end-of-input
1836  // (matched $ or Z)
1837 
1838  UVector64 *fStack;
1839  REStackFrame *fFrame; // After finding a match, the last active stack frame,
1840  // which will contain the capture group results.
1841  // NOT valid while match engine is running.
1842 
1843  int64_t *fData; // Data area for use by the compiled pattern.
1844  int64_t fSmallData[8]; // Use this for data if it's enough.
1845 
1846  int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
1847  // match engine run. Zero for unlimited.
1848 
1849  int32_t fTime; // Match time, accumulates while matching.
1850  int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
1851  // Kept separately from fTime to keep as much
1852  // code as possible out of the inline
1853  // StateSave function.
1854 
1855  int32_t fStackLimit; // Maximum memory size to use for the backtrack
1856  // stack, in bytes. Zero for unlimited.
1857 
1858  URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
1859  // NULL if there is no callback.
1860  const void *fCallbackContext; // User Context ptr for callback function.
1861 
1862  URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback function.
1863  // NULL if there is no callback.
1864  const void *fFindProgressCallbackContext; // User context pointer for the find progress callback function.
1865 
1866 
1867  UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1868 
1869  UBool fTraceDebug; // Set true for debug tracing of match engine.
1870 
1871  UErrorCode fDeferredStatus; // Save error state that cannot be immediately
1872  // reported, or that permanently disables this matcher.
1873 
1874  RuleBasedBreakIterator *fWordBreakItr;
1875 };
1876 
1878 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
1879 #endif
struct UHashtable UHashtable
Definition: msgfmt.h:41
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:93
Class RegexPattern represents a compiled regular expression.
Definition: regex.h:85
UBool URegexFindProgressCallback(const void *context, int64_t matchIndex)
Function pointer for a regular expression find callback function.
Definition: uregex.h:1573
C API: Abstract Unicode Text API.
class RegexMatcher bundles together a regular expression pattern and input text to which the expressi...
Definition: regex.h:662
UBool operator!=(const RegexPattern &that) const
Comparison operator.
Definition: regex.h:130
#define U_I18N_API
Set to export library symbols from inside the i18n library, and to import them from outside...
Definition: utypes.h:301
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:137
C API: Regular Expressions.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:389
virtual UClassID getDynamicClassID() const
ICU4C "poor man's RTTI", returns a UClassID for the actual ICU class.
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:277
C++ API: Common ICU base class UObject.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:138
UBool URegexMatchCallback(const void *context, int32_t steps)
Function pointer for a regular expression matching callback function.
Definition: uregex.h:1499
C API: Parse Error Information.
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:401
#define U_FINAL
Defined to the C++11 "final" keyword if available.
Definition: umachine.h:140
UText struct.
Definition: utext.h:1345
A subclass of BreakIterator whose behavior is specified using a list of rules.
Definition: rbbi.h:53
A UParseError struct is used to return detailed information about parsing errors.
Definition: parseerr.h:58
Basic definitions for ICU, for both C and C++ APIs.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:289
UObject is the common ICU "boilerplate" class.
Definition: uobject.h:223
int8_t UBool
The ICU boolean type.
Definition: umachine.h:225