ICU 64.2  64.2
uniset.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 ***************************************************************************
5 * Copyright (C) 1999-2016, International Business Machines Corporation
6 * and others. All Rights Reserved.
7 ***************************************************************************
8 * Date Name Description
9 * 10/20/99 alan Creation.
10 ***************************************************************************
11 */
12 
13 #ifndef UNICODESET_H
14 #define UNICODESET_H
15 
16 #include "unicode/ucpmap.h"
17 #include "unicode/unifilt.h"
18 #include "unicode/unistr.h"
19 #include "unicode/uset.h"
20 
27 
28 // Forward Declarations.
29 class BMPSet;
30 class ParsePosition;
31 class RBBIRuleScanner;
32 class SymbolTable;
33 class UnicodeSetStringSpan;
34 class UVector;
35 class RuleCharacterIterator;
36 
278 private:
283  static constexpr int32_t INITIAL_CAPACITY = 25;
284  // fFlags constant
285  static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid)
286 
287  UChar32* list = stackList; // MUST be terminated with HIGH
288  int32_t capacity = INITIAL_CAPACITY; // capacity of list
289  int32_t len = 1; // length of list used; 1 <= len <= capacity
290  uint8_t fFlags = 0; // Bit flag (see constants above)
291 
292  BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL.
293  UChar32* buffer = nullptr; // internal buffer, may be NULL
294  int32_t bufferCapacity = 0; // capacity of buffer
295 
305  char16_t *pat = nullptr;
306  int32_t patLen = 0;
307 
308  UVector* strings = nullptr; // maintained in sorted order
309  UnicodeSetStringSpan *stringSpan = nullptr;
310 
316  UChar32 stackList[INITIAL_CAPACITY];
317 
318 public:
328  inline UBool isBogus(void) const;
329 
346  void setToBogus();
347 
348 public:
349 
350  enum {
355  MIN_VALUE = 0,
356 
361  MAX_VALUE = 0x10ffff
362  };
363 
364  //----------------------------------------------------------------
365  // Constructors &c
366  //----------------------------------------------------------------
367 
368 public:
369 
374  UnicodeSet();
375 
384  UnicodeSet(UChar32 start, UChar32 end);
385 
386 #ifndef U_HIDE_INTERNAL_API
387 
/**
 * Tag type selecting how the uint16_t array passed to the
 * UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
 *            ESerialization, UErrorCode&) constructor is interpreted.
 * Internal API (guarded by U_HIDE_INTERNAL_API).
 */
enum ESerialization {
    kSerialized /* result of serialize() */
};
393 
404  UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
405  ESerialization serialization, UErrorCode &status);
406 #endif /* U_HIDE_INTERNAL_API */
407 
416  UnicodeSet(const UnicodeString& pattern,
417  UErrorCode& status);
418 
419 #ifndef U_HIDE_INTERNAL_API
420 
432  UnicodeSet(const UnicodeString& pattern,
433  uint32_t options,
434  const SymbolTable* symbols,
435  UErrorCode& status);
436 #endif /* U_HIDE_INTERNAL_API */
437 
451  UnicodeSet(const UnicodeString& pattern, ParsePosition& pos,
452  uint32_t options,
453  const SymbolTable* symbols,
454  UErrorCode& status);
455 
460  UnicodeSet(const UnicodeSet& o);
461 
466  virtual ~UnicodeSet();
467 
473  UnicodeSet& operator=(const UnicodeSet& o);
474 
486  virtual UBool operator==(const UnicodeSet& o) const;
487 
493  inline UBool operator!=(const UnicodeSet& o) const;
494 
504  virtual UnicodeFunctor* clone() const;
505 
513  virtual int32_t hashCode(void) const;
514 
523  inline static UnicodeSet *fromUSet(USet *uset);
524 
533  inline static const UnicodeSet *fromUSet(const USet *uset);
534 
542  inline USet *toUSet();
543 
544 
552  inline const USet * toUSet() const;
553 
554 
555  //----------------------------------------------------------------
556  // Freezable API
557  //----------------------------------------------------------------
558 
567  inline UBool isFrozen() const;
568 
582  UnicodeFunctor *freeze();
583 
592  UnicodeFunctor *cloneAsThawed() const;
593 
594  //----------------------------------------------------------------
595  // Public API
596  //----------------------------------------------------------------
597 
607  UnicodeSet& set(UChar32 start, UChar32 end);
608 
614  static UBool resemblesPattern(const UnicodeString& pattern,
615  int32_t pos);
616 
629  UnicodeSet& applyPattern(const UnicodeString& pattern,
630  UErrorCode& status);
631 
632 #ifndef U_HIDE_INTERNAL_API
633 
649  UnicodeSet& applyPattern(const UnicodeString& pattern,
650  uint32_t options,
651  const SymbolTable* symbols,
652  UErrorCode& status);
653 #endif /* U_HIDE_INTERNAL_API */
654 
686  UnicodeSet& applyPattern(const UnicodeString& pattern,
687  ParsePosition& pos,
688  uint32_t options,
689  const SymbolTable* symbols,
690  UErrorCode& status);
691 
705  virtual UnicodeString& toPattern(UnicodeString& result,
706  UBool escapeUnprintable = FALSE) const;
707 
730  UnicodeSet& applyIntPropertyValue(UProperty prop,
731  int32_t value,
732  UErrorCode& ec);
733 
763  UnicodeSet& applyPropertyAlias(const UnicodeString& prop,
764  const UnicodeString& value,
765  UErrorCode& ec);
766 
775  virtual int32_t size(void) const;
776 
783  virtual UBool isEmpty(void) const;
784 
792  virtual UBool contains(UChar32 c) const;
793 
802  virtual UBool contains(UChar32 start, UChar32 end) const;
803 
811  UBool contains(const UnicodeString& s) const;
812 
820  virtual UBool containsAll(const UnicodeSet& c) const;
821 
829  UBool containsAll(const UnicodeString& s) const;
830 
839  UBool containsNone(UChar32 start, UChar32 end) const;
840 
848  UBool containsNone(const UnicodeSet& c) const;
849 
857  UBool containsNone(const UnicodeString& s) const;
858 
867  inline UBool containsSome(UChar32 start, UChar32 end) const;
868 
876  inline UBool containsSome(const UnicodeSet& s) const;
877 
885  inline UBool containsSome(const UnicodeString& s) const;
886 
905  int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
906 
919  inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
920 
938  int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
939 
953  inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
954 
973  int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
974 
992  int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
993 
998  virtual UMatchDegree matches(const Replaceable& text,
999  int32_t& offset,
1000  int32_t limit,
1001  UBool incremental);
1002 
1003 private:
1026  static int32_t matchRest(const Replaceable& text,
1027  int32_t start, int32_t limit,
1028  const UnicodeString& s);
1029 
1039  int32_t findCodePoint(UChar32 c) const;
1040 
1041 public:
1042 
1050  virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
1051 
1060  int32_t indexOf(UChar32 c) const;
1061 
1071  UChar32 charAt(int32_t index) const;
1072 
1087  virtual UnicodeSet& add(UChar32 start, UChar32 end);
1088 
1096  UnicodeSet& add(UChar32 c);
1097 
1109  UnicodeSet& add(const UnicodeString& s);
1110 
1111  private:
1117  static int32_t getSingleCP(const UnicodeString& s);
1118 
1119  void _add(const UnicodeString& s);
1120 
1121  public:
1130  UnicodeSet& addAll(const UnicodeString& s);
1131 
1140  UnicodeSet& retainAll(const UnicodeString& s);
1141 
1150  UnicodeSet& complementAll(const UnicodeString& s);
1151 
1160  UnicodeSet& removeAll(const UnicodeString& s);
1161 
1170  static UnicodeSet* U_EXPORT2 createFrom(const UnicodeString& s);
1171 
1172 
1180  static UnicodeSet* U_EXPORT2 createFromAll(const UnicodeString& s);
1181 
1195  virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1196 
1197 
1203  UnicodeSet& retain(UChar32 c);
1204 
1218  virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1219 
1227  UnicodeSet& remove(UChar32 c);
1228 
1238  UnicodeSet& remove(const UnicodeString& s);
1239 
1247  virtual UnicodeSet& complement(void);
1248 
1263  virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1264 
1272  UnicodeSet& complement(UChar32 c);
1273 
1284  UnicodeSet& complement(const UnicodeString& s);
1285 
1298  virtual UnicodeSet& addAll(const UnicodeSet& c);
1299 
1311  virtual UnicodeSet& retainAll(const UnicodeSet& c);
1312 
1324  virtual UnicodeSet& removeAll(const UnicodeSet& c);
1325 
1336  virtual UnicodeSet& complementAll(const UnicodeSet& c);
1337 
1344  virtual UnicodeSet& clear(void);
1345 
1371  UnicodeSet& closeOver(int32_t attribute);
1372 
1379  virtual UnicodeSet &removeAllStrings();
1380 
1388  virtual int32_t getRangeCount(void) const;
1389 
1397  virtual UChar32 getRangeStart(int32_t index) const;
1398 
1406  virtual UChar32 getRangeEnd(int32_t index) const;
1407 
1456  int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1457 
1464  virtual UnicodeSet& compact();
1465 
1477  static UClassID U_EXPORT2 getStaticClassID(void);
1478 
1487  virtual UClassID getDynamicClassID(void) const;
1488 
1489 private:
1490 
1491  // Private API for the USet API
1492 
1493  friend class USetAccess;
1494 
1495  const UnicodeString* getString(int32_t index) const;
1496 
1497  //----------------------------------------------------------------
1498  // RuleBasedTransliterator support
1499  //----------------------------------------------------------------
1500 
1501 private:
1502 
1508  virtual UBool matchesIndexValue(uint8_t v) const;
1509 
1510 private:
1511  friend class RBBIRuleScanner;
1512 
1513  //----------------------------------------------------------------
1514  // Implementation: Clone as thawed (see ICU4J Freezable)
1515  //----------------------------------------------------------------
1516 
1517  UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1518  UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1519 
1520  //----------------------------------------------------------------
1521  // Implementation: Pattern parsing
1522  //----------------------------------------------------------------
1523 
1524  void applyPatternIgnoreSpace(const UnicodeString& pattern,
1525  ParsePosition& pos,
1526  const SymbolTable* symbols,
1527  UErrorCode& status);
1528 
1529  void applyPattern(RuleCharacterIterator& chars,
1530  const SymbolTable* symbols,
1531  UnicodeString& rebuiltPat,
1532  uint32_t options,
1533  UnicodeSet& (UnicodeSet::*caseClosure)(int32_t attribute),
1534  int32_t depth,
1535  UErrorCode& ec);
1536 
1537  //----------------------------------------------------------------
1538  // Implementation: Utility methods
1539  //----------------------------------------------------------------
1540 
1541  static int32_t nextCapacity(int32_t minCapacity);
1542 
1543  bool ensureCapacity(int32_t newLen);
1544 
1545  bool ensureBufferCapacity(int32_t newLen);
1546 
1547  void swapBuffers(void);
1548 
1549  UBool allocateStrings(UErrorCode &status);
1550  UBool hasStrings() const;
1551  int32_t stringsSize() const;
1552  UBool stringsContains(const UnicodeString &s) const;
1553 
1554  UnicodeString& _toPattern(UnicodeString& result,
1555  UBool escapeUnprintable) const;
1556 
1557  UnicodeString& _generatePattern(UnicodeString& result,
1558  UBool escapeUnprintable) const;
1559 
1560  static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1561 
1562  static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1563 
1564  //----------------------------------------------------------------
1565  // Implementation: Fundamental operators
1566  //----------------------------------------------------------------
1567 
1568  void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1569 
1570  void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1571 
1572  void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1573 
1579  static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1580  int32_t pos);
1581 
1582  static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1583  int32_t iterOpts);
1584 
1624  UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1625  ParsePosition& ppos,
1626  UErrorCode &ec);
1627 
1628  void applyPropertyPattern(RuleCharacterIterator& chars,
1629  UnicodeString& rebuiltPat,
1630  UErrorCode& ec);
1631 
1632  static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
1633 
1638  typedef UBool (*Filter)(UChar32 codePoint, void* context);
1639 
1649  void applyFilter(Filter filter,
1650  void* context,
1651  const UnicodeSet* inclusions,
1652  UErrorCode &status);
1653 
1654 #ifndef U_HIDE_DRAFT_API // Skipped: ucpmap.h is draft only.
1655  void applyIntPropertyValue(const UCPMap *map,
1656  UCPMapValueFilter *filter, const void *context,
1657  UErrorCode &errorCode);
1658 #endif /* U_HIDE_DRAFT_API */
1659 
1663  void setPattern(const UnicodeString& newPat) {
1664  setPattern(newPat.getBuffer(), newPat.length());
1665  }
1666  void setPattern(const char16_t *newPat, int32_t newPatLen);
1670  void releasePattern();
1671 
1672  friend class UnicodeSetIterator;
1673 };
1674 
1675 
1676 
1677 inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
1678  return !operator==(o);
1679 }
1680 
1681 inline UBool UnicodeSet::isFrozen() const {
1682  return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
1683 }
1684 
1685 inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1686  return !containsNone(start, end);
1687 }
1688 
1690  return !containsNone(s);
1691 }
1692 
1694  return !containsNone(s);
1695 }
1696 
1697 inline UBool UnicodeSet::isBogus() const {
1698  return (UBool)(fFlags & kIsBogus);
1699 }
1700 
1702  return reinterpret_cast<UnicodeSet *>(uset);
1703 }
1704 
1705 inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1706  return reinterpret_cast<const UnicodeSet *>(uset);
1707 }
1708 
1710  return reinterpret_cast<USet *>(this);
1711 }
1712 
1713 inline const USet *UnicodeSet::toUSet() const {
1714  return reinterpret_cast<const USet *>(this);
1715 }
1716 
1717 inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1718  int32_t sLength=s.length();
1719  if(start<0) {
1720  start=0;
1721  } else if(start>sLength) {
1722  start=sLength;
1723  }
1724  return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1725 }
1726 
1727 inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1728  int32_t sLength=s.length();
1729  if(limit<0) {
1730  limit=0;
1731  } else if(limit>sLength) {
1732  limit=sLength;
1733  }
1734  return spanBack(s.getBuffer(), limit, spanCondition);
1735 }
1736 
1738 
1739 #endif
#define INITIAL_CAPACITY
The initial size of an array if it is unspecified.
Definition: RunArrays.h:32
static UClassID getStaticClassID()
ICU "poor man's RTTI", returns a UClassID for this class.
struct UCPMap UCPMap
Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
Definition: ucpmap.h:33
int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition: unimatch.h:32
C++ API: Unicode String.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
UnicodeSetIterator iterates over the contents of a UnicodeSet.
Definition: usetiter.h:63
UBool isBogus(void) const
Determine if this object contains a valid set.
Definition: uniset.h:1697
UBool operator!=(const UnicodeSet &o) const
Compares the specified object with this set for inequality (the negation of operator==).
Definition: uniset.h:1677
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition: uobject.h:93
This file defines an abstract map from Unicode code points to integer values.
static UnicodeSet * fromUSet(USet *uset)
Get a UnicodeSet pointer from a USet.
Definition: uniset.h:1701
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns TRUE if this matcher will match a character c, where c & 0xFF == v, at offset, in the forward direction (with limit > offset).
C API: Unicode Set.
An interface that defines both lookup protocol and parsing of symbolic names.
Definition: symtable.h:56
virtual UClassID getDynamicClassID(void) const =0
Returns a unique class ID polymorphically.
virtual UnicodeFunctor * clone() const =0
Return a copy of this object.
Replaceable is an abstract base class representing a string of characters that supports the replaceme...
Definition: rep.h:73
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:137
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Uni...
Definition: unifilt.h:61
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const =0
Union the set of all characters that may be matched by this object into the given set...
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:218
uint32_t UCPMapValueFilter(const void *context, uint32_t value)
Callback function type: Modifies a map value.
Definition: ucpmap.h:116
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:389
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition: utypes.h:188
UnicodeFunctor is an abstract base class for objects that perform match and/or replace operations on ...
Definition: unifunct.h:35
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental)
Implement UnicodeMatcher API.
A mutable set of Unicode characters and multicharacter strings.
Definition: uniset.h:277
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition: uset.h:156
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:138
UProperty
Selection constants for Unicode properties.
Definition: uchar.h:195
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:401
struct USet USet
USet is the C API type corresponding to C++ class UnicodeSet.
Definition: uset.h:47
int32_t length(void) const
Return the length of the UnicodeString object.
Definition: unistr.h:3886
ParsePosition is a simple class used by Format and its subclasses to keep track of the current positi...
Definition: parsepos.h:49
#define U_FINAL
Defined to the C++11 "final" keyword if available.
Definition: umachine.h:140
char16_t * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=FALSE) const =0
Returns a string representation of this matcher.
UBool containsSome(UChar32 start, UChar32 end) const
Returns true if this set contains one or more of the characters in the given range.
Definition: uniset.h:1685
virtual UBool contains(UChar32 c) const =0
Returns true for characters that are in the selected subset.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:233
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:289
UBool isFrozen() const
Determines whether the set has been frozen (made immutable) or not.
Definition: uniset.h:1681
USet * toUSet()
Produce a USet * pointer for this UnicodeSet.
Definition: uniset.h:1709
C++ API: Unicode Filter.
int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
int8_t UBool
The ICU boolean type.
Definition: umachine.h:225