C API: Unicode Set. More...

#include "unicode/utypes.h"
#include "unicode/uchar.h"
#include "unicode/localpointer.h"

Data Structures
struct	USerializedSet
	A serialized form of a Unicode set. More...

Namespaces
	icu
	File coll.h.

Typedefs
typedef struct USet	USet
	USet is the C API type corresponding to C++ class UnicodeSet. More...

typedef enum USetSpanCondition	USetSpanCondition
	Argument values for whether span() and similar functions continue while the current character is contained vs. More...

typedef struct USerializedSet	USerializedSet
	A serialized form of a Unicode set. More...

Enumerations
enum	{ USET_IGNORE_SPACE = 1, USET_CASE_INSENSITIVE = 2, USET_ADD_CASE_MAPPINGS = 4 }
	Bitmask values to be passed to uset_openPatternOptions() or uset_applyPattern() taking an option parameter. More...

enum	USetSpanCondition { USET_SPAN_NOT_CONTAINED = 0, USET_SPAN_CONTAINED = 1, USET_SPAN_SIMPLE = 2, USET_SPAN_CONDITION_COUNT }
	Argument values for whether span() and similar functions continue while the current character is contained vs. More...

enum	{ USET_SERIALIZED_STATIC_ARRAY_CAPACITY =8 }

Functions
USet *	uset_openEmpty (void)
	Create an empty USet object. More...

USet *	uset_open (UChar32 start, UChar32 end)
	Creates a USet object that contains the range of characters start..end, inclusive. More...

USet *	uset_openPattern (const UChar *pattern, int32_t patternLength, UErrorCode *ec)
	Creates a set from the given pattern. More...

USet *	uset_openPatternOptions (const UChar *pattern, int32_t patternLength, uint32_t options, UErrorCode *ec)
	Creates a set from the given pattern. More...

void	uset_close (USet *set)
	Disposes of the storage used by a USet object. More...

USet *	uset_clone (const USet *set)
	Returns a copy of this object. More...

UBool	uset_isFrozen (const USet *set)
	Determines whether the set has been frozen (made immutable) or not. More...

void	uset_freeze (USet *set)
	Freeze the set (make it immutable). More...

USet *	uset_cloneAsThawed (const USet *set)
	Clone the set and make the clone mutable. More...

void	uset_set (USet *set, UChar32 start, UChar32 end)
	Causes the USet object to represent the range `start - end`. More...

int32_t	uset_applyPattern (USet *set, const UChar *pattern, int32_t patternLength, uint32_t options, UErrorCode *status)
	Modifies the set to represent the set specified by the given pattern. More...

void	uset_applyIntPropertyValue (USet *set, UProperty prop, int32_t value, UErrorCode *ec)
	Modifies the set to contain those code points which have the given value for the given binary or enumerated property, as returned by u_getIntPropertyValue. More...

void	uset_applyPropertyAlias (USet *set, const UChar *prop, int32_t propLength, const UChar *value, int32_t valueLength, UErrorCode *ec)
	Modifies the set to contain those code points which have the given value for the given property. More...

UBool	uset_resemblesPattern (const UChar *pattern, int32_t patternLength, int32_t pos)
	Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet pattern. More...

int32_t	uset_toPattern (const USet *set, UChar *result, int32_t resultCapacity, UBool escapeUnprintable, UErrorCode *ec)
	Returns a string representation of this set. More...

void	uset_add (USet *set, UChar32 c)
	Adds the given character to the given USet. More...

void	uset_addAll (USet *set, const USet *additionalSet)
	Adds all of the elements in the specified set to this set if they're not already present. More...

void	uset_addRange (USet *set, UChar32 start, UChar32 end)
	Adds the given range of characters to the given USet. More...

void	uset_addString (USet *set, const UChar *str, int32_t strLen)
	Adds the given string to the given USet. More...

void	uset_addAllCodePoints (USet *set, const UChar *str, int32_t strLen)
	Adds each of the characters in this string to the set. More...

void	uset_remove (USet *set, UChar32 c)
	Removes the given character from the given USet. More...

void	uset_removeRange (USet *set, UChar32 start, UChar32 end)
	Removes the given range of characters from the given USet. More...

void	uset_removeString (USet *set, const UChar *str, int32_t strLen)
	Removes the given string from the given USet. More...

void	uset_removeAll (USet *set, const USet *removeSet)
	Removes from this set all of its elements that are contained in the specified set. More...

void	uset_retain (USet *set, UChar32 start, UChar32 end)
	Retain only the elements in this set that are contained in the specified range. More...

void	uset_retainAll (USet *set, const USet *retain)
	Retains only the elements in this set that are contained in the specified set. More...

void	uset_compact (USet *set)
	Reallocate this object's internal structures to take up the least possible space, without changing this object's value. More...

void	uset_complement (USet *set)
	Inverts this set. More...

void	uset_complementAll (USet *set, const USet *complement)
	Complements in this set all elements contained in the specified set. More...

void	uset_clear (USet *set)
	Removes all of the elements from this set. More...

void	uset_closeOver (USet *set, int32_t attributes)
	Close this set over the given attribute. More...

void	uset_removeAllStrings (USet *set)
	Remove all strings from this set. More...

UBool	uset_isEmpty (const USet *set)
	Returns TRUE if the given USet contains no characters and no strings. More...

UBool	uset_contains (const USet *set, UChar32 c)
	Returns TRUE if the given USet contains the given character. More...

UBool	uset_containsRange (const USet *set, UChar32 start, UChar32 end)
	Returns TRUE if the given USet contains all characters c where start <= c && c <= end. More...

UBool	uset_containsString (const USet *set, const UChar *str, int32_t strLen)
	Returns TRUE if the given USet contains the given string. More...

int32_t	uset_indexOf (const USet *set, UChar32 c)
	Returns the index of the given character within this set, where the set is ordered by ascending code point. More...

UChar32	uset_charAt (const USet *set, int32_t charIndex)
	Returns the character at the given index within this set, where the set is ordered by ascending code point. More...

int32_t	uset_size (const USet *set)
	Returns the number of characters and strings contained in the given USet. More...

int32_t	uset_getItemCount (const USet *set)
	Returns the number of items in this set. More...

int32_t	uset_getItem (const USet *set, int32_t itemIndex, UChar32 *start, UChar32 *end, UChar *str, int32_t strCapacity, UErrorCode *ec)
	Returns an item of this set. More...

UBool	uset_containsAll (const USet *set1, const USet *set2)
	Returns true if set1 contains all the characters and strings of set2. More...

UBool	uset_containsAllCodePoints (const USet *set, const UChar *str, int32_t strLen)
	Returns true if this set contains all the characters of the given string. More...

UBool	uset_containsNone (const USet *set1, const USet *set2)
	Returns true if set1 contains none of the characters and strings of set2. More...

UBool	uset_containsSome (const USet *set1, const USet *set2)
	Returns true if set1 contains some of the characters and strings of set2. More...

int32_t	uset_span (const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition)
	Returns the length of the initial substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). More...

int32_t	uset_spanBack (const USet *set, const UChar *s, int32_t length, USetSpanCondition spanCondition)
	Returns the start of the trailing substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). More...

int32_t	uset_spanUTF8 (const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition)
	Returns the length of the initial substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). More...

int32_t	uset_spanBackUTF8 (const USet *set, const char *s, int32_t length, USetSpanCondition spanCondition)
	Returns the start of the trailing substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED). More...

UBool	uset_equals (const USet *set1, const USet *set2)
	Returns true if set1 contains all of the characters and strings of set2, and vice versa. More...

int32_t	uset_serialize (const USet *set, uint16_t *dest, int32_t destCapacity, UErrorCode *pErrorCode)
	Serializes this set into an array of 16-bit integers. More...

UBool	uset_getSerializedSet (USerializedSet *fillSet, const uint16_t *src, int32_t srcLength)
	Given a serialized array, fill in the given serialized set object. More...

void	uset_setSerializedToOne (USerializedSet *fillSet, UChar32 c)
	Set the USerializedSet to contain the given character (and nothing else). More...

UBool	uset_serializedContains (const USerializedSet *set, UChar32 c)
	Returns TRUE if the given USerializedSet contains the given character. More...

int32_t	uset_getSerializedRangeCount (const USerializedSet *set)
	Returns the number of disjoint ranges of characters contained in the given serialized set. More...

UBool	uset_getSerializedRange (const USerializedSet *set, int32_t rangeIndex, UChar32 *pStart, UChar32 *pEnd)
	Returns a range of characters contained in the given serialized set. More...

Detailed Description

C API: Unicode Set.

This is a C wrapper around the C++ UnicodeSet class.

Definition in file uset.h.

Typedef Documentation

◆ USerializedSet

typedef struct USerializedSet USerializedSet

A serialized form of a Unicode set.

Limited manipulations are possible directly on a serialized set. See below.

Stable:: ICU 2.4

◆ USet

typedef struct USet USet

USet is the C API type corresponding to C++ class UnicodeSet.

Use the uset_* API to manipulate. Create with uset_open*, and destroy with uset_close.

Stable:: ICU 2.4

Definition at line 47 of file uset.h.

◆ USetSpanCondition

typedef enum USetSpanCondition USetSpanCondition

Argument values for whether span() and similar functions continue while the current character is contained vs. not contained in the set.

The functionality is straightforward for sets with only single code points, without strings (which is the common case):

USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same.
USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED.
span() and spanBack() partition any string the same way when alternating between span(USET_SPAN_NOT_CONTAINED) and span(either "contained" condition).
Using a complemented (inverted) set and the opposite span conditions yields the same results.

When a set contains multi-code point strings, then these statements may not be true, depending on the strings in the set (for example, whether they overlap with each other) and the string that is processed. For a set with strings:

The complement of the set contains the opposite set of code points, but the same set of strings. Therefore, complementing both the set and the span conditions may yield different results.
When starting spans at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different because a set string may start before the later position.
span(USET_SPAN_SIMPLE) may be shorter than span(USET_SPAN_CONTAINED) because it will not recursively try all possible paths. For example, with a set which contains the three strings "xy", "xya" and "ax", span("xyax", USET_SPAN_CONTAINED) will return 4 but span("xyax", USET_SPAN_SIMPLE) will return 3. span(USET_SPAN_SIMPLE) will never be longer than span(USET_SPAN_CONTAINED).
With either "contained" condition, span() and spanBack() may partition a string in different ways. For example, with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.

Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could be used.

Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point boundaries, never in the middle of a surrogate pair. Illegal UTF-8 sequences are treated like U+FFFD. When processing UTF-8 strings, malformed set strings (strings with unpaired surrogates which cannot be converted to UTF-8) are ignored.

Stable:: ICU 3.8

Enumeration Type Documentation

◆ anonymous enum

anonymous enum

Bitmask values to be passed to uset_openPatternOptions() or uset_applyPattern() taking an option parameter.

Stable:: ICU 2.4

Enumerator

USET_IGNORE_SPACE

Ignore white space within patterns unless quoted or escaped.

Stable:: ICU 2.4

USET_CASE_INSENSITIVE

Enable case insensitive matching.

E.g., "[ab]" with this flag will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will match all except 'a', 'A', 'b', and 'B'. This performs a full closure over case mappings, e.g. U+017F for s.

The resulting set is a superset of the input for the code points but not for the strings. It performs a case mapping closure of the code points and adds full case folding strings for the code points, and reduces strings of the original set to their full case folding equivalents.

This is designed for case-insensitive matches, for example in regular expressions. The full code point case closure allows checking of an input character directly against the closure set. Strings are matched by comparing the case-folded form from the closure set with an incremental case folding of the string in question.

The closure set will also contain single code points if the original set contained case-equivalent strings (like U+00DF for "ss" or "Ss" etc.). This is not necessary (that is, redundant) for the above matching method but results in the same closure sets regardless of whether the original set contained the code point or a string.

Stable:: ICU 2.4

USET_ADD_CASE_MAPPINGS

Enable case insensitive matching.

E.g., "[ab]" with this flag will match 'a', 'A', 'b', and 'B'. "[^ab]" with this flag will match all except 'a', 'A', 'b', and 'B'. This adds the lower-, title-, and uppercase mappings as well as the case folding of each existing element in the set.

Stable:: ICU 3.2

Definition at line 55 of file uset.h.

◆ anonymous enum

anonymous enum

Enumerator

USET_SERIALIZED_STATIC_ARRAY_CAPACITY

Capacity of USerializedSet::staticArray.

Enough for any single-code point set. Also provides padding for nice sizeof(USerializedSet).

Stable:: ICU 2.4

Definition at line 214 of file uset.h.

◆ USetSpanCondition

enum USetSpanCondition

Argument values for whether span() and similar functions continue while the current character is contained vs. not contained in the set.

The functionality is straightforward for sets with only single code points, without strings (which is the common case):

USET_SPAN_CONTAINED and USET_SPAN_SIMPLE work the same.
USET_SPAN_CONTAINED and USET_SPAN_SIMPLE are inverses of USET_SPAN_NOT_CONTAINED.
span() and spanBack() partition any string the same way when alternating between span(USET_SPAN_NOT_CONTAINED) and span(either "contained" condition).
Using a complemented (inverted) set and the opposite span conditions yields the same results.

When a set contains multi-code point strings, then these statements may not be true, depending on the strings in the set (for example, whether they overlap with each other) and the string that is processed. For a set with strings:

The complement of the set contains the opposite set of code points, but the same set of strings. Therefore, complementing both the set and the span conditions may yield different results.
When starting spans at different positions in a string (span(s, ...) vs. span(s+1, ...)) the ends of the spans may be different because a set string may start before the later position.
span(USET_SPAN_SIMPLE) may be shorter than span(USET_SPAN_CONTAINED) because it will not recursively try all possible paths. For example, with a set which contains the three strings "xy", "xya" and "ax", span("xyax", USET_SPAN_CONTAINED) will return 4 but span("xyax", USET_SPAN_SIMPLE) will return 3. span(USET_SPAN_SIMPLE) will never be longer than span(USET_SPAN_CONTAINED).
With either "contained" condition, span() and spanBack() may partition a string in different ways. For example, with a set which contains the two strings "ab" and "ba", and when processing the string "aba", span() will yield contained/not-contained boundaries of { 0, 2, 3 } while spanBack() will yield boundaries of { 0, 1, 3 }.

Note: If it is important to get the same boundaries whether iterating forward or backward through a string, then either only span() should be used and the boundaries cached for backward operation, or an ICU BreakIterator could be used.

Note: Unpaired surrogates are treated like surrogate code points. Similarly, set strings match only on code point boundaries, never in the middle of a surrogate pair. Illegal UTF-8 sequences are treated like U+FFFD. When processing UTF-8 strings, malformed set strings (strings with unpaired surrogates which cannot be converted to UTF-8) are ignored.

Stable:: ICU 3.8

Enumerator
USET_SPAN_NOT_CONTAINED	Continues a span() while there is no set element at the current position. Increments by one code point at a time. Stops before the first set element (character or string). (For code points only, this is like while contains(current)==FALSE). When span() returns, the substring between where it started and the position it returned consists only of characters that are not in the set, and none of its strings overlap with the span. Stable: ICU 3.8
USET_SPAN_CONTAINED	Spans the longest substring that is a concatenation of set elements (characters or strings). (For characters only, this is like while contains(current)==TRUE). When span() returns, the substring between where it started and the position it returned consists only of set elements (characters or strings) that are in the set. If a set contains strings, then the span will be the longest substring for which there exists at least one non-overlapping concatenation of set elements (characters or strings). This is equivalent to a POSIX regular expression for `(OR of each set element)`. (Java/ICU/Perl regex stops at the first match of an OR.) Stable: ICU 3.8
USET_SPAN_SIMPLE	Continues a span() while there is a set element at the current position. Increments by the longest matching element at each position. (For characters only, this is like while contains(current)==TRUE). When span() returns, the substring between where it started and the position it returned consists only of set elements (characters or strings) that are in the set. If a set only contains single characters, then this is the same as USET_SPAN_CONTAINED. If a set contains strings, then the span will be the longest substring with a match at each position with the longest single set element (character or string). Use this span condition together with other longest-match algorithms, such as ICU converters (ucnv_getUnicodeSet()). Stable: ICU 3.8
USET_SPAN_CONDITION_COUNT	One more than the last span condition. Deprecated: ICU 58 The numeric value may change over time, see ICU ticket #12420.

Definition at line 156 of file uset.h.

Function Documentation

◆ uset_add()

void uset_add	(	USet *	set,
		UChar32	c
	)

Adds the given character to the given USet.

After this call, uset_contains(set, c) will return TRUE. A frozen set will not be modified.

Parameters

set	the object to which to add the character
c	the character to add

Stable:: ICU 2.4

◆ uset_addAll()

void uset_addAll	(	USet *	set,
		const USet *	additionalSet
	)

Adds all of the elements in the specified set to this set if they're not already present.

This operation effectively modifies this set so that its value is the union of the two sets. The behavior of this operation is unspecified if the specified collection is modified while the operation is in progress. A frozen set will not be modified.

Parameters

set	the object to which to add the set
additionalSet	the source set whose elements are to be added to this set.

Stable:: ICU 2.6

◆ uset_addAllCodePoints()

void uset_addAllCodePoints	(	USet *	set,
		const UChar *	str,
		int32_t	strLen
	)

Adds each of the characters in this string to the set.

Thus "ch" => {"c", "h"}. If this set already contains any particular character, it has no effect on that character. A frozen set will not be modified.

Parameters

set	the object to which to add the character
str	the source string
strLen	the length of the string or -1 if null terminated.

Stable:: ICU 3.4

◆ uset_addRange()

void uset_addRange	(	USet *	set,
		UChar32	start,
		UChar32	end
	)

Adds the given range of characters to the given USet.

After this call, uset_containsRange(set, start, end) will return TRUE. A frozen set will not be modified.

Parameters

set	the object to which to add the character
start	the first character of the range to add, inclusive
end	the last character of the range to add, inclusive

Stable:: ICU 2.2

◆ uset_addString()

void uset_addString	(	USet *	set,
		const UChar *	str,
		int32_t	strLen
	)

Adds the given string to the given USet.

After this call, uset_containsString(set, str, strLen) will return TRUE. A frozen set will not be modified.

Parameters

set	the object to which to add the character
str	the string to add
strLen	the length of the string or -1 if null terminated.

Stable:: ICU 2.4

◆ uset_applyIntPropertyValue()

void uset_applyIntPropertyValue	(	USet *	set,
		UProperty	prop,
		int32_t	value,
		UErrorCode *	ec
	)

Modifies the set to contain those code points which have the given value for the given binary or enumerated property, as returned by u_getIntPropertyValue.

Prior contents of this set are lost. A frozen set will not be modified.

Parameters

set	the object to contain the code points defined by the property
prop	a property in the range UCHAR_BIN_START..UCHAR_BIN_LIMIT-1 or UCHAR_INT_START..UCHAR_INT_LIMIT-1 or UCHAR_MASK_START..UCHAR_MASK_LIMIT-1.
value	a value in the range u_getIntPropertyMinValue(prop).. u_getIntPropertyMaxValue(prop), with one exception. If prop is UCHAR_GENERAL_CATEGORY_MASK, then value should not be a UCharCategory, but rather a mask value produced by U_GET_GC_MASK(). This allows grouped categories such as [:L:] to be represented.
ec	error code input/output parameter

Stable:: ICU 3.2

◆ uset_applyPattern()

int32_t uset_applyPattern	(	USet *	set,
		const UChar *	pattern,
		int32_t	patternLength,
		uint32_t	options,
		UErrorCode *	status
	)

Modifies the set to represent the set specified by the given pattern.

See the UnicodeSet class description for the syntax of the pattern language. See also the User Guide chapter about UnicodeSet. Empties the set passed before applying the pattern. A frozen set will not be modified.

Parameters

set	The set to which the pattern is to be applied.
pattern	A pointer to UChar string specifying what characters are in the set. The character at pattern[0] must be a '['.
patternLength	The length of the UChar string. -1 if NUL terminated.
options	A bitmask for options to apply to the pattern. Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
status	Returns an error if the pattern cannot be parsed.

Returns: Upon successful parse, the value is the index of the character after the closing ']' of the parsed pattern. If the status code indicates failure, then the return value is the index of the error in the source.

Stable:: ICU 2.8

◆ uset_applyPropertyAlias()

void uset_applyPropertyAlias	(	USet *	set,
		const UChar *	prop,
		int32_t	propLength,
		const UChar *	value,
		int32_t	valueLength,
		UErrorCode *	ec
	)

Modifies the set to contain those code points which have the given value for the given property.

Prior contents of this set are lost. A frozen set will not be modified.

Parameters

set	the object to contain the code points defined by the given property and value alias
prop	a string specifying a property alias, either short or long. The name is matched loosely. See PropertyAliases.txt for names and a description of loose matching. If the value string is empty, then this string is interpreted as either a General_Category value alias, a Script value alias, a binary property alias, or a special ID. Special IDs are matched loosely and correspond to the following sets:

"ANY" = [\u0000-\U0010FFFF], "ASCII" = [\u0000-\u007F], "Assigned" = [:^Cn:].

Parameters

propLength	the length of the prop, or -1 if NUL-terminated
value	a string specifying a value alias, either short or long. The name is matched loosely. See PropertyValueAliases.txt for names and a description of loose matching. In addition to aliases listed, numeric values and canonical combining classes may be expressed numerically, e.g., ("nv", "0.5") or ("ccc", "220"). The value string may also be empty.
valueLength	the length of the value, or -1 if NUL-terminated
ec	error code input/output parameter

Stable:: ICU 3.2

◆ uset_charAt()

UChar32 uset_charAt	(	const USet *	set,
		int32_t	charIndex
	)

Returns the character at the given index within this set, where the set is ordered by ascending code point.

If the index is out of range, return (UChar32)-1. The inverse of this method is indexOf().

Parameters

set	the set
charIndex	an index from 0..size()-1 to obtain the char for

Returns: the character at the given index, or (UChar32)-1.

Stable:: ICU 3.2

◆ uset_clear()

void uset_clear ( USet * set )

Removes all of the elements from this set.

This set will be empty after this call returns. A frozen set will not be modified.

Parameters

set the set

Stable:: ICU 2.4

◆ uset_clone()

USet* uset_clone ( const USet * set )

Returns a copy of this object.

If this set is frozen, then the clone will be frozen as well. Use uset_cloneAsThawed() for a mutable clone of a frozen set.

Parameters

set	the original set

Returns: the newly allocated copy of the set

See also: uset_cloneAsThawed

Stable:: ICU 3.8

◆ uset_cloneAsThawed()

USet* uset_cloneAsThawed ( const USet * set )

Clone the set and make the clone mutable.

See the ICU4J Freezable interface for details.

Parameters

set the set

Returns: the mutable clone

See also: uset_freeze; uset_isFrozen; uset_clone

Stable:: ICU 3.8

◆ uset_close()

void uset_close ( USet * set )

Disposes of the storage used by a USet object.

This function should be called exactly once for objects returned by uset_open().

Parameters

set	the object to dispose of

Stable:: ICU 2.4

◆ uset_closeOver()

void uset_closeOver	(	USet *	set,
		int32_t	attributes
	)

Close this set over the given attribute.

For the attribute USET_CASE, the result is to modify this set so that:

For each character or string 'a' in this set, all strings or characters 'b' such that foldCase(a) == foldCase(b) are added to this set.
For each string 'e' in the resulting set, if e != foldCase(e), 'e' will be removed.

Example: [aq\u00DF{Bc}{bC}{Fi}] => [aAqQ\u00DF\uFB01{ss}{bc}{fi}]

(Here foldCase(x) refers to the operation u_strFoldCase, and a == b denotes that the contents are the same, not pointer comparison.)

A frozen set will not be modified.

Parameters

set	the set
attributes	bitmask for attributes to close over. Currently only the USET_CASE bit is supported. Any undefined bits are ignored.

Stable:: ICU 4.2

◆ uset_compact()

void uset_compact ( USet * set )

Reallocate this object's internal structures to take up the least possible space, without changing this object's value.

A frozen set will not be modified.

Parameters

set	the object on which to perform the compact

Stable:: ICU 3.2

◆ uset_complement()

void uset_complement ( USet * set )

Inverts this set.

This operation modifies this set so that its value is its complement. This operation does not affect the multicharacter strings, if any. A frozen set will not be modified.

Parameters

set the set

Stable:: ICU 2.4

◆ uset_complementAll()

void uset_complementAll	(	USet *	set,
		const USet *	complement
	)

Complements in this set all elements contained in the specified set.

Any character in the other set will be removed if it is in this set, or will be added if it is not in this set. A frozen set will not be modified.

Parameters

set	the set with which to complement
complement	set that defines which elements will be xor'ed from this set.

Stable:: ICU 3.2

◆ uset_contains()

UBool uset_contains	(	const USet *	set,
		UChar32	c
	)

Returns TRUE if the given USet contains the given character.

This function works faster with a frozen set.

Parameters

set	the set
c	The codepoint to check for within the set

Returns: true if set contains c

Stable:: ICU 2.4

◆ uset_containsAll()

UBool uset_containsAll	(	const USet *	set1,
		const USet *	set2
	)

Returns true if set1 contains all the characters and strings of set2.

It answers the question, 'Is set1 a superset of set2?'

Parameters

set1	set to be checked for containment
set2	set to be checked for containment

Returns: true if the test condition is met

Stable:: ICU 3.2

◆ uset_containsAllCodePoints()

UBool uset_containsAllCodePoints	(	const USet *	set,
		const UChar *	str,
		int32_t	strLen
	)

Returns true if this set contains all the characters of the given string.

This does not check containment of grapheme clusters, like uset_containsString.

Parameters

set	set of characters to be checked for containment
str	string containing codepoints to be checked for containment
strLen	the length of the string or -1 if null terminated.

Returns: true if the test condition is met

Stable:: ICU 3.4

◆ uset_containsNone()

UBool uset_containsNone	(	const USet *	set1,
		const USet *	set2
	)

Returns true if set1 contains none of the characters and strings of set2.

It answers the question, 'Is set1 a disjoint set of set2?'

Parameters

set1	set to be checked for containment
set2	set to be checked for containment

Returns: true if the test condition is met

Stable:: ICU 3.2

◆ uset_containsRange()

UBool uset_containsRange	(	const USet *	set,
		UChar32	start,
		UChar32	end
	)

Returns TRUE if the given USet contains all characters c where start <= c && c <= end.

Parameters

set	the set
start	the first character of the range to test, inclusive
end	the last character of the range to test, inclusive

Returns: TRUE if set contains the range

Stable:: ICU 2.2

◆ uset_containsSome()

UBool uset_containsSome	(	const USet *	set1,
		const USet *	set2
	)

Returns true if set1 contains some of the characters and strings of set2.

It answers the question, 'Do set1 and set2 have an intersection?'

Parameters

set1	set to be checked for containment
set2	set to be checked for containment

Returns: true if the test condition is met

Stable:: ICU 3.2

◆ uset_containsString()

UBool uset_containsString	(	const USet *	set,
		const UChar *	str,
		int32_t	strLen
	)

Returns TRUE if the given USet contains the given string.

Parameters

set	the set
str	the string
strLen	the length of the string or -1 if null terminated.

Returns: true if set contains str

Stable:: ICU 2.4

◆ uset_equals()

UBool uset_equals	(	const USet *	set1,
		const USet *	set2
	)

Returns true if set1 contains all of the characters and strings of set2, and vice versa.

It answers the question, 'Is set1 equal to set2?'

Parameters

set1	set to be checked for containment
set2	set to be checked for containment

Returns: true if the test condition is met

Stable:: ICU 3.2

◆ uset_freeze()

void uset_freeze ( USet * set )

Freeze the set (make it immutable).

Once frozen, it cannot be unfrozen and is therefore thread-safe until it is deleted. See the ICU4J Freezable interface for details. Freezing the set may also make some operations faster, for example uset_contains() and uset_span(). A frozen set will not be modified. (It remains frozen.)

Parameters

set the set

Returns: nothing (the function is declared void); the set passed in is the same set, now frozen

See also: uset_isFrozen; uset_cloneAsThawed

Stable:: ICU 3.8

◆ uset_getItem()

int32_t uset_getItem	(	const USet *	set,
		int32_t	itemIndex,
		UChar32 *	start,
		UChar32 *	end,
		UChar *	str,
		int32_t	strCapacity,
		UErrorCode *	ec
	)

Returns an item of this set.

An item is either a range of characters or a single multicharacter string.

Parameters

set	the set
itemIndex	a non-negative integer in the range 0.. uset_getItemCount(set)-1
start	pointer to variable to receive first character in range, inclusive
end	pointer to variable to receive last character in range, inclusive
str	buffer to receive the string, may be NULL
strCapacity	capacity of str, or 0 if str is NULL
ec	error code

Returns: the length of the string (>= 2), or 0 if the item is a range, in which case it is the range *start..*end, or -1 if itemIndex is out of range

Stable:: ICU 2.4

◆ uset_getItemCount()

int32_t uset_getItemCount ( const USet * set )

Returns the number of items in this set.

An item is either a range of characters or a single multicharacter string.

Parameters

set the set

Returns: a non-negative integer counting the character ranges and/or strings contained in set

Stable:: ICU 2.4

◆ uset_getSerializedRange()

UBool uset_getSerializedRange	(	const USerializedSet *	set,
		int32_t	rangeIndex,
		UChar32 *	pStart,
		UChar32 *	pEnd
	)

Returns a range of characters contained in the given serialized set.

Parameters

set	the serialized set
rangeIndex	a non-negative integer in the range 0.. uset_getSerializedRangeCount(set)-1
pStart	pointer to variable to receive first character in range, inclusive
pEnd	pointer to variable to receive last character in range, inclusive

Returns: true if rangeIndex is valid, otherwise false

Stable:: ICU 2.4

◆ uset_getSerializedRangeCount()

int32_t uset_getSerializedRangeCount ( const USerializedSet * set )

Returns the number of disjoint ranges of characters contained in the given serialized set.

Ignores any strings contained in the set.

Parameters

set	the serialized set

Returns: a non-negative integer counting the character ranges contained in set

Stable:: ICU 2.4

◆ uset_getSerializedSet()

UBool uset_getSerializedSet	(	USerializedSet *	fillSet,
		const uint16_t *	src,
		int32_t	srcLength
	)

Given a serialized array, fill in the given serialized set object.

Parameters

fillSet	pointer to result
src	pointer to start of array
srcLength	length of array

Returns: true if the given array is valid, otherwise false

Stable:: ICU 2.4

◆ uset_indexOf()

int32_t uset_indexOf	(	const USet *	set,
		UChar32	c
	)

Returns the index of the given character within this set, where the set is ordered by ascending code point.

If the character is not in this set, return -1. The inverse of this function is uset_charAt().

Parameters

set	the set
c	the character to obtain the index for

Returns: an index from 0..uset_size(set)-1, or -1

Stable:: ICU 3.2

◆ uset_isEmpty()

UBool uset_isEmpty ( const USet * set )

Returns TRUE if the given USet contains no characters and no strings.

Parameters

set the set

Returns: true if set is empty

Stable:: ICU 2.4

◆ uset_isFrozen()

UBool uset_isFrozen ( const USet * set )

Determines whether the set has been frozen (made immutable) or not.

See the ICU4J Freezable interface for details.

Parameters

set the set

Returns: TRUE/FALSE for whether the set has been frozen

See also: uset_freeze; uset_cloneAsThawed

Stable:: ICU 3.8

◆ uset_open()

USet* uset_open	(	UChar32	start,
		UChar32	end
	)

Creates a USet object that contains the range of characters start..end, inclusive.

If start > end then an empty set is created (same as using uset_openEmpty()).

Parameters

start	first character of the range, inclusive
end	last character of the range, inclusive

Returns: a newly created USet. The caller must call uset_close() on it when done.

Stable:: ICU 2.4

◆ uset_openEmpty()

USet* uset_openEmpty ( void )

Create an empty USet object.

Equivalent to uset_open(1, 0).

Returns: a newly created USet. The caller must call uset_close() on it when done.

Stable:: ICU 4.2

◆ uset_openPattern()

USet* uset_openPattern	(	const UChar *	pattern,
		int32_t	patternLength,
		UErrorCode *	ec
	)

Creates a set from the given pattern.

See the UnicodeSet class description for the syntax of the pattern language.

Parameters

pattern	a string specifying what characters are in the set
patternLength	the length of the pattern, or -1 if null terminated
ec	the error code

Stable:: ICU 2.4

◆ uset_openPatternOptions()

USet* uset_openPatternOptions	(	const UChar *	pattern,
		int32_t	patternLength,
		uint32_t	options,
		UErrorCode *	ec
	)

Creates a set from the given pattern.

See the UnicodeSet class description for the syntax of the pattern language.

Parameters

pattern	a string specifying what characters are in the set
patternLength	the length of the pattern, or -1 if null terminated
options	bitmask for options to apply to the pattern. Valid options are USET_IGNORE_SPACE and USET_CASE_INSENSITIVE.
ec	the error code

Stable:: ICU 2.4

◆ uset_remove()

void uset_remove	(	USet *	set,
		UChar32	c
	)

Removes the given character from the given USet.

After this call, uset_contains(set, c) will return FALSE. A frozen set will not be modified.

Parameters

set	the object from which to remove the character
c	the character to remove

Stable:: ICU 2.4

◆ uset_removeAll()

void uset_removeAll	(	USet *	set,
		const USet *	removeSet
	)

Removes from this set all of its elements that are contained in the specified set.

This operation effectively modifies this set so that its value is the asymmetric set difference of the two sets. A frozen set will not be modified.

Parameters

set	the object from which the elements are to be removed
removeSet	the object that defines which elements will be removed from this set

Stable:: ICU 3.2

◆ uset_removeAllStrings()

void uset_removeAllStrings ( USet * set )

Remove all strings from this set.

Parameters

set the set

Stable:: ICU 4.2

◆ uset_removeRange()

void uset_removeRange	(	USet *	set,
		UChar32	start,
		UChar32	end
	)

Removes the given range of characters from the given USet.

After this call, uset_contains(set, c) will return FALSE for every character c where start <= c && c <= end. A frozen set will not be modified.

Parameters

set	the object from which to remove the range of characters
start	the first character of the range to remove, inclusive
end	the last character of the range to remove, inclusive

Stable:: ICU 2.2

◆ uset_removeString()

void uset_removeString	(	USet *	set,
		const UChar *	str,
		int32_t	strLen
	)

Removes the given string from the given USet.

After this call, uset_containsString(set, str, strLen) will return FALSE. A frozen set will not be modified.

Parameters

set	the object from which to remove the string
str	the string to remove
strLen	the length of the string or -1 if null terminated.

Stable:: ICU 2.4

◆ uset_resemblesPattern()

UBool uset_resemblesPattern	(	const UChar *	pattern,
		int32_t	patternLength,
		int32_t	pos
	)

Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet pattern.

Parameters

pattern	a string specifying the pattern
patternLength	the length of the pattern, or -1 if null terminated
pos	the given position

Stable:: ICU 3.2

◆ uset_retain()

void uset_retain	(	USet *	set,
		UChar32	start,
		UChar32	end
	)

Retain only the elements in this set that are contained in the specified range.

If start > end then an empty range is retained, leaving the set empty. This is equivalent to a boolean logic AND, or a set INTERSECTION. A frozen set will not be modified.

Parameters

set	the object for which to retain only the specified range
start	first character, inclusive, of range to be retained to this set.
end	last character, inclusive, of range to be retained to this set.

Stable:: ICU 3.2

◆ uset_retainAll()

void uset_retainAll	(	USet *	set,
		const USet *	retain
	)

Retains only the elements in this set that are contained in the specified set.

In other words, removes from this set all of its elements that are not contained in the specified set. This operation effectively modifies this set so that its value is the intersection of the two sets. A frozen set will not be modified.

Parameters

set	the object on which to perform the retain
retain	set that defines which elements this set will retain

Stable:: ICU 3.2

◆ uset_serialize()

int32_t uset_serialize	(	const USet *	set,
		uint16_t *	dest,
		int32_t	destCapacity,
		UErrorCode *	pErrorCode
	)

Serializes this set into an array of 16-bit integers.

Serialization (currently) only records the characters in the set; multicharacter strings are ignored.

The array has following format (each line is one 16-bit integer):

length = (n+2*m) | (m!=0?0x8000:0)
bmpLength = n; present if m!=0
bmp[0]
bmp[1]
...
bmp[n-1]
supp-high[0]
supp-low[0]
supp-high[1]
supp-low[1]
...
supp-high[m-1]
supp-low[m-1]

The array starts with a header. After the header are n bmp code points, then m supplementary code points. Either n or m or both may be zero. n+2*m is always <= 0x7FFF.

If there are no supplementary characters (if m==0) then the header is one 16-bit integer, 'length', with value n.

If there are supplementary characters (if m!=0) then the header is two 16-bit integers. The first, 'length', has value (n+2*m)|0x8000. The second, 'bmpLength', has value n.

After the header the code points are stored in ascending order. Supplementary code points are stored as most significant 16 bits followed by least significant 16 bits.

Parameters

set	the set
dest	pointer to buffer of destCapacity 16-bit integers. May be NULL only if destCapacity is zero.
destCapacity	size of dest, or zero. Must not be negative.
pErrorCode	pointer to the error code. Will be set to U_INDEX_OUTOFBOUNDS_ERROR if n+2*m > 0x7FFF. Will be set to U_BUFFER_OVERFLOW_ERROR if n+2*m+(m!=0?2:1) > destCapacity.

Returns: the total length of the serialized format, including the header, that is, n+2*m+(m!=0?2:1), or 0 on error other than U_BUFFER_OVERFLOW_ERROR.

Stable:: ICU 2.4

◆ uset_serializedContains()

UBool uset_serializedContains	(	const USerializedSet *	set,
		UChar32	c
	)

Returns TRUE if the given USerializedSet contains the given character.

Parameters

set	the serialized set
c	The codepoint to check for within the set

Returns: true if set contains c

Stable:: ICU 2.4

◆ uset_set()

void uset_set	(	USet *	set,
		UChar32	start,
		UChar32	end
	)

Causes the USet object to represent the range start - end.

If start > end then this USet is set to an empty range. A frozen set will not be modified.

Parameters

set	the object to set to the given range
start	first character in the set, inclusive
end	last character in the set, inclusive

Stable:: ICU 3.2

◆ uset_setSerializedToOne()

void uset_setSerializedToOne	(	USerializedSet *	fillSet,
		UChar32	c
	)

Set the USerializedSet to contain the given character (and nothing else).

Parameters

fillSet	pointer to result
c	The codepoint to set

Stable:: ICU 2.4

◆ uset_size()

int32_t uset_size ( const USet * set )

Returns the number of characters and strings contained in the given USet.

Parameters

set the set

Returns: a non-negative integer counting the characters and strings contained in set

Stable:: ICU 2.4

◆ uset_span()

int32_t uset_span	(	const USet *	set,
		const UChar *	s,
		int32_t	length,
		USetSpanCondition	spanCondition
	)

Returns the length of the initial substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED).

See USetSpanCondition for details. Similar to the strspn() C library function. Unpaired surrogates are treated according to contains() of their surrogate code points. This function works faster with a frozen set and with a non-negative string length argument.

Parameters

set	the set
s	start of the string
length	of the string; can be -1 for NUL-terminated
spanCondition	specifies the containment condition

Returns: the length of the initial substring according to the spanCondition; 0 if the start of the string does not fit the spanCondition

Stable:: ICU 3.8

See also: USetSpanCondition

◆ uset_spanBack()

int32_t uset_spanBack	(	const USet *	set,
		const UChar *	s,
		int32_t	length,
		USetSpanCondition	spanCondition
	)

Returns the start of the trailing substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED).

See USetSpanCondition for details. Unpaired surrogates are treated according to contains() of their surrogate code points. This function works faster with a frozen set and with a non-negative string length argument.

Parameters

set	the set
s	start of the string
length	of the string; can be -1 for NUL-terminated
spanCondition	specifies the containment condition

Returns: the start of the trailing substring according to the spanCondition; the string length if the end of the string does not fit the spanCondition

Stable:: ICU 3.8

See also: USetSpanCondition

◆ uset_spanBackUTF8()

int32_t uset_spanBackUTF8	(	const USet *	set,
		const char *	s,
		int32_t	length,
		USetSpanCondition	spanCondition
	)

Returns the start of the trailing substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED).

See USetSpanCondition for details. Malformed byte sequences are treated according to contains(0xfffd). This function works faster with a frozen set and with a non-negative string length argument.

Parameters

set	the set
s	start of the string (UTF-8)
length	of the string; can be -1 for NUL-terminated
spanCondition	specifies the containment condition

Returns: the start of the trailing substring according to the spanCondition; the string length if the end of the string does not fit the spanCondition

Stable:: ICU 3.8

See also: USetSpanCondition

◆ uset_spanUTF8()

int32_t uset_spanUTF8	(	const USet *	set,
		const char *	s,
		int32_t	length,
		USetSpanCondition	spanCondition
	)

Returns the length of the initial substring of the input string which consists only of characters and strings that are contained in this set (USET_SPAN_CONTAINED, USET_SPAN_SIMPLE), or only of characters and strings that are not contained in this set (USET_SPAN_NOT_CONTAINED).

See USetSpanCondition for details. Similar to the strspn() C library function. Malformed byte sequences are treated according to contains(0xfffd). This function works faster with a frozen set and with a non-negative string length argument.

Parameters

set	the set
s	start of the string (UTF-8)
length	of the string; can be -1 for NUL-terminated
spanCondition	specifies the containment condition

Returns: the length of the initial substring according to the spanCondition; 0 if the start of the string does not fit the spanCondition

Stable:: ICU 3.8

See also: USetSpanCondition

◆ uset_toPattern()

int32_t uset_toPattern	(	const USet *	set,
		UChar *	result,
		int32_t	resultCapacity,
		UBool	escapeUnprintable,
		UErrorCode *	ec
	)

Returns a string representation of this set.

If the result of calling this function is passed to uset_openPattern(), it will produce another set that is equal to this one.

Parameters

set	the set
result	the string to receive the rules, may be NULL
resultCapacity	the capacity of result, may be 0 if result is NULL
escapeUnprintable	if TRUE then convert unprintable characters to their hex escape representations, \uxxxx or \Uxxxxxxxx. Unprintable characters are those other than U+000A, U+0020..U+007E.
ec	error code.

Returns: length of string, possibly larger than resultCapacity

Stable:: ICU 2.4

Data Structures

Namespaces

Typedefs

Enumerations

Functions

Detailed Description

Typedef Documentation

◆ USerializedSet

◆ USet

◆ USetSpanCondition

Enumeration Type Documentation

◆ anonymous enum

◆ anonymous enum

◆ USetSpanCondition

Function Documentation

◆ uset_add()

◆ uset_addAll()

◆ uset_addAllCodePoints()

◆ uset_addRange()

◆ uset_addString()

◆ uset_applyIntPropertyValue()

◆ uset_applyPattern()

◆ uset_applyPropertyAlias()

◆ uset_charAt()

◆ uset_clear()

◆ uset_clone()

◆ uset_cloneAsThawed()

◆ uset_close()

◆ uset_closeOver()

◆ uset_compact()

◆ uset_complement()

◆ uset_complementAll()

◆ uset_contains()

◆ uset_containsAll()

◆ uset_containsAllCodePoints()

◆ uset_containsNone()

◆ uset_containsRange()

◆ uset_containsSome()

◆ uset_containsString()

◆ uset_equals()

◆ uset_freeze()

◆ uset_getItem()

◆ uset_getItemCount()

◆ uset_getSerializedRange()

◆ uset_getSerializedRangeCount()

◆ uset_getSerializedSet()

◆ uset_indexOf()

◆ uset_isEmpty()

◆ uset_isFrozen()

◆ uset_open()

◆ uset_openEmpty()

◆ uset_openPattern()

◆ uset_openPatternOptions()

◆ uset_remove()

◆ uset_removeAll()

◆ uset_removeAllStrings()

◆ uset_removeRange()

◆ uset_removeString()

◆ uset_resemblesPattern()

◆ uset_retain()

◆ uset_retainAll()

◆ uset_serialize()

◆ uset_serializedContains()

◆ uset_set()

◆ uset_setSerializedToOne()

◆ uset_size()

◆ uset_span()

◆ uset_spanBack()

◆ uset_spanBackUTF8()

◆ uset_spanUTF8()

◆ uset_toPattern()