ICU 64.2  64.2
ucharstrie.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2012, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: ucharstrie.h
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010nov14
14 * created by: Markus W. Scherer
15 */
16 
17 #ifndef __UCHARSTRIE_H__
18 #define __UCHARSTRIE_H__
19 
26 #include "unicode/utypes.h"
27 #include "unicode/unistr.h"
28 #include "unicode/uobject.h"
29 #include "unicode/ustringtrie.h"
30 
32 
33 class Appendable;
34 class UCharsTrieBuilder;
35 class UVector32;
36 
51 public:
67  : ownedArray_(NULL), uchars_(trieUChars),
68  pos_(uchars_), remainingMatchLength_(-1) {}
69 
74  ~UCharsTrie();
75 
82  UCharsTrie(const UCharsTrie &other)
83  : ownedArray_(NULL), uchars_(other.uchars_),
84  pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
85 
92  pos_=uchars_;
93  remainingMatchLength_=-1;
94  return *this;
95  }
96 
102  class State : public UMemory {
103  public:
108  State() { uchars=NULL; }
109  private:
110  friend class UCharsTrie;
111 
112  const char16_t *uchars;
113  const char16_t *pos;
114  int32_t remainingMatchLength;
115  };
116 
124  const UCharsTrie &saveState(State &state) const {
125  state.uchars=uchars_;
126  state.pos=pos_;
127  state.remainingMatchLength=remainingMatchLength_;
128  return *this;
129  }
130 
141  UCharsTrie &resetToState(const State &state) {
142  if(uchars_==state.uchars && uchars_!=NULL) {
143  pos_=state.pos;
144  remainingMatchLength_=state.remainingMatchLength;
145  }
146  return *this;
147  }
148 
155  UStringTrieResult current() const;
156 
164  inline UStringTrieResult first(int32_t uchar) {
165  remainingMatchLength_=-1;
166  return nextImpl(uchars_, uchar);
167  }
168 
177  UStringTrieResult firstForCodePoint(UChar32 cp);
178 
185  UStringTrieResult next(int32_t uchar);
186 
194  UStringTrieResult nextForCodePoint(UChar32 cp);
195 
211  UStringTrieResult next(ConstChar16Ptr s, int32_t length);
212 
222  inline int32_t getValue() const {
223  const char16_t *pos=pos_;
224  int32_t leadUnit=*pos++;
225  // U_ASSERT(leadUnit>=kMinValueLead);
226  return leadUnit&kValueIsFinal ?
227  readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
228  }
229 
239  inline UBool hasUniqueValue(int32_t &uniqueValue) const {
240  const char16_t *pos=pos_;
241  // Skip the rest of a pending linear-match node.
242  return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
243  }
244 
252  int32_t getNextUChars(Appendable &out) const;
253 
258  class U_COMMON_API Iterator : public UMemory {
259  public:
271  Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
272 
284  Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
285 
290  ~Iterator();
291 
297  Iterator &reset();
298 
303  UBool hasNext() const;
304 
319  UBool next(UErrorCode &errorCode);
320 
325  const UnicodeString &getString() const { return str_; }
330  int32_t getValue() const { return value_; }
331 
332  private:
333  UBool truncateAndStop() {
334  pos_=NULL;
335  value_=-1; // no real value for str
336  return TRUE;
337  }
338 
339  const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode);
340 
341  const char16_t *uchars_;
342  const char16_t *pos_;
343  const char16_t *initialPos_;
344  int32_t remainingMatchLength_;
345  int32_t initialRemainingMatchLength_;
346  UBool skipValue_; // Skip intermediate value which was already delivered.
347 
348  UnicodeString str_;
349  int32_t maxLength_;
350  int32_t value_;
351 
352  // The stack stores pairs of integers for backtracking to another
353  // outbound edge of a branch node.
354  // The first integer is an offset from uchars_.
355  // The second integer has the str_.length() from before the node in bits 15..0,
356  // and the remaining branch length in bits 31..16.
357  // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
358  // but the code looks more confusing that way.)
359  UVector32 *stack_;
360  };
361 
362 private:
363  friend class UCharsTrieBuilder;
364 
371  UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars)
372  : ownedArray_(adoptUChars), uchars_(trieUChars),
373  pos_(uchars_), remainingMatchLength_(-1) {}
374 
375  // No assignment operator.
376  UCharsTrie &operator=(const UCharsTrie &other);
377 
378  inline void stop() {
379  pos_=NULL;
380  }
381 
382  // Reads a compact 32-bit integer.
383  // pos is already after the leadUnit, and the lead unit has bit 15 reset.
384  static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) {
385  int32_t value;
386  if(leadUnit<kMinTwoUnitValueLead) {
387  value=leadUnit;
388  } else if(leadUnit<kThreeUnitValueLead) {
389  value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
390  } else {
391  value=(pos[0]<<16)|pos[1];
392  }
393  return value;
394  }
395  static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) {
396  if(leadUnit>=kMinTwoUnitValueLead) {
397  if(leadUnit<kThreeUnitValueLead) {
398  ++pos;
399  } else {
400  pos+=2;
401  }
402  }
403  return pos;
404  }
405  static inline const char16_t *skipValue(const char16_t *pos) {
406  int32_t leadUnit=*pos++;
407  return skipValue(pos, leadUnit&0x7fff);
408  }
409 
410  static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) {
411  // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
412  int32_t value;
413  if(leadUnit<kMinTwoUnitNodeValueLead) {
414  value=(leadUnit>>6)-1;
415  } else if(leadUnit<kThreeUnitNodeValueLead) {
416  value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
417  } else {
418  value=(pos[0]<<16)|pos[1];
419  }
420  return value;
421  }
422  static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) {
423  // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
424  if(leadUnit>=kMinTwoUnitNodeValueLead) {
425  if(leadUnit<kThreeUnitNodeValueLead) {
426  ++pos;
427  } else {
428  pos+=2;
429  }
430  }
431  return pos;
432  }
433 
434  static inline const char16_t *jumpByDelta(const char16_t *pos) {
435  int32_t delta=*pos++;
436  if(delta>=kMinTwoUnitDeltaLead) {
437  if(delta==kThreeUnitDeltaLead) {
438  delta=(pos[0]<<16)|pos[1];
439  pos+=2;
440  } else {
441  delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
442  }
443  }
444  return pos+delta;
445  }
446 
447  static const char16_t *skipDelta(const char16_t *pos) {
448  int32_t delta=*pos++;
449  if(delta>=kMinTwoUnitDeltaLead) {
450  if(delta==kThreeUnitDeltaLead) {
451  pos+=2;
452  } else {
453  ++pos;
454  }
455  }
456  return pos;
457  }
458 
459  static inline UStringTrieResult valueResult(int32_t node) {
461  }
462 
463  // Handles a branch node for both next(uchar) and next(string).
464  UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar);
465 
466  // Requires remainingLength_<0.
467  UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar);
468 
469  // Helper functions for hasUniqueValue().
470  // Recursively finds a unique value (or whether there is not a unique one)
471  // from a branch.
472  static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length,
473  UBool haveUniqueValue, int32_t &uniqueValue);
474  // Recursively finds a unique value (or whether there is not a unique one)
475  // starting from a position on a node lead unit.
476  static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
477 
478  // Helper functions for getNextUChars().
479  // getNextUChars() when pos is on a branch node.
480  static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out);
481 
482  // UCharsTrie data structure
483  //
484  // The trie consists of a series of char16_t-serialized nodes for incremental
485  // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer)
486  // The root node is at the beginning of the trie data.
487  //
488  // Types of nodes are distinguished by their node lead unit ranges.
489  // After each node, except a final-value node, another node follows to
490  // encode match values or continue matching further units.
491  //
492  // Node types:
493  // - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
494  // The value is for the string/char16_t sequence so far.
495  // - Match node, optionally with an intermediate value in a different compact format.
496  // The value, if present, is for the string/char16_t sequence so far.
497  //
498  // Aside from the value, which uses the node lead unit's high bits:
499  //
500  // - Linear-match node: Matches a number of units.
501  // - Branch node: Branches to other nodes according to the current input unit.
502  // The node unit is the length of the branch (number of units to select from)
503  // minus 1. It is followed by a sub-node:
504  // - If the length is at most kMaxBranchLinearSubNodeLength, then
505  // there are length-1 (key, value) pairs and then one more comparison unit.
506  // If one of the key units matches, then the value is either a final value for
507  // the string so far, or a "jump" delta to the next node.
508  // If the last unit matches, then matching continues with the next node.
509  // (Values have the same encoding as final-value nodes.)
510  // - If the length is greater than kMaxBranchLinearSubNodeLength, then
511  // there is one unit and one "jump" delta.
512  // If the input unit is less than the sub-node unit, then "jump" by delta to
513  // the next sub-node which will have a length of length/2.
514  // (The delta has its own compact encoding.)
515  // Otherwise, skip the "jump" delta to the next sub-node
516  // which will have a length of length-length/2.
517 
518  // Match-node lead unit values, after masking off intermediate-value bits:
519 
520  // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
521  // the length is one more than the next unit.
522 
523  // For a branch sub-node with at most this many entries, we drop down
524  // to a linear search.
525  static const int32_t kMaxBranchLinearSubNodeLength=5;
526 
527  // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
528  static const int32_t kMinLinearMatch=0x30;
529  static const int32_t kMaxLinearMatchLength=0x10;
530 
531  // Match-node lead unit bits 14..6 for the optional intermediate value.
532  // If these bits are 0, then there is no intermediate value.
533  // Otherwise, see the *NodeValue* constants below.
534  static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
535  static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f
536 
537  // A final-value node has bit 15 set.
538  static const int32_t kValueIsFinal=0x8000;
539 
540  // Compact value: After testing and masking off bit 15, use the following thresholds.
541  static const int32_t kMaxOneUnitValue=0x3fff;
542 
543  static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
544  static const int32_t kThreeUnitValueLead=0x7fff;
545 
546  static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
547 
548  // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
549  static const int32_t kMaxOneUnitNodeValue=0xff;
550  static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
551  static const int32_t kThreeUnitNodeValueLead=0x7fc0;
552 
553  static const int32_t kMaxTwoUnitNodeValue=
554  ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
555 
556  // Compact delta integers.
557  static const int32_t kMaxOneUnitDelta=0xfbff;
558  static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
559  static const int32_t kThreeUnitDeltaLead=0xffff;
560 
561  static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
562 
563  char16_t *ownedArray_;
564 
565  // Fixed value referencing the UCharsTrie words.
566  const char16_t *uchars_;
567 
568  // Iterator variables.
569 
570  // Pointer to next trie unit to read. NULL if no more matches.
571  const char16_t *pos_;
572  // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
573  int32_t remainingMatchLength_;
574 };
575 
577 
578 #endif // __UCHARSTRIE_H__
UCharsTrie & resetToState(const State &state)
Resets this trie to the saved state.
Definition: ucharstrie.h:141
int32_t getValue() const
Returns a matching string&#39;s value if called immediately after current()/first()/next() returned USTRI...
Definition: ucharstrie.h:222
C++ API: Unicode String.
int32_t getValue() const
Definition: ucharstrie.h:330
UStringTrieResult
Return values for BytesTrie::next(), UCharsTrie::next() and similar methods.
Definition: ustringtrie.h:35
UCharsTrie(const UCharsTrie &other)
Copy constructor, copies the other trie reader object and its state, but not the char16_t array which...
Definition: ucharstrie.h:82
UBool hasUniqueValue(int32_t &uniqueValue) const
Determines whether all strings reachable from the current state map to the same value.
Definition: ucharstrie.h:239
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:137
const UnicodeString & getString() const
Definition: ucharstrie.h:325
UCharsTrie & reset()
Resets this trie to its initial state.
Definition: ucharstrie.h:91
Iterator for all of the (string, value) pairs in a UCharsTrie.
Definition: ucharstrie.h:258
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition: umachine.h:389
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition: utypes.h:188
Builder class for UCharsTrie.
#define TRUE
The TRUE value of a UBool.
Definition: umachine.h:229
const UCharsTrie & saveState(State &state) const
Saves the state of this trie.
Definition: ucharstrie.h:124
C++ API: Common ICU base class UObject.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:138
State()
Constructs an empty State.
Definition: ucharstrie.h:108
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:401
UCharsTrie state object, for saving a trie&#39;s current state and resetting the trie back to this state ...
Definition: ucharstrie.h:102
const char16_t * wrapper with implicit conversion from distinct but bit-compatible pointer types...
Definition: char16ptr.h:146
C API: Helper definitions for dictionary trie APIs.
UCharsTrie(ConstChar16Ptr trieUChars)
Constructs a UCharsTrie reader instance.
Definition: ucharstrie.h:66
Basic definitions for ICU, for both C and C++ APIs.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:233
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition: unistr.h:289
The input unit(s) continued a matching string and there is a value for the string so far...
Definition: ustringtrie.h:66
UStringTrieResult first(int32_t uchar)
Traverses the trie from the initial state for this input char16_t.
Definition: ucharstrie.h:164
UMemory is the common ICU base class.
Definition: uobject.h:112
Light-weight, non-const reader class for a UCharsTrie.
Definition: ucharstrie.h:50
int8_t UBool
The ICU boolean type.
Definition: umachine.h:225
Base class for objects to which Unicode characters and strings can be appended.
Definition: appendable.h:51