ICU 64.2  64.2
bytestrie.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2012, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: bytestrie.h
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010sep25
14 * created by: Markus W. Scherer
15 */
16 
17 #ifndef __BYTESTRIE_H__
18 #define __BYTESTRIE_H__
19 
25 #include "unicode/utypes.h"
26 #include "unicode/stringpiece.h"
27 #include "unicode/uobject.h"
28 #include "unicode/ustringtrie.h"
29 
31 
32 class ByteSink;
33 class BytesTrieBuilder;
34 class CharString;
35 class UVector32;
36 
50 class U_COMMON_API BytesTrie : public UMemory {
51 public:
66  BytesTrie(const void *trieBytes)
67  : ownedArray_(NULL), bytes_(static_cast<const uint8_t *>(trieBytes)),
68  pos_(bytes_), remainingMatchLength_(-1) {}
69 
74  ~BytesTrie();
75 
82  BytesTrie(const BytesTrie &other)
83  : ownedArray_(NULL), bytes_(other.bytes_),
84  pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
85 
92  pos_=bytes_;
93  remainingMatchLength_=-1;
94  return *this;
95  }
96 
102  class State : public UMemory {
103  public:
108  State() { bytes=NULL; }
109  private:
110  friend class BytesTrie;
111 
112  const uint8_t *bytes;
113  const uint8_t *pos;
114  int32_t remainingMatchLength;
115  };
116 
124  const BytesTrie &saveState(State &state) const {
125  state.bytes=bytes_;
126  state.pos=pos_;
127  state.remainingMatchLength=remainingMatchLength_;
128  return *this;
129  }
130 
141  BytesTrie &resetToState(const State &state) {
142  if(bytes_==state.bytes && bytes_!=NULL) {
143  pos_=state.pos;
144  remainingMatchLength_=state.remainingMatchLength;
145  }
146  return *this;
147  }
148 
155  UStringTrieResult current() const;
156 
165  inline UStringTrieResult first(int32_t inByte) {
166  remainingMatchLength_=-1;
167  if(inByte<0) {
168  inByte+=0x100;
169  }
170  return nextImpl(bytes_, inByte);
171  }
172 
180  UStringTrieResult next(int32_t inByte);
181 
197  UStringTrieResult next(const char *s, int32_t length);
198 
208  inline int32_t getValue() const {
209  const uint8_t *pos=pos_;
210  int32_t leadByte=*pos++;
211  // U_ASSERT(leadByte>=kMinValueLead);
212  return readValue(pos, leadByte>>1);
213  }
214 
224  inline UBool hasUniqueValue(int32_t &uniqueValue) const {
225  const uint8_t *pos=pos_;
226  // Skip the rest of a pending linear-match node.
227  return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
228  }
229 
238  int32_t getNextBytes(ByteSink &out) const;
239 
244  class U_COMMON_API Iterator : public UMemory {
245  public:
257  Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode);
258 
270  Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
271 
276  ~Iterator();
277 
283  Iterator &reset();
284 
289  UBool hasNext() const;
290 
305  UBool next(UErrorCode &errorCode);
306 
311  StringPiece getString() const;
316  int32_t getValue() const { return value_; }
317 
318  private:
319  UBool truncateAndStop();
320 
321  const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode);
322 
323  const uint8_t *bytes_;
324  const uint8_t *pos_;
325  const uint8_t *initialPos_;
326  int32_t remainingMatchLength_;
327  int32_t initialRemainingMatchLength_;
328 
329  CharString *str_;
330  int32_t maxLength_;
331  int32_t value_;
332 
333  // The stack stores pairs of integers for backtracking to another
334  // outbound edge of a branch node.
335  // The first integer is an offset from bytes_.
336  // The second integer has the str_->length() from before the node in bits 15..0,
337  // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.)
338  // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24,
339  // but the code looks more confusing that way.)
340  UVector32 *stack_;
341  };
342 
343 private:
344  friend class BytesTrieBuilder;
345 
352  BytesTrie(void *adoptBytes, const void *trieBytes)
353  : ownedArray_(static_cast<uint8_t *>(adoptBytes)),
354  bytes_(static_cast<const uint8_t *>(trieBytes)),
355  pos_(bytes_), remainingMatchLength_(-1) {}
356 
357  // No assignment operator.
358  BytesTrie &operator=(const BytesTrie &other);
359 
360  inline void stop() {
361  pos_=NULL;
362  }
363 
364  // Reads a compact 32-bit integer.
365  // pos is already after the leadByte, and the lead byte is already shifted right by 1.
366  static int32_t readValue(const uint8_t *pos, int32_t leadByte);
367  static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) {
368  // U_ASSERT(leadByte>=kMinValueLead);
369  if(leadByte>=(kMinTwoByteValueLead<<1)) {
370  if(leadByte<(kMinThreeByteValueLead<<1)) {
371  ++pos;
372  } else if(leadByte<(kFourByteValueLead<<1)) {
373  pos+=2;
374  } else {
375  pos+=3+((leadByte>>1)&1);
376  }
377  }
378  return pos;
379  }
380  static inline const uint8_t *skipValue(const uint8_t *pos) {
381  int32_t leadByte=*pos++;
382  return skipValue(pos, leadByte);
383  }
384 
385  // Reads a jump delta and jumps.
386  static const uint8_t *jumpByDelta(const uint8_t *pos);
387 
388  static inline const uint8_t *skipDelta(const uint8_t *pos) {
389  int32_t delta=*pos++;
390  if(delta>=kMinTwoByteDeltaLead) {
391  if(delta<kMinThreeByteDeltaLead) {
392  ++pos;
393  } else if(delta<kFourByteDeltaLead) {
394  pos+=2;
395  } else {
396  pos+=3+(delta&1);
397  }
398  }
399  return pos;
400  }
401 
402  static inline UStringTrieResult valueResult(int32_t node) {
403  return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node&kValueIsFinal));
404  }
405 
406  // Handles a branch node for both next(byte) and next(string).
407  UStringTrieResult branchNext(const uint8_t *pos, int32_t length, int32_t inByte);
408 
409  // Requires remainingLength_<0.
410  UStringTrieResult nextImpl(const uint8_t *pos, int32_t inByte);
411 
412  // Helper functions for hasUniqueValue().
413  // Recursively finds a unique value (or whether there is not a unique one)
414  // from a branch.
415  static const uint8_t *findUniqueValueFromBranch(const uint8_t *pos, int32_t length,
416  UBool haveUniqueValue, int32_t &uniqueValue);
417  // Recursively finds a unique value (or whether there is not a unique one)
418  // starting from a position on a node lead byte.
419  static UBool findUniqueValue(const uint8_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
420 
421  // Helper functions for getNextBytes().
422  // getNextBytes() when pos is on a branch node.
423  static void getNextBranchBytes(const uint8_t *pos, int32_t length, ByteSink &out);
424  static void append(ByteSink &out, int c);
425 
426  // BytesTrie data structure
427  //
428  // The trie consists of a series of byte-serialized nodes for incremental
429  // string/byte sequence matching. The root node is at the beginning of the trie data.
430  //
431  // Types of nodes are distinguished by their node lead byte ranges.
432  // After each node, except a final-value node, another node follows to
433  // encode match values or continue matching further bytes.
434  //
435  // Node types:
436  // - Value node: Stores a 32-bit integer in a compact, variable-length format.
437  // The value is for the string/byte sequence so far.
438  // One node bit indicates whether the value is final or whether
439  // matching continues with the next node.
440  // - Linear-match node: Matches a number of bytes.
441  // - Branch node: Branches to other nodes according to the current input byte.
442  // The node byte is the length of the branch (number of bytes to select from)
443  // minus 1. It is followed by a sub-node:
444  // - If the length is at most kMaxBranchLinearSubNodeLength, then
445  // there are length-1 (key, value) pairs and then one more comparison byte.
446  // If one of the key bytes matches, then the value is either a final value for
447  // the string/byte sequence so far, or a "jump" delta to the next node.
448  // If the last byte matches, then matching continues with the next node.
449  // (Values have the same encoding as value nodes.)
450  // - If the length is greater than kMaxBranchLinearSubNodeLength, then
451  // there is one byte and one "jump" delta.
452  // If the input byte is less than the sub-node byte, then "jump" by delta to
453  // the next sub-node which will have a length of length/2.
454  // (The delta has its own compact encoding.)
455  // Otherwise, skip the "jump" delta to the next sub-node
456  // which will have a length of length-length/2.
457 
458  // Node lead byte values.
459 
460  // 00..0f: Branch node. If node!=0 then the length is node+1, otherwise
461  // the length is one more than the next byte.
462 
463  // For a branch sub-node with at most this many entries, we drop down
464  // to a linear search.
465  static const int32_t kMaxBranchLinearSubNodeLength=5;
466 
467  // 10..1f: Linear-match node, match 1..16 bytes and continue reading the next node.
468  static const int32_t kMinLinearMatch=0x10;
469  static const int32_t kMaxLinearMatchLength=0x10;
470 
471  // 20..ff: Variable-length value node.
472  // If odd, the value is final. (Otherwise, intermediate value or jump delta.)
473  // Then shift-right by 1 bit.
474  // The remaining lead byte value indicates the number of following bytes (0..4)
475  // and contains the value's top bits.
476  static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x20
477  // It is a final value if bit 0 is set.
478  static const int32_t kValueIsFinal=1;
479 
480  // Compact value: After testing bit 0, shift right by 1 and then use the following thresholds.
481  static const int32_t kMinOneByteValueLead=kMinValueLead/2; // 0x10
482  static const int32_t kMaxOneByteValue=0x40; // At least 6 bits in the first byte.
483 
484  static const int32_t kMinTwoByteValueLead=kMinOneByteValueLead+kMaxOneByteValue+1; // 0x51
485  static const int32_t kMaxTwoByteValue=0x1aff;
486 
487  static const int32_t kMinThreeByteValueLead=kMinTwoByteValueLead+(kMaxTwoByteValue>>8)+1; // 0x6c
488  static const int32_t kFourByteValueLead=0x7e;
489 
490  // A little more than Unicode code points. (0x11ffff)
491  static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1;
492 
493  static const int32_t kFiveByteValueLead=0x7f;
494 
495  // Compact delta integers.
496  static const int32_t kMaxOneByteDelta=0xbf;
497  static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0
498  static const int32_t kMinThreeByteDeltaLead=0xf0;
499  static const int32_t kFourByteDeltaLead=0xfe;
500  static const int32_t kFiveByteDeltaLead=0xff;
501 
502  static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff
503  static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff
504 
505  uint8_t *ownedArray_;
506 
507  // Fixed value referencing the BytesTrie bytes.
508  const uint8_t *bytes_;
509 
510  // Iterator variables.
511 
512  // Pointer to next trie byte to read. NULL if no more matches.
513  const uint8_t *pos_;
514  // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
515  int32_t remainingMatchLength_;
516 };
517 
519 
520 #endif // __BYTESTRIE_H__
Builder class for BytesTrie.
BytesTrie state object, for saving a trie&#39;s current state and resetting the trie back to this state l...
Definition: bytestrie.h:102
UStringTrieResult
Return values for BytesTrie::next(), UCharsTrie::next() and similar methods.
Definition: ustringtrie.h:35
BytesTrie(const void *trieBytes)
Constructs a BytesTrie reader instance.
Definition: bytestrie.h:66
A ByteSink can be filled with bytes.
Definition: bytestream.h:50
UBool hasUniqueValue(int32_t &uniqueValue) const
Determines whether all byte sequences reachable from the current state map to the same value...
Definition: bytestrie.h:224
Light-weight, non-const reader class for a BytesTrie.
Definition: bytestrie.h:50
BytesTrie & reset()
Resets this trie to its initial state.
Definition: bytestrie.h:91
C++ API: StringPiece: Read-only byte string wrapper class.
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:137
int32_t getValue() const
Definition: bytestrie.h:316
UStringTrieResult first(int32_t inByte)
Traverses the trie from the initial state for this input byte.
Definition: bytestrie.h:165
int32_t getValue() const
Returns a matching byte sequence&#39;s value if called immediately after current()/first()/next() returne...
Definition: bytestrie.h:208
BytesTrie(const BytesTrie &other)
Copy constructor, copies the other trie reader object and its state, but not the byte array which wil...
Definition: bytestrie.h:82
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition: utypes.h:188
State()
Constructs an empty State.
Definition: bytestrie.h:108
Iterator for all of the (byte sequence, value) pairs in a BytesTrie.
Definition: bytestrie.h:244
C++ API: Common ICU base class UObject.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:138
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:401
C API: Helper definitions for dictionary trie APIs.
const BytesTrie & saveState(State &state) const
Saves the state of this trie.
Definition: bytestrie.h:124
Basic definitions for ICU, for both C and C++ APIs.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:233
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
The input unit(s) continued a matching string and there is a value for the string so far...
Definition: ustringtrie.h:66
A string-like object that points to a sized piece of memory.
Definition: stringpiece.h:54
BytesTrie & resetToState(const State &state)
Resets this trie to the saved state.
Definition: bytestrie.h:141
UMemory is the common ICU base class.
Definition: uobject.h:112
int8_t UBool
The ICU boolean type.
Definition: umachine.h:225