ICU 64.2  64.2
stringtriebuilder.h
Go to the documentation of this file.
1 // © 2016 and later: Unicode, Inc. and others.
2 // License & terms of use: http://www.unicode.org/copyright.html
3 /*
4 *******************************************************************************
5 * Copyright (C) 2010-2012,2014, International Business Machines
6 * Corporation and others. All Rights Reserved.
7 *******************************************************************************
8 * file name: stringtriebuilder.h
9 * encoding: UTF-8
10 * tab size: 8 (not used)
11 * indentation:4
12 *
13 * created on: 2010dec24
14 * created by: Markus W. Scherer
15 */
16 
17 #ifndef __STRINGTRIEBUILDER_H__
18 #define __STRINGTRIEBUILDER_H__
19 
20 #include "unicode/utypes.h"
21 #include "unicode/uobject.h"
22 
28 // Forward declaration.
30 struct UHashtable;
31 typedef struct UHashtable UHashtable;
33 
55 };
56 
58 
65 class U_COMMON_API StringTrieBuilder : public UObject {
66 public:
67 #ifndef U_HIDE_INTERNAL_API
68 
// Hash callback taking an untyped node pointer and returning a 32-bit hash.
// NOTE(review): the void* signature suggests this is the hasher installed on the
// UHashtable `nodes` set declared further down in this class -- confirm in the .cpp.
69  static int32_t hashNode(const void *node);
// Equality callback taking two untyped node pointers; the counterpart of hashNode().
// NOTE(review): presumably forwards to Node::operator==() -- confirm in the .cpp.
71  static UBool equalNodes(const void *left, const void *right);
72 #endif /* U_HIDE_INTERNAL_API */
73 
74 protected:
75  // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
76  // or else the compiler will create a public default constructor.
// Protected default constructor: this class has pure virtual members, so it is only
// ever constructed as the base part of a concrete builder subclass.
78  StringTrieBuilder();
// Virtual so that a concrete builder can be destroyed through a StringTrieBuilder pointer.
80  virtual ~StringTrieBuilder();
81 
82 #ifndef U_HIDE_INTERNAL_API
83 
// Sets up the state needed for a compact build; failures are reported through errorCode.
// NOTE(review): presumably allocates the `nodes` hash set, pre-sized from sizeGuess --
// confirm in the .cpp.
84  void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
// Counterpart of createCompactBuilder().
// NOTE(review): presumably releases the `nodes` hash set -- confirm in the .cpp.
86  void deleteCompactBuilder();
87 
// Builds the trie from elementsLength elements according to buildOption;
// failures are reported through errorCode.
// NOTE(review): the elements themselves are supplied by the subclass through the
// pure virtual getElement...() accessors declared below -- confirm in the .cpp.
89  void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
90 
// Writes a node for an element range at the given unit index and returns an int32_t.
// NOTE(review): start/limit look like a half-open element index range and the result
// looks like the written offset -- confirm both in the .cpp.
92  int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
// Same parameters as writeNode() plus `length`.
// NOTE(review): `length` is presumably the number of distinct units in the branch.
94  int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
95 #endif /* U_HIDE_INTERNAL_API */
96 
97  class Node;
98 
99 #ifndef U_HIDE_INTERNAL_API
100 
// Node-graph counterpart of writeNode(): same element-range parameters, but it returns
// a Node pointer and reports failures through errorCode.
// NOTE(review): per the C++ note on registerNode() further down, the returned pointer
// may be NULL after a failure -- callers should check before dereferencing.
101  Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
// Node-graph counterpart of writeBranchSubNode(); `length` has the same meaning there.
103  Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
104  int32_t length, UErrorCode &errorCode);
105 #endif /* U_HIDE_INTERNAL_API */
106 
// Pure virtual hooks through which the shared builder code reads the subclass's elements.
// All of them are const; the index parameters `i`, `start`, `limit`, `first` and `last`
// are element indexes, and `unitIndex` is a position inside an element's string.

// Length of element i's string.
// NOTE(review): presumably counted in units of the concrete trie (bytes or char16_t).
108  virtual int32_t getElementStringLength(int32_t i) const = 0;
// The unit at position unitIndex of element i's string, returned as char16_t.
110  virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
// The int32_t value associated with element i.
112  virtual int32_t getElementValue(int32_t i) const = 0;
113 
114  // Finds the first unit index after this one where
115  // the first and last element have different units again.
117  virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
118 
119  // Number of different units at unitIndex.
121  virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
// NOTE(review): by its name, returns the element index reached after skipping `count`
// distinct units at unitIndex, starting from element i -- confirm against an implementation.
123  virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
// NOTE(review): by its name, returns the index of the first element after i whose unit
// at unitIndex differs from `unit` -- confirm against an implementation.
125  virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
126 
// Whether linear-match nodes of the concrete trie format may carry a value.
128  virtual UBool matchNodesCanHaveValues() const = 0;
129 
// Format limits of the concrete trie. The static constant kMaxBranchLinearSubNodeLength
// declared below is commented as the maximum of this limit over BytesTrie and UCharsTrie.
131  virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
133  virtual int32_t getMinLinearMatch() const = 0;
135  virtual int32_t getMaxLinearMatchLength() const = 0;
136 
137 #ifndef U_HIDE_INTERNAL_API
138  // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
// This constant is also the fixed capacity of the arrays inside ListBranchNode.
140  static const int32_t kMaxBranchLinearSubNodeLength=5;
141 
142  // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
143  // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
// Worked out: 65536/5 = 13107.2 and log2(13107.2) is about 13.68, which rounds up to 14.
145  static const int32_t kMaxSplitBranchLevels=14;
146 
// Registers newNode and returns the node to use in its place.
// Ownership and failure behaviour are spelled out in the "C++ note" comment that follows
// these declarations: the input node is owned by this call, NULL input records
// U_MEMORY_ALLOCATION_ERROR, and NULL is returned on failure.
// NOTE(review): presumably de-duplicates against the `nodes` hash set -- confirm in the .cpp.
157  Node *registerNode(Node *newNode, UErrorCode &errorCode);
// Like registerNode(), but for a final value given directly as an int32_t.
// The comment on FinalValueNode says a stack-allocated FinalValueNode is used for the lookup.
168  Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
169 #endif /* U_HIDE_INTERNAL_API */
170 
171  /*
172  * C++ note:
173  * registerNode() and registerFinalValue() take ownership of their input nodes,
174  * and only return owned nodes.
175  * If they see a failure UErrorCode, they will delete the input node.
176  * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
177  * If there is a failure, they return NULL.
178  *
179  * NULL Node pointers can be safely passed into other Nodes because
180  * they call the static Node::hashCode() which checks for a NULL pointer first.
181  *
182  * Therefore, as long as builder functions register a new node,
183  * they need to check for failures only before explicitly dereferencing
184  * a Node pointer, or before setting a new UErrorCode.
185  */
186 
187  // Hash set of nodes, maps from nodes to integer 1.
189  UHashtable *nodes;
190 
191  // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
192  // it is needed for layout of other objects.
// Abstract base class of the builder's node graph.
// Every node carries a hash, seeded by the constructor, and an offset that starts at 0.
// Going by the comments below, the offset is negative while it holds an edge number
// and positive once write() has run.
197  class Node : public UObject {
198  public:
// Stores the seed hash and starts with offset 0.
199 Node(int32_t initialHash) : hash(initialHash), offset(0) {}
// Returns the stored hash; nothing is recomputed here.
200 inline int32_t hashCode() const { return hash; }
201 // Handles node==NULL.
// A NULL node hashes to 0.
202 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
203 // Base class operator==() compares the actual class types.
204 virtual UBool operator==(const Node &other) const;
// Negation of the virtual operator==(), so subclasses only need to override operator==().
205 inline UBool operator!=(const Node &other) const { return !operator==(other); }
// Takes an edge number and returns an int32_t; the original doc comment is elided from
// this listing.
// NOTE(review): presumably assigns (negative) edge numbers into `offset` and returns
// the next edge number to use -- confirm in the .cpp.
233 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
234 // write() must set the offset to a positive value.
235 virtual void write(StringTrieBuilder &builder) = 0;
236 // See markRightEdgesFirst.
// Calls write() only when this node still holds an edge number (offset<0) that lies
// outside the closed range [lastRight, firstRight]. A node whose offset is 0 or
// positive is never written by this helper.
237 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
238 StringTrieBuilder &builder) {
239 // Note: Edge numbers are negative, lastRight<=firstRight.
240 // If offset>0 then this node and its sub-nodes have been written already
241 // and we need not write them again.
242 // If this node is part of the unwritten right branch edge,
243 // then we wait until that is written.
244 if(offset<0 && (offset<lastRight || firstRight<offset)) {
245 write(builder);
246 }
247 }
// Returns the raw offset field without interpreting it.
248 inline int32_t getOffset() const { return offset; }
249  protected:
// Seeded by the constructor; ValueNode::setValue() and ListBranchNode::add() fold more data in.
250 int32_t hash;
// 0 after construction; see the class comment for the sign convention.
251 int32_t offset;
252  };
253 
254 #ifndef U_HIDE_INTERNAL_API
255  // This class should not be overridden because
256  // registerFinalValue() compares a stack-allocated FinalValueNode
257  // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
258  // with the input node, and the
259  // Node::operator==(other) check used inside FinalValueNode::operator==(other)
260  // will be false if the typeid's are different.
// Node that holds nothing but a final int32_t value.
262  class FinalValueNode : public Node {
263  public:
// The hash seed mixes the tag constant 0x111111 with the value. The `u` suffixes make
// the multiply-add unsigned before the result is converted to the int32_t hash.
264 FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
265 virtual UBool operator==(const Node &other) const;
266 virtual void write(StringTrieBuilder &builder);
267  protected:
// The final value this node stands for; set once in the constructor.
268 int32_t value;
269  };
270 #endif /* U_HIDE_INTERNAL_API */
271 
272  // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
273  // it is needed for layout of other objects.
// Base class for nodes that may optionally carry a value.
// It does not override write(), so it stays abstract.
277  class ValueNode : public Node {
278  public:
// Starts without a value: hasValue is FALSE and value is 0.
279 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
280 virtual UBool operator==(const Node &other) const;
// Attaches a value and folds it into the hash.
// NOTE(review): because this changes the hash, it presumably has to be called before
// the node is registered in the `nodes` hash set -- confirm with the callers.
281 void setValue(int32_t v) {
282 hasValue=TRUE;
283 value=v;
// 37u makes the arithmetic unsigned before the result is stored back into the int32_t hash.
284 hash=hash*37u+v;
285 }
286  protected:
// TRUE once setValue() has been called.
287 UBool hasValue;
// Meaningful only when hasValue is TRUE; 0 otherwise.
288 int32_t value;
289  };
290 
291 #ifndef U_HIDE_INTERNAL_API
292 
// A ValueNode that always has a value and is followed by another node.
295  class IntermediateValueNode : public ValueNode {
296  public:
// The hash seed combines the tag constant 0x222222 with the hash of nextNode (0 when
// nextNode is NULL). setValue(v) then marks the value as present and folds v into the hash.
297 IntermediateValueNode(int32_t v, Node *nextNode)
298 : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
299 virtual UBool operator==(const Node &other) const;
300 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
301 virtual void write(StringTrieBuilder &builder);
302  protected:
// The node that follows this value; stored as passed to the constructor.
// NOTE(review): who deletes it is not visible in this header.
303 Node *next;
304  };
305 #endif /* U_HIDE_INTERNAL_API */
306 
307  // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
308  // it is needed for layout of other objects.
// A ValueNode for a linear match of `length` units followed by `next`.
// It does not override write(), so it stays abstract; concrete instances come from the
// pure virtual createLinearMatchNode() that subclasses of StringTrieBuilder implement.
312  class LinearMatchNode : public ValueNode {
313  public:
// The hash seed combines the tag constant 0x333333, the match length and the hash of
// nextNode (0 when nextNode is NULL). No value is set here.
314 LinearMatchNode(int32_t len, Node *nextNode)
315 : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
316 length(len), next(nextNode) {}
317 virtual UBool operator==(const Node &other) const;
318 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
319  protected:
// Number of units in the match, as passed to the constructor.
320 int32_t length;
// The node that follows the matched units.
321 Node *next;
322  };
323 
324 #ifndef U_HIDE_INTERNAL_API
325 
// Common base class of the branch node kinds (ListBranchNode and SplitBranchNode).
// It does not override write(), so it stays abstract.
328  class BranchNode : public Node {
329  public:
// Forwards the seed hash to Node. firstEdgeNumber is zero-initialized so that it never
// holds an indeterminate value if it is read before a subclass assigns it.
330 BranchNode(int32_t initialHash) : Node(initialHash), firstEdgeNumber(0) {}
331  protected:
// NOTE(review): presumably the edge number recorded by the subclasses'
// markRightEdgesFirst() and later used by their write() -- confirm in the .cpp.
332 int32_t firstEdgeNumber;
333  };
334 
// Branch node that stores its units in small fixed-size parallel arrays.
// Slot k is described by units[k], equal[k] and values[k] for 0<=k<length.
// Capacity is kMaxBranchLinearSubNodeLength and neither add() overload checks it, so
// callers must not add more entries than that.
338  class ListBranchNode : public BranchNode {
339  public:
// Starts empty with the tag constant 0x444444 as the hash seed. The arrays are left
// uninitialized; only the first `length` slots are ever filled by add().
340 ListBranchNode() : BranchNode(0x444444), length(0) {}
341 virtual UBool operator==(const Node &other) const;
342 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
343 virtual void write(StringTrieBuilder &builder);
344 // Adds a unit with a final value.
// c is narrowed to char16_t for storage, but the full int32_t c goes into the hash.
345 void add(int32_t c, int32_t value) {
346 units[length]=(char16_t)c;
347 equal[length]=NULL;
348 values[length]=value;
349 ++length;
350 hash=(hash*37u+c)*37u+value;
351 }
352 // Adds a unit which leads to another match node.
// If `node` is NULL, the slot ends up looking like a final value of 0, because
// equal[k]==NULL is the "has final value" marker. The hash still differs from
// add(c, 0) only if hashCode(NULL) did, and hashCode(NULL) is 0.
353 void add(int32_t c, Node *node) {
354 units[length]=(char16_t)c;
355 equal[length]=node;
356 values[length]=0;
357 ++length;
358 hash=(hash*37u+c)*37u+hashCode(node);
359 }
360  protected:
361 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
// Number of slots filled so far.
362 int32_t length;
// Final value per slot; 0 for slots that lead to another node.
363 int32_t values[kMaxBranchLinearSubNodeLength];
// The branch unit per slot.
364 char16_t units[kMaxBranchLinearSubNodeLength];
365  };
366 
// Branch node that splits on a middle unit into a "less than" sub-node and a
// "greater or equal" sub-node.
370  class SplitBranchNode : public BranchNode {
371  public:
// The hash seed combines the tag constant 0x555555, the middle unit and the hashes of
// both sub-nodes. A NULL sub-node contributes 0.
372 SplitBranchNode(char16_t middleUnit, Node *lessThanNode, Node *greaterOrEqualNode)
373 : BranchNode(((0x555555u*37u+middleUnit)*37u+
374 hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
375 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
376 virtual UBool operator==(const Node &other) const;
377 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
378 virtual void write(StringTrieBuilder &builder);
379  protected:
// The unit on which the branch is split.
380 char16_t unit;
// Sub-node for units below `unit`, as named by the constructor parameter.
381 Node *lessThan;
// Sub-node for units at or above `unit`, as named by the constructor parameter.
382 Node *greaterOrEqual;
383  };
384 
385  // Branch head node, for writing the actual node lead unit.
// Being a ValueNode, it may also carry a value through setValue().
387  class BranchHeadNode : public ValueNode {
388  public:
// The hash seed combines the tag constant 0x666666, `len` and the hash of subNode
// (0 when subNode is NULL).
// NOTE(review): `len` is presumably the number of units in the branch -- confirm with
// the callers.
389 BranchHeadNode(int32_t len, Node *subNode)
390 : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
391 length(len), next(subNode) {}
392 virtual UBool operator==(const Node &other) const;
393 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
394 virtual void write(StringTrieBuilder &builder);
395  protected:
// Stored as passed to the constructor.
396 int32_t length;
397 Node *next; // A branch sub-node.
398  };
399 
400 #endif /* U_HIDE_INTERNAL_API */
401 
// Factory hook: the concrete builder returns its own node for a linear match, described
// by element index i, starting position unitIndex, unit count `length` and the node that
// follows it. It is needed because LinearMatchNode itself does not implement write().
// NOTE(review): the returned node is presumably heap-allocated and handed to
// registerNode() -- confirm in the .cpp.
404  virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
405  Node *nextNode) const = 0;
406 
// Pure virtual output hooks implemented by the concrete builder. Each returns an int32_t.
// NOTE(review): the result is presumably the current length/offset of the output written
// so far -- confirm against an implementation.

// Writes a single unit.
408  virtual int32_t write(int32_t unit) = 0;
// Writes `length` units of element i, starting at position unitIndex.
410  virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
// Writes a value together with a flag saying whether it is final.
// NOTE(review): the parameter is named `i` but is presumably the value itself rather
// than an element index -- confirm against an implementation.
412  virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
// Writes a node lead that combines an optional value with the node type/lead `node`.
414  virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
// Writes a jump (delta) to the previously written position jumpTarget.
416  virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
417 };
418 
420 
421 #endif // __STRINGTRIEBUILDER_H__
struct UHashtable UHashtable
Definition: msgfmt.h:41
Builds a trie more slowly, attempting to generate a shorter but equivalent serialization.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition: uversion.h:137
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
Definition: stringpiece.h:218
Builds a trie quickly.
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition: utypes.h:188
#define TRUE
The TRUE value of a UBool.
Definition: umachine.h:229
C++ API: Common ICU base class UObject.
UStringTrieBuildOption
Build options for BytesTrieBuilder and CharsTrieBuilder.
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition: uversion.h:138
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers...
Definition: utypes.h:401
Basic definitions for ICU, for both C and C++ APIs.
#define FALSE
The FALSE value of a UBool.
Definition: umachine.h:233
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside...
Definition: utypes.h:300
int8_t UBool
The ICU boolean type.
Definition: umachine.h:225