Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
14 pmbaty 1
//===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
2
//
3
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
// See https://llvm.org/LICENSE.txt for license information.
5
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
//
7
//===----------------------------------------------------------------------===//
8
//
9
// This file defines the Suffix Tree class and Suffix Tree Node struct.
10
//
11
//===----------------------------------------------------------------------===//
12
#ifndef LLVM_SUPPORT_SUFFIXTREE_H
13
#define LLVM_SUPPORT_SUFFIXTREE_H
14
 
15
#include "llvm/ADT/ArrayRef.h"
16
#include "llvm/ADT/DenseMap.h"
17
#include "llvm/Support/Allocator.h"
18
#include <vector>
19
 
20
namespace llvm {
21
 
22
/// Represents an undefined index in the suffix tree.
23
const unsigned EmptyIdx = -1;
24
 
25
/// A node in a suffix tree which represents a substring or suffix.
26
///
27
/// Each node has either no children or at least two children, with the root
28
/// being a exception in the empty tree.
29
///
30
/// Children are represented as a map between unsigned integers and nodes. If
31
/// a node N has a child M on unsigned integer k, then the mapping represented
32
/// by N is a proper prefix of the mapping represented by M. Note that this,
33
/// although similar to a trie is somewhat different: each node stores a full
34
/// substring of the full mapping rather than a single character state.
35
///
36
/// Each internal node contains a pointer to the internal node representing
37
/// the same string, but with the first character chopped off. This is stored
38
/// in \p Link. Each leaf node stores the start index of its respective
39
/// suffix in \p SuffixIdx.
40
struct SuffixTreeNode {
41
 
42
  /// The children of this node.
43
  ///
44
  /// A child existing on an unsigned integer implies that from the mapping
45
  /// represented by the current node, there is a way to reach another
46
  /// mapping by tacking that character on the end of the current string.
47
  llvm::DenseMap<unsigned, SuffixTreeNode *> Children;
48
 
49
  /// The start index of this node's substring in the main string.
50
  unsigned StartIdx = EmptyIdx;
51
 
52
  /// The end index of this node's substring in the main string.
53
  ///
54
  /// Every leaf node must have its \p EndIdx incremented at the end of every
55
  /// step in the construction algorithm. To avoid having to update O(N)
56
  /// nodes individually at the end of every step, the end index is stored
57
  /// as a pointer.
58
  unsigned *EndIdx = nullptr;
59
 
60
  /// For leaves, the start index of the suffix represented by this node.
61
  ///
62
  /// For all other nodes, this is ignored.
63
  unsigned SuffixIdx = EmptyIdx;
64
 
65
  /// For internal nodes, a pointer to the internal node representing
66
  /// the same sequence with the first character chopped off.
67
  ///
68
  /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
69
  /// Ukkonen's algorithm does to achieve linear-time construction is
70
  /// keep track of which node the next insert should be at. This makes each
71
  /// insert O(1), and there are a total of O(N) inserts. The suffix link
72
  /// helps with inserting children of internal nodes.
73
  ///
74
  /// Say we add a child to an internal node with associated mapping S. The
75
  /// next insertion must be at the node representing S - its first character.
76
  /// This is given by the way that we iteratively build the tree in Ukkonen's
77
  /// algorithm. The main idea is to look at the suffixes of each prefix in the
78
  /// string, starting with the longest suffix of the prefix, and ending with
79
  /// the shortest. Therefore, if we keep pointers between such nodes, we can
80
  /// move to the next insertion point in O(1) time. If we don't, then we'd
81
  /// have to query from the root, which takes O(N) time. This would make the
82
  /// construction algorithm O(N^2) rather than O(N).
83
  SuffixTreeNode *Link = nullptr;
84
 
85
  /// The length of the string formed by concatenating the edge labels from the
86
  /// root to this node.
87
  unsigned ConcatLen = 0;
88
 
89
  /// Returns true if this node is a leaf.
90
  bool isLeaf() const { return SuffixIdx != EmptyIdx; }
91
 
92
  /// Returns true if this node is the root of its owning \p SuffixTree.
93
  bool isRoot() const { return StartIdx == EmptyIdx; }
94
 
95
  /// Return the number of elements in the substring associated with this node.
96
  size_t size() const {
97
 
98
    // Is it the root? If so, it's the empty string so return 0.
99
    if (isRoot())
100
      return 0;
101
 
102
    assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
103
 
104
    // Size = the number of elements in the string.
105
    // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
106
    return *EndIdx - StartIdx + 1;
107
  }
108
 
109
  SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
110
      : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
111
 
112
  SuffixTreeNode() = default;
113
};
114
 
115
/// A data structure for fast substring queries.
116
///
117
/// Suffix trees represent the suffixes of their input strings in their leaves.
118
/// A suffix tree is a type of compressed trie structure where each node
119
/// represents an entire substring rather than a single character. Each leaf
120
/// of the tree is a suffix.
121
///
122
/// A suffix tree can be seen as a type of state machine where each state is a
123
/// substring of the full string. The tree is structured so that, for a string
124
/// of length N, there are exactly N leaves in the tree. This structure allows
125
/// us to quickly find repeated substrings of the input string.
126
///
127
/// In this implementation, a "string" is a vector of unsigned integers.
128
/// These integers may result from hashing some data type. A suffix tree can
129
/// contain 1 or many strings, which can then be queried as one large string.
130
///
131
/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
132
/// suffix tree construction. Ukkonen's algorithm is explained in more detail
133
/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
134
/// paper is available at
135
///
136
/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
137
class SuffixTree {
138
public:
139
  /// Each element is an integer representing an instruction in the module.
140
  llvm::ArrayRef<unsigned> Str;
141
 
142
  /// A repeated substring in the tree.
143
  struct RepeatedSubstring {
144
    /// The length of the string.
145
    unsigned Length;
146
 
147
    /// The start indices of each occurrence.
148
    std::vector<unsigned> StartIndices;
149
  };
150
 
151
private:
152
  /// Maintains each node in the tree.
153
  llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
154
 
155
  /// The root of the suffix tree.
156
  ///
157
  /// The root represents the empty string. It is maintained by the
158
  /// \p NodeAllocator like every other node in the tree.
159
  SuffixTreeNode *Root = nullptr;
160
 
161
  /// Maintains the end indices of the internal nodes in the tree.
162
  ///
163
  /// Each internal node is guaranteed to never have its end index change
164
  /// during the construction algorithm; however, leaves must be updated at
165
  /// every step. Therefore, we need to store leaf end indices by reference
166
  /// to avoid updating O(N) leaves at every step of construction. Thus,
167
  /// every internal node must be allocated its own end index.
168
  llvm::BumpPtrAllocator InternalEndIdxAllocator;
169
 
170
  /// The end index of each leaf in the tree.
171
  unsigned LeafEndIdx = -1;
172
 
173
  /// Helper struct which keeps track of the next insertion point in
174
  /// Ukkonen's algorithm.
175
  struct ActiveState {
176
    /// The next node to insert at.
177
    SuffixTreeNode *Node = nullptr;
178
 
179
    /// The index of the first character in the substring currently being added.
180
    unsigned Idx = EmptyIdx;
181
 
182
    /// The length of the substring we have to add at the current step.
183
    unsigned Len = 0;
184
  };
185
 
186
  /// The point the next insertion will take place at in the
187
  /// construction algorithm.
188
  ActiveState Active;
189
 
190
  /// Allocate a leaf node and add it to the tree.
191
  ///
192
  /// \param Parent The parent of this node.
193
  /// \param StartIdx The start index of this node's associated string.
194
  /// \param Edge The label on the edge leaving \p Parent to this node.
195
  ///
196
  /// \returns A pointer to the allocated leaf node.
197
  SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
198
                             unsigned Edge);
199
 
200
  /// Allocate an internal node and add it to the tree.
201
  ///
202
  /// \param Parent The parent of this node. Only null when allocating the root.
203
  /// \param StartIdx The start index of this node's associated string.
204
  /// \param EndIdx The end index of this node's associated string.
205
  /// \param Edge The label on the edge leaving \p Parent to this node.
206
  ///
207
  /// \returns A pointer to the allocated internal node.
208
  SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
209
                                     unsigned EndIdx, unsigned Edge);
210
 
211
  /// Set the suffix indices of the leaves to the start indices of their
212
  /// respective suffixes.
213
  void setSuffixIndices();
214
 
215
  /// Construct the suffix tree for the prefix of the input ending at
216
  /// \p EndIdx.
217
  ///
218
  /// Used to construct the full suffix tree iteratively. At the end of each
219
  /// step, the constructed suffix tree is either a valid suffix tree, or a
220
  /// suffix tree with implicit suffixes. At the end of the final step, the
221
  /// suffix tree is a valid tree.
222
  ///
223
  /// \param EndIdx The end index of the current prefix in the main string.
224
  /// \param SuffixesToAdd The number of suffixes that must be added
225
  /// to complete the suffix tree at the current phase.
226
  ///
227
  /// \returns The number of suffixes that have not been added at the end of
228
  /// this step.
229
  unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
230
 
231
public:
232
  /// Construct a suffix tree from a sequence of unsigned integers.
233
  ///
234
  /// \param Str The string to construct the suffix tree for.
235
  SuffixTree(const std::vector<unsigned> &Str);
236
 
237
  /// Iterator for finding all repeated substrings in the suffix tree.
238
  struct RepeatedSubstringIterator {
239
  private:
240
    /// The current node we're visiting.
241
    SuffixTreeNode *N = nullptr;
242
 
243
    /// The repeated substring associated with this node.
244
    RepeatedSubstring RS;
245
 
246
    /// The nodes left to visit.
247
    std::vector<SuffixTreeNode *> ToVisit;
248
 
249
    /// The minimum length of a repeated substring to find.
250
    /// Since we're outlining, we want at least two instructions in the range.
251
    /// FIXME: This may not be true for targets like X86 which support many
252
    /// instruction lengths.
253
    const unsigned MinLength = 2;
254
 
255
    /// Move the iterator to the next repeated substring.
256
    void advance() {
257
      // Clear the current state. If we're at the end of the range, then this
258
      // is the state we want to be in.
259
      RS = RepeatedSubstring();
260
      N = nullptr;
261
 
262
      // Each leaf node represents a repeat of a string.
263
      std::vector<SuffixTreeNode *> LeafChildren;
264
 
265
      // Continue visiting nodes until we find one which repeats more than once.
266
      while (!ToVisit.empty()) {
267
        SuffixTreeNode *Curr = ToVisit.back();
268
        ToVisit.pop_back();
269
        LeafChildren.clear();
270
 
271
        // Keep track of the length of the string associated with the node. If
272
        // it's too short, we'll quit.
273
        unsigned Length = Curr->ConcatLen;
274
 
275
        // Iterate over each child, saving internal nodes for visiting, and
276
        // leaf nodes in LeafChildren. Internal nodes represent individual
277
        // strings, which may repeat.
278
        for (auto &ChildPair : Curr->Children) {
279
          // Save all of this node's children for processing.
280
          if (!ChildPair.second->isLeaf())
281
            ToVisit.push_back(ChildPair.second);
282
 
283
          // It's not an internal node, so it must be a leaf. If we have a
284
          // long enough string, then save the leaf children.
285
          else if (Length >= MinLength)
286
            LeafChildren.push_back(ChildPair.second);
287
        }
288
 
289
        // The root never represents a repeated substring. If we're looking at
290
        // that, then skip it.
291
        if (Curr->isRoot())
292
          continue;
293
 
294
        // Do we have any repeated substrings?
295
        if (LeafChildren.size() >= 2) {
296
          // Yes. Update the state to reflect this, and then bail out.
297
          N = Curr;
298
          RS.Length = Length;
299
          for (SuffixTreeNode *Leaf : LeafChildren)
300
            RS.StartIndices.push_back(Leaf->SuffixIdx);
301
          break;
302
        }
303
      }
304
 
305
      // At this point, either NewRS is an empty RepeatedSubstring, or it was
306
      // set in the above loop. Similarly, N is either nullptr, or the node
307
      // associated with NewRS.
308
    }
309
 
310
  public:
311
    /// Return the current repeated substring.
312
    RepeatedSubstring &operator*() { return RS; }
313
 
314
    RepeatedSubstringIterator &operator++() {
315
      advance();
316
      return *this;
317
    }
318
 
319
    RepeatedSubstringIterator operator++(int I) {
320
      RepeatedSubstringIterator It(*this);
321
      advance();
322
      return It;
323
    }
324
 
325
    bool operator==(const RepeatedSubstringIterator &Other) const {
326
      return N == Other.N;
327
    }
328
    bool operator!=(const RepeatedSubstringIterator &Other) const {
329
      return !(*this == Other);
330
    }
331
 
332
    RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
333
      // Do we have a non-null node?
334
      if (N) {
335
        // Yes. At the first step, we need to visit all of N's children.
336
        // Note: This means that we visit N last.
337
        ToVisit.push_back(N);
338
        advance();
339
      }
340
    }
341
  };
342
 
343
  typedef RepeatedSubstringIterator iterator;
344
  iterator begin() { return iterator(Root); }
345
  iterator end() { return iterator(nullptr); }
346
};
347
 
348
} // namespace llvm
349
 
350
#endif // LLVM_SUPPORT_SUFFIXTREE_H