Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
14 | pmbaty | 1 | //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===// |
2 | // |
||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
4 | // See https://llvm.org/LICENSE.txt for license information. |
||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
6 | // |
||
7 | //===----------------------------------------------------------------------===// |
||
8 | // |
||
9 | // This file defines the Suffix Tree class and Suffix Tree Node struct. |
||
10 | // |
||
11 | //===----------------------------------------------------------------------===// |
||
12 | #ifndef LLVM_SUPPORT_SUFFIXTREE_H |
||
13 | #define LLVM_SUPPORT_SUFFIXTREE_H |
||
14 | |||
15 | #include "llvm/ADT/ArrayRef.h" |
||
16 | #include "llvm/ADT/DenseMap.h" |
||
17 | #include "llvm/Support/Allocator.h" |
||
18 | #include <vector> |
||
19 | |||
20 | namespace llvm { |
||
21 | |||
22 | /// Represents an undefined index in the suffix tree. |
||
23 | const unsigned EmptyIdx = -1; |
||
24 | |||
25 | /// A node in a suffix tree which represents a substring or suffix. |
||
26 | /// |
||
27 | /// Each node has either no children or at least two children, with the root |
||
28 | /// being a exception in the empty tree. |
||
29 | /// |
||
30 | /// Children are represented as a map between unsigned integers and nodes. If |
||
31 | /// a node N has a child M on unsigned integer k, then the mapping represented |
||
32 | /// by N is a proper prefix of the mapping represented by M. Note that this, |
||
33 | /// although similar to a trie is somewhat different: each node stores a full |
||
34 | /// substring of the full mapping rather than a single character state. |
||
35 | /// |
||
36 | /// Each internal node contains a pointer to the internal node representing |
||
37 | /// the same string, but with the first character chopped off. This is stored |
||
38 | /// in \p Link. Each leaf node stores the start index of its respective |
||
39 | /// suffix in \p SuffixIdx. |
||
40 | struct SuffixTreeNode { |
||
41 | |||
42 | /// The children of this node. |
||
43 | /// |
||
44 | /// A child existing on an unsigned integer implies that from the mapping |
||
45 | /// represented by the current node, there is a way to reach another |
||
46 | /// mapping by tacking that character on the end of the current string. |
||
47 | llvm::DenseMap<unsigned, SuffixTreeNode *> Children; |
||
48 | |||
49 | /// The start index of this node's substring in the main string. |
||
50 | unsigned StartIdx = EmptyIdx; |
||
51 | |||
52 | /// The end index of this node's substring in the main string. |
||
53 | /// |
||
54 | /// Every leaf node must have its \p EndIdx incremented at the end of every |
||
55 | /// step in the construction algorithm. To avoid having to update O(N) |
||
56 | /// nodes individually at the end of every step, the end index is stored |
||
57 | /// as a pointer. |
||
58 | unsigned *EndIdx = nullptr; |
||
59 | |||
60 | /// For leaves, the start index of the suffix represented by this node. |
||
61 | /// |
||
62 | /// For all other nodes, this is ignored. |
||
63 | unsigned SuffixIdx = EmptyIdx; |
||
64 | |||
65 | /// For internal nodes, a pointer to the internal node representing |
||
66 | /// the same sequence with the first character chopped off. |
||
67 | /// |
||
68 | /// This acts as a shortcut in Ukkonen's algorithm. One of the things that |
||
69 | /// Ukkonen's algorithm does to achieve linear-time construction is |
||
70 | /// keep track of which node the next insert should be at. This makes each |
||
71 | /// insert O(1), and there are a total of O(N) inserts. The suffix link |
||
72 | /// helps with inserting children of internal nodes. |
||
73 | /// |
||
74 | /// Say we add a child to an internal node with associated mapping S. The |
||
75 | /// next insertion must be at the node representing S - its first character. |
||
76 | /// This is given by the way that we iteratively build the tree in Ukkonen's |
||
77 | /// algorithm. The main idea is to look at the suffixes of each prefix in the |
||
78 | /// string, starting with the longest suffix of the prefix, and ending with |
||
79 | /// the shortest. Therefore, if we keep pointers between such nodes, we can |
||
80 | /// move to the next insertion point in O(1) time. If we don't, then we'd |
||
81 | /// have to query from the root, which takes O(N) time. This would make the |
||
82 | /// construction algorithm O(N^2) rather than O(N). |
||
83 | SuffixTreeNode *Link = nullptr; |
||
84 | |||
85 | /// The length of the string formed by concatenating the edge labels from the |
||
86 | /// root to this node. |
||
87 | unsigned ConcatLen = 0; |
||
88 | |||
89 | /// Returns true if this node is a leaf. |
||
90 | bool isLeaf() const { return SuffixIdx != EmptyIdx; } |
||
91 | |||
92 | /// Returns true if this node is the root of its owning \p SuffixTree. |
||
93 | bool isRoot() const { return StartIdx == EmptyIdx; } |
||
94 | |||
95 | /// Return the number of elements in the substring associated with this node. |
||
96 | size_t size() const { |
||
97 | |||
98 | // Is it the root? If so, it's the empty string so return 0. |
||
99 | if (isRoot()) |
||
100 | return 0; |
||
101 | |||
102 | assert(*EndIdx != EmptyIdx && "EndIdx is undefined!"); |
||
103 | |||
104 | // Size = the number of elements in the string. |
||
105 | // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1. |
||
106 | return *EndIdx - StartIdx + 1; |
||
107 | } |
||
108 | |||
109 | SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link) |
||
110 | : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {} |
||
111 | |||
112 | SuffixTreeNode() = default; |
||
113 | }; |
||
114 | |||
115 | /// A data structure for fast substring queries. |
||
116 | /// |
||
117 | /// Suffix trees represent the suffixes of their input strings in their leaves. |
||
118 | /// A suffix tree is a type of compressed trie structure where each node |
||
119 | /// represents an entire substring rather than a single character. Each leaf |
||
120 | /// of the tree is a suffix. |
||
121 | /// |
||
122 | /// A suffix tree can be seen as a type of state machine where each state is a |
||
123 | /// substring of the full string. The tree is structured so that, for a string |
||
124 | /// of length N, there are exactly N leaves in the tree. This structure allows |
||
125 | /// us to quickly find repeated substrings of the input string. |
||
126 | /// |
||
127 | /// In this implementation, a "string" is a vector of unsigned integers. |
||
128 | /// These integers may result from hashing some data type. A suffix tree can |
||
129 | /// contain 1 or many strings, which can then be queried as one large string. |
||
130 | /// |
||
131 | /// The suffix tree is implemented using Ukkonen's algorithm for linear-time |
||
132 | /// suffix tree construction. Ukkonen's algorithm is explained in more detail |
||
133 | /// in the paper by Esko Ukkonen "On-line construction of suffix trees. The |
||
134 | /// paper is available at |
||
135 | /// |
||
136 | /// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf |
||
137 | class SuffixTree { |
||
138 | public: |
||
139 | /// Each element is an integer representing an instruction in the module. |
||
140 | llvm::ArrayRef<unsigned> Str; |
||
141 | |||
142 | /// A repeated substring in the tree. |
||
143 | struct RepeatedSubstring { |
||
144 | /// The length of the string. |
||
145 | unsigned Length; |
||
146 | |||
147 | /// The start indices of each occurrence. |
||
148 | std::vector<unsigned> StartIndices; |
||
149 | }; |
||
150 | |||
151 | private: |
||
152 | /// Maintains each node in the tree. |
||
153 | llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator; |
||
154 | |||
155 | /// The root of the suffix tree. |
||
156 | /// |
||
157 | /// The root represents the empty string. It is maintained by the |
||
158 | /// \p NodeAllocator like every other node in the tree. |
||
159 | SuffixTreeNode *Root = nullptr; |
||
160 | |||
161 | /// Maintains the end indices of the internal nodes in the tree. |
||
162 | /// |
||
163 | /// Each internal node is guaranteed to never have its end index change |
||
164 | /// during the construction algorithm; however, leaves must be updated at |
||
165 | /// every step. Therefore, we need to store leaf end indices by reference |
||
166 | /// to avoid updating O(N) leaves at every step of construction. Thus, |
||
167 | /// every internal node must be allocated its own end index. |
||
168 | llvm::BumpPtrAllocator InternalEndIdxAllocator; |
||
169 | |||
170 | /// The end index of each leaf in the tree. |
||
171 | unsigned LeafEndIdx = -1; |
||
172 | |||
173 | /// Helper struct which keeps track of the next insertion point in |
||
174 | /// Ukkonen's algorithm. |
||
175 | struct ActiveState { |
||
176 | /// The next node to insert at. |
||
177 | SuffixTreeNode *Node = nullptr; |
||
178 | |||
179 | /// The index of the first character in the substring currently being added. |
||
180 | unsigned Idx = EmptyIdx; |
||
181 | |||
182 | /// The length of the substring we have to add at the current step. |
||
183 | unsigned Len = 0; |
||
184 | }; |
||
185 | |||
186 | /// The point the next insertion will take place at in the |
||
187 | /// construction algorithm. |
||
188 | ActiveState Active; |
||
189 | |||
190 | /// Allocate a leaf node and add it to the tree. |
||
191 | /// |
||
192 | /// \param Parent The parent of this node. |
||
193 | /// \param StartIdx The start index of this node's associated string. |
||
194 | /// \param Edge The label on the edge leaving \p Parent to this node. |
||
195 | /// |
||
196 | /// \returns A pointer to the allocated leaf node. |
||
197 | SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx, |
||
198 | unsigned Edge); |
||
199 | |||
200 | /// Allocate an internal node and add it to the tree. |
||
201 | /// |
||
202 | /// \param Parent The parent of this node. Only null when allocating the root. |
||
203 | /// \param StartIdx The start index of this node's associated string. |
||
204 | /// \param EndIdx The end index of this node's associated string. |
||
205 | /// \param Edge The label on the edge leaving \p Parent to this node. |
||
206 | /// |
||
207 | /// \returns A pointer to the allocated internal node. |
||
208 | SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx, |
||
209 | unsigned EndIdx, unsigned Edge); |
||
210 | |||
211 | /// Set the suffix indices of the leaves to the start indices of their |
||
212 | /// respective suffixes. |
||
213 | void setSuffixIndices(); |
||
214 | |||
215 | /// Construct the suffix tree for the prefix of the input ending at |
||
216 | /// \p EndIdx. |
||
217 | /// |
||
218 | /// Used to construct the full suffix tree iteratively. At the end of each |
||
219 | /// step, the constructed suffix tree is either a valid suffix tree, or a |
||
220 | /// suffix tree with implicit suffixes. At the end of the final step, the |
||
221 | /// suffix tree is a valid tree. |
||
222 | /// |
||
223 | /// \param EndIdx The end index of the current prefix in the main string. |
||
224 | /// \param SuffixesToAdd The number of suffixes that must be added |
||
225 | /// to complete the suffix tree at the current phase. |
||
226 | /// |
||
227 | /// \returns The number of suffixes that have not been added at the end of |
||
228 | /// this step. |
||
229 | unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd); |
||
230 | |||
231 | public: |
||
232 | /// Construct a suffix tree from a sequence of unsigned integers. |
||
233 | /// |
||
234 | /// \param Str The string to construct the suffix tree for. |
||
235 | SuffixTree(const std::vector<unsigned> &Str); |
||
236 | |||
237 | /// Iterator for finding all repeated substrings in the suffix tree. |
||
238 | struct RepeatedSubstringIterator { |
||
239 | private: |
||
240 | /// The current node we're visiting. |
||
241 | SuffixTreeNode *N = nullptr; |
||
242 | |||
243 | /// The repeated substring associated with this node. |
||
244 | RepeatedSubstring RS; |
||
245 | |||
246 | /// The nodes left to visit. |
||
247 | std::vector<SuffixTreeNode *> ToVisit; |
||
248 | |||
249 | /// The minimum length of a repeated substring to find. |
||
250 | /// Since we're outlining, we want at least two instructions in the range. |
||
251 | /// FIXME: This may not be true for targets like X86 which support many |
||
252 | /// instruction lengths. |
||
253 | const unsigned MinLength = 2; |
||
254 | |||
255 | /// Move the iterator to the next repeated substring. |
||
256 | void advance() { |
||
257 | // Clear the current state. If we're at the end of the range, then this |
||
258 | // is the state we want to be in. |
||
259 | RS = RepeatedSubstring(); |
||
260 | N = nullptr; |
||
261 | |||
262 | // Each leaf node represents a repeat of a string. |
||
263 | std::vector<SuffixTreeNode *> LeafChildren; |
||
264 | |||
265 | // Continue visiting nodes until we find one which repeats more than once. |
||
266 | while (!ToVisit.empty()) { |
||
267 | SuffixTreeNode *Curr = ToVisit.back(); |
||
268 | ToVisit.pop_back(); |
||
269 | LeafChildren.clear(); |
||
270 | |||
271 | // Keep track of the length of the string associated with the node. If |
||
272 | // it's too short, we'll quit. |
||
273 | unsigned Length = Curr->ConcatLen; |
||
274 | |||
275 | // Iterate over each child, saving internal nodes for visiting, and |
||
276 | // leaf nodes in LeafChildren. Internal nodes represent individual |
||
277 | // strings, which may repeat. |
||
278 | for (auto &ChildPair : Curr->Children) { |
||
279 | // Save all of this node's children for processing. |
||
280 | if (!ChildPair.second->isLeaf()) |
||
281 | ToVisit.push_back(ChildPair.second); |
||
282 | |||
283 | // It's not an internal node, so it must be a leaf. If we have a |
||
284 | // long enough string, then save the leaf children. |
||
285 | else if (Length >= MinLength) |
||
286 | LeafChildren.push_back(ChildPair.second); |
||
287 | } |
||
288 | |||
289 | // The root never represents a repeated substring. If we're looking at |
||
290 | // that, then skip it. |
||
291 | if (Curr->isRoot()) |
||
292 | continue; |
||
293 | |||
294 | // Do we have any repeated substrings? |
||
295 | if (LeafChildren.size() >= 2) { |
||
296 | // Yes. Update the state to reflect this, and then bail out. |
||
297 | N = Curr; |
||
298 | RS.Length = Length; |
||
299 | for (SuffixTreeNode *Leaf : LeafChildren) |
||
300 | RS.StartIndices.push_back(Leaf->SuffixIdx); |
||
301 | break; |
||
302 | } |
||
303 | } |
||
304 | |||
305 | // At this point, either NewRS is an empty RepeatedSubstring, or it was |
||
306 | // set in the above loop. Similarly, N is either nullptr, or the node |
||
307 | // associated with NewRS. |
||
308 | } |
||
309 | |||
310 | public: |
||
311 | /// Return the current repeated substring. |
||
312 | RepeatedSubstring &operator*() { return RS; } |
||
313 | |||
314 | RepeatedSubstringIterator &operator++() { |
||
315 | advance(); |
||
316 | return *this; |
||
317 | } |
||
318 | |||
319 | RepeatedSubstringIterator operator++(int I) { |
||
320 | RepeatedSubstringIterator It(*this); |
||
321 | advance(); |
||
322 | return It; |
||
323 | } |
||
324 | |||
325 | bool operator==(const RepeatedSubstringIterator &Other) const { |
||
326 | return N == Other.N; |
||
327 | } |
||
328 | bool operator!=(const RepeatedSubstringIterator &Other) const { |
||
329 | return !(*this == Other); |
||
330 | } |
||
331 | |||
332 | RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) { |
||
333 | // Do we have a non-null node? |
||
334 | if (N) { |
||
335 | // Yes. At the first step, we need to visit all of N's children. |
||
336 | // Note: This means that we visit N last. |
||
337 | ToVisit.push_back(N); |
||
338 | advance(); |
||
339 | } |
||
340 | } |
||
341 | }; |
||
342 | |||
343 | typedef RepeatedSubstringIterator iterator; |
||
344 | iterator begin() { return iterator(Root); } |
||
345 | iterator end() { return iterator(nullptr); } |
||
346 | }; |
||
347 | |||
348 | } // namespace llvm |
||
349 | |||
350 | #endif // LLVM_SUPPORT_SUFFIXTREE_H |