Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. //===- llvm/ADT/SuffixTree.h - Tree for substrings --------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. // This file defines the Suffix Tree class and Suffix Tree Node struct.
  10. //
  11. //===----------------------------------------------------------------------===//
  12. #ifndef LLVM_SUPPORT_SUFFIXTREE_H
  13. #define LLVM_SUPPORT_SUFFIXTREE_H
  14.  
  15. #include "llvm/ADT/ArrayRef.h"
  16. #include "llvm/ADT/DenseMap.h"
  17. #include "llvm/Support/Allocator.h"
  18. #include <vector>
  19.  
  20. namespace llvm {
  21.  
  22. /// Represents an undefined index in the suffix tree.
  23. const unsigned EmptyIdx = -1;
  24.  
  25. /// A node in a suffix tree which represents a substring or suffix.
  26. ///
  27. /// Each node has either no children or at least two children, with the root
  28. /// being a exception in the empty tree.
  29. ///
  30. /// Children are represented as a map between unsigned integers and nodes. If
  31. /// a node N has a child M on unsigned integer k, then the mapping represented
  32. /// by N is a proper prefix of the mapping represented by M. Note that this,
  33. /// although similar to a trie is somewhat different: each node stores a full
  34. /// substring of the full mapping rather than a single character state.
  35. ///
  36. /// Each internal node contains a pointer to the internal node representing
  37. /// the same string, but with the first character chopped off. This is stored
  38. /// in \p Link. Each leaf node stores the start index of its respective
  39. /// suffix in \p SuffixIdx.
  40. struct SuffixTreeNode {
  41.  
  42.   /// The children of this node.
  43.   ///
  44.   /// A child existing on an unsigned integer implies that from the mapping
  45.   /// represented by the current node, there is a way to reach another
  46.   /// mapping by tacking that character on the end of the current string.
  47.   llvm::DenseMap<unsigned, SuffixTreeNode *> Children;
  48.  
  49.   /// The start index of this node's substring in the main string.
  50.   unsigned StartIdx = EmptyIdx;
  51.  
  52.   /// The end index of this node's substring in the main string.
  53.   ///
  54.   /// Every leaf node must have its \p EndIdx incremented at the end of every
  55.   /// step in the construction algorithm. To avoid having to update O(N)
  56.   /// nodes individually at the end of every step, the end index is stored
  57.   /// as a pointer.
  58.   unsigned *EndIdx = nullptr;
  59.  
  60.   /// For leaves, the start index of the suffix represented by this node.
  61.   ///
  62.   /// For all other nodes, this is ignored.
  63.   unsigned SuffixIdx = EmptyIdx;
  64.  
  65.   /// For internal nodes, a pointer to the internal node representing
  66.   /// the same sequence with the first character chopped off.
  67.   ///
  68.   /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
  69.   /// Ukkonen's algorithm does to achieve linear-time construction is
  70.   /// keep track of which node the next insert should be at. This makes each
  71.   /// insert O(1), and there are a total of O(N) inserts. The suffix link
  72.   /// helps with inserting children of internal nodes.
  73.   ///
  74.   /// Say we add a child to an internal node with associated mapping S. The
  75.   /// next insertion must be at the node representing S - its first character.
  76.   /// This is given by the way that we iteratively build the tree in Ukkonen's
  77.   /// algorithm. The main idea is to look at the suffixes of each prefix in the
  78.   /// string, starting with the longest suffix of the prefix, and ending with
  79.   /// the shortest. Therefore, if we keep pointers between such nodes, we can
  80.   /// move to the next insertion point in O(1) time. If we don't, then we'd
  81.   /// have to query from the root, which takes O(N) time. This would make the
  82.   /// construction algorithm O(N^2) rather than O(N).
  83.   SuffixTreeNode *Link = nullptr;
  84.  
  85.   /// The length of the string formed by concatenating the edge labels from the
  86.   /// root to this node.
  87.   unsigned ConcatLen = 0;
  88.  
  89.   /// Returns true if this node is a leaf.
  90.   bool isLeaf() const { return SuffixIdx != EmptyIdx; }
  91.  
  92.   /// Returns true if this node is the root of its owning \p SuffixTree.
  93.   bool isRoot() const { return StartIdx == EmptyIdx; }
  94.  
  95.   /// Return the number of elements in the substring associated with this node.
  96.   size_t size() const {
  97.  
  98.     // Is it the root? If so, it's the empty string so return 0.
  99.     if (isRoot())
  100.       return 0;
  101.  
  102.     assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
  103.  
  104.     // Size = the number of elements in the string.
  105.     // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
  106.     return *EndIdx - StartIdx + 1;
  107.   }
  108.  
  109.   SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
  110.       : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
  111.  
  112.   SuffixTreeNode() = default;
  113. };
  114.  
  115. /// A data structure for fast substring queries.
  116. ///
  117. /// Suffix trees represent the suffixes of their input strings in their leaves.
  118. /// A suffix tree is a type of compressed trie structure where each node
  119. /// represents an entire substring rather than a single character. Each leaf
  120. /// of the tree is a suffix.
  121. ///
  122. /// A suffix tree can be seen as a type of state machine where each state is a
  123. /// substring of the full string. The tree is structured so that, for a string
  124. /// of length N, there are exactly N leaves in the tree. This structure allows
  125. /// us to quickly find repeated substrings of the input string.
  126. ///
  127. /// In this implementation, a "string" is a vector of unsigned integers.
  128. /// These integers may result from hashing some data type. A suffix tree can
  129. /// contain 1 or many strings, which can then be queried as one large string.
  130. ///
  131. /// The suffix tree is implemented using Ukkonen's algorithm for linear-time
  132. /// suffix tree construction. Ukkonen's algorithm is explained in more detail
  133. /// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
  134. /// paper is available at
  135. ///
  136. /// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
  137. class SuffixTree {
  138. public:
  139.   /// Each element is an integer representing an instruction in the module.
  140.   llvm::ArrayRef<unsigned> Str;
  141.  
  142.   /// A repeated substring in the tree.
  143.   struct RepeatedSubstring {
  144.     /// The length of the string.
  145.     unsigned Length;
  146.  
  147.     /// The start indices of each occurrence.
  148.     std::vector<unsigned> StartIndices;
  149.   };
  150.  
  151. private:
  152.   /// Maintains each node in the tree.
  153.   llvm::SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
  154.  
  155.   /// The root of the suffix tree.
  156.   ///
  157.   /// The root represents the empty string. It is maintained by the
  158.   /// \p NodeAllocator like every other node in the tree.
  159.   SuffixTreeNode *Root = nullptr;
  160.  
  161.   /// Maintains the end indices of the internal nodes in the tree.
  162.   ///
  163.   /// Each internal node is guaranteed to never have its end index change
  164.   /// during the construction algorithm; however, leaves must be updated at
  165.   /// every step. Therefore, we need to store leaf end indices by reference
  166.   /// to avoid updating O(N) leaves at every step of construction. Thus,
  167.   /// every internal node must be allocated its own end index.
  168.   llvm::BumpPtrAllocator InternalEndIdxAllocator;
  169.  
  170.   /// The end index of each leaf in the tree.
  171.   unsigned LeafEndIdx = -1;
  172.  
  173.   /// Helper struct which keeps track of the next insertion point in
  174.   /// Ukkonen's algorithm.
  175.   struct ActiveState {
  176.     /// The next node to insert at.
  177.     SuffixTreeNode *Node = nullptr;
  178.  
  179.     /// The index of the first character in the substring currently being added.
  180.     unsigned Idx = EmptyIdx;
  181.  
  182.     /// The length of the substring we have to add at the current step.
  183.     unsigned Len = 0;
  184.   };
  185.  
  186.   /// The point the next insertion will take place at in the
  187.   /// construction algorithm.
  188.   ActiveState Active;
  189.  
  190.   /// Allocate a leaf node and add it to the tree.
  191.   ///
  192.   /// \param Parent The parent of this node.
  193.   /// \param StartIdx The start index of this node's associated string.
  194.   /// \param Edge The label on the edge leaving \p Parent to this node.
  195.   ///
  196.   /// \returns A pointer to the allocated leaf node.
  197.   SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
  198.                              unsigned Edge);
  199.  
  200.   /// Allocate an internal node and add it to the tree.
  201.   ///
  202.   /// \param Parent The parent of this node. Only null when allocating the root.
  203.   /// \param StartIdx The start index of this node's associated string.
  204.   /// \param EndIdx The end index of this node's associated string.
  205.   /// \param Edge The label on the edge leaving \p Parent to this node.
  206.   ///
  207.   /// \returns A pointer to the allocated internal node.
  208.   SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
  209.                                      unsigned EndIdx, unsigned Edge);
  210.  
  211.   /// Set the suffix indices of the leaves to the start indices of their
  212.   /// respective suffixes.
  213.   void setSuffixIndices();
  214.  
  215.   /// Construct the suffix tree for the prefix of the input ending at
  216.   /// \p EndIdx.
  217.   ///
  218.   /// Used to construct the full suffix tree iteratively. At the end of each
  219.   /// step, the constructed suffix tree is either a valid suffix tree, or a
  220.   /// suffix tree with implicit suffixes. At the end of the final step, the
  221.   /// suffix tree is a valid tree.
  222.   ///
  223.   /// \param EndIdx The end index of the current prefix in the main string.
  224.   /// \param SuffixesToAdd The number of suffixes that must be added
  225.   /// to complete the suffix tree at the current phase.
  226.   ///
  227.   /// \returns The number of suffixes that have not been added at the end of
  228.   /// this step.
  229.   unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd);
  230.  
  231. public:
  232.   /// Construct a suffix tree from a sequence of unsigned integers.
  233.   ///
  234.   /// \param Str The string to construct the suffix tree for.
  235.   SuffixTree(const std::vector<unsigned> &Str);
  236.  
  237.   /// Iterator for finding all repeated substrings in the suffix tree.
  238.   struct RepeatedSubstringIterator {
  239.   private:
  240.     /// The current node we're visiting.
  241.     SuffixTreeNode *N = nullptr;
  242.  
  243.     /// The repeated substring associated with this node.
  244.     RepeatedSubstring RS;
  245.  
  246.     /// The nodes left to visit.
  247.     std::vector<SuffixTreeNode *> ToVisit;
  248.  
  249.     /// The minimum length of a repeated substring to find.
  250.     /// Since we're outlining, we want at least two instructions in the range.
  251.     /// FIXME: This may not be true for targets like X86 which support many
  252.     /// instruction lengths.
  253.     const unsigned MinLength = 2;
  254.  
  255.     /// Move the iterator to the next repeated substring.
  256.     void advance() {
  257.       // Clear the current state. If we're at the end of the range, then this
  258.       // is the state we want to be in.
  259.       RS = RepeatedSubstring();
  260.       N = nullptr;
  261.  
  262.       // Each leaf node represents a repeat of a string.
  263.       std::vector<SuffixTreeNode *> LeafChildren;
  264.  
  265.       // Continue visiting nodes until we find one which repeats more than once.
  266.       while (!ToVisit.empty()) {
  267.         SuffixTreeNode *Curr = ToVisit.back();
  268.         ToVisit.pop_back();
  269.         LeafChildren.clear();
  270.  
  271.         // Keep track of the length of the string associated with the node. If
  272.         // it's too short, we'll quit.
  273.         unsigned Length = Curr->ConcatLen;
  274.  
  275.         // Iterate over each child, saving internal nodes for visiting, and
  276.         // leaf nodes in LeafChildren. Internal nodes represent individual
  277.         // strings, which may repeat.
  278.         for (auto &ChildPair : Curr->Children) {
  279.           // Save all of this node's children for processing.
  280.           if (!ChildPair.second->isLeaf())
  281.             ToVisit.push_back(ChildPair.second);
  282.  
  283.           // It's not an internal node, so it must be a leaf. If we have a
  284.           // long enough string, then save the leaf children.
  285.           else if (Length >= MinLength)
  286.             LeafChildren.push_back(ChildPair.second);
  287.         }
  288.  
  289.         // The root never represents a repeated substring. If we're looking at
  290.         // that, then skip it.
  291.         if (Curr->isRoot())
  292.           continue;
  293.  
  294.         // Do we have any repeated substrings?
  295.         if (LeafChildren.size() >= 2) {
  296.           // Yes. Update the state to reflect this, and then bail out.
  297.           N = Curr;
  298.           RS.Length = Length;
  299.           for (SuffixTreeNode *Leaf : LeafChildren)
  300.             RS.StartIndices.push_back(Leaf->SuffixIdx);
  301.           break;
  302.         }
  303.       }
  304.  
  305.       // At this point, either NewRS is an empty RepeatedSubstring, or it was
  306.       // set in the above loop. Similarly, N is either nullptr, or the node
  307.       // associated with NewRS.
  308.     }
  309.  
  310.   public:
  311.     /// Return the current repeated substring.
  312.     RepeatedSubstring &operator*() { return RS; }
  313.  
  314.     RepeatedSubstringIterator &operator++() {
  315.       advance();
  316.       return *this;
  317.     }
  318.  
  319.     RepeatedSubstringIterator operator++(int I) {
  320.       RepeatedSubstringIterator It(*this);
  321.       advance();
  322.       return It;
  323.     }
  324.  
  325.     bool operator==(const RepeatedSubstringIterator &Other) const {
  326.       return N == Other.N;
  327.     }
  328.     bool operator!=(const RepeatedSubstringIterator &Other) const {
  329.       return !(*this == Other);
  330.     }
  331.  
  332.     RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
  333.       // Do we have a non-null node?
  334.       if (N) {
  335.         // Yes. At the first step, we need to visit all of N's children.
  336.         // Note: This means that we visit N last.
  337.         ToVisit.push_back(N);
  338.         advance();
  339.       }
  340.     }
  341.   };
  342.  
  343.   typedef RepeatedSubstringIterator iterator;
  344.   iterator begin() { return iterator(Root); }
  345.   iterator end() { return iterator(nullptr); }
  346. };
  347.  
  348. } // namespace llvm
  349.  
  350. #endif // LLVM_SUPPORT_SUFFIXTREE_H
  351.