Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line |
|---|---|---|---|
| 14 | pmbaty | 1 | //===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===// |
| 2 | // |
||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
| 6 | // |
||
| 7 | //===----------------------------------------------------------------------===// |
||
| 8 | /// |
||
| 9 | /// \file |
||
| 10 | /// This file defines a Levenshtein distance function that works for any two |
||
| 11 | /// sequences, with each element of each sequence being analogous to a character |
||
| 12 | /// in a string. |
||
| 13 | /// |
||
| 14 | //===----------------------------------------------------------------------===// |
||
| 15 | |||
| 16 | #ifndef LLVM_ADT_EDIT_DISTANCE_H |
||
| 17 | #define LLVM_ADT_EDIT_DISTANCE_H |
||
| 18 | |||
| 19 | #include "llvm/ADT/ArrayRef.h" |
||
| 20 | #include <algorithm> |
||
| 21 | #include <memory> |
||
| 22 | |||
| 23 | namespace llvm { |
||
| 24 | |||
| 25 | /// Determine the edit distance between two sequences. |
||
| 26 | /// |
||
| 27 | /// \param FromArray the first sequence to compare. |
||
| 28 | /// |
||
| 29 | /// \param ToArray the second sequence to compare. |
||
| 30 | /// |
||
| 31 | /// \param Map A Functor to apply to each item of the sequences before |
||
| 32 | /// comparison. |
||
| 33 | /// |
||
| 34 | /// \param AllowReplacements whether to allow element replacements (change one |
||
| 35 | /// element into another) as a single operation, rather than as two operations |
||
| 36 | /// (an insertion and a removal). |
||
| 37 | /// |
||
| 38 | /// \param MaxEditDistance If non-zero, the maximum edit distance that this |
||
| 39 | /// routine is allowed to compute. If the edit distance will exceed that |
||
| 40 | /// maximum, returns \c MaxEditDistance+1. |
||
| 41 | /// |
||
| 42 | /// \returns the minimum number of element insertions, removals, or (if |
||
| 43 | /// \p AllowReplacements is \c true) replacements needed to transform one of |
||
| 44 | /// the given sequences into the other. If zero, the sequences are identical. |
||
| 45 | template <typename T, typename Functor> |
||
| 46 | unsigned ComputeMappedEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, |
||
| 47 | Functor Map, bool AllowReplacements = true, |
||
| 48 | unsigned MaxEditDistance = 0) { |
||
| 49 | // The algorithm implemented below is the "classic" |
||
| 50 | // dynamic-programming algorithm for computing the Levenshtein |
||
| 51 | // distance, which is described here: |
||
| 52 | // |
||
| 53 | // http://en.wikipedia.org/wiki/Levenshtein_distance |
||
| 54 | // |
||
| 55 | // Although the algorithm is typically described using an m x n |
||
| 56 | // array, only one row plus one element are used at a time, so this |
||
| 57 | // implementation just keeps one vector for the row. To update one entry, |
||
| 58 | // only the entries to the left, top, and top-left are needed. The left |
||
| 59 | // entry is in Row[x-1], the top entry is what's in Row[x] from the last |
||
| 60 | // iteration, and the top-left entry is stored in Previous. |
||
| 61 | typename ArrayRef<T>::size_type m = FromArray.size(); |
||
| 62 | typename ArrayRef<T>::size_type n = ToArray.size(); |
||
| 63 | |||
| 64 | if (MaxEditDistance) { |
||
| 65 | // If the difference in size between the 2 arrays is larger than the max |
||
| 66 | // distance allowed, we can bail out as we will always need at least |
||
| 67 | // MaxEditDistance insertions or removals. |
||
| 68 | typename ArrayRef<T>::size_type AbsDiff = m > n ? m - n : n - m; |
||
| 69 | if (AbsDiff > MaxEditDistance) |
||
| 70 | return MaxEditDistance + 1; |
||
| 71 | } |
||
| 72 | |||
| 73 | const unsigned SmallBufferSize = 64; |
||
| 74 | unsigned SmallBuffer[SmallBufferSize]; |
||
| 75 | std::unique_ptr<unsigned[]> Allocated; |
||
| 76 | unsigned *Row = SmallBuffer; |
||
| 77 | if (n + 1 > SmallBufferSize) { |
||
| 78 | Row = new unsigned[n + 1]; |
||
| 79 | Allocated.reset(Row); |
||
| 80 | } |
||
| 81 | |||
| 82 | for (unsigned i = 1; i <= n; ++i) |
||
| 83 | Row[i] = i; |
||
| 84 | |||
| 85 | for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) { |
||
| 86 | Row[0] = y; |
||
| 87 | unsigned BestThisRow = Row[0]; |
||
| 88 | |||
| 89 | unsigned Previous = y - 1; |
||
| 90 | const auto &CurItem = Map(FromArray[y - 1]); |
||
| 91 | for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) { |
||
| 92 | int OldRow = Row[x]; |
||
| 93 | if (AllowReplacements) { |
||
| 94 | Row[x] = std::min(Previous + (CurItem == Map(ToArray[x - 1]) ? 0u : 1u), |
||
| 95 | std::min(Row[x - 1], Row[x]) + 1); |
||
| 96 | } |
||
| 97 | else { |
||
| 98 | if (CurItem == Map(ToArray[x - 1])) |
||
| 99 | Row[x] = Previous; |
||
| 100 | else Row[x] = std::min(Row[x-1], Row[x]) + 1; |
||
| 101 | } |
||
| 102 | Previous = OldRow; |
||
| 103 | BestThisRow = std::min(BestThisRow, Row[x]); |
||
| 104 | } |
||
| 105 | |||
| 106 | if (MaxEditDistance && BestThisRow > MaxEditDistance) |
||
| 107 | return MaxEditDistance + 1; |
||
| 108 | } |
||
| 109 | |||
| 110 | unsigned Result = Row[n]; |
||
| 111 | return Result; |
||
| 112 | } |
||
| 113 | |||
| 114 | template <typename T> |
||
| 115 | unsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, |
||
| 116 | bool AllowReplacements = true, |
||
| 117 | unsigned MaxEditDistance = 0) { |
||
| 118 | return ComputeMappedEditDistance( |
||
| 119 | FromArray, ToArray, [](const T &X) -> const T & { return X; }, |
||
| 120 | AllowReplacements, MaxEditDistance); |
||
| 121 | } |
||
| 122 | |||
| 123 | } // End llvm namespace |
||
| 124 | |||
| 125 | #endif |