Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
14 | pmbaty | 1 | //===-- llvm/ADT/edit_distance.h - Array edit distance function --- C++ -*-===// |
2 | // |
||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
4 | // See https://llvm.org/LICENSE.txt for license information. |
||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
6 | // |
||
7 | //===----------------------------------------------------------------------===// |
||
8 | /// |
||
9 | /// \file |
||
10 | /// This file defines a Levenshtein distance function that works for any two |
||
11 | /// sequences, with each element of each sequence being analogous to a character |
||
12 | /// in a string. |
||
13 | /// |
||
14 | //===----------------------------------------------------------------------===// |
||
15 | |||
16 | #ifndef LLVM_ADT_EDIT_DISTANCE_H |
||
17 | #define LLVM_ADT_EDIT_DISTANCE_H |
||
18 | |||
19 | #include "llvm/ADT/ArrayRef.h" |
||
20 | #include <algorithm> |
||
21 | #include <memory> |
||
22 | |||
23 | namespace llvm { |
||
24 | |||
25 | /// Determine the edit distance between two sequences. |
||
26 | /// |
||
27 | /// \param FromArray the first sequence to compare. |
||
28 | /// |
||
29 | /// \param ToArray the second sequence to compare. |
||
30 | /// |
||
31 | /// \param Map A Functor to apply to each item of the sequences before |
||
32 | /// comparison. |
||
33 | /// |
||
34 | /// \param AllowReplacements whether to allow element replacements (change one |
||
35 | /// element into another) as a single operation, rather than as two operations |
||
36 | /// (an insertion and a removal). |
||
37 | /// |
||
38 | /// \param MaxEditDistance If non-zero, the maximum edit distance that this |
||
39 | /// routine is allowed to compute. If the edit distance will exceed that |
||
40 | /// maximum, returns \c MaxEditDistance+1. |
||
41 | /// |
||
42 | /// \returns the minimum number of element insertions, removals, or (if |
||
43 | /// \p AllowReplacements is \c true) replacements needed to transform one of |
||
44 | /// the given sequences into the other. If zero, the sequences are identical. |
||
45 | template <typename T, typename Functor> |
||
46 | unsigned ComputeMappedEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, |
||
47 | Functor Map, bool AllowReplacements = true, |
||
48 | unsigned MaxEditDistance = 0) { |
||
49 | // The algorithm implemented below is the "classic" |
||
50 | // dynamic-programming algorithm for computing the Levenshtein |
||
51 | // distance, which is described here: |
||
52 | // |
||
53 | // http://en.wikipedia.org/wiki/Levenshtein_distance |
||
54 | // |
||
55 | // Although the algorithm is typically described using an m x n |
||
56 | // array, only one row plus one element are used at a time, so this |
||
57 | // implementation just keeps one vector for the row. To update one entry, |
||
58 | // only the entries to the left, top, and top-left are needed. The left |
||
59 | // entry is in Row[x-1], the top entry is what's in Row[x] from the last |
||
60 | // iteration, and the top-left entry is stored in Previous. |
||
61 | typename ArrayRef<T>::size_type m = FromArray.size(); |
||
62 | typename ArrayRef<T>::size_type n = ToArray.size(); |
||
63 | |||
64 | if (MaxEditDistance) { |
||
65 | // If the difference in size between the 2 arrays is larger than the max |
||
66 | // distance allowed, we can bail out as we will always need at least |
||
67 | // MaxEditDistance insertions or removals. |
||
68 | typename ArrayRef<T>::size_type AbsDiff = m > n ? m - n : n - m; |
||
69 | if (AbsDiff > MaxEditDistance) |
||
70 | return MaxEditDistance + 1; |
||
71 | } |
||
72 | |||
73 | const unsigned SmallBufferSize = 64; |
||
74 | unsigned SmallBuffer[SmallBufferSize]; |
||
75 | std::unique_ptr<unsigned[]> Allocated; |
||
76 | unsigned *Row = SmallBuffer; |
||
77 | if (n + 1 > SmallBufferSize) { |
||
78 | Row = new unsigned[n + 1]; |
||
79 | Allocated.reset(Row); |
||
80 | } |
||
81 | |||
82 | for (unsigned i = 1; i <= n; ++i) |
||
83 | Row[i] = i; |
||
84 | |||
85 | for (typename ArrayRef<T>::size_type y = 1; y <= m; ++y) { |
||
86 | Row[0] = y; |
||
87 | unsigned BestThisRow = Row[0]; |
||
88 | |||
89 | unsigned Previous = y - 1; |
||
90 | const auto &CurItem = Map(FromArray[y - 1]); |
||
91 | for (typename ArrayRef<T>::size_type x = 1; x <= n; ++x) { |
||
92 | int OldRow = Row[x]; |
||
93 | if (AllowReplacements) { |
||
94 | Row[x] = std::min(Previous + (CurItem == Map(ToArray[x - 1]) ? 0u : 1u), |
||
95 | std::min(Row[x - 1], Row[x]) + 1); |
||
96 | } |
||
97 | else { |
||
98 | if (CurItem == Map(ToArray[x - 1])) |
||
99 | Row[x] = Previous; |
||
100 | else Row[x] = std::min(Row[x-1], Row[x]) + 1; |
||
101 | } |
||
102 | Previous = OldRow; |
||
103 | BestThisRow = std::min(BestThisRow, Row[x]); |
||
104 | } |
||
105 | |||
106 | if (MaxEditDistance && BestThisRow > MaxEditDistance) |
||
107 | return MaxEditDistance + 1; |
||
108 | } |
||
109 | |||
110 | unsigned Result = Row[n]; |
||
111 | return Result; |
||
112 | } |
||
113 | |||
114 | template <typename T> |
||
115 | unsigned ComputeEditDistance(ArrayRef<T> FromArray, ArrayRef<T> ToArray, |
||
116 | bool AllowReplacements = true, |
||
117 | unsigned MaxEditDistance = 0) { |
||
118 | return ComputeMappedEditDistance( |
||
119 | FromArray, ToArray, [](const T &X) -> const T & { return X; }, |
||
120 | AllowReplacements, MaxEditDistance); |
||
121 | } |
||
122 | |||
123 | } // End llvm namespace |
||
124 | |||
125 | #endif |