Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
14 | pmbaty | 1 | //===--- CloneDetection.h - Finds code clones in an AST ---------*- C++ -*-===// |
2 | // |
||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
4 | // See https://llvm.org/LICENSE.txt for license information. |
||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
6 | // |
||
7 | //===----------------------------------------------------------------------===// |
||
8 | /// |
||
9 | /// \file |
||
10 | /// This file defines classes for searching and analyzing source code clones. |
||
11 | /// |
||
12 | //===----------------------------------------------------------------------===// |
||
13 | |||
14 | #ifndef LLVM_CLANG_ANALYSIS_CLONEDETECTION_H |
||
15 | #define LLVM_CLANG_ANALYSIS_CLONEDETECTION_H |
||
16 | |||
17 | #include "clang/AST/StmtVisitor.h" |
||
18 | #include "llvm/Support/Regex.h" |
||
19 | #include <vector> |
||
20 | |||
21 | namespace clang { |
||
22 | |||
23 | class Stmt; |
||
24 | class Decl; |
||
25 | class VarDecl; |
||
26 | class ASTContext; |
||
27 | class CompoundStmt; |
||
28 | |||
29 | /// Identifies a list of statements. |
||
30 | /// |
||
31 | /// Can either identify a single arbitrary Stmt object, a continuous sequence of |
||
32 | /// child statements inside a CompoundStmt or no statements at all. |
||
33 | class StmtSequence { |
||
34 | /// If this object identifies a sequence of statements inside a CompoundStmt, |
||
35 | /// S points to this CompoundStmt. If this object only identifies a single |
||
36 | /// Stmt, then S is a pointer to this Stmt. |
||
37 | const Stmt *S; |
||
38 | |||
39 | /// The declaration that contains the statements. |
||
40 | const Decl *D; |
||
41 | |||
42 | /// If EndIndex is non-zero, then S is a CompoundStmt and this StmtSequence |
||
43 | /// instance is representing the CompoundStmt children inside the array |
||
44 | /// [StartIndex, EndIndex). |
||
45 | unsigned StartIndex; |
||
46 | unsigned EndIndex; |
||
47 | |||
48 | public: |
||
49 | /// Constructs a StmtSequence holding multiple statements. |
||
50 | /// |
||
51 | /// The resulting StmtSequence identifies a continuous sequence of statements |
||
52 | /// in the body of the given CompoundStmt. Which statements of the body should |
||
53 | /// be identified needs to be specified by providing a start and end index |
||
54 | /// that describe a non-empty sub-array in the body of the given CompoundStmt. |
||
55 | /// |
||
56 | /// \param Stmt A CompoundStmt that contains all statements in its body. |
||
57 | /// \param D The Decl containing this Stmt. |
||
58 | /// \param StartIndex The inclusive start index in the children array of |
||
59 | /// \p Stmt |
||
60 | /// \param EndIndex The exclusive end index in the children array of \p Stmt. |
||
61 | StmtSequence(const CompoundStmt *Stmt, const Decl *D, unsigned StartIndex, |
||
62 | unsigned EndIndex); |
||
63 | |||
64 | /// Constructs a StmtSequence holding a single statement. |
||
65 | /// |
||
66 | /// \param Stmt An arbitrary Stmt. |
||
67 | /// \param D The Decl containing this Stmt. |
||
68 | StmtSequence(const Stmt *Stmt, const Decl *D); |
||
69 | |||
70 | /// Constructs an empty StmtSequence. |
||
71 | StmtSequence(); |
||
72 | |||
73 | typedef const Stmt *const *iterator; |
||
74 | |||
75 | /// Returns an iterator pointing to the first statement in this sequence. |
||
76 | iterator begin() const; |
||
77 | |||
78 | /// Returns an iterator pointing behind the last statement in this sequence. |
||
79 | iterator end() const; |
||
80 | |||
81 | /// Returns the first statement in this sequence. |
||
82 | /// |
||
83 | /// This method should only be called on a non-empty StmtSequence object. |
||
84 | const Stmt *front() const { |
||
85 | assert(!empty()); |
||
86 | return begin()[0]; |
||
87 | } |
||
88 | |||
89 | /// Returns the last statement in this sequence. |
||
90 | /// |
||
91 | /// This method should only be called on a non-empty StmtSequence object. |
||
92 | const Stmt *back() const { |
||
93 | assert(!empty()); |
||
94 | return begin()[size() - 1]; |
||
95 | } |
||
96 | |||
97 | /// Returns the number of statements this object holds. |
||
98 | unsigned size() const { |
||
99 | if (holdsSequence()) |
||
100 | return EndIndex - StartIndex; |
||
101 | if (S == nullptr) |
||
102 | return 0; |
||
103 | return 1; |
||
104 | } |
||
105 | |||
106 | /// Returns true if and only if this StmtSequence contains no statements. |
||
107 | bool empty() const { return size() == 0; } |
||
108 | |||
109 | /// Returns the related ASTContext for the stored Stmts. |
||
110 | ASTContext &getASTContext() const; |
||
111 | |||
112 | /// Returns the declaration that contains the stored Stmts. |
||
113 | const Decl *getContainingDecl() const { |
||
114 | assert(D); |
||
115 | return D; |
||
116 | } |
||
117 | |||
118 | /// Returns true if this objects holds a list of statements. |
||
119 | bool holdsSequence() const { return EndIndex != 0; } |
||
120 | |||
121 | /// Returns the start sourcelocation of the first statement in this sequence. |
||
122 | /// |
||
123 | /// This method should only be called on a non-empty StmtSequence object. |
||
124 | SourceLocation getBeginLoc() const; |
||
125 | |||
126 | /// Returns the end sourcelocation of the last statement in this sequence. |
||
127 | /// |
||
128 | /// This method should only be called on a non-empty StmtSequence object. |
||
129 | SourceLocation getEndLoc() const; |
||
130 | |||
131 | /// Returns the source range of the whole sequence - from the beginning |
||
132 | /// of the first statement to the end of the last statement. |
||
133 | SourceRange getSourceRange() const; |
||
134 | |||
135 | bool operator==(const StmtSequence &Other) const { |
||
136 | return std::tie(S, StartIndex, EndIndex) == |
||
137 | std::tie(Other.S, Other.StartIndex, Other.EndIndex); |
||
138 | } |
||
139 | |||
140 | bool operator!=(const StmtSequence &Other) const { |
||
141 | return std::tie(S, StartIndex, EndIndex) != |
||
142 | std::tie(Other.S, Other.StartIndex, Other.EndIndex); |
||
143 | } |
||
144 | |||
145 | /// Returns true if and only if this sequence covers a source range that |
||
146 | /// contains the source range of the given sequence \p Other. |
||
147 | /// |
||
148 | /// This method should only be called on a non-empty StmtSequence object |
||
149 | /// and passed a non-empty StmtSequence object. |
||
150 | bool contains(const StmtSequence &Other) const; |
||
151 | }; |
||
152 | |||
153 | /// Searches for similar subtrees in the AST. |
||
154 | /// |
||
155 | /// First, this class needs several declarations with statement bodies which |
||
156 | /// can be passed via analyzeCodeBody. Afterwards all statements can be |
||
157 | /// searched for clones by calling findClones with a given list of constraints |
||
158 | /// that should specify the wanted properties of the clones. |
||
159 | /// |
||
160 | /// The result of findClones can be further constrained with the constrainClones |
||
161 | /// method. |
||
162 | /// |
||
163 | /// This class only searches for clones in executable source code |
||
164 | /// (e.g. function bodies). Other clones (e.g. cloned comments or declarations) |
||
165 | /// are not supported. |
||
166 | class CloneDetector { |
||
167 | |||
168 | public: |
||
169 | /// A collection of StmtSequences that share an arbitrary property. |
||
170 | typedef llvm::SmallVector<StmtSequence, 8> CloneGroup; |
||
171 | |||
172 | /// Generates and stores search data for all statements in the body of |
||
173 | /// the given Decl. |
||
174 | void analyzeCodeBody(const Decl *D); |
||
175 | |||
176 | /// Constrains the given list of clone groups with the given constraint. |
||
177 | /// |
||
178 | /// The constraint is expected to have a method with the signature |
||
179 | /// `void constrain(std::vector<CloneDetector::CloneGroup> &Sequences)` |
||
180 | /// as this is the interface that the CloneDetector uses for applying the |
||
181 | /// constraint. The constraint is supposed to directly modify the passed list |
||
182 | /// so that all clones in the list fulfill the specific property this |
||
183 | /// constraint ensures. |
||
184 | template <typename T> |
||
185 | static void constrainClones(std::vector<CloneGroup> &CloneGroups, T C) { |
||
186 | C.constrain(CloneGroups); |
||
187 | } |
||
188 | |||
189 | /// Constrains the given list of clone groups with the given list of |
||
190 | /// constraints. |
||
191 | /// |
||
192 | /// The constraints are applied in sequence in the order in which they are |
||
193 | /// passed to this function. |
||
194 | template <typename T1, typename... Ts> |
||
195 | static void constrainClones(std::vector<CloneGroup> &CloneGroups, T1 C, |
||
196 | Ts... ConstraintList) { |
||
197 | constrainClones(CloneGroups, C); |
||
198 | constrainClones(CloneGroups, ConstraintList...); |
||
199 | } |
||
200 | |||
201 | /// Searches for clones in all previously passed statements. |
||
202 | /// \param Result Output parameter to which all created clone groups are |
||
203 | /// added. |
||
204 | /// \param ConstraintList The constraints that should be applied to the |
||
205 | // result. |
||
206 | template <typename... Ts> |
||
207 | void findClones(std::vector<CloneGroup> &Result, Ts... ConstraintList) { |
||
208 | // The initial assumption is that there is only one clone group and every |
||
209 | // statement is a clone of the others. This clone group will then be |
||
210 | // split up with the help of the constraints. |
||
211 | Result.push_back(Sequences); |
||
212 | |||
213 | constrainClones(Result, ConstraintList...); |
||
214 | } |
||
215 | |||
216 | private: |
||
217 | CloneGroup Sequences; |
||
218 | }; |
||
219 | |||
220 | /// This class is a utility class that contains utility functions for building |
||
221 | /// custom constraints. |
||
222 | class CloneConstraint { |
||
223 | public: |
||
224 | /// Removes all groups by using a filter function. |
||
225 | /// \param CloneGroups The list of CloneGroups that is supposed to be |
||
226 | /// filtered. |
||
227 | /// \param Filter The filter function that should return true for all groups |
||
228 | /// that should be removed from the list. |
||
229 | static void filterGroups( |
||
230 | std::vector<CloneDetector::CloneGroup> &CloneGroups, |
||
231 | llvm::function_ref<bool(const CloneDetector::CloneGroup &)> Filter) { |
||
232 | llvm::erase_if(CloneGroups, Filter); |
||
233 | } |
||
234 | |||
235 | /// Splits the given CloneGroups until the given Compare function returns true |
||
236 | /// for all clones in a single group. |
||
237 | /// \param CloneGroups A list of CloneGroups that should be modified. |
||
238 | /// \param Compare The comparison function that all clones are supposed to |
||
239 | /// pass. Should return true if and only if two clones belong |
||
240 | /// to the same CloneGroup. |
||
241 | static void splitCloneGroups( |
||
242 | std::vector<CloneDetector::CloneGroup> &CloneGroups, |
||
243 | llvm::function_ref<bool(const StmtSequence &, const StmtSequence &)> |
||
244 | Compare); |
||
245 | }; |
||
246 | |||
247 | /// This constraint moves clones into clone groups of type II via hashing. |
||
248 | /// |
||
249 | /// Clones with different hash values are moved into separate clone groups. |
||
250 | /// Collisions are possible, and this constraint does nothing to address this |
||
251 | /// them. Add the slower RecursiveCloneTypeIIVerifyConstraint later in the |
||
252 | /// constraint chain, not necessarily immediately, to eliminate hash collisions |
||
253 | /// through a more detailed analysis. |
||
254 | class RecursiveCloneTypeIIHashConstraint { |
||
255 | public: |
||
256 | void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); |
||
257 | }; |
||
258 | |||
259 | /// This constraint moves clones into clone groups of type II by comparing them. |
||
260 | /// |
||
261 | /// Clones that aren't type II clones are moved into separate clone groups. |
||
262 | /// In contrast to the RecursiveCloneTypeIIHashConstraint, all clones in a clone |
||
263 | /// group are guaranteed to be type II clones of each other, but it is too |
||
264 | /// slow to efficiently handle large amounts of clones. |
||
265 | class RecursiveCloneTypeIIVerifyConstraint { |
||
266 | public: |
||
267 | void constrain(std::vector<CloneDetector::CloneGroup> &Sequences); |
||
268 | }; |
||
269 | |||
270 | /// Ensures that every clone has at least the given complexity. |
||
271 | /// |
||
272 | /// Complexity is here defined as the total amount of children of a statement. |
||
273 | /// This constraint assumes the first statement in the group is representative |
||
274 | /// for all other statements in the group in terms of complexity. |
||
275 | class MinComplexityConstraint { |
||
276 | unsigned MinComplexity; |
||
277 | |||
278 | public: |
||
279 | MinComplexityConstraint(unsigned MinComplexity) |
||
280 | : MinComplexity(MinComplexity) {} |
||
281 | |||
282 | /// Calculates the complexity of the given StmtSequence. |
||
283 | /// \param Limit The limit of complexity we probe for. After reaching |
||
284 | /// this limit during calculation, this method is exiting |
||
285 | /// early to improve performance and returns this limit. |
||
286 | size_t calculateStmtComplexity(const StmtSequence &Seq, std::size_t Limit, |
||
287 | const std::string &ParentMacroStack = ""); |
||
288 | |||
289 | void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { |
||
290 | CloneConstraint::filterGroups( |
||
291 | CloneGroups, [this](const CloneDetector::CloneGroup &A) { |
||
292 | if (!A.empty()) |
||
293 | return calculateStmtComplexity(A.front(), MinComplexity) < |
||
294 | MinComplexity; |
||
295 | else |
||
296 | return false; |
||
297 | }); |
||
298 | } |
||
299 | }; |
||
300 | |||
301 | /// Ensures that all clone groups contain at least the given amount of clones. |
||
302 | class MinGroupSizeConstraint { |
||
303 | unsigned MinGroupSize; |
||
304 | |||
305 | public: |
||
306 | MinGroupSizeConstraint(unsigned MinGroupSize = 2) |
||
307 | : MinGroupSize(MinGroupSize) {} |
||
308 | |||
309 | void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { |
||
310 | CloneConstraint::filterGroups(CloneGroups, |
||
311 | [this](const CloneDetector::CloneGroup &A) { |
||
312 | return A.size() < MinGroupSize; |
||
313 | }); |
||
314 | } |
||
315 | }; |
||
316 | |||
317 | /// Ensures that no clone group fully contains another clone group. |
||
318 | struct OnlyLargestCloneConstraint { |
||
319 | void constrain(std::vector<CloneDetector::CloneGroup> &Result); |
||
320 | }; |
||
321 | |||
322 | struct FilenamePatternConstraint { |
||
323 | StringRef IgnoredFilesPattern; |
||
324 | std::shared_ptr<llvm::Regex> IgnoredFilesRegex; |
||
325 | |||
326 | FilenamePatternConstraint(StringRef IgnoredFilesPattern) |
||
327 | : IgnoredFilesPattern(IgnoredFilesPattern) { |
||
328 | IgnoredFilesRegex = std::make_shared<llvm::Regex>("^(" + |
||
329 | IgnoredFilesPattern.str() + "$)"); |
||
330 | } |
||
331 | |||
332 | bool isAutoGenerated(const CloneDetector::CloneGroup &Group); |
||
333 | |||
334 | void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups) { |
||
335 | CloneConstraint::filterGroups( |
||
336 | CloneGroups, [this](const CloneDetector::CloneGroup &Group) { |
||
337 | return isAutoGenerated(Group); |
||
338 | }); |
||
339 | } |
||
340 | }; |
||
341 | |||
342 | /// Analyzes the pattern of the referenced variables in a statement. |
||
343 | class VariablePattern { |
||
344 | |||
345 | /// Describes an occurrence of a variable reference in a statement. |
||
346 | struct VariableOccurence { |
||
347 | /// The index of the associated VarDecl in the Variables vector. |
||
348 | size_t KindID; |
||
349 | /// The statement in the code where the variable was referenced. |
||
350 | const Stmt *Mention; |
||
351 | |||
352 | VariableOccurence(size_t KindID, const Stmt *Mention) |
||
353 | : KindID(KindID), Mention(Mention) {} |
||
354 | }; |
||
355 | |||
356 | /// All occurrences of referenced variables in the order of appearance. |
||
357 | std::vector<VariableOccurence> Occurences; |
||
358 | /// List of referenced variables in the order of appearance. |
||
359 | /// Every item in this list is unique. |
||
360 | std::vector<const VarDecl *> Variables; |
||
361 | |||
362 | /// Adds a new variable referenced to this pattern. |
||
363 | /// \param VarDecl The declaration of the variable that is referenced. |
||
364 | /// \param Mention The SourceRange where this variable is referenced. |
||
365 | void addVariableOccurence(const VarDecl *VarDecl, const Stmt *Mention); |
||
366 | |||
367 | /// Adds each referenced variable from the given statement. |
||
368 | void addVariables(const Stmt *S); |
||
369 | |||
370 | public: |
||
371 | /// Creates an VariablePattern object with information about the given |
||
372 | /// StmtSequence. |
||
373 | VariablePattern(const StmtSequence &Sequence) { |
||
374 | for (const Stmt *S : Sequence) |
||
375 | addVariables(S); |
||
376 | } |
||
377 | |||
378 | /// Describes two clones that reference their variables in a different pattern |
||
379 | /// which could indicate a programming error. |
||
380 | struct SuspiciousClonePair { |
||
381 | /// Utility class holding the relevant information about a single |
||
382 | /// clone in this pair. |
||
383 | struct SuspiciousCloneInfo { |
||
384 | /// The variable which referencing in this clone was against the pattern. |
||
385 | const VarDecl *Variable; |
||
386 | /// Where the variable was referenced. |
||
387 | const Stmt *Mention; |
||
388 | /// The variable that should have been referenced to follow the pattern. |
||
389 | /// If Suggestion is a nullptr then it's not possible to fix the pattern |
||
390 | /// by referencing a different variable in this clone. |
||
391 | const VarDecl *Suggestion; |
||
392 | SuspiciousCloneInfo(const VarDecl *Variable, const Stmt *Mention, |
||
393 | const VarDecl *Suggestion) |
||
394 | : Variable(Variable), Mention(Mention), Suggestion(Suggestion) {} |
||
395 | SuspiciousCloneInfo() {} |
||
396 | }; |
||
397 | /// The first clone in the pair which always has a suggested variable. |
||
398 | SuspiciousCloneInfo FirstCloneInfo; |
||
399 | /// This other clone in the pair which can have a suggested variable. |
||
400 | SuspiciousCloneInfo SecondCloneInfo; |
||
401 | }; |
||
402 | |||
403 | /// Counts the differences between this pattern and the given one. |
||
404 | /// \param Other The given VariablePattern to compare with. |
||
405 | /// \param FirstMismatch Output parameter that will be filled with information |
||
406 | /// about the first difference between the two patterns. This parameter |
||
407 | /// can be a nullptr, in which case it will be ignored. |
||
408 | /// \return Returns the number of differences between the pattern this object |
||
409 | /// is following and the given VariablePattern. |
||
410 | /// |
||
411 | /// For example, the following statements all have the same pattern and this |
||
412 | /// function would return zero: |
||
413 | /// |
||
414 | /// if (a < b) return a; return b; |
||
415 | /// if (x < y) return x; return y; |
||
416 | /// if (u2 < u1) return u2; return u1; |
||
417 | /// |
||
418 | /// But the following statement has a different pattern (note the changed |
||
419 | /// variables in the return statements) and would have two differences when |
||
420 | /// compared with one of the statements above. |
||
421 | /// |
||
422 | /// if (a < b) return b; return a; |
||
423 | /// |
||
424 | /// This function should only be called if the related statements of the given |
||
425 | /// pattern and the statements of this objects are clones of each other. |
||
426 | unsigned countPatternDifferences( |
||
427 | const VariablePattern &Other, |
||
428 | VariablePattern::SuspiciousClonePair *FirstMismatch = nullptr); |
||
429 | }; |
||
430 | |||
431 | /// Ensures that all clones reference variables in the same pattern. |
||
432 | struct MatchingVariablePatternConstraint { |
||
433 | void constrain(std::vector<CloneDetector::CloneGroup> &CloneGroups); |
||
434 | }; |
||
435 | |||
436 | } // end namespace clang |
||
437 | |||
438 | #endif // LLVM_CLANG_ANALYSIS_CLONEDETECTION_H |