Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. // Record tokens that a preprocessor emits and define operations to map between
  9. // the tokens written in a file and tokens produced by the preprocessor.
  10. //
  11. // When running the compiler, there are two token streams we are interested in:
  12. //   - "spelled" tokens directly correspond to a substring written in some
  13. //     source file.
  14. //   - "expanded" tokens represent the result of preprocessing, parses consumes
  15. //     this token stream to produce the AST.
  16. //
  17. // Expanded tokens correspond directly to locations found in the AST, allowing
  18. // to find subranges of the token stream covered by various AST nodes. Spelled
  19. // tokens correspond directly to the source code written by the user.
  20. //
  21. // To allow composing these two use-cases, we also define operations that map
  22. // between expanded and spelled tokens that produced them (macro calls,
  23. // directives, etc).
  24. //
  25. //===----------------------------------------------------------------------===//
  26.  
  27. #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
  28. #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
  29.  
  30. #include "clang/Basic/LangOptions.h"
  31. #include "clang/Basic/SourceLocation.h"
  32. #include "clang/Basic/SourceManager.h"
  33. #include "clang/Basic/TokenKinds.h"
  34. #include "clang/Lex/Token.h"
  35. #include "llvm/ADT/ArrayRef.h"
  36. #include "llvm/ADT/DenseMap.h"
  37. #include "llvm/ADT/StringRef.h"
  38. #include "llvm/Support/Compiler.h"
  39. #include "llvm/Support/raw_ostream.h"
  40. #include <cstdint>
  41. #include <tuple>
  42.  
  43. namespace clang {
  44. class Preprocessor;
  45.  
  46. namespace syntax {
  47.  
  48. /// A half-open character range inside a particular file, the start offset is
  49. /// included and the end offset is excluded from the range.
  50. struct FileRange {
  51.   /// EXPECTS: File.isValid() && Begin <= End.
  52.   FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
  53.   /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
  54.   FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
  55.   /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
  56.   ///          are the same.
  57.   FileRange(const SourceManager &SM, SourceLocation BeginLoc,
  58.             SourceLocation EndLoc);
  59.  
  60.   FileID file() const { return File; }
  61.   /// Start is a start offset (inclusive) in the corresponding file.
  62.   unsigned beginOffset() const { return Begin; }
  63.   /// End offset (exclusive) in the corresponding file.
  64.   unsigned endOffset() const { return End; }
  65.  
  66.   unsigned length() const { return End - Begin; }
  67.  
  68.   /// Check if \p Offset is inside the range.
  69.   bool contains(unsigned Offset) const {
  70.     return Begin <= Offset && Offset < End;
  71.   }
  72.   /// Check \p Offset is inside the range or equal to its endpoint.
  73.   bool touches(unsigned Offset) const {
  74.     return Begin <= Offset && Offset <= End;
  75.   }
  76.  
  77.   /// Gets the substring that this FileRange refers to.
  78.   llvm::StringRef text(const SourceManager &SM) const;
  79.  
  80.   /// Convert to the clang range. The returned range is always a char range,
  81.   /// never a token range.
  82.   CharSourceRange toCharRange(const SourceManager &SM) const;
  83.  
  84.   friend bool operator==(const FileRange &L, const FileRange &R) {
  85.     return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
  86.   }
  87.   friend bool operator!=(const FileRange &L, const FileRange &R) {
  88.     return !(L == R);
  89.   }
  90.  
  91. private:
  92.   FileID File;
  93.   unsigned Begin;
  94.   unsigned End;
  95. };
  96.  
  97. /// For debugging purposes.
  98. llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
  99.  
  100. /// A token coming directly from a file or from a macro invocation. Has just
  101. /// enough information to locate the token in the source code.
  102. /// Can represent both expanded and spelled tokens.
  103. class Token {
  104. public:
  105.   Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
  106.   /// EXPECTS: clang::Token is not an annotation token.
  107.   explicit Token(const clang::Token &T);
  108.  
  109.   tok::TokenKind kind() const { return Kind; }
  110.   /// Location of the first character of a token.
  111.   SourceLocation location() const { return Location; }
  112.   /// Location right after the last character of a token.
  113.   SourceLocation endLocation() const {
  114.     return Location.getLocWithOffset(Length);
  115.   }
  116.   unsigned length() const { return Length; }
  117.  
  118.   /// Get the substring covered by the token. Note that will include all
  119.   /// digraphs, newline continuations, etc. E.g. tokens for 'int' and
  120.   ///    in\
  121.   ///    t
  122.   /// both have the same kind tok::kw_int, but results of text() are different.
  123.   llvm::StringRef text(const SourceManager &SM) const;
  124.  
  125.   /// Gets a range of this token.
  126.   /// EXPECTS: token comes from a file, not from a macro expansion.
  127.   FileRange range(const SourceManager &SM) const;
  128.  
  129.   /// Given two tokens inside the same file, returns a file range that starts at
  130.   /// \p First and ends at \p Last.
  131.   /// EXPECTS: First and Last are file tokens from the same file, Last starts
  132.   ///          after First.
  133.   static FileRange range(const SourceManager &SM, const syntax::Token &First,
  134.                          const syntax::Token &Last);
  135.  
  136.   std::string dumpForTests(const SourceManager &SM) const;
  137.   /// For debugging purposes.
  138.   std::string str() const;
  139.  
  140. private:
  141.   SourceLocation Location;
  142.   unsigned Length;
  143.   tok::TokenKind Kind;
  144. };
  145. /// For debugging purposes. Equivalent to a call to Token::str().
  146. llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
  147.  
  148. /// A list of tokens obtained by preprocessing a text buffer and operations to
  149. /// map between the expanded and spelled tokens, i.e. TokenBuffer has
  150. /// information about two token streams:
  151. ///    1. Expanded tokens: tokens produced by the preprocessor after all macro
  152. ///       replacements,
  153. ///    2. Spelled tokens: corresponding directly to the source code of a file
  154. ///       before any macro replacements occurred.
  155. /// Here's an example to illustrate a difference between those two:
  156. ///     #define FOO 10
  157. ///     int a = FOO;
  158. ///
  159. /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
  160. /// Expanded tokens are {'int','a','=','10',';','eof'}.
  161. ///
  162. /// Note that the expanded token stream has a tok::eof token at the end, the
  163. /// spelled tokens never store a 'eof' token.
  164. ///
  165. /// The full list expanded tokens can be obtained with expandedTokens(). Spelled
  166. /// tokens for each of the files can be obtained via spelledTokens(FileID).
  167. ///
  168. /// To map between the expanded and spelled tokens use findSpelledByExpanded().
  169. ///
  170. /// To build a token buffer use the TokenCollector class. You can also compute
  171. /// the spelled tokens of a file using the tokenize() helper.
  172. ///
  173. /// FIXME: allow mappings into macro arguments.
  174. class TokenBuffer {
  175. public:
  176.   TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}
  177.  
  178.   TokenBuffer(TokenBuffer &&) = default;
  179.   TokenBuffer(const TokenBuffer &) = delete;
  180.   TokenBuffer &operator=(TokenBuffer &&) = default;
  181.   TokenBuffer &operator=(const TokenBuffer &) = delete;
  182.  
  183.   /// All tokens produced by the preprocessor after all macro replacements,
  184.   /// directives, etc. Source locations found in the clang AST will always
  185.   /// point to one of these tokens.
  186.   /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
  187.   /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
  188.   ///        into two '>' tokens by the parser. However, TokenBuffer currently
  189.   ///        keeps it as a single '>>' token.
  190.   llvm::ArrayRef<syntax::Token> expandedTokens() const {
  191.     return ExpandedTokens;
  192.   }
  193.  
  194.   /// Builds a cache to make future calls to expandedToken(SourceRange) faster.
  195.   /// Creates an index only once. Further calls to it will be no-op.
  196.   void indexExpandedTokens();
  197.  
  198.   /// Returns the subrange of expandedTokens() corresponding to the closed
  199.   /// token range R.
  200.   /// Consider calling indexExpandedTokens() before for faster lookups.
  201.   llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const;
  202.  
  203.   /// Returns the subrange of spelled tokens corresponding to AST node spanning
  204.   /// \p Expanded. This is the text that should be replaced if a refactoring
  205.   /// were to rewrite the node. If \p Expanded is empty, the returned value is
  206.   /// std::nullopt.
  207.   ///
  208.   /// Will fail if the expanded tokens do not correspond to a sequence of
  209.   /// spelled tokens. E.g. for the following example:
  210.   ///
  211.   ///   #define FIRST f1 f2 f3
  212.   ///   #define SECOND s1 s2 s3
  213.   ///   #define ID2(X, Y) X Y
  214.   ///
  215.   ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
  216.   ///   d ID2(e f g, h) i  // expanded tokens are: d e f g h i
  217.   ///
  218.   /// the results would be:
  219.   ///   expanded   => spelled
  220.   ///   ------------------------
  221.   ///            a => a
  222.   ///     s1 s2 s3 => SECOND
  223.   ///   a f1 f2 f3 => a FIRST
  224.   ///         a f1 => can't map
  225.   ///        s1 s2 => can't map
  226.   ///         e f  => e f
  227.   ///         g h  => can't map
  228.   ///
  229.   /// EXPECTS: \p Expanded is a subrange of expandedTokens().
  230.   /// Complexity is logarithmic.
  231.   std::optional<llvm::ArrayRef<syntax::Token>>
  232.   spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
  233.  
  234.   /// Find the subranges of expanded tokens, corresponding to \p Spelled.
  235.   ///
  236.   /// Some spelled tokens may not be present in the expanded token stream, so
  237.   /// this function can return an empty vector, e.g. for tokens of macro
  238.   /// directives or disabled preprocessor branches.
  239.   ///
  240.   /// Some spelled tokens can be duplicated in the expanded token stream
  241.   /// multiple times and this function will return multiple results in those
  242.   /// cases. This happens when \p Spelled is inside a macro argument.
  243.   ///
  244.   /// FIXME: return correct results on macro arguments. For now, we return an
  245.   ///        empty list.
  246.   ///
  247.   /// (!) will return empty vector on tokens from #define body:
  248.   /// E.g. for the following example:
  249.   ///
  250.   ///   #define FIRST(A) f1 A = A f2
  251.   ///   #define SECOND s
  252.   ///
  253.   ///   a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s
  254.   /// The results would be
  255.   ///   spelled           => expanded
  256.   ///   ------------------------
  257.   ///   #define FIRST     => {}
  258.   ///   a FIRST(arg)      => {a f1 arg = arg f2}
  259.   ///   arg               => {arg, arg} // arg #1 is before `=` and arg #2 is
  260.   ///                                   // after `=` in the expanded tokens.
  261.   llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1>
  262.   expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;
  263.  
  264.   /// An expansion produced by the preprocessor, includes macro expansions and
  265.   /// preprocessor directives. Preprocessor always maps a non-empty range of
  266.   /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a
  267.   /// few examples of expansions:
  268.   ///    #pragma once      // Expands to an empty range.
  269.   ///    #define FOO 1 2 3 // Expands an empty range.
  270.   ///    FOO               // Expands to "1 2 3".
  271.   /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
  272.   ///    #include <vector> // Expands to tokens produced by the include.
  273.   struct Expansion {
  274.     llvm::ArrayRef<syntax::Token> Spelled;
  275.     llvm::ArrayRef<syntax::Token> Expanded;
  276.   };
  277.   /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
  278.   /// a preprocessor directive) return the subrange of expanded tokens that the
  279.   /// macro expands to.
  280.   std::optional<Expansion>
  281.   expansionStartingAt(const syntax::Token *Spelled) const;
  282.   /// Returns all expansions (partially) expanded from the specified tokens.
  283.   /// This is the expansions whose Spelled range intersects \p Spelled.
  284.   std::vector<Expansion>
  285.   expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const;
  286.  
  287.   /// Lexed tokens of a file before preprocessing. E.g. for the following input
  288.   ///     #define DECL(name) int name = 10
  289.   ///     DECL(a);
  290.   /// spelledTokens() returns
  291.   ///    {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
  292.   ///     "DECL", "(", "a", ")", ";"}
  293.   llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;
  294.  
  295.   /// Returns the spelled Token starting at Loc, if there are no such tokens
  296.   /// returns nullptr.
  297.   const syntax::Token *spelledTokenAt(SourceLocation Loc) const;
  298.  
  299.   /// Get all tokens that expand a macro in \p FID. For the following input
  300.   ///     #define FOO B
  301.   ///     #define FOO2(X) int X
  302.   ///     FOO2(XY)
  303.   ///     int B;
  304.   ///     FOO;
  305.   /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5
  306.   /// respecitvely).
  307.   std::vector<const syntax::Token *> macroExpansions(FileID FID) const;
  308.  
  309.   const SourceManager &sourceManager() const { return *SourceMgr; }
  310.  
  311.   std::string dumpForTests() const;
  312.  
  313. private:
  314.   /// Describes a mapping between a continuous subrange of spelled tokens and
  315.   /// expanded tokens. Represents macro expansions, preprocessor directives,
  316.   /// conditionally disabled pp regions, etc.
  317.   ///   #define FOO 1+2
  318.   ///   #define BAR(a) a + 1
  319.   ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
  320.   ///   BAR(1) // invocation #2, tokens = {'a', '+', '1'},
  321.   ///                            macroTokens = {'BAR', '(', '1', ')'}.
  322.   struct Mapping {
  323.     // Positions in the corresponding spelled token stream. The corresponding
  324.     // range is never empty.
  325.     unsigned BeginSpelled = 0;
  326.     unsigned EndSpelled = 0;
  327.     // Positions in the expanded token stream. The corresponding range can be
  328.     // empty.
  329.     unsigned BeginExpanded = 0;
  330.     unsigned EndExpanded = 0;
  331.  
  332.     /// For debugging purposes.
  333.     std::string str() const;
  334.   };
  335.   /// Spelled tokens of the file with information about the subranges.
  336.   struct MarkedFile {
  337.     /// Lexed, but not preprocessed, tokens of the file. These map directly to
  338.     /// text in the corresponding files and include tokens of all preprocessor
  339.     /// directives.
  340.     /// FIXME: spelled tokens don't change across FileID that map to the same
  341.     ///        FileEntry. We could consider deduplicating them to save memory.
  342.     std::vector<syntax::Token> SpelledTokens;
  343.     /// A sorted list to convert between the spelled and expanded token streams.
  344.     std::vector<Mapping> Mappings;
  345.     /// The first expanded token produced for this FileID.
  346.     unsigned BeginExpanded = 0;
  347.     unsigned EndExpanded = 0;
  348.   };
  349.  
  350.   friend class TokenCollector;
  351.  
  352.   /// Maps a single expanded token to its spelled counterpart or a mapping that
  353.   /// produced it.
  354.   std::pair<const syntax::Token *, const Mapping *>
  355.   spelledForExpandedToken(const syntax::Token *Expanded) const;
  356.  
  357.   /// Returns a mapping starting before \p Spelled token, or nullptr if no
  358.   /// such mapping exists.
  359.   static const Mapping *
  360.   mappingStartingBeforeSpelled(const MarkedFile &F,
  361.                                const syntax::Token *Spelled);
  362.  
  363.   /// Convert a private Mapping to a public Expansion.
  364.   Expansion makeExpansion(const MarkedFile &, const Mapping &) const;
  365.   /// Returns the file that the Spelled tokens are taken from.
  366.   /// Asserts that they are non-empty, from a tracked file, and in-bounds.
  367.   const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;
  368.  
  369.   /// Token stream produced after preprocessing, conceputally this captures the
  370.   /// same stream as 'clang -E' (excluding the preprocessor directives like
  371.   /// #file, etc.).
  372.   std::vector<syntax::Token> ExpandedTokens;
  373.   // Index of ExpandedTokens for faster lookups by SourceLocation.
  374.   llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex;
  375.   llvm::DenseMap<FileID, MarkedFile> Files;
  376.   // The value is never null, pointer instead of reference to avoid disabling
  377.   // implicit assignment operator.
  378.   const SourceManager *SourceMgr;
  379. };
  380.  
  381. /// The spelled tokens that overlap or touch a spelling location Loc.
  382. /// This always returns 0-2 tokens.
  383. llvm::ArrayRef<syntax::Token>
  384. spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
  385. llvm::ArrayRef<syntax::Token>
  386. spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens);
  387.  
  388. /// The identifier token that overlaps or touches a spelling location Loc.
  389. /// If there is none, returns nullptr.
  390. const syntax::Token *
  391. spelledIdentifierTouching(SourceLocation Loc,
  392.                           llvm::ArrayRef<syntax::Token> Tokens);
  393. const syntax::Token *
  394. spelledIdentifierTouching(SourceLocation Loc,
  395.                           const syntax::TokenBuffer &Tokens);
  396.  
  397. /// Lex the text buffer, corresponding to \p FID, in raw mode and record the
  398. /// resulting spelled tokens. Does minimal post-processing on raw identifiers,
  399. /// setting the appropriate token kind (instead of the raw_identifier reported
  400. /// by lexer in raw mode). This is a very low-level function, most users should
  401. /// prefer to use TokenCollector. Lexing in raw mode produces wildly different
  402. /// results from what one might expect when running a C++ frontend, e.g.
  403. /// preprocessor does not run at all.
  404. /// The result will *not* have a 'eof' token at the end.
  405. std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
  406.                                     const LangOptions &LO);
  407. /// Similar to one above, instead of whole file tokenizes a part of it. Note
  408. /// that, the first token might be incomplete if FR.startOffset is not at the
  409. /// beginning of a token, and the last token returned will start before the
  410. /// FR.endOffset but might end after it.
  411. std::vector<syntax::Token>
  412. tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);
  413.  
  414. /// Collects tokens for the main file while running the frontend action. An
  415. /// instance of this object should be created on
  416. /// FrontendAction::BeginSourceFile() and the results should be consumed after
  417. /// FrontendAction::Execute() finishes.
  418. class TokenCollector {
  419. public:
  420.   /// Adds the hooks to collect the tokens. Should be called before the
  421.   /// preprocessing starts, i.e. as a part of BeginSourceFile() or
  422.   /// CreateASTConsumer().
  423.   TokenCollector(Preprocessor &P);
  424.  
  425.   /// Finalizes token collection. Should be called after preprocessing is
  426.   /// finished, i.e. after running Execute().
  427.   [[nodiscard]] TokenBuffer consume() &&;
  428.  
  429. private:
  430.   /// Maps from a start to an end spelling location of transformations
  431.   /// performed by the preprocessor. These include:
  432.   ///   1. range from '#' to the last token in the line for PP directives,
  433.   ///   2. macro name and arguments for macro expansions.
  434.   /// Note that we record only top-level macro expansions, intermediate
  435.   /// expansions (e.g. inside macro arguments) are ignored.
  436.   ///
  437.   /// Used to find correct boundaries of macro calls and directives when
  438.   /// building mappings from spelled to expanded tokens.
  439.   ///
  440.   /// Logically, at each point of the preprocessor execution there is a stack of
  441.   /// macro expansions being processed and we could use it to recover the
  442.   /// location information we need. However, the public preprocessor API only
  443.   /// exposes the points when macro expansions start (when we push a macro onto
  444.   /// the stack) and not when they end (when we pop a macro from the stack).
  445.   /// To workaround this limitation, we rely on source location information
  446.   /// stored in this map.
  447.   using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>;
  448.   class Builder;
  449.   class CollectPPExpansions;
  450.  
  451.   std::vector<syntax::Token> Expanded;
  452.   // FIXME: we only store macro expansions, also add directives(#pragma, etc.)
  453.   PPExpansions Expansions;
  454.   Preprocessor &PP;
  455.   CollectPPExpansions *Collector;
  456. };
  457.  
  458. } // namespace syntax
  459. } // namespace clang
  460.  
  461. #endif
  462.