- //===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===// 
- // 
- // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
- // See https://llvm.org/LICENSE.txt for license information. 
- // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 
- // 
- //===----------------------------------------------------------------------===// 
- // Record tokens that a preprocessor emits and define operations to map between 
- // the tokens written in a file and tokens produced by the preprocessor. 
- // 
- // When running the compiler, there are two token streams we are interested in: 
- //   - "spelled" tokens directly correspond to a substring written in some 
- //     source file. 
- //   - "expanded" tokens represent the result of preprocessing, parses consumes 
- //     this token stream to produce the AST. 
- // 
- // Expanded tokens correspond directly to locations found in the AST, allowing 
- // to find subranges of the token stream covered by various AST nodes. Spelled 
- // tokens correspond directly to the source code written by the user. 
- // 
- // To allow composing these two use-cases, we also define operations that map 
- // between expanded and spelled tokens that produced them (macro calls, 
- // directives, etc). 
- // 
- //===----------------------------------------------------------------------===// 
-   
- #ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H 
- #define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H 
-   
- #include "clang/Basic/LangOptions.h" 
- #include "clang/Basic/SourceLocation.h" 
- #include "clang/Basic/SourceManager.h" 
- #include "clang/Basic/TokenKinds.h" 
- #include "clang/Lex/Token.h" 
- #include "llvm/ADT/ArrayRef.h" 
- #include "llvm/ADT/DenseMap.h" 
- #include "llvm/ADT/StringRef.h" 
- #include "llvm/Support/Compiler.h" 
- #include "llvm/Support/raw_ostream.h" 
- #include <cstdint> 
- #include <tuple> 
-   
- namespace clang { 
- class Preprocessor; 
-   
- namespace syntax { 
-   
- /// A half-open character range inside a particular file, the start offset is 
- /// included and the end offset is excluded from the range. 
- struct FileRange { 
-   /// EXPECTS: File.isValid() && Begin <= End. 
-   FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset); 
-   /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(). 
-   FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length); 
-   /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files 
-   ///          are the same. 
-   FileRange(const SourceManager &SM, SourceLocation BeginLoc, 
-             SourceLocation EndLoc); 
-   
-   FileID file() const { return File; } 
-   /// Start is a start offset (inclusive) in the corresponding file. 
-   unsigned beginOffset() const { return Begin; } 
-   /// End offset (exclusive) in the corresponding file. 
-   unsigned endOffset() const { return End; } 
-   
-   unsigned length() const { return End - Begin; } 
-   
-   /// Check if \p Offset is inside the range. 
-   bool contains(unsigned Offset) const { 
-     return Begin <= Offset && Offset < End; 
-   } 
-   /// Check \p Offset is inside the range or equal to its endpoint. 
-   bool touches(unsigned Offset) const { 
-     return Begin <= Offset && Offset <= End; 
-   } 
-   
-   /// Gets the substring that this FileRange refers to. 
-   llvm::StringRef text(const SourceManager &SM) const; 
-   
-   /// Convert to the clang range. The returned range is always a char range, 
-   /// never a token range. 
-   CharSourceRange toCharRange(const SourceManager &SM) const; 
-   
-   friend bool operator==(const FileRange &L, const FileRange &R) { 
-     return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End); 
-   } 
-   friend bool operator!=(const FileRange &L, const FileRange &R) { 
-     return !(L == R); 
-   } 
-   
- private: 
-   FileID File; 
-   unsigned Begin; 
-   unsigned End; 
- }; 
-   
- /// For debugging purposes. 
- llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R); 
-   
- /// A token coming directly from a file or from a macro invocation. Has just 
- /// enough information to locate the token in the source code. 
- /// Can represent both expanded and spelled tokens. 
- class Token { 
- public: 
-   Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind); 
-   /// EXPECTS: clang::Token is not an annotation token. 
-   explicit Token(const clang::Token &T); 
-   
-   tok::TokenKind kind() const { return Kind; } 
-   /// Location of the first character of a token. 
-   SourceLocation location() const { return Location; } 
-   /// Location right after the last character of a token. 
-   SourceLocation endLocation() const { 
-     return Location.getLocWithOffset(Length); 
-   } 
-   unsigned length() const { return Length; } 
-   
-   /// Get the substring covered by the token. Note that will include all 
-   /// digraphs, newline continuations, etc. E.g. tokens for 'int' and 
-   ///    in\ 
-   ///    t 
-   /// both have the same kind tok::kw_int, but results of text() are different. 
-   llvm::StringRef text(const SourceManager &SM) const; 
-   
-   /// Gets a range of this token. 
-   /// EXPECTS: token comes from a file, not from a macro expansion. 
-   FileRange range(const SourceManager &SM) const; 
-   
-   /// Given two tokens inside the same file, returns a file range that starts at 
-   /// \p First and ends at \p Last. 
-   /// EXPECTS: First and Last are file tokens from the same file, Last starts 
-   ///          after First. 
-   static FileRange range(const SourceManager &SM, const syntax::Token &First, 
-                          const syntax::Token &Last); 
-   
-   std::string dumpForTests(const SourceManager &SM) const; 
-   /// For debugging purposes. 
-   std::string str() const; 
-   
- private: 
-   SourceLocation Location; 
-   unsigned Length; 
-   tok::TokenKind Kind; 
- }; 
- /// For debugging purposes. Equivalent to a call to Token::str(). 
- llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T); 
-   
- /// A list of tokens obtained by preprocessing a text buffer and operations to 
- /// map between the expanded and spelled tokens, i.e. TokenBuffer has 
- /// information about two token streams: 
- ///    1. Expanded tokens: tokens produced by the preprocessor after all macro 
- ///       replacements, 
- ///    2. Spelled tokens: corresponding directly to the source code of a file 
- ///       before any macro replacements occurred. 
- /// Here's an example to illustrate a difference between those two: 
- ///     #define FOO 10 
- ///     int a = FOO; 
- /// 
- /// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}. 
- /// Expanded tokens are {'int','a','=','10',';','eof'}. 
- /// 
- /// Note that the expanded token stream has a tok::eof token at the end, the 
- /// spelled tokens never store a 'eof' token. 
- /// 
- /// The full list expanded tokens can be obtained with expandedTokens(). Spelled 
- /// tokens for each of the files can be obtained via spelledTokens(FileID). 
- /// 
- /// To map between the expanded and spelled tokens use findSpelledByExpanded(). 
- /// 
- /// To build a token buffer use the TokenCollector class. You can also compute 
- /// the spelled tokens of a file using the tokenize() helper. 
- /// 
- /// FIXME: allow mappings into macro arguments. 
- class TokenBuffer { 
- public: 
-   TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {} 
-   
-   TokenBuffer(TokenBuffer &&) = default; 
-   TokenBuffer(const TokenBuffer &) = delete; 
-   TokenBuffer &operator=(TokenBuffer &&) = default; 
-   TokenBuffer &operator=(const TokenBuffer &) = delete; 
-   
-   /// All tokens produced by the preprocessor after all macro replacements, 
-   /// directives, etc. Source locations found in the clang AST will always 
-   /// point to one of these tokens. 
-   /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()). 
-   /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split 
-   ///        into two '>' tokens by the parser. However, TokenBuffer currently 
-   ///        keeps it as a single '>>' token. 
-   llvm::ArrayRef<syntax::Token> expandedTokens() const { 
-     return ExpandedTokens; 
-   } 
-   
-   /// Builds a cache to make future calls to expandedToken(SourceRange) faster. 
-   /// Creates an index only once. Further calls to it will be no-op. 
-   void indexExpandedTokens(); 
-   
-   /// Returns the subrange of expandedTokens() corresponding to the closed 
-   /// token range R. 
-   /// Consider calling indexExpandedTokens() before for faster lookups. 
-   llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const; 
-   
-   /// Returns the subrange of spelled tokens corresponding to AST node spanning 
-   /// \p Expanded. This is the text that should be replaced if a refactoring 
-   /// were to rewrite the node. If \p Expanded is empty, the returned value is 
-   /// std::nullopt. 
-   /// 
-   /// Will fail if the expanded tokens do not correspond to a sequence of 
-   /// spelled tokens. E.g. for the following example: 
-   /// 
-   ///   #define FIRST f1 f2 f3 
-   ///   #define SECOND s1 s2 s3 
-   ///   #define ID2(X, Y) X Y 
-   /// 
-   ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c 
-   ///   d ID2(e f g, h) i  // expanded tokens are: d e f g h i 
-   /// 
-   /// the results would be: 
-   ///   expanded   => spelled 
-   ///   ------------------------ 
-   ///            a => a 
-   ///     s1 s2 s3 => SECOND 
-   ///   a f1 f2 f3 => a FIRST 
-   ///         a f1 => can't map 
-   ///        s1 s2 => can't map 
-   ///         e f  => e f 
-   ///         g h  => can't map 
-   /// 
-   /// EXPECTS: \p Expanded is a subrange of expandedTokens(). 
-   /// Complexity is logarithmic. 
-   std::optional<llvm::ArrayRef<syntax::Token>> 
-   spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const; 
-   
-   /// Find the subranges of expanded tokens, corresponding to \p Spelled. 
-   /// 
-   /// Some spelled tokens may not be present in the expanded token stream, so 
-   /// this function can return an empty vector, e.g. for tokens of macro 
-   /// directives or disabled preprocessor branches. 
-   /// 
-   /// Some spelled tokens can be duplicated in the expanded token stream 
-   /// multiple times and this function will return multiple results in those 
-   /// cases. This happens when \p Spelled is inside a macro argument. 
-   /// 
-   /// FIXME: return correct results on macro arguments. For now, we return an 
-   ///        empty list. 
-   /// 
-   /// (!) will return empty vector on tokens from #define body: 
-   /// E.g. for the following example: 
-   /// 
-   ///   #define FIRST(A) f1 A = A f2 
-   ///   #define SECOND s 
-   /// 
-   ///   a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s 
-   /// The results would be 
-   ///   spelled           => expanded 
-   ///   ------------------------ 
-   ///   #define FIRST     => {} 
-   ///   a FIRST(arg)      => {a f1 arg = arg f2} 
-   ///   arg               => {arg, arg} // arg #1 is before `=` and arg #2 is 
-   ///                                   // after `=` in the expanded tokens. 
-   llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1> 
-   expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const; 
-   
-   /// An expansion produced by the preprocessor, includes macro expansions and 
-   /// preprocessor directives. Preprocessor always maps a non-empty range of 
-   /// spelled tokens to a (possibly empty) range of expanded tokens. Here is a 
-   /// few examples of expansions: 
-   ///    #pragma once      // Expands to an empty range. 
-   ///    #define FOO 1 2 3 // Expands an empty range. 
-   ///    FOO               // Expands to "1 2 3". 
-   /// FIXME(ibiryukov): implement this, currently #include expansions are empty. 
-   ///    #include <vector> // Expands to tokens produced by the include. 
-   struct Expansion { 
-     llvm::ArrayRef<syntax::Token> Spelled; 
-     llvm::ArrayRef<syntax::Token> Expanded; 
-   }; 
-   /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting 
-   /// a preprocessor directive) return the subrange of expanded tokens that the 
-   /// macro expands to. 
-   std::optional<Expansion> 
-   expansionStartingAt(const syntax::Token *Spelled) const; 
-   /// Returns all expansions (partially) expanded from the specified tokens. 
-   /// This is the expansions whose Spelled range intersects \p Spelled. 
-   std::vector<Expansion> 
-   expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const; 
-   
-   /// Lexed tokens of a file before preprocessing. E.g. for the following input 
-   ///     #define DECL(name) int name = 10 
-   ///     DECL(a); 
-   /// spelledTokens() returns 
-   ///    {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10", 
-   ///     "DECL", "(", "a", ")", ";"} 
-   llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const; 
-   
-   /// Returns the spelled Token starting at Loc, if there are no such tokens 
-   /// returns nullptr. 
-   const syntax::Token *spelledTokenAt(SourceLocation Loc) const; 
-   
-   /// Get all tokens that expand a macro in \p FID. For the following input 
-   ///     #define FOO B 
-   ///     #define FOO2(X) int X 
-   ///     FOO2(XY) 
-   ///     int B; 
-   ///     FOO; 
-   /// macroExpansions() returns {"FOO2", "FOO"} (from line 3 and 5 
-   /// respecitvely). 
-   std::vector<const syntax::Token *> macroExpansions(FileID FID) const; 
-   
-   const SourceManager &sourceManager() const { return *SourceMgr; } 
-   
-   std::string dumpForTests() const; 
-   
- private: 
-   /// Describes a mapping between a continuous subrange of spelled tokens and 
-   /// expanded tokens. Represents macro expansions, preprocessor directives, 
-   /// conditionally disabled pp regions, etc. 
-   ///   #define FOO 1+2 
-   ///   #define BAR(a) a + 1 
-   ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}. 
-   ///   BAR(1) // invocation #2, tokens = {'a', '+', '1'}, 
-   ///                            macroTokens = {'BAR', '(', '1', ')'}. 
-   struct Mapping { 
-     // Positions in the corresponding spelled token stream. The corresponding 
-     // range is never empty. 
-     unsigned BeginSpelled = 0; 
-     unsigned EndSpelled = 0; 
-     // Positions in the expanded token stream. The corresponding range can be 
-     // empty. 
-     unsigned BeginExpanded = 0; 
-     unsigned EndExpanded = 0; 
-   
-     /// For debugging purposes. 
-     std::string str() const; 
-   }; 
-   /// Spelled tokens of the file with information about the subranges. 
-   struct MarkedFile { 
-     /// Lexed, but not preprocessed, tokens of the file. These map directly to 
-     /// text in the corresponding files and include tokens of all preprocessor 
-     /// directives. 
-     /// FIXME: spelled tokens don't change across FileID that map to the same 
-     ///        FileEntry. We could consider deduplicating them to save memory. 
-     std::vector<syntax::Token> SpelledTokens; 
-     /// A sorted list to convert between the spelled and expanded token streams. 
-     std::vector<Mapping> Mappings; 
-     /// The first expanded token produced for this FileID. 
-     unsigned BeginExpanded = 0; 
-     unsigned EndExpanded = 0; 
-   }; 
-   
-   friend class TokenCollector; 
-   
-   /// Maps a single expanded token to its spelled counterpart or a mapping that 
-   /// produced it. 
-   std::pair<const syntax::Token *, const Mapping *> 
-   spelledForExpandedToken(const syntax::Token *Expanded) const; 
-   
-   /// Returns a mapping starting before \p Spelled token, or nullptr if no 
-   /// such mapping exists. 
-   static const Mapping * 
-   mappingStartingBeforeSpelled(const MarkedFile &F, 
-                                const syntax::Token *Spelled); 
-   
-   /// Convert a private Mapping to a public Expansion. 
-   Expansion makeExpansion(const MarkedFile &, const Mapping &) const; 
-   /// Returns the file that the Spelled tokens are taken from. 
-   /// Asserts that they are non-empty, from a tracked file, and in-bounds. 
-   const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const; 
-   
-   /// Token stream produced after preprocessing, conceputally this captures the 
-   /// same stream as 'clang -E' (excluding the preprocessor directives like 
-   /// #file, etc.). 
-   std::vector<syntax::Token> ExpandedTokens; 
-   // Index of ExpandedTokens for faster lookups by SourceLocation. 
-   llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex; 
-   llvm::DenseMap<FileID, MarkedFile> Files; 
-   // The value is never null, pointer instead of reference to avoid disabling 
-   // implicit assignment operator. 
-   const SourceManager *SourceMgr; 
- }; 
-   
- /// The spelled tokens that overlap or touch a spelling location Loc. 
- /// This always returns 0-2 tokens. 
- llvm::ArrayRef<syntax::Token> 
- spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens); 
- llvm::ArrayRef<syntax::Token> 
- spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens); 
-   
- /// The identifier token that overlaps or touches a spelling location Loc. 
- /// If there is none, returns nullptr. 
- const syntax::Token * 
- spelledIdentifierTouching(SourceLocation Loc, 
-                           llvm::ArrayRef<syntax::Token> Tokens); 
- const syntax::Token * 
- spelledIdentifierTouching(SourceLocation Loc, 
-                           const syntax::TokenBuffer &Tokens); 
-   
- /// Lex the text buffer, corresponding to \p FID, in raw mode and record the 
- /// resulting spelled tokens. Does minimal post-processing on raw identifiers, 
- /// setting the appropriate token kind (instead of the raw_identifier reported 
- /// by lexer in raw mode). This is a very low-level function, most users should 
- /// prefer to use TokenCollector. Lexing in raw mode produces wildly different 
- /// results from what one might expect when running a C++ frontend, e.g. 
- /// preprocessor does not run at all. 
- /// The result will *not* have a 'eof' token at the end. 
- std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM, 
-                                     const LangOptions &LO); 
- /// Similar to one above, instead of whole file tokenizes a part of it. Note 
- /// that, the first token might be incomplete if FR.startOffset is not at the 
- /// beginning of a token, and the last token returned will start before the 
- /// FR.endOffset but might end after it. 
- std::vector<syntax::Token> 
- tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO); 
-   
- /// Collects tokens for the main file while running the frontend action. An 
- /// instance of this object should be created on 
- /// FrontendAction::BeginSourceFile() and the results should be consumed after 
- /// FrontendAction::Execute() finishes. 
- class TokenCollector { 
- public: 
-   /// Adds the hooks to collect the tokens. Should be called before the 
-   /// preprocessing starts, i.e. as a part of BeginSourceFile() or 
-   /// CreateASTConsumer(). 
-   TokenCollector(Preprocessor &P); 
-   
-   /// Finalizes token collection. Should be called after preprocessing is 
-   /// finished, i.e. after running Execute(). 
-   [[nodiscard]] TokenBuffer consume() &&; 
-   
- private: 
-   /// Maps from a start to an end spelling location of transformations 
-   /// performed by the preprocessor. These include: 
-   ///   1. range from '#' to the last token in the line for PP directives, 
-   ///   2. macro name and arguments for macro expansions. 
-   /// Note that we record only top-level macro expansions, intermediate 
-   /// expansions (e.g. inside macro arguments) are ignored. 
-   /// 
-   /// Used to find correct boundaries of macro calls and directives when 
-   /// building mappings from spelled to expanded tokens. 
-   /// 
-   /// Logically, at each point of the preprocessor execution there is a stack of 
-   /// macro expansions being processed and we could use it to recover the 
-   /// location information we need. However, the public preprocessor API only 
-   /// exposes the points when macro expansions start (when we push a macro onto 
-   /// the stack) and not when they end (when we pop a macro from the stack). 
-   /// To workaround this limitation, we rely on source location information 
-   /// stored in this map. 
-   using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>; 
-   class Builder; 
-   class CollectPPExpansions; 
-   
-   std::vector<syntax::Token> Expanded; 
-   // FIXME: we only store macro expansions, also add directives(#pragma, etc.) 
-   PPExpansions Expansions; 
-   Preprocessor &PP; 
-   CollectPPExpansions *Collector; 
- }; 
-   
- } // namespace syntax 
- } // namespace clang 
-   
- #endif 
-