//===- Tokens.h - collect tokens from preprocessing --------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// Record tokens that a preprocessor emits and define operations to map between
// the tokens written in a file and the tokens produced by the preprocessor.
//
// When running the compiler, there are two token streams we are interested in:
//   - "spelled" tokens directly correspond to a substring written in some
//     source file.
//   - "expanded" tokens represent the result of preprocessing; the parser
//     consumes this token stream to produce the AST.
//
// Expanded tokens correspond directly to locations found in the AST, making it
// possible to find the subranges of the token stream covered by various AST
// nodes. Spelled tokens correspond directly to the source code written by the
// user.
//
// To allow composing these two use cases, we also define operations that map
// between expanded tokens and the spelled tokens that produced them (macro
// calls, directives, etc).
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H
#define LLVM_CLANG_TOOLING_SYNTAX_TOKENS_H

#include "clang/Basic/LangOptions.h"
#include "clang/Basic/SourceLocation.h"
#include "clang/Basic/SourceManager.h"
#include "clang/Basic/TokenKinds.h"
#include "clang/Lex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <optional>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

namespace clang {
class Preprocessor;

namespace syntax {

/// A half-open character range inside a particular file; the start offset is
/// included and the end offset is excluded from the range.
struct FileRange {
  /// EXPECTS: File.isValid() && Begin <= End.
  FileRange(FileID File, unsigned BeginOffset, unsigned EndOffset);
  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID().
  FileRange(const SourceManager &SM, SourceLocation BeginLoc, unsigned Length);
  /// EXPECTS: BeginLoc.isValid() && BeginLoc.isFileID(), Begin <= End and files
  ///          are the same.
  FileRange(const SourceManager &SM, SourceLocation BeginLoc,
            SourceLocation EndLoc);

  FileID file() const { return File; }
  /// Start offset (inclusive) in the corresponding file.
  unsigned beginOffset() const { return Begin; }
  /// End offset (exclusive) in the corresponding file.
  unsigned endOffset() const { return End; }

  unsigned length() const { return End - Begin; }

  /// Check if \p Offset is inside the range.
  bool contains(unsigned Offset) const {
    return Begin <= Offset && Offset < End;
  }
  /// Check if \p Offset is inside the range or equal to its endpoint.
  bool touches(unsigned Offset) const {
    return Begin <= Offset && Offset <= End;
  }

  /// Gets the substring that this FileRange refers to.
  llvm::StringRef text(const SourceManager &SM) const;

  /// Convert to the clang range. The returned range is always a char range,
  /// never a token range.
  CharSourceRange toCharRange(const SourceManager &SM) const;

  friend bool operator==(const FileRange &L, const FileRange &R) {
    return std::tie(L.File, L.Begin, L.End) == std::tie(R.File, R.Begin, R.End);
  }
  friend bool operator!=(const FileRange &L, const FileRange &R) {
    return !(L == R);
  }

private:
  FileID File;
  unsigned Begin;
  unsigned End;
};

/// For debugging purposes.
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const FileRange &R);
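// Example (illustrative sketch, not part of this header's API): building a
// FileRange and reading back the text it covers. The concrete file contents
// and offsets below are hypothetical.
//
//   void showRange(const SourceManager &SM, FileID FID) {
//     // Offsets 4..9 would cover "hello" in a file starting with
//     // "int hello = 1;".
//     FileRange R(FID, /*BeginOffset=*/4, /*EndOffset=*/9);
//     assert(R.length() == 5);
//     assert(R.contains(4) && !R.contains(9) && R.touches(9));
//     llvm::errs() << R.text(SM) << "\n"; // prints "hello"
//   }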

/// A token coming directly from a file or from a macro invocation. Has just
/// enough information to locate the token in the source code.
/// Can represent both expanded and spelled tokens.
class Token {
public:
  Token(SourceLocation Location, unsigned Length, tok::TokenKind Kind);
  /// EXPECTS: clang::Token is not an annotation token.
  explicit Token(const clang::Token &T);

  tok::TokenKind kind() const { return Kind; }
  /// Location of the first character of a token.
  SourceLocation location() const { return Location; }
  /// Location right after the last character of a token.
  SourceLocation endLocation() const {
    return Location.getLocWithOffset(Length);
  }
  unsigned length() const { return Length; }

  /// Get the substring covered by the token. Note that the result includes all
  /// digraphs, newline continuations, etc. E.g. the tokens for 'int' and
  ///    in\
  ///    t
  /// both have the same kind tok::kw_int, but the results of text() differ.
  llvm::StringRef text(const SourceManager &SM) const;

  /// Gets a range of this token.
  /// EXPECTS: token comes from a file, not from a macro expansion.
  FileRange range(const SourceManager &SM) const;

  /// Given two tokens inside the same file, returns a file range that starts at
  /// \p First and ends at \p Last.
  /// EXPECTS: First and Last are file tokens from the same file, Last starts
  /// after First.
  static FileRange range(const SourceManager &SM, const syntax::Token &First,
                         const syntax::Token &Last);

  std::string dumpForTests(const SourceManager &SM) const;
  /// For debugging purposes.
  std::string str() const;

private:
  SourceLocation Location;
  unsigned Length;
  tok::TokenKind Kind;
};
/// For debugging purposes. Equivalent to a call to Token::str().
llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T);
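// Example (illustrative sketch): printing the spelled tokens of a file with
// their text and offsets. `Tokens` is assumed to be a TokenBuffer built for
// the same translation unit (see TokenBuffer and TokenCollector below).
//
//   void dumpSpelled(const TokenBuffer &Tokens, FileID FID) {
//     const SourceManager &SM = Tokens.sourceManager();
//     for (const syntax::Token &T : Tokens.spelledTokens(FID))
//       llvm::errs() << tok::getTokenName(T.kind()) << " '" << T.text(SM)
//                    << "' at offset " << T.range(SM).beginOffset() << "\n";
//   }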

/// A list of tokens obtained by preprocessing a text buffer and operations to
/// map between the expanded and spelled tokens, i.e. TokenBuffer has
/// information about two token streams:
///    1. Expanded tokens: tokens produced by the preprocessor after all macro
///       replacements,
///    2. Spelled tokens: corresponding directly to the source code of a file
///       before any macro replacements occurred.
/// Here's an example to illustrate the difference between the two:
///     #define FOO 10
///     int a = FOO;
///
/// Spelled tokens are {'#','define','FOO','10','int','a','=','FOO',';'}.
/// Expanded tokens are {'int','a','=','10',';','eof'}.
///
/// Note that the expanded token stream has a tok::eof token at the end, while
/// the spelled tokens never store an 'eof' token.
///
/// The full list of expanded tokens can be obtained with expandedTokens().
/// Spelled tokens for each of the files can be obtained via
/// spelledTokens(FileID).
///
/// To map between the expanded and spelled tokens use spelledForExpanded().
///
/// To build a token buffer use the TokenCollector class. You can also compute
/// the spelled tokens of a file using the tokenize() helper.
///
/// FIXME: allow mappings into macro arguments.
class TokenBuffer {
public:
  TokenBuffer(const SourceManager &SourceMgr) : SourceMgr(&SourceMgr) {}

  TokenBuffer(TokenBuffer &&) = default;
  TokenBuffer(const TokenBuffer &) = delete;
  TokenBuffer &operator=(TokenBuffer &&) = default;
  TokenBuffer &operator=(const TokenBuffer &) = delete;

  /// All tokens produced by the preprocessor after all macro replacements,
  /// directives, etc. Source locations found in the clang AST will always
  /// point to one of these tokens.
  /// Tokens are in TU order (per SourceManager::isBeforeInTranslationUnit()).
  /// FIXME: figure out how to handle token splitting, e.g. '>>' can be split
  ///        into two '>' tokens by the parser. However, TokenBuffer currently
  ///        keeps it as a single '>>' token.
  llvm::ArrayRef<syntax::Token> expandedTokens() const {
    return ExpandedTokens;
  }

  /// Builds a cache to make future calls to expandedTokens(SourceRange)
  /// faster. Creates the index only once; further calls are a no-op.
  void indexExpandedTokens();

  /// Returns the subrange of expandedTokens() corresponding to the closed
  /// token range R.
  /// Consider calling indexExpandedTokens() beforehand for faster lookups.
  llvm::ArrayRef<syntax::Token> expandedTokens(SourceRange R) const;
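  // Example (illustrative sketch): mapping an AST node to its expanded tokens.
  // `Tokens` is a TokenBuffer for the translation unit and `Node` stands for
  // any AST node (e.g. a Stmt or Decl) from the same TU; both names are
  // hypothetical.
  //
  //   Tokens.indexExpandedTokens(); // optional, speeds up repeated queries
  //   llvm::ArrayRef<syntax::Token> Toks =
  //       Tokens.expandedTokens(Node->getSourceRange());
  //
  // The returned tokens are exactly the ones the parser consumed for `Node`,
  // after macro replacement.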

  /// Returns the subrange of spelled tokens corresponding to the AST node
  /// spanning \p Expanded. This is the text that should be replaced if a
  /// refactoring were to rewrite the node. If \p Expanded is empty, the
  /// returned value is std::nullopt.
  ///
  /// Will fail if the expanded tokens do not correspond to a sequence of
  /// spelled tokens. E.g. for the following example:
  ///
  ///   #define FIRST f1 f2 f3
  ///   #define SECOND s1 s2 s3
  ///   #define ID2(X, Y) X Y
  ///
  ///   a FIRST b SECOND c // expanded tokens are: a f1 f2 f3 b s1 s2 s3 c
  ///   d ID2(e f g, h) i  // expanded tokens are: d e f g h i
  ///
  /// the results would be:
  ///   expanded   =>  spelled
  ///   ------------------------
  ///            a  =>  a
  ///     s1 s2 s3  =>  SECOND
  ///   a f1 f2 f3  =>  a FIRST
  ///         a f1  =>  can't map
  ///        s1 s2  =>  can't map
  ///          e f  =>  e f
  ///          g h  =>  can't map
  ///
  /// EXPECTS: \p Expanded is a subrange of expandedTokens().
  /// Complexity is logarithmic.
  std::optional<llvm::ArrayRef<syntax::Token>>
  spelledForExpanded(llvm::ArrayRef<syntax::Token> Expanded) const;
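  // Example (illustrative sketch): computing the file text a refactoring
  // should replace for an AST node. `nodeText` and its parameters are
  // hypothetical names; `R` is the node's SourceRange.
  //
  //   std::optional<llvm::StringRef>
  //   nodeText(const TokenBuffer &Tokens, SourceRange R) {
  //     const SourceManager &SM = Tokens.sourceManager();
  //     auto Spelled = Tokens.spelledForExpanded(Tokens.expandedTokens(R));
  //     if (!Spelled || Spelled->empty())
  //       return std::nullopt; // e.g. the node is fully inside a macro body.
  //     return syntax::Token::range(SM, Spelled->front(), Spelled->back())
  //         .text(SM);
  //   }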

  /// Find the subranges of expanded tokens corresponding to \p Spelled.
  ///
  /// Some spelled tokens may not be present in the expanded token stream, so
  /// this function can return an empty vector, e.g. for tokens of macro
  /// directives or disabled preprocessor branches.
  ///
  /// Some spelled tokens can be duplicated in the expanded token stream
  /// multiple times; this function will return multiple results in those
  /// cases. This happens when \p Spelled is inside a macro argument.
  ///
  /// FIXME: return correct results on macro arguments. For now, we return an
  ///        empty list.
  ///
  /// (!) Returns an empty vector for tokens from a #define body.
  /// E.g. for the following example:
  ///
  ///   #define FIRST(A) f1 A = A f2
  ///   #define SECOND s
  ///
  ///   a FIRST(arg) b SECOND c // expanded tokens are: a f1 arg = arg f2 b s c
  /// The results would be
  ///   spelled          =>  expanded
  ///   ------------------------
  ///   #define FIRST    =>  {}
  ///   a FIRST(arg)     =>  {a f1 arg = arg f2}
  ///   arg              =>  {arg, arg} // arg #1 is before `=` and arg #2 is
  ///                                   // after `=` in the expanded tokens.
  llvm::SmallVector<llvm::ArrayRef<syntax::Token>, 1>
  expandedForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;
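  // Example (illustrative sketch): checking whether a single spelled token
  // survives preprocessing at all (e.g. it is not inside a directive or a
  // disabled #if branch). `isExpanded` is a hypothetical helper name.
  //
  //   bool isExpanded(const TokenBuffer &Tokens, const syntax::Token &Spelled) {
  //     return !Tokens
  //                 .expandedForSpelled(
  //                     llvm::ArrayRef<syntax::Token>(&Spelled, 1))
  //                 .empty();
  //   }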

  /// An expansion produced by the preprocessor, includes macro expansions and
  /// preprocessor directives. The preprocessor always maps a non-empty range
  /// of spelled tokens to a (possibly empty) range of expanded tokens. Here
  /// are a few examples of expansions:
  ///    #pragma once      // Expands to an empty range.
  ///    #define FOO 1 2 3 // Expands to an empty range.
  ///    FOO               // Expands to "1 2 3".
  /// FIXME(ibiryukov): implement this, currently #include expansions are empty.
  ///    #include <vector> // Expands to tokens produced by the include.
  struct Expansion {
    llvm::ArrayRef<syntax::Token> Spelled;
    llvm::ArrayRef<syntax::Token> Expanded;
  };
  /// If \p Spelled starts a mapping (e.g. if it's a macro name or '#' starting
  /// a preprocessor directive) return the subrange of expanded tokens that the
  /// macro expands to.
  std::optional<Expansion>
  expansionStartingAt(const syntax::Token *Spelled) const;
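  // Example (illustrative sketch): testing whether a spelled token is a macro
  // name (or the '#' of a directive) and what it expands to. `MacroNameTok`
  // is assumed to be a token from spelledTokens() of the file.
  //
  //   if (auto E = Tokens.expansionStartingAt(&MacroNameTok)) {
  //     // E->Spelled covers the macro name and its arguments, if any.
  //     // E->Expanded is what the preprocessor replaced them with (possibly
  //     // empty, e.g. for directives).
  //   }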
  /// Returns all expansions (partially) expanded from the specified tokens.
  /// These are the expansions whose spelled range intersects \p Spelled.
  std::vector<Expansion>
  expansionsOverlapping(llvm::ArrayRef<syntax::Token> Spelled) const;

  /// Lexed tokens of a file before preprocessing. E.g. for the following input
  ///     #define DECL(name) int name = 10
  ///     DECL(a);
  /// spelledTokens() returns
  ///    {"#", "define", "DECL", "(", "name", ")", "int", "name", "=", "10",
  ///     "DECL", "(", "a", ")", ";"}
  llvm::ArrayRef<syntax::Token> spelledTokens(FileID FID) const;

  /// Returns the spelled token starting at \p Loc, or nullptr if there is no
  /// such token.
  const syntax::Token *spelledTokenAt(SourceLocation Loc) const;
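  // Example (illustrative sketch): looking up the spelled token a diagnostic
  // points at. `Loc` is assumed to be a spelling location of the token's first
  // character.
  //
  //   if (const syntax::Token *T = Tokens.spelledTokenAt(Loc))
  //     llvm::errs() << "token at location: "
  //                  << T->text(Tokens.sourceManager()) << "\n";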

  /// Get all tokens that expand a macro in \p FID. For the following input
  ///     #define FOO B
  ///     #define FOO2(X) int X
  ///     FOO2(XY)
  ///     int B;
  ///     FOO;
  /// macroExpansions() returns {"FOO2", "FOO"} (from lines 3 and 5
  /// respectively).
  std::vector<const syntax::Token *> macroExpansions(FileID FID) const;

  const SourceManager &sourceManager() const { return *SourceMgr; }

  std::string dumpForTests() const;

private:
  /// Describes a mapping between a contiguous subrange of spelled tokens and
  /// expanded tokens. Represents macro expansions, preprocessor directives,
  /// conditionally disabled pp regions, etc.
  ///   #define FOO 1+2
  ///   #define BAR(a) a + 1
  ///   FOO    // invocation #1, tokens = {'1','+','2'}, macroTokens = {'FOO'}.
  ///   BAR(1) // invocation #2, tokens = {'1', '+', '1'},
  ///          //                macroTokens = {'BAR', '(', '1', ')'}.
  struct Mapping {
    // Positions in the corresponding spelled token stream. The corresponding
    // range is never empty.
    unsigned BeginSpelled = 0;
    unsigned EndSpelled = 0;
    // Positions in the expanded token stream. The corresponding range can be
    // empty.
    unsigned BeginExpanded = 0;
    unsigned EndExpanded = 0;

    /// For debugging purposes.
    std::string str() const;
  };
  /// Spelled tokens of the file with information about the subranges.
  struct MarkedFile {
    /// Lexed, but not preprocessed, tokens of the file. These map directly to
    /// text in the corresponding files and include tokens of all preprocessor
    /// directives.
    /// FIXME: spelled tokens don't change across FileIDs that map to the same
    ///        FileEntry. We could consider deduplicating them to save memory.
    std::vector<syntax::Token> SpelledTokens;
    /// A sorted list to convert between the spelled and expanded token streams.
    std::vector<Mapping> Mappings;
    /// The first expanded token produced for this FileID.
    unsigned BeginExpanded = 0;
    unsigned EndExpanded = 0;
  };

  friend class TokenCollector;

  /// Maps a single expanded token to its spelled counterpart or a mapping that
  /// produced it.
  std::pair<const syntax::Token *, const Mapping *>
  spelledForExpandedToken(const syntax::Token *Expanded) const;

  /// Returns a mapping starting before \p Spelled token, or nullptr if no
  /// such mapping exists.
  static const Mapping *
  mappingStartingBeforeSpelled(const MarkedFile &F,
                               const syntax::Token *Spelled);

  /// Convert a private Mapping to a public Expansion.
  Expansion makeExpansion(const MarkedFile &, const Mapping &) const;
  /// Returns the file that the Spelled tokens are taken from.
  /// Asserts that they are non-empty, from a tracked file, and in-bounds.
  const MarkedFile &fileForSpelled(llvm::ArrayRef<syntax::Token> Spelled) const;

  /// Token stream produced after preprocessing; conceptually this captures the
  /// same stream as 'clang -E' (excluding the preprocessor directives like
  /// #file, etc.).
  std::vector<syntax::Token> ExpandedTokens;
  // Index of ExpandedTokens for faster lookups by SourceLocation.
  llvm::DenseMap<SourceLocation, unsigned> ExpandedTokIndex;
  llvm::DenseMap<FileID, MarkedFile> Files;
  // The value is never null; we use a pointer instead of a reference to avoid
  // disabling the implicit assignment operator.
  const SourceManager *SourceMgr;
};

/// The spelled tokens that overlap or touch a spelling location Loc.
/// This always returns 0-2 tokens.
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, const syntax::TokenBuffer &Tokens);
llvm::ArrayRef<syntax::Token>
spelledTokensTouching(SourceLocation Loc, llvm::ArrayRef<syntax::Token> Tokens);

/// The identifier token that overlaps or touches a spelling location Loc.
/// If there is none, returns nullptr.
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          llvm::ArrayRef<syntax::Token> Tokens);
const syntax::Token *
spelledIdentifierTouching(SourceLocation Loc,
                          const syntax::TokenBuffer &Tokens);
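// Example (illustrative sketch): finding the identifier under a cursor
// position, e.g. for go-to-definition in an editor. `CursorLoc` is a spelling
// location inside a file tracked by `Tokens`; both names are hypothetical.
//
//   if (const syntax::Token *Id = spelledIdentifierTouching(CursorLoc, Tokens))
//     llvm::errs() << "identifier under cursor: "
//                  << Id->text(Tokens.sourceManager()) << "\n";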

/// Lex the text buffer corresponding to \p FID in raw mode and record the
/// resulting spelled tokens. Does minimal post-processing on raw identifiers,
/// setting the appropriate token kind (instead of the raw_identifier reported
/// by the lexer in raw mode). This is a very low-level function; most users
/// should prefer to use TokenCollector. Lexing in raw mode produces wildly
/// different results from what one might expect when running a C++ frontend,
/// e.g. the preprocessor does not run at all.
/// The result will *not* have an 'eof' token at the end.
std::vector<syntax::Token> tokenize(FileID FID, const SourceManager &SM,
                                    const LangOptions &LO);
/// Similar to the above, but tokenizes only a part of the file instead of the
/// whole file. Note that the first token might be incomplete if
/// FR.beginOffset() is not at the beginning of a token, and the last token
/// returned will start before FR.endOffset() but might end after it.
std::vector<syntax::Token>
tokenize(const FileRange &FR, const SourceManager &SM, const LangOptions &LO);
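// Example (illustrative sketch): raw-lexing the main file without running the
// preprocessor. `SM` and `LangOpts` are assumed to come from an existing
// CompilerInstance or SourceManager setup.
//
//   std::vector<syntax::Token> Toks =
//       tokenize(SM.getMainFileID(), SM, LangOpts);
//   // Toks contains only spelled tokens: macros are not expanded and there
//   // is no trailing 'eof' token.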

/// Collects tokens for the main file while running the frontend action. An
/// instance of this object should be created on
/// FrontendAction::BeginSourceFile() and the results should be consumed after
/// FrontendAction::Execute() finishes.
class TokenCollector {
public:
  /// Adds the hooks to collect the tokens. Should be called before the
  /// preprocessing starts, i.e. as a part of BeginSourceFile() or
  /// CreateASTConsumer().
  TokenCollector(Preprocessor &P);

  /// Finalizes token collection. Should be called after preprocessing is
  /// finished, i.e. after running Execute().
  [[nodiscard]] TokenBuffer consume() &&;
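  // Example (illustrative sketch): typical lifecycle inside a frontend action.
  // `CI` is a CompilerInstance; the surrounding action is left out and the
  // variable names are hypothetical.
  //
  //   // In BeginSourceFile() / CreateASTConsumer():
  //   auto Collector =
  //       std::make_unique<syntax::TokenCollector>(CI.getPreprocessor());
  //   // ... Execute() runs and drives the preprocessor ...
  //   // After Execute() finishes:
  //   syntax::TokenBuffer Tokens = std::move(*Collector).consume();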

private:
  /// Maps from a start to an end spelling location of transformations
  /// performed by the preprocessor. These include:
  ///   1. range from '#' to the last token in the line for PP directives,
  ///   2. macro name and arguments for macro expansions.
  /// Note that we record only top-level macro expansions; intermediate
  /// expansions (e.g. inside macro arguments) are ignored.
  ///
  /// Used to find correct boundaries of macro calls and directives when
  /// building mappings from spelled to expanded tokens.
  ///
  /// Logically, at each point of the preprocessor execution there is a stack
  /// of macro expansions being processed and we could use it to recover the
  /// location information we need. However, the public preprocessor API only
  /// exposes the points when macro expansions start (when we push a macro onto
  /// the stack) and not when they end (when we pop a macro from the stack).
  /// To work around this limitation, we rely on source location information
  /// stored in this map.
  using PPExpansions = llvm::DenseMap<SourceLocation, SourceLocation>;
  class Builder;
  class CollectPPExpansions;

  std::vector<syntax::Token> Expanded;
  // FIXME: we only store macro expansions, also add directives (#pragma, etc.)
  PPExpansions Expansions;
  Preprocessor &PP;
  CollectPPExpansions *Collector;
};

} // namespace syntax
} // namespace clang

#endif