Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line | 
|---|---|---|---|
| 14 | pmbaty | 1 | //===--- Token.h - Token interface ------------------------------*- C++ -*-===// | 
| 2 | // | ||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | ||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| 6 | // | ||
| 7 | //===----------------------------------------------------------------------===// | ||
| 8 | // | ||
| 9 | //  This file defines the Token interface. | ||
| 10 | // | ||
| 11 | //===----------------------------------------------------------------------===// | ||
| 12 | |||
| 13 | #ifndef LLVM_CLANG_LEX_TOKEN_H | ||
| 14 | #define LLVM_CLANG_LEX_TOKEN_H | ||
| 15 | |||
| 16 | #include "clang/Basic/SourceLocation.h" | ||
| 17 | #include "clang/Basic/TokenKinds.h" | ||
| 18 | #include "llvm/ADT/ArrayRef.h" | ||
| 19 | #include "llvm/ADT/StringRef.h" | ||
| 20 | #include <cassert> | ||
| 21 | |||
| 22 | namespace clang { | ||
| 23 | |||
| 24 | class IdentifierInfo; | ||
| 25 | |||
| 26 | /// Token - This structure provides full information about a lexed token. | ||
| 27 | /// It is not intended to be space efficient, it is intended to return as much | ||
| 28 | /// information as possible about each returned token.  This is expected to be | ||
| 29 | /// compressed into a smaller form if memory footprint is important. | ||
| 30 | /// | ||
| 31 | /// The parser can create a special "annotation token" representing a stream of | ||
| 32 | /// tokens that were parsed and semantically resolved, e.g.: "foo::MyClass<int>" | ||
| 33 | /// can be represented by a single typename annotation token that carries | ||
| 34 | /// information about the SourceRange of the tokens and the type object. | ||
| 35 | class Token { | ||
| 36 |   /// The location of the token. This is actually a SourceLocation. | ||
| 37 | SourceLocation::UIntTy Loc; | ||
| 38 | |||
| 39 |   // Conceptually these next two fields could be in a union.  However, this | ||
| 40 |   // causes gcc 4.2 to pessimize LexTokenInternal, a very performance critical | ||
| 41 |   // routine. Keeping as separate members with casts until a more beautiful fix | ||
| 42 |   // presents itself. | ||
| 43 | |||
| 44 |   /// UintData - This holds either the length of the token text, when | ||
| 45 |   /// a normal token, or the end of the SourceRange when an annotation | ||
| 46 |   /// token. | ||
| 47 | SourceLocation::UIntTy UintData; | ||
| 48 | |||
| 49 |   /// PtrData - This is a union of four different pointer types, which depends | ||
| 50 |   /// on what type of token this is: | ||
| 51 |   ///  Identifiers, keywords, etc: | ||
| 52 |   ///    This is an IdentifierInfo*, which contains the uniqued identifier | ||
| 53 |   ///    spelling. | ||
| 54 |   ///  Literals:  isLiteral() returns true. | ||
| 55 |   ///    This is a pointer to the start of the token in a text buffer, which | ||
| 56 |   ///    may be dirty (have trigraphs / escaped newlines). | ||
| 57 |   ///  Annotations (resolved type names, C++ scopes, etc): isAnnotation(). | ||
| 58 |   ///    This is a pointer to sema-specific data for the annotation token. | ||
| 59 |   ///  Eof: | ||
| 60 |   //     This is a pointer to a Decl. | ||
| 61 |   ///  Other: | ||
| 62 |   ///    This is null. | ||
| 63 | void *PtrData; | ||
| 64 | |||
| 65 |   /// Kind - The actual flavor of token this is. | ||
| 66 | tok::TokenKind Kind; | ||
| 67 | |||
| 68 |   /// Flags - Bits we track about this token, members of the TokenFlags enum. | ||
| 69 | unsigned short Flags; | ||
| 70 | |||
| 71 | public: | ||
| 72 |   // Various flags set per token: | ||
| 73 | enum TokenFlags { | ||
| 74 | StartOfLine = 0x01, // At start of line or only after whitespace | ||
| 75 |                           // (considering the line after macro expansion). | ||
| 76 | LeadingSpace = 0x02, // Whitespace exists before this token (considering | ||
| 77 |                           // whitespace after macro expansion). | ||
| 78 | DisableExpand = 0x04, // This identifier may never be macro expanded. | ||
| 79 | NeedsCleaning = 0x08, // Contained an escaped newline or trigraph. | ||
| 80 | LeadingEmptyMacro = 0x10, // Empty macro exists before this token. | ||
| 81 | HasUDSuffix = 0x20, // This string or character literal has a ud-suffix. | ||
| 82 | HasUCN = 0x40, // This identifier contains a UCN. | ||
| 83 | IgnoredComma = 0x80, // This comma is not a macro argument separator (MS). | ||
| 84 | StringifiedInMacro = 0x100, // This string or character literal is formed by | ||
| 85 |                                 // macro stringizing or charizing operator. | ||
| 86 | CommaAfterElided = 0x200, // The comma following this token was elided (MS). | ||
| 87 | IsEditorPlaceholder = 0x400, // This identifier is a placeholder. | ||
| 88 | IsReinjected = 0x800, // A phase 4 token that was produced before and | ||
| 89 |                           // re-added, e.g. via EnterTokenStream. Annotation | ||
| 90 |                           // tokens are *not* reinjected. | ||
| 91 | }; | ||
| 92 | |||
| 93 | tok::TokenKind getKind() const { return Kind; } | ||
| 94 | void setKind(tok::TokenKind K) { Kind = K; } | ||
| 95 | |||
| 96 |   /// is/isNot - Predicates to check if this token is a specific kind, as in | ||
| 97 |   /// "if (Tok.is(tok::l_brace)) {...}". | ||
| 98 | bool is(tok::TokenKind K) const { return Kind == K; } | ||
| 99 | bool isNot(tok::TokenKind K) const { return Kind != K; } | ||
| 100 | bool isOneOf(tok::TokenKind K1, tok::TokenKind K2) const { | ||
| 101 | return is(K1) || is(K2); | ||
| 102 |   } | ||
| 103 | template <typename... Ts> bool isOneOf(tok::TokenKind K1, Ts... Ks) const { | ||
| 104 | return is(K1) || isOneOf(Ks...); | ||
| 105 |   } | ||
| 106 | |||
| 107 |   /// Return true if this is a raw identifier (when lexing | ||
| 108 |   /// in raw mode) or a non-keyword identifier (when lexing in non-raw mode). | ||
| 109 | bool isAnyIdentifier() const { | ||
| 110 | return tok::isAnyIdentifier(getKind()); | ||
| 111 |   } | ||
| 112 | |||
| 113 |   /// Return true if this is a "literal", like a numeric | ||
| 114 |   /// constant, string, etc. | ||
| 115 | bool isLiteral() const { | ||
| 116 | return tok::isLiteral(getKind()); | ||
| 117 |   } | ||
| 118 | |||
| 119 |   /// Return true if this is any of tok::annot_* kind tokens. | ||
| 120 | bool isAnnotation() const { | ||
| 121 | return tok::isAnnotation(getKind()); | ||
| 122 |   } | ||
| 123 | |||
| 124 |   /// Return a source location identifier for the specified | ||
| 125 |   /// offset in the current file. | ||
| 126 | SourceLocation getLocation() const { | ||
| 127 | return SourceLocation::getFromRawEncoding(Loc); | ||
| 128 |   } | ||
| 129 | unsigned getLength() const { | ||
| 130 | assert(!isAnnotation() && "Annotation tokens have no length field"); | ||
| 131 | return UintData; | ||
| 132 |   } | ||
| 133 | |||
| 134 | void setLocation(SourceLocation L) { Loc = L.getRawEncoding(); } | ||
| 135 | void setLength(unsigned Len) { | ||
| 136 | assert(!isAnnotation() && "Annotation tokens have no length field"); | ||
| 137 | UintData = Len; | ||
| 138 |   } | ||
| 139 | |||
| 140 | SourceLocation getAnnotationEndLoc() const { | ||
| 141 | assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token"); | ||
| 142 | return SourceLocation::getFromRawEncoding(UintData ? UintData : Loc); | ||
| 143 |   } | ||
| 144 | void setAnnotationEndLoc(SourceLocation L) { | ||
| 145 | assert(isAnnotation() && "Used AnnotEndLocID on non-annotation token"); | ||
| 146 | UintData = L.getRawEncoding(); | ||
| 147 |   } | ||
| 148 | |||
| 149 | SourceLocation getLastLoc() const { | ||
| 150 | return isAnnotation() ? getAnnotationEndLoc() : getLocation(); | ||
| 151 |   } | ||
| 152 | |||
| 153 | SourceLocation getEndLoc() const { | ||
| 154 | return isAnnotation() ? getAnnotationEndLoc() | ||
| 155 | : getLocation().getLocWithOffset(getLength()); | ||
| 156 |   } | ||
| 157 | |||
| 158 |   /// SourceRange of the group of tokens that this annotation token | ||
| 159 |   /// represents. | ||
| 160 | SourceRange getAnnotationRange() const { | ||
| 161 | return SourceRange(getLocation(), getAnnotationEndLoc()); | ||
| 162 |   } | ||
| 163 | void setAnnotationRange(SourceRange R) { | ||
| 164 | setLocation(R.getBegin()); | ||
| 165 | setAnnotationEndLoc(R.getEnd()); | ||
| 166 |   } | ||
| 167 | |||
| 168 | const char *getName() const { return tok::getTokenName(Kind); } | ||
| 169 | |||
| 170 |   /// Reset all flags to cleared. | ||
| 171 | void startToken() { | ||
| 172 | Kind = tok::unknown; | ||
| 173 | Flags = 0; | ||
| 174 | PtrData = nullptr; | ||
| 175 | UintData = 0; | ||
| 176 | Loc = SourceLocation().getRawEncoding(); | ||
| 177 |   } | ||
| 178 | |||
| 179 | bool hasPtrData() const { return PtrData != nullptr; } | ||
| 180 | |||
| 181 | IdentifierInfo *getIdentifierInfo() const { | ||
| 182 | assert(isNot(tok::raw_identifier) && | ||
| 183 | "getIdentifierInfo() on a tok::raw_identifier token!"); | ||
| 184 | assert(!isAnnotation() && | ||
| 185 | "getIdentifierInfo() on an annotation token!"); | ||
| 186 | if (isLiteral()) return nullptr; | ||
| 187 | if (is(tok::eof)) return nullptr; | ||
| 188 | return (IdentifierInfo*) PtrData; | ||
| 189 |   } | ||
| 190 | void setIdentifierInfo(IdentifierInfo *II) { | ||
| 191 | PtrData = (void*) II; | ||
| 192 |   } | ||
| 193 | |||
| 194 | const void *getEofData() const { | ||
| 195 | assert(is(tok::eof)); | ||
| 196 | return reinterpret_cast<const void *>(PtrData); | ||
| 197 |   } | ||
| 198 | void setEofData(const void *D) { | ||
| 199 | assert(is(tok::eof)); | ||
| 200 | assert(!PtrData); | ||
| 201 | PtrData = const_cast<void *>(D); | ||
| 202 |   } | ||
| 203 | |||
| 204 |   /// getRawIdentifier - For a raw identifier token (i.e., an identifier | ||
| 205 |   /// lexed in raw mode), returns a reference to the text substring in the | ||
| 206 |   /// buffer if known. | ||
| 207 | StringRef getRawIdentifier() const { | ||
| 208 | assert(is(tok::raw_identifier)); | ||
| 209 | return StringRef(reinterpret_cast<const char *>(PtrData), getLength()); | ||
| 210 |   } | ||
| 211 | void setRawIdentifierData(const char *Ptr) { | ||
| 212 | assert(is(tok::raw_identifier)); | ||
| 213 | PtrData = const_cast<char*>(Ptr); | ||
| 214 |   } | ||
| 215 | |||
| 216 |   /// getLiteralData - For a literal token (numeric constant, string, etc), this | ||
| 217 |   /// returns a pointer to the start of it in the text buffer if known, null | ||
| 218 |   /// otherwise. | ||
| 219 | const char *getLiteralData() const { | ||
| 220 | assert(isLiteral() && "Cannot get literal data of non-literal"); | ||
| 221 | return reinterpret_cast<const char*>(PtrData); | ||
| 222 |   } | ||
| 223 | void setLiteralData(const char *Ptr) { | ||
| 224 | assert(isLiteral() && "Cannot set literal data of non-literal"); | ||
| 225 | PtrData = const_cast<char*>(Ptr); | ||
| 226 |   } | ||
| 227 | |||
| 228 | void *getAnnotationValue() const { | ||
| 229 | assert(isAnnotation() && "Used AnnotVal on non-annotation token"); | ||
| 230 | return PtrData; | ||
| 231 |   } | ||
| 232 | void setAnnotationValue(void *val) { | ||
| 233 | assert(isAnnotation() && "Used AnnotVal on non-annotation token"); | ||
| 234 | PtrData = val; | ||
| 235 |   } | ||
| 236 | |||
| 237 |   /// Set the specified flag. | ||
| 238 | void setFlag(TokenFlags Flag) { | ||
| 239 | Flags |= Flag; | ||
| 240 |   } | ||
| 241 | |||
| 242 |   /// Get the specified flag. | ||
| 243 | bool getFlag(TokenFlags Flag) const { | ||
| 244 | return (Flags & Flag) != 0; | ||
| 245 |   } | ||
| 246 | |||
| 247 |   /// Unset the specified flag. | ||
| 248 | void clearFlag(TokenFlags Flag) { | ||
| 249 | Flags &= ~Flag; | ||
| 250 |   } | ||
| 251 | |||
| 252 |   /// Return the internal represtation of the flags. | ||
| 253 |   /// | ||
| 254 |   /// This is only intended for low-level operations such as writing tokens to | ||
| 255 |   /// disk. | ||
| 256 | unsigned getFlags() const { | ||
| 257 | return Flags; | ||
| 258 |   } | ||
| 259 | |||
| 260 |   /// Set a flag to either true or false. | ||
| 261 | void setFlagValue(TokenFlags Flag, bool Val) { | ||
| 262 | if (Val) | ||
| 263 | setFlag(Flag); | ||
| 264 |     else | ||
| 265 | clearFlag(Flag); | ||
| 266 |   } | ||
| 267 | |||
| 268 |   /// isAtStartOfLine - Return true if this token is at the start of a line. | ||
| 269 |   /// | ||
| 270 | bool isAtStartOfLine() const { return getFlag(StartOfLine); } | ||
| 271 | |||
| 272 |   /// Return true if this token has whitespace before it. | ||
| 273 |   /// | ||
| 274 | bool hasLeadingSpace() const { return getFlag(LeadingSpace); } | ||
| 275 | |||
| 276 |   /// Return true if this identifier token should never | ||
| 277 |   /// be expanded in the future, due to C99 6.10.3.4p2. | ||
| 278 | bool isExpandDisabled() const { return getFlag(DisableExpand); } | ||
| 279 | |||
| 280 |   /// Return true if we have an ObjC keyword identifier. | ||
| 281 | bool isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const; | ||
| 282 | |||
| 283 |   /// Return the ObjC keyword kind. | ||
| 284 | tok::ObjCKeywordKind getObjCKeywordID() const; | ||
| 285 | |||
| 286 |   /// Return true if this token has trigraphs or escaped newlines in it. | ||
| 287 | bool needsCleaning() const { return getFlag(NeedsCleaning); } | ||
| 288 | |||
| 289 |   /// Return true if this token has an empty macro before it. | ||
| 290 |   /// | ||
| 291 | bool hasLeadingEmptyMacro() const { return getFlag(LeadingEmptyMacro); } | ||
| 292 | |||
| 293 |   /// Return true if this token is a string or character literal which | ||
| 294 |   /// has a ud-suffix. | ||
| 295 | bool hasUDSuffix() const { return getFlag(HasUDSuffix); } | ||
| 296 | |||
| 297 |   /// Returns true if this token contains a universal character name. | ||
| 298 | bool hasUCN() const { return getFlag(HasUCN); } | ||
| 299 | |||
| 300 |   /// Returns true if this token is formed by macro by stringizing or charizing | ||
| 301 |   /// operator. | ||
| 302 | bool stringifiedInMacro() const { return getFlag(StringifiedInMacro); } | ||
| 303 | |||
| 304 |   /// Returns true if the comma after this token was elided. | ||
| 305 | bool commaAfterElided() const { return getFlag(CommaAfterElided); } | ||
| 306 | |||
| 307 |   /// Returns true if this token is an editor placeholder. | ||
| 308 |   /// | ||
| 309 |   /// Editor placeholders are produced by the code-completion engine and are | ||
| 310 |   /// represented as characters between '<#' and '#>' in the source code. The | ||
| 311 |   /// lexer uses identifier tokens to represent placeholders. | ||
| 312 | bool isEditorPlaceholder() const { return getFlag(IsEditorPlaceholder); } | ||
| 313 | }; | ||
| 314 | |||
| 315 | /// Information about the conditional stack (\#if directives) | ||
| 316 | /// currently active. | ||
| 317 | struct PPConditionalInfo { | ||
| 318 |   /// Location where the conditional started. | ||
| 319 |   SourceLocation IfLoc; | ||
| 320 | |||
| 321 |   /// True if this was contained in a skipping directive, e.g., | ||
| 322 |   /// in a "\#if 0" block. | ||
| 323 | bool WasSkipping; | ||
| 324 | |||
| 325 |   /// True if we have emitted tokens already, and now we're in | ||
| 326 |   /// an \#else block or something.  Only useful in Skipping blocks. | ||
| 327 | bool FoundNonSkip; | ||
| 328 | |||
| 329 |   /// True if we've seen a \#else in this block.  If so, | ||
| 330 |   /// \#elif/\#else directives are not allowed. | ||
| 331 | bool FoundElse; | ||
| 332 | }; | ||
| 333 | |||
| 334 | // Extra information needed for annonation tokens. | ||
| 335 | struct PragmaLoopHintInfo { | ||
| 336 |   Token PragmaName; | ||
| 337 |   Token Option; | ||
| 338 | ArrayRef<Token> Toks; | ||
| 339 | }; | ||
| 340 | } // end namespace clang | ||
| 341 | |||
| 342 | #endif // LLVM_CLANG_LEX_TOKEN_H |