Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line | 
|---|---|---|---|
| 14 | pmbaty | 1 | //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// | 
| 2 | // | ||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | ||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| 6 | // | ||
| 7 | //===----------------------------------------------------------------------===// | ||
| 8 | // | ||
| 9 | //  This file defines the lexer for structured comments and its token class. | ||
| 10 | // | ||
| 11 | //===----------------------------------------------------------------------===// | ||
| 12 | |||
| 13 | #ifndef LLVM_CLANG_AST_COMMENTLEXER_H | ||
| 14 | #define LLVM_CLANG_AST_COMMENTLEXER_H | ||
| 15 | |||
| 16 | #include "clang/Basic/Diagnostic.h" | ||
| 17 | #include "clang/Basic/SourceManager.h" | ||
| 18 | #include "llvm/ADT/SmallString.h" | ||
| 19 | #include "llvm/ADT/StringRef.h" | ||
| 20 | #include "llvm/Support/Allocator.h" | ||
| 21 | #include "llvm/Support/raw_ostream.h" | ||
| 22 | |||
| 23 | namespace clang { | ||
| 24 | namespace comments { | ||
| 25 | |||
| 26 | class Lexer; | ||
| 27 | class TextTokenRetokenizer; | ||
| 28 | struct CommandInfo; | ||
| 29 | class CommandTraits; | ||
| 30 | |||
namespace tok {
/// Kinds of tokens that can appear in a structured comment.
///
/// The enumerators are declared in a fixed order and carry no explicit
/// values, so their numeric values follow declaration order starting at 0.
enum TokenKind {
  // Plain structure.  A 'text' token carries its text as payload.
  eof,
  newline,
  text,

  // Commands.
  // Command that does not have an ID; carries the command name as payload.
  unknown_command,
  // Command with an ID, that used backslash marker.
  backslash_command,
  // Command with an ID, that used 'at' marker.
  at_command,

  // Verbatim blocks.  The begin/end tokens carry a verbatim block ID, the
  // line token carries the text of the line.
  verbatim_block_begin,
  verbatim_block_line,
  verbatim_block_end,

  // Verbatim lines.  The name token carries a verbatim line ID, the text
  // token carries the text.
  verbatim_line_name,
  verbatim_line_text,

  // HTML.  The spelling each kind stands for is shown above the enumerator.
  // <tag
  html_start_tag,
  // attr
  html_ident,
  // =
  html_equals,
  // "blah\"blah" or 'blah\'blah'
  html_quoted_string,
  // >
  html_greater,
  // />
  html_slash_greater,
  // </tag
  html_end_tag
};
} // end namespace tok
| 53 | |||
| 54 | /// Comment token. | ||
| 55 | class Token { | ||
| 56 | friend class Lexer; | ||
| 57 | friend class TextTokenRetokenizer; | ||
| 58 | |||
| 59 |   /// The location of the token. | ||
| 60 |   SourceLocation Loc; | ||
| 61 | |||
| 62 |   /// The actual kind of the token. | ||
| 63 | tok::TokenKind Kind; | ||
| 64 | |||
| 65 |   /// Integer value associated with a token. | ||
| 66 |   /// | ||
| 67 |   /// If the token is a known command, contains command ID and TextPtr is | ||
| 68 |   /// unused (command spelling can be found with CommandTraits).  Otherwise, | ||
| 69 |   /// contains the length of the string that starts at TextPtr. | ||
| 70 | unsigned IntVal; | ||
| 71 | |||
| 72 |   /// Length of the token spelling in comment.  Can be 0 for synthenized | ||
| 73 |   /// tokens. | ||
| 74 | unsigned Length; | ||
| 75 | |||
| 76 |   /// Contains text value associated with a token. | ||
| 77 | const char *TextPtr; | ||
| 78 | |||
| 79 | public: | ||
| 80 | SourceLocation getLocation() const LLVM_READONLY { return Loc; } | ||
| 81 | void setLocation(SourceLocation SL) { Loc = SL; } | ||
| 82 | |||
| 83 | SourceLocation getEndLocation() const LLVM_READONLY { | ||
| 84 | if (Length == 0 || Length == 1) | ||
| 85 | return Loc; | ||
| 86 | return Loc.getLocWithOffset(Length - 1); | ||
| 87 |   } | ||
| 88 | |||
| 89 | tok::TokenKind getKind() const LLVM_READONLY { return Kind; } | ||
| 90 | void setKind(tok::TokenKind K) { Kind = K; } | ||
| 91 | |||
| 92 | bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } | ||
| 93 | bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } | ||
| 94 | |||
| 95 | unsigned getLength() const LLVM_READONLY { return Length; } | ||
| 96 | void setLength(unsigned L) { Length = L; } | ||
| 97 | |||
| 98 | StringRef getText() const LLVM_READONLY { | ||
| 99 | assert(is(tok::text)); | ||
| 100 | return StringRef(TextPtr, IntVal); | ||
| 101 |   } | ||
| 102 | |||
| 103 | void setText(StringRef Text) { | ||
| 104 | assert(is(tok::text)); | ||
| 105 | TextPtr = Text.data(); | ||
| 106 | IntVal = Text.size(); | ||
| 107 |   } | ||
| 108 | |||
| 109 | StringRef getUnknownCommandName() const LLVM_READONLY { | ||
| 110 | assert(is(tok::unknown_command)); | ||
| 111 | return StringRef(TextPtr, IntVal); | ||
| 112 |   } | ||
| 113 | |||
| 114 | void setUnknownCommandName(StringRef Name) { | ||
| 115 | assert(is(tok::unknown_command)); | ||
| 116 | TextPtr = Name.data(); | ||
| 117 | IntVal = Name.size(); | ||
| 118 |   } | ||
| 119 | |||
| 120 | unsigned getCommandID() const LLVM_READONLY { | ||
| 121 | assert(is(tok::backslash_command) || is(tok::at_command)); | ||
| 122 | return IntVal; | ||
| 123 |   } | ||
| 124 | |||
| 125 | void setCommandID(unsigned ID) { | ||
| 126 | assert(is(tok::backslash_command) || is(tok::at_command)); | ||
| 127 | IntVal = ID; | ||
| 128 |   } | ||
| 129 | |||
| 130 | unsigned getVerbatimBlockID() const LLVM_READONLY { | ||
| 131 | assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); | ||
| 132 | return IntVal; | ||
| 133 |   } | ||
| 134 | |||
| 135 | void setVerbatimBlockID(unsigned ID) { | ||
| 136 | assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); | ||
| 137 | IntVal = ID; | ||
| 138 |   } | ||
| 139 | |||
| 140 | StringRef getVerbatimBlockText() const LLVM_READONLY { | ||
| 141 | assert(is(tok::verbatim_block_line)); | ||
| 142 | return StringRef(TextPtr, IntVal); | ||
| 143 |   } | ||
| 144 | |||
| 145 | void setVerbatimBlockText(StringRef Text) { | ||
| 146 | assert(is(tok::verbatim_block_line)); | ||
| 147 | TextPtr = Text.data(); | ||
| 148 | IntVal = Text.size(); | ||
| 149 |   } | ||
| 150 | |||
| 151 | unsigned getVerbatimLineID() const LLVM_READONLY { | ||
| 152 | assert(is(tok::verbatim_line_name)); | ||
| 153 | return IntVal; | ||
| 154 |   } | ||
| 155 | |||
| 156 | void setVerbatimLineID(unsigned ID) { | ||
| 157 | assert(is(tok::verbatim_line_name)); | ||
| 158 | IntVal = ID; | ||
| 159 |   } | ||
| 160 | |||
| 161 | StringRef getVerbatimLineText() const LLVM_READONLY { | ||
| 162 | assert(is(tok::verbatim_line_text)); | ||
| 163 | return StringRef(TextPtr, IntVal); | ||
| 164 |   } | ||
| 165 | |||
| 166 | void setVerbatimLineText(StringRef Text) { | ||
| 167 | assert(is(tok::verbatim_line_text)); | ||
| 168 | TextPtr = Text.data(); | ||
| 169 | IntVal = Text.size(); | ||
| 170 |   } | ||
| 171 | |||
| 172 | StringRef getHTMLTagStartName() const LLVM_READONLY { | ||
| 173 | assert(is(tok::html_start_tag)); | ||
| 174 | return StringRef(TextPtr, IntVal); | ||
| 175 |   } | ||
| 176 | |||
| 177 | void setHTMLTagStartName(StringRef Name) { | ||
| 178 | assert(is(tok::html_start_tag)); | ||
| 179 | TextPtr = Name.data(); | ||
| 180 | IntVal = Name.size(); | ||
| 181 |   } | ||
| 182 | |||
| 183 | StringRef getHTMLIdent() const LLVM_READONLY { | ||
| 184 | assert(is(tok::html_ident)); | ||
| 185 | return StringRef(TextPtr, IntVal); | ||
| 186 |   } | ||
| 187 | |||
| 188 | void setHTMLIdent(StringRef Name) { | ||
| 189 | assert(is(tok::html_ident)); | ||
| 190 | TextPtr = Name.data(); | ||
| 191 | IntVal = Name.size(); | ||
| 192 |   } | ||
| 193 | |||
| 194 | StringRef getHTMLQuotedString() const LLVM_READONLY { | ||
| 195 | assert(is(tok::html_quoted_string)); | ||
| 196 | return StringRef(TextPtr, IntVal); | ||
| 197 |   } | ||
| 198 | |||
| 199 | void setHTMLQuotedString(StringRef Str) { | ||
| 200 | assert(is(tok::html_quoted_string)); | ||
| 201 | TextPtr = Str.data(); | ||
| 202 | IntVal = Str.size(); | ||
| 203 |   } | ||
| 204 | |||
| 205 | StringRef getHTMLTagEndName() const LLVM_READONLY { | ||
| 206 | assert(is(tok::html_end_tag)); | ||
| 207 | return StringRef(TextPtr, IntVal); | ||
| 208 |   } | ||
| 209 | |||
| 210 | void setHTMLTagEndName(StringRef Name) { | ||
| 211 | assert(is(tok::html_end_tag)); | ||
| 212 | TextPtr = Name.data(); | ||
| 213 | IntVal = Name.size(); | ||
| 214 |   } | ||
| 215 | |||
| 216 | void dump(const Lexer &L, const SourceManager &SM) const; | ||
| 217 | }; | ||
| 218 | |||
| 219 | /// Comment lexer. | ||
| 220 | class Lexer { | ||
| 221 | private: | ||
| 222 | Lexer(const Lexer &) = delete; | ||
| 223 | void operator=(const Lexer &) = delete; | ||
| 224 | |||
| 225 |   /// Allocator for strings that are semantic values of tokens and have to be | ||
| 226 |   /// computed (for example, resolved decimal character references). | ||
| 227 | llvm::BumpPtrAllocator &Allocator; | ||
| 228 | |||
| 229 | DiagnosticsEngine &Diags; | ||
| 230 | |||
| 231 | const CommandTraits &Traits; | ||
| 232 | |||
| 233 | const char *const BufferStart; | ||
| 234 | const char *const BufferEnd; | ||
| 235 | |||
| 236 | const char *BufferPtr; | ||
| 237 | |||
| 238 |   /// One past end pointer for the current comment.  For BCPL comments points | ||
| 239 |   /// to newline or BufferEnd, for C comments points to star in '*/'. | ||
| 240 | const char *CommentEnd; | ||
| 241 | |||
| 242 |   SourceLocation FileLoc; | ||
| 243 | |||
| 244 |   /// If true, the commands, html tags, etc will be parsed and reported as | ||
| 245 |   /// separate tokens inside the comment body. If false, the comment text will | ||
| 246 |   /// be parsed into text and newline tokens. | ||
| 247 | bool ParseCommands; | ||
| 248 | |||
| 249 | enum LexerCommentState : uint8_t { | ||
| 250 | LCS_BeforeComment, | ||
| 251 | LCS_InsideBCPLComment, | ||
| 252 | LCS_InsideCComment, | ||
| 253 | LCS_BetweenComments | ||
| 254 | }; | ||
| 255 | |||
| 256 |   /// Low-level lexer state, track if we are inside or outside of comment. | ||
| 257 |   LexerCommentState CommentState; | ||
| 258 | |||
| 259 | enum LexerState : uint8_t { | ||
| 260 |     /// Lexing normal comment text | ||
| 261 | LS_Normal, | ||
| 262 | |||
| 263 |     /// Finished lexing verbatim block beginning command, will lex first body | ||
| 264 |     /// line. | ||
| 265 | LS_VerbatimBlockFirstLine, | ||
| 266 | |||
| 267 |     /// Lexing verbatim block body line-by-line, skipping line-starting | ||
| 268 |     /// decorations. | ||
| 269 | LS_VerbatimBlockBody, | ||
| 270 | |||
| 271 |     /// Finished lexing verbatim line beginning command, will lex text (one | ||
| 272 |     /// line). | ||
| 273 | LS_VerbatimLineText, | ||
| 274 | |||
| 275 |     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. | ||
| 276 | LS_HTMLStartTag, | ||
| 277 | |||
| 278 |     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. | ||
| 279 | LS_HTMLEndTag | ||
| 280 | }; | ||
| 281 | |||
| 282 |   /// Current lexing mode. | ||
| 283 |   LexerState State; | ||
| 284 | |||
| 285 |   /// If State is LS_VerbatimBlock, contains the name of verbatim end | ||
| 286 |   /// command, including command marker. | ||
| 287 | SmallString<16> VerbatimBlockEndCommandName; | ||
| 288 | |||
| 289 |   /// Given a character reference name (e.g., "lt"), return the character that | ||
| 290 |   /// it stands for (e.g., "<"). | ||
| 291 | StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; | ||
| 292 | |||
| 293 |   /// Given a Unicode codepoint as base-10 integer, return the character. | ||
| 294 | StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; | ||
| 295 | |||
| 296 |   /// Given a Unicode codepoint as base-16 integer, return the character. | ||
| 297 | StringRef resolveHTMLHexCharacterReference(StringRef Name) const; | ||
| 298 | |||
| 299 | void formTokenWithChars(Token &Result, const char *TokEnd, | ||
| 300 | tok::TokenKind Kind); | ||
| 301 | |||
| 302 | void formTextToken(Token &Result, const char *TokEnd) { | ||
| 303 | StringRef Text(BufferPtr, TokEnd - BufferPtr); | ||
| 304 | formTokenWithChars(Result, TokEnd, tok::text); | ||
| 305 | Result.setText(Text); | ||
| 306 |   } | ||
| 307 | |||
| 308 | SourceLocation getSourceLocation(const char *Loc) const { | ||
| 309 | assert(Loc >= BufferStart && Loc <= BufferEnd && | ||
| 310 | "Location out of range for this buffer!"); | ||
| 311 | |||
| 312 | const unsigned CharNo = Loc - BufferStart; | ||
| 313 | return FileLoc.getLocWithOffset(CharNo); | ||
| 314 |   } | ||
| 315 | |||
| 316 | DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { | ||
| 317 | return Diags.Report(Loc, DiagID); | ||
| 318 |   } | ||
| 319 | |||
| 320 |   /// Eat string matching regexp \code \s*\* \endcode. | ||
| 321 | void skipLineStartingDecorations(); | ||
| 322 | |||
| 323 |   /// Skip over pure text. | ||
| 324 | const char *skipTextToken(); | ||
| 325 | |||
| 326 |   /// Lex comment text, including commands if ParseCommands is set to true. | ||
| 327 | void lexCommentText(Token &T); | ||
| 328 | |||
| 329 | void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker, | ||
| 330 | const CommandInfo *Info); | ||
| 331 | |||
| 332 | void lexVerbatimBlockFirstLine(Token &T); | ||
| 333 | |||
| 334 | void lexVerbatimBlockBody(Token &T); | ||
| 335 | |||
| 336 | void setupAndLexVerbatimLine(Token &T, const char *TextBegin, | ||
| 337 | const CommandInfo *Info); | ||
| 338 | |||
| 339 | void lexVerbatimLineText(Token &T); | ||
| 340 | |||
| 341 | void lexHTMLCharacterReference(Token &T); | ||
| 342 | |||
| 343 | void setupAndLexHTMLStartTag(Token &T); | ||
| 344 | |||
| 345 | void lexHTMLStartTag(Token &T); | ||
| 346 | |||
| 347 | void setupAndLexHTMLEndTag(Token &T); | ||
| 348 | |||
| 349 | void lexHTMLEndTag(Token &T); | ||
| 350 | |||
| 351 | public: | ||
| 352 | Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, | ||
| 353 | const CommandTraits &Traits, SourceLocation FileLoc, | ||
| 354 | const char *BufferStart, const char *BufferEnd, | ||
| 355 | bool ParseCommands = true); | ||
| 356 | |||
| 357 | void lex(Token &T); | ||
| 358 | |||
| 359 | StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const; | ||
| 360 | }; | ||
| 361 | |||
| 362 | } // end namespace comments | ||
| 363 | } // end namespace clang | ||
| 364 | |||
| 365 | #endif | ||
| 366 |