Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line |
|---|---|---|---|
| 14 | pmbaty | 1 | //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// |
| 2 | // |
||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
| 4 | // See https://llvm.org/LICENSE.txt for license information. |
||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
| 6 | // |
||
| 7 | //===----------------------------------------------------------------------===// |
||
| 8 | // |
||
| 9 | // This file defines the lexer for structured comments and the supporting token class. |
||
| 10 | // |
||
| 11 | //===----------------------------------------------------------------------===// |
||
| 12 | |||
| 13 | #ifndef LLVM_CLANG_AST_COMMENTLEXER_H |
||
| 14 | #define LLVM_CLANG_AST_COMMENTLEXER_H |
||
| 15 | |||
| 16 | #include "clang/Basic/Diagnostic.h" |
||
| 17 | #include "clang/Basic/SourceManager.h" |
||
| 18 | #include "llvm/ADT/SmallString.h" |
||
| 19 | #include "llvm/ADT/StringRef.h" |
||
| 20 | #include "llvm/Support/Allocator.h" |
||
| 21 | #include "llvm/Support/raw_ostream.h" |
||
| 22 | |||
| 23 | namespace clang { |
||
| 24 | namespace comments { |
||
| 25 | |||
| 26 | class Lexer; |
||
| 27 | class TextTokenRetokenizer; |
||
| 28 | struct CommandInfo; |
||
| 29 | class CommandTraits; |
||
| 30 | |||
| 31 | namespace tok { |
||
| 32 | enum TokenKind { |
||
| 33 | eof, |
||
| 34 | newline, |
||
| 35 | text, |
||
| 36 | unknown_command, // Command that does not have an ID. |
||
| 37 | backslash_command, // Command with an ID, that used backslash marker. |
||
| 38 | at_command, // Command with an ID, that used 'at' marker. |
||
| 39 | verbatim_block_begin, |
||
| 40 | verbatim_block_line, |
||
| 41 | verbatim_block_end, |
||
| 42 | verbatim_line_name, |
||
| 43 | verbatim_line_text, |
||
| 44 | html_start_tag, // <tag |
||
| 45 | html_ident, // attr |
||
| 46 | html_equals, // = |
||
| 47 | html_quoted_string, // "blah\"blah" or 'blah\'blah' |
||
| 48 | html_greater, // > |
||
| 49 | html_slash_greater, // /> |
||
| 50 | html_end_tag // </tag |
||
| 51 | }; |
||
| 52 | } // end namespace tok |
||
| 53 | |||
| 54 | /// Comment token. |
||
| 55 | class Token { |
||
| 56 | friend class Lexer; |
||
| 57 | friend class TextTokenRetokenizer; |
||
| 58 | |||
| 59 | /// The location of the token. |
||
| 60 | SourceLocation Loc; |
||
| 61 | |||
| 62 | /// The actual kind of the token. |
||
| 63 | tok::TokenKind Kind; |
||
| 64 | |||
| 65 | /// Integer value associated with a token. |
||
| 66 | /// |
||
| 67 | /// If the token is a known command, contains command ID and TextPtr is |
||
| 68 | /// unused (command spelling can be found with CommandTraits). Otherwise, |
||
| 69 | /// contains the length of the string that starts at TextPtr. |
||
| 70 | unsigned IntVal; |
||
| 71 | |||
| 72 | /// Length of the token spelling in the comment. Can be 0 for synthesized |
||
| 73 | /// tokens. |
||
| 74 | unsigned Length; |
||
| 75 | |||
| 76 | /// Contains text value associated with a token. |
||
| 77 | const char *TextPtr; |
||
| 78 | |||
| 79 | public: |
||
| 80 | SourceLocation getLocation() const LLVM_READONLY { return Loc; } |
||
| 81 | void setLocation(SourceLocation SL) { Loc = SL; } |
||
| 82 | |||
| 83 | SourceLocation getEndLocation() const LLVM_READONLY { |
||
| 84 | if (Length == 0 || Length == 1) |
||
| 85 | return Loc; |
||
| 86 | return Loc.getLocWithOffset(Length - 1); |
||
| 87 | } |
||
| 88 | |||
| 89 | tok::TokenKind getKind() const LLVM_READONLY { return Kind; } |
||
| 90 | void setKind(tok::TokenKind K) { Kind = K; } |
||
| 91 | |||
| 92 | bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } |
||
| 93 | bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } |
||
| 94 | |||
| 95 | unsigned getLength() const LLVM_READONLY { return Length; } |
||
| 96 | void setLength(unsigned L) { Length = L; } |
||
| 97 | |||
| 98 | StringRef getText() const LLVM_READONLY { |
||
| 99 | assert(is(tok::text)); |
||
| 100 | return StringRef(TextPtr, IntVal); |
||
| 101 | } |
||
| 102 | |||
| 103 | void setText(StringRef Text) { |
||
| 104 | assert(is(tok::text)); |
||
| 105 | TextPtr = Text.data(); |
||
| 106 | IntVal = Text.size(); |
||
| 107 | } |
||
| 108 | |||
| 109 | StringRef getUnknownCommandName() const LLVM_READONLY { |
||
| 110 | assert(is(tok::unknown_command)); |
||
| 111 | return StringRef(TextPtr, IntVal); |
||
| 112 | } |
||
| 113 | |||
| 114 | void setUnknownCommandName(StringRef Name) { |
||
| 115 | assert(is(tok::unknown_command)); |
||
| 116 | TextPtr = Name.data(); |
||
| 117 | IntVal = Name.size(); |
||
| 118 | } |
||
| 119 | |||
| 120 | unsigned getCommandID() const LLVM_READONLY { |
||
| 121 | assert(is(tok::backslash_command) || is(tok::at_command)); |
||
| 122 | return IntVal; |
||
| 123 | } |
||
| 124 | |||
| 125 | void setCommandID(unsigned ID) { |
||
| 126 | assert(is(tok::backslash_command) || is(tok::at_command)); |
||
| 127 | IntVal = ID; |
||
| 128 | } |
||
| 129 | |||
| 130 | unsigned getVerbatimBlockID() const LLVM_READONLY { |
||
| 131 | assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); |
||
| 132 | return IntVal; |
||
| 133 | } |
||
| 134 | |||
| 135 | void setVerbatimBlockID(unsigned ID) { |
||
| 136 | assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); |
||
| 137 | IntVal = ID; |
||
| 138 | } |
||
| 139 | |||
| 140 | StringRef getVerbatimBlockText() const LLVM_READONLY { |
||
| 141 | assert(is(tok::verbatim_block_line)); |
||
| 142 | return StringRef(TextPtr, IntVal); |
||
| 143 | } |
||
| 144 | |||
| 145 | void setVerbatimBlockText(StringRef Text) { |
||
| 146 | assert(is(tok::verbatim_block_line)); |
||
| 147 | TextPtr = Text.data(); |
||
| 148 | IntVal = Text.size(); |
||
| 149 | } |
||
| 150 | |||
| 151 | unsigned getVerbatimLineID() const LLVM_READONLY { |
||
| 152 | assert(is(tok::verbatim_line_name)); |
||
| 153 | return IntVal; |
||
| 154 | } |
||
| 155 | |||
| 156 | void setVerbatimLineID(unsigned ID) { |
||
| 157 | assert(is(tok::verbatim_line_name)); |
||
| 158 | IntVal = ID; |
||
| 159 | } |
||
| 160 | |||
| 161 | StringRef getVerbatimLineText() const LLVM_READONLY { |
||
| 162 | assert(is(tok::verbatim_line_text)); |
||
| 163 | return StringRef(TextPtr, IntVal); |
||
| 164 | } |
||
| 165 | |||
| 166 | void setVerbatimLineText(StringRef Text) { |
||
| 167 | assert(is(tok::verbatim_line_text)); |
||
| 168 | TextPtr = Text.data(); |
||
| 169 | IntVal = Text.size(); |
||
| 170 | } |
||
| 171 | |||
| 172 | StringRef getHTMLTagStartName() const LLVM_READONLY { |
||
| 173 | assert(is(tok::html_start_tag)); |
||
| 174 | return StringRef(TextPtr, IntVal); |
||
| 175 | } |
||
| 176 | |||
| 177 | void setHTMLTagStartName(StringRef Name) { |
||
| 178 | assert(is(tok::html_start_tag)); |
||
| 179 | TextPtr = Name.data(); |
||
| 180 | IntVal = Name.size(); |
||
| 181 | } |
||
| 182 | |||
| 183 | StringRef getHTMLIdent() const LLVM_READONLY { |
||
| 184 | assert(is(tok::html_ident)); |
||
| 185 | return StringRef(TextPtr, IntVal); |
||
| 186 | } |
||
| 187 | |||
| 188 | void setHTMLIdent(StringRef Name) { |
||
| 189 | assert(is(tok::html_ident)); |
||
| 190 | TextPtr = Name.data(); |
||
| 191 | IntVal = Name.size(); |
||
| 192 | } |
||
| 193 | |||
| 194 | StringRef getHTMLQuotedString() const LLVM_READONLY { |
||
| 195 | assert(is(tok::html_quoted_string)); |
||
| 196 | return StringRef(TextPtr, IntVal); |
||
| 197 | } |
||
| 198 | |||
| 199 | void setHTMLQuotedString(StringRef Str) { |
||
| 200 | assert(is(tok::html_quoted_string)); |
||
| 201 | TextPtr = Str.data(); |
||
| 202 | IntVal = Str.size(); |
||
| 203 | } |
||
| 204 | |||
| 205 | StringRef getHTMLTagEndName() const LLVM_READONLY { |
||
| 206 | assert(is(tok::html_end_tag)); |
||
| 207 | return StringRef(TextPtr, IntVal); |
||
| 208 | } |
||
| 209 | |||
| 210 | void setHTMLTagEndName(StringRef Name) { |
||
| 211 | assert(is(tok::html_end_tag)); |
||
| 212 | TextPtr = Name.data(); |
||
| 213 | IntVal = Name.size(); |
||
| 214 | } |
||
| 215 | |||
| 216 | void dump(const Lexer &L, const SourceManager &SM) const; |
||
| 217 | }; |
||
| 218 | |||
| 219 | /// Comment lexer. |
||
| 220 | class Lexer { |
||
| 221 | private: |
||
| 222 | Lexer(const Lexer &) = delete; |
||
| 223 | void operator=(const Lexer &) = delete; |
||
| 224 | |||
| 225 | /// Allocator for strings that are semantic values of tokens and have to be |
||
| 226 | /// computed (for example, resolved decimal character references). |
||
| 227 | llvm::BumpPtrAllocator &Allocator; |
||
| 228 | |||
| 229 | DiagnosticsEngine &Diags; |
||
| 230 | |||
| 231 | const CommandTraits &Traits; |
||
| 232 | |||
| 233 | const char *const BufferStart; |
||
| 234 | const char *const BufferEnd; |
||
| 235 | |||
| 236 | const char *BufferPtr; |
||
| 237 | |||
| 238 | /// One-past-the-end pointer for the current comment. For BCPL comments it points |
||
| 239 | /// to the newline or BufferEnd; for C comments it points to the star in '*/'. |
||
| 240 | const char *CommentEnd; |
||
| 241 | |||
| 242 | SourceLocation FileLoc; |
||
| 243 | |||
| 244 | /// If true, commands, HTML tags, etc. will be parsed and reported as |
||
| 245 | /// separate tokens inside the comment body. If false, the comment text will |
||
| 246 | /// be parsed into text and newline tokens. |
||
| 247 | bool ParseCommands; |
||
| 248 | |||
| 249 | enum LexerCommentState : uint8_t { |
||
| 250 | LCS_BeforeComment, |
||
| 251 | LCS_InsideBCPLComment, |
||
| 252 | LCS_InsideCComment, |
||
| 253 | LCS_BetweenComments |
||
| 254 | }; |
||
| 255 | |||
| 256 | /// Low-level lexer state: tracks whether we are inside or outside of a comment. |
||
| 257 | LexerCommentState CommentState; |
||
| 258 | |||
| 259 | enum LexerState : uint8_t { |
||
| 260 | /// Lexing normal comment text |
||
| 261 | LS_Normal, |
||
| 262 | |||
| 263 | /// Finished lexing verbatim block beginning command, will lex first body |
||
| 264 | /// line. |
||
| 265 | LS_VerbatimBlockFirstLine, |
||
| 266 | |||
| 267 | /// Lexing verbatim block body line-by-line, skipping line-starting |
||
| 268 | /// decorations. |
||
| 269 | LS_VerbatimBlockBody, |
||
| 270 | |||
| 271 | /// Finished lexing verbatim line beginning command, will lex text (one |
||
| 272 | /// line). |
||
| 273 | LS_VerbatimLineText, |
||
| 274 | |||
| 275 | /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. |
||
| 276 | LS_HTMLStartTag, |
||
| 277 | |||
| 278 | /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. |
||
| 279 | LS_HTMLEndTag |
||
| 280 | }; |
||
| 281 | |||
| 282 | /// Current lexing mode. |
||
| 283 | LexerState State; |
||
| 284 | |||
| 285 | /// If State is LS_VerbatimBlockFirstLine or LS_VerbatimBlockBody, contains |
||
| 286 | /// the name of the verbatim end command, including the command marker. |
||
| 287 | SmallString<16> VerbatimBlockEndCommandName; |
||
| 288 | |||
| 289 | /// Given a character reference name (e.g., "lt"), return the character that |
||
| 290 | /// it stands for (e.g., "<"). |
||
| 291 | StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; |
||
| 292 | |||
| 293 | /// Given a Unicode codepoint as a base-10 integer, return the character. |
||
| 294 | StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; |
||
| 295 | |||
| 296 | /// Given a Unicode codepoint as a base-16 integer, return the character. |
||
| 297 | StringRef resolveHTMLHexCharacterReference(StringRef Name) const; |
||
| 298 | |||
| 299 | void formTokenWithChars(Token &Result, const char *TokEnd, |
||
| 300 | tok::TokenKind Kind); |
||
| 301 | |||
| 302 | void formTextToken(Token &Result, const char *TokEnd) { |
||
| 303 | StringRef Text(BufferPtr, TokEnd - BufferPtr); |
||
| 304 | formTokenWithChars(Result, TokEnd, tok::text); |
||
| 305 | Result.setText(Text); |
||
| 306 | } |
||
| 307 | |||
| 308 | SourceLocation getSourceLocation(const char *Loc) const { |
||
| 309 | assert(Loc >= BufferStart && Loc <= BufferEnd && |
||
| 310 | "Location out of range for this buffer!"); |
||
| 311 | |||
| 312 | const unsigned CharNo = Loc - BufferStart; |
||
| 313 | return FileLoc.getLocWithOffset(CharNo); |
||
| 314 | } |
||
| 315 | |||
| 316 | DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { |
||
| 317 | return Diags.Report(Loc, DiagID); |
||
| 318 | } |
||
| 319 | |||
| 320 | /// Eat string matching regexp \code \s*\* \endcode. |
||
| 321 | void skipLineStartingDecorations(); |
||
| 322 | |||
| 323 | /// Skip over pure text. |
||
| 324 | const char *skipTextToken(); |
||
| 325 | |||
| 326 | /// Lex comment text, including commands if ParseCommands is set to true. |
||
| 327 | void lexCommentText(Token &T); |
||
| 328 | |||
| 329 | void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker, |
||
| 330 | const CommandInfo *Info); |
||
| 331 | |||
| 332 | void lexVerbatimBlockFirstLine(Token &T); |
||
| 333 | |||
| 334 | void lexVerbatimBlockBody(Token &T); |
||
| 335 | |||
| 336 | void setupAndLexVerbatimLine(Token &T, const char *TextBegin, |
||
| 337 | const CommandInfo *Info); |
||
| 338 | |||
| 339 | void lexVerbatimLineText(Token &T); |
||
| 340 | |||
| 341 | void lexHTMLCharacterReference(Token &T); |
||
| 342 | |||
| 343 | void setupAndLexHTMLStartTag(Token &T); |
||
| 344 | |||
| 345 | void lexHTMLStartTag(Token &T); |
||
| 346 | |||
| 347 | void setupAndLexHTMLEndTag(Token &T); |
||
| 348 | |||
| 349 | void lexHTMLEndTag(Token &T); |
||
| 350 | |||
| 351 | public: |
||
| 352 | Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, |
||
| 353 | const CommandTraits &Traits, SourceLocation FileLoc, |
||
| 354 | const char *BufferStart, const char *BufferEnd, |
||
| 355 | bool ParseCommands = true); |
||
| 356 | |||
| 357 | void lex(Token &T); |
||
| 358 | |||
| 359 | StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const; |
||
| 360 | }; |
||
| 361 | |||
| 362 | } // end namespace comments |
||
| 363 | } // end namespace clang |
||
| 364 | |||
| 365 | #endif |
||
| 366 |