Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
14 | pmbaty | 1 | //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// |
2 | // |
||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
4 | // See https://llvm.org/LICENSE.txt for license information. |
||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
6 | // |
||
7 | //===----------------------------------------------------------------------===// |
||
8 | // |
||
9 | // This file defines the lexer for structured comments and the supporting token class. |
||
10 | // |
||
11 | //===----------------------------------------------------------------------===// |
||
12 | |||
13 | #ifndef LLVM_CLANG_AST_COMMENTLEXER_H |
||
14 | #define LLVM_CLANG_AST_COMMENTLEXER_H |
||
15 | |||
16 | #include "clang/Basic/Diagnostic.h" |
||
17 | #include "clang/Basic/SourceManager.h" |
||
18 | #include "llvm/ADT/SmallString.h" |
||
19 | #include "llvm/ADT/StringRef.h" |
||
20 | #include "llvm/Support/Allocator.h" |
||
21 | #include "llvm/Support/raw_ostream.h" |
||
22 | |||
23 | namespace clang { |
||
24 | namespace comments { |
||
25 | |||
26 | class Lexer; |
||
27 | class TextTokenRetokenizer; |
||
28 | struct CommandInfo; |
||
29 | class CommandTraits; |
||
30 | |||
namespace tok {
/// Kind tag stored in every comment Token.
///
/// Enumerators are left unnumbered on purpose; their order defines their
/// values, so new kinds must not be inserted in the middle casually.
enum TokenKind {
  // End of input, line break, and plain text.
  eof,
  newline,
  text,

  // Commands.
  unknown_command,   // Command that does not have an ID.
  backslash_command, // Command with an ID, that used backslash marker.
  at_command,        // Command with an ID, that used 'at' marker.

  // Verbatim blocks and verbatim lines.
  verbatim_block_begin,
  verbatim_block_line,
  verbatim_block_end,
  verbatim_line_name,
  verbatim_line_text,

  // HTML tags and their pieces.
  html_start_tag,     // <tag
  html_ident,         // attr
  html_equals,        // =
  html_quoted_string, // "blah\"blah" or 'blah\'blah'
  html_greater,       // >
  html_slash_greater, // />
  html_end_tag        // </tag
};
} // end namespace tok
||
53 | |||
54 | /// Comment token. |
||
55 | class Token { |
||
56 | friend class Lexer; |
||
57 | friend class TextTokenRetokenizer; |
||
58 | |||
59 | /// The location of the token. |
||
60 | SourceLocation Loc; |
||
61 | |||
62 | /// The actual kind of the token. |
||
63 | tok::TokenKind Kind; |
||
64 | |||
65 | /// Integer value associated with a token. |
||
66 | /// |
||
67 | /// If the token is a known command, contains command ID and TextPtr is |
||
68 | /// unused (command spelling can be found with CommandTraits). Otherwise, |
||
69 | /// contains the length of the string that starts at TextPtr. |
||
70 | unsigned IntVal; |
||
71 | |||
72 | /// Length of the token spelling in comment. Can be 0 for synthenized |
||
73 | /// tokens. |
||
74 | unsigned Length; |
||
75 | |||
76 | /// Contains text value associated with a token. |
||
77 | const char *TextPtr; |
||
78 | |||
79 | public: |
||
80 | SourceLocation getLocation() const LLVM_READONLY { return Loc; } |
||
81 | void setLocation(SourceLocation SL) { Loc = SL; } |
||
82 | |||
83 | SourceLocation getEndLocation() const LLVM_READONLY { |
||
84 | if (Length == 0 || Length == 1) |
||
85 | return Loc; |
||
86 | return Loc.getLocWithOffset(Length - 1); |
||
87 | } |
||
88 | |||
89 | tok::TokenKind getKind() const LLVM_READONLY { return Kind; } |
||
90 | void setKind(tok::TokenKind K) { Kind = K; } |
||
91 | |||
92 | bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } |
||
93 | bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } |
||
94 | |||
95 | unsigned getLength() const LLVM_READONLY { return Length; } |
||
96 | void setLength(unsigned L) { Length = L; } |
||
97 | |||
98 | StringRef getText() const LLVM_READONLY { |
||
99 | assert(is(tok::text)); |
||
100 | return StringRef(TextPtr, IntVal); |
||
101 | } |
||
102 | |||
103 | void setText(StringRef Text) { |
||
104 | assert(is(tok::text)); |
||
105 | TextPtr = Text.data(); |
||
106 | IntVal = Text.size(); |
||
107 | } |
||
108 | |||
109 | StringRef getUnknownCommandName() const LLVM_READONLY { |
||
110 | assert(is(tok::unknown_command)); |
||
111 | return StringRef(TextPtr, IntVal); |
||
112 | } |
||
113 | |||
114 | void setUnknownCommandName(StringRef Name) { |
||
115 | assert(is(tok::unknown_command)); |
||
116 | TextPtr = Name.data(); |
||
117 | IntVal = Name.size(); |
||
118 | } |
||
119 | |||
120 | unsigned getCommandID() const LLVM_READONLY { |
||
121 | assert(is(tok::backslash_command) || is(tok::at_command)); |
||
122 | return IntVal; |
||
123 | } |
||
124 | |||
125 | void setCommandID(unsigned ID) { |
||
126 | assert(is(tok::backslash_command) || is(tok::at_command)); |
||
127 | IntVal = ID; |
||
128 | } |
||
129 | |||
130 | unsigned getVerbatimBlockID() const LLVM_READONLY { |
||
131 | assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); |
||
132 | return IntVal; |
||
133 | } |
||
134 | |||
135 | void setVerbatimBlockID(unsigned ID) { |
||
136 | assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); |
||
137 | IntVal = ID; |
||
138 | } |
||
139 | |||
140 | StringRef getVerbatimBlockText() const LLVM_READONLY { |
||
141 | assert(is(tok::verbatim_block_line)); |
||
142 | return StringRef(TextPtr, IntVal); |
||
143 | } |
||
144 | |||
145 | void setVerbatimBlockText(StringRef Text) { |
||
146 | assert(is(tok::verbatim_block_line)); |
||
147 | TextPtr = Text.data(); |
||
148 | IntVal = Text.size(); |
||
149 | } |
||
150 | |||
151 | unsigned getVerbatimLineID() const LLVM_READONLY { |
||
152 | assert(is(tok::verbatim_line_name)); |
||
153 | return IntVal; |
||
154 | } |
||
155 | |||
156 | void setVerbatimLineID(unsigned ID) { |
||
157 | assert(is(tok::verbatim_line_name)); |
||
158 | IntVal = ID; |
||
159 | } |
||
160 | |||
161 | StringRef getVerbatimLineText() const LLVM_READONLY { |
||
162 | assert(is(tok::verbatim_line_text)); |
||
163 | return StringRef(TextPtr, IntVal); |
||
164 | } |
||
165 | |||
166 | void setVerbatimLineText(StringRef Text) { |
||
167 | assert(is(tok::verbatim_line_text)); |
||
168 | TextPtr = Text.data(); |
||
169 | IntVal = Text.size(); |
||
170 | } |
||
171 | |||
172 | StringRef getHTMLTagStartName() const LLVM_READONLY { |
||
173 | assert(is(tok::html_start_tag)); |
||
174 | return StringRef(TextPtr, IntVal); |
||
175 | } |
||
176 | |||
177 | void setHTMLTagStartName(StringRef Name) { |
||
178 | assert(is(tok::html_start_tag)); |
||
179 | TextPtr = Name.data(); |
||
180 | IntVal = Name.size(); |
||
181 | } |
||
182 | |||
183 | StringRef getHTMLIdent() const LLVM_READONLY { |
||
184 | assert(is(tok::html_ident)); |
||
185 | return StringRef(TextPtr, IntVal); |
||
186 | } |
||
187 | |||
188 | void setHTMLIdent(StringRef Name) { |
||
189 | assert(is(tok::html_ident)); |
||
190 | TextPtr = Name.data(); |
||
191 | IntVal = Name.size(); |
||
192 | } |
||
193 | |||
194 | StringRef getHTMLQuotedString() const LLVM_READONLY { |
||
195 | assert(is(tok::html_quoted_string)); |
||
196 | return StringRef(TextPtr, IntVal); |
||
197 | } |
||
198 | |||
199 | void setHTMLQuotedString(StringRef Str) { |
||
200 | assert(is(tok::html_quoted_string)); |
||
201 | TextPtr = Str.data(); |
||
202 | IntVal = Str.size(); |
||
203 | } |
||
204 | |||
205 | StringRef getHTMLTagEndName() const LLVM_READONLY { |
||
206 | assert(is(tok::html_end_tag)); |
||
207 | return StringRef(TextPtr, IntVal); |
||
208 | } |
||
209 | |||
210 | void setHTMLTagEndName(StringRef Name) { |
||
211 | assert(is(tok::html_end_tag)); |
||
212 | TextPtr = Name.data(); |
||
213 | IntVal = Name.size(); |
||
214 | } |
||
215 | |||
216 | void dump(const Lexer &L, const SourceManager &SM) const; |
||
217 | }; |
||
218 | |||
219 | /// Comment lexer. |
||
220 | class Lexer { |
||
221 | private: |
||
222 | Lexer(const Lexer &) = delete; |
||
223 | void operator=(const Lexer &) = delete; |
||
224 | |||
225 | /// Allocator for strings that are semantic values of tokens and have to be |
||
226 | /// computed (for example, resolved decimal character references). |
||
227 | llvm::BumpPtrAllocator &Allocator; |
||
228 | |||
229 | DiagnosticsEngine &Diags; |
||
230 | |||
231 | const CommandTraits &Traits; |
||
232 | |||
233 | const char *const BufferStart; |
||
234 | const char *const BufferEnd; |
||
235 | |||
236 | const char *BufferPtr; |
||
237 | |||
238 | /// One past end pointer for the current comment. For BCPL comments points |
||
239 | /// to newline or BufferEnd, for C comments points to star in '*/'. |
||
240 | const char *CommentEnd; |
||
241 | |||
242 | SourceLocation FileLoc; |
||
243 | |||
244 | /// If true, the commands, html tags, etc will be parsed and reported as |
||
245 | /// separate tokens inside the comment body. If false, the comment text will |
||
246 | /// be parsed into text and newline tokens. |
||
247 | bool ParseCommands; |
||
248 | |||
249 | enum LexerCommentState : uint8_t { |
||
250 | LCS_BeforeComment, |
||
251 | LCS_InsideBCPLComment, |
||
252 | LCS_InsideCComment, |
||
253 | LCS_BetweenComments |
||
254 | }; |
||
255 | |||
256 | /// Low-level lexer state, track if we are inside or outside of comment. |
||
257 | LexerCommentState CommentState; |
||
258 | |||
259 | enum LexerState : uint8_t { |
||
260 | /// Lexing normal comment text |
||
261 | LS_Normal, |
||
262 | |||
263 | /// Finished lexing verbatim block beginning command, will lex first body |
||
264 | /// line. |
||
265 | LS_VerbatimBlockFirstLine, |
||
266 | |||
267 | /// Lexing verbatim block body line-by-line, skipping line-starting |
||
268 | /// decorations. |
||
269 | LS_VerbatimBlockBody, |
||
270 | |||
271 | /// Finished lexing verbatim line beginning command, will lex text (one |
||
272 | /// line). |
||
273 | LS_VerbatimLineText, |
||
274 | |||
275 | /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. |
||
276 | LS_HTMLStartTag, |
||
277 | |||
278 | /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'. |
||
279 | LS_HTMLEndTag |
||
280 | }; |
||
281 | |||
282 | /// Current lexing mode. |
||
283 | LexerState State; |
||
284 | |||
285 | /// If State is LS_VerbatimBlock, contains the name of verbatim end |
||
286 | /// command, including command marker. |
||
287 | SmallString<16> VerbatimBlockEndCommandName; |
||
288 | |||
289 | /// Given a character reference name (e.g., "lt"), return the character that |
||
290 | /// it stands for (e.g., "<"). |
||
291 | StringRef resolveHTMLNamedCharacterReference(StringRef Name) const; |
||
292 | |||
293 | /// Given a Unicode codepoint as base-10 integer, return the character. |
||
294 | StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const; |
||
295 | |||
296 | /// Given a Unicode codepoint as base-16 integer, return the character. |
||
297 | StringRef resolveHTMLHexCharacterReference(StringRef Name) const; |
||
298 | |||
299 | void formTokenWithChars(Token &Result, const char *TokEnd, |
||
300 | tok::TokenKind Kind); |
||
301 | |||
302 | void formTextToken(Token &Result, const char *TokEnd) { |
||
303 | StringRef Text(BufferPtr, TokEnd - BufferPtr); |
||
304 | formTokenWithChars(Result, TokEnd, tok::text); |
||
305 | Result.setText(Text); |
||
306 | } |
||
307 | |||
308 | SourceLocation getSourceLocation(const char *Loc) const { |
||
309 | assert(Loc >= BufferStart && Loc <= BufferEnd && |
||
310 | "Location out of range for this buffer!"); |
||
311 | |||
312 | const unsigned CharNo = Loc - BufferStart; |
||
313 | return FileLoc.getLocWithOffset(CharNo); |
||
314 | } |
||
315 | |||
316 | DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) { |
||
317 | return Diags.Report(Loc, DiagID); |
||
318 | } |
||
319 | |||
320 | /// Eat string matching regexp \code \s*\* \endcode. |
||
321 | void skipLineStartingDecorations(); |
||
322 | |||
323 | /// Skip over pure text. |
||
324 | const char *skipTextToken(); |
||
325 | |||
326 | /// Lex comment text, including commands if ParseCommands is set to true. |
||
327 | void lexCommentText(Token &T); |
||
328 | |||
329 | void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker, |
||
330 | const CommandInfo *Info); |
||
331 | |||
332 | void lexVerbatimBlockFirstLine(Token &T); |
||
333 | |||
334 | void lexVerbatimBlockBody(Token &T); |
||
335 | |||
336 | void setupAndLexVerbatimLine(Token &T, const char *TextBegin, |
||
337 | const CommandInfo *Info); |
||
338 | |||
339 | void lexVerbatimLineText(Token &T); |
||
340 | |||
341 | void lexHTMLCharacterReference(Token &T); |
||
342 | |||
343 | void setupAndLexHTMLStartTag(Token &T); |
||
344 | |||
345 | void lexHTMLStartTag(Token &T); |
||
346 | |||
347 | void setupAndLexHTMLEndTag(Token &T); |
||
348 | |||
349 | void lexHTMLEndTag(Token &T); |
||
350 | |||
351 | public: |
||
352 | Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags, |
||
353 | const CommandTraits &Traits, SourceLocation FileLoc, |
||
354 | const char *BufferStart, const char *BufferEnd, |
||
355 | bool ParseCommands = true); |
||
356 | |||
357 | void lex(Token &T); |
||
358 | |||
359 | StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const; |
||
360 | }; |
||
361 | |||
362 | } // end namespace comments |
||
363 | } // end namespace clang |
||
364 | |||
365 | #endif |
||
366 |