//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines the lexer for structured comments and the supporting
// token class.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_CLANG_AST_COMMENTLEXER_H
#define LLVM_CLANG_AST_COMMENTLEXER_H
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/SourceManager.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/raw_ostream.h"
namespace clang {
namespace comments {
// Forward declarations of types that this header only needs by name.

// Defined further down in this header; declared early because Token names it
// as a friend and takes it by reference in Token::dump().
class Lexer;
// Named as a friend of Token; only declared (not defined) in this header.
class TextTokenRetokenizer;
// Used in this header only through 'const CommandInfo *' parameters.
struct CommandInfo;
// Used in this header only through 'const CommandTraits &'.
class CommandTraits;
namespace tok {
/// Kinds of tokens used by the comment lexer.
///
/// The kind of a comments::Token also selects which of its payload accessors
/// may be called: each accessor asserts the matching kind.
enum TokenKind {
eof,
newline,
text, // Carries a string payload (Token::getText).
unknown_command, // Command that does not have an ID.
backslash_command, // Command with an ID, that used backslash marker.
at_command, // Command with an ID, that used 'at' marker.
verbatim_block_begin, // Carries a block ID (Token::getVerbatimBlockID).
verbatim_block_line, // Carries a string (Token::getVerbatimBlockText).
verbatim_block_end, // Carries a block ID (Token::getVerbatimBlockID).
verbatim_line_name, // Carries a line ID (Token::getVerbatimLineID).
verbatim_line_text, // Carries a string (Token::getVerbatimLineText).
html_start_tag, // <tag
html_ident, // attr
html_equals, // =
html_quoted_string, // "blah\"blah" or 'blah\'blah'
html_greater, // >
html_slash_greater, // />
html_end_tag // </tag
};
} // end namespace tok
/// Comment token.
class Token {
friend class Lexer;
friend class TextTokenRetokenizer;
/// The location of the token.
SourceLocation Loc;
/// The actual kind of the token.
tok::TokenKind Kind;
/// Integer value associated with a token.
///
/// If the token is a known command, contains command ID and TextPtr is
/// unused (command spelling can be found with CommandTraits). Otherwise,
/// contains the length of the string that starts at TextPtr.
unsigned IntVal;
/// Length of the token spelling in comment. Can be 0 for synthenized
/// tokens.
unsigned Length;
/// Contains text value associated with a token.
const char *TextPtr;
public:
SourceLocation getLocation() const LLVM_READONLY { return Loc; }
void setLocation(SourceLocation SL) { Loc = SL; }
SourceLocation getEndLocation() const LLVM_READONLY {
if (Length == 0 || Length == 1)
return Loc;
return Loc.getLocWithOffset(Length - 1);
}
tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
void setKind(tok::TokenKind K) { Kind = K; }
bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
unsigned getLength() const LLVM_READONLY { return Length; }
void setLength(unsigned L) { Length = L; }
StringRef getText() const LLVM_READONLY {
assert(is(tok::text));
return StringRef(TextPtr, IntVal);
}
void setText(StringRef Text) {
assert(is(tok::text));
TextPtr = Text.data();
IntVal = Text.size();
}
StringRef getUnknownCommandName() const LLVM_READONLY {
assert(is(tok::unknown_command));
return StringRef(TextPtr, IntVal);
}
void setUnknownCommandName(StringRef Name) {
assert(is(tok::unknown_command));
TextPtr = Name.data();
IntVal = Name.size();
}
unsigned getCommandID() const LLVM_READONLY {
assert(is(tok::backslash_command) || is(tok::at_command));
return IntVal;
}
void setCommandID(unsigned ID) {
assert(is(tok::backslash_command) || is(tok::at_command));
IntVal = ID;
}
unsigned getVerbatimBlockID() const LLVM_READONLY {
assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
return IntVal;
}
void setVerbatimBlockID(unsigned ID) {
assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
IntVal = ID;
}
StringRef getVerbatimBlockText() const LLVM_READONLY {
assert(is(tok::verbatim_block_line));
return StringRef(TextPtr, IntVal);
}
void setVerbatimBlockText(StringRef Text) {
assert(is(tok::verbatim_block_line));
TextPtr = Text.data();
IntVal = Text.size();
}
unsigned getVerbatimLineID() const LLVM_READONLY {
assert(is(tok::verbatim_line_name));
return IntVal;
}
void setVerbatimLineID(unsigned ID) {
assert(is(tok::verbatim_line_name));
IntVal = ID;
}
StringRef getVerbatimLineText() const LLVM_READONLY {
assert(is(tok::verbatim_line_text));
return StringRef(TextPtr, IntVal);
}
void setVerbatimLineText(StringRef Text) {
assert(is(tok::verbatim_line_text));
TextPtr = Text.data();
IntVal = Text.size();
}
StringRef getHTMLTagStartName() const LLVM_READONLY {
assert(is(tok::html_start_tag));
return StringRef(TextPtr, IntVal);
}
void setHTMLTagStartName(StringRef Name) {
assert(is(tok::html_start_tag));
TextPtr = Name.data();
IntVal = Name.size();
}
StringRef getHTMLIdent() const LLVM_READONLY {
assert(is(tok::html_ident));
return StringRef(TextPtr, IntVal);
}
void setHTMLIdent(StringRef Name) {
assert(is(tok::html_ident));
TextPtr = Name.data();
IntVal = Name.size();
}
StringRef getHTMLQuotedString() const LLVM_READONLY {
assert(is(tok::html_quoted_string));
return StringRef(TextPtr, IntVal);
}
void setHTMLQuotedString(StringRef Str) {
assert(is(tok::html_quoted_string));
TextPtr = Str.data();
IntVal = Str.size();
}
StringRef getHTMLTagEndName() const LLVM_READONLY {
assert(is(tok::html_end_tag));
return StringRef(TextPtr, IntVal);
}
void setHTMLTagEndName(StringRef Name) {
assert(is(tok::html_end_tag));
TextPtr = Name.data();
IntVal = Name.size();
}
void dump(const Lexer &L, const SourceManager &SM) const;
};
/// Comment lexer.
class Lexer {
private:
Lexer(const Lexer &) = delete;
void operator=(const Lexer &) = delete;
/// Allocator for strings that are semantic values of tokens and have to be
/// computed (for example, resolved decimal character references).
llvm::BumpPtrAllocator &Allocator;
DiagnosticsEngine &Diags;
const CommandTraits &Traits;
const char *const BufferStart;
const char *const BufferEnd;
const char *BufferPtr;
/// One past end pointer for the current comment. For BCPL comments points
/// to newline or BufferEnd, for C comments points to star in '*/'.
const char *CommentEnd;
SourceLocation FileLoc;
/// If true, the commands, html tags, etc will be parsed and reported as
/// separate tokens inside the comment body. If false, the comment text will
/// be parsed into text and newline tokens.
bool ParseCommands;
enum LexerCommentState : uint8_t {
LCS_BeforeComment,
LCS_InsideBCPLComment,
LCS_InsideCComment,
LCS_BetweenComments
};
/// Low-level lexer state, track if we are inside or outside of comment.
LexerCommentState CommentState;
enum LexerState : uint8_t {
/// Lexing normal comment text
LS_Normal,
/// Finished lexing verbatim block beginning command, will lex first body
/// line.
LS_VerbatimBlockFirstLine,
/// Lexing verbatim block body line-by-line, skipping line-starting
/// decorations.
LS_VerbatimBlockBody,
/// Finished lexing verbatim line beginning command, will lex text (one
/// line).
LS_VerbatimLineText,
/// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
LS_HTMLStartTag,
/// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
LS_HTMLEndTag
};
/// Current lexing mode.
LexerState State;
/// If State is LS_VerbatimBlock, contains the name of verbatim end
/// command, including command marker.
SmallString<16> VerbatimBlockEndCommandName;
/// Given a character reference name (e.g., "lt"), return the character that
/// it stands for (e.g., "<").
StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
/// Given a Unicode codepoint as base-10 integer, return the character.
StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
/// Given a Unicode codepoint as base-16 integer, return the character.
StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
void formTokenWithChars(Token &Result, const char *TokEnd,
tok::TokenKind Kind);
void formTextToken(Token &Result, const char *TokEnd) {
StringRef Text(BufferPtr, TokEnd - BufferPtr);
formTokenWithChars(Result, TokEnd, tok::text);
Result.setText(Text);
}
SourceLocation getSourceLocation(const char *Loc) const {
assert(Loc >= BufferStart && Loc <= BufferEnd &&
"Location out of range for this buffer!");
const unsigned CharNo = Loc - BufferStart;
return FileLoc.getLocWithOffset(CharNo);
}
DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
return Diags.Report(Loc, DiagID);
}
/// Eat string matching regexp \code \s*\* \endcode.
void skipLineStartingDecorations();
/// Skip over pure text.
const char *skipTextToken();
/// Lex comment text, including commands if ParseCommands is set to true.
void lexCommentText(Token &T);
void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
const CommandInfo *Info);
void lexVerbatimBlockFirstLine(Token &T);
void lexVerbatimBlockBody(Token &T);
void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
const CommandInfo *Info);
void lexVerbatimLineText(Token &T);
void lexHTMLCharacterReference(Token &T);
void setupAndLexHTMLStartTag(Token &T);
void lexHTMLStartTag(Token &T);
void setupAndLexHTMLEndTag(Token &T);
void lexHTMLEndTag(Token &T);
public:
Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
const CommandTraits &Traits, SourceLocation FileLoc,
const char *BufferStart, const char *BufferEnd,
bool ParseCommands = true);
void lex(Token &T);
StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
};
} // end namespace comments
} // end namespace clang
#endif