//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
 
//
 
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 
// See https://llvm.org/LICENSE.txt for license information.
 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
//
 
//===----------------------------------------------------------------------===//
 
//
 
//  This file defines the lexer for structured comments and the supporting
//  token class.
 
//
 
//===----------------------------------------------------------------------===//
 
 
 
#ifndef LLVM_CLANG_AST_COMMENTLEXER_H
 
#define LLVM_CLANG_AST_COMMENTLEXER_H
 
 
 
#include "clang/Basic/Diagnostic.h"
 
#include "clang/Basic/SourceManager.h"
 
#include "llvm/ADT/SmallString.h"
 
#include "llvm/ADT/StringRef.h"
 
#include "llvm/Support/Allocator.h"
 
#include "llvm/Support/raw_ostream.h"
 
 
 
namespace clang {
 
namespace comments {
 
 
 
class Lexer;
 
class TextTokenRetokenizer;
 
struct CommandInfo;
 
class CommandTraits;
 
 
 
namespace tok {

/// Kinds of tokens in a structured comment.
///
/// The kind of a Token decides which of its payload accessors may be used;
/// each typed accessor of Token asserts that the token has the matching kind.
enum TokenKind {

  eof,               // No payload accessor in Token; presumably end of input.

  newline,           // No payload accessor in Token; presumably a line break.

  text,              // Carries text, see Token::getText().

  unknown_command,   // Command that does not have an ID.

  backslash_command, // Command with an ID, that used backslash marker.

  at_command,        // Command with an ID, that used 'at' marker.

  verbatim_block_begin, // Carries an ID, see Token::getVerbatimBlockID().

  verbatim_block_line,  // Carries text, see Token::getVerbatimBlockText().

  verbatim_block_end,   // Carries an ID, see Token::getVerbatimBlockID().

  verbatim_line_name,   // Carries an ID, see Token::getVerbatimLineID().

  verbatim_line_text,   // Carries text, see Token::getVerbatimLineText().

  html_start_tag,     // <tag

  html_ident,         // attr

  html_equals,        // =

  html_quoted_string, // "blah\"blah" or 'blah\'blah'

  html_greater,       // >

  html_slash_greater, // />

  html_end_tag        // </tag

};

} // end namespace tok
 
 
 
/// Comment token.
 
class Token {
 
  friend class Lexer;
 
  friend class TextTokenRetokenizer;
 
 
 
  /// The location of the token.
 
  SourceLocation Loc;
 
 
 
  /// The actual kind of the token.
 
  tok::TokenKind Kind;
 
 
 
  /// Integer value associated with a token.
 
  ///
 
  /// If the token is a known command, contains command ID and TextPtr is
 
  /// unused (command spelling can be found with CommandTraits).  Otherwise,
 
  /// contains the length of the string that starts at TextPtr.
 
  unsigned IntVal;
 
 
 
  /// Length of the token spelling in comment.  Can be 0 for synthenized
 
  /// tokens.
 
  unsigned Length;
 
 
 
  /// Contains text value associated with a token.
 
  const char *TextPtr;
 
 
 
public:
 
  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
 
  void setLocation(SourceLocation SL) { Loc = SL; }
 
 
 
  SourceLocation getEndLocation() const LLVM_READONLY {
 
    if (Length == 0 || Length == 1)
 
      return Loc;
 
    return Loc.getLocWithOffset(Length - 1);
 
  }
 
 
 
  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
 
  void setKind(tok::TokenKind K) { Kind = K; }
 
 
 
  bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
 
  bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
 
 
 
  unsigned getLength() const LLVM_READONLY { return Length; }
 
  void setLength(unsigned L) { Length = L; }
 
 
 
  StringRef getText() const LLVM_READONLY {
 
    assert(is(tok::text));
 
    return StringRef(TextPtr, IntVal);
 
  }
 
 
 
  void setText(StringRef Text) {
 
    assert(is(tok::text));
 
    TextPtr = Text.data();
 
    IntVal = Text.size();
 
  }
 
 
 
  StringRef getUnknownCommandName() const LLVM_READONLY {
 
    assert(is(tok::unknown_command));
 
    return StringRef(TextPtr, IntVal);
 
  }
 
 
 
  void setUnknownCommandName(StringRef Name) {
 
    assert(is(tok::unknown_command));
 
    TextPtr = Name.data();
 
    IntVal = Name.size();
 
  }
 
 
 
  unsigned getCommandID() const LLVM_READONLY {
 
    assert(is(tok::backslash_command) || is(tok::at_command));
 
    return IntVal;
 
  }
 
 
 
  void setCommandID(unsigned ID) {
 
    assert(is(tok::backslash_command) || is(tok::at_command));
 
    IntVal = ID;
 
  }
 
 
 
  unsigned getVerbatimBlockID() const LLVM_READONLY {
 
    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
 
    return IntVal;
 
  }
 
 
 
  void setVerbatimBlockID(unsigned ID) {
 
    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
 
    IntVal = ID;
 
  }
 
 
 
  StringRef getVerbatimBlockText() const LLVM_READONLY {
 
    assert(is(tok::verbatim_block_line));
 
    return StringRef(TextPtr, IntVal);
 
  }
 
 
 
  void setVerbatimBlockText(StringRef Text) {
 
    assert(is(tok::verbatim_block_line));
 
    TextPtr = Text.data();
 
    IntVal = Text.size();
 
  }
 
 
 
  unsigned getVerbatimLineID() const LLVM_READONLY {
 
    assert(is(tok::verbatim_line_name));
 
    return IntVal;
 
  }
 
 
 
  void setVerbatimLineID(unsigned ID) {
 
    assert(is(tok::verbatim_line_name));
 
    IntVal = ID;
 
  }
 
 
 
  StringRef getVerbatimLineText() const LLVM_READONLY {
 
    assert(is(tok::verbatim_line_text));
 
    return StringRef(TextPtr, IntVal);
 
  }
 
 
 
  void setVerbatimLineText(StringRef Text) {
 
    assert(is(tok::verbatim_line_text));
 
    TextPtr = Text.data();
 
    IntVal = Text.size();
 
  }
 
 
 
  StringRef getHTMLTagStartName() const LLVM_READONLY {
 
    assert(is(tok::html_start_tag));
 
    return StringRef(TextPtr, IntVal);
 
  }
 
 
 
  void setHTMLTagStartName(StringRef Name) {
 
    assert(is(tok::html_start_tag));
 
    TextPtr = Name.data();
 
    IntVal = Name.size();
 
  }
 
 
 
  StringRef getHTMLIdent() const LLVM_READONLY {
 
    assert(is(tok::html_ident));
 
    return StringRef(TextPtr, IntVal);
 
  }
 
 
 
  void setHTMLIdent(StringRef Name) {
 
    assert(is(tok::html_ident));
 
    TextPtr = Name.data();
 
    IntVal = Name.size();
 
  }
 
 
 
  StringRef getHTMLQuotedString() const LLVM_READONLY {
 
    assert(is(tok::html_quoted_string));
 
    return StringRef(TextPtr, IntVal);
 
  }
 
 
 
  void setHTMLQuotedString(StringRef Str) {
 
    assert(is(tok::html_quoted_string));
 
    TextPtr = Str.data();
 
    IntVal = Str.size();
 
  }
 
 
 
  StringRef getHTMLTagEndName() const LLVM_READONLY {
 
    assert(is(tok::html_end_tag));
 
    return StringRef(TextPtr, IntVal);
 
  }
 
 
 
  void setHTMLTagEndName(StringRef Name) {
 
    assert(is(tok::html_end_tag));
 
    TextPtr = Name.data();
 
    IntVal = Name.size();
 
  }
 
 
 
  void dump(const Lexer &L, const SourceManager &SM) const;
 
};
 
 
 
/// Comment lexer.
///
/// Operates on the character range given by BufferStart and BufferEnd and
/// hands out tokens through lex().  Most member functions are only declared
/// here and are defined out of line.

class Lexer {

private:

  // The lexer is not copyable.
  Lexer(const Lexer &) = delete;

  void operator=(const Lexer &) = delete;



  /// Allocator for strings that are semantic values of tokens and have to be

  /// computed (for example, resolved decimal character references).

  llvm::BumpPtrAllocator &Allocator;



  /// Engine through which Diag() reports diagnostics.
  DiagnosticsEngine &Diags;



  /// Command traits passed in at construction.
  const CommandTraits &Traits;



  /// Bounds of the buffer being lexed.  getSourceLocation() asserts that the
  /// pointers it is given lie within [BufferStart, BufferEnd].
  const char *const BufferStart;

  const char *const BufferEnd;



  /// Current position in the buffer; formTextToken() uses it as the start of
  /// the token being formed.
  const char *BufferPtr;



  /// One past end pointer for the current comment.  For BCPL comments points

  /// to newline or BufferEnd, for C comments points to star in '*/'.

  const char *CommentEnd;



  /// Source location that corresponds to BufferStart; getSourceLocation()
  /// computes locations as offsets from it.
  SourceLocation FileLoc;



  /// If true, the commands, html tags, etc will be parsed and reported as

  /// separate tokens inside the comment body. If false, the comment text will

  /// be parsed into text and newline tokens.

  bool ParseCommands;



  /// Position of the lexer relative to comments in the buffer.
  enum LexerCommentState : uint8_t {

    LCS_BeforeComment,

    LCS_InsideBCPLComment,

    LCS_InsideCComment,

    LCS_BetweenComments

  };



  /// Low-level lexer state, track if we are inside or outside of comment.

  LexerCommentState CommentState;



  /// What kind of content is being lexed inside a comment.
  enum LexerState : uint8_t {

    /// Lexing normal comment text

    LS_Normal,



    /// Finished lexing verbatim block beginning command, will lex first body

    /// line.

    LS_VerbatimBlockFirstLine,



    /// Lexing verbatim block body line-by-line, skipping line-starting

    /// decorations.

    LS_VerbatimBlockBody,



    /// Finished lexing verbatim line beginning command, will lex text (one

    /// line).

    LS_VerbatimLineText,



    /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.

    LS_HTMLStartTag,



    /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.

    LS_HTMLEndTag

  };



  /// Current lexing mode.

  LexerState State;



  /// If State is one of the verbatim block states (LS_VerbatimBlockFirstLine

  /// or LS_VerbatimBlockBody), contains the name of verbatim end command,
  /// including command marker.

  SmallString<16> VerbatimBlockEndCommandName;



  /// Given a character reference name (e.g., "lt"), return the character that

  /// it stands for (e.g., "<").

  StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;



  /// Given a Unicode codepoint as base-10 integer, return the character.

  StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;



  /// Given a Unicode codepoint as base-16 integer, return the character.

  StringRef resolveHTMLHexCharacterReference(StringRef Name) const;



  /// Forms a token of kind \p Kind in \p Result, with \p TokEnd as the end of
  /// the token.  Defined out of line; formTextToken() relies on \p Result
  /// having kind \p Kind afterwards.
  void formTokenWithChars(Token &Result, const char *TokEnd,

                          tok::TokenKind Kind);



  /// Forms a tok::text token in \p Result whose text is the characters from
  /// BufferPtr up to (not including) \p TokEnd.
  void formTextToken(Token &Result, const char *TokEnd) {

    // Capture the text before formTokenWithChars() runs; it is not visible
    // here whether that call changes BufferPtr.
    StringRef Text(BufferPtr, TokEnd - BufferPtr);

    formTokenWithChars(Result, TokEnd, tok::text);

    Result.setText(Text);

  }



  /// Converts a pointer into the buffer to a source location by adding its
  /// offset from BufferStart to FileLoc.  \p Loc may equal BufferEnd.
  SourceLocation getSourceLocation(const char *Loc) const {

    assert(Loc >= BufferStart && Loc <= BufferEnd &&

           "Location out of range for this buffer!");



    const unsigned CharNo = Loc - BufferStart;

    return FileLoc.getLocWithOffset(CharNo);

  }



  /// Starts a diagnostic with ID \p DiagID at \p Loc using Diags.
  DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {

    return Diags.Report(Loc, DiagID);

  }



  /// Eat string matching regexp \code \s*\* \endcode.

  void skipLineStartingDecorations();



  /// Skip over pure text.

  const char *skipTextToken();



  /// Lex comment text, including commands if ParseCommands is set to true.

  void lexCommentText(Token &T);



  // The functions below are defined out of line.  Judging by their names,
  // each setupAndLex* function enters the LexerState that the matching lex*
  // function handles -- NOTE(review): confirm against the implementation.
  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,

                                const CommandInfo *Info);



  void lexVerbatimBlockFirstLine(Token &T);



  void lexVerbatimBlockBody(Token &T);



  void setupAndLexVerbatimLine(Token &T, const char *TextBegin,

                               const CommandInfo *Info);



  void lexVerbatimLineText(Token &T);



  void lexHTMLCharacterReference(Token &T);



  void setupAndLexHTMLStartTag(Token &T);



  void lexHTMLStartTag(Token &T);



  void setupAndLexHTMLEndTag(Token &T);



  void lexHTMLEndTag(Token &T);



public:

  /// Creates a lexer.  The parameters have the same names as the members
  /// documented above; presumably each initializes the corresponding member
  /// (the definition is out of line).  \p ParseCommands defaults to true.
  Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,

        const CommandTraits &Traits, SourceLocation FileLoc,

        const char *BufferStart, const char *BufferEnd,

        bool ParseCommands = true);



  /// Lexes a token into \p T.  Defined out of line.
  void lex(Token &T);



  /// Returns the spelling of \p Tok, using \p SourceMgr.  Defined out of
  /// line.
  StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;

};
 
 
 
} // end namespace comments
 
} // end namespace clang
 
 
 
#endif