Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. //===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. //  This file defines the lexer for structured comments and its token class.
  10. //
  11. //===----------------------------------------------------------------------===//
  12.  
  13. #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
  14. #define LLVM_CLANG_AST_COMMENTLEXER_H
  15.  
  16. #include "clang/Basic/Diagnostic.h"
  17. #include "clang/Basic/SourceManager.h"
  18. #include "llvm/ADT/SmallString.h"
  19. #include "llvm/ADT/StringRef.h"
  20. #include "llvm/Support/Allocator.h"
  21. #include "llvm/Support/raw_ostream.h"
  22.  
  23. namespace clang {
  24. namespace comments {
  25.  
  26. class Lexer;
  27. class TextTokenRetokenizer;
  28. struct CommandInfo;
  29. class CommandTraits;
  30.  
  31. namespace tok {
  32. enum TokenKind {
  33.   eof,
  34.   newline,
  35.   text,
  36.   unknown_command,   // Command that does not have an ID.
  37.   backslash_command, // Command with an ID, that used backslash marker.
  38.   at_command,        // Command with an ID, that used 'at' marker.
  39.   verbatim_block_begin,
  40.   verbatim_block_line,
  41.   verbatim_block_end,
  42.   verbatim_line_name,
  43.   verbatim_line_text,
  44.   html_start_tag,     // <tag
  45.   html_ident,         // attr
  46.   html_equals,        // =
  47.   html_quoted_string, // "blah\"blah" or 'blah\'blah'
  48.   html_greater,       // >
  49.   html_slash_greater, // />
  50.   html_end_tag        // </tag
  51. };
  52. } // end namespace tok
  53.  
  54. /// Comment token.
  55. class Token {
  56.   friend class Lexer;
  57.   friend class TextTokenRetokenizer;
  58.  
  59.   /// The location of the token.
  60.   SourceLocation Loc;
  61.  
  62.   /// The actual kind of the token.
  63.   tok::TokenKind Kind;
  64.  
  65.   /// Integer value associated with a token.
  66.   ///
  67.   /// If the token is a known command, contains command ID and TextPtr is
  68.   /// unused (command spelling can be found with CommandTraits).  Otherwise,
  69.   /// contains the length of the string that starts at TextPtr.
  70.   unsigned IntVal;
  71.  
  72.   /// Length of the token spelling in comment.  Can be 0 for synthenized
  73.   /// tokens.
  74.   unsigned Length;
  75.  
  76.   /// Contains text value associated with a token.
  77.   const char *TextPtr;
  78.  
  79. public:
  80.   SourceLocation getLocation() const LLVM_READONLY { return Loc; }
  81.   void setLocation(SourceLocation SL) { Loc = SL; }
  82.  
  83.   SourceLocation getEndLocation() const LLVM_READONLY {
  84.     if (Length == 0 || Length == 1)
  85.       return Loc;
  86.     return Loc.getLocWithOffset(Length - 1);
  87.   }
  88.  
  89.   tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
  90.   void setKind(tok::TokenKind K) { Kind = K; }
  91.  
  92.   bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
  93.   bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
  94.  
  95.   unsigned getLength() const LLVM_READONLY { return Length; }
  96.   void setLength(unsigned L) { Length = L; }
  97.  
  98.   StringRef getText() const LLVM_READONLY {
  99.     assert(is(tok::text));
  100.     return StringRef(TextPtr, IntVal);
  101.   }
  102.  
  103.   void setText(StringRef Text) {
  104.     assert(is(tok::text));
  105.     TextPtr = Text.data();
  106.     IntVal = Text.size();
  107.   }
  108.  
  109.   StringRef getUnknownCommandName() const LLVM_READONLY {
  110.     assert(is(tok::unknown_command));
  111.     return StringRef(TextPtr, IntVal);
  112.   }
  113.  
  114.   void setUnknownCommandName(StringRef Name) {
  115.     assert(is(tok::unknown_command));
  116.     TextPtr = Name.data();
  117.     IntVal = Name.size();
  118.   }
  119.  
  120.   unsigned getCommandID() const LLVM_READONLY {
  121.     assert(is(tok::backslash_command) || is(tok::at_command));
  122.     return IntVal;
  123.   }
  124.  
  125.   void setCommandID(unsigned ID) {
  126.     assert(is(tok::backslash_command) || is(tok::at_command));
  127.     IntVal = ID;
  128.   }
  129.  
  130.   unsigned getVerbatimBlockID() const LLVM_READONLY {
  131.     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
  132.     return IntVal;
  133.   }
  134.  
  135.   void setVerbatimBlockID(unsigned ID) {
  136.     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
  137.     IntVal = ID;
  138.   }
  139.  
  140.   StringRef getVerbatimBlockText() const LLVM_READONLY {
  141.     assert(is(tok::verbatim_block_line));
  142.     return StringRef(TextPtr, IntVal);
  143.   }
  144.  
  145.   void setVerbatimBlockText(StringRef Text) {
  146.     assert(is(tok::verbatim_block_line));
  147.     TextPtr = Text.data();
  148.     IntVal = Text.size();
  149.   }
  150.  
  151.   unsigned getVerbatimLineID() const LLVM_READONLY {
  152.     assert(is(tok::verbatim_line_name));
  153.     return IntVal;
  154.   }
  155.  
  156.   void setVerbatimLineID(unsigned ID) {
  157.     assert(is(tok::verbatim_line_name));
  158.     IntVal = ID;
  159.   }
  160.  
  161.   StringRef getVerbatimLineText() const LLVM_READONLY {
  162.     assert(is(tok::verbatim_line_text));
  163.     return StringRef(TextPtr, IntVal);
  164.   }
  165.  
  166.   void setVerbatimLineText(StringRef Text) {
  167.     assert(is(tok::verbatim_line_text));
  168.     TextPtr = Text.data();
  169.     IntVal = Text.size();
  170.   }
  171.  
  172.   StringRef getHTMLTagStartName() const LLVM_READONLY {
  173.     assert(is(tok::html_start_tag));
  174.     return StringRef(TextPtr, IntVal);
  175.   }
  176.  
  177.   void setHTMLTagStartName(StringRef Name) {
  178.     assert(is(tok::html_start_tag));
  179.     TextPtr = Name.data();
  180.     IntVal = Name.size();
  181.   }
  182.  
  183.   StringRef getHTMLIdent() const LLVM_READONLY {
  184.     assert(is(tok::html_ident));
  185.     return StringRef(TextPtr, IntVal);
  186.   }
  187.  
  188.   void setHTMLIdent(StringRef Name) {
  189.     assert(is(tok::html_ident));
  190.     TextPtr = Name.data();
  191.     IntVal = Name.size();
  192.   }
  193.  
  194.   StringRef getHTMLQuotedString() const LLVM_READONLY {
  195.     assert(is(tok::html_quoted_string));
  196.     return StringRef(TextPtr, IntVal);
  197.   }
  198.  
  199.   void setHTMLQuotedString(StringRef Str) {
  200.     assert(is(tok::html_quoted_string));
  201.     TextPtr = Str.data();
  202.     IntVal = Str.size();
  203.   }
  204.  
  205.   StringRef getHTMLTagEndName() const LLVM_READONLY {
  206.     assert(is(tok::html_end_tag));
  207.     return StringRef(TextPtr, IntVal);
  208.   }
  209.  
  210.   void setHTMLTagEndName(StringRef Name) {
  211.     assert(is(tok::html_end_tag));
  212.     TextPtr = Name.data();
  213.     IntVal = Name.size();
  214.   }
  215.  
  216.   void dump(const Lexer &L, const SourceManager &SM) const;
  217. };
  218.  
  219. /// Comment lexer.
  220. class Lexer {
  221. private:
  222.   Lexer(const Lexer &) = delete;
  223.   void operator=(const Lexer &) = delete;
  224.  
  225.   /// Allocator for strings that are semantic values of tokens and have to be
  226.   /// computed (for example, resolved decimal character references).
  227.   llvm::BumpPtrAllocator &Allocator;
  228.  
  229.   DiagnosticsEngine &Diags;
  230.  
  231.   const CommandTraits &Traits;
  232.  
  233.   const char *const BufferStart;
  234.   const char *const BufferEnd;
  235.  
  236.   const char *BufferPtr;
  237.  
  238.   /// One past end pointer for the current comment.  For BCPL comments points
  239.   /// to newline or BufferEnd, for C comments points to star in '*/'.
  240.   const char *CommentEnd;
  241.  
  242.   SourceLocation FileLoc;
  243.  
  244.   /// If true, the commands, html tags, etc will be parsed and reported as
  245.   /// separate tokens inside the comment body. If false, the comment text will
  246.   /// be parsed into text and newline tokens.
  247.   bool ParseCommands;
  248.  
  249.   enum LexerCommentState : uint8_t {
  250.     LCS_BeforeComment,
  251.     LCS_InsideBCPLComment,
  252.     LCS_InsideCComment,
  253.     LCS_BetweenComments
  254.   };
  255.  
  256.   /// Low-level lexer state, track if we are inside or outside of comment.
  257.   LexerCommentState CommentState;
  258.  
  259.   enum LexerState : uint8_t {
  260.     /// Lexing normal comment text
  261.     LS_Normal,
  262.  
  263.     /// Finished lexing verbatim block beginning command, will lex first body
  264.     /// line.
  265.     LS_VerbatimBlockFirstLine,
  266.  
  267.     /// Lexing verbatim block body line-by-line, skipping line-starting
  268.     /// decorations.
  269.     LS_VerbatimBlockBody,
  270.  
  271.     /// Finished lexing verbatim line beginning command, will lex text (one
  272.     /// line).
  273.     LS_VerbatimLineText,
  274.  
  275.     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
  276.     LS_HTMLStartTag,
  277.  
  278.     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
  279.     LS_HTMLEndTag
  280.   };
  281.  
  282.   /// Current lexing mode.
  283.   LexerState State;
  284.  
  285.   /// If State is LS_VerbatimBlock, contains the name of verbatim end
  286.   /// command, including command marker.
  287.   SmallString<16> VerbatimBlockEndCommandName;
  288.  
  289.   /// Given a character reference name (e.g., "lt"), return the character that
  290.   /// it stands for (e.g., "<").
  291.   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
  292.  
  293.   /// Given a Unicode codepoint as base-10 integer, return the character.
  294.   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
  295.  
  296.   /// Given a Unicode codepoint as base-16 integer, return the character.
  297.   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
  298.  
  299.   void formTokenWithChars(Token &Result, const char *TokEnd,
  300.                           tok::TokenKind Kind);
  301.  
  302.   void formTextToken(Token &Result, const char *TokEnd) {
  303.     StringRef Text(BufferPtr, TokEnd - BufferPtr);
  304.     formTokenWithChars(Result, TokEnd, tok::text);
  305.     Result.setText(Text);
  306.   }
  307.  
  308.   SourceLocation getSourceLocation(const char *Loc) const {
  309.     assert(Loc >= BufferStart && Loc <= BufferEnd &&
  310.            "Location out of range for this buffer!");
  311.  
  312.     const unsigned CharNo = Loc - BufferStart;
  313.     return FileLoc.getLocWithOffset(CharNo);
  314.   }
  315.  
  316.   DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
  317.     return Diags.Report(Loc, DiagID);
  318.   }
  319.  
  320.   /// Eat string matching regexp \code \s*\* \endcode.
  321.   void skipLineStartingDecorations();
  322.  
  323.   /// Skip over pure text.
  324.   const char *skipTextToken();
  325.  
  326.   /// Lex comment text, including commands if ParseCommands is set to true.
  327.   void lexCommentText(Token &T);
  328.  
  329.   void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
  330.                                 const CommandInfo *Info);
  331.  
  332.   void lexVerbatimBlockFirstLine(Token &T);
  333.  
  334.   void lexVerbatimBlockBody(Token &T);
  335.  
  336.   void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
  337.                                const CommandInfo *Info);
  338.  
  339.   void lexVerbatimLineText(Token &T);
  340.  
  341.   void lexHTMLCharacterReference(Token &T);
  342.  
  343.   void setupAndLexHTMLStartTag(Token &T);
  344.  
  345.   void lexHTMLStartTag(Token &T);
  346.  
  347.   void setupAndLexHTMLEndTag(Token &T);
  348.  
  349.   void lexHTMLEndTag(Token &T);
  350.  
  351. public:
  352.   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
  353.         const CommandTraits &Traits, SourceLocation FileLoc,
  354.         const char *BufferStart, const char *BufferEnd,
  355.         bool ParseCommands = true);
  356.  
  357.   void lex(Token &T);
  358.  
  359.   StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
  360. };
  361.  
  362. } // end namespace comments
  363. } // end namespace clang
  364.  
  365. #endif
  366.  
  367.