Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. //===- Lexer.h - C Language Family Lexer ------------------------*- C++ -*-===//
  2. //
  3. // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4. // See https://llvm.org/LICENSE.txt for license information.
  5. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6. //
  7. //===----------------------------------------------------------------------===//
  8. //
  9. //  This file defines the Lexer interface.
  10. //
  11. //===----------------------------------------------------------------------===//
  12.  
  13. #ifndef LLVM_CLANG_LEX_LEXER_H
  14. #define LLVM_CLANG_LEX_LEXER_H
  15.  
  16. #include "clang/Basic/LangOptions.h"
  17. #include "clang/Basic/SourceLocation.h"
  18. #include "clang/Basic/TokenKinds.h"
  19. #include "clang/Lex/DependencyDirectivesScanner.h"
  20. #include "clang/Lex/PreprocessorLexer.h"
  21. #include "clang/Lex/Token.h"
  22. #include "llvm/ADT/SmallVector.h"
  23. #include "llvm/ADT/StringRef.h"
  24. #include <cassert>
  25. #include <cstdint>
  26. #include <optional>
  27. #include <string>
  28.  
  29. namespace llvm {
  30.  
  31. class MemoryBufferRef;
  32.  
  33. } // namespace llvm
  34.  
  35. namespace clang {
  36.  
  37. class DiagnosticBuilder;
  38. class Preprocessor;
  39. class SourceManager;
  40. class LangOptions;
  41.  
  42. /// ConflictMarkerKind - The kinds of conflict marker that the lexer might
  43. /// currently be recovering from.
  44. enum ConflictMarkerKind {
  45.   /// The lexer is not within a conflict marker.
  46.   CMK_None,
  47.  
  48.   /// A normal or diff3 conflict marker: initiated by at least 7 "<"s,
  49.   /// separated by at least 7 "="s or "|"s, and terminated by at least 7 ">"s.
  50.   CMK_Normal,
  51.  
  52.   /// A Perforce-style conflict marker: initiated by 4 ">"s, separated by
  53.   /// 4 "="s, and terminated by 4 "<"s.
  54.   CMK_Perforce
  55. };
  56.  
  57. /// Describes the extent of the preamble (its size in bytes) together with a
  58. /// flag required by PreprocessorOptions::PrecompiledPreambleBytes.
  59. /// The preamble includes the BOM, if any.
  60. struct PreambleBounds {
  61.   /// Size of the preamble in bytes.
  62.   unsigned Size;
  63.  
  64.   /// Whether the preamble ends at the start of a new line.
  65.   ///
  66.   /// Used to inform the lexer as to whether it's starting at the beginning of
  67.   /// a line after skipping the preamble.
  68.   bool PreambleEndsAtStartOfLine;
  69.   /// Initializes both members from the like-named constructor arguments.
  70.   PreambleBounds(unsigned Size, bool PreambleEndsAtStartOfLine)
  71.       : Size(Size), PreambleEndsAtStartOfLine(PreambleEndsAtStartOfLine) {}
  72. };
  73.  
  74. /// Lexer - This provides a simple interface that turns a text buffer into a
  75. /// stream of tokens.  This provides no support for file reading or buffering,
  76. /// or buffering/seeking of tokens, only forward lexing is supported.  It relies
  77. /// on the specified Preprocessor object to handle preprocessor directives, etc.
  78. class Lexer : public PreprocessorLexer {
  79.   friend class Preprocessor;
  80.  
  81.   void anchor() override;
  82.  
  83.   //===--------------------------------------------------------------------===//
  84.   // Constant configuration values for this lexer.
  85.  
  86.   // Start of the buffer.
  87.   const char *BufferStart;
  88.  
  89.   // End of the buffer.
  90.   const char *BufferEnd;
  91.  
  92.   // Location for start of file.
  93.   SourceLocation FileLoc;
  94.  
  95.   // LangOpts enabled by this language.
  96.   // Storing LangOptions as reference here is important from performance point
  97.   // of view. Lack of reference means that LangOptions copy constructor would be
  98.   // called by Lexer(..., const LangOptions &LangOpts,...). Given that local
  99.   // Lexer objects are created thousands of times (in Lexer::getRawToken,
  100.   // Preprocessor::EnterSourceFile and other places) during single module
  101.   // processing in frontend it would make std::vector<std::string> copy
  102.   // constructors surprisingly hot.
  103.   const LangOptions &LangOpts;
  104.  
  105.   // True if '//' line comments are enabled.
  106.   bool LineComment;
  107.  
  108.   // True if lexer for _Pragma handling.
  109.   bool Is_PragmaLexer;
  110.  
  111.   //===--------------------------------------------------------------------===//
  112.   // Context-specific lexing flags set by the preprocessor.
  113.   //
  114.  
  115.   /// ExtendedTokenMode - The lexer can optionally keep comments and whitespace
  116.   /// and return them as tokens.  This is used for -C and -CC modes, and
  117.   /// whitespace preservation can be useful for some clients that want to lex
  118.   /// the file in raw mode and get every character from the file.
  119.   ///
  120.   /// When this is set to 2 it returns comments and whitespace.  When set to 1
  121.   /// it returns comments, when it is set to 0 it returns normal tokens only.
  122.   unsigned char ExtendedTokenMode;
  123.  
  124.   //===--------------------------------------------------------------------===//
  125.   // Context that changes as the file is lexed.
  126.   // NOTE: any state that mutates when in raw mode must have save/restore code
  127.   // in Lexer::isNextPPTokenLParen.
  128.  
  129.   // BufferPtr - Current pointer into the buffer.  This is the next character
  130.   // to be lexed.
  131.   const char *BufferPtr;
  132.  
  133.   // IsAtStartOfLine - True if the next lexed token should get the "start of
  134.   // line" flag set on it.
  135.   bool IsAtStartOfLine;
  136.  
  137.   bool IsAtPhysicalStartOfLine;
  138.  
  139.   bool HasLeadingSpace;
  140.  
  141.   bool HasLeadingEmptyMacro;
  142.  
  143.   /// True if this is the first time we're lexing the input file.
  144.   bool IsFirstTimeLexingFile;
  145.  
  146.   // NewLinePtr - A pointer to the new line character '\n' being lexed. For
  147.   // '\r\n', it also points to the '\n'.
  148.   const char *NewLinePtr;
  149.  
  150.   // CurrentConflictMarkerState - The kind of conflict marker we are handling.
  151.   ConflictMarkerKind CurrentConflictMarkerState;
  152.  
  153.   /// Non-empty if this \p Lexer is \p isDependencyDirectivesLexer().
  154.   ArrayRef<dependency_directives_scan::Directive> DepDirectives;
  155.  
  156.   /// If this \p Lexer is \p isDependencyDirectivesLexer(), it represents the
  157.   /// next token to use from the current dependency directive.
  158.   unsigned NextDepDirectiveTokenIndex = 0;
  159.  
  160.   void InitLexer(const char *BufStart, const char *BufPtr, const char *BufEnd);
  161.  
  162. public:
  163.   /// Lexer constructor - Create a new lexer object for the specified buffer
  164.   /// with the specified preprocessor managing the lexing process.  This lexer
  165.   /// assumes that the associated file buffer and Preprocessor objects will
  166.   /// outlive it, so it doesn't take ownership of either of them.
  167.   Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile, Preprocessor &PP,
  168.         bool IsFirstIncludeOfFile = true);
  169.  
  170.   /// Lexer constructor - Create a new raw lexer object.  This object is only
  171.   /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
  172.   /// text range will outlive it, so it doesn't take ownership of it.
  173.   Lexer(SourceLocation FileLoc, const LangOptions &LangOpts,
  174.         const char *BufStart, const char *BufPtr, const char *BufEnd,
  175.         bool IsFirstIncludeOfFile = true);
  176.  
  177.   /// Lexer constructor - Create a new raw lexer object.  This object is only
  178.   /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the
  179.   /// text range will outlive it, so it doesn't take ownership of it.
  180.   Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
  181.         const SourceManager &SM, const LangOptions &LangOpts,
  182.         bool IsFirstIncludeOfFile = true);
  183.  
  184.   Lexer(const Lexer &) = delete;
  185.   Lexer &operator=(const Lexer &) = delete;
  186.  
  187.   /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
  188.   /// _Pragma expansion.  This has a variety of magic semantics that this method
  189.   /// sets up.  It returns a new'd Lexer that must be delete'd when done.
  190.   static Lexer *Create_PragmaLexer(SourceLocation SpellingLoc,
  191.                                    SourceLocation ExpansionLocStart,
  192.                                    SourceLocation ExpansionLocEnd,
  193.                                    unsigned TokLen, Preprocessor &PP);
  194.  
  195.   /// getFileLoc - Return FileLoc, the location for the start of the file we
  196.   /// are lexing out of.  The physical location encodes where the characters
  197.   /// come from; the virtual location encodes where we should *claim* the
  198.   /// characters came from.  Currently this is only used by _Pragma handling.
  199.   SourceLocation getFileLoc() const { return FileLoc; }
  200.  
  201. private:
  202.   /// Lex - Return the next token in the file.  If this is the end of file, it
  203.   /// returns the tok::eof token.  This implicitly involves the preprocessor.
  204.   bool Lex(Token &Result);
  205.  
  206.   /// Called when the preprocessor is in 'dependency scanning lexing mode'.
  207.   bool LexDependencyDirectiveToken(Token &Result);
  208.  
  209.   /// Called when the preprocessor is in 'dependency scanning lexing mode' and
  210.   /// is skipping a conditional block.
  211.   bool LexDependencyDirectiveTokenWhileSkipping(Token &Result);
  212.  
  213.   /// True if \p DepDirectives is non-empty, i.e. the preprocessor created this
  214.   /// \p Lexer to lex dependency directive tokens in dependency scanning mode.
  215.   bool isDependencyDirectivesLexer() const { return !DepDirectives.empty(); }
  216.  
  217.   /// Initializes \p Result with data from \p DDTok and advances \p BufferPtr to
  218.   /// the position just after the token.
  219.   /// \returns the buffer pointer at the beginning of the token.
  220.   const char *convertDependencyDirectiveToken(
  221.       const dependency_directives_scan::Token &DDTok, Token &Result);
  222.  
  223. public:
  224.   /// isPragmaLexer - Returns true if this Lexer was set up for _Pragma lexing.
  225.   bool isPragmaLexer() const { return Is_PragmaLexer; }
  226.  
  227. private:
  228.   /// IndirectLex - Forwards to 'Lex' so that it can be invoked through the
  229.   /// PreprocessorLexer interface.  The bool returned by 'Lex' is discarded.
  230.   void IndirectLex(Token &Result) override { Lex(Result); }
  231.  
  232. public:
  233.   /// LexFromRawLexer - Lex a token from a designated raw lexer (one with no
  234.   /// associated preprocessor object).  Return true if the 'next character to
  235.   /// read' pointer points at the end of the lexer buffer, false otherwise.
  236.   bool LexFromRawLexer(Token &Result) {
  237.     assert(LexingRawMode && "Not already in raw mode!");
  238.     Lex(Result);
  239.     // The result of Lex is ignored.  Note that lexing to the end of the buffer
  240.     // doesn't implicitly delete the lexer when in raw mode.
  241.     return BufferPtr == BufferEnd;
  242.   }
  243.  
  244.   /// isKeepWhitespaceMode - True when the lexer is configured to hand back a
  245.   /// token for every character in the file, whitespace and comments included.
  246.   /// Only meant for raw mode: the preprocessor is not prepared to deal with
  247.   /// the extra tokens.
  248.   bool isKeepWhitespaceMode() const {
  249.     return ExtendedTokenMode >= 2;
  250.   }
  251.  
  252.   /// SetKeepWhitespaceMode - Enable or disable whitespace retention mode by
  253.   /// setting ExtendedTokenMode to 2 or 0; disabling also drops comment retention.
  254.   void SetKeepWhitespaceMode(bool Val) {
  255.     assert((!Val || LexingRawMode || LangOpts.TraditionalCPP) &&
  256.            "Can only retain whitespace in raw mode or -traditional-cpp");
  257.     ExtendedTokenMode = Val ? 2 : 0;
  258.   }
  259.  
  260.   /// inKeepCommentMode - True when the lexer hands comments back as tokens,
  261.   /// i.e. whenever ExtendedTokenMode is non-zero.
  262.   bool inKeepCommentMode() const {
  263.     return ExtendedTokenMode != 0;
  264.   }
  265.  
  266.   /// SetCommentRetentionState - Change the comment retention mode of the lexer
  267.   /// to the specified mode.  This is really only useful when lexing in raw
  268.   /// mode, because otherwise the lexer needs to manage this.
  269.   void SetCommentRetentionState(bool Mode) {
  270.     assert(!isKeepWhitespaceMode() &&
  271.            "Can't play with comment retention state when retaining whitespace");
  272.     ExtendedTokenMode = Mode ? 1 : 0;
  273.   }
  274.  
  275.   /// Sets the extended token mode back to its initial value, according to the
  276.   /// language options and preprocessor. This controls whether the lexer
  277.   /// produces comment and whitespace tokens.
  278.   ///
  279.   /// This requires the lexer to have an associated preprocessor. A standalone
  280.   /// lexer has nothing to reset to.
  281.   void resetExtendedTokenMode();
  282.  
  283.   /// Gets the source code buffer: the bytes from BufferStart up to BufferEnd.
  284.   StringRef getBuffer() const {
  285.     return StringRef(BufferStart, BufferEnd - BufferStart);
  286.   }
  287.  
  288.   /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
  289.   /// uninterpreted string.  This switches the lexer out of directive mode.
  290.   void ReadToEndOfLine(SmallVectorImpl<char> *Result = nullptr);
  291.  
  292.  
  293.   /// Diag - Forwarding function for diagnostics.  This translates a source
  294.   /// position in the current buffer into a SourceLocation object for rendering.
  295.   DiagnosticBuilder Diag(const char *Loc, unsigned DiagID) const;
  296.  
  297.   /// getSourceLocation - Return a source location identifier for the specified
  298.   /// offset in the current file.
  299.   SourceLocation getSourceLocation(const char *Loc, unsigned TokLen = 1) const;
  300.  
  301.   /// getSourceLocation - Return a source location for the next character in
  302.   /// the current file, i.e. for BufferPtr with the default TokLen of 1.
  303.   SourceLocation getSourceLocation() override {
  304.     return getSourceLocation(BufferPtr);
  305.   }
  306.  
  307.   /// Return BufferPtr, i.e. the next character in the buffer to be lexed.
  308.   const char *getBufferLocation() const { return BufferPtr; }
  309.  
  310.   /// Returns the current lexing offset, i.e. BufferPtr - BufferStart in bytes.
  311.   unsigned getCurrentBufferOffset() const {
  312.     assert(BufferPtr >= BufferStart && "Invalid buffer state");
  313.     return static_cast<unsigned>(BufferPtr - BufferStart);
  314.   }
  315.  
  316.   /// Set the lexer's buffer pointer to \p Offset.
  317.   void seek(unsigned Offset, bool IsAtStartOfLine);
  318.  
  319.   /// Stringify - Convert the specified string into a C string by i) escaping
  320.   /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
  321.   /// If Charify is true, this escapes the ' character instead of ".
  322.   static std::string Stringify(StringRef Str, bool Charify = false);
  323.  
  324.   /// Stringify - Convert the specified string into a C string by i) escaping
  325.   /// '\\' and " characters and ii) replacing newline character(s) with "\\n".
  326.   static void Stringify(SmallVectorImpl<char> &Str);
  327.  
  328.   /// getSpelling - This method is used to get the spelling of a token into a
  329.   /// preallocated buffer, instead of as an std::string.  The caller is required
  330.   /// to allocate enough space for the token, which is guaranteed to be at least
  331.   /// Tok.getLength() bytes long.  The length of the actual result is returned.
  332.   ///
  333.   /// Note that this method may do two possible things: it may either fill in
  334.   /// the buffer specified with characters, or it may *change the input pointer*
  335.   /// to point to a constant buffer with the data already in it (avoiding a
  336.   /// copy).  The caller is not allowed to modify the returned buffer pointer
  337.   /// if an internal buffer is returned.
  338.   static unsigned getSpelling(const Token &Tok, const char *&Buffer,
  339.                               const SourceManager &SourceMgr,
  340.                               const LangOptions &LangOpts,
  341.                               bool *Invalid = nullptr);
  342.  
  343.   /// getSpelling() - Return the 'spelling' of the Tok token.  The spelling of a
  344.   /// token is the characters used to represent the token in the source file
  345.   /// after trigraph expansion and escaped-newline folding.  In particular, this
  346.   /// wants to get the true, uncanonicalized, spelling of things like digraphs,
  347.   /// UCNs, etc.
  348.   static std::string getSpelling(const Token &Tok,
  349.                                  const SourceManager &SourceMgr,
  350.                                  const LangOptions &LangOpts,
  351.                                  bool *Invalid = nullptr);
  352.  
  353.   /// getSpelling - This method is used to get the spelling of the
  354.   /// token at the given source location.  If, as is usually true, it
  355.   /// is not necessary to copy any data, then the returned string may
  356.   /// not point into the provided buffer.
  357.   ///
  358.   /// This method lexes at the expansion depth of the given
  359.   /// location and does not jump to the expansion or spelling
  360.   /// location.
  361.   static StringRef getSpelling(SourceLocation loc,
  362.                                SmallVectorImpl<char> &buffer,
  363.                                const SourceManager &SM,
  364.                                const LangOptions &options,
  365.                                bool *invalid = nullptr);
  366.  
  367.   /// MeasureTokenLength - Relex the token at the specified location and return
  368.   /// its length in bytes in the input file.  If the token needs cleaning (e.g.
  369.   /// includes a trigraph or an escaped newline) then this count includes bytes
  370.   /// that are part of that.
  371.   static unsigned MeasureTokenLength(SourceLocation Loc,
  372.                                      const SourceManager &SM,
  373.                                      const LangOptions &LangOpts);
  374.  
  375.   /// Relex the token at the specified location.
  376.   /// \returns true if there was a failure, false on success.
  377.   static bool getRawToken(SourceLocation Loc, Token &Result,
  378.                           const SourceManager &SM,
  379.                           const LangOptions &LangOpts,
  380.                           bool IgnoreWhiteSpace = false);
  381.  
  382.   /// Given a location anywhere in a source buffer, find the location
  383.   /// that corresponds to the beginning of the token in which the original
  384.   /// source location lands.
  385.   static SourceLocation GetBeginningOfToken(SourceLocation Loc,
  386.                                             const SourceManager &SM,
  387.                                             const LangOptions &LangOpts);
  388.  
  389.   /// Get the physical length (including trigraphs and escaped newlines) of the
  390.   /// first \p CharNo characters of the token starting at \p TokStart.
  391.   static unsigned getTokenPrefixLength(SourceLocation TokStart,
  392.                                        unsigned CharNo,
  393.                                        const SourceManager &SM,
  394.                                        const LangOptions &LangOpts);
  395.  
  396.   /// AdvanceToTokenCharacter - If the current SourceLocation specifies a
  397.   /// location at the start of a token, return a new location that specifies a
  398.   /// character within the token.  This handles trigraphs and escaped newlines.
  399.   static SourceLocation AdvanceToTokenCharacter(SourceLocation TokStart,
  400.                                                 unsigned Characters,
  401.                                                 const SourceManager &SM,
  402.                                                 const LangOptions &LangOpts) {
  403.     return TokStart.getLocWithOffset(
  404.         getTokenPrefixLength(TokStart, Characters, SM, LangOpts));
  405.   }
  406.  
  407.   /// Computes the source location just past the end of the
  408.   /// token at this source location.
  409.   ///
  410.   /// This routine can be used to produce a source location that
  411.   /// points just past the end of the token referenced by \p Loc, and
  412.   /// is generally used when a diagnostic needs to point just after a
  413.   /// token where it expected something different than it received. If
  414.   /// the returned source location would not be meaningful (e.g., if
  415.   /// it points into a macro), this routine returns an invalid
  416.   /// source location.
  417.   ///
  418.   /// \param Offset an offset from the end of the token, where the source
  419.   /// location should refer to. The default offset (0) produces a source
  420.   /// location pointing just past the end of the token; an offset of 1 produces
  421.   /// a source location pointing to the last character in the token, etc.
  422.   static SourceLocation getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
  423.                                             const SourceManager &SM,
  424.                                             const LangOptions &LangOpts);
  425.  
  426.   /// Given a token range, produce a corresponding CharSourceRange that
  427.   /// is not a token range. This allows the source range to be used by
  428.   /// components that don't have access to the lexer and thus can't find the
  429.   /// end of the range for themselves.
  430.   static CharSourceRange getAsCharRange(SourceRange Range,
  431.                                         const SourceManager &SM,
  432.                                         const LangOptions &LangOpts) {
  433.     SourceLocation End = getLocForEndOfToken(Range.getEnd(), 0, SM, LangOpts);
  434.     return End.isInvalid() ? CharSourceRange()
  435.                            : CharSourceRange::getCharRange(
  436.                                  Range.getBegin(), End);
  437.   }
  438.   static CharSourceRange getAsCharRange(CharSourceRange Range,
  439.                                         const SourceManager &SM,
  440.                                         const LangOptions &LangOpts) {
  441.     return Range.isTokenRange()
  442.                ? getAsCharRange(Range.getAsRange(), SM, LangOpts)
  443.                : Range;
  444.   }
  445.  
  446.   /// Returns true if the given MacroID location points at the first
  447.   /// token of the macro expansion.
  448.   ///
  449.   /// \param MacroBegin If non-null and function returns true, it is set to
  450.   /// begin location of the macro.
  451.   static bool isAtStartOfMacroExpansion(SourceLocation loc,
  452.                                         const SourceManager &SM,
  453.                                         const LangOptions &LangOpts,
  454.                                         SourceLocation *MacroBegin = nullptr);
  455.  
  456.   /// Returns true if the given MacroID location points at the last
  457.   /// token of the macro expansion.
  458.   ///
  459.   /// \param MacroEnd If non-null and function returns true, it is set to
  460.   /// end location of the macro.
  461.   static bool isAtEndOfMacroExpansion(SourceLocation loc,
  462.                                       const SourceManager &SM,
  463.                                       const LangOptions &LangOpts,
  464.                                       SourceLocation *MacroEnd = nullptr);
  465.  
  466.   /// Accepts a range and returns a character range with file locations.
  467.   ///
  468.   /// Returns a null range if a part of the range resides inside a macro
  469.   /// expansion or the range does not reside on the same FileID.
  470.   ///
  471.   /// This function is trying to deal with macros and return a range based on
  472.   /// file locations. The cases where it can successfully handle macros are:
  473.   ///
  474.   /// -begin or end range lies at the start or end of a macro expansion, in
  475.   ///  which case the location will be set to the expansion point, e.g:
  476.   ///    \#define M 1 2
  477.   ///    a M
  478.   /// If you have a range [a, 2] (where 2 came from the macro), the function
  479.   /// will return a range for "a M"
  480.   /// if you have range [a, 1], the function will fail because the range
  481.   /// overlaps with only a part of the macro
  482.   ///
  483.   /// -The macro is a function macro and the range can be mapped to the macro
  484.   ///  arguments, e.g:
  485.   ///    \#define M 1 2
  486.   ///    \#define FM(x) x
  487.   ///    FM(a b M)
  488.   /// if you have range [b, 2], the function will return the file range "b M"
  489.   /// inside the macro arguments.
  490.   /// if you have range [a, 2], the function will return the file range
  491.   /// "FM(a b M)" since the range includes all of the macro expansion.
  492.   static CharSourceRange makeFileCharRange(CharSourceRange Range,
  493.                                            const SourceManager &SM,
  494.                                            const LangOptions &LangOpts);
  495.  
  496.   /// Returns a string for the source that the range encompasses.
  497.   static StringRef getSourceText(CharSourceRange Range,
  498.                                  const SourceManager &SM,
  499.                                  const LangOptions &LangOpts,
  500.                                  bool *Invalid = nullptr);
  501.  
  502.   /// Retrieve the name of the immediate macro expansion.
  503.   ///
  504.   /// This routine starts from a source location, and finds the name of the macro
  505.   /// responsible for its immediate expansion. It looks through any intervening
  506.   /// macro argument expansions to compute this. It returns a StringRef which
  507.   /// refers to the SourceManager-owned buffer of the source where that macro
  508.   /// name is spelled. Thus, the result shouldn't out-live that SourceManager.
  509.   static StringRef getImmediateMacroName(SourceLocation Loc,
  510.                                          const SourceManager &SM,
  511.                                          const LangOptions &LangOpts);
  512.  
  513.   /// Retrieve the name of the immediate macro expansion.
  514.   ///
  515.   /// This routine starts from a source location, and finds the name of the
  516.   /// macro responsible for its immediate expansion. It looks through any
  517.   /// intervening macro argument expansions to compute this. It returns a
  518.   /// StringRef which refers to the SourceManager-owned buffer of the source
  519.   /// where that macro name is spelled. Thus, the result shouldn't out-live
  520.   /// that SourceManager.
  521.   ///
  522.   /// This differs from Lexer::getImmediateMacroName in that any macro argument
  523.   /// location will result in the topmost function macro that accepted it.
  524.   /// e.g.
  525.   /// \code
  526.   ///   MAC1( MAC2(foo) )
  527.   /// \endcode
  528.   /// for location of 'foo' token, this function will return "MAC1" while
  529.   /// Lexer::getImmediateMacroName will return "MAC2".
  530.   static StringRef getImmediateMacroNameForDiagnostics(
  531.       SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts);
  532.  
  533.   /// Compute the preamble of the given file.
  534.   ///
  535.   /// The preamble of a file contains the initial comments, include directives,
  536.   /// and other preprocessor directives that occur before the code in this
  537.   /// particular file actually begins. The preamble of the main source file is
  538.   /// a potential prefix header.
  539.   ///
  540.   /// \param Buffer The memory buffer containing the file's contents.
  541.   ///
  542.   /// \param MaxLines If non-zero, restrict the length of the preamble
  543.   /// to fewer than this number of lines.
  544.   ///
  545.   /// \returns The offset into the file where the preamble ends and the rest
  546.   /// of the file begins along with a boolean value indicating whether
  547.   /// the preamble ends at the beginning of a new line.
  548.   static PreambleBounds ComputePreamble(StringRef Buffer,
  549.                                         const LangOptions &LangOpts,
  550.                                         unsigned MaxLines = 0);
  551.  
  552.   /// Finds the token that comes right after the given location.
  553.   ///
  554.   /// Returns the next token, or none if the location is inside a macro.
  555.   static std::optional<Token> findNextToken(SourceLocation Loc,
  556.                                             const SourceManager &SM,
  557.                                             const LangOptions &LangOpts);
  558.  
  559.   /// Checks that the given token is the first token that occurs after
  560.   /// the given location (this excludes comments and whitespace). Returns the
  561.   /// location immediately after the specified token. If the token is not found
  562.   /// or the location is inside a macro, the returned source location will be
  563.   /// invalid.
  564.   static SourceLocation findLocationAfterToken(SourceLocation loc,
  565.                                          tok::TokenKind TKind,
  566.                                          const SourceManager &SM,
  567.                                          const LangOptions &LangOpts,
  568.                                          bool SkipTrailingWhitespaceAndNewLine);
  569.  
  570.   /// Returns true if the given character could appear in an identifier.
  571.   static bool isAsciiIdentifierContinueChar(char c,
  572.                                             const LangOptions &LangOpts);
  573.  
  574.   /// Checks whether the new line pointed to by \p Str is preceded by an
  575.   /// escape sequence.
  576.   static bool isNewLineEscaped(const char *BufferStart, const char *Str);
  577.  
  578.   /// getCharAndSizeNoWarn - Same job as the getCharAndSize method, except that
  579.   /// it never emits a warning.
  580.   static inline char getCharAndSizeNoWarn(const char *Ptr, unsigned &Size,
  581.                                           const LangOptions &LangOpts) {
  582.     // Anything that might be a trigraph, a UCN or an escaped newline is handed
  583.     // to the slow path with Size reset to 0.
  584.     if (!isObviouslySimpleCharacter(*Ptr)) {
  585.       Size = 0;
  586.       return getCharAndSizeSlowNoWarn(Ptr, Size, LangOpts);
  587.     }
  588.  
  589.     Size = 1;
  590.     return Ptr[0];
  591.   }
  592.  
  593.   /// Returns the leading whitespace for line that corresponds to the given
  594.   /// location \p Loc.
  595.   static StringRef getIndentationForLine(SourceLocation Loc,
  596.                                          const SourceManager &SM);
  597.  
  598.   /// Returns true if this is the first time we're lexing the input file.
  599.   bool isFirstTimeLexingFile() const { return IsFirstTimeLexingFile; }
  600.  
  601. private:
  602.   //===--------------------------------------------------------------------===//
  603.   // Internal implementation interfaces.
  604.  
  605.   /// LexTokenInternal - Internal interface to lex a preprocessing token. Called
  606.   /// by Lex.
  607.   ///
  608.   bool LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine);
  609.  
  610.   bool CheckUnicodeWhitespace(Token &Result, uint32_t C, const char *CurPtr);
  611.  
  612.   bool LexUnicodeIdentifierStart(Token &Result, uint32_t C, const char *CurPtr);
  613.  
  614.   /// FormTokenWithChars - When we lex a token, we have identified a span
  615.   /// starting at BufferPtr and ending at TokEnd that forms the token.  This
  616.   /// method assigns that range to Result as its location and length and sets
  617.   /// its kind to Kind.  In addition, since tokens cannot overlap, this also
  618.   /// updates BufferPtr to be TokEnd.
  619.   void FormTokenWithChars(Token &Result, const char *TokEnd,
  620.                           tok::TokenKind Kind) {
  621.     unsigned TokLen = TokEnd-BufferPtr;
  622.     Result.setLength(TokLen);
  623.     Result.setLocation(getSourceLocation(BufferPtr, TokLen));
  624.     Result.setKind(Kind);
  625.     BufferPtr = TokEnd;
  626.   }
  627.  
  628.   /// isNextPPTokenLParen - Return 1 if the next unexpanded token will be a
  629.   /// tok::l_paren token, 0 if it is something else, and 2 if there are no more
  630.   /// tokens in the buffer controlled by this lexer.
  ///
  /// The result is tri-state, not boolean: 2 is non-zero too, so callers must
  /// compare against the specific value they care about.
  631.   unsigned isNextPPTokenLParen();
  632.  
  633.   //===--------------------------------------------------------------------===//
  634.   // Lexer character reading interfaces.
  635.  
  636.   // This lexer is built on two interfaces for reading characters, both of which
  637.   // automatically provide phase 1/2 translation.  getAndAdvanceChar is used
  638.   // when we know that we will be reading a character from the input buffer and
  639.   // that this character will be part of the result token. This occurs in (e.g.)
  640.   // string processing, because we know we need to read until we find the
  641.   // closing '"' character.
  642.   //
  643.   // The second interface is the combination of getCharAndSize with
  644.   // ConsumeChar.  getCharAndSize reads a phase 1/2 translated character,
  645.   // returning it and its size.  If the lexer decides that this character is
  646.   // part of the current token, it calls ConsumeChar on it.  This two stage
  647.   // approach allows us to emit diagnostics for characters (e.g. warnings about
  648.   // trigraphs), knowing that they are only emitted if the character is
  649.   // consumed.
  650.  
  /// isObviouslySimpleCharacter - Cheap filter for the fast paths: true when
  /// \p C is certain to be the same character in translation phase 1 and in
  /// translation phase 3. The check is conservative; it may answer false for
  /// a character that turns out to need no mapping, but it never answers true
  /// for one that does.
  static bool isObviouslySimpleCharacter(char C) {
    // Only '?' and '\\' are ever sent down the slow path.
    return !(C == '?' || C == '\\');
  }
  658.  
  659.   /// getAndAdvanceChar - Read a single 'character' from the specified buffer,
  660.   /// advance over it, and return it.  This is tricky in several cases.  Here we
  661.   /// just handle the trivial case and fall-back to the non-inlined
  662.   /// getCharAndSizeSlow method to handle the hard case.
  663.   inline char getAndAdvanceChar(const char *&Ptr, Token &Tok) {
  664.     // If this is not a trigraph and not a UCN or escaped newline, return
  665.     // quickly.
  666.     if (isObviouslySimpleCharacter(Ptr[0])) return *Ptr++;
  667.  
  668.     unsigned Size = 0;
  669.     char C = getCharAndSizeSlow(Ptr, Size, &Tok);
  670.     Ptr += Size;
  671.     return C;
  672.   }
  673.  
  674.   /// ConsumeChar - When a character (identified by getCharAndSize) is consumed
  675.   /// and added to a given token, check to see if there are diagnostics that
  676.   /// need to be emitted or flags that need to be set on the token.  If so, do
  677.   /// it.
  678.   const char *ConsumeChar(const char *Ptr, unsigned Size, Token &Tok) {
  679.     // Normal case, we consumed exactly one token.  Just return it.
  680.     if (Size == 1)
  681.       return Ptr+Size;
  682.  
  683.     // Otherwise, re-lex the character with a current token, allowing
  684.     // diagnostics to be emitted and flags to be set.
  685.     Size = 0;
  686.     getCharAndSizeSlow(Ptr, Size, &Tok);
  687.     return Ptr+Size;
  688.   }
  689.  
  690.   /// getCharAndSize - Peek a single 'character' from the specified buffer,
  691.   /// get its size, and return it.  This is tricky in several cases.  Here we
  692.   /// just handle the trivial case and fall-back to the non-inlined
  693.   /// getCharAndSizeSlow method to handle the hard case.
  694.   inline char getCharAndSize(const char *Ptr, unsigned &Size) {
  695.     // If this is not a trigraph and not a UCN or escaped newline, return
  696.     // quickly.
  697.     if (isObviouslySimpleCharacter(Ptr[0])) {
  698.       Size = 1;
  699.       return *Ptr;
  700.     }
  701.  
  702.     Size = 0;
  703.     return getCharAndSizeSlow(Ptr, Size);
  704.   }
  705.  
  706.   /// getCharAndSizeSlow - Handle the slow/uncommon case of the getCharAndSize
  707.   /// method.
  ///
  /// \param Ptr  Position in the buffer to decode from.
  /// \param Size In/out byte count. The inline callers in this class set it to
  ///             0 before the call and afterwards advance by its value.
  /// \param Tok  Token being formed, or nullptr (the default). getCharAndSize
  ///             passes none when merely peeking; getAndAdvanceChar and
  ///             ConsumeChar pass the token so that diagnostics can be emitted
  ///             and flags set on it.
  708.   char getCharAndSizeSlow(const char *Ptr, unsigned &Size,
  709.                           Token *Tok = nullptr);
  710.  
  711.   /// getEscapedNewLineSize - Return the size of the specified escaped newline,
  712.   /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" on entry
  713.   /// to this function.
  ///
  /// \param P Points just past the backslash, i.e. at the first character that
  ///          could belong to the escaped newline.
  714.   static unsigned getEscapedNewLineSize(const char *P);
  715.  
  716.   /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
  717.   /// them), skip over them and return a pointer to the first character that
  718.   /// is not part of an escaped newline; otherwise return P unchanged.
  719.   static const char *SkipEscapedNewLines(const char *P);
  720.  
  721.   /// getCharAndSizeSlowNoWarn - Same as getCharAndSizeSlow, but never emits a
  722.   /// diagnostic.
  ///
  /// This is the out-of-line fallback of getCharAndSizeNoWarn, which sets
  /// \p Size to 0 before calling it. Being static, it takes the language
  /// options explicitly through \p LangOpts.
  723.   static char getCharAndSizeSlowNoWarn(const char *Ptr, unsigned &Size,
  724.                                        const LangOptions &LangOpts);
  725.  
  726.   //===--------------------------------------------------------------------===//
  727.   // Other lexer functions.
  728.  
  /// NOTE(review): undocumented in this header. Presumably repositions the
  /// lexer at byte \p Offset of its buffer and records whether that position
  /// is at the start of a line - confirm against the definition.
  729.   void SetByteOffset(unsigned Offset, bool StartOfLine);
  730.  
  /// NOTE(review): undocumented in this header. Judging by the name, carries
  /// start-of-line / leading-space state over from \p Result - confirm the
  /// direction and the exact flags against the definition.
  731.   void PropagateLineStartLeadingSpaceInfo(Token &Result);
  732.  
  /// NOTE(review): undocumented in this header. Presumably lexes a
  /// user-defined-literal suffix starting at \p CurPtr and returns the
  /// position after it; \p IsStringLiteral would then say which kind of
  /// literal precedes the suffix - confirm against the definition.
  733.   const char *LexUDSuffix(Token &Result, const char *CurPtr,
  734.                           bool IsStringLiteral);
  735.  
  736.   // Helper functions to lex the remainder of a token of the specific type.
  737.  
  738.   // This function handles both ASCII and Unicode identifiers after
  739.   // the first codepoint of the identifier has been parsed.
  740.   bool LexIdentifierContinue(Token &Result, const char *CurPtr);
  741.  
  // Each helper below takes the token being formed (Result) and a cursor into
  // the source buffer (CurPtr). The string, raw-string and character literal
  // helpers also take the tok::TokenKind to use, and the Skip* helpers take
  // TokAtPhysicalStartOfLine by reference.
  // NOTE(review): the meaning of the bool results is not documented in this
  // header - check the definitions before relying on it.
  742.   bool LexNumericConstant    (Token &Result, const char *CurPtr);
  743.   bool LexStringLiteral      (Token &Result, const char *CurPtr,
  744.                               tok::TokenKind Kind);
  745.   bool LexRawStringLiteral   (Token &Result, const char *CurPtr,
  746.                               tok::TokenKind Kind);
  747.   bool LexAngledStringLiteral(Token &Result, const char *CurPtr);
  748.   bool LexCharConstant       (Token &Result, const char *CurPtr,
  749.                               tok::TokenKind Kind);
  750.   bool LexEndOfFile          (Token &Result, const char *CurPtr);
  751.   bool SkipWhitespace        (Token &Result, const char *CurPtr,
  752.                               bool &TokAtPhysicalStartOfLine);
  753.   bool SkipLineComment       (Token &Result, const char *CurPtr,
  754.                               bool &TokAtPhysicalStartOfLine);
  755.   bool SkipBlockComment      (Token &Result, const char *CurPtr,
  756.                               bool &TokAtPhysicalStartOfLine);
  757.   bool SaveLineComment       (Token &Result, const char *CurPtr);
  758.  
  // Conflict-marker recovery helpers; the marker styles are enumerated by
  // ConflictMarkerKind at the top of this header.
  // NOTE(review): presumably the first detects the opening marker at CurPtr
  // and the second handles the closing one - confirm against the definitions.
  759.   bool IsStartOfConflictMarker(const char *CurPtr);
  760.   bool HandleEndOfConflictMarker(const char *CurPtr);
  761.  
  /// NOTE(review): undocumented in this header. Presumably attempts to lex an
  /// editor placeholder at \p CurPtr into \p Result and returns whether it
  /// did - confirm against the definition.
  762.   bool lexEditorPlaceholder(Token &Result, const char *CurPtr);
  763.  
  /// NOTE(review): undocumented in this header. Presumably reports whether
  /// \p CurPtr is the code-completion point - confirm against the definition.
  764.   bool isCodeCompletionPoint(const char *CurPtr) const;
  765.   void cutOffLexing() { BufferPtr = BufferEnd; }
  766.  
  /// NOTE(review): undocumented in this header. Presumably reports whether
  /// the text beginning at \p Start is a hexadecimal literal under
  /// \p LangOpts - confirm against the definition.
  767.   bool isHexaLiteral(const char *Start, const LangOptions &LangOpts);
  768.  
  /// NOTE(review): undocumented in this header. Presumably triggers code
  /// completion for the file name of an include directive: \p PathStart and
  /// \p CompletionPoint would delimit the part typed so far, and \p IsAngled
  /// would tell <...> from "..." - confirm against the definition.
  769.   void codeCompleteIncludedFile(const char *PathStart,
  770.                                 const char *CompletionPoint, bool IsAngled);
  771.  
  // The two helpers below take the same parameters as tryReadUCN but return a
  // std::optional code point.
  // NOTE(review): presumably they handle the numeric and the named spelling
  // of a UCN respectively, with an empty optional meaning "not a UCN of this
  // form" - confirm against the definitions.
  772.   std::optional<uint32_t>
  773.   tryReadNumericUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
  774.   std::optional<uint32_t> tryReadNamedUCN(const char *&StartPtr,
  775.                                           const char *SlashLoc, Token *Result);
  776.  
  777.   /// Read a universal character name (UCN).
  778.   ///
  779.   /// \param StartPtr The position in the source buffer after the initial '\'.
  780.   ///                 If the UCN is syntactically well-formed (but not
  781.   ///                 necessarily valid), this parameter will be updated to
  782.   ///                 point to the character after the UCN.
  783.   /// \param SlashLoc The position in the source buffer of the '\'.
  784.   /// \param Result   The token being formed. Pass \c nullptr to suppress
  785.   ///                 diagnostics and handle token formation in the caller.
  786.   ///
  787.   /// \return The Unicode codepoint specified by the UCN, or 0 if the UCN is
  788.   ///         invalid.
  789.   uint32_t tryReadUCN(const char *&StartPtr, const char *SlashLoc, Token *Result);
  790.  
  791.   /// Try to consume a UCN as part of an identifier at the current
  792.   /// location.
  793.   /// \param CurPtr Initially points to the range of characters in the source
  794.   ///               buffer containing the '\'. Updated to point past the end of
  795.   ///               the UCN on success.
  796.   /// \param Size The number of characters occupied by the '\' (including
  797.   ///             trigraphs and escaped newlines).
  798.   /// \param Result The token being produced. Marked as containing a UCN on
  799.   ///               success.
  800.   /// \return \c true if a UCN was lexed and it produced an acceptable
  801.   ///         identifier character, \c false otherwise.
  /// NOTE(review): presumably \p CurPtr is left untouched on failure - confirm
  /// against the definition.
  802.   bool tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
  803.                                Token &Result);
  804.  
  805.   /// Try to consume an identifier character encoded in UTF-8.
  806.   /// \param CurPtr Points to the start of the (potential) UTF-8 code unit
  807.   ///        sequence. On success, updated to point past the end of it.
  808.   /// \return \c true if a UTF-8 sequence mapping to an acceptable identifier
  809.   ///         character was lexed, \c false otherwise.
  /// NOTE(review): presumably \p CurPtr is left untouched on failure - confirm
  /// against the definition.
  810.   bool tryConsumeIdentifierUTF8Char(const char *&CurPtr);
  811. };
  812.  
  813. } // namespace clang
  814.  
  815. #endif // LLVM_CLANG_LEX_LEXER_H
  816.