Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line | 
|---|---|---|---|
| 14 | pmbaty | 1 | //===--- clang/Basic/CharInfo.h - Classifying ASCII Characters --*- C++ -*-===// | 
| 2 | // | ||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | ||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | ||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
| 6 | // | ||
| 7 | //===----------------------------------------------------------------------===// | ||
| 8 | |||
| 9 | #ifndef LLVM_CLANG_BASIC_CHARINFO_H | ||
| 10 | #define LLVM_CLANG_BASIC_CHARINFO_H | ||
| 11 | |||
| 12 | #include "clang/Basic/LLVM.h" | ||
| 13 | #include "llvm/ADT/StringRef.h" | ||
| 14 | #include "llvm/Support/Compiler.h" | ||
| 15 | #include "llvm/Support/DataTypes.h" | ||
| 16 | |||
| 17 | namespace clang { | ||
namespace charinfo {
  /// Per-byte classification table, indexed by an unsigned character value.
  /// Entries are tested against the CHAR_* flags declared below. Only declared
  /// here; the definition lives out of line.
  extern const uint16_t InfoTable[256];

  /// Primary character-class flags. Each flag occupies its own bit so that
  /// several classes can be tested with a single mask.
  enum {
    CHAR_HORZ_WS  = 1 << 0,   // 0x0001: '\t', '\f', '\v'.  Note, no '\0'
    CHAR_VERT_WS  = 1 << 1,   // 0x0002: '\r', '\n'
    CHAR_SPACE    = 1 << 2,   // 0x0004: ' '
    CHAR_DIGIT    = 1 << 3,   // 0x0008: 0-9
    CHAR_XLETTER  = 1 << 4,   // 0x0010: a-f,A-F
    CHAR_UPPER    = 1 << 5,   // 0x0020: A-Z
    CHAR_LOWER    = 1 << 6,   // 0x0040: a-z
    CHAR_UNDER    = 1 << 7,   // 0x0080: _
    CHAR_PERIOD   = 1 << 8,   // 0x0100: .
    CHAR_RAWDEL   = 1 << 9,   // 0x0200: {}[]#<>%:;?*+-/^&|~!=,"'
    CHAR_PUNCT    = 1 << 10   // 0x0400: `$@()
  };

  /// Composite flags: a hex letter that is also upper- or lowercase.
  enum {
    CHAR_XUPPER = CHAR_XLETTER | CHAR_UPPER,
    CHAR_XLOWER = CHAR_XLETTER | CHAR_LOWER
  };
} // end namespace charinfo
| 40 | |||
| 41 | /// Returns true if a byte is an ASCII character. | ||
| 42 | LLVM_READNONE inline bool isASCII(char c) { | ||
| 43 | return static_cast<unsigned char>(c) <= 127; | ||
| 44 | } | ||
| 45 | |||
| 46 | LLVM_READNONE inline bool isASCII(unsigned char c) { return c <= 127; } | ||
| 47 | |||
| 48 | /// Returns true if a codepoint is an ASCII character. | ||
| 49 | LLVM_READNONE inline bool isASCII(uint32_t c) { return c <= 127; } | ||
| 50 | LLVM_READNONE inline bool isASCII(int64_t c) { return 0 <= c && c <= 127; } | ||
| 51 | |||
| 52 | /// Returns true if this is a valid first character of a C identifier, | ||
| 53 | /// which is [a-zA-Z_]. | ||
| 54 | LLVM_READONLY inline bool isAsciiIdentifierStart(unsigned char c, | ||
| 55 | bool AllowDollar = false) { | ||
| 56 | using namespace charinfo; | ||
| 57 | if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_UNDER)) | ||
| 58 | return true; | ||
| 59 | return AllowDollar && c == '$'; | ||
| 60 | } | ||
| 61 | |||
| 62 | /// Returns true if this is a body character of a C identifier, | ||
| 63 | /// which is [a-zA-Z0-9_]. | ||
| 64 | LLVM_READONLY inline bool isAsciiIdentifierContinue(unsigned char c, | ||
| 65 | bool AllowDollar = false) { | ||
| 66 | using namespace charinfo; | ||
| 67 | if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER)) | ||
| 68 | return true; | ||
| 69 | return AllowDollar && c == '$'; | ||
| 70 | } | ||
| 71 | |||
| 72 | /// Returns true if this character is horizontal ASCII whitespace: | ||
| 73 | /// ' ', '\\t', '\\f', '\\v'. | ||
| 74 | /// | ||
| 75 | /// Note that this returns false for '\\0'. | ||
| 76 | LLVM_READONLY inline bool isHorizontalWhitespace(unsigned char c) { | ||
| 77 | using namespace charinfo; | ||
| 78 | return (InfoTable[c] & (CHAR_HORZ_WS|CHAR_SPACE)) != 0; | ||
| 79 | } | ||
| 80 | |||
| 81 | /// Returns true if this character is vertical ASCII whitespace: '\\n', '\\r'. | ||
| 82 | /// | ||
| 83 | /// Note that this returns false for '\\0'. | ||
| 84 | LLVM_READONLY inline bool isVerticalWhitespace(unsigned char c) { | ||
| 85 | using namespace charinfo; | ||
| 86 | return (InfoTable[c] & CHAR_VERT_WS) != 0; | ||
| 87 | } | ||
| 88 | |||
| 89 | /// Return true if this character is horizontal or vertical ASCII whitespace: | ||
| 90 | /// ' ', '\\t', '\\f', '\\v', '\\n', '\\r'. | ||
| 91 | /// | ||
| 92 | /// Note that this returns false for '\\0'. | ||
| 93 | LLVM_READONLY inline bool isWhitespace(unsigned char c) { | ||
| 94 | using namespace charinfo; | ||
| 95 | return (InfoTable[c] & (CHAR_HORZ_WS|CHAR_VERT_WS|CHAR_SPACE)) != 0; | ||
| 96 | } | ||
| 97 | |||
| 98 | /// Return true if this character is an ASCII digit: [0-9] | ||
| 99 | LLVM_READONLY inline bool isDigit(unsigned char c) { | ||
| 100 | using namespace charinfo; | ||
| 101 | return (InfoTable[c] & CHAR_DIGIT) != 0; | ||
| 102 | } | ||
| 103 | |||
| 104 | /// Return true if this character is a lowercase ASCII letter: [a-z] | ||
| 105 | LLVM_READONLY inline bool isLowercase(unsigned char c) { | ||
| 106 | using namespace charinfo; | ||
| 107 | return (InfoTable[c] & CHAR_LOWER) != 0; | ||
| 108 | } | ||
| 109 | |||
| 110 | /// Return true if this character is an uppercase ASCII letter: [A-Z] | ||
| 111 | LLVM_READONLY inline bool isUppercase(unsigned char c) { | ||
| 112 | using namespace charinfo; | ||
| 113 | return (InfoTable[c] & CHAR_UPPER) != 0; | ||
| 114 | } | ||
| 115 | |||
| 116 | /// Return true if this character is an ASCII letter: [a-zA-Z] | ||
| 117 | LLVM_READONLY inline bool isLetter(unsigned char c) { | ||
| 118 | using namespace charinfo; | ||
| 119 | return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER)) != 0; | ||
| 120 | } | ||
| 121 | |||
| 122 | /// Return true if this character is an ASCII letter or digit: [a-zA-Z0-9] | ||
| 123 | LLVM_READONLY inline bool isAlphanumeric(unsigned char c) { | ||
| 124 | using namespace charinfo; | ||
| 125 | return (InfoTable[c] & (CHAR_DIGIT|CHAR_UPPER|CHAR_LOWER)) != 0; | ||
| 126 | } | ||
| 127 | |||
| 128 | /// Return true if this character is an ASCII hex digit: [0-9a-fA-F] | ||
| 129 | LLVM_READONLY inline bool isHexDigit(unsigned char c) { | ||
| 130 | using namespace charinfo; | ||
| 131 | return (InfoTable[c] & (CHAR_DIGIT|CHAR_XLETTER)) != 0; | ||
| 132 | } | ||
| 133 | |||
| 134 | /// Return true if this character is an ASCII punctuation character. | ||
| 135 | /// | ||
| 136 | /// Note that '_' is both a punctuation character and an identifier character! | ||
| 137 | LLVM_READONLY inline bool isPunctuation(unsigned char c) { | ||
| 138 | using namespace charinfo; | ||
| 139 | return (InfoTable[c] & (CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL|CHAR_PUNCT)) != 0; | ||
| 140 | } | ||
| 141 | |||
| 142 | /// Return true if this character is an ASCII printable character; that is, a | ||
| 143 | /// character that should take exactly one column to print in a fixed-width | ||
| 144 | /// terminal. | ||
| 145 | LLVM_READONLY inline bool isPrintable(unsigned char c) { | ||
| 146 | using namespace charinfo; | ||
| 147 | return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD|CHAR_PUNCT| | ||
| 148 | CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL|CHAR_SPACE)) != 0; | ||
| 149 | } | ||
| 150 | |||
| 151 | /// Return true if this is the body character of a C preprocessing number, | ||
| 152 | /// which is [a-zA-Z0-9_.]. | ||
| 153 | LLVM_READONLY inline bool isPreprocessingNumberBody(unsigned char c) { | ||
| 154 | using namespace charinfo; | ||
| 155 | return (InfoTable[c] & | ||
| 156 | (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER|CHAR_PERIOD)) != 0; | ||
| 157 | } | ||
| 158 | |||
| 159 | /// Return true if this is the body character of a C++ raw string delimiter. | ||
| 160 | LLVM_READONLY inline bool isRawStringDelimBody(unsigned char c) { | ||
| 161 | using namespace charinfo; | ||
| 162 | return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD| | ||
| 163 | CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL)) != 0; | ||
| 164 | } | ||
| 165 | |||
/// Bit flags selecting which quote characters escapeCStyle should escape.
enum class EscapeChar {
  Single = 1, ///< Escape the single quote (').
  Double = 2, ///< Escape the double quote (").
  /// Escape both quote kinds. Inside the enumerator list the enumerators still
  /// have the underlying integer type, so they can be OR-ed without casts.
  SingleAndDouble = Single | Double,
};
| 171 | |||
| 172 | /// Return C-style escaped string for special characters, or an empty string if | ||
| 173 | /// there is no such mapping. | ||
| 174 | template <EscapeChar Opt, class CharT> | ||
| 175 | LLVM_READONLY inline auto escapeCStyle(CharT Ch) -> StringRef { | ||
| 176 | switch (Ch) { | ||
| 177 | case '\\': | ||
| 178 | return "\\\\"; | ||
| 179 | case '\'': | ||
| 180 | if ((static_cast<int>(Opt) & static_cast<int>(EscapeChar::Single)) == 0) | ||
| 181 | break; | ||
| 182 | return "\\'"; | ||
| 183 | case '"': | ||
| 184 | if ((static_cast<int>(Opt) & static_cast<int>(EscapeChar::Double)) == 0) | ||
| 185 | break; | ||
| 186 | return "\\\""; | ||
| 187 | case '\a': | ||
| 188 | return "\\a"; | ||
| 189 | case '\b': | ||
| 190 | return "\\b"; | ||
| 191 | case '\f': | ||
| 192 | return "\\f"; | ||
| 193 | case '\n': | ||
| 194 | return "\\n"; | ||
| 195 | case '\r': | ||
| 196 | return "\\r"; | ||
| 197 | case '\t': | ||
| 198 | return "\\t"; | ||
| 199 | case '\v': | ||
| 200 | return "\\v"; | ||
| 201 |   } | ||
| 202 | return {}; | ||
| 203 | } | ||
| 204 | |||
| 205 | /// Converts the given ASCII character to its lowercase equivalent. | ||
| 206 | /// | ||
| 207 | /// If the character is not an uppercase character, it is returned as is. | ||
| 208 | LLVM_READONLY inline char toLowercase(char c) { | ||
| 209 | if (isUppercase(c)) | ||
| 210 | return c + 'a' - 'A'; | ||
| 211 | return c; | ||
| 212 | } | ||
| 213 | |||
| 214 | /// Converts the given ASCII character to its uppercase equivalent. | ||
| 215 | /// | ||
| 216 | /// If the character is not a lowercase character, it is returned as is. | ||
| 217 | LLVM_READONLY inline char toUppercase(char c) { | ||
| 218 | if (isLowercase(c)) | ||
| 219 | return c + 'A' - 'a'; | ||
| 220 | return c; | ||
| 221 | } | ||
| 222 | |||
| 223 | |||
| 224 | /// Return true if this is a valid ASCII identifier. | ||
| 225 | /// | ||
| 226 | /// Note that this is a very simple check; it does not accept UCNs as valid | ||
| 227 | /// identifier characters. | ||
| 228 | LLVM_READONLY inline bool isValidAsciiIdentifier(StringRef S, | ||
| 229 | bool AllowDollar = false) { | ||
| 230 | if (S.empty() || !isAsciiIdentifierStart(S[0], AllowDollar)) | ||
| 231 | return false; | ||
| 232 | |||
| 233 | for (StringRef::iterator I = S.begin(), E = S.end(); I != E; ++I) | ||
| 234 | if (!isAsciiIdentifierContinue(*I, AllowDollar)) | ||
| 235 | return false; | ||
| 236 | |||
| 237 | return true; | ||
| 238 | } | ||
| 239 | |||
| 240 | } // end namespace clang | ||
| 241 | |||
| 242 | #endif |