Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
14 | pmbaty | 1 | //===--- clang/Basic/CharInfo.h - Classifying ASCII Characters --*- C++ -*-===// |
2 | // |
||
3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
4 | // See https://llvm.org/LICENSE.txt for license information. |
||
5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
6 | // |
||
7 | //===----------------------------------------------------------------------===// |
||
8 | |||
9 | #ifndef LLVM_CLANG_BASIC_CHARINFO_H |
||
10 | #define LLVM_CLANG_BASIC_CHARINFO_H |
||
11 | |||
12 | #include "clang/Basic/LLVM.h" |
||
13 | #include "llvm/ADT/StringRef.h" |
||
14 | #include "llvm/Support/Compiler.h" |
||
15 | #include "llvm/Support/DataTypes.h" |
||
16 | |||
17 | namespace clang { |
||
namespace charinfo {
  /// Per-byte classification table, indexed by an unsigned char value and
  /// tested against the CHAR_* flags below. Declared here, defined elsewhere.
  extern const uint16_t InfoTable[256];

  /// Primary classification flags; each one occupies its own bit.
  enum {
    CHAR_HORZ_WS = 1 << 0,  // '\t', '\f', '\v'.  Note, no '\0'
    CHAR_VERT_WS = 1 << 1,  // '\r', '\n'
    CHAR_SPACE   = 1 << 2,  // ' '
    CHAR_DIGIT   = 1 << 3,  // 0-9
    CHAR_XLETTER = 1 << 4,  // a-f,A-F
    CHAR_UPPER   = 1 << 5,  // A-Z
    CHAR_LOWER   = 1 << 6,  // a-z
    CHAR_UNDER   = 1 << 7,  // _
    CHAR_PERIOD  = 1 << 8,  // .
    CHAR_RAWDEL  = 1 << 9,  // {}[]#<>%:;?*+-/^&|~!=,"'
    CHAR_PUNCT   = 1 << 10  // `$@()
  };

  /// Convenience combinations: a hex letter that is also upper/lower case.
  enum {
    CHAR_XUPPER = CHAR_XLETTER | CHAR_UPPER,
    CHAR_XLOWER = CHAR_XLETTER | CHAR_LOWER
  };
} // end namespace charinfo
||
40 | |||
/// Returns true if the byte \p c lies in the ASCII range [0, 127].
///
/// The value is inspected as an unsigned char so that the result does not
/// depend on whether plain char is signed.
LLVM_READNONE inline bool isASCII(char c) {
  const unsigned char Byte = static_cast<unsigned char>(c);
  return Byte < 0x80;
}

/// Returns true if the byte \p c lies in the ASCII range [0, 127].
LLVM_READNONE inline bool isASCII(unsigned char c) { return c < 0x80; }

/// Returns true if the codepoint \p c lies in the ASCII range [0, 127].
LLVM_READNONE inline bool isASCII(uint32_t c) { return c < 0x80; }

/// Returns true if the signed value \p c lies in the ASCII range [0, 127];
/// negative values are never ASCII.
LLVM_READNONE inline bool isASCII(int64_t c) { return c >= 0 && c < 0x80; }
||
51 | |||
52 | /// Returns true if this is a valid first character of a C identifier, |
||
53 | /// which is [a-zA-Z_]. |
||
54 | LLVM_READONLY inline bool isAsciiIdentifierStart(unsigned char c, |
||
55 | bool AllowDollar = false) { |
||
56 | using namespace charinfo; |
||
57 | if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_UNDER)) |
||
58 | return true; |
||
59 | return AllowDollar && c == '$'; |
||
60 | } |
||
61 | |||
62 | /// Returns true if this is a body character of a C identifier, |
||
63 | /// which is [a-zA-Z0-9_]. |
||
64 | LLVM_READONLY inline bool isAsciiIdentifierContinue(unsigned char c, |
||
65 | bool AllowDollar = false) { |
||
66 | using namespace charinfo; |
||
67 | if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER)) |
||
68 | return true; |
||
69 | return AllowDollar && c == '$'; |
||
70 | } |
||
71 | |||
72 | /// Returns true if this character is horizontal ASCII whitespace: |
||
73 | /// ' ', '\\t', '\\f', '\\v'. |
||
74 | /// |
||
75 | /// Note that this returns false for '\\0'. |
||
76 | LLVM_READONLY inline bool isHorizontalWhitespace(unsigned char c) { |
||
77 | using namespace charinfo; |
||
78 | return (InfoTable[c] & (CHAR_HORZ_WS|CHAR_SPACE)) != 0; |
||
79 | } |
||
80 | |||
81 | /// Returns true if this character is vertical ASCII whitespace: '\\n', '\\r'. |
||
82 | /// |
||
83 | /// Note that this returns false for '\\0'. |
||
84 | LLVM_READONLY inline bool isVerticalWhitespace(unsigned char c) { |
||
85 | using namespace charinfo; |
||
86 | return (InfoTable[c] & CHAR_VERT_WS) != 0; |
||
87 | } |
||
88 | |||
89 | /// Return true if this character is horizontal or vertical ASCII whitespace: |
||
90 | /// ' ', '\\t', '\\f', '\\v', '\\n', '\\r'. |
||
91 | /// |
||
92 | /// Note that this returns false for '\\0'. |
||
93 | LLVM_READONLY inline bool isWhitespace(unsigned char c) { |
||
94 | using namespace charinfo; |
||
95 | return (InfoTable[c] & (CHAR_HORZ_WS|CHAR_VERT_WS|CHAR_SPACE)) != 0; |
||
96 | } |
||
97 | |||
98 | /// Return true if this character is an ASCII digit: [0-9] |
||
99 | LLVM_READONLY inline bool isDigit(unsigned char c) { |
||
100 | using namespace charinfo; |
||
101 | return (InfoTable[c] & CHAR_DIGIT) != 0; |
||
102 | } |
||
103 | |||
104 | /// Return true if this character is a lowercase ASCII letter: [a-z] |
||
105 | LLVM_READONLY inline bool isLowercase(unsigned char c) { |
||
106 | using namespace charinfo; |
||
107 | return (InfoTable[c] & CHAR_LOWER) != 0; |
||
108 | } |
||
109 | |||
110 | /// Return true if this character is an uppercase ASCII letter: [A-Z] |
||
111 | LLVM_READONLY inline bool isUppercase(unsigned char c) { |
||
112 | using namespace charinfo; |
||
113 | return (InfoTable[c] & CHAR_UPPER) != 0; |
||
114 | } |
||
115 | |||
116 | /// Return true if this character is an ASCII letter: [a-zA-Z] |
||
117 | LLVM_READONLY inline bool isLetter(unsigned char c) { |
||
118 | using namespace charinfo; |
||
119 | return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER)) != 0; |
||
120 | } |
||
121 | |||
122 | /// Return true if this character is an ASCII letter or digit: [a-zA-Z0-9] |
||
123 | LLVM_READONLY inline bool isAlphanumeric(unsigned char c) { |
||
124 | using namespace charinfo; |
||
125 | return (InfoTable[c] & (CHAR_DIGIT|CHAR_UPPER|CHAR_LOWER)) != 0; |
||
126 | } |
||
127 | |||
128 | /// Return true if this character is an ASCII hex digit: [0-9a-fA-F] |
||
129 | LLVM_READONLY inline bool isHexDigit(unsigned char c) { |
||
130 | using namespace charinfo; |
||
131 | return (InfoTable[c] & (CHAR_DIGIT|CHAR_XLETTER)) != 0; |
||
132 | } |
||
133 | |||
134 | /// Return true if this character is an ASCII punctuation character. |
||
135 | /// |
||
136 | /// Note that '_' is both a punctuation character and an identifier character! |
||
137 | LLVM_READONLY inline bool isPunctuation(unsigned char c) { |
||
138 | using namespace charinfo; |
||
139 | return (InfoTable[c] & (CHAR_UNDER|CHAR_PERIOD|CHAR_RAWDEL|CHAR_PUNCT)) != 0; |
||
140 | } |
||
141 | |||
142 | /// Return true if this character is an ASCII printable character; that is, a |
||
143 | /// character that should take exactly one column to print in a fixed-width |
||
144 | /// terminal. |
||
145 | LLVM_READONLY inline bool isPrintable(unsigned char c) { |
||
146 | using namespace charinfo; |
||
147 | return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD|CHAR_PUNCT| |
||
148 | CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL|CHAR_SPACE)) != 0; |
||
149 | } |
||
150 | |||
151 | /// Return true if this is the body character of a C preprocessing number, |
||
152 | /// which is [a-zA-Z0-9_.]. |
||
153 | LLVM_READONLY inline bool isPreprocessingNumberBody(unsigned char c) { |
||
154 | using namespace charinfo; |
||
155 | return (InfoTable[c] & |
||
156 | (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER|CHAR_PERIOD)) != 0; |
||
157 | } |
||
158 | |||
159 | /// Return true if this is the body character of a C++ raw string delimiter. |
||
160 | LLVM_READONLY inline bool isRawStringDelimBody(unsigned char c) { |
||
161 | using namespace charinfo; |
||
162 | return (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_PERIOD| |
||
163 | CHAR_DIGIT|CHAR_UNDER|CHAR_RAWDEL)) != 0; |
||
164 | } |
||
165 | |||
/// Bit flags selecting which quote characters escapeCStyle() escapes.
enum class EscapeChar {
  Single = 1 << 0, ///< Escape the single quote (').
  Double = 1 << 1, ///< Escape the double quote (").
  // Inside the braces the enumerators still have the underlying integer type,
  // so they can be combined directly.
  SingleAndDouble = Single | Double,
};
||
171 | |||
172 | /// Return C-style escaped string for special characters, or an empty string if |
||
173 | /// there is no such mapping. |
||
174 | template <EscapeChar Opt, class CharT> |
||
175 | LLVM_READONLY inline auto escapeCStyle(CharT Ch) -> StringRef { |
||
176 | switch (Ch) { |
||
177 | case '\\': |
||
178 | return "\\\\"; |
||
179 | case '\'': |
||
180 | if ((static_cast<int>(Opt) & static_cast<int>(EscapeChar::Single)) == 0) |
||
181 | break; |
||
182 | return "\\'"; |
||
183 | case '"': |
||
184 | if ((static_cast<int>(Opt) & static_cast<int>(EscapeChar::Double)) == 0) |
||
185 | break; |
||
186 | return "\\\""; |
||
187 | case '\a': |
||
188 | return "\\a"; |
||
189 | case '\b': |
||
190 | return "\\b"; |
||
191 | case '\f': |
||
192 | return "\\f"; |
||
193 | case '\n': |
||
194 | return "\\n"; |
||
195 | case '\r': |
||
196 | return "\\r"; |
||
197 | case '\t': |
||
198 | return "\\t"; |
||
199 | case '\v': |
||
200 | return "\\v"; |
||
201 | } |
||
202 | return {}; |
||
203 | } |
||
204 | |||
205 | /// Converts the given ASCII character to its lowercase equivalent. |
||
206 | /// |
||
207 | /// If the character is not an uppercase character, it is returned as is. |
||
208 | LLVM_READONLY inline char toLowercase(char c) { |
||
209 | if (isUppercase(c)) |
||
210 | return c + 'a' - 'A'; |
||
211 | return c; |
||
212 | } |
||
213 | |||
214 | /// Converts the given ASCII character to its uppercase equivalent. |
||
215 | /// |
||
216 | /// If the character is not a lowercase character, it is returned as is. |
||
217 | LLVM_READONLY inline char toUppercase(char c) { |
||
218 | if (isLowercase(c)) |
||
219 | return c + 'A' - 'a'; |
||
220 | return c; |
||
221 | } |
||
222 | |||
223 | |||
224 | /// Return true if this is a valid ASCII identifier. |
||
225 | /// |
||
226 | /// Note that this is a very simple check; it does not accept UCNs as valid |
||
227 | /// identifier characters. |
||
228 | LLVM_READONLY inline bool isValidAsciiIdentifier(StringRef S, |
||
229 | bool AllowDollar = false) { |
||
230 | if (S.empty() || !isAsciiIdentifierStart(S[0], AllowDollar)) |
||
231 | return false; |
||
232 | |||
233 | for (StringRef::iterator I = S.begin(), E = S.end(); I != E; ++I) |
||
234 | if (!isAsciiIdentifierContinue(*I, AllowDollar)) |
||
235 | return false; |
||
236 | |||
237 | return true; |
||
238 | } |
||
239 | |||
240 | } // end namespace clang |
||
241 | |||
242 | #endif |