Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
1 | pmbaty | 1 | #define __PHYSICSFS_INTERNAL__ |
2 | #include "physfs_internal.h" |
||
3 | |||
4 | #include "physfs_casefolding.h" |
||
5 | |||
6 | |||
7 | /* |
||
8 | * From rfc3629, the UTF-8 spec: |
||
9 | * https://www.ietf.org/rfc/rfc3629.txt |
||
10 | * |
||
11 | * Char. number range | UTF-8 octet sequence |
||
12 | * (hexadecimal) | (binary) |
||
13 | * --------------------+--------------------------------------------- |
||
14 | * 0000 0000-0000 007F | 0xxxxxxx |
||
15 | * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx |
||
16 | * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
||
17 | * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
||
18 | */ |
||
19 | |||
20 | |||
/*
 * This may not be the best value, but it's one that isn't represented
 *  in Unicode (0x10FFFF is the largest codepoint value). utf8codepoint()
 *  returns this value if there are bogus bits in the stream. Callers that
 *  want to try to recover (such as PHYSFS_utf8ToUcs4()) turn this value
 *  into something reasonable (like a question mark), whereas a validator
 *  can use the value to determine if a string has bad bits.
 */
||
30 | #define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF |
||
31 | |||
32 | /* |
||
33 | * This is the codepoint we currently return when there was bogus bits in a |
||
34 | * UTF-8 string. May not fly in Asian locales? |
||
35 | */ |
||
36 | #define UNICODE_BOGUS_CHAR_CODEPOINT '?' |
||
37 | |||
38 | static PHYSFS_uint32 utf8codepoint(const char **_str) |
||
39 | { |
||
40 | const char *str = *_str; |
||
41 | PHYSFS_uint32 retval = 0; |
||
42 | PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str); |
||
43 | PHYSFS_uint32 octet2, octet3, octet4; |
||
44 | |||
45 | if (octet == 0) /* null terminator, end of string. */ |
||
46 | return 0; |
||
47 | |||
48 | else if (octet < 128) /* one octet char: 0 to 127 */ |
||
49 | { |
||
50 | (*_str)++; /* skip to next possible start of codepoint. */ |
||
51 | return octet; |
||
52 | } /* else if */ |
||
53 | |||
54 | else if ((octet > 127) && (octet < 192)) /* bad (starts with 10xxxxxx). */ |
||
55 | { |
||
56 | /* |
||
57 | * Apparently each of these is supposed to be flagged as a bogus |
||
58 | * char, instead of just resyncing to the next valid codepoint. |
||
59 | */ |
||
60 | (*_str)++; /* skip to next possible start of codepoint. */ |
||
61 | return UNICODE_BOGUS_CHAR_VALUE; |
||
62 | } /* else if */ |
||
63 | |||
64 | else if (octet < 224) /* two octets */ |
||
65 | { |
||
66 | (*_str)++; /* advance at least one byte in case of an error */ |
||
67 | octet -= (128+64); |
||
68 | octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
69 | if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
70 | return UNICODE_BOGUS_CHAR_VALUE; |
||
71 | |||
72 | *_str += 1; /* skip to next possible start of codepoint. */ |
||
73 | retval = ((octet << 6) | (octet2 - 128)); |
||
74 | if ((retval >= 0x80) && (retval <= 0x7FF)) |
||
75 | return retval; |
||
76 | } /* else if */ |
||
77 | |||
78 | else if (octet < 240) /* three octets */ |
||
79 | { |
||
80 | (*_str)++; /* advance at least one byte in case of an error */ |
||
81 | octet -= (128+64+32); |
||
82 | octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
83 | if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
84 | return UNICODE_BOGUS_CHAR_VALUE; |
||
85 | |||
86 | octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
87 | if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
88 | return UNICODE_BOGUS_CHAR_VALUE; |
||
89 | |||
90 | *_str += 2; /* skip to next possible start of codepoint. */ |
||
91 | retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) ); |
||
92 | |||
93 | /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */ |
||
94 | switch (retval) |
||
95 | { |
||
96 | case 0xD800: |
||
97 | case 0xDB7F: |
||
98 | case 0xDB80: |
||
99 | case 0xDBFF: |
||
100 | case 0xDC00: |
||
101 | case 0xDF80: |
||
102 | case 0xDFFF: |
||
103 | return UNICODE_BOGUS_CHAR_VALUE; |
||
104 | } /* switch */ |
||
105 | |||
106 | /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */ |
||
107 | if ((retval >= 0x800) && (retval <= 0xFFFD)) |
||
108 | return retval; |
||
109 | } /* else if */ |
||
110 | |||
111 | else if (octet < 248) /* four octets */ |
||
112 | { |
||
113 | (*_str)++; /* advance at least one byte in case of an error */ |
||
114 | octet -= (128+64+32+16); |
||
115 | octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
116 | if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
117 | return UNICODE_BOGUS_CHAR_VALUE; |
||
118 | |||
119 | octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
120 | if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
121 | return UNICODE_BOGUS_CHAR_VALUE; |
||
122 | |||
123 | octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
124 | if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
125 | return UNICODE_BOGUS_CHAR_VALUE; |
||
126 | |||
127 | *_str += 3; /* skip to next possible start of codepoint. */ |
||
128 | retval = ( ((octet << 18)) | ((octet2 - 128) << 12) | |
||
129 | ((octet3 - 128) << 6) | ((octet4 - 128)) ); |
||
130 | if ((retval >= 0x10000) && (retval <= 0x10FFFF)) |
||
131 | return retval; |
||
132 | } /* else if */ |
||
133 | |||
134 | /* |
||
135 | * Five and six octet sequences became illegal in rfc3629. |
||
136 | * We throw the codepoint away, but parse them to make sure we move |
||
137 | * ahead the right number of bytes and don't overflow the buffer. |
||
138 | */ |
||
139 | |||
140 | else if (octet < 252) /* five octets */ |
||
141 | { |
||
142 | (*_str)++; /* advance at least one byte in case of an error */ |
||
143 | octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
144 | if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
145 | return UNICODE_BOGUS_CHAR_VALUE; |
||
146 | |||
147 | octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
148 | if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
149 | return UNICODE_BOGUS_CHAR_VALUE; |
||
150 | |||
151 | octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
152 | if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
153 | return UNICODE_BOGUS_CHAR_VALUE; |
||
154 | |||
155 | octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
156 | if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
157 | return UNICODE_BOGUS_CHAR_VALUE; |
||
158 | |||
159 | *_str += 4; /* skip to next possible start of codepoint. */ |
||
160 | return UNICODE_BOGUS_CHAR_VALUE; |
||
161 | } /* else if */ |
||
162 | |||
163 | else /* six octets */ |
||
164 | { |
||
165 | (*_str)++; /* advance at least one byte in case of an error */ |
||
166 | octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
167 | if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
168 | return UNICODE_BOGUS_CHAR_VALUE; |
||
169 | |||
170 | octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
171 | if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
172 | return UNICODE_BOGUS_CHAR_VALUE; |
||
173 | |||
174 | octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
175 | if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
176 | return UNICODE_BOGUS_CHAR_VALUE; |
||
177 | |||
178 | octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
179 | if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
180 | return UNICODE_BOGUS_CHAR_VALUE; |
||
181 | |||
182 | octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str)); |
||
183 | if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */ |
||
184 | return UNICODE_BOGUS_CHAR_VALUE; |
||
185 | |||
186 | *_str += 6; /* skip to next possible start of codepoint. */ |
||
187 | return UNICODE_BOGUS_CHAR_VALUE; |
||
188 | } /* else if */ |
||
189 | |||
190 | return UNICODE_BOGUS_CHAR_VALUE; |
||
191 | } /* utf8codepoint */ |
||
192 | |||
193 | static PHYSFS_uint32 utf16codepoint(const PHYSFS_uint16 **_str) |
||
194 | { |
||
195 | const PHYSFS_uint16 *src = *_str; |
||
196 | PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++); |
||
197 | |||
198 | if (cp == 0) /* null terminator, end of string. */ |
||
199 | return 0; |
||
200 | /* Orphaned second half of surrogate pair? */ |
||
201 | else if ((cp >= 0xDC00) && (cp <= 0xDFFF)) |
||
202 | cp = UNICODE_BOGUS_CHAR_CODEPOINT; |
||
203 | else if ((cp >= 0xD800) && (cp <= 0xDBFF)) /* start surrogate pair! */ |
||
204 | { |
||
205 | const PHYSFS_uint32 pair = (PHYSFS_uint32) *src; |
||
206 | if (pair == 0) |
||
207 | cp = UNICODE_BOGUS_CHAR_CODEPOINT; |
||
208 | else if ((pair < 0xDC00) || (pair > 0xDFFF)) |
||
209 | cp = UNICODE_BOGUS_CHAR_CODEPOINT; |
||
210 | else |
||
211 | { |
||
212 | src++; /* eat the other surrogate. */ |
||
213 | cp = (((cp - 0xD800) << 10) | (pair - 0xDC00)); |
||
214 | } /* else */ |
||
215 | } /* else if */ |
||
216 | |||
217 | *_str = src; |
||
218 | return cp; |
||
219 | } /* utf16codepoint */ |
||
220 | |||
221 | static PHYSFS_uint32 utf32codepoint(const PHYSFS_uint32 **_str) |
||
222 | { |
||
223 | const PHYSFS_uint32 *src = *_str; |
||
224 | PHYSFS_uint32 cp = *(src++); |
||
225 | |||
226 | if (cp == 0) /* null terminator, end of string. */ |
||
227 | return 0; |
||
228 | else if (cp > 0x10FFF) |
||
229 | cp = UNICODE_BOGUS_CHAR_CODEPOINT; |
||
230 | |||
231 | *_str = src; |
||
232 | return cp; |
||
233 | } /* utf32codepoint */ |
||
234 | |||
235 | |||
236 | void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len) |
||
237 | { |
||
238 | len -= sizeof (PHYSFS_uint32); /* save room for null char. */ |
||
239 | while (len >= sizeof (PHYSFS_uint32)) |
||
240 | { |
||
241 | PHYSFS_uint32 cp = utf8codepoint(&src); |
||
242 | if (cp == 0) |
||
243 | break; |
||
244 | else if (cp == UNICODE_BOGUS_CHAR_VALUE) |
||
245 | cp = UNICODE_BOGUS_CHAR_CODEPOINT; |
||
246 | *(dst++) = cp; |
||
247 | len -= sizeof (PHYSFS_uint32); |
||
248 | } /* while */ |
||
249 | |||
250 | *dst = 0; |
||
251 | } /* PHYSFS_utf8ToUcs4 */ |
||
252 | |||
253 | |||
254 | void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len) |
||
255 | { |
||
256 | len -= sizeof (PHYSFS_uint16); /* save room for null char. */ |
||
257 | while (len >= sizeof (PHYSFS_uint16)) |
||
258 | { |
||
259 | PHYSFS_uint32 cp = utf8codepoint(&src); |
||
260 | if (cp == 0) |
||
261 | break; |
||
262 | else if (cp == UNICODE_BOGUS_CHAR_VALUE) |
||
263 | cp = UNICODE_BOGUS_CHAR_CODEPOINT; |
||
264 | |||
265 | if (cp > 0xFFFF) /* UTF-16 surrogates (bogus chars in UCS-2) */ |
||
266 | cp = UNICODE_BOGUS_CHAR_CODEPOINT; |
||
267 | |||
268 | *(dst++) = cp; |
||
269 | len -= sizeof (PHYSFS_uint16); |
||
270 | } /* while */ |
||
271 | |||
272 | *dst = 0; |
||
273 | } /* PHYSFS_utf8ToUcs2 */ |
||
274 | |||
275 | |||
276 | void PHYSFS_utf8ToUtf16(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len) |
||
277 | { |
||
278 | len -= sizeof (PHYSFS_uint16); /* save room for null char. */ |
||
279 | while (len >= sizeof (PHYSFS_uint16)) |
||
280 | { |
||
281 | PHYSFS_uint32 cp = utf8codepoint(&src); |
||
282 | if (cp == 0) |
||
283 | break; |
||
284 | else if (cp == UNICODE_BOGUS_CHAR_VALUE) |
||
285 | cp = UNICODE_BOGUS_CHAR_CODEPOINT; |
||
286 | |||
287 | if (cp > 0xFFFF) /* encode as surrogate pair */ |
||
288 | { |
||
289 | if (len < (sizeof (PHYSFS_uint16) * 2)) |
||
290 | break; /* not enough room for the pair, stop now. */ |
||
291 | |||
292 | cp -= 0x10000; /* Make this a 20-bit value */ |
||
293 | |||
294 | *(dst++) = 0xD800 + ((cp >> 10) & 0x3FF); |
||
295 | len -= sizeof (PHYSFS_uint16); |
||
296 | |||
297 | cp = 0xDC00 + (cp & 0x3FF); |
||
298 | } /* if */ |
||
299 | |||
300 | *(dst++) = cp; |
||
301 | len -= sizeof (PHYSFS_uint16); |
||
302 | } /* while */ |
||
303 | |||
304 | *dst = 0; |
||
305 | } /* PHYSFS_utf8ToUtf16 */ |
||
306 | |||
307 | static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len) |
||
308 | { |
||
309 | char *dst = *_dst; |
||
310 | PHYSFS_uint64 len = *_len; |
||
311 | |||
312 | if (len == 0) |
||
313 | return; |
||
314 | |||
315 | if (cp > 0x10FFFF) |
||
316 | cp = UNICODE_BOGUS_CHAR_CODEPOINT; |
||
317 | else if ((cp == 0xFFFE) || (cp == 0xFFFF)) /* illegal values. */ |
||
318 | cp = UNICODE_BOGUS_CHAR_CODEPOINT; |
||
319 | else |
||
320 | { |
||
321 | /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */ |
||
322 | switch (cp) |
||
323 | { |
||
324 | case 0xD800: |
||
325 | case 0xDB7F: |
||
326 | case 0xDB80: |
||
327 | case 0xDBFF: |
||
328 | case 0xDC00: |
||
329 | case 0xDF80: |
||
330 | case 0xDFFF: |
||
331 | cp = UNICODE_BOGUS_CHAR_CODEPOINT; |
||
332 | } /* switch */ |
||
333 | } /* else */ |
||
334 | |||
335 | /* Do the encoding... */ |
||
336 | if (cp < 0x80) |
||
337 | { |
||
338 | *(dst++) = (char) cp; |
||
339 | len--; |
||
340 | } /* if */ |
||
341 | |||
342 | else if (cp < 0x800) |
||
343 | { |
||
344 | if (len < 2) |
||
345 | len = 0; |
||
346 | else |
||
347 | { |
||
348 | *(dst++) = (char) ((cp >> 6) | 128 | 64); |
||
349 | *(dst++) = (char) (cp & 0x3F) | 128; |
||
350 | len -= 2; |
||
351 | } /* else */ |
||
352 | } /* else if */ |
||
353 | |||
354 | else if (cp < 0x10000) |
||
355 | { |
||
356 | if (len < 3) |
||
357 | len = 0; |
||
358 | else |
||
359 | { |
||
360 | *(dst++) = (char) ((cp >> 12) | 128 | 64 | 32); |
||
361 | *(dst++) = (char) ((cp >> 6) & 0x3F) | 128; |
||
362 | *(dst++) = (char) (cp & 0x3F) | 128; |
||
363 | len -= 3; |
||
364 | } /* else */ |
||
365 | } /* else if */ |
||
366 | |||
367 | else |
||
368 | { |
||
369 | if (len < 4) |
||
370 | len = 0; |
||
371 | else |
||
372 | { |
||
373 | *(dst++) = (char) ((cp >> 18) | 128 | 64 | 32 | 16); |
||
374 | *(dst++) = (char) ((cp >> 12) & 0x3F) | 128; |
||
375 | *(dst++) = (char) ((cp >> 6) & 0x3F) | 128; |
||
376 | *(dst++) = (char) (cp & 0x3F) | 128; |
||
377 | len -= 4; |
||
378 | } /* else if */ |
||
379 | } /* else */ |
||
380 | |||
381 | *_dst = dst; |
||
382 | *_len = len; |
||
383 | } /* utf8fromcodepoint */ |
||
384 | |||
/*
 * Shared body for the fixed-width-to-UTF-8 converters. Expands inside a
 *  void function whose parameters are named by `src`, `dst` and `len`
 *  (`len` = bytes available at `dst`). Each source element is cast to
 *  `typ`, widened to a codepoint and encoded with utf8fromcodepoint()
 *  until a zero element is read or the output space runs out. One byte is
 *  held back so the result is always null-terminated; a zero `len` makes
 *  the enclosing function return without writing anything.
 */
#define UTF8FROMTYPE(typ, src, dst, len) \
    if (len == 0) return; \
    for (len--; len != 0; ) \
    { \
        const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
        if (cp == 0) break; \
        utf8fromcodepoint(cp, &dst, &len); \
    } \
    *dst = '\0';
||
395 | |||
/*
 * Convert the zero-terminated UCS-4 string `src` to UTF-8 in `dst`.
 *  `len` is the number of bytes available at `dst`; output is truncated
 *  to fit and null-terminated. Nothing is written when `len` is 0.
 *  The work is done by the UTF8FROMTYPE macro.
 */
void PHYSFS_utf8FromUcs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
{
    UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
} /* PHYSFS_utf8FromUcs4 */
||
400 | |||
401 | void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len) |
||
402 | { |
||
403 | UTF8FROMTYPE(PHYSFS_uint64, src, dst, len); |
||
404 | } /* PHYSFS_utf8FromUcs2 */ |
||
405 | |||
/*
 * latin1 maps to unicode codepoints directly, we just utf-8 encode it.
 *
 * Convert the null-terminated Latin-1 string `src` to UTF-8 in `dst`.
 *  `len` is the number of bytes available at `dst`; output is truncated
 *  to fit and null-terminated. Nothing is written when `len` is 0.
 */
void PHYSFS_utf8FromLatin1(const char *src, char *dst, PHYSFS_uint64 len)
{
    /*
     * Each char is cast to PHYSFS_uint8 before widening; presumably this
     *  keeps bytes >= 0x80 from sign-extending where plain char is signed.
     */
    UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
} /* PHYSFS_utf8FromLatin1 */
||
411 | |||
412 | #undef UTF8FROMTYPE |
||
413 | |||
414 | |||
415 | void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len) |
||
416 | { |
||
417 | if (len == 0) |
||
418 | return; |
||
419 | |||
420 | len--; |
||
421 | while (len) |
||
422 | { |
||
423 | const PHYSFS_uint32 cp = utf16codepoint(&src); |
||
424 | if (!cp) |
||
425 | break; |
||
426 | utf8fromcodepoint(cp, &dst, &len); |
||
427 | } /* while */ |
||
428 | |||
429 | *dst = '\0'; |
||
430 | } /* PHYSFS_utf8FromUtf16 */ |
||
431 | |||
432 | |||
433 | int PHYSFS_caseFold(const PHYSFS_uint32 from, PHYSFS_uint32 *to) |
||
434 | { |
||
435 | int i; |
||
436 | |||
437 | if (from < 128) /* low-ASCII, easy! */ |
||
438 | { |
||
439 | if ((from >= 'A') && (from <= 'Z')) |
||
440 | *to = from - ('A' - 'a'); |
||
441 | else |
||
442 | *to = from; |
||
443 | return 1; |
||
444 | } /* if */ |
||
445 | |||
446 | else if (from <= 0xFFFF) |
||
447 | { |
||
448 | const PHYSFS_uint8 hash = ((from ^ (from >> 8)) & 0xFF); |
||
449 | const PHYSFS_uint16 from16 = (PHYSFS_uint16) from; |
||
450 | |||
451 | { |
||
452 | const CaseFoldHashBucket1_16 *bucket = &case_fold_hash1_16[hash]; |
||
453 | const int count = (int) bucket->count; |
||
454 | for (i = 0; i < count; i++) |
||
455 | { |
||
456 | const CaseFoldMapping1_16 *mapping = &bucket->list[i]; |
||
457 | if (mapping->from == from16) |
||
458 | { |
||
459 | *to = mapping->to0; |
||
460 | return 1; |
||
461 | } /* if */ |
||
462 | } /* for */ |
||
463 | } |
||
464 | |||
465 | { |
||
466 | const CaseFoldHashBucket2_16 *bucket = &case_fold_hash2_16[hash & 15]; |
||
467 | const int count = (int) bucket->count; |
||
468 | for (i = 0; i < count; i++) |
||
469 | { |
||
470 | const CaseFoldMapping2_16 *mapping = &bucket->list[i]; |
||
471 | if (mapping->from == from16) |
||
472 | { |
||
473 | to[0] = mapping->to0; |
||
474 | to[1] = mapping->to1; |
||
475 | return 2; |
||
476 | } /* if */ |
||
477 | } /* for */ |
||
478 | } |
||
479 | |||
480 | { |
||
481 | const CaseFoldHashBucket3_16 *bucket = &case_fold_hash3_16[hash & 3]; |
||
482 | const int count = (int) bucket->count; |
||
483 | for (i = 0; i < count; i++) |
||
484 | { |
||
485 | const CaseFoldMapping3_16 *mapping = &bucket->list[i]; |
||
486 | if (mapping->from == from16) |
||
487 | { |
||
488 | to[0] = mapping->to0; |
||
489 | to[1] = mapping->to1; |
||
490 | to[2] = mapping->to2; |
||
491 | return 3; |
||
492 | } /* if */ |
||
493 | } /* for */ |
||
494 | } |
||
495 | } /* else if */ |
||
496 | |||
497 | else /* codepoint that doesn't fit in 16 bits. */ |
||
498 | { |
||
499 | const PHYSFS_uint8 hash = ((from ^ (from >> 8)) & 0xFF); |
||
500 | const CaseFoldHashBucket1_32 *bucket = &case_fold_hash1_32[hash & 15]; |
||
501 | const int count = (int) bucket->count; |
||
502 | for (i = 0; i < count; i++) |
||
503 | { |
||
504 | const CaseFoldMapping1_32 *mapping = &bucket->list[i]; |
||
505 | if (mapping->from == from) |
||
506 | { |
||
507 | *to = mapping->to0; |
||
508 | return 1; |
||
509 | } /* if */ |
||
510 | } /* for */ |
||
511 | } /* else */ |
||
512 | |||
513 | |||
514 | /* Not found...there's no remapping for this codepoint. */ |
||
515 | *to = from; |
||
516 | return 1; |
||
517 | } /* PHYSFS_caseFold */ |
||
518 | |||
519 | |||
/*
 * Shared body for the case-insensitive string comparisons. Expands inside
 *  a function with parameters `str1` and `str2`; `bits` selects the
 *  decoder (utf8codepoint, utf16codepoint or utf32codepoint).
 *
 * Each side keeps a small queue of case-folded codepoints, since one
 *  source codepoint can fold to up to three. The queue is refilled from
 *  the string whenever it runs dry. Folded codepoints are compared one at
 *  a time: the function returns -1 or 1 at the first difference, or 0 once
 *  both sides reach their terminator together.
 */
#define UTFSTRICMP(bits) \
    PHYSFS_uint32 fold1[3], fold2[3]; \
    int avail1 = 0, next1 = 0, avail2 = 0, next2 = 0; \
    for (;;) { \
        PHYSFS_uint32 c1, c2; \
        if (avail1 == next1) { /* queue drained: fold the next codepoint. */ \
            avail1 = PHYSFS_caseFold(utf##bits##codepoint(&str1), fold1); \
            c1 = fold1[0]; \
            next1 = 1; \
        } else { \
            c1 = fold1[next1++]; \
        } \
        if (avail2 == next2) { /* queue drained: fold the next codepoint. */ \
            avail2 = PHYSFS_caseFold(utf##bits##codepoint(&str2), fold2); \
            c2 = fold2[0]; \
            next2 = 1; \
        } else { \
            c2 = fold2[next2++]; \
        } \
        if (c1 != c2) { \
            return (c1 < c2) ? -1 : 1; \
        } \
        if (c1 == 0) { \
            break;  /* complete match. */ \
        } \
    } \
    return 0
||
548 | |||
/*
 * Case-insensitive comparison of two null-terminated UTF-8 strings.
 *  Codepoints are case-folded with PHYSFS_caseFold() before comparing.
 *  Returns -1 if `str1` orders first, 1 if `str2` orders first, and 0 if
 *  the folded strings match. The body comes from the UTFSTRICMP macro.
 */
int PHYSFS_utf8stricmp(const char *str1, const char *str2)
{
    UTFSTRICMP(8);
} /* PHYSFS_utf8stricmp */
||
553 | |||
/*
 * Case-insensitive comparison of two zero-terminated UTF-16 strings.
 *  Codepoints are case-folded with PHYSFS_caseFold() before comparing.
 *  Returns -1 if `str1` orders first, 1 if `str2` orders first, and 0 if
 *  the folded strings match. The body comes from the UTFSTRICMP macro.
 */
int PHYSFS_utf16stricmp(const PHYSFS_uint16 *str1, const PHYSFS_uint16 *str2)
{
    UTFSTRICMP(16);
} /* PHYSFS_utf16stricmp */
||
558 | |||
/*
 * Case-insensitive comparison of two zero-terminated UCS-4 strings.
 *  Codepoints are case-folded with PHYSFS_caseFold() before comparing.
 *  Returns -1 if `str1` orders first, 1 if `str2` orders first, and 0 if
 *  the folded strings match. The body comes from the UTFSTRICMP macro.
 */
int PHYSFS_ucs4stricmp(const PHYSFS_uint32 *str1, const PHYSFS_uint32 *str2)
{
    UTFSTRICMP(32);
} /* PHYSFS_ucs4stricmp */
||
563 | |||
564 | #undef UTFSTRICMP |
||
565 | |||
566 | /* end of physfs_unicode.c ... */ |
||
567 |