Subversion Repositories Games.Descent

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
1 pmbaty 1
#define __PHYSICSFS_INTERNAL__
2
#include "physfs_internal.h"
3
 
4
#include "physfs_casefolding.h"
5
 
6
 
7
/*
8
 * From rfc3629, the UTF-8 spec:
9
 *  https://www.ietf.org/rfc/rfc3629.txt
10
 *
11
 *   Char. number range  |        UTF-8 octet sequence
12
 *      (hexadecimal)    |              (binary)
13
 *   --------------------+---------------------------------------------
14
 *   0000 0000-0000 007F | 0xxxxxxx
15
 *   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
16
 *   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
17
 *   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
18
 */
19
 
20
 
21
/*
 * This may not be the best value, but it's one that isn't represented
 *  in Unicode (0x10FFFF is the largest codepoint value). We return this
 *  value from utf8codepoint() if there are bogus bits in the
 *  stream. Callers such as PHYSFS_utf8ToUcs4() turn this value into
 *  something reasonable (like a question mark), for text that wants to try
 *  to recover, whereas a validity check (e.g. a utf8valid() routine) can use
 *  the value to determine if a string has bad bits.
 */
30
#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
31
 
32
/*
33
 * This is the codepoint we currently return when there was bogus bits in a
34
 *  UTF-8 string. May not fly in Asian locales?
35
 */
36
#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
37
 
38
static PHYSFS_uint32 utf8codepoint(const char **_str)
39
{
40
    const char *str = *_str;
41
    PHYSFS_uint32 retval = 0;
42
    PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
43
    PHYSFS_uint32 octet2, octet3, octet4;
44
 
45
    if (octet == 0)  /* null terminator, end of string. */
46
        return 0;
47
 
48
    else if (octet < 128)  /* one octet char: 0 to 127 */
49
    {
50
        (*_str)++;  /* skip to next possible start of codepoint. */
51
        return octet;
52
    } /* else if */
53
 
54
    else if ((octet > 127) && (octet < 192))  /* bad (starts with 10xxxxxx). */
55
    {
56
        /*
57
         * Apparently each of these is supposed to be flagged as a bogus
58
         *  char, instead of just resyncing to the next valid codepoint.
59
         */
60
        (*_str)++;  /* skip to next possible start of codepoint. */
61
        return UNICODE_BOGUS_CHAR_VALUE;
62
    } /* else if */
63
 
64
    else if (octet < 224)  /* two octets */
65
    {
66
        (*_str)++;  /* advance at least one byte in case of an error */
67
        octet -= (128+64);
68
        octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
69
        if ((octet2 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
70
            return UNICODE_BOGUS_CHAR_VALUE;
71
 
72
        *_str += 1;  /* skip to next possible start of codepoint. */
73
        retval = ((octet << 6) | (octet2 - 128));
74
        if ((retval >= 0x80) && (retval <= 0x7FF))
75
            return retval;
76
    } /* else if */
77
 
78
    else if (octet < 240)  /* three octets */
79
    {
80
        (*_str)++;  /* advance at least one byte in case of an error */
81
        octet -= (128+64+32);
82
        octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
83
        if ((octet2 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
84
            return UNICODE_BOGUS_CHAR_VALUE;
85
 
86
        octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
87
        if ((octet3 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
88
            return UNICODE_BOGUS_CHAR_VALUE;
89
 
90
        *_str += 2;  /* skip to next possible start of codepoint. */
91
        retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
92
 
93
        /* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
94
        switch (retval)
95
        {
96
            case 0xD800:
97
            case 0xDB7F:
98
            case 0xDB80:
99
            case 0xDBFF:
100
            case 0xDC00:
101
            case 0xDF80:
102
            case 0xDFFF:
103
                return UNICODE_BOGUS_CHAR_VALUE;
104
        } /* switch */
105
 
106
        /* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */
107
        if ((retval >= 0x800) && (retval <= 0xFFFD))
108
            return retval;
109
    } /* else if */
110
 
111
    else if (octet < 248)  /* four octets */
112
    {
113
        (*_str)++;  /* advance at least one byte in case of an error */
114
        octet -= (128+64+32+16);
115
        octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
116
        if ((octet2 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
117
            return UNICODE_BOGUS_CHAR_VALUE;
118
 
119
        octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
120
        if ((octet3 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
121
            return UNICODE_BOGUS_CHAR_VALUE;
122
 
123
        octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
124
        if ((octet4 & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
125
            return UNICODE_BOGUS_CHAR_VALUE;
126
 
127
        *_str += 3;  /* skip to next possible start of codepoint. */
128
        retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
129
                   ((octet3 - 128) << 6) | ((octet4 - 128)) );
130
        if ((retval >= 0x10000) && (retval <= 0x10FFFF))
131
            return retval;
132
    } /* else if */
133
 
134
    /*
135
     * Five and six octet sequences became illegal in rfc3629.
136
     *  We throw the codepoint away, but parse them to make sure we move
137
     *  ahead the right number of bytes and don't overflow the buffer.
138
     */
139
 
140
    else if (octet < 252)  /* five octets */
141
    {
142
        (*_str)++;  /* advance at least one byte in case of an error */
143
        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
144
        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
145
            return UNICODE_BOGUS_CHAR_VALUE;
146
 
147
        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
148
        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
149
            return UNICODE_BOGUS_CHAR_VALUE;
150
 
151
        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
152
        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
153
            return UNICODE_BOGUS_CHAR_VALUE;
154
 
155
        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
156
        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
157
            return UNICODE_BOGUS_CHAR_VALUE;
158
 
159
        *_str += 4;  /* skip to next possible start of codepoint. */
160
        return UNICODE_BOGUS_CHAR_VALUE;
161
    } /* else if */
162
 
163
    else  /* six octets */
164
    {
165
        (*_str)++;  /* advance at least one byte in case of an error */
166
        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
167
        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
168
            return UNICODE_BOGUS_CHAR_VALUE;
169
 
170
        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
171
        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
172
            return UNICODE_BOGUS_CHAR_VALUE;
173
 
174
        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
175
        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
176
            return UNICODE_BOGUS_CHAR_VALUE;
177
 
178
        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
179
        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
180
            return UNICODE_BOGUS_CHAR_VALUE;
181
 
182
        octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
183
        if ((octet & (128+64)) != 128)  /* Format isn't 10xxxxxx? */
184
            return UNICODE_BOGUS_CHAR_VALUE;
185
 
186
        *_str += 6;  /* skip to next possible start of codepoint. */
187
        return UNICODE_BOGUS_CHAR_VALUE;
188
    } /* else if */
189
 
190
    return UNICODE_BOGUS_CHAR_VALUE;
191
} /* utf8codepoint */
192
 
193
static PHYSFS_uint32 utf16codepoint(const PHYSFS_uint16 **_str)
194
{
195
    const PHYSFS_uint16 *src = *_str;
196
    PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++);
197
 
198
    if (cp == 0)  /* null terminator, end of string. */
199
        return 0;
200
    /* Orphaned second half of surrogate pair? */
201
    else if ((cp >= 0xDC00) && (cp <= 0xDFFF))
202
        cp = UNICODE_BOGUS_CHAR_CODEPOINT;
203
    else if ((cp >= 0xD800) && (cp <= 0xDBFF))  /* start surrogate pair! */
204
    {
205
        const PHYSFS_uint32 pair = (PHYSFS_uint32) *src;
206
        if (pair == 0)
207
            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
208
        else if ((pair < 0xDC00) || (pair > 0xDFFF))
209
            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
210
        else
211
        {
212
            src++;  /* eat the other surrogate. */
213
            cp = (((cp - 0xD800) << 10) | (pair - 0xDC00));
214
        } /* else */
215
    } /* else if */
216
 
217
    *_str = src;
218
    return cp;
219
} /* utf16codepoint */
220
 
221
static PHYSFS_uint32 utf32codepoint(const PHYSFS_uint32 **_str)
222
{
223
    const PHYSFS_uint32 *src = *_str;
224
    PHYSFS_uint32 cp = *(src++);
225
 
226
    if (cp == 0)  /* null terminator, end of string. */
227
        return 0;
228
    else if (cp > 0x10FFF)
229
        cp = UNICODE_BOGUS_CHAR_CODEPOINT;
230
 
231
    *_str = src;
232
    return cp;
233
} /* utf32codepoint */
234
 
235
 
236
/*
 * Convert the null-terminated UTF-8 string `src` to UCS-4 in `dst`.
 *
 * `len` is the size of `dst` in bytes. The output is always null-terminated
 *  and is truncated if it doesn't fit. Malformed input sequences become
 *  UNICODE_BOGUS_CHAR_CODEPOINT. If `len` can't even hold the terminator,
 *  nothing is written.
 */
void PHYSFS_utf8ToUcs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
{
    /* `len` is unsigned: subtracting below without this check would wrap. */
    if (len < sizeof (PHYSFS_uint32))
        return;

    len -= sizeof (PHYSFS_uint32);   /* save room for null char. */
    while (len >= sizeof (PHYSFS_uint32))
    {
        PHYSFS_uint32 cp = utf8codepoint(&src);
        if (cp == 0)
            break;
        else if (cp == UNICODE_BOGUS_CHAR_VALUE)
            cp = UNICODE_BOGUS_CHAR_CODEPOINT;
        *(dst++) = cp;
        len -= sizeof (PHYSFS_uint32);
    } /* while */

    *dst = 0;
} /* PHYSFS_utf8ToUcs4 */
252
 
253
 
254
/*
 * Convert the null-terminated UTF-8 string `src` to UCS-2 in `dst`.
 *
 * `len` is the size of `dst` in bytes. The output is always null-terminated
 *  and is truncated if it doesn't fit. Malformed input sequences, and
 *  codepoints above 0xFFFF (which UCS-2 can't represent), become
 *  UNICODE_BOGUS_CHAR_CODEPOINT. If `len` can't even hold the terminator,
 *  nothing is written.
 */
void PHYSFS_utf8ToUcs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
{
    /* `len` is unsigned: subtracting below without this check would wrap. */
    if (len < sizeof (PHYSFS_uint16))
        return;

    len -= sizeof (PHYSFS_uint16);   /* save room for null char. */
    while (len >= sizeof (PHYSFS_uint16))
    {
        PHYSFS_uint32 cp = utf8codepoint(&src);
        if (cp == 0)
            break;
        else if (cp == UNICODE_BOGUS_CHAR_VALUE)
            cp = UNICODE_BOGUS_CHAR_CODEPOINT;

        if (cp > 0xFFFF)  /* would need a surrogate pair; not valid UCS-2. */
            cp = UNICODE_BOGUS_CHAR_CODEPOINT;

        *(dst++) = (PHYSFS_uint16) cp;
        len -= sizeof (PHYSFS_uint16);
    } /* while */

    *dst = 0;
} /* PHYSFS_utf8ToUcs2 */
274
 
275
 
276
/*
 * Convert the null-terminated UTF-8 string `src` to UTF-16 in `dst`.
 *
 * `len` is the size of `dst` in bytes. The output is always null-terminated
 *  and is truncated if it doesn't fit; a surrogate pair is never split by
 *  truncation. Malformed input sequences become
 *  UNICODE_BOGUS_CHAR_CODEPOINT. If `len` can't even hold the terminator,
 *  nothing is written.
 */
void PHYSFS_utf8ToUtf16(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
{
    /* `len` is unsigned: subtracting below without this check would wrap. */
    if (len < sizeof (PHYSFS_uint16))
        return;

    len -= sizeof (PHYSFS_uint16);   /* save room for null char. */
    while (len >= sizeof (PHYSFS_uint16))
    {
        PHYSFS_uint32 cp = utf8codepoint(&src);
        if (cp == 0)
            break;
        else if (cp == UNICODE_BOGUS_CHAR_VALUE)
            cp = UNICODE_BOGUS_CHAR_CODEPOINT;

        if (cp > 0xFFFF)  /* encode as surrogate pair */
        {
            if (len < (sizeof (PHYSFS_uint16) * 2))
                break;  /* not enough room for the pair, stop now. */

            cp -= 0x10000;  /* Make this a 20-bit value */

            /* high surrogate carries the top ten bits... */
            *(dst++) = (PHYSFS_uint16) (0xD800 + ((cp >> 10) & 0x3FF));
            len -= sizeof (PHYSFS_uint16);

            /* ...low surrogate (written below) carries the bottom ten. */
            cp = 0xDC00 + (cp & 0x3FF);
        } /* if */

        *(dst++) = (PHYSFS_uint16) cp;
        len -= sizeof (PHYSFS_uint16);
    } /* while */

    *dst = 0;
} /* PHYSFS_utf8ToUtf16 */
306
 
307
/*
 * Append the UTF-8 encoding of `cp` to the buffer at *_dst, which has *_len
 *  bytes available, then update both to reflect what was written.
 *
 * Codepoints above 0x10FFFF, the values 0xFFFE and 0xFFFF, and seven
 *  specific UTF-16 surrogate values are written as
 *  UNICODE_BOGUS_CHAR_CODEPOINT instead. If the encoding doesn't fit,
 *  nothing is written and *_len is set to zero. If *_len is already zero,
 *  nothing is touched at all.
 */
static void utf8fromcodepoint(PHYSFS_uint32 cp, char **_dst, PHYSFS_uint64 *_len)
{
    char *out = *_dst;
    PHYSFS_uint64 avail = *_len;
    PHYSFS_uint64 need;     /* bytes this codepoint encodes to. */
    PHYSFS_uint32 prefix;   /* marker bits for the lead byte. */
    int shift;

    if (avail == 0)
        return;

    /* Swap anything we refuse to encode for the replacement character. */
    if ( (cp > 0x10FFFF) ||
         (cp == 0xFFFE) || (cp == 0xFFFF) ||  /* illegal values. */
         (cp == 0xD800) || (cp == 0xDB7F) || (cp == 0xDB80) ||
         (cp == 0xDBFF) || (cp == 0xDC00) || (cp == 0xDF80) ||
         (cp == 0xDFFF) )  /* the seven "UTF-16 surrogates". */
    {
        cp = UNICODE_BOGUS_CHAR_CODEPOINT;
    } /* if */

    /* Work out how long the sequence is, and what the lead byte looks like. */
    if (cp < 0x80)
    {
        need = 1;
        prefix = 0;
    } /* if */
    else if (cp < 0x800)
    {
        need = 2;
        prefix = 128 | 64;
    } /* else if */
    else if (cp < 0x10000)
    {
        need = 3;
        prefix = 128 | 64 | 32;
    } /* else if */
    else
    {
        need = 4;
        prefix = 128 | 64 | 32 | 16;
    } /* else */

    if (avail < need)
        avail = 0;  /* doesn't fit: write nothing, report the buffer full. */
    else
    {
        /* lead byte holds the topmost payload bits... */
        shift = 6 * ((int) need - 1);
        *(out++) = (char) ((cp >> shift) | prefix);

        /* ...and each continuation byte the next six, as 10xxxxxx. */
        while (shift > 0)
        {
            shift -= 6;
            *(out++) = (char) (((cp >> shift) & 0x3F) | 128);
        } /* while */

        avail -= need;
    } /* else */

    *_dst = out;
    *_len = avail;
} /* utf8fromcodepoint */
384
 
385
#define UTF8FROMTYPE(typ, src, dst, len) \
386
    if (len == 0) return; \
387
    len--;  \
388
    while (len) \
389
    { \
390
        const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
391
        if (cp == 0) break; \
392
        utf8fromcodepoint(cp, &dst, &len); \
393
    } \
394
    *dst = '\0'; \
395
 
396
void PHYSFS_utf8FromUcs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
397
{
398
    UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
399
} /* PHYSFS_utf8FromUcs4 */
400
 
401
void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
402
{
403
    UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
404
} /* PHYSFS_utf8FromUcs2 */
405
 
406
/* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
407
void PHYSFS_utf8FromLatin1(const char *src, char *dst, PHYSFS_uint64 len)
408
{
409
    UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
410
} /* PHYSFS_utf8FromLatin1 */
411
 
412
#undef UTF8FROMTYPE
413
 
414
 
415
/*
 * Convert the null-terminated UTF-16 string `src` to UTF-8 in `dst`.
 *
 * `len` is the size of `dst` in bytes. The output is null-terminated and
 *  stops early if the buffer fills up. A zero `len` writes nothing.
 */
void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
{
    PHYSFS_uint32 codepoint;

    if (len == 0)
        return;

    /* hold one byte back for the null terminator. */
    for (len--; len != 0; )
    {
        codepoint = utf16codepoint(&src);
        if (codepoint == 0)
            break;  /* end of the source string. */
        utf8fromcodepoint(codepoint, &dst, &len);
    } /* for */

    *dst = '\0';
} /* PHYSFS_utf8FromUtf16 */
431
 
432
 
433
/*
 * Case-fold the codepoint `from`, storing the result in `to`.
 *
 * Returns how many codepoints were written to `to`: 1, 2 or 3. `to` must
 *  have room for three. Low-ASCII is folded directly ('A'..'Z' become
 *  'a'..'z'); everything else is looked up in the case_fold_hash* tables.
 *  A codepoint with no entry in the tables folds to itself.
 */
int PHYSFS_caseFold(const PHYSFS_uint32 from, PHYSFS_uint32 *to)
{
    /* the lookup tables are keyed on the low two bytes xor'd together. */
    const PHYSFS_uint8 key = ((from ^ (from >> 8)) & 0xFF);
    int idx;

    if (from < 128)  /* low-ASCII, easy! */
    {
        const int is_upper = ((from >= 'A') && (from <= 'Z'));
        *to = is_upper ? (from + ('a' - 'A')) : from;
        return 1;
    } /* if */

    if (from > 0xFFFF)  /* codepoint that doesn't fit in 16 bits. */
    {
        const CaseFoldHashBucket1_32 *bucket32 = &case_fold_hash1_32[key & 15];
        const int total = (int) bucket32->count;
        for (idx = 0; idx < total; idx++)
        {
            const CaseFoldMapping1_32 *entry = &bucket32->list[idx];
            if (entry->from != from)
                continue;
            *to = entry->to0;
            return 1;
        } /* for */
    } /* if */

    else  /* fits in 16 bits: try the 1-, 2- and 3-codepoint tables in turn. */
    {
        const PHYSFS_uint16 from16 = (PHYSFS_uint16) from;
        const CaseFoldHashBucket1_16 *one = &case_fold_hash1_16[key];
        const CaseFoldHashBucket2_16 *two = &case_fold_hash2_16[key & 15];
        const CaseFoldHashBucket3_16 *three = &case_fold_hash3_16[key & 3];
        const int total1 = (int) one->count;
        const int total2 = (int) two->count;
        const int total3 = (int) three->count;

        for (idx = 0; idx < total1; idx++)
        {
            const CaseFoldMapping1_16 *entry = &one->list[idx];
            if (entry->from != from16)
                continue;
            *to = entry->to0;
            return 1;
        } /* for */

        for (idx = 0; idx < total2; idx++)
        {
            const CaseFoldMapping2_16 *entry = &two->list[idx];
            if (entry->from != from16)
                continue;
            to[0] = entry->to0;
            to[1] = entry->to1;
            return 2;
        } /* for */

        for (idx = 0; idx < total3; idx++)
        {
            const CaseFoldMapping3_16 *entry = &three->list[idx];
            if (entry->from != from16)
                continue;
            to[0] = entry->to0;
            to[1] = entry->to1;
            to[2] = entry->to2;
            return 3;
        } /* for */
    } /* else */

    /* Not found...there's no remapping for this codepoint. */
    *to = from;
    return 1;
} /* PHYSFS_caseFold */
518
 
519
 
520
#define UTFSTRICMP(bits) \
521
    PHYSFS_uint32 folded1[3], folded2[3]; \
522
    int head1 = 0, tail1 = 0, head2 = 0, tail2 = 0; \
523
    while (1) { \
524
        PHYSFS_uint32 cp1, cp2; \
525
        if (head1 != tail1) { \
526
            cp1 = folded1[tail1++]; \
527
        } else { \
528
            head1 = PHYSFS_caseFold(utf##bits##codepoint(&str1), folded1); \
529
            cp1 = folded1[0]; \
530
            tail1 = 1; \
531
        } \
532
        if (head2 != tail2) { \
533
            cp2 = folded2[tail2++]; \
534
        } else { \
535
            head2 = PHYSFS_caseFold(utf##bits##codepoint(&str2), folded2); \
536
            cp2 = folded2[0]; \
537
            tail2 = 1; \
538
        } \
539
        if (cp1 < cp2) { \
540
            return -1; \
541
        } else if (cp1 > cp2) { \
542
            return 1; \
543
        } else if (cp1 == 0) { \
544
            break;  /* complete match. */ \
545
        } \
546
    } \
547
    return 0
548
 
549
int PHYSFS_utf8stricmp(const char *str1, const char *str2)
550
{
551
    UTFSTRICMP(8);
552
} /* PHYSFS_utf8stricmp */
553
 
554
int PHYSFS_utf16stricmp(const PHYSFS_uint16 *str1, const PHYSFS_uint16 *str2)
555
{
556
    UTFSTRICMP(16);
557
} /* PHYSFS_utf16stricmp */
558
 
559
int PHYSFS_ucs4stricmp(const PHYSFS_uint32 *str1, const PHYSFS_uint32 *str2)
560
{
561
    UTFSTRICMP(32);
562
} /* PHYSFS_ucs4stricmp */
563
 
564
#undef UTFSTRICMP
565
 
566
/* end of physfs_unicode.c ... */
567