WebSVN – Games.Descent – Blame – /libphysfs/src/physfs_unicode.c

Rev	Author	Line No.	Line
1	pmbaty	1	#define __PHYSICSFS_INTERNAL__
		2	#include "physfs_internal.h"
		3
		4	#include "physfs_casefolding.h"
		5
		6
		7	/*
		8	* From rfc3629, the UTF-8 spec:
		9	* https://www.ietf.org/rfc/rfc3629.txt
		10	*
		11	* Char. number range | UTF-8 octet sequence
		12	* (hexadecimal) | (binary)
		13	* --------------------+---------------------------------------------
		14	* 0000 0000-0000 007F | 0xxxxxxx
		15	* 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
		16	* 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
		17	* 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		18	*/
		19
		20
		21	/*
		22	* This may not be the best value, but it's one that isn't represented
		23	* in Unicode (0x10FFFF is the largest codepoint value). We return this
		24	* value from utf8codepoint() if there are bogus bits in the
		25	* stream. Callers of utf8codepoint() can turn this value into something
		26	* reasonable (like a question mark), for text that wants to try to recover,
		27	* whereas utf8valid() will use the value to determine if a string has bad
		28	* bits.
		29	*/
		30	#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
		31
		32	/*
		33	* This is the codepoint we currently return when there were bogus bits in a
		34	* UTF-8 string. May not fly in Asian locales?
		35	*/
		36	#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
		37
		38	static PHYSFS_uint32 utf8codepoint(const char **_str)
		39	{
		40	const char str = _str;
		41	PHYSFS_uint32 retval = 0;
		42	PHYSFS_uint32 octet = (PHYSFS_uint32) ((PHYSFS_uint8) *str);
		43	PHYSFS_uint32 octet2, octet3, octet4;
		44
		45	if (octet == 0) /* null terminator, end of string. */
		46	return 0;
		47
		48	else if (octet < 128) /* one octet char: 0 to 127 */
		49	{
		50	(_str)++; / skip to next possible start of codepoint. */
		51	return octet;
		52	} /* else if */
		53
		54	else if ((octet > 127) && (octet < 192)) /* bad (starts with 10xxxxxx). */
		55	{
		56	/*
		57	* Apparently each of these is supposed to be flagged as a bogus
		58	* char, instead of just resyncing to the next valid codepoint.
		59	*/
		60	(_str)++; / skip to next possible start of codepoint. */
		61	return UNICODE_BOGUS_CHAR_VALUE;
		62	} /* else if */
		63
		64	else if (octet < 224) /* two octets */
		65	{
		66	(_str)++; / advance at least one byte in case of an error */
		67	octet -= (128+64);
		68	octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		69	if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		70	return UNICODE_BOGUS_CHAR_VALUE;
		71
		72	_str += 1; / skip to next possible start of codepoint. */
		73	retval = ((octet << 6) \| (octet2 - 128));
		74	if ((retval >= 0x80) && (retval <= 0x7FF))
		75	return retval;
		76	} /* else if */
		77
		78	else if (octet < 240) /* three octets */
		79	{
		80	(_str)++; / advance at least one byte in case of an error */
		81	octet -= (128+64+32);
		82	octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		83	if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		84	return UNICODE_BOGUS_CHAR_VALUE;
		85
		86	octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		87	if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		88	return UNICODE_BOGUS_CHAR_VALUE;
		89
		90	_str += 2; / skip to next possible start of codepoint. */
		91	retval = ( ((octet << 12)) \| ((octet2-128) << 6) \| ((octet3-128)) );
		92
		93	/* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
		94	switch (retval)
		95	{
		96	case 0xD800:
		97	case 0xDB7F:
		98	case 0xDB80:
		99	case 0xDBFF:
		100	case 0xDC00:
		101	case 0xDF80:
		102	case 0xDFFF:
		103	return UNICODE_BOGUS_CHAR_VALUE;
		104	} /* switch */
		105
		106	/* 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge. */
		107	if ((retval >= 0x800) && (retval <= 0xFFFD))
		108	return retval;
		109	} /* else if */
		110
		111	else if (octet < 248) /* four octets */
		112	{
		113	(_str)++; / advance at least one byte in case of an error */
		114	octet -= (128+64+32+16);
		115	octet2 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		116	if ((octet2 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		117	return UNICODE_BOGUS_CHAR_VALUE;
		118
		119	octet3 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		120	if ((octet3 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		121	return UNICODE_BOGUS_CHAR_VALUE;
		122
		123	octet4 = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		124	if ((octet4 & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		125	return UNICODE_BOGUS_CHAR_VALUE;
		126
		127	_str += 3; / skip to next possible start of codepoint. */
		128	retval = ( ((octet << 18)) \| ((octet2 - 128) << 12) \|
		129	((octet3 - 128) << 6) \| ((octet4 - 128)) );
		130	if ((retval >= 0x10000) && (retval <= 0x10FFFF))
		131	return retval;
		132	} /* else if */
		133
		134	/*
		135	* Five and six octet sequences became illegal in rfc3629.
		136	* We throw the codepoint away, but parse them to make sure we move
		137	* ahead the right number of bytes and don't overflow the buffer.
		138	*/
		139
		140	else if (octet < 252) /* five octets */
		141	{
		142	(_str)++; / advance at least one byte in case of an error */
		143	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		144	if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		145	return UNICODE_BOGUS_CHAR_VALUE;
		146
		147	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		148	if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		149	return UNICODE_BOGUS_CHAR_VALUE;
		150
		151	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		152	if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		153	return UNICODE_BOGUS_CHAR_VALUE;
		154
		155	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		156	if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		157	return UNICODE_BOGUS_CHAR_VALUE;
		158
		159	_str += 4; / skip to next possible start of codepoint. */
		160	return UNICODE_BOGUS_CHAR_VALUE;
		161	} /* else if */
		162
		163	else /* six octets */
		164	{
		165	(_str)++; / advance at least one byte in case of an error */
		166	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		167	if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		168	return UNICODE_BOGUS_CHAR_VALUE;
		169
		170	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		171	if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		172	return UNICODE_BOGUS_CHAR_VALUE;
		173
		174	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		175	if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		176	return UNICODE_BOGUS_CHAR_VALUE;
		177
		178	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		179	if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		180	return UNICODE_BOGUS_CHAR_VALUE;
		181
		182	octet = (PHYSFS_uint32) ((PHYSFS_uint8) *(++str));
		183	if ((octet & (128+64)) != 128) /* Format isn't 10xxxxxx? */
		184	return UNICODE_BOGUS_CHAR_VALUE;
		185
		186	_str += 6; / skip to next possible start of codepoint. */
		187	return UNICODE_BOGUS_CHAR_VALUE;
		188	} /* else if */
		189
		190	return UNICODE_BOGUS_CHAR_VALUE;
		191	} /* utf8codepoint */
		192
		193	static PHYSFS_uint32 utf16codepoint(const PHYSFS_uint16 **_str)
		194	{
		195	const PHYSFS_uint16 src = _str;
		196	PHYSFS_uint32 cp = (PHYSFS_uint32) *(src++);
		197
		198	if (cp == 0) /* null terminator, end of string. */
		199	return 0;
		200	/* Orphaned second half of surrogate pair? */
		201	else if ((cp >= 0xDC00) && (cp <= 0xDFFF))
		202	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
		203	else if ((cp >= 0xD800) && (cp <= 0xDBFF)) /* start surrogate pair! */
		204	{
		205	const PHYSFS_uint32 pair = (PHYSFS_uint32) *src;
		206	if (pair == 0)
		207	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
		208	else if ((pair < 0xDC00) \|\| (pair > 0xDFFF))
		209	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
		210	else
		211	{
		212	src++; /* eat the other surrogate. */
		213	cp = (((cp - 0xD800) << 10) \| (pair - 0xDC00));
		214	} /* else */
		215	} /* else if */
		216
		217	*_str = src;
		218	return cp;
		219	} /* utf16codepoint */
		220
		221	static PHYSFS_uint32 utf32codepoint(const PHYSFS_uint32 **_str)
		222	{
		223	const PHYSFS_uint32 src = _str;
		224	PHYSFS_uint32 cp = *(src++);
		225
		226	if (cp == 0) /* null terminator, end of string. */
		227	return 0;
		228	else if (cp > 0x10FFF)
		229	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
		230
		231	*_str = src;
		232	return cp;
		233	} /* utf32codepoint */
		234
		235
		236	void PHYSFS_utf8ToUcs4(const char src, PHYSFS_uint32 dst, PHYSFS_uint64 len)
		237	{
		238	len -= sizeof (PHYSFS_uint32); /* save room for null char. */
		239	while (len >= sizeof (PHYSFS_uint32))
		240	{
		241	PHYSFS_uint32 cp = utf8codepoint(&src);
		242	if (cp == 0)
		243	break;
		244	else if (cp == UNICODE_BOGUS_CHAR_VALUE)
		245	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
		246	*(dst++) = cp;
		247	len -= sizeof (PHYSFS_uint32);
		248	} /* while */
		249
		250	*dst = 0;
		251	} /* PHYSFS_utf8ToUcs4 */
		252
		253
		254	void PHYSFS_utf8ToUcs2(const char src, PHYSFS_uint16 dst, PHYSFS_uint64 len)
		255	{
		256	len -= sizeof (PHYSFS_uint16); /* save room for null char. */
		257	while (len >= sizeof (PHYSFS_uint16))
		258	{
		259	PHYSFS_uint32 cp = utf8codepoint(&src);
		260	if (cp == 0)
		261	break;
		262	else if (cp == UNICODE_BOGUS_CHAR_VALUE)
		263	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
		264
		265	if (cp > 0xFFFF) /* UTF-16 surrogates (bogus chars in UCS-2) */
		266	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
		267
		268	*(dst++) = cp;
		269	len -= sizeof (PHYSFS_uint16);
		270	} /* while */
		271
		272	*dst = 0;
		273	} /* PHYSFS_utf8ToUcs2 */
		274
		275
		276	void PHYSFS_utf8ToUtf16(const char src, PHYSFS_uint16 dst, PHYSFS_uint64 len)
		277	{
		278	len -= sizeof (PHYSFS_uint16); /* save room for null char. */
		279	while (len >= sizeof (PHYSFS_uint16))
		280	{
		281	PHYSFS_uint32 cp = utf8codepoint(&src);
		282	if (cp == 0)
		283	break;
		284	else if (cp == UNICODE_BOGUS_CHAR_VALUE)
		285	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
		286
		287	if (cp > 0xFFFF) /* encode as surrogate pair */
		288	{
		289	if (len < (sizeof (PHYSFS_uint16) * 2))
		290	break; /* not enough room for the pair, stop now. */
		291
		292	cp -= 0x10000; /* Make this a 20-bit value */
		293
		294	*(dst++) = 0xD800 + ((cp >> 10) & 0x3FF);
		295	len -= sizeof (PHYSFS_uint16);
		296
		297	cp = 0xDC00 + (cp & 0x3FF);
		298	} /* if */
		299
		300	*(dst++) = cp;
		301	len -= sizeof (PHYSFS_uint16);
		302	} /* while */
		303
		304	*dst = 0;
		305	} /* PHYSFS_utf8ToUtf16 */
		306
		307	static void utf8fromcodepoint(PHYSFS_uint32 cp, char *_dst, PHYSFS_uint64 _len)
		308	{
		309	char dst = _dst;
		310	PHYSFS_uint64 len = *_len;
		311
		312	if (len == 0)
		313	return;
		314
		315	if (cp > 0x10FFFF)
		316	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
		317	else if ((cp == 0xFFFE) \|\| (cp == 0xFFFF)) /* illegal values. */
		318	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
		319	else
		320	{
		321	/* There are seven "UTF-16 surrogates" that are illegal in UTF-8. */
		322	switch (cp)
		323	{
		324	case 0xD800:
		325	case 0xDB7F:
		326	case 0xDB80:
		327	case 0xDBFF:
		328	case 0xDC00:
		329	case 0xDF80:
		330	case 0xDFFF:
		331	cp = UNICODE_BOGUS_CHAR_CODEPOINT;
		332	} /* switch */
		333	} /* else */
		334
		335	/* Do the encoding... */
		336	if (cp < 0x80)
		337	{
		338	*(dst++) = (char) cp;
		339	len--;
		340	} /* if */
		341
		342	else if (cp < 0x800)
		343	{
		344	if (len < 2)
		345	len = 0;
		346	else
		347	{
		348	*(dst++) = (char) ((cp >> 6) \| 128 \| 64);
		349	*(dst++) = (char) (cp & 0x3F) \| 128;
		350	len -= 2;
		351	} /* else */
		352	} /* else if */
		353
		354	else if (cp < 0x10000)
		355	{
		356	if (len < 3)
		357	len = 0;
		358	else
		359	{
		360	*(dst++) = (char) ((cp >> 12) \| 128 \| 64 \| 32);
		361	*(dst++) = (char) ((cp >> 6) & 0x3F) \| 128;
		362	*(dst++) = (char) (cp & 0x3F) \| 128;
		363	len -= 3;
		364	} /* else */
		365	} /* else if */
		366
		367	else
		368	{
		369	if (len < 4)
		370	len = 0;
		371	else
		372	{
		373	*(dst++) = (char) ((cp >> 18) \| 128 \| 64 \| 32 \| 16);
		374	*(dst++) = (char) ((cp >> 12) & 0x3F) \| 128;
		375	*(dst++) = (char) ((cp >> 6) & 0x3F) \| 128;
		376	*(dst++) = (char) (cp & 0x3F) \| 128;
		377	len -= 4;
		378	} /* else if */
		379	} /* else */
		380
		381	*_dst = dst;
		382	*_len = len;
		383	} /* utf8fromcodepoint */
		384
		385	#define UTF8FROMTYPE(typ, src, dst, len) \
		386	if (len == 0) return; \
		387	len--; \
		388	while (len) \
		389	{ \
		390	const PHYSFS_uint32 cp = (PHYSFS_uint32) ((typ) (*(src++))); \
		391	if (cp == 0) break; \
		392	utf8fromcodepoint(cp, &dst, &len); \
		393	} \
		394	*dst = '\0'; \
		395
		396	void PHYSFS_utf8FromUcs4(const PHYSFS_uint32 src, char dst, PHYSFS_uint64 len)
		397	{
		398	UTF8FROMTYPE(PHYSFS_uint32, src, dst, len);
		399	} /* PHYSFS_utf8FromUcs4 */
		400
		401	void PHYSFS_utf8FromUcs2(const PHYSFS_uint16 src, char dst, PHYSFS_uint64 len)
		402	{
		403	UTF8FROMTYPE(PHYSFS_uint64, src, dst, len);
		404	} /* PHYSFS_utf8FromUcs2 */
		405
		406	/* latin1 maps to unicode codepoints directly, we just utf-8 encode it. */
		407	void PHYSFS_utf8FromLatin1(const char src, char dst, PHYSFS_uint64 len)
		408	{
		409	UTF8FROMTYPE(PHYSFS_uint8, src, dst, len);
		410	} /* PHYSFS_utf8FromLatin1 */
		411
		412	#undef UTF8FROMTYPE
		413
		414
		415	void PHYSFS_utf8FromUtf16(const PHYSFS_uint16 src, char dst, PHYSFS_uint64 len)
		416	{
		417	if (len == 0)
		418	return;
		419
		420	len--;
		421	while (len)
		422	{
		423	const PHYSFS_uint32 cp = utf16codepoint(&src);
		424	if (!cp)
		425	break;
		426	utf8fromcodepoint(cp, &dst, &len);
		427	} /* while */
		428
		429	*dst = '\0';
		430	} /* PHYSFS_utf8FromUtf16 */
		431
		432
		433	int PHYSFS_caseFold(const PHYSFS_uint32 from, PHYSFS_uint32 *to)
		434	{
		435	int i;
		436
		437	if (from < 128) /* low-ASCII, easy! */
		438	{
		439	if ((from >= 'A') && (from <= 'Z'))
		440	*to = from - ('A' - 'a');
		441	else
		442	*to = from;
		443	return 1;
		444	} /* if */
		445
		446	else if (from <= 0xFFFF)
		447	{
		448	const PHYSFS_uint8 hash = ((from ^ (from >> 8)) & 0xFF);
		449	const PHYSFS_uint16 from16 = (PHYSFS_uint16) from;
		450
		451	{
		452	const CaseFoldHashBucket1_16 *bucket = &case_fold_hash1_16[hash];
		453	const int count = (int) bucket->count;
		454	for (i = 0; i < count; i++)
		455	{
		456	const CaseFoldMapping1_16 *mapping = &bucket->list[i];
		457	if (mapping->from == from16)
		458	{
		459	*to = mapping->to0;
		460	return 1;
		461	} /* if */
		462	} /* for */
		463	}
		464
		465	{
		466	const CaseFoldHashBucket2_16 *bucket = &case_fold_hash2_16[hash & 15];
		467	const int count = (int) bucket->count;
		468	for (i = 0; i < count; i++)
		469	{
		470	const CaseFoldMapping2_16 *mapping = &bucket->list[i];
		471	if (mapping->from == from16)
		472	{
		473	to[0] = mapping->to0;
		474	to[1] = mapping->to1;
		475	return 2;
		476	} /* if */
		477	} /* for */
		478	}
		479
		480	{
		481	const CaseFoldHashBucket3_16 *bucket = &case_fold_hash3_16[hash & 3];
		482	const int count = (int) bucket->count;
		483	for (i = 0; i < count; i++)
		484	{
		485	const CaseFoldMapping3_16 *mapping = &bucket->list[i];
		486	if (mapping->from == from16)
		487	{
		488	to[0] = mapping->to0;
		489	to[1] = mapping->to1;
		490	to[2] = mapping->to2;
		491	return 3;
		492	} /* if */
		493	} /* for */
		494	}
		495	} /* else if */
		496
		497	else /* codepoint that doesn't fit in 16 bits. */
		498	{
		499	const PHYSFS_uint8 hash = ((from ^ (from >> 8)) & 0xFF);
		500	const CaseFoldHashBucket1_32 *bucket = &case_fold_hash1_32[hash & 15];
		501	const int count = (int) bucket->count;
		502	for (i = 0; i < count; i++)
		503	{
		504	const CaseFoldMapping1_32 *mapping = &bucket->list[i];
		505	if (mapping->from == from)
		506	{
		507	*to = mapping->to0;
		508	return 1;
		509	} /* if */
		510	} /* for */
		511	} /* else */
		512
		513
		514	/* Not found...there's no remapping for this codepoint. */
		515	*to = from;
		516	return 1;
		517	} /* PHYSFS_caseFold */
		518
		519
		520	#define UTFSTRICMP(bits) \
		521	PHYSFS_uint32 folded1[3], folded2[3]; \
		522	int head1 = 0, tail1 = 0, head2 = 0, tail2 = 0; \
		523	while (1) { \
		524	PHYSFS_uint32 cp1, cp2; \
		525	if (head1 != tail1) { \
		526	cp1 = folded1[tail1++]; \
		527	} else { \
		528	head1 = PHYSFS_caseFold(utf##bits##codepoint(&str1), folded1); \
		529	cp1 = folded1[0]; \
		530	tail1 = 1; \
		531	} \
		532	if (head2 != tail2) { \
		533	cp2 = folded2[tail2++]; \
		534	} else { \
		535	head2 = PHYSFS_caseFold(utf##bits##codepoint(&str2), folded2); \
		536	cp2 = folded2[0]; \
		537	tail2 = 1; \
		538	} \
		539	if (cp1 < cp2) { \
		540	return -1; \
		541	} else if (cp1 > cp2) { \
		542	return 1; \
		543	} else if (cp1 == 0) { \
		544	break; /* complete match. */ \
		545	} \
		546	} \
		547	return 0
		548
		549	int PHYSFS_utf8stricmp(const char str1, const char str2)
		550	{
		551	UTFSTRICMP(8);
		552	} /* PHYSFS_utf8stricmp */
		553
		554	int PHYSFS_utf16stricmp(const PHYSFS_uint16 str1, const PHYSFS_uint16 str2)
		555	{
		556	UTFSTRICMP(16);
		557	} /* PHYSFS_utf16stricmp */
		558
		559	int PHYSFS_ucs4stricmp(const PHYSFS_uint32 str1, const PHYSFS_uint32 str2)
		560	{
		561	UTFSTRICMP(32);
		562	} /* PHYSFS_ucs4stricmp */
		563
		564	#undef UTFSTRICMP
		565
		566	/* end of physfs_unicode.c ... */
		567

Subversion Repositories Games.Descent

Games.Descent /libphysfs/src/physfs_unicode.c – Rev 1