/*++

Copyright (c) Microsoft Corporation. All rights reserved.

Module Name:

    xnamathconvert.inl

Abstract:

    XNA math library for Windows and Xbox 360: Conversion, loading, and storing functions.
--*/

#if defined(_MSC_VER) && (_MSC_VER > 1000)
#pragma once
#endif

#ifndef __XNAMATHCONVERT_INL__
#define __XNAMATHCONVERT_INL__

#define XM_PACK_FACTOR                  (FLOAT)(1 << 22)
#define XM_UNPACK_FACTOR_UNSIGNED       (FLOAT)(1 << 23)
#define XM_UNPACK_FACTOR_SIGNED         XM_PACK_FACTOR

#define XM_UNPACK_UNSIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
                                        {-XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
                                         -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
                                         -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
                                         -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}

#define XM_UNPACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
                                        {XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
                                         XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
                                         XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
                                         XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}

#define XM_UNPACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
                                        {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1), \
                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1), \
                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1), \
                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1)}

//#define XM_UNPACK_SIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
//                                        {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1) * 3.0f, \
//                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1) * 3.0f, \
//                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1) * 3.0f, \
//                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1) * 3.0f}

#define XM_PACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
                                        {-(FLOAT)((1 << (BitsX)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << (BitsY)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << (BitsZ)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << (BitsW)) - 1) / XM_PACK_FACTOR}

#define XM_PACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
                                        {-(FLOAT)((1 << ((BitsX) - 1)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << ((BitsY) - 1)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << ((BitsZ) - 1)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << ((BitsW) - 1)) - 1) / XM_PACK_FACTOR}

#define XM_PACK_OFFSET                  XMVectorSplatConstant(3, 0)
//#define XM_UNPACK_OFFSET                XM_PACK_OFFSET

/****************************************************************************
 *
 * Data conversion
 *
 ****************************************************************************/

//------------------------------------------------------------------------------

XMFINLINE FLOAT XMConvertHalfToFloat
(
    HALF Value
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    UINT Mantissa;
    UINT Exponent;
    UINT Result;

    Mantissa = (UINT)(Value & 0x03FF);

    if ((Value & 0x7C00) != 0)  // The value is normalized
    {
        Exponent = (UINT)((Value >> 10) & 0x1F);
    }
    else if (Mantissa != 0)     // The value is denormalized
    {
        // Normalize the value in the resulting float
        Exponent = 1;

        do
        {
            Exponent--;
            Mantissa <<= 1;
        } while ((Mantissa & 0x0400) == 0);

        Mantissa &= 0x03FF;
    }
    else                        // The value is zero
    {
        Exponent = (UINT)-112;
    }

    Result = ((Value & 0x8000) << 16) | // Sign
             ((Exponent + 112) << 23) | // Exponent
             (Mantissa << 13);          // Mantissa

    return *(FLOAT*)&Result;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif
}
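
// Illustrative usage sketch (not part of the original XNA Math sources): the
// function maps a raw 16-bit half-precision bit pattern to a 32-bit float, so
// a round trip through XMConvertFloatToHalf (defined below) recovers the
// original encoding for representable values:
//
//     HALF  h = 0x3C00;                       // 1.0 encoded as an IEEE 754 half
//     FLOAT f = XMConvertHalfToFloat(h);      // f == 1.0f
//     HALF  r = XMConvertFloatToHalf(f);      // r == 0x3C00 again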

//------------------------------------------------------------------------------

XMINLINE FLOAT* XMConvertHalfToFloatStream
(
    FLOAT*      pOutputStream,
    UINT        OutputStride,
    CONST HALF* pInputStream,
    UINT        InputStride,
    UINT        HalfCount
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    UINT  i;
    BYTE* pHalf = (BYTE*)pInputStream;
    BYTE* pFloat = (BYTE*)pOutputStream;

    XMASSERT(pOutputStream);
    XMASSERT(pInputStream);

    for (i = 0; i < HalfCount; i++)
    {
        *(FLOAT*)pFloat = XMConvertHalfToFloat(*(HALF*)pHalf);
        pHalf += InputStride;
        pFloat += OutputStride;
    }

    return pOutputStream;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE HALF XMConvertFloatToHalf
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
    UINT Result;

    UINT IValue = ((UINT *)(&Value))[0];
    UINT Sign = (IValue & 0x80000000U) >> 16U;
    IValue = IValue & 0x7FFFFFFFU;      // Hack off the sign

    if (IValue > 0x47FFEFFFU)
    {
        // The number is too large to be represented as a half.  Saturate to infinity.
        Result = 0x7FFFU;
    }
    else
    {
        if (IValue < 0x38800000U)
        {
            // The number is too small to be represented as a normalized half.
            // Convert it to a denormalized value.
            UINT Shift = 113U - (IValue >> 23U);
            IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
        }
        else
        {
            // Rebias the exponent to represent the value as a normalized half.
            IValue += 0xC8000000U;
        }

        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
    }
    return (HALF)(Result|Sign);

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif
}

//------------------------------------------------------------------------------

XMINLINE HALF* XMConvertFloatToHalfStream
(
    HALF*        pOutputStream,
    UINT         OutputStride,
    CONST FLOAT* pInputStream,
    UINT         InputStride,
    UINT         FloatCount
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    UINT  i;
    BYTE* pFloat = (BYTE*)pInputStream;
    BYTE* pHalf = (BYTE*)pOutputStream;

    XMASSERT(pOutputStream);
    XMASSERT(pInputStream);

    for (i = 0; i < FloatCount; i++)
    {
        *(HALF*)pHalf = XMConvertFloatToHalf(*(FLOAT*)pFloat);
        pFloat += InputStride;
        pHalf += OutputStride;
    }
    return pOutputStream;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
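
// Illustrative usage sketch (not part of the original XNA Math sources): the
// stream converters advance through both buffers by byte strides, so they also
// work on interleaved vertex data.  Converting a plain array of 4 halves:
//
//     HALF  halves[4];                        // filled in elsewhere
//     FLOAT floats[4];
//     XMConvertHalfToFloatStream(floats, sizeof(FLOAT), halves, sizeof(HALF), 4);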

//------------------------------------------------------------------------------

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
// For VMX128, these routines are all defines in the main header

#pragma warning(push)
#pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized

XMINLINE XMVECTOR XMConvertVectorIntToFloat
(
    FXMVECTOR VInt,
    UINT     DivExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex;
    FLOAT fScale;
    XMVECTOR Result;
    XMASSERT(DivExponent<32);
    fScale = 1.0f / (FLOAT)(1U << DivExponent);
    ElementIndex = 0;
    do {
        INT iTemp = (INT)VInt.vector4_u32[ElementIndex];
        Result.vector4_f32[ElementIndex] = ((FLOAT)iTemp) * fScale;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(DivExponent<32);
    // Convert to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&VInt)[0]);
    // Convert DivExponent into 1.0f/(1<<DivExponent)
    UINT uScale = 0x3F800000U - (DivExponent << 23);
    // Splat the scalar value
    __m128i vScale = _mm_set1_epi32(uScale);
    vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

XMINLINE XMVECTOR XMConvertVectorFloatToInt
(
    FXMVECTOR VFloat,
    UINT     MulExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex;
    XMVECTOR Result;
    FLOAT fScale;
    XMASSERT(MulExponent<32);
    // Get the scalar factor.
    fScale = (FLOAT)(1U << MulExponent);
    ElementIndex = 0;
    do {
        INT iResult;
        FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
        if (fTemp <= -(65536.0f*32768.0f)) {
            iResult = (-0x7FFFFFFF)-1;
        } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
            iResult = 0x7FFFFFFF;
        } else {
            iResult = (INT)fTemp;
        }
        Result.vector4_u32[ElementIndex] = (UINT)iResult;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(MulExponent<32);
    static const XMVECTORF32 MaxInt = {65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f};
    XMVECTOR vResult = _mm_set_ps1((FLOAT)(1U << MulExponent));
    vResult = _mm_mul_ps(vResult,VFloat);
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,MaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // If there was positive overflow, set to 0x7FFFFFFF
    vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
    vOverflow = _mm_or_ps(vOverflow,vResult);
    return vOverflow;
#endif
}
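
// Illustrative usage sketch (not part of the original XNA Math sources):
// DivExponent/MulExponent express a power-of-two fixed-point scale, i.e. the
// integer lanes are interpreted as value * 2^-DivExponent on load and scaled by
// 2^MulExponent before truncation on store.  For 16.16 fixed-point data
// (pFixedPointData is a hypothetical pointer to 4 packed 16.16 integers):
//
//     XMVECTOR vFixed  = XMLoadInt4(pFixedPointData);
//     XMVECTOR vFloats = XMConvertVectorIntToFloat(vFixed, 16);   // divide by 65536
//     XMVECTOR vBack   = XMConvertVectorFloatToInt(vFloats, 16);  // multiply by 65536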

//------------------------------------------------------------------------------

XMINLINE XMVECTOR XMConvertVectorUIntToFloat
(
    FXMVECTOR VUInt,
    UINT      DivExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex;
    FLOAT fScale;
    XMVECTOR Result;
    XMASSERT(DivExponent<32);
    fScale = 1.0f / (FLOAT)(1U << DivExponent);
    ElementIndex = 0;
    do {
        Result.vector4_f32[ElementIndex] = (FLOAT)VUInt.vector4_u32[ElementIndex] * fScale;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(DivExponent<32);
    static const XMVECTORF32 FixUnsigned = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
    // Determine which ones need the fix.
    XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
    // Force all values positive
    XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert 0x80000000 -> 0xFFFFFFFF
    __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
    // For only the ones that are too big, add the fixup
    vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],FixUnsigned);
    vResult = _mm_add_ps(vResult,vMask);
    // Convert DivExponent into 1.0f/(1<<DivExponent)
    UINT uScale = 0x3F800000U - (DivExponent << 23);
    // Splat
    iMask = _mm_set1_epi32(uScale);
    vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&iMask)[0]);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

XMINLINE XMVECTOR XMConvertVectorFloatToUInt
(
    FXMVECTOR VFloat,
    UINT      MulExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex;
    XMVECTOR Result;
    FLOAT fScale;
    XMASSERT(MulExponent<32);
    // Get the scalar factor.
    fScale = (FLOAT)(1U << MulExponent);
    ElementIndex = 0;
    do {
        UINT uResult;
        FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
        if (fTemp <= 0.0f) {
            uResult = 0;
        } else if (fTemp >= (65536.0f*65536.0f)) {
            uResult = 0xFFFFFFFFU;
        } else {
            uResult = (UINT)fTemp;
        }
        Result.vector4_u32[ElementIndex] = uResult;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(MulExponent<32);
    static const XMVECTORF32 MaxUInt = {65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f};
    static const XMVECTORF32 UnsignedFix = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
    vResult = _mm_mul_ps(vResult,VFloat);
    // Clamp to >=0
    vResult = _mm_max_ps(vResult,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,MaxUInt);
    XMVECTOR vValue = UnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    return vResult;
#endif
}

#pragma warning(pop)

#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_

/****************************************************************************
 *
 * Vector and matrix load operations
 *
 ****************************************************************************/

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt(CONST UINT* pSource)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);

    V.vector4_u32[0] = *pSource;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);
    __m128i V = _mm_set_epi32( 0, 0, 0, *pSource );
    return reinterpret_cast<__m128 *>(&V)[0];
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat(CONST FLOAT* pSource)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);

    V.vector4_f32[0] = *pSource;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);

    return _mm_load_ss( pSource );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt2
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];

    return V;
#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    __m128i V = _mm_set_epi32( 0, 0, *(pSource+1), *pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt2A
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat2
(
    CONST XMFLOAT2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR V;
    XMASSERT(pSource);
    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
    V.vector4_f32[2] = V.vector4_f32[3] = 0.0f;
    return V;
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
#ifdef _XM_X86_
    __m128 x = _mm_load_ss( &pSource->x );
    __m128 y = _mm_load_ss( &pSource->y );
    return _mm_unpacklo_ps( x, y );
#else // _XM_X64_
    // This reads 2 floats past the memory that should be ignored.
    return _mm_loadu_ps( &pSource->x );
#endif
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
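
// Illustrative usage sketch (not part of the original XNA Math sources): the
// plain loaders accept arbitrarily aligned structures, while the 'A' variants
// (XMLoadInt2A, XMLoadFloat2A, ...) require 16-byte alignment and assert on it:
//
//     XMFLOAT2  uv = { 0.25f, 0.75f };        // any alignment
//     XMVECTOR  v  = XMLoadFloat2(&uv);       // treat the z and w lanes as undefined
//
//     XMFLOAT2A uvA = { 0.25f, 0.75f };       // XMFLOAT2A is declared 16-byte aligned
//     XMVECTOR  vA  = XMLoadFloat2A(&uvA);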

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat2A
(
    CONST XMFLOAT2A* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_f32[0] = pSource->x;
    V.vector4_f32[1] = pSource->y;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
#ifdef _XM_X86_
    __m128 x = _mm_load_ss( &pSource->x );
    __m128 y = _mm_load_ss( &pSource->y );
    return _mm_unpacklo_ps( x, y );
#else // _XM_X64_
    // This reads 2 floats past the memory that should be ignored.
    return _mm_load_ps( &pSource->x );
#endif
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadHalf2
(
    CONST XMHALF2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        XMConvertHalfToFloat(pSource->x),
        XMConvertHalfToFloat(pSource->y),
        0.0f,
        0.0f
    };
    return vResult;
    }
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMVECTOR vResult = {
        XMConvertHalfToFloat(pSource->x),
        XMConvertHalfToFloat(pSource->y),
        0.0f,
        0.0f
    };
    return vResult;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadShortN2
(
    CONST XMSHORTN2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT(pSource->x != -32768);
    XMASSERT(pSource->y != -32768);
    {
    XMVECTOR vResult = {
        (FLOAT)pSource->x * (1.0f/32767.0f),
        (FLOAT)pSource->y * (1.0f/32767.0f),
        0.0f,
        0.0f
    };
    return vResult;
    }

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT(pSource->x != -32768);
    XMASSERT(pSource->y != -32768);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // x needs to be sign extended
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // x - 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
    // Convert -32767-32767 to -1.0f-1.0f
    return _mm_mul_ps(vTemp,g_XMNormalizeX16Y16);
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
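
// Illustrative usage sketch (not part of the original XNA Math sources): the
// 'N' loaders return signed-normalized values, so the 16-bit range maps onto
// -1.0f..1.0f (the asserts above reject -32768, which has no counterpart):
//
//     XMSHORTN2 sn;
//     sn.x = 32767;  sn.y = -16384;
//     XMVECTOR v = XMLoadShortN2(&sn);        // x == 1.0f, y is approximately -0.5f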

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadShort2
(
    CONST XMSHORT2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(pSource->x != -32768);
    XMASSERT(pSource->y != -32768);

    V.vector4_f32[0] = (FLOAT)pSource->x;
    V.vector4_f32[1] = (FLOAT)pSource->y;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT(pSource->x != -32768);
    XMASSERT(pSource->y != -32768);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // x needs to be sign extended
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // x - 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
    // Y is 65536 too large
    return _mm_mul_ps(vTemp,g_XMFixupY16);
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUShortN2
(
    CONST XMUSHORTN2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
    V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f};
    static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0};
    XMASSERT(pSource);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // y needs to be sign flipped
    vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // y + 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,FixaddY16);
    // Y is 65536 times too large
    vTemp = _mm_mul_ps(vTemp,FixupY16);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUShort2
(
    CONST XMUSHORT2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x;
    V.vector4_f32[1] = (FLOAT)pSource->y;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0};
    XMASSERT(pSource);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // y needs to be sign flipped
    vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Y is 65536 times too large
    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16);
    // y + 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,FixaddY16);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt3
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    __m128i V = _mm_set_epi32( 0, *(pSource+2), *(pSource+1), *pSource );
    return reinterpret_cast<__m128 *>(&V)[0];
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt3A
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);

    // Reads an extra integer that is 'undefined'

    __m128i V = _mm_load_si128( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3
(
    CONST XMFLOAT3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR V;
    XMASSERT(pSource);
    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
    ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
    V.vector4_f32[3] = 0.0f;
    return V;
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // This reads 1 float past the memory that should be ignored.
    return _mm_loadu_ps( &pSource->x );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3A
(
    CONST XMFLOAT3A* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_f32[0] = pSource->x;
    V.vector4_f32[1] = pSource->y;
    V.vector4_f32[2] = pSource->z;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);

    // This reads 1 float past the memory that should be ignored.

    return _mm_load_ps( &pSource->x );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUHenDN3
(
    CONST XMUHENDN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)Element / 2047.0f;
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)Element / 1023.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 UHenDN3Mul = {1.0f/2047.0f,1.0f/(2047.0f*2048.0f),1.0f/(1023.0f*2048.0f*2048.0f),0};
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x, y and z to 0.0f-1.0f
    vResult = _mm_mul_ps(vResult,UHenDN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
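
// Illustrative usage sketch (not part of the original XNA Math sources):
// XMUHENDN3 packs three unsigned normalized values into one DWORD as
// 11:11:10 bits (x in bits 0-10, y in bits 11-21, z in bits 22-31), so
// loading an all-ones word yields the maximum of every channel:
//
//     XMUHENDN3 packed;
//     packed.v = 0xFFFFFFFF;
//     XMVECTOR n = XMLoadUHenDN3(&packed);    // x == y == z == 1.0f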

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUHenD3
(
    CONST XMUHEND3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)Element;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x and y to 0.0f-2047.0f and z to 0.0f-1023.0f
    vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadHenDN3
(
    CONST XMHENDN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
    static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]) / 511.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 HenDN3Mul = {1.0f/1023.0f,1.0f/(1023.0f*2048.0f),1.0f/(511.0f*2048.0f*2048.0f),0};
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddHenD3);
    // Normalize x,y and z to -1.0f-1.0f
    vResult = _mm_mul_ps(vResult,HenDN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadHenD3
(
    CONST XMHEND3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
    static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddHenD3);
    // Normalize x and y to -1024-1023.0f and z to -512-511.0f
    vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUDHenN3
(
    CONST XMUDHENN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)Element / 2047.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 UDHenN3Mul = {1.0f/1023.0f,1.0f/(2047.0f*1024.0f),1.0f/(2047.0f*1024.0f*2048.0f),0};
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x, y and z to 0.0f-1.0f
    vResult = _mm_mul_ps(vResult,UDHenN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUDHen3
(
    CONST XMUDHEN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)Element;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x to 0-1023.0f and y and z to 0-2047.0f
    vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadDHenN3
(
    CONST XMDHENN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendX[] = {0x00000000, 0xFFFFFC00};
    static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]) / 511.0f;
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 DHenN3Mul = {1.0f/511.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*2048.0f),0};
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddDHen3);
    // Normalize x,y and z to -1.0f-1.0f
    vResult = _mm_mul_ps(vResult,DHenN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadDHen3
(
    CONST XMDHEN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendX[] = {0x00000000, 0xFFFFFC00};
    static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]);
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddDHen3);
    // Normalize x to -512-511.0f and y and z to -1024-1023.0f
    vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadU565
(
    CONST XMU565* pSource
)
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0};
    static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0};
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,U565And);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Normalize x, y, and z
    vResult = _mm_mul_ps(vResult,U565Mul);
    return vResult;
#else
    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x1F;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 5) & 0x3F;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 11) & 0x1F;
    V.vector4_f32[2] = (FLOAT)Element;

    return V;
#endif // !_XM_SSE_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3PK
(
    CONST XMFLOAT3PK* pSource
)
{
    UINT Mantissa;
    UINT Exponent;
    UINT Result[3];

    XMASSERT(pSource);

    // X Channel (6-bit mantissa)
    Mantissa = pSource->xm;

    if ( pSource->xe == 0x1f ) // INF or NAN
    {
        Result[0] = 0x7f800000 | (pSource->xm << 17);
    }
    else
    {
        if ( pSource->xe != 0 ) // The value is normalized
        {
            Exponent = pSource->xe;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x40) == 0);

            Mantissa &= 0x3F;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17);
    }

    // Y Channel (6-bit mantissa)
    Mantissa = pSource->ym;

    if ( pSource->ye == 0x1f ) // INF or NAN
    {
        Result[1] = 0x7f800000 | (pSource->ym << 17);
    }
    else
    {
        if ( pSource->ye != 0 ) // The value is normalized
        {
            Exponent = pSource->ye;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x40) == 0);

            Mantissa &= 0x3F;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17);
    }

    // Z Channel (5-bit mantissa)
    Mantissa = pSource->zm;

    if ( pSource->ze == 0x1f ) // INF or NAN
    {
        Result[2] = 0x7f800000 | (pSource->zm << 17);
    }
    else
    {
        if ( pSource->ze != 0 ) // The value is normalized
        {
            Exponent = pSource->ze;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x20) == 0);

            Mantissa &= 0x1F;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18);
    }

    return XMLoadFloat3( (XMFLOAT3*)&Result );
}
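
// Illustrative usage sketch (not part of the original XNA Math sources):
// XMFLOAT3PK stores three small unsigned floats in one DWORD (11:11:10 bits,
// i.e. 6/6/5-bit mantissas with 5-bit exponents and no sign bits), and this
// loader expands each channel to a full 32-bit float:
//
//     XMFLOAT3PK packed;                      // filled in elsewhere (e.g. by XMStoreFloat3PK)
//     XMVECTOR   rgb = XMLoadFloat3PK(&packed);   // treat the w lane as undefined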

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3SE
(
    CONST XMFLOAT3SE* pSource
)
{
    UINT Mantissa;
    UINT Exponent, ExpBits;
    UINT Result[3];

    XMASSERT(pSource);

    if ( pSource->e == 0x1f ) // INF or NAN
    {
        Result[0] = 0x7f800000 | (pSource->xm << 14);
        Result[1] = 0x7f800000 | (pSource->ym << 14);
        Result[2] = 0x7f800000 | (pSource->zm << 14);
    }
    else if ( pSource->e != 0 ) // The values are all normalized
    {
        Exponent = pSource->e;

        ExpBits = (Exponent + 112) << 23;

        Mantissa = pSource->xm;
        Result[0] = ExpBits | (Mantissa << 14);

        Mantissa = pSource->ym;
        Result[1] = ExpBits | (Mantissa << 14);

        Mantissa = pSource->zm;
        Result[2] = ExpBits | (Mantissa << 14);
    }
    else
    {
        // X Channel
        Mantissa = pSource->xm;

        if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x200) == 0);

            Mantissa &= 0x1FF;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[0] = ((Exponent + 112) << 23) | (Mantissa << 14);

        // Y Channel
        Mantissa = pSource->ym;

        if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x200) == 0);

            Mantissa &= 0x1FF;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[1] = ((Exponent + 112) << 23) | (Mantissa << 14);

        // Z Channel
        Mantissa = pSource->zm;

        if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x200) == 0);

            Mantissa &= 0x1FF;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14);
    }

    return XMLoadFloat3( (XMFLOAT3*)&Result );
}
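
// Illustrative note (not part of the original XNA Math sources): XMFLOAT3SE is
// a shared-exponent packing; the three 9-bit mantissas xm, ym and zm reuse the
// single 5-bit exponent e, trading per-component precision for a common scale
// (comparable to 9:9:9:5 shared-exponent texture formats):
//
//     XMFLOAT3SE packed;                      // filled in elsewhere (e.g. by XMStoreFloat3SE)
//     XMVECTOR   hdr = XMLoadFloat3SE(&packed);   // treat the w lane as undefined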

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt4
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];
    V.vector4_u32[3] = pSource[3];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt4A
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];
    V.vector4_u32[3] = pSource[3];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    __m128i V = _mm_load_si128( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat4
(
    CONST XMFLOAT4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR V;
    XMASSERT(pSource);
    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
    ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
    ((UINT *)(&V.vector4_f32[3]))[0] = ((const UINT *)(&pSource->w))[0];
    return V;
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    return _mm_loadu_ps( &pSource->x );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat4A
(
    CONST XMFLOAT4A* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_f32[0] = pSource->x;
    V.vector4_f32[1] = pSource->y;
    V.vector4_f32[2] = pSource->z;
    V.vector4_f32[3] = pSource->w;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    return _mm_load_ps( &pSource->x );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1638
 
1639
//------------------------------------------------------------------------------
1640
 
1641
XMFINLINE XMVECTOR XMLoadHalf4
1642
(
1643
    CONST XMHALF4* pSource
1644
)
1645
{
1646
#if defined(_XM_NO_INTRINSICS_)
1647
    XMASSERT(pSource);
1648
    {
1649
    XMVECTOR vResult = {
1650
        XMConvertHalfToFloat(pSource->x),
1651
        XMConvertHalfToFloat(pSource->y),
1652
        XMConvertHalfToFloat(pSource->z),
1653
        XMConvertHalfToFloat(pSource->w)
1654
    };
1655
    return vResult;
1656
    }
1657
#elif defined(_XM_SSE_INTRINSICS_)
1658
	XMASSERT(pSource);
1659
    XMVECTOR vResult = {
1660
        XMConvertHalfToFloat(pSource->x),
1661
        XMConvertHalfToFloat(pSource->y),
1662
        XMConvertHalfToFloat(pSource->z),
1663
        XMConvertHalfToFloat(pSource->w)
1664
    };
1665
    return vResult;
1666
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1667
#endif // _XM_VMX128_INTRINSICS_
1668
}
1669
 
1670
//------------------------------------------------------------------------------
1671
 
1672
XMFINLINE XMVECTOR XMLoadShortN4
1673
(
1674
    CONST XMSHORTN4* pSource
1675
)
1676
{
1677
#if defined(_XM_NO_INTRINSICS_)
1678
    XMASSERT(pSource);
1679
    XMASSERT(pSource->x != -32768);
1680
    XMASSERT(pSource->y != -32768);
1681
    XMASSERT(pSource->z != -32768);
1682
    XMASSERT(pSource->w != -32768);
1683
    {
1684
    XMVECTOR vResult = {
1685
        (FLOAT)pSource->x * (1.0f/32767.0f),
1686
        (FLOAT)pSource->y * (1.0f/32767.0f),
1687
        (FLOAT)pSource->z * (1.0f/32767.0f),
1688
        (FLOAT)pSource->w * (1.0f/32767.0f)
1689
    };
1690
    return vResult;
1691
    }
1692
#elif defined(_XM_SSE_INTRINSICS_)
1693
	XMASSERT(pSource);
1694
    XMASSERT(pSource->x != -32768);
1695
    XMASSERT(pSource->y != -32768);
1696
    XMASSERT(pSource->z != -32768);
1697
    XMASSERT(pSource->w != -32768);
1698
    // Splat the color in all four entries (x,z,y,w)
1699
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
1700
    // Mask x&0xffff, z&0xffff, y&0xffff0000, w&0xffff0000
1701
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
1702
    // x and z are unsigned! Flip the bits to convert the order to signed
1703
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
1704
    // Convert to floating point numbers
1705
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1706
    // x and z - 0x8000 to complete the conversion
1707
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
1708
    // Convert -32767-32767 to -1.0f-1.0f
1709
    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16);
1710
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
1711
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
1712
    return vTemp;
1713
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1714
#endif // _XM_VMX128_INTRINSICS_
1715
}
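// Scalar sketch of the conversion above (for reference): each signed 16-bit
// component is divided by 32767, so the legal range -32767..32767 maps to
// -1.0f..1.0f (the value -32768 is excluded by the asserts).
//
//     XMSHORTN4 packed;
//     packed.x = 32767;  packed.y = -32767;  packed.z = 0;  packed.w = 16384;
//     XMVECTOR v = XMLoadShortN4( &packed );
//     // v is approximately ( 1.0f, -1.0f, 0.0f, 0.5f )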
1716
 
1717
//------------------------------------------------------------------------------
1718
 
1719
XMFINLINE XMVECTOR XMLoadShort4
1720
(
1721
    CONST XMSHORT4* pSource
1722
)
1723
{
1724
#if defined(_XM_NO_INTRINSICS_)
1725
 
1726
    XMVECTOR V;
1727
 
1728
    XMASSERT(pSource);
1729
    XMASSERT(pSource->x != -32768);
1730
    XMASSERT(pSource->y != -32768);
1731
    XMASSERT(pSource->z != -32768);
1732
    XMASSERT(pSource->w != -32768);
1733
 
1734
    V.vector4_f32[0] = (FLOAT)pSource->x;
1735
    V.vector4_f32[1] = (FLOAT)pSource->y;
1736
    V.vector4_f32[2] = (FLOAT)pSource->z;
1737
    V.vector4_f32[3] = (FLOAT)pSource->w;
1738
 
1739
    return V;
1740
 
1741
#elif defined(_XM_SSE_INTRINSICS_)
1742
    XMASSERT(pSource);
1743
    XMASSERT(pSource->x != -32768);
1744
    XMASSERT(pSource->y != -32768);
1745
    XMASSERT(pSource->z != -32768);
1746
    XMASSERT(pSource->w != -32768);
1747
    // Splat the color in all four entries (x,z,y,w)
1748
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
1749
    // Mask x&0xffff, z&0xffff, y&0xffff0000, w&0xffff0000
1750
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
1751
    // x and z are unsigned! Flip the bits to convert the order to signed
1752
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
1753
    // Convert to floating point numbers
1754
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1755
    // x and z - 0x8000 to complete the conversion
1756
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
1757
    // Fix y and w because they are 65536 too large
1758
    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
1759
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
1760
    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
1761
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1762
#endif // _XM_VMX128_INTRINSICS_
1763
}
1764
 
1765
//------------------------------------------------------------------------------
1766
 
1767
XMFINLINE XMVECTOR XMLoadUShortN4
1768
(
1769
    CONST XMUSHORTN4* pSource
1770
)
1771
{
1772
#if defined(_XM_NO_INTRINSICS_)
1773
 
1774
    XMVECTOR V;
1775
 
1776
    XMASSERT(pSource);
1777
 
1778
    V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
1779
    V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;
1780
    V.vector4_f32[2] = (FLOAT)pSource->z / 65535.0f;
1781
    V.vector4_f32[3] = (FLOAT)pSource->w / 65535.0f;
1782
 
1783
    return V;
1784
 
1785
#elif defined(_XM_SSE_INTRINSICS_)
1786
	XMASSERT(pSource);
1787
    static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)};
1788
    static const XMVECTORF32 FixaddY16W16  = {0,0,32768.0f*65536.0f,32768.0f*65536.0f};
1789
1790
    // Splat the color in all four entries (x,z,y,w)
1791
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
1792
    // Mask x&0xffff, z&0xffff, y&0xffff0000, w&0xffff0000
1793
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
1794
    // y and w are signed! Flip the bits to convert the order to unsigned
1795
    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
1796
    // Convert to floating point numbers
1797
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1798
    // y and w + 0x8000 to complete the conversion
1799
    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
1800
    // Fix y and w because they are 65536 too large
1801
    vTemp = _mm_mul_ps(vTemp,FixupY16W16);
1802
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
1803
    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
1804
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1805
#endif // _XM_VMX128_INTRINSICS_
1806
}
1807
 
1808
//------------------------------------------------------------------------------
1809
 
1810
XMFINLINE XMVECTOR XMLoadUShort4
1811
(
1812
    CONST XMUSHORT4* pSource
1813
)
1814
{
1815
#if defined(_XM_NO_INTRINSICS_)
1816
 
1817
    XMVECTOR V;
1818
 
1819
    XMASSERT(pSource);
1820
 
1821
    V.vector4_f32[0] = (FLOAT)pSource->x;
1822
    V.vector4_f32[1] = (FLOAT)pSource->y;
1823
    V.vector4_f32[2] = (FLOAT)pSource->z;
1824
    V.vector4_f32[3] = (FLOAT)pSource->w;
1825
 
1826
    return V;
1827
 
1828
#elif defined(_XM_SSE_INTRINSICS_)
1829
    XMASSERT(pSource);
1830
    static const XMVECTORF32 FixaddY16W16  = {0,0,32768.0f,32768.0f};
1831
1832
    // Splat the color in all four entries (x,z,y,w)
1833
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
1834
    // Mask x&0xffff, z&0xffff, y&0xffff0000, w&0xffff0000
1835
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
1836
    // y and w are signed! Flip the bits to convert the order to unsigned
1837
    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
1838
    // Convert to floating point numbers
1839
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1840
    // Fix y and w because they are 65536 too large
1841
    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
1842
    // y and w + 0x8000 to complete the conversion
1843
    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
1844
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
1845
    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
1846
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1847
#endif // _XM_VMX128_INTRINSICS_
1848
}
1849
 
1850
//------------------------------------------------------------------------------
1851
 
1852
XMFINLINE XMVECTOR XMLoadXIcoN4
1853
(
1854
    CONST XMXICON4* pSource
1855
)
1856
{
1857
#if defined(_XM_NO_INTRINSICS_)
1858
 
1859
    XMVECTOR          V;
1860
    UINT              Element;
1861
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
1862
 
1863
    XMASSERT(pSource);
1864
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
1865
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
1866
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
1867
 
1868
    Element = (UINT)pSource->v & 0xFFFFF;
1869
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
1870
    Element = (UINT)(pSource->v >> 20) & 0xFFFFF;
1871
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
1872
    Element = (UINT)(pSource->v >> 40) & 0xFFFFF;
1873
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
1874
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;
1875
 
1876
    return V;
1877
 
1878
#elif defined(_XM_SSE_INTRINSICS_)
1879
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
1880
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
1881
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
1882
    static const XMVECTORF32 LoadXIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(15.0f*4096.0f*65536.0f)};
1883
	XMASSERT(pSource);
1884
    // Grab the 64 bit structure
1885
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1886
    // By shifting down 8 bits, y and z land in separate 32-bit elements
1887
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
1888
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
1889
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
1890
    // Fix the entries to x,y,z,w
1891
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
1892
    // Mask x,y,z and w
1893
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
1894
    // x and z are unsigned! Flip the bits to convert the order to signed
1895
    vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
1896
    // Convert to floating point numbers
1897
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1898
    // x and z - 0x80 to complete the conversion
1899
    vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
1900
    // Fix y and w because they are too large
1901
    vTemp = _mm_mul_ps(vTemp,LoadXIcoN4Mul);
1902
    return vTemp;
1903
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1904
#endif // _XM_VMX128_INTRINSICS_
1905
}
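// Scalar sketch of the decode above (for reference): the 64-bit value packs
// x, y and z as signed 20-bit fields and w as an unsigned 4-bit field.
//
//     UINT64 v  = pSource->v;                    // packed 20:20:20:4 value
//     INT    ix = (INT)(v & 0xFFFFF);            // bits 0..19
//     if (ix & 0x80000) ix |= ~0xFFFFF;          // sign extend 20 -> 32 bits
//     FLOAT  x  = (FLOAT)ix / 524287.0f;         // normalize to -1.0f..1.0f
//     FLOAT  w  = (FLOAT)(v >> 60) / 15.0f;      // unsigned 4-bit w, 0.0f..1.0f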
1906
 
1907
//------------------------------------------------------------------------------
1908
 
1909
XMFINLINE XMVECTOR XMLoadXIco4
1910
(
1911
    CONST XMXICO4* pSource
1912
)
1913
{
1914
#if defined(_XM_NO_INTRINSICS_)
1915
 
1916
    XMVECTOR          V;
1917
    UINT              Element;
1918
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
1919
 
1920
    XMASSERT(pSource);
1921
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
1922
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
1923
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
1924
 
1925
    Element = (UINT)pSource->v & 0xFFFFF;
1926
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
1927
    Element = (UINT)(pSource->v >> 20) & 0xFFFFF;
1928
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
1929
    Element = (UINT)(pSource->v >> 40) & 0xFFFFF;
1930
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
1931
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);
1932
 
1933
    return V;
1934
 
1935
#elif defined(_XM_SSE_INTRINSICS_)
1936
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
1937
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
1938
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
1939
    XMASSERT(pSource);
1940
    // Grab the 64 bit structure
1941
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1942
    // By shifting down 8 bits, y and z land in separate 32-bit elements
1943
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
1944
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
1945
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
1946
    // Fix the entries to x,y,z,w
1947
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
1948
    // Mask x,y,z and w
1949
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
1950
    // x and z are unsigned! Flip the bits to convert the order to signed
1951
    vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
1952
    // Convert to floating point numbers
1953
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1954
    // x and z - 0x80 to complete the conversion
1955
    vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
1956
    // Fix y and w because they are too large
1957
    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
1958
    return vTemp;
1959
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1960
#endif // _XM_VMX128_INTRINSICS_
1961
}
1962
 
1963
//------------------------------------------------------------------------------
1964
 
1965
XMFINLINE XMVECTOR XMLoadUIcoN4
1966
(
1967
    CONST XMUICON4* pSource
1968
)
1969
{
1970
#if defined(_XM_NO_INTRINSICS_)
1971
 
1972
    XMVECTOR V;
1973
 
1974
    XMASSERT(pSource);
1975
 
1976
    V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF) / 1048575.0f;
1977
    V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF) / 1048575.0f;
1978
    V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF) / 1048575.0f;
1979
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;
1980
 
1981
    return V;
1982
 
1983
#elif defined(_XM_SSE_INTRINSICS_)
1984
    static const XMVECTORF32 LoadUIcoN4Mul = {1.0f/1048575.0f,1.0f/(1048575.0f*4096.0f),1.0f/1048575.0f,1.0f/(15.0f*4096.0f*65536.0f)};
1985
    XMASSERT(pSource);
1986
    // Grab the 64 bit structure
1987
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1988
    // By shifting down 8 bits, y and z land in separate 32-bit elements
1989
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
1990
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
1991
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
1992
    // Fix the entries to x,y,z,w
1993
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
1994
    // Mask x,y,z and w
1995
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
1996
    // x and z are unsigned! Flip the bits to convert the order to signed
1997
    vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
1998
    // Convert to floating point numbers
1999
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2000
    // x and z - 0x80 to complete the conversion
2001
    vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
2002
    // Fix y and w because they are too large
2003
    vTemp = _mm_mul_ps(vTemp,LoadUIcoN4Mul);
2004
    return vTemp;
2005
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2006
#endif // _XM_VMX128_INTRINSICS_
2007
}
2008
 
2009
//------------------------------------------------------------------------------
2010
 
2011
XMFINLINE XMVECTOR XMLoadUIco4
2012
(
2013
    CONST XMUICO4* pSource
2014
)
2015
{
2016
#if defined(_XM_NO_INTRINSICS_)
2017
 
2018
    XMVECTOR V;
2019
 
2020
    XMASSERT(pSource);
2021
 
2022
    V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF);
2023
    V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF);
2024
    V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF);
2025
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);
2026
 
2027
    return V;
2028
 
2029
#elif defined(_XM_SSE_INTRINSICS_)
2030
    XMASSERT(pSource);
2031
    // Grab the 64 bit structure
2032
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
2033
    // By shifting down 8 bits, y and z land in separate 32-bit elements
2034
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2035
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2036
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2037
    // Fix the entries to x,y,z,w
2038
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2039
    // Mask x,y,z and w
2040
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
2041
    // x and z are unsigned! Flip the bits to convert the order to signed
2042
    vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
2043
    // Convert to floating point numbers
2044
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2045
    // x and z - 0x80 to complete the conversion
2046
    vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
2047
    // Fix y and w because they are too large
2048
    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
2049
    return vTemp;
2050
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2051
#endif // _XM_VMX128_INTRINSICS_
2052
}
2053
 
2054
//------------------------------------------------------------------------------
2055
 
2056
XMFINLINE XMVECTOR XMLoadIcoN4
2057
(
2058
    CONST XMICON4* pSource
2059
)
2060
{
2061
#if defined(_XM_NO_INTRINSICS_)
2062
 
2063
    XMVECTOR          V;
2064
    UINT              Element;
2065
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
2066
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};
2067
 
2068
    XMASSERT(pSource);
2069
 
2070
    Element = (UINT)pSource->v & 0xFFFFF;
2071
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2072
    Element = (UINT)(pSource->v >> 20) & 0xFFFFF;
2073
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2074
    Element = (UINT)(pSource->v >> 40) & 0xFFFFF;
2075
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2076
    Element = (UINT)(pSource->v >> 60);
2077
    V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]) / 7.0f;
2078
 
2079
    return V;
2080
 
2081
#elif defined(_XM_SSE_INTRINSICS_)
2082
    static const XMVECTORF32 LoadIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(7.0f*4096.0f*65536.0f)};
2083
    XMASSERT(pSource);
2084
    // Grab the 64 bit structure
2085
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
2086
    // By shifting down 8 bits, y and z land in separate 32-bit elements
2087
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2088
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2089
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2090
    // Fix the entries to x,y,z,w
2091
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2092
    // Mask x,y,z and w
2093
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
2094
    // x and z are unsigned! Flip the bits to convert the order to signed
2095
    vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
2096
    // Convert to floating point numbers
2097
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2098
    // x and z - 0x80 to complete the conversion
2099
    vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
2100
    // Fix y and w because they are too large
2101
    vTemp = _mm_mul_ps(vTemp,LoadIcoN4Mul);
2102
    return vTemp;
2103
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2104
#endif // _XM_VMX128_INTRINSICS_
2105
}
2106
 
2107
//------------------------------------------------------------------------------
2108
 
2109
XMFINLINE XMVECTOR XMLoadIco4
2110
(
2111
    CONST XMICO4* pSource
2112
)
2113
{
2114
#if defined(_XM_NO_INTRINSICS_)
2115
 
2116
    XMVECTOR          V;
2117
    UINT              Element;
2118
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
2119
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};
2120
 
2121
    XMASSERT(pSource);
2122
 
2123
    Element = (UINT)pSource->v & 0xFFFFF;
2124
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2125
    Element = (UINT)(pSource->v >> 20) & 0xFFFFF;
2126
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2127
    Element = (UINT)(pSource->v >> 40) & 0xFFFFF;
2128
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2129
    Element = (UINT)(pSource->v >> 60);
2130
    V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]);
2131
 
2132
    return V;
2133
 
2134
#elif defined(_XM_SSE_INTRINSICS_)
2135
    XMASSERT(pSource);
2136
    // Grab the 64 bit structure
2137
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
2138
    // By shifting down 8 bits, y and z land in separate 32-bit elements
2139
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2140
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2141
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2142
    // Fix the entries to x,y,z,w
2143
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2144
    // Mask x,y,z and w
2145
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
2146
    // x and z are unsigned! Flip the bits to convert the order to signed
2147
    vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
2148
    // Convert to floating point numbers
2149
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2150
    // x and z - 0x80 to complete the conversion
2151
    vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
2152
    // Fix y and w because they are too large
2153
    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
2154
    return vTemp;
2155
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2156
#endif // _XM_VMX128_INTRINSICS_
2157
}
2158
 
2159
 
2160
//------------------------------------------------------------------------------
2161
 
2162
XMFINLINE XMVECTOR XMLoadXDecN4
2163
(
2164
    CONST XMXDECN4* pSource
2165
)
2166
{
2167
#if defined(_XM_NO_INTRINSICS_)
2168
    XMVECTOR V;
2169
    UINT Element;
2170
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2171
 
2172
    XMASSERT(pSource);
2173
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2174
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2175
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2176
 
2177
    Element = pSource->v & 0x3FF;
2178
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2179
    Element = (pSource->v >> 10) & 0x3FF;
2180
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2181
    Element = (pSource->v >> 20) & 0x3FF;
2182
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2183
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;
2184
 
2185
    return V;
2186
 
2187
#elif defined(_XM_SSE_INTRINSICS_)
2188
	XMASSERT(pSource);
2189
    // Splat the color in all four entries
2190
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2191
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2192
    vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10);
2193
    // a is unsigned! Flip the bit to convert the order to signed
2194
    vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10);
2195
    // Convert to floating point numbers
2196
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2197
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2198
    vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10);
2199
    // Normalize x, y and z to -1.0f..1.0f and w to 0.0f..1.0f
2200
    return _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10);
2201
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2202
#endif // _XM_VMX128_INTRINSICS_
2203
}
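// Scalar sketch of the 10:10:10:2 decode above (for reference): x, y and z are
// signed 10-bit fields normalized by 511, w is an unsigned 2-bit field
// normalized by 3.
//
//     UINT  v  = pSource->v;
//     INT   ix = (INT)(v & 0x3FF);               // bits 0..9
//     if (ix & 0x200) ix |= ~0x3FF;              // sign extend 10 -> 32 bits
//     FLOAT x  = (FLOAT)ix / 511.0f;             // -1.0f..1.0f
//     FLOAT w  = (FLOAT)(v >> 30) / 3.0f;        //  0.0f..1.0f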
2204
 
2205
//------------------------------------------------------------------------------
2206
 
2207
XMFINLINE XMVECTOR XMLoadXDec4
2208
(
2209
    CONST XMXDEC4* pSource
2210
)
2211
{
2212
#if defined(_XM_NO_INTRINSICS_)
2213
 
2214
    XMVECTOR          V;
2215
    UINT              Element;
2216
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2217
 
2218
    XMASSERT(pSource);
2219
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2220
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2221
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2222
 
2223
    Element = pSource->v & 0x3FF;
2224
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2225
    Element = (pSource->v >> 10) & 0x3FF;
2226
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2227
    Element = (pSource->v >> 20) & 0x3FF;
2228
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2229
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);
2230
 
2231
    return V;
2232
 
2233
#elif defined(_XM_SSE_INTRINSICS_)
2234
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2235
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2236
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2237
    static const XMVECTORI32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000};
2238
    static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f};
2239
    XMASSERT(pSource);
2240
    // Splat the color in all four entries
2241
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2242
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2243
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2244
    // a is unsigned! Flip the bit to convert the order to signed
2245
    vTemp = _mm_xor_ps(vTemp,XDec4Xor);
2246
    // Convert to floating point numbers
2247
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2248
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2249
    vTemp = _mm_add_ps(vTemp,XDec4Add);
2250
    // Scale y, z and w back down (they are 2^10, 2^20 and 2^30 too large)
2251
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
2252
    return vTemp;
2253
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2254
#endif // _XM_VMX128_INTRINSICS_
2255
}
2256
 
2257
//------------------------------------------------------------------------------
2258
 
2259
XMFINLINE XMVECTOR XMLoadUDecN4
2260
(
2261
    CONST XMUDECN4* pSource
2262
)
2263
{
2264
#if defined(_XM_NO_INTRINSICS_)
2265
 
2266
    XMVECTOR          V;
2267
    UINT              Element;
2268
 
2269
    XMASSERT(pSource);
2270
 
2271
    Element = pSource->v & 0x3FF;
2272
    V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
2273
    Element = (pSource->v >> 10) & 0x3FF;
2274
    V.vector4_f32[1] = (FLOAT)Element / 1023.0f;
2275
    Element = (pSource->v >> 20) & 0x3FF;
2276
    V.vector4_f32[2] = (FLOAT)Element / 1023.0f;
2277
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;
2278
 
2279
    return V;
2280
 
2281
#elif defined(_XM_SSE_INTRINSICS_)
2282
    XMASSERT(pSource);
2283
    static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
2284
    // Splat the color in all four entries
2285
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2286
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2287
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2288
    // a is unsigned! Flip the bit to convert the order to signed
2289
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2290
    // Convert to floating point numbers
2291
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2292
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2293
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
2294
    // Normalize x, y and z (0-1023) and w (0-3) to 0.0f..1.0f
2295
    vTemp = _mm_mul_ps(vTemp,UDecN4Mul);
2296
    return vTemp;
2297
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2298
#endif // _XM_VMX128_INTRINSICS_
2299
}
2300
 
2301
//------------------------------------------------------------------------------
2302
 
2303
XMFINLINE XMVECTOR XMLoadUDec4
2304
(
2305
    CONST XMUDEC4* pSource
2306
)
2307
{
2308
#if defined(_XM_NO_INTRINSICS_)
2309
 
2310
    XMVECTOR          V;
2311
    UINT              Element;
2312
 
2313
    XMASSERT(pSource);
2314
 
2315
    Element = pSource->v & 0x3FF;
2316
    V.vector4_f32[0] = (FLOAT)Element;
2317
    Element = (pSource->v >> 10) & 0x3FF;
2318
    V.vector4_f32[1] = (FLOAT)Element;
2319
    Element = (pSource->v >> 20) & 0x3FF;
2320
    V.vector4_f32[2] = (FLOAT)Element;
2321
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);
2322
 
2323
    return V;
2324
 
2325
#elif defined(_XM_SSE_INTRINSICS_)
2326
    XMASSERT(pSource);
2327
    // Splat the color in all four entries
2328
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2329
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2330
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2331
    // a is unsigned! Flip the bit to convert the order to signed
2332
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2333
    // Convert to floating point numbers
2334
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2335
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2336
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
2337
    // Scale y, z and w back down (they are 2^10, 2^20 and 2^30 too large)
2338
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
2339
    return vTemp;
2340
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2341
#endif // _XM_VMX128_INTRINSICS_
2342
}
2343
 
2344
//------------------------------------------------------------------------------
2345
 
2346
XMFINLINE XMVECTOR XMLoadDecN4
2347
(
2348
    CONST XMDECN4* pSource
2349
)
2350
{
2351
#if defined(_XM_NO_INTRINSICS_)
2352
 
2353
    XMVECTOR          V;
2354
    UINT              Element;
2355
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2356
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};
2357
 
2358
    XMASSERT(pSource);
2359
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2360
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2361
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2362
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2363
 
2364
    Element = pSource->v & 0x3FF;
2365
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2366
    Element = (pSource->v >> 10) & 0x3FF;
2367
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2368
    Element = (pSource->v >> 20) & 0x3FF;
2369
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2370
    Element = pSource->v >> 30;
2371
    V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);
2372
 
2373
    return V;
2374
 
2375
#elif defined(_XM_SSE_INTRINSICS_)
2376
    XMASSERT(pSource);
2377
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2378
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2379
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2380
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2381
    static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
2382
    // Splat the color in all four entries
2383
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2384
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2385
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2386
    // a is unsigned! Flip the bit to convert the order to signed
2387
    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
2388
    // Convert to floating point numbers
2389
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2390
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2391
    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
2392
    // Normalize x, y and z to -1.0f..1.0f (w keeps its integer value)
2393
    vTemp = _mm_mul_ps(vTemp,DecN4Mul);
2394
    return vTemp;
2395
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2396
#endif // _XM_VMX128_INTRINSICS_
2397
}
2398
 
2399
//------------------------------------------------------------------------------
2400
 
2401
XMFINLINE XMVECTOR XMLoadDec4
2402
(
2403
    CONST XMDEC4* pSource
2404
)
2405
{
2406
#if defined(_XM_NO_INTRINSICS_)
2407
 
2408
    XMVECTOR          V;
2409
    UINT              Element;
2410
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2411
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};
2412
 
2413
    XMASSERT(pSource);
2414
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2415
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2416
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2417
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2418
 
2419
    Element = pSource->v & 0x3FF;
2420
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2421
    Element = (pSource->v >> 10) & 0x3FF;
2422
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2423
    Element = (pSource->v >> 20) & 0x3FF;
2424
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2425
    Element = pSource->v >> 30;
2426
    V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);
2427
 
2428
    return V;
2429
 
2430
#elif defined(_XM_SSE_INTRINSICS_)
2431
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2432
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2433
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2434
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2435
    XMASSERT(pSource);
2436
    // Splat the color in all four entries
2437
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2438
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2439
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2440
    // a is unsigned! Flip the bit to convert the order to signed
2441
    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
2442
    // Convert to floating point numbers
2443
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2444
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2445
    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
2446
    // Scale y, z and w back down (they are 2^10, 2^20 and 2^30 too large)
2447
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
2448
    return vTemp;
2449
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2450
#endif // _XM_VMX128_INTRINSICS_
2451
}
2452
 
2453
//------------------------------------------------------------------------------
2454
 
2455
XMFINLINE XMVECTOR XMLoadUByteN4
2456
(
2457
    CONST XMUBYTEN4* pSource
2458
)
2459
{
2460
#if defined(_XM_NO_INTRINSICS_)
2461
 
2462
    XMVECTOR V;
2463
 
2464
    XMASSERT(pSource);
2465
 
2466
    V.vector4_f32[0] = (FLOAT)pSource->x / 255.0f;
2467
    V.vector4_f32[1] = (FLOAT)pSource->y / 255.0f;
2468
    V.vector4_f32[2] = (FLOAT)pSource->z / 255.0f;
2469
    V.vector4_f32[3] = (FLOAT)pSource->w / 255.0f;
2470
 
2471
    return V;
2472
 
2473
#elif defined(_XM_SSE_INTRINSICS_)
2474
    static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)};
2475
	XMASSERT(pSource);
2476
    // Splat the color in all four entries (x,z,y,w)
2477
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2478
    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
2479
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2480
    // w is signed! Flip the bits to convert the order to unsigned
2481
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2482
    // Convert to floating point numbers
2483
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2484
    // w + 0x80 to complete the conversion
2485
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
2486
    // Fix y, z and w because they are too large
2487
    vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
2488
    return vTemp;
2489
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2490
#endif // _XM_VMX128_INTRINSICS_
2491
}
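// Illustrative usage sketch: unpack four unsigned bytes into 0.0f..1.0f floats,
// e.g. an RGBA8 texel stored as x=R, y=G, z=B, w=A.
//
//     XMUBYTEN4 texel;
//     texel.x = 255;  texel.y = 128;  texel.z = 0;  texel.w = 255;
//     XMVECTOR v = XMLoadUByteN4( &texel );
//     // v is approximately ( 1.0f, 0.502f, 0.0f, 1.0f )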
2492
 
2493
//------------------------------------------------------------------------------
2494
 
2495
XMFINLINE XMVECTOR XMLoadUByte4
2496
(
2497
    CONST XMUBYTE4* pSource
2498
)
2499
{
2500
#if defined(_XM_NO_INTRINSICS_)
2501
 
2502
    XMVECTOR V;
2503
 
2504
    XMASSERT(pSource);
2505
 
2506
    V.vector4_f32[0] = (FLOAT)pSource->x;
2507
    V.vector4_f32[1] = (FLOAT)pSource->y;
2508
    V.vector4_f32[2] = (FLOAT)pSource->z;
2509
    V.vector4_f32[3] = (FLOAT)pSource->w;
2510
 
2511
    return V;
2512
 
2513
#elif defined(_XM_SSE_INTRINSICS_)
2514
    static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
2515
	XMASSERT(pSource);
2516
    // Splat the color in all four entries (x,z,y,w)
2517
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2518
    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
2519
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2520
    // w is signed! Flip the bits to convert the order to unsigned
2521
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2522
    // Convert to floating point numbers
2523
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2524
    // w + 0x80 to complete the conversion
2525
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
2526
    // Fix y, z and w because they are too large
2527
    vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
2528
    return vTemp;
2529
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2530
#endif // _XM_VMX128_INTRINSICS_
2531
}
2532
 
2533
//------------------------------------------------------------------------------
2534
 
2535
XMFINLINE XMVECTOR XMLoadByteN4
2536
(
2537
    CONST XMBYTEN4* pSource
2538
)
2539
{
2540
#if defined(_XM_NO_INTRINSICS_)
2541
 
2542
    XMVECTOR V;
2543
 
2544
    XMASSERT(pSource);
2545
    XMASSERT(pSource->x != -128);
2546
    XMASSERT(pSource->y != -128);
2547
    XMASSERT(pSource->z != -128);
2548
    XMASSERT(pSource->w != -128);
2549
 
2550
    V.vector4_f32[0] = (FLOAT)pSource->x / 127.0f;
2551
    V.vector4_f32[1] = (FLOAT)pSource->y / 127.0f;
2552
    V.vector4_f32[2] = (FLOAT)pSource->z / 127.0f;
2553
    V.vector4_f32[3] = (FLOAT)pSource->w / 127.0f;
2554
 
2555
    return V;
2556
 
2557
#elif defined(_XM_SSE_INTRINSICS_)
2558
    static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)};
2559
    XMASSERT(pSource);
2560
    XMASSERT(pSource->x != -128);
2561
    XMASSERT(pSource->y != -128);
2562
    XMASSERT(pSource->z != -128);
2563
    XMASSERT(pSource->w != -128);
2564
    // Splat the color in all four entries (x,z,y,w)
2565
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2566
    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
2567
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2568
    // x,y and z are unsigned! Flip the bits to convert the order to signed
2569
    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
2570
    // Convert to floating point numbers
2571
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2572
    // x, y and z - 0x80 to complete the conversion
2573
    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
2574
    // Fix y, z and w because they are too large
2575
    vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
2576
    return vTemp;
2577
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2578
#endif // _XM_VMX128_INTRINSICS_
2579
}
2580
 
2581
//------------------------------------------------------------------------------
2582
 
2583
XMFINLINE XMVECTOR XMLoadByte4
2584
(
2585
    CONST XMBYTE4* pSource
2586
)
2587
{
2588
#if defined(_XM_NO_INTRINSICS_)
2589
 
2590
    XMVECTOR V;
2591
 
2592
    XMASSERT(pSource);
2593
    XMASSERT(pSource->x != -128);
2594
    XMASSERT(pSource->y != -128);
2595
    XMASSERT(pSource->z != -128);
2596
    XMASSERT(pSource->w != -128);
2597
 
2598
    V.vector4_f32[0] = (FLOAT)pSource->x;
2599
    V.vector4_f32[1] = (FLOAT)pSource->y;
2600
    V.vector4_f32[2] = (FLOAT)pSource->z;
2601
    V.vector4_f32[3] = (FLOAT)pSource->w;
2602
 
2603
    return V;
2604
 
2605
#elif defined(_XM_SSE_INTRINSICS_)
2606
    static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
2607
    XMASSERT(pSource);
2608
    XMASSERT(pSource->x != -128);
2609
    XMASSERT(pSource->y != -128);
2610
    XMASSERT(pSource->z != -128);
2611
    XMASSERT(pSource->w != -128);
2612
    // Splat the color in all four entries (x,z,y,w)
2613
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2614
    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
2615
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2616
    // x,y and z are unsigned! Flip the bits to convert the order to signed
2617
    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
2618
    // Convert to floating point numbers
2619
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2620
    // x, y and z - 0x80 to complete the conversion
2621
    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
2622
    // Fix y, z and w because they are too large
2623
    vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
2624
    return vTemp;
2625
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2626
#endif // _XM_VMX128_INTRINSICS_
2627
}
2628
 
2629
//------------------------------------------------------------------------------
2630
 
2631
XMFINLINE XMVECTOR XMLoadUNibble4
2632
(
2633
     CONST XMUNIBBLE4* pSource
2634
)
2635
{
2636
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
2637
    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
2638
    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
2639
    XMASSERT(pSource);
2640
    // Get the 32 bit value and splat it
2641
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2642
    // Mask off x, y and z
2643
    vResult = _mm_and_ps(vResult,UNibble4And);
2644
    // Convert to float
2645
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
2646
    // Normalize x, y, and z
2647
    vResult = _mm_mul_ps(vResult,UNibble4Mul);
2648
    return vResult;
2649
#else
2650
    XMVECTOR          V;
2651
    UINT              Element;
2652
 
2653
    XMASSERT(pSource);
2654
 
2655
    Element = pSource->v & 0xF;
2656
    V.vector4_f32[0] = (FLOAT)Element;
2657
    Element = (pSource->v >> 4) & 0xF;
2658
    V.vector4_f32[1] = (FLOAT)Element;
2659
    Element = (pSource->v >> 8) & 0xF;
2660
    V.vector4_f32[2] = (FLOAT)Element;
2661
    Element = (pSource->v >> 12) & 0xF;
2662
    V.vector4_f32[3] = (FLOAT)Element;
2663
 
2664
    return V;
2665
#endif // !_XM_SSE_INTRISICS_
2666
}
2667
 
2668
//------------------------------------------------------------------------------
2669
 
2670
XMFINLINE XMVECTOR XMLoadU555
2671
(
2672
     CONST XMU555* pSource
2673
)
2674
{
2675
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
2676
    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
2677
    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
2678
    XMASSERT(pSource);
2679
    // Get the 32 bit value and splat it
2680
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2681
    // Mask off x, y and z
2682
    vResult = _mm_and_ps(vResult,U555And);
2683
    // Convert to float
2684
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
2685
    // Normalize x, y, and z
2686
    vResult = _mm_mul_ps(vResult,U555Mul);
2687
    return vResult;
2688
#else
2689
    XMVECTOR          V;
2690
    UINT              Element;
2691
 
2692
    XMASSERT(pSource);
2693
 
2694
    Element = pSource->v & 0x1F;
2695
    V.vector4_f32[0] = (FLOAT)Element;
2696
    Element = (pSource->v >> 5) & 0x1F;
2697
    V.vector4_f32[1] = (FLOAT)Element;
2698
    Element = (pSource->v >> 10) & 0x1F;
2699
    V.vector4_f32[2] = (FLOAT)Element;
2700
    Element = (pSource->v >> 15) & 0x1;
2701
    V.vector4_f32[3] = (FLOAT)Element;
2702
 
2703
    return V;
2704
#endif // !_XM_SSE_INTRISICS_
2705
}
2706
 
2707
//------------------------------------------------------------------------------
2708
 
2709
XMFINLINE XMVECTOR XMLoadColor
2710
(
2711
    CONST XMCOLOR* pSource
2712
)
2713
{
2714
#if defined(_XM_NO_INTRINSICS_)
2715
    XMASSERT(pSource);
2716
    {
2717
    // INT -> Float conversions are done in one instruction.
2718
    // UINT -> Float calls a runtime function. Keep in INT
2719
    INT iColor = (INT)(pSource->c);
2720
    XMVECTOR vColor = {
2721
        (FLOAT)((iColor >> 16) & 0xFF) * (1.0f/255.0f),
2722
        (FLOAT)((iColor >> 8) & 0xFF) * (1.0f/255.0f),
2723
        (FLOAT)(iColor & 0xFF) * (1.0f/255.0f),
2724
        (FLOAT)((iColor >> 24) & 0xFF) * (1.0f/255.0f)
2725
    };
2726
    return vColor;
2727
    }
2728
#elif defined(_XM_SSE_INTRINSICS_)
2729
	XMASSERT(pSource);
2730
    // Splat the color in all four entries
2731
    __m128i vInt = _mm_set1_epi32(pSource->c);
2732
    // Mask R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
2733
    vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8);
2734
    // a is unsigned! Flip the bit to convert the order to signed
2735
    vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8);
2736
    // Convert to floating point numbers
2737
    XMVECTOR vTemp = _mm_cvtepi32_ps(vInt);
2738
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2739
    vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8);
2740
    // Convert 0-255 to 0.0f-1.0f
2741
    return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8);
2742
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2743
#endif // _XM_VMX128_INTRINSICS_
2744
}
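// Illustrative usage sketch: XMCOLOR packs a color as 0xAARRGGBB in its 'c'
// member; the loader above returns the channels as (r, g, b, a) in 0.0f..1.0f.
//
//     XMCOLOR color;
//     color.c = 0xFF8000FF;                      // A=255, R=128, G=0, B=255
//     XMVECTOR v = XMLoadColor( &color );
//     // v is approximately ( 0.502f, 0.0f, 1.0f, 1.0f )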
2745
 
2746
//------------------------------------------------------------------------------
2747
 
2748
XMFINLINE XMMATRIX XMLoadFloat3x3
2749
(
2750
    CONST XMFLOAT3X3* pSource
2751
)
2752
{
2753
#if defined(_XM_NO_INTRINSICS_)
2754
 
2755
    XMMATRIX M;
2756
 
2757
    XMASSERT(pSource);
2758
 
2759
    M.r[0].vector4_f32[0] = pSource->m[0][0];
2760
    M.r[0].vector4_f32[1] = pSource->m[0][1];
2761
    M.r[0].vector4_f32[2] = pSource->m[0][2];
2762
    M.r[0].vector4_f32[3] = 0.0f;
2763
 
2764
    M.r[1].vector4_f32[0] = pSource->m[1][0];
2765
    M.r[1].vector4_f32[1] = pSource->m[1][1];
2766
    M.r[1].vector4_f32[2] = pSource->m[1][2];
2767
    M.r[1].vector4_f32[3] = 0.0f;
2768
 
2769
    M.r[2].vector4_f32[0] = pSource->m[2][0];
2770
    M.r[2].vector4_f32[1] = pSource->m[2][1];
2771
    M.r[2].vector4_f32[2] = pSource->m[2][2];
2772
    M.r[2].vector4_f32[3] = 0.0f;
2773
 
2774
    M.r[3].vector4_f32[0] = 0.0f;
2775
    M.r[3].vector4_f32[1] = 0.0f;
2776
    M.r[3].vector4_f32[2] = 0.0f;
2777
    M.r[3].vector4_f32[3] = 1.0f;
2778
 
2779
    return M;
2780
 
2781
#elif defined(_XM_SSE_INTRINSICS_)
2782
	XMMATRIX M;
2783
	XMVECTOR V1, V2, V3, Z, T1, T2, T3, T4, T5;
2784
 
2785
	Z = _mm_setzero_ps();
2786
 
2787
	XMASSERT(pSource);
2788
 
2789
	V1 = _mm_loadu_ps( &pSource->m[0][0] );
2790
	V2 = _mm_loadu_ps( &pSource->m[1][1] );
2791
	V3 = _mm_load_ss( &pSource->m[2][2] );
2792
 
2793
	T1 = _mm_unpackhi_ps( V1, Z );
2794
	T2 = _mm_unpacklo_ps( V2, Z );
2795
	T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
2796
	T4 = _mm_movehl_ps( T2, T3 );
2797
	T5 = _mm_movehl_ps( Z, T1 );  
2798
 
2799
	M.r[0] = _mm_movelh_ps( V1, T1 );
2800
	M.r[1] = _mm_add_ps( T4, T5 );
2801
	M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
2802
	M.r[3] = g_XMIdentityR3;
2803
 
2804
	return M;
2805
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2806
#endif // _XM_VMX128_INTRINSICS_
2807
}
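// Illustrative usage sketch: a row-major 3x3 rotation is promoted to a 4x4
// XMMATRIX; the fourth row and fourth column are taken from the identity, as
// the scalar path above shows.
//
//     XMFLOAT3X3 rot;                            // assume rot.m[][] has been filled in
//     XMMATRIX   M = XMLoadFloat3x3( &rot );
//     // M.r[3] == ( 0, 0, 0, 1 ); the w lane of the first three rows is 0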
2808
 
2809
//------------------------------------------------------------------------------
2810
 
2811
XMFINLINE XMMATRIX XMLoadFloat4x3
2812
(
2813
    CONST XMFLOAT4X3* pSource
2814
)
2815
{
2816
#if defined(_XM_NO_INTRINSICS_)
2817
    XMMATRIX M;
2818
    XMASSERT(pSource);
2819
 
2820
    ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
2821
    ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
2822
    ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
2823
    M.r[0].vector4_f32[3] = 0.0f;
2824
 
2825
    ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
2826
    ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
2827
    ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
2828
    M.r[1].vector4_f32[3] = 0.0f;
2829
 
2830
    ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
2831
    ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
2832
    ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
2833
    M.r[2].vector4_f32[3] = 0.0f;
2834
 
2835
    ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
2836
    ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
2837
    ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
2838
    M.r[3].vector4_f32[3] = 1.0f;
2839
 
2840
    return M;
2841
 
2842
#elif defined(_XM_SSE_INTRINSICS_)
2843
    XMASSERT(pSource);
2844
    // Use unaligned load instructions to 
2845
    // load the 12 floats
2846
    // vTemp1 = x1,y1,z1,x2
2847
    XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
2848
    // vTemp2 = y2,z2,x3,y3
2849
    XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
2850
    // vTemp4 = z3,x4,y4,z4
2851
    XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
2852
    // vTemp3 = x3,y3,z3,z3
2853
    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
2854
    // vTemp2 = y2,z2,x2,x2
2855
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
2856
    // vTemp2 = x2,y2,z2,z2
2857
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
2858
    // vTemp1 = x1,y1,z1,0
2859
    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
2860
    // vTemp2 = x2,y2,z2,0
2861
    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
2862
    // vTemp3 = x3,y3,z3,0
2863
    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
2864
    // vTemp4i = x4,y4,z4,0
2865
    __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
2866
    // vTemp4i = x4,y4,z4,1.0f
2867
    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
2868
    XMMATRIX M(vTemp1,
2869
            vTemp2,
2870
            vTemp3,
2871
            reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
2872
    return M;
2873
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2874
#endif // _XM_VMX128_INTRINSICS_
2875
}
2876
 
2877
//------------------------------------------------------------------------------
2878
 
2879
XMFINLINE XMMATRIX XMLoadFloat4x3A
2880
(
2881
    CONST XMFLOAT4X3A* pSource
2882
)
2883
{
2884
#if defined(_XM_NO_INTRINSICS_)
2885
 
2886
    XMMATRIX M;
2887
 
2888
    XMASSERT(pSource);
2889
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
2890
 
2891
    M.r[0].vector4_f32[0] = pSource->m[0][0];
2892
    M.r[0].vector4_f32[1] = pSource->m[0][1];
2893
    M.r[0].vector4_f32[2] = pSource->m[0][2];
2894
    M.r[0].vector4_f32[3] = 0.0f;
2895
 
2896
    M.r[1].vector4_f32[0] = pSource->m[1][0];
2897
    M.r[1].vector4_f32[1] = pSource->m[1][1];
2898
    M.r[1].vector4_f32[2] = pSource->m[1][2];
2899
    M.r[1].vector4_f32[3] = 0.0f;
2900
 
2901
    M.r[2].vector4_f32[0] = pSource->m[2][0];
2902
    M.r[2].vector4_f32[1] = pSource->m[2][1];
2903
    M.r[2].vector4_f32[2] = pSource->m[2][2];
2904
    M.r[2].vector4_f32[3] = 0.0f;
2905
 
2906
    M.r[3].vector4_f32[0] = pSource->m[3][0];
2907
    M.r[3].vector4_f32[1] = pSource->m[3][1];
2908
    M.r[3].vector4_f32[2] = pSource->m[3][2];
2909
    M.r[3].vector4_f32[3] = 1.0f;
2910
 
2911
    return M;
2912
 
2913
#elif defined(_XM_SSE_INTRINSICS_)
2914
	XMASSERT(pSource);
2915
    // Use aligned load instructions to 
2916
    // load the 12 floats
2917
    // vTemp1 = x1,y1,z1,x2
2918
    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
2919
    // vTemp2 = y2,z2,x3,y3
2920
    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
2921
    // vTemp4 = z3,x4,y4,z4
2922
    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
2923
    // vTemp3 = x3,y3,z3,z3
2924
    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
2925
    // vTemp2 = y2,z2,x2,x2
2926
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
2927
    // vTemp2 = x2,y2,z2,z2
2928
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
2929
    // vTemp1 = x1,y1,z1,0
2930
    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
2931
    // vTemp2 = x2,y2,z2,0
2932
    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
2933
    // vTemp3 = x3,y3,z3,0
2934
    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
2935
    // vTemp4i = x4,y4,z4,0
2936
    __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
2937
    // vTemp4i = x4,y4,z4,1.0f
2938
    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
2939
    XMMATRIX M(vTemp1,
2940
            vTemp2,
2941
            vTemp3,
2942
            reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
2943
    return M;
2944
#else // _XM_VMX128_INTRINSICS_
2945
#endif // _XM_VMX128_INTRINSICS_
2946
}
2947
 
2948
//------------------------------------------------------------------------------
2949
 
2950
XMFINLINE XMMATRIX XMLoadFloat4x4
2951
(
2952
    CONST XMFLOAT4X4* pSource
2953
)
2954
{
2955
#if defined(_XM_NO_INTRINSICS_)
2956
    XMMATRIX M;
2957
    XMASSERT(pSource);
2958
 
2959
    ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
2960
    ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
2961
    ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
2962
    ((UINT *)(&M.r[0].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[0][3]))[0];
2963
 
2964
    ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
2965
    ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
2966
    ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
2967
    ((UINT *)(&M.r[1].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[1][3]))[0];
2968
 
2969
    ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
2970
    ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
2971
    ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
2972
    ((UINT *)(&M.r[2].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[2][3]))[0];
2973
 
2974
    ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
2975
    ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
2976
    ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
2977
    ((UINT *)(&M.r[3].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[3][3]))[0];
2978
 
2979
    return M;
2980
 
2981
#elif defined(_XM_SSE_INTRINSICS_)
2982
    XMASSERT(pSource);
2983
    XMMATRIX M;
2984
 
2985
    M.r[0] = _mm_loadu_ps( &pSource->_11 );
2986
    M.r[1] = _mm_loadu_ps( &pSource->_21 );
2987
    M.r[2] = _mm_loadu_ps( &pSource->_31 );
2988
    M.r[3] = _mm_loadu_ps( &pSource->_41 );
2989
 
2990
    return M;
2991
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2992
#endif // _XM_VMX128_INTRINSICS_
2993
}
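// Illustrative usage sketch: load a row-major 4x4 matrix from unaligned memory;
// XMLoadFloat4x4A below is the aligned counterpart.
//
//     XMFLOAT4X4 world;
//     XMStoreFloat4x4( &world, XMMatrixIdentity() );
//     XMMATRIX   W = XMLoadFloat4x4( &world );   // four unaligned row loads with SSE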
2994
 
2995
//------------------------------------------------------------------------------
2996
 
2997
XMFINLINE XMMATRIX XMLoadFloat4x4A
2998
(
2999
    CONST XMFLOAT4X4A* pSource
3000
)
3001
{
3002
#if defined(_XM_NO_INTRINSICS_)
3003
 
3004
    XMMATRIX M;
3005
 
3006
    XMASSERT(pSource);
3007
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
3008
 
3009
    M.r[0].vector4_f32[0] = pSource->m[0][0];
3010
    M.r[0].vector4_f32[1] = pSource->m[0][1];
3011
    M.r[0].vector4_f32[2] = pSource->m[0][2];
3012
    M.r[0].vector4_f32[3] = pSource->m[0][3];
3013
 
3014
    M.r[1].vector4_f32[0] = pSource->m[1][0];
3015
    M.r[1].vector4_f32[1] = pSource->m[1][1];
3016
    M.r[1].vector4_f32[2] = pSource->m[1][2];
3017
    M.r[1].vector4_f32[3] = pSource->m[1][3];
3018
 
3019
    M.r[2].vector4_f32[0] = pSource->m[2][0];
3020
    M.r[2].vector4_f32[1] = pSource->m[2][1];
3021
    M.r[2].vector4_f32[2] = pSource->m[2][2];
3022
    M.r[2].vector4_f32[3] = pSource->m[2][3];
3023
 
3024
    M.r[3].vector4_f32[0] = pSource->m[3][0];
3025
    M.r[3].vector4_f32[1] = pSource->m[3][1];
3026
    M.r[3].vector4_f32[2] = pSource->m[3][2];
3027
    M.r[3].vector4_f32[3] = pSource->m[3][3];
3028
 
3029
    return M;
3030
 
3031
#elif defined(_XM_SSE_INTRINSICS_)
3032
	XMMATRIX M;
3033
 
3034
	XMASSERT(pSource);
3035
 
3036
	M.r[0] = _mm_load_ps( &pSource->_11 );
3037
	M.r[1] = _mm_load_ps( &pSource->_21 );
3038
	M.r[2] = _mm_load_ps( &pSource->_31 );
3039
	M.r[3] = _mm_load_ps( &pSource->_41 );
3040
 
3041
	return M;
3042
#else // _XM_VMX128_INTRINSICS_
3043
#endif // _XM_VMX128_INTRINSICS_
3044
}
3045
 
3046
/****************************************************************************
3047
 *
3048
 * Vector and matrix store operations
3049
 *
3050
 ****************************************************************************/
3051
 
3052
XMFINLINE VOID XMStoreInt
3053
(
3054
    UINT*    pDestination,
3055
    FXMVECTOR V
3056
)
3057
{
3058
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
3059
 
3060
    XMASSERT(pDestination);
3061
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3062
 
3063
    *pDestination = XMVectorGetIntX( V );
3064
 
3065
#else // _XM_VMX128_INTRINSICS_
3066
#endif // _XM_VMX128_INTRINSICS_
3067
}
3068
 
3069
//------------------------------------------------------------------------------
3070
 
3071
XMFINLINE VOID XMStoreFloat
3072
(
3073
    FLOAT*    pDestination,
3074
    FXMVECTOR V
3075
)
3076
{
3077
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
3078
 
3079
    XMASSERT(pDestination);
3080
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3081
 
3082
    *pDestination = XMVectorGetX( V );
3083
 
3084
#else // _XM_VMX128_INTRINSICS_
3085
#endif // _XM_VMX128_INTRINSICS_
3086
}
3087
 
3088
//------------------------------------------------------------------------------
3089
 
3090
XMFINLINE VOID XMStoreInt2
3091
(
3092
    UINT*    pDestination, 
3093
    FXMVECTOR V
3094
)
3095
{
3096
#if defined(_XM_NO_INTRINSICS_)
3097
 
3098
    XMASSERT(pDestination);
3099
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3100
 
3101
    pDestination[0] = V.vector4_u32[0];
3102
    pDestination[1] = V.vector4_u32[1];
3103
 
3104
#elif defined(_XM_SSE_INTRINSICS_)
3105
 
3106
    XMASSERT(pDestination);
3107
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3108
    pDestination[0] = XMVectorGetIntX( V );
3109
    pDestination[1] = XMVectorGetIntY( V );
3110
 
3111
#else // _XM_VMX128_INTRINSICS_
3112
#endif // _XM_VMX128_INTRINSICS_
3113
}
3114
 
3115
//------------------------------------------------------------------------------
3116
 
3117
XMFINLINE VOID XMStoreInt2A
3118
(
3119
    UINT*    pDestination, 
3120
    FXMVECTOR V
3121
)
3122
{
3123
#if defined(_XM_NO_INTRINSICS_)
3124
 
3125
    XMASSERT(pDestination);
3126
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3127
 
3128
    pDestination[0] = V.vector4_u32[0];
3129
    pDestination[1] = V.vector4_u32[1];
3130
 
3131
#elif defined(_XM_SSE_INTRINSICS_)
3132
 
3133
    XMASSERT(pDestination);
3134
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3135
 
3136
    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
3137
 
3138
#else // _XM_VMX128_INTRINSICS_
3139
#endif // _XM_VMX128_INTRINSICS_
3140
}
3141
 
3142
//------------------------------------------------------------------------------
3143
 
3144
XMFINLINE VOID XMStoreFloat2
3145
(
3146
    XMFLOAT2* pDestination, 
3147
    FXMVECTOR  V
3148
)
3149
{
3150
#if defined(_XM_NO_INTRINSICS_)
3151
 
3152
    XMASSERT(pDestination);
3153
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3154
 
3155
    pDestination->x = V.vector4_f32[0];
3156
    pDestination->y = V.vector4_f32[1];
3157
 
3158
#elif defined(_XM_SSE_INTRINSICS_)
3159
 
3160
    XMASSERT(pDestination);
3161
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3162
 
3163
	XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3164
	_mm_store_ss( &pDestination->x, V );
3165
	_mm_store_ss( &pDestination->y, T );
3166
 
3167
#else // _XM_VMX128_INTRINSICS_
3168
#endif // _XM_VMX128_INTRINSICS_
3169
}
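
//------------------------------------------------------------------------------
// Example (editor's sketch, not part of the original library): XMStoreFloat2
// writes only the x and y components, which makes it the natural way to spill
// a 2D result such as a texture coordinate. "uv" is a hypothetical name.
//
//     XMFLOAT2 uv;
//     XMVECTOR V = XMVectorSet( 0.25f, 0.75f, 0.0f, 0.0f );
//     XMStoreFloat2( &uv, V );    // uv.x == 0.25f, uv.y == 0.75f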
3170
 
3171
//------------------------------------------------------------------------------
3172
 
3173
XMFINLINE VOID XMStoreFloat2A
3174
(
3175
    XMFLOAT2A*   pDestination, 
3176
    FXMVECTOR     V
3177
)
3178
{
3179
#if defined(_XM_NO_INTRINSICS_)
3180
 
3181
    XMASSERT(pDestination);
3182
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3183
 
3184
    pDestination->x = V.vector4_f32[0];
3185
    pDestination->y = V.vector4_f32[1];
3186
 
3187
#elif defined(_XM_SSE_INTRINSICS_)
3188
 
3189
    XMASSERT(pDestination);
3190
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3191
 
3192
	XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3193
	_mm_store_ss( &pDestination->x, V );
3194
	_mm_store_ss( &pDestination->y, T );
3195
 
3196
#else // _XM_VMX128_INTRINSICS_
3197
#endif // _XM_VMX128_INTRINSICS_
3198
}
3199
 
3200
//------------------------------------------------------------------------------
3201
 
3202
XMFINLINE VOID XMStoreHalf2
3203
(
3204
    XMHALF2* pDestination, 
3205
    FXMVECTOR V
3206
)
3207
{
3208
#if defined(_XM_NO_INTRINSICS_)
3209
 
3210
    XMASSERT(pDestination);
3211
 
3212
    pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
3213
    pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);
3214
 
3215
#elif defined(_XM_SSE_INTRINSICS_)
3216
    XMASSERT(pDestination);
3217
    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
3218
    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
3219
#else // _XM_VMX128_INTRINSICS_
3220
#endif // _XM_VMX128_INTRINSICS_
3221
}
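
//------------------------------------------------------------------------------
// Example (editor's sketch, not part of the original library): half storage is
// lossy, so a store/load round trip returns the nearest representable half
// values rather than the original floats. "XMExampleRoundTripHalf2" is a
// hypothetical helper name used only for illustration.

XMFINLINE XMVECTOR XMExampleRoundTripHalf2
(
    FXMVECTOR V
)
{
    XMHALF2 Packed;
    XMStoreHalf2( &Packed, V );     // convert x and y to 16-bit halves
    return XMLoadHalf2( &Packed );  // expand back to 32-bit floats
}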
3222
 
3223
//------------------------------------------------------------------------------
3224
 
3225
XMFINLINE VOID XMStoreShortN2
3226
(
3227
    XMSHORTN2* pDestination, 
3228
    FXMVECTOR   V
3229
)
3230
{
3231
#if defined(_XM_NO_INTRINSICS_)
3232
 
3233
    XMVECTOR N;
3234
    static CONST XMVECTORF32  Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3235
 
3236
    XMASSERT(pDestination);
3237
 
3238
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
3239
    N = XMVectorMultiply(N, Scale.v);
3240
    N = XMVectorRound(N);
3241
 
3242
    pDestination->x = (SHORT)N.vector4_f32[0];
3243
    pDestination->y = (SHORT)N.vector4_f32[1];
3244
 
3245
#elif defined(_XM_SSE_INTRINSICS_)
3246
	XMASSERT(pDestination);
3247
    static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3248
 
3249
	XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
3250
	vResult = _mm_min_ps(vResult,g_XMOne);
3251
    vResult = _mm_mul_ps(vResult,Scale);
3252
    __m128i vResulti = _mm_cvtps_epi32(vResult);
3253
    vResulti = _mm_packs_epi32(vResulti,vResulti);
3254
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3255
#else // _XM_VMX128_INTRINSICS_
3256
#endif // _XM_VMX128_INTRINSICS_
3257
}
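
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): the signed
// normalized path above maps [-1,1] onto [-32767,32767], so
//     +1.0f -> 32767,   -1.0f -> -32767,   0.0f -> 0,
// with inputs outside [-1,1] clamped before the scale and round.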
3258
 
3259
//------------------------------------------------------------------------------
3260
 
3261
XMFINLINE VOID XMStoreShort2
3262
(
3263
    XMSHORT2* pDestination, 
3264
    FXMVECTOR  V
3265
)
3266
{
3267
#if defined(_XM_NO_INTRINSICS_)
3268
 
3269
    XMVECTOR               N;
3270
    static CONST XMVECTOR  Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
3271
    static CONST XMVECTOR  Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3272
 
3273
    XMASSERT(pDestination);
3274
 
3275
    N = XMVectorClamp(V, Min, Max);
3276
    N = XMVectorRound(N);
3277
 
3278
    pDestination->x = (SHORT)N.vector4_f32[0];
3279
    pDestination->y = (SHORT)N.vector4_f32[1];
3280
 
3281
#elif defined(_XM_SSE_INTRINSICS_)
3282
    XMASSERT(pDestination);
3283
    static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
3284
    static CONST XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3285
    // Bounds check
3286
    XMVECTOR vResult = _mm_max_ps(V,Min);
3287
    vResult = _mm_min_ps(vResult,Max);
3288
     // Convert to int with rounding
3289
    __m128i vInt = _mm_cvtps_epi32(vResult);
3290
    // Pack the ints into shorts
3291
    vInt = _mm_packs_epi32(vInt,vInt);
3292
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vInt)[0]);
3293
#else // _XM_VMX128_INTRINSICS_
3294
#endif // _XM_VMX128_INTRINSICS_
3295
}
3296
 
3297
//------------------------------------------------------------------------------
3298
 
3299
XMFINLINE VOID XMStoreUShortN2
3300
(
3301
    XMUSHORTN2* pDestination, 
3302
    FXMVECTOR    V
3303
)
3304
{
3305
#if defined(_XM_NO_INTRINSICS_)
3306
 
3307
    XMVECTOR               N;
3308
    static CONST XMVECTORF32  Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3309
 
3310
    XMASSERT(pDestination);
3311
 
3312
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
3313
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
3314
    N = XMVectorTruncate(N);
3315
 
3316
    pDestination->x = (USHORT)N.vector4_f32[0];
3317
    pDestination->y = (USHORT)N.vector4_f32[1];
3318
 
3319
#elif defined(_XM_SSE_INTRINSICS_)
3320
    XMASSERT(pDestination);
3321
    static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3322
    // Bounds check
3323
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3324
    vResult = _mm_min_ps(vResult,g_XMOne);
3325
    vResult = _mm_mul_ps(vResult,Scale);
3326
     // Convert to int with rounding
3327
    __m128i vInt = _mm_cvtps_epi32(vResult);
3328
    // Since the SSE pack instruction clamps using signed rules,
3329
    // manually extract the values to store them to memory
3330
    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
3331
    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
3332
#else // _XM_VMX128_INTRINSICS_
3333
#endif // _XM_VMX128_INTRINSICS_
3334
}
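
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): the
// unsigned normalized path above clamps to [0,1], scales by 65535 and adds one
// half before truncating, so
//     1.0f -> 65535,   0.5f -> 32768,   0.0f -> 0.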
3335
 
3336
//------------------------------------------------------------------------------
3337
 
3338
XMFINLINE VOID XMStoreUShort2
3339
(
3340
    XMUSHORT2* pDestination, 
3341
    FXMVECTOR   V
3342
)
3343
{
3344
#if defined(_XM_NO_INTRINSICS_)
3345
 
3346
    XMVECTOR               N;
3347
    static CONST XMVECTOR  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3348
 
3349
    XMASSERT(pDestination);
3350
 
3351
    N = XMVectorClamp(V, XMVectorZero(), Max);
3352
    N = XMVectorRound(N);
3353
 
3354
    pDestination->x = (USHORT)N.vector4_f32[0];
3355
    pDestination->y = (USHORT)N.vector4_f32[1];
3356
 
3357
#elif defined(_XM_SSE_INTRINSICS_)
3358
    XMASSERT(pDestination);
3359
    static CONST XMVECTORF32  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3360
    // Bounds check
3361
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3362
    vResult = _mm_min_ps(vResult,Max);
3363
     // Convert to int with rounding
3364
    __m128i vInt = _mm_cvtps_epi32(vResult);
3365
    // Since the SSE pack instruction clamps using signed rules,
3366
    // manually extract the values to store them to memory
3367
    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
3368
    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
3369
#else // _XM_VMX128_INTRINSICS_
3370
#endif // _XM_VMX128_INTRINSICS_
3371
}
3372
 
3373
//------------------------------------------------------------------------------
3374
 
3375
XMFINLINE VOID XMStoreInt3
3376
(
3377
    UINT*    pDestination, 
3378
    FXMVECTOR V
3379
)
3380
{
3381
#if defined(_XM_NO_INTRINSICS_)
3382
 
3383
    XMASSERT(pDestination);
3384
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3385
 
3386
    pDestination[0] = V.vector4_u32[0];
3387
    pDestination[1] = V.vector4_u32[1];
3388
    pDestination[2] = V.vector4_u32[2];
3389
 
3390
#elif defined(_XM_SSE_INTRINSICS_)
3391
 
3392
    XMASSERT(pDestination);
3393
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3394
    pDestination[0] = XMVectorGetIntX( V );
3395
    pDestination[1] = XMVectorGetIntY( V );
3396
    pDestination[2] = XMVectorGetIntZ( V );
3397
 
3398
#else // _XM_VMX128_INTRINSICS_
3399
#endif // _XM_VMX128_INTRINSICS_
3400
}
3401
 
3402
//------------------------------------------------------------------------------
3403
 
3404
XMFINLINE VOID XMStoreInt3A
3405
(
3406
    UINT*    pDestination, 
3407
    FXMVECTOR V
3408
)
3409
{
3410
#if defined(_XM_NO_INTRINSICS_)
3411
 
3412
    XMASSERT(pDestination);
3413
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3414
 
3415
    pDestination[0] = V.vector4_u32[0];
3416
    pDestination[1] = V.vector4_u32[1];
3417
    pDestination[2] = V.vector4_u32[2];
3418
 
3419
#elif defined(_XM_SSE_INTRINSICS_)
3420
 
3421
    XMASSERT(pDestination);
3422
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3423
    pDestination[0] = XMVectorGetIntX( V );
3424
    pDestination[1] = XMVectorGetIntY( V );
3425
    pDestination[2] = XMVectorGetIntZ( V );
3426
 
3427
#else // _XM_VMX128_INTRINSICS_
3428
#endif // _XM_VMX128_INTRINSICS_
3429
}
3430
 
3431
//------------------------------------------------------------------------------
3432
 
3433
XMFINLINE VOID XMStoreFloat3
3434
(
3435
    XMFLOAT3* pDestination, 
3436
    FXMVECTOR V
3437
)
3438
{
3439
#if defined(_XM_NO_INTRINSICS_)
3440
 
3441
    XMASSERT(pDestination);
3442
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3443
 
3444
    pDestination->x = V.vector4_f32[0];
3445
    pDestination->y = V.vector4_f32[1];
3446
    pDestination->z = V.vector4_f32[2];
3447
 
3448
#elif defined(_XM_SSE_INTRINSICS_)
3449
 
3450
    XMASSERT(pDestination);
3451
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3452
 
3453
	XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
3454
	XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
3455
	_mm_store_ss( &pDestination->x, V );
3456
	_mm_store_ss( &pDestination->y, T1 );
3457
	_mm_store_ss( &pDestination->z, T2 );
3458
 
3459
#else // _XM_VMX128_INTRINSICS_
3460
#endif // _XM_VMX128_INTRINSICS_
3461
}
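
//------------------------------------------------------------------------------
// Example (editor's sketch, not part of the original library): both paths
// above write exactly the 12 bytes of the destination (three scalar stores in
// the SSE case), so XMStoreFloat3 is safe for tightly packed arrays.
// "positions" and "i" are hypothetical names.
//
//     XMFLOAT3 positions[64];
//     XMStoreFloat3( &positions[i], V );  // touches only positions[i]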
3462
 
3463
//------------------------------------------------------------------------------
3464
 
3465
XMFINLINE VOID XMStoreFloat3A
3466
(
3467
    XMFLOAT3A*   pDestination, 
3468
    FXMVECTOR     V
3469
)
3470
{
3471
#if defined(_XM_NO_INTRINSICS_)
3472
 
3473
    XMASSERT(pDestination);
3474
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3475
 
3476
    pDestination->x = V.vector4_f32[0];
3477
    pDestination->y = V.vector4_f32[1];
3478
    pDestination->z = V.vector4_f32[2];
3479
 
3480
#elif defined(_XM_SSE_INTRINSICS_)
3481
 
3482
    XMASSERT(pDestination);
3483
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3484
 
3485
	XMVECTOR T1 = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3486
	XMVECTOR T2 = _mm_unpackhi_ps( V, V );
3487
	_mm_store_ss( &pDestination->x, V );
3488
	_mm_store_ss( &pDestination->y, T1 );
3489
	_mm_store_ss( &pDestination->z, T2 );
3490
 
3491
#else // _XM_VMX128_INTRINSICS_
3492
#endif // _XM_VMX128_INTRINSICS_
3493
}
3494
 
3495
//------------------------------------------------------------------------------
3496
 
3497
XMFINLINE VOID XMStoreUHenDN3
3498
(
3499
    XMUHENDN3* pDestination, 
3500
    FXMVECTOR   V
3501
)
3502
{
3503
#if defined(_XM_NO_INTRINSICS_)
3504
 
3505
    XMVECTOR               N;
3506
    static CONST XMVECTORF32  Scale = {2047.0f, 2047.0f, 1023.0f, 0.0f};
3507
 
3508
    XMASSERT(pDestination);
3509
 
3510
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
3511
    N = XMVectorMultiply(N, Scale.v);
3512
 
3513
    pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
3514
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
3515
                      (((UINT)N.vector4_f32[0] & 0x7FF));
3516
 
3517
#elif defined(_XM_SSE_INTRINSICS_)
3518
    XMASSERT(pDestination);
3519
    static const XMVECTORF32 ScaleUHenDN3 = {2047.0f, 2047.0f*2048.0f,1023.0f*(2048.0f*2048.0f)/2.0f,1.0f};
3520
    static const XMVECTORI32 MaskUHenDN3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
3521
    // Clamp to bounds
3522
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3523
    vResult = _mm_min_ps(vResult,g_XMOne);
3524
    // Scale by multiplication
3525
    vResult = _mm_mul_ps(vResult,ScaleUHenDN3);
3526
    // Convert to int
3527
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3528
    // Mask off any fraction
3529
    vResulti = _mm_and_si128(vResulti,MaskUHenDN3);
3530
    // Do a horizontal or of 3 entries
3531
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
3532
    // i = x|y
3533
    vResulti = _mm_or_si128(vResulti,vResulti2);
3534
    // Move Z to the x position
3535
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
3536
    // Add Z to itself to perform a single bit left shift
3537
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
3538
    // i = x|y|z
3539
    vResulti = _mm_or_si128(vResulti,vResulti2);
3540
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3541
#else // _XM_VMX128_INTRINSICS_
3542
#endif // _XM_VMX128_INTRINSICS_
3543
}
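
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): UHenDN3
// packs x into bits 0-10, y into bits 11-21 and z into bits 22-31, so storing
// (1.0f, 1.0f, 1.0f) produces
//     v = (0x3FF << 22) | (0x7FF << 11) | 0x7FF = 0xFFFFFFFF,
// while (0, 0, 0) produces 0.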
3544
 
3545
//------------------------------------------------------------------------------
3546
 
3547
XMFINLINE VOID XMStoreUHenD3
3548
(
3549
    XMUHEND3* pDestination, 
3550
    FXMVECTOR  V
3551
)
3552
{
3553
#if defined(_XM_NO_INTRINSICS_)
3554
 
3555
    XMVECTOR               N;
3556
    static CONST XMVECTOR  Max = {2047.0f, 2047.0f, 1023.0f, 0.0f};
3557
 
3558
    XMASSERT(pDestination);
3559
 
3560
    N = XMVectorClamp(V, XMVectorZero(), Max);
3561
 
3562
    pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
3563
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
3564
                      (((UINT)N.vector4_f32[0] & 0x7FF));
3565
 
3566
#elif defined(_XM_SSE_INTRINSICS_)
3567
    XMASSERT(pDestination);
3568
    static const XMVECTORF32 MaxUHenD3 = { 2047.0f, 2047.0f, 1023.0f, 1.0f};
3569
    static const XMVECTORF32 ScaleUHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f)/2.0f,1.0f};
3570
    static const XMVECTORI32 MaskUHenD3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
3571
    // Clamp to bounds
3572
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3573
    vResult = _mm_min_ps(vResult,MaxUHenD3);
3574
    // Scale by multiplication
3575
    vResult = _mm_mul_ps(vResult,ScaleUHenD3);
3576
    // Convert to int
3577
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3578
    // Mask off any fraction
3579
    vResulti = _mm_and_si128(vResulti,MaskUHenD3);
3580
    // Do a horizontal or of 3 entries
3581
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
3582
    // i = x|y
3583
    vResulti = _mm_or_si128(vResulti,vResulti2);
3584
    // Move Z to the x position
3585
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
3586
    // Add Z to itself to perform a single bit left shift
3587
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
3588
    // i = x|y|z
3589
    vResulti = _mm_or_si128(vResulti,vResulti2);
3590
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3591
#else // _XM_VMX128_INTRINSICS_
3592
#endif // _XM_VMX128_INTRINSICS_
3593
}
3594
 
3595
//------------------------------------------------------------------------------
3596
 
3597
XMFINLINE VOID XMStoreHenDN3
3598
(
3599
    XMHENDN3* pDestination, 
3600
    FXMVECTOR V
3601
)
3602
{
3603
#if defined(_XM_NO_INTRINSICS_)
3604
 
3605
    XMVECTOR               N;
3606
    static CONST XMVECTORF32  Scale = {1023.0f, 1023.0f, 511.0f, 1.0f};
3607
 
3608
    XMASSERT(pDestination);
3609
 
3610
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
3611
    N = XMVectorMultiply(N, Scale.v);
3612
 
3613
    pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
3614
                      (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
3615
                      (((INT)N.vector4_f32[0] & 0x7FF));
3616
 
3617
#elif defined(_XM_SSE_INTRINSICS_)
3618
    XMASSERT(pDestination);
3619
    static const XMVECTORF32 ScaleHenDN3 = {1023.0f, 1023.0f*2048.0f,511.0f*(2048.0f*2048.0f),1.0f};
3620
    // Clamp to bounds
3621
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
3622
    vResult = _mm_min_ps(vResult,g_XMOne);
3623
    // Scale by multiplication
3624
    vResult = _mm_mul_ps(vResult,ScaleHenDN3);
3625
    // Convert to int
3626
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3627
    // Mask off any fraction
3628
    vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
3629
    // Do a horizontal or of all 4 entries
3630
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
3631
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3632
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
3633
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3634
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3635
#else // _XM_VMX128_INTRINSICS_
3636
#endif // _XM_VMX128_INTRINSICS_
3637
}
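
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): the signed
// HenDN3 encoding scales x and y by 1023 and z by 511, so storing (1, 1, 1)
// writes 0x3FF into the x field (bits 0-10), 0x3FF into the y field
// (bits 11-21) and 0x1FF into the z field (bits 22-31), giving v = 0x7FDFFBFF;
// negative components are kept in two's complement within their fields.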
3638
 
3639
//------------------------------------------------------------------------------
3640
 
3641
XMFINLINE VOID XMStoreHenD3
3642
(
3643
    XMHEND3* pDestination, 
3644
    FXMVECTOR V
3645
)
3646
{
3647
#if defined(_XM_NO_INTRINSICS_)
3648
 
3649
    XMVECTOR               N;
3650
    static CONST XMVECTOR  Min = {-1023.0f, -1023.0f, -511.0f, -1.0f};
3651
    static CONST XMVECTOR  Max = {1023.0f, 1023.0f, 511.0f, 1.0f};
3652
 
3653
    XMASSERT(pDestination);
3654
 
3655
    N = XMVectorClamp(V, Min, Max);
3656
 
3657
    pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
3658
                      (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
3659
                      (((INT)N.vector4_f32[0] & 0x7FF));
3660
 
3661
#elif defined(_XM_SSE_INTRINSICS_)
3662
    XMASSERT(pDestination);
3663
    static const XMVECTORF32 MinHenD3 = {-1023.0f,-1023.0f,-511.0f,-1.0f};
3664
    static const XMVECTORF32 MaxHenD3 = { 1023.0f, 1023.0f, 511.0f, 1.0f};
3665
    static const XMVECTORF32 ScaleHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f),1.0f};
3666
    // Clamp to bounds
3667
    XMVECTOR vResult = _mm_max_ps(V,MinHenD3);
3668
    vResult = _mm_min_ps(vResult,MaxHenD3);
3669
    // Scale by multiplication
3670
    vResult = _mm_mul_ps(vResult,ScaleHenD3);
3671
    // Convert to int
3672
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3673
    // Mask off any fraction
3674
    vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
3675
    // Do a horizontal or of all 4 entries
3676
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
3677
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3678
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
3679
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3680
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3681
#else // _XM_VMX128_INTRINSICS_
3682
#endif // _XM_VMX128_INTRINSICS_
3683
}
3684
 
3685
//------------------------------------------------------------------------------
3686
 
3687
XMFINLINE VOID XMStoreUDHenN3
3688
(
3689
    XMUDHENN3* pDestination, 
3690
    FXMVECTOR   V
3691
)
3692
{
3693
#if defined(_XM_NO_INTRINSICS_)
3694
 
3695
    XMVECTOR               N;
3696
    static CONST XMVECTORF32  Scale = {1023.0f, 2047.0f, 2047.0f, 0.0f};
3697
 
3698
    XMASSERT(pDestination);
3699
 
3700
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
3701
    N = XMVectorMultiply(N, Scale.v);
3702
 
3703
    pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
3704
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
3705
                      (((UINT)N.vector4_f32[0] & 0x3FF));
3706
 
3707
#elif defined(_XM_SSE_INTRINSICS_)
3708
    XMASSERT(pDestination);
3709
    static const XMVECTORF32 ScaleUDHenN3 = {1023.0f,2047.0f*1024.0f,2047.0f*(1024.0f*2048.0f)/2.0f,1.0f};
3710
    static const XMVECTORI32 MaskUDHenN3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
3711
    // Clamp to bounds
3712
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3713
    vResult = _mm_min_ps(vResult,g_XMOne);
3714
    // Scale by multiplication
3715
    vResult = _mm_mul_ps(vResult,ScaleUDHenN3);
3716
    // Convert to int
3717
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3718
    // Mask off any fraction
3719
    vResulti = _mm_and_si128(vResulti,MaskUDHenN3);
3720
    // Do a horizontal or of 3 entries
3721
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
3722
    // i = x|y
3723
    vResulti = _mm_or_si128(vResulti,vResulti2);
3724
    // Move Z to the x position
3725
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
3726
    // Add Z to itself to perform a single bit left shift
3727
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
3728
    // i = x|y|z
3729
    vResulti = _mm_or_si128(vResulti,vResulti2);
3730
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3731
#else // _XM_VMX128_INTRINSICS_
3732
#endif // _XM_VMX128_INTRINSICS_
3733
}
3734
 
3735
//------------------------------------------------------------------------------
3736
 
3737
XMFINLINE VOID XMStoreUDHen3
3738
(
3739
    XMUDHEN3* pDestination, 
3740
    FXMVECTOR  V
3741
)
3742
{
3743
#if defined(_XM_NO_INTRINSICS_)
3744
 
3745
    XMVECTOR               N;
3746
    static CONST XMVECTOR  Max = {1023.0f, 2047.0f, 2047.0f, 0.0f};
3747
 
3748
    XMASSERT(pDestination);
3749
 
3750
    N = XMVectorClamp(V, XMVectorZero(), Max);
3751
 
3752
    pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
3753
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
3754
                      (((UINT)N.vector4_f32[0] & 0x3FF));
3755
 
3756
#elif defined(_XM_SSE_INTRINSICS_)
3757
    XMASSERT(pDestination);
3758
    static const XMVECTORF32 MaxUDHen3 = { 1023.0f, 2047.0f, 2047.0f, 1.0f};
3759
    static const XMVECTORF32 ScaleUDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f)/2.0f,1.0f};
3760
    static const XMVECTORI32 MaskUDHen3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
3761
    // Clamp to bounds
3762
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3763
    vResult = _mm_min_ps(vResult,MaxUDHen3);
3764
    // Scale by multiplication
3765
    vResult = _mm_mul_ps(vResult,ScaleUDHen3);
3766
    // Convert to int
3767
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3768
    // Mask off any fraction
3769
    vResulti = _mm_and_si128(vResulti,MaskUDHen3);
3770
    // Do a horizontal or of 3 entries
3771
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
3772
    // i = x|y
3773
    vResulti = _mm_or_si128(vResulti,vResulti2);
3774
    // Move Z to the x position
3775
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
3776
    // Add Z to itself to perform a single bit left shift
3777
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
3778
    // i = x|y|z
3779
    vResulti = _mm_or_si128(vResulti,vResulti2);
3780
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3781
#else // _XM_VMX128_INTRINSICS_
3782
#endif // _XM_VMX128_INTRINSICS_
3783
}
3784
 
3785
//------------------------------------------------------------------------------
3786
 
3787
XMFINLINE VOID XMStoreDHenN3
3788
(
3789
    XMDHENN3* pDestination, 
3790
    FXMVECTOR V
3791
)
3792
{
3793
#if defined(_XM_NO_INTRINSICS_)
3794
 
3795
    XMVECTOR               N;
3796
    static CONST XMVECTORF32  Scale = {511.0f, 1023.0f, 1023.0f, 1.0f};
3797
 
3798
    XMASSERT(pDestination);
3799
 
3800
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
3801
    N = XMVectorMultiply(N, Scale.v);
3802
 
3803
    pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
3804
                      (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
3805
                      (((INT)N.vector4_f32[0] & 0x3FF));
3806
 
3807
#elif defined(_XM_SSE_INTRINSICS_)
3808
    XMASSERT(pDestination);
3809
    static const XMVECTORF32 ScaleDHenN3 = {511.0f, 1023.0f*1024.0f,1023.0f*(1024.0f*2048.0f),1.0f};
3810
    // Clamp to bounds
3811
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
3812
    vResult = _mm_min_ps(vResult,g_XMOne);
3813
    // Scale by multiplication
3814
    vResult = _mm_mul_ps(vResult,ScaleDHenN3);
3815
    // Convert to int
3816
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3817
    // Mask off any fraction
3818
    vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
3819
    // Do a horizontal or of all 4 entries
3820
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
3821
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3822
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
3823
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3824
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3825
#else // _XM_VMX128_INTRINSICS_
3826
#endif // _XM_VMX128_INTRINSICS_
3827
}
3828
 
3829
//------------------------------------------------------------------------------
3830
 
3831
XMFINLINE VOID XMStoreDHen3
3832
(
3833
    XMDHEN3* pDestination, 
3834
    FXMVECTOR V
3835
)
3836
{
3837
#if defined(_XM_NO_INTRINSICS_)
3838
 
3839
    XMVECTOR               N;
3840
    static CONST XMVECTOR  Min = {-511.0f, -1023.0f, -1023.0f, -1.0f};
3841
    static CONST XMVECTOR  Max = {511.0f, 1023.0f, 1023.0f, 1.0f};
3842
 
3843
    XMASSERT(pDestination);
3844
 
3845
    N = XMVectorClamp(V, Min, Max);
3846
 
3847
    pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
3848
                      (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
3849
                      (((INT)N.vector4_f32[0] & 0x3FF));
3850
 
3851
#elif defined(_XM_SSE_INTRINSICS_)
3852
    XMASSERT(pDestination);
3853
    static const XMVECTORF32 MinDHen3 = {-511.0f,-1023.0f,-1023.0f,-1.0f};
3854
    static const XMVECTORF32 MaxDHen3 = { 511.0f, 1023.0f, 1023.0f, 1.0f};
3855
    static const XMVECTORF32 ScaleDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f),1.0f};
3856
    // Clamp to bounds
3857
    XMVECTOR vResult = _mm_max_ps(V,MinDHen3);
3858
    vResult = _mm_min_ps(vResult,MaxDHen3);
3859
    // Scale by multiplication
3860
    vResult = _mm_mul_ps(vResult,ScaleDHen3);
3861
    // Convert to int
3862
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3863
    // Mask off any fraction
3864
    vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
3865
    // Do a horizontal or of all 4 entries
3866
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
3867
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3868
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
3869
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3870
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3871
#else // _XM_VMX128_INTRINSICS_
3872
#endif // _XM_VMX128_INTRINSICS_
3873
}
3874
 
3875
//------------------------------------------------------------------------------
3876
 
3877
XMFINLINE VOID XMStoreU565
3878
(
3879
    XMU565* pDestination,
3880
    FXMVECTOR V
3881
)
3882
{
3883
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
3884
    XMASSERT(pDestination);
3885
    static CONST XMVECTORF32  Max = {31.0f, 63.0f, 31.0f, 0.0f};
3886
    // Bounds check
3887
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3888
    vResult = _mm_min_ps(vResult,Max);
3889
     // Convert to int with rounding
3890
    __m128i vInt = _mm_cvtps_epi32(vResult);
3891
    // No SSE operations will write to 16-bit values, so we have to extract them manually
3892
    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
3893
    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
3894
    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
3895
    pDestination->v = ((z & 0x1F) << 11) |
3896
                      ((y & 0x3F) << 5) |
3897
                      ((x & 0x1F));
3898
#else
3899
    XMVECTOR               N;
3900
    static CONST XMVECTORF32  Max = {31.0f, 63.0f, 31.0f, 0.0f};
3901
 
3902
    XMASSERT(pDestination);
3903
 
3904
    N = XMVectorClamp(V, XMVectorZero(), Max.v);
3905
    N = XMVectorRound(N);
3906
 
3907
    pDestination->v = (((USHORT)N.vector4_f32[2] & 0x1F) << 11) |
3908
                      (((USHORT)N.vector4_f32[1] & 0x3F) << 5) |
3909
                      (((USHORT)N.vector4_f32[0] & 0x1F));
3910
#endif // !_XM_SSE_INTRINSICS_
3911
}
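
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): XMStoreU565
// expects raw component values in [0,31], [0,63] and [0,31] rather than a
// normalized [0,1] range, so storing (31.0f, 63.0f, 31.0f) yields
//     v = (31 << 11) | (63 << 5) | 31 = 0xFFFF.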
3912
 
3913
//------------------------------------------------------------------------------
3914
 
3915
XMFINLINE VOID XMStoreFloat3PK
3916
(
3917
    XMFLOAT3PK* pDestination,
3918
    FXMVECTOR V
3919
)
3920
{
3921
    UINT I, Sign, j;
3922
    UINT IValue[3];
3923
    UINT Result[3];
3924
 
3925
    XMASSERT(pDestination);
3926
 
3927
    XMStoreFloat3( (XMFLOAT3*)&IValue, V );
3928
 
3929
    // X & Y Channels (5-bit exponent, 6-bit mantissa)
3930
    for(j=0; j < 2; ++j)
3931
    {
3932
        Sign = IValue[j] & 0x80000000;
3933
        I = IValue[j] & 0x7FFFFFFF;
3934
 
3935
        if ((I & 0x7F800000) == 0x7F800000)
3936
        {
3937
            // INF or NAN
3938
            Result[j] = 0x7c0;
3939
            if (( I & 0x7FFFFF ) != 0)
3940
            {
3941
                Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f);
3942
            }
3943
            else if ( Sign )
3944
            {
3945
                // -INF is clamped to 0 since 3PK is positive only
3946
                Result[j] = 0;
3947
            }
3948
        }
3949
        else if ( Sign )
3950
        {
3951
            // 3PK is positive only, so clamp to zero
3952
            Result[j] = 0;
3953
        }
3954
        else if (I > 0x477E0000U)
3955
        {
3956
            // The number is too large to be represented as a float11, set to max
3957
            Result[j] = 0x7BF;
3958
        }
3959
        else
3960
        {
3961
            if (I < 0x38800000U)
3962
            {
3963
                // The number is too small to be represented as a normalized float11
3964
                // Convert it to a denormalized value.
3965
                UINT Shift = 113U - (I >> 23U);
3966
                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
3967
            }
3968
            else
3969
            {
3970
                // Rebias the exponent to represent the value as a normalized float11
3971
                I += 0xC8000000U;
3972
            }
3973
 
3974
            Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU;
3975
        }
3976
    }
3977
 
3978
    // Z Channel (5-bit exponent, 5-bit mantissa)
3979
    Sign = IValue[2] & 0x80000000;
3980
    I = IValue[2] & 0x7FFFFFFF;
3981
 
3982
    if ((I & 0x7F800000) == 0x7F800000)
3983
    {
3984
        // INF or NAN
3985
        Result[2] = 0x3e0;
3986
        if ( I & 0x7FFFFF )
3987
        {
3988
            Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f);
3989
        }
3990
        else if ( Sign )
3991
        {
3992
            // -INF is clamped to 0 since 3PK is positive only
3993
            Result[2] = 0;
3994
        }
3995
    }
3996
    else if ( Sign )
3997
    {
3998
        // 3PK is positive only, so clamp to zero
3999
        Result[2] = 0;
4000
    }
4001
    else if (I > 0x477C0000U)
4002
    {
4003
        // The number is too large to be represented as a float10, set to max
4004
        Result[2] = 0x3df;
4005
    }
4006
    else
4007
    {
4008
        if (I < 0x38800000U)
4009
        {
4010
            // The number is too small to be represented as a normalized float10
4011
            // Convert it to a denormalized value.
4012
            UINT Shift = 113U - (I >> 23U);
4013
            I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
4014
        }
4015
        else
4016
        {
4017
            // Rebias the exponent to represent the value as a normalized float10
4018
            I += 0xC8000000U;
4019
        }
4020
 
4021
        Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU;
4022
    }
4023
 
4024
    // Pack Result into memory
4025
    pDestination->v = (Result[0] & 0x7ff)
4026
                      | ( (Result[1] & 0x7ff) << 11 )
4027
                      | ( (Result[2] & 0x3ff) << 22 );
4028
}
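
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): for the X
// and Y channels above, 1.0f (0x3F800000) gains 0xC8000000 during the rebias,
// wrapping to 0x07800000, and since the rounding term is zero here
//     ((0x07800000 + 0xFFFF) >> 17) & 0x7FF = 0x3C0,
// a float11 with exponent field 15 (the bias) and mantissa 0, which decodes
// back to 1.0.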
4029
 
4030
 
4031
//------------------------------------------------------------------------------
4032
 
4033
XMFINLINE VOID XMStoreFloat3SE
4034
(
4035
    XMFLOAT3SE* pDestination,
4036
    FXMVECTOR V
4037
)
4038
{
4039
    UINT I, Sign, j, T;
4040
    UINT IValue[3];
4041
    UINT Frac[3];
4042
    UINT Exp[3];
4043
 
4044
    XMASSERT(pDestination);
4045
 
4046
    XMStoreFloat3( (XMFLOAT3*)&IValue, V );
4047
 
4048
    // X, Y, Z Channels (5-bit exponent, 9-bit mantissa)
4049
    for(j=0; j < 3; ++j)
4050
    {
4051
        Sign = IValue[j] & 0x80000000;
4052
        I = IValue[j] & 0x7FFFFFFF;
4053
 
4054
        if ((I & 0x7F800000) == 0x7F800000)
4055
        {
4056
            // INF or NAN
4057
            Exp[j] = 0x1f;
4058
            if (( I & 0x7FFFFF ) != 0)
4059
            {
4060
                Frac[j] = ((I>>14)|(I>>5)|(I))&0x1ff;
4061
            }
4062
            else if ( Sign )
4063
            {
4064
                // -INF is clamped to 0 since 3SE is positive only
4065
                Exp[j] = Frac[j] = 0;
4066
            }
4067
        }
4068
        else if ( Sign )
4069
        {
4070
            // 3SE is positive only, so clamp to zero
4071
            Exp[j] = Frac[j] = 0;
4072
        }
4073
        else if (I > 0x477FC000U)
4074
        {
4075
            // The number is too large, set to max
4076
            Exp[j] = 0x1e;
4077
            Frac[j] = 0x1ff;
4078
        }
4079
        else
4080
        {
4081
            if (I < 0x38800000U)
4082
            {
4083
                // The number is too small to be stored as a normalized value in this format.
4084
                // Convert it to a denormalized value.
4085
                UINT Shift = 113U - (I >> 23U);
4086
                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
4087
            }
4088
            else
4089
            {
4090
                // Rebias the exponent to represent the value in normalized form for this format
4091
                I += 0xC8000000U;
4092
            }
4093
 
4094
            T = ((I + 0x1FFFU + ((I >> 14U) & 1U)) >> 14U)&0x3fffU;
4095
 
4096
            Exp[j] = (T & 0x3E00) >> 9;
4097
            Frac[j] = T & 0x1ff;
4098
        }
4099
    }
4100
 
4101
    // Adjust to a shared exponent
4102
    T = XMMax( Exp[0], XMMax( Exp[1], Exp[2] ) );
4103
 
4104
    Frac[0] = Frac[0] >> (T - Exp[0]);
4105
    Frac[1] = Frac[1] >> (T - Exp[1]);
4106
    Frac[2] = Frac[2] >> (T - Exp[2]);
4107
 
4108
    // Store packed into memory
4109
    pDestination->xm = Frac[0];
4110
    pDestination->ym = Frac[1];
4111
    pDestination->zm = Frac[2];
4112
    pDestination->e = T;
4113
}
4114
 
4115
//------------------------------------------------------------------------------
4116
 
4117
XMFINLINE VOID XMStoreInt4
4118
(
4119
    UINT*    pDestination, 
4120
    FXMVECTOR V
4121
)
4122
{
4123
#if defined(_XM_NO_INTRINSICS_)
4124
 
4125
    XMASSERT(pDestination);
4126
 
4127
    pDestination[0] = V.vector4_u32[0];
4128
    pDestination[1] = V.vector4_u32[1];
4129
    pDestination[2] = V.vector4_u32[2];
4130
    pDestination[3] = V.vector4_u32[3];
4131
 
4132
#elif defined(_XM_SSE_INTRINSICS_)
4133
    XMASSERT(pDestination);
4134
 
4135
    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
4136
 
4137
#else // _XM_VMX128_INTRINSICS_
4138
#endif // _XM_VMX128_INTRINSICS_
4139
}
4140
 
4141
//------------------------------------------------------------------------------
4142
 
4143
XMFINLINE VOID XMStoreInt4A
4144
(
4145
    UINT*    pDestination, 
4146
    FXMVECTOR V
4147
)
4148
{
4149
#if defined(_XM_NO_INTRINSICS_)
4150
 
4151
    XMASSERT(pDestination);
4152
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4153
 
4154
    pDestination[0] = V.vector4_u32[0];
4155
    pDestination[1] = V.vector4_u32[1];
4156
    pDestination[2] = V.vector4_u32[2];
4157
    pDestination[3] = V.vector4_u32[3];
4158
 
4159
#elif defined(_XM_SSE_INTRINSICS_)
4160
    XMASSERT(pDestination);
4161
 
4162
    _mm_store_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
4163
 
4164
#else // _XM_VMX128_INTRINSICS_
4165
#endif // _XM_VMX128_INTRINSICS_
4166
}
4167
 
4168
//------------------------------------------------------------------------------
4169
 
4170
XMFINLINE VOID XMStoreInt4NC
4171
(
4172
    UINT*    pDestination, 
4173
    FXMVECTOR V
4174
)
4175
{
4176
#if defined(_XM_NO_INTRINSICS_)
4177
 
4178
    XMASSERT(pDestination);
4179
 
4180
    pDestination[0] = V.vector4_u32[0];
4181
    pDestination[1] = V.vector4_u32[1];
4182
    pDestination[2] = V.vector4_u32[2];
4183
    pDestination[3] = V.vector4_u32[3];
4184
 
4185
#elif defined(_XM_SSE_INTRINSICS_)
4186
    XMASSERT(pDestination);
4187
 
4188
    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
4189
 
4190
#else // _XM_VMX128_INTRINSICS_
4191
#endif // _XM_VMX128_INTRINSICS_
4192
}
4193
 
4194
//------------------------------------------------------------------------------
4195
 
4196
XMFINLINE VOID XMStoreFloat4
4197
(
4198
    XMFLOAT4* pDestination, 
4199
    FXMVECTOR  V
4200
)
4201
{
4202
#if defined(_XM_NO_INTRINSICS_)
4203
 
4204
    XMASSERT(pDestination);
4205
 
4206
    pDestination->x = V.vector4_f32[0];
4207
    pDestination->y = V.vector4_f32[1];
4208
    pDestination->z = V.vector4_f32[2];
4209
    pDestination->w = V.vector4_f32[3];
4210
 
4211
#elif defined(_XM_SSE_INTRINSICS_)
4212
    XMASSERT(pDestination);
4213
 
4214
    _mm_storeu_ps( &pDestination->x, V );
4215
 
4216
#else // _XM_VMX128_INTRINSICS_
4217
#endif // _XM_VMX128_INTRINSICS_
4218
}
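
//------------------------------------------------------------------------------
// Example (editor's sketch, not part of the original library): XMStoreFloat4
// uses an unaligned store, so the destination only needs natural float
// alignment; prefer XMStoreFloat4A below when the target is known to be
// 16-byte aligned. "XMExampleStoreSaturated4" is a hypothetical helper name.

XMFINLINE VOID XMExampleStoreSaturated4
(
    XMFLOAT4* pDestination,
    FXMVECTOR V
)
{
    // Clamp to [0,1] before spilling, e.g. for a color about to be quantized.
    XMStoreFloat4( pDestination, XMVectorSaturate( V ) );
}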
4219
 
4220
//------------------------------------------------------------------------------
4221
 
4222
XMFINLINE VOID XMStoreFloat4A
4223
(
4224
    XMFLOAT4A*   pDestination, 
4225
    FXMVECTOR     V
4226
)
4227
{
4228
#if defined(_XM_NO_INTRINSICS_)
4229
 
4230
    XMASSERT(pDestination);
4231
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4232
 
4233
    pDestination->x = V.vector4_f32[0];
4234
    pDestination->y = V.vector4_f32[1];
4235
    pDestination->z = V.vector4_f32[2];
4236
    pDestination->w = V.vector4_f32[3];
4237
 
4238
#elif defined(_XM_SSE_INTRINSICS_)
4239
    XMASSERT(pDestination);
4240
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4241
 
4242
    _mm_store_ps( &pDestination->x, V );
4243
#else // _XM_VMX128_INTRINSICS_
4244
#endif // _XM_VMX128_INTRINSICS_
4245
}
4246
 
4247
//------------------------------------------------------------------------------
4248
 
4249
XMFINLINE VOID XMStoreFloat4NC
4250
(
4251
    XMFLOAT4* pDestination, 
4252
    FXMVECTOR  V
4253
)
4254
{
4255
#if defined(_XM_NO_INTRINSICS_)
4256
 
4257
    XMASSERT(pDestination);
4258
 
4259
    pDestination->x = V.vector4_f32[0];
4260
    pDestination->y = V.vector4_f32[1];
4261
    pDestination->z = V.vector4_f32[2];
4262
    pDestination->w = V.vector4_f32[3];
4263
 
4264
#elif defined(_XM_SSE_INTRINSICS_)
4265
    XMASSERT(pDestination);
4266
 
4267
    _mm_storeu_ps( &pDestination->x, V );
4268
 
4269
#else // _XM_VMX128_INTRINSICS_
4270
#endif // _XM_VMX128_INTRINSICS_
4271
}
4272
 
4273
//------------------------------------------------------------------------------
4274
 
4275
XMFINLINE VOID XMStoreHalf4
4276
(
4277
    XMHALF4* pDestination, 
4278
    FXMVECTOR V
4279
)
4280
{
4281
#if defined(_XM_NO_INTRINSICS_) 
4282
 
4283
    XMASSERT(pDestination);
4284
 
4285
    pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
4286
    pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);
4287
    pDestination->z = XMConvertFloatToHalf(V.vector4_f32[2]);
4288
    pDestination->w = XMConvertFloatToHalf(V.vector4_f32[3]);
4289
 
4290
#elif defined(_XM_SSE_INTRINSICS_)
4291
    XMASSERT(pDestination);
4292
    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
4293
    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
4294
    pDestination->z = XMConvertFloatToHalf(XMVectorGetZ(V));
4295
    pDestination->w = XMConvertFloatToHalf(XMVectorGetW(V));
4296
#else // _XM_VMX128_INTRINSICS_
4297
#endif // _XM_VMX128_INTRINSICS_
4298
}
4299
 
4300
//------------------------------------------------------------------------------
4301
 
4302
XMFINLINE VOID XMStoreShortN4
4303
(
4304
    XMSHORTN4* pDestination, 
4305
    FXMVECTOR   V
4306
)
4307
{
4308
#if defined(_XM_NO_INTRINSICS_)
4309
 
4310
    XMVECTOR               N;
4311
    static CONST XMVECTORF32  Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4312
 
4313
    XMASSERT(pDestination);
4314
 
4315
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
4316
    N = XMVectorMultiply(N, Scale.v);
4317
    N = XMVectorRound(N);
4318
 
4319
    pDestination->x = (SHORT)N.vector4_f32[0];
4320
    pDestination->y = (SHORT)N.vector4_f32[1];
4321
    pDestination->z = (SHORT)N.vector4_f32[2];
4322
    pDestination->w = (SHORT)N.vector4_f32[3];
4323
 
4324
#elif defined(_XM_SSE_INTRINSICS_)
4325
    XMASSERT(pDestination);
4326
    static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4327
 
4328
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
4329
    vResult = _mm_min_ps(vResult,g_XMOne);
4330
    vResult = _mm_mul_ps(vResult,Scale);
4331
    __m128i vResulti = _mm_cvtps_epi32(vResult);
4332
    vResulti = _mm_packs_epi32(vResulti,vResulti);
4333
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4334
#else // _XM_VMX128_INTRINSICS_
4335
#endif // _XM_VMX128_INTRINSICS_
4336
}
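
//------------------------------------------------------------------------------
// Example (editor's sketch, not part of the original library): quantizing a
// vector to XMSHORTN4 and expanding it again. Components survive to within
// 1/32767 of their original value; anything outside [-1,1] is clamped on the
// way in. "XMExampleRoundTripShortN4" is a hypothetical helper name.

XMFINLINE XMVECTOR XMExampleRoundTripShortN4
(
    FXMVECTOR V
)
{
    XMSHORTN4 Packed;
    XMStoreShortN4( &Packed, V );     // clamp, scale by 32767, round
    return XMLoadShortN4( &Packed );  // scale back by 1/32767
}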
4337
 
4338
//------------------------------------------------------------------------------
4339
 
4340
XMFINLINE VOID XMStoreShort4
4341
(
4342
    XMSHORT4* pDestination, 
4343
    FXMVECTOR  V
4344
)
4345
{
4346
#if defined(_XM_NO_INTRINSICS_)
4347
 
4348
    XMVECTOR               N;
4349
    static CONST XMVECTOR  Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
4350
    static CONST XMVECTOR  Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4351
 
4352
    XMASSERT(pDestination);
4353
 
4354
    N = XMVectorClamp(V, Min, Max);
4355
    N = XMVectorRound(N);
4356
 
4357
    pDestination->x = (SHORT)N.vector4_f32[0];
4358
    pDestination->y = (SHORT)N.vector4_f32[1];
4359
    pDestination->z = (SHORT)N.vector4_f32[2];
4360
    pDestination->w = (SHORT)N.vector4_f32[3];
4361
 
4362
#elif defined(_XM_SSE_INTRINSICS_)
4363
    XMASSERT(pDestination);
4364
    static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
4365
    static CONST XMVECTORF32  Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4366
    // Bounds check
4367
    XMVECTOR vResult = _mm_max_ps(V,Min);
4368
    vResult = _mm_min_ps(vResult,Max);
4369
     // Convert to int with rounding
4370
    __m128i vInt = _mm_cvtps_epi32(vResult);
4371
    // Pack the ints into shorts
4372
    vInt = _mm_packs_epi32(vInt,vInt);
4373
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vInt)[0]);
4374
#else // _XM_VMX128_INTRINSICS_
4375
#endif // _XM_VMX128_INTRINSICS_
4376
}
4377
 
4378
//------------------------------------------------------------------------------
4379
 
4380
XMFINLINE VOID XMStoreUShortN4
4381
(
4382
    XMUSHORTN4* pDestination, 
4383
    FXMVECTOR    V
4384
)
4385
{
4386
#if defined(_XM_NO_INTRINSICS_)
4387
 
4388
    XMVECTOR               N;
4389
    static CONST XMVECTORF32  Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
4390
 
4391
    XMASSERT(pDestination);
4392
 
4393
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
4394
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
4395
    N = XMVectorTruncate(N);
4396
 
4397
    pDestination->x = (USHORT)N.vector4_f32[0];
4398
    pDestination->y = (USHORT)N.vector4_f32[1];
4399
    pDestination->z = (USHORT)N.vector4_f32[2];
4400
    pDestination->w = (USHORT)N.vector4_f32[3];
4401
 
4402
#elif defined(_XM_SSE_INTRINSICS_)
4403
    XMASSERT(pDestination);
4404
    static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
4405
    // Bounds check
4406
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4407
    vResult = _mm_min_ps(vResult,g_XMOne);
4408
    vResult = _mm_mul_ps(vResult,Scale);
4409
    // Convert to int with rounding
4410
    __m128i vInt = _mm_cvtps_epi32(vResult);
4411
    // Since the SSE pack instruction clamps using signed rules,
4412
    // manually extract the values to store them to memory
4413
    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
4414
    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
4415
    pDestination->z = static_cast<SHORT>(_mm_extract_epi16(vInt,4));
4416
    pDestination->w = static_cast<SHORT>(_mm_extract_epi16(vInt,6));
4417
#else // _XM_VMX128_INTRINSICS_
4418
#endif // _XM_VMX128_INTRINSICS_
4419
}
4420
 
4421
//------------------------------------------------------------------------------
4422
 
4423
XMFINLINE VOID XMStoreUShort4
4424
(
4425
    XMUSHORT4* pDestination, 
4426
    FXMVECTOR   V
4427
)
4428
{
4429
#if defined(_XM_NO_INTRINSICS_)
4430
 
4431
    XMVECTOR               N;
4432
    static CONST XMVECTOR  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
4433
 
4434
    XMASSERT(pDestination);
4435
 
4436
    N = XMVectorClamp(V, XMVectorZero(), Max);
4437
    N = XMVectorRound(N);
4438
 
4439
    pDestination->x = (USHORT)N.vector4_f32[0];
4440
    pDestination->y = (USHORT)N.vector4_f32[1];
4441
    pDestination->z = (USHORT)N.vector4_f32[2];
4442
    pDestination->w = (USHORT)N.vector4_f32[3];
4443
 
4444
#elif defined(_XM_SSE_INTRINSICS_)
4445
    XMASSERT(pDestination);
4446
    static CONST XMVECTORF32  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
4447
    // Bounds check
4448
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4449
    vResult = _mm_min_ps(vResult,Max);
4450
     // Convert to int with rounding
4451
    __m128i vInt = _mm_cvtps_epi32(vResult);
4452
    // Since the SSE pack instruction clamps using signed rules,
4453
    // manually extract the values to store them to memory
4454
    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
4455
    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
4456
    pDestination->z = static_cast<SHORT>(_mm_extract_epi16(vInt,4));
4457
    pDestination->w = static_cast<SHORT>(_mm_extract_epi16(vInt,6));
4458
#else // _XM_VMX128_INTRINSICS_
4459
#endif // _XM_VMX128_INTRINSICS_
4460
}
4461
 
4462
//------------------------------------------------------------------------------
4463
 
4464
XMFINLINE VOID XMStoreXIcoN4
4465
(
4466
    XMXICON4*  pDestination, 
4467
    FXMVECTOR   V
4468
)
4469
{
4470
#if defined(_XM_NO_INTRINSICS_)
4471
 
4472
    XMVECTOR               N;
4473
    static CONST XMVECTORF32  Min = {-1.0f, -1.0f, -1.0f, 0.0f};
4474
    static CONST XMVECTORF32  Scale = {524287.0f, 524287.0f, 524287.0f, 15.0f};
4475
 
4476
    XMASSERT(pDestination);
4477
 
4478
    N = XMVectorClamp(V, Min.v, g_XMOne.v);
4479
    N = XMVectorMultiply(N, Scale.v);
4480
    N = XMVectorRound(N);
4481
 
4482
    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
4483
                       (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4484
                       (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4485
                       (((INT64)N.vector4_f32[0] & 0xFFFFF));
4486
 
4487
#elif defined(_XM_SSE_INTRINSICS_)
4488
    XMASSERT(pDestination);
4489
    // Note: Masks are x,w,y and z
4490
    static const XMVECTORF32 MinXIcoN4 = {-1.0f, 0.0f,-1.0f,-1.0f};
4491
    static const XMVECTORF32 ScaleXIcoN4 = {524287.0f,15.0f*4096.0f*65536.0f*0.5f,524287.0f*4096.0f,524287.0f};
4492
    static const XMVECTORI32 MaskXIcoN4 = {0xFFFFF,0xF<<((60-32)-1),0xFFFFF000,0xFFFFF};
4493
 
4494
    // Clamp to bounds
4495
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4496
    vResult = _mm_max_ps(vResult,MinXIcoN4);
4497
    vResult = _mm_min_ps(vResult,g_XMOne);
4498
    // Scale by multiplication
4499
    vResult = _mm_mul_ps(vResult,ScaleXIcoN4);
4500
    // Convert to integer (w is unsigned)
4501
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4502
    // Mask off unused bits
4503
    vResulti = _mm_and_si128(vResulti,MaskXIcoN4);
4504
    // Isolate Y
4505
    __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
4506
    // Double Y (Really W) to fixup for unsigned conversion
4507
    vResulti = _mm_add_epi32(vResulti,vResulti2);
4508
    // Shift y and z to straddle the 32-bit boundary
4509
    vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4510
    // Shift it into place
4511
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4512
    // i = x|y<<20|z<<40|w<<60
4513
    vResulti = _mm_or_si128(vResulti,vResulti2);
4514
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4515
#else // _XM_VMX128_INTRINSICS_
4516
#endif // _XM_VMX128_INTRINSICS_
4517
}
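
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): XIcoN4
// packs three signed 20-bit normalized components and an unsigned 4-bit w into
// 64 bits as x | y << 20 | z << 40 | w << 60, so storing (1, 1, 1, 1) produces
//     v = (0xFULL << 60) | (0x7FFFFULL << 40) | (0x7FFFFULL << 20) | 0x7FFFF.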
4518
 
4519
//------------------------------------------------------------------------------
4520
 
4521
XMFINLINE VOID XMStoreXIco4
4522
(
4523
    XMXICO4*  pDestination, 
4524
    FXMVECTOR  V
4525
)
4526
{
4527
#if defined(_XM_NO_INTRINSICS_)
4528
 
4529
    XMVECTOR N;
4530
    static CONST XMVECTORF32 Min = {-524287.0f, -524287.0f, -524287.0f, 0.0f};
4531
    static CONST XMVECTORF32 Max = {524287.0f, 524287.0f, 524287.0f, 15.0f};
4532
 
4533
    XMASSERT(pDestination);
4534
    N = XMVectorClamp(V, Min.v, Max.v);
4535
    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
4536
                       (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4537
                       (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4538
                       (((INT64)N.vector4_f32[0] & 0xFFFFF));
4539
 
4540
#elif defined(_XM_SSE_INTRINSICS_)
4541
    XMASSERT(pDestination);
4542
    // Note: Masks are x,w,y and z
4543
    static const XMVECTORF32 MinXIco4 = {-524287.0f, 0.0f,-524287.0f,-524287.0f};
4544
    static const XMVECTORF32 MaxXIco4 = { 524287.0f,15.0f, 524287.0f, 524287.0f};
4545
    static const XMVECTORF32 ScaleXIco4 = {1.0f,4096.0f*65536.0f*0.5f,4096.0f,1.0f};
4546
    static const XMVECTORI32 MaskXIco4 = {0xFFFFF,0xF<<((60-1)-32),0xFFFFF000,0xFFFFF};
4547
    // Clamp to bounds
4548
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4549
    vResult = _mm_max_ps(vResult,MinXIco4);
4550
    vResult = _mm_min_ps(vResult,MaxXIco4);
4551
    // Scale by multiplication
4552
    vResult = _mm_mul_ps(vResult,ScaleXIco4);
4553
    // Convert to int
4554
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4555
    // Mask off any fraction
4556
    vResulti = _mm_and_si128(vResulti,MaskXIco4);
4557
    // Isolate Y
4558
    __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
4559
    // Double Y (Really W) to fixup for unsigned conversion
4560
    vResulti = _mm_add_epi32(vResulti,vResulti2);
4561
    // Shift y and z to straddle the 32-bit boundary
4562
    vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4563
    // Shift it into place
4564
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4565
    // i = x|y<<20|z<<40|w<<60
4566
    vResulti = _mm_or_si128(vResulti,vResulti2);
4567
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4568
#else // _XM_VMX128_INTRINSICS_
4569
#endif // _XM_VMX128_INTRINSICS_
4570
}
4571
 
4572
//------------------------------------------------------------------------------
4573
 
4574
XMFINLINE VOID XMStoreUIcoN4
4575
(
4576
    XMUICON4*  pDestination, 
4577
    FXMVECTOR   V
4578
)
4579
{
4580
    #define XM_URange       ((FLOAT)(1 << 20))
4581
    #define XM_URangeDiv2   ((FLOAT)(1 << 19))
4582
    #define XM_UMaxXYZ      ((FLOAT)((1 << 20) - 1))
4583
    #define XM_UMaxW        ((FLOAT)((1 << 4) - 1))
4584
    #define XM_ScaleXYZ     (-(FLOAT)((1 << 20) - 1) / XM_PACK_FACTOR)
4585
    #define XM_ScaleW       (-(FLOAT)((1 << 4) - 1) / XM_PACK_FACTOR)
4586
    #define XM_Scale        (-1.0f / XM_PACK_FACTOR)
4587
    #define XM_Offset       (3.0f)
4588
 
4589
#if defined(_XM_NO_INTRINSICS_)
4590
 
4591
    XMVECTOR               N;
4592
    static CONST XMVECTORF32 Scale = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};
4593
 
4594
    XMASSERT(pDestination);
4595
 
4596
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
4597
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
4598
 
4599
    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
4600
                       (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4601
                       (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4602
                       (((UINT64)N.vector4_f32[0] & 0xFFFFF));
4603
 
4604
#elif defined(_XM_SSE_INTRINSICS_)
4605
    XMASSERT(pDestination);
4606
    // Note: Masks are x,w,y and z
4607
    static const XMVECTORF32 ScaleUIcoN4 = {1048575.0f,15.0f*4096.0f*65536.0f,1048575.0f*4096.0f,1048575.0f};
4608
    static const XMVECTORI32 MaskUIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
4609
    static const XMVECTORF32 AddUIcoN4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};
4610
    // Clamp to bounds
4611
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4612
    vResult = _mm_max_ps(vResult,g_XMZero);
4613
    vResult = _mm_min_ps(vResult,g_XMOne);
4614
    // Scale by multiplication
4615
    vResult = _mm_mul_ps(vResult,ScaleUIcoN4);
4616
    // Adjust for unsigned entries
4617
    vResult = _mm_add_ps(vResult,AddUIcoN4);
4618
    // Convert to int
4619
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4620
    // Fix the signs on the unsigned entries
4621
    vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
4622
    // Mask off any fraction
4623
    vResulti = _mm_and_si128(vResulti,MaskUIcoN4);
4624
    // Shift y and z to straddle the 32-bit boundary
4625
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4626
    // Shift it into place
4627
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4628
    // i = x|y<<20|z<<40|w<<60
4629
    vResulti = _mm_or_si128(vResulti,vResulti2);
4630
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4631
#else // _XM_VMX128_INTRINSICS_
4632
#endif // _XM_VMX128_INTRINSICS_
4633
 
4634
    #undef XM_URange
4635
    #undef XM_URangeDiv2
4636
    #undef XM_UMaxXYZ
4637
    #undef XM_UMaxW
4638
    #undef XM_ScaleXYZ
4639
    #undef XM_ScaleW
4640
    #undef XM_Scale
4641
    #undef XM_Offset
4642
}
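
// Usage sketch (illustrative only; the helper name is hypothetical and not
// part of the library).  It round-trips a normalized vector through the
// 20:20:20:4 unsigned format; XMVectorSet and XMLoadUIcoN4 are declared
// elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreUIcoN4()
{
    XMUICON4 Packed;
    // x,y,z map to 20-bit unorm fields, w to a 4-bit unorm field
    XMStoreUIcoN4(&Packed, XMVectorSet(1.0f, 0.0f, 0.0f, 1.0f));
    // Packed.v == 0xF0000000000FFFFF; expand back to approximately (1,0,0,1)
    return XMLoadUIcoN4(&Packed);
}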
4643
 
4644
//------------------------------------------------------------------------------
4645
 
4646
XMFINLINE VOID XMStoreUIco4
4647
(
4648
    XMUICO4*  pDestination, 
4649
    FXMVECTOR  V
4650
)
4651
{
4652
    #define XM_Scale        (-1.0f / XM_PACK_FACTOR)
4653
    #define XM_URange       ((FLOAT)(1 << 20))
4654
    #define XM_URangeDiv2   ((FLOAT)(1 << 19))
4655
 
4656
#if defined(_XM_NO_INTRINSICS_)
4657
 
4658
    XMVECTOR               N;
4659
    static CONST XMVECTOR  Max = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};
4660
 
4661
    XMASSERT(pDestination);
4662
 
4663
    N = XMVectorClamp(V, XMVectorZero(), Max);
4664
    N = XMVectorRound(N);
4665
 
4666
    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
4667
                       (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4668
                       (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4669
                       (((UINT64)N.vector4_f32[0] & 0xFFFFF));
4670
 
4671
#elif defined(_XM_SSE_INTRINSICS_)
4672
    XMASSERT(pDestination);
4673
    // Note: Masks are x,w,y and z
4674
    static const XMVECTORF32 MaxUIco4 = { 1048575.0f, 15.0f, 1048575.0f, 1048575.0f};
4675
    static const XMVECTORF32 ScaleUIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
4676
    static const XMVECTORI32 MaskUIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
4677
    static const XMVECTORF32 AddUIco4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};
4678
    // Clamp to bounds
4679
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4680
    vResult = _mm_max_ps(vResult,g_XMZero);
4681
    vResult = _mm_min_ps(vResult,MaxUIco4);
4682
    // Scale by multiplication
4683
    vResult = _mm_mul_ps(vResult,ScaleUIco4);
4684
    vResult = _mm_add_ps(vResult,AddUIco4);
4685
    // Convert to int
4686
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4687
    vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
4688
    // Mask off any fraction
4689
    vResulti = _mm_and_si128(vResulti,MaskUIco4);
4690
    // Shift y and z to straddle the 32-bit boundary
4691
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4692
    // Shift it into place
4693
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4694
    // i = x|y<<20|z<<40|w<<60
4695
    vResulti = _mm_or_si128(vResulti,vResulti2);
4696
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4697
#else // _XM_VMX128_INTRINSICS_
4698
#endif // _XM_VMX128_INTRINSICS_
4699
 
4700
    #undef XM_Scale
4701
    #undef XM_URange
4702
    #undef XM_URangeDiv2
4703
}
4704
 
4705
//------------------------------------------------------------------------------
4706
 
4707
XMFINLINE VOID XMStoreIcoN4
4708
(
4709
    XMICON4*  pDestination, 
4710
    FXMVECTOR  V
4711
)
4712
{
4713
    #define XM_Scale    (-1.0f / XM_PACK_FACTOR)
4714
    #define XM_URange   ((FLOAT)(1 << 4))
4715
    #define XM_Offset   (3.0f)
4716
    #define XM_UMaxXYZ  ((FLOAT)((1 << (20 - 1)) - 1))
4717
    #define XM_UMaxW    ((FLOAT)((1 << (4 - 1)) - 1))
4718
 
4719
#if defined(_XM_NO_INTRINSICS_)
4720
 
4721
    XMVECTOR               N;
4722
    static CONST XMVECTORF32  Scale = {524287.0f, 524287.0f, 524287.0f, 7.0f};
4723
 
4724
    XMASSERT(pDestination);
4725
 
4726
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
4727
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMNegativeZero.v);
4728
    N = XMVectorRound(N);
4729
 
4730
    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
4731
                       (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4732
                       (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4733
                       (((UINT64)N.vector4_f32[0] & 0xFFFFF));
4734
 
4735
#elif defined(_XM_SSE_INTRINSICS_)
4736
    XMASSERT(pDestination);
4737
    // Note: Masks are x,w,y and z
4738
    static const XMVECTORF32 ScaleIcoN4 = {524287.0f,7.0f*4096.0f*65536.0f,524287.0f*4096.0f,524287.0f};
4739
    static const XMVECTORI32 MaskIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
4740
    // Clamp to bounds
4741
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4742
    vResult = _mm_max_ps(vResult,g_XMNegativeOne);
4743
    vResult = _mm_min_ps(vResult,g_XMOne);
4744
    // Scale by multiplication
4745
    vResult = _mm_mul_ps(vResult,ScaleIcoN4);
4746
    // Convert to int
4747
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4748
    // Mask off any fraction
4749
    vResulti = _mm_and_si128(vResulti,MaskIcoN4);
4750
    // Shift y and z to straddle the 32-bit boundary
4751
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4752
    // Shift it into place
4753
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4754
    // i = x|y<<20|z<<40|w<<60
4755
    vResulti = _mm_or_si128(vResulti,vResulti2);
4756
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4757
#else // _XM_VMX128_INTRINSICS_
4758
#endif // _XM_VMX128_INTRINSICS_
4759
 
4760
    #undef XM_Scale
4761
    #undef XM_URange
4762
    #undef XM_Offset
4763
    #undef XM_UMaxXYZ
4764
    #undef XM_UMaxW
4765
}
4766
 
4767
//------------------------------------------------------------------------------
4768
 
4769
XMFINLINE VOID XMStoreIco4
4770
(
4771
    XMICO4*  pDestination, 
4772
    FXMVECTOR V
4773
)
4774
{
4775
    #define XM_Scale    (-1.0f / XM_PACK_FACTOR)
4776
    #define XM_URange   ((FLOAT)(1 << 4))
4777
    #define XM_Offset   (3.0f)
4778
 
4779
#if defined(_XM_NO_INTRINSICS_)
4780
 
4781
    XMVECTOR               N;
4782
    static CONST XMVECTOR  Min = {-524287.0f, -524287.0f, -524287.0f, -7.0f};
4783
    static CONST XMVECTOR  Max = {524287.0f, 524287.0f, 524287.0f, 7.0f};
4784
 
4785
    XMASSERT(pDestination);
4786
 
4787
    N = XMVectorClamp(V, Min, Max);
4788
    N = XMVectorRound(N);
4789
 
4790
    pDestination->v = ((INT64)N.vector4_f32[3] << 60) |
4791
                       (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4792
                       (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4793
                       (((INT64)N.vector4_f32[0] & 0xFFFFF));
4794
 
4795
#elif defined(_XM_SSE_INTRINSICS_)
4796
    XMASSERT(pDestination);
4797
    // Note: Masks are x,w,y and z
4798
    static const XMVECTORF32 MinIco4 = {-524287.0f,-7.0f,-524287.0f,-524287.0f};
4799
    static const XMVECTORF32 MaxIco4 = { 524287.0f, 7.0f, 524287.0f, 524287.0f};
4800
    static const XMVECTORF32 ScaleIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
4801
    static const XMVECTORI32 MaskIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
4802
    // Clamp to bounds
4803
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4804
    vResult = _mm_max_ps(vResult,MinIco4);
4805
    vResult = _mm_min_ps(vResult,MaxIco4);
4806
    // Scale by multiplication
4807
    vResult = _mm_mul_ps(vResult,ScaleIco4);
4808
    // Convert to int
4809
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4810
    // Mask off any fraction
4811
    vResulti = _mm_and_si128(vResulti,MaskIco4);
4812
    // Shift y and z to straddle the 32-bit boundary
4813
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4814
    // Shift it into place
4815
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4816
    // i = x|y<<20|z<<40|w<<60
4817
    vResulti = _mm_or_si128(vResulti,vResulti2);
4818
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4819
#else // _XM_VMX128_INTRINSICS_
4820
#endif // _XM_VMX128_INTRINSICS_
4821
 
4822
    #undef XM_Scale
4823
    #undef XM_URange
4824
    #undef XM_Offset
4825
}
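
// Usage sketch (illustrative only; the helper name is hypothetical).
// XMStoreIco4 takes raw integer values rather than normalized ones: x,y,z are
// clamped to [-524287, 524287] and w to [-7, 7] before being packed into
// 20:20:20:4 signed fields; XMVectorSet and XMLoadIco4 are declared elsewhere
// in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreIco4()
{
    XMICO4 Packed;
    XMStoreIco4(&Packed, XMVectorSet(100.0f, -200.0f, 300.0f, 5.0f));
    // The fields hold 100, -200, 300 and 5; loading restores those values
    return XMLoadIco4(&Packed);
}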
4826
 
4827
//------------------------------------------------------------------------------
4828
 
4829
XMFINLINE VOID XMStoreXDecN4
4830
(
4831
    XMXDECN4* pDestination, 
4832
    FXMVECTOR  V
4833
)
4834
{
4835
#if defined(_XM_NO_INTRINSICS_)
4836
 
4837
    XMVECTOR               N;
4838
    static CONST XMVECTORF32  Min = {-1.0f, -1.0f, -1.0f, 0.0f};
4839
    static CONST XMVECTORF32  Scale = {511.0f, 511.0f, 511.0f, 3.0f};
4840
 
4841
    XMASSERT(pDestination);
4842
 
4843
    N = XMVectorClamp(V, Min.v, g_XMOne.v);
4844
    N = XMVectorMultiply(N, Scale.v);
4845
    N = XMVectorRound(N);
4846
 
4847
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
4848
                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
4849
                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
4850
                       (((INT)N.vector4_f32[0] & 0x3FF));
4851
 
4852
#elif defined(_XM_SSE_INTRINSICS_)
4853
    static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
4854
    static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f};
4855
    static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29};
4856
    XMASSERT(pDestination);
4857
    XMVECTOR vResult = _mm_max_ps(V,Min);
4858
    vResult = _mm_min_ps(vResult,g_XMOne);
4859
    // Scale by multiplication
4860
    vResult = _mm_mul_ps(vResult,Scale);
4861
    // Convert to int (W is unsigned)
4862
    __m128i vResulti = _mm_cvtps_epi32(vResult);
4863
    // Mask off any fraction
4864
    vResulti = _mm_and_si128(vResulti,ScaleMask);
4865
    // To fix W, add itself to shift it up to <<30 instead of <<29
4866
    __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW);
4867
    vResulti = _mm_add_epi32(vResulti,vResultw);
4868
    // Do a horizontal or of all 4 entries
4869
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
4870
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4871
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
4872
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4873
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
4874
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4875
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4876
#else // _XM_VMX128_INTRINSICS_
4877
#endif // _XM_VMX128_INTRINSICS_
4878
}
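
// Usage sketch (illustrative only; the helper name is hypothetical).  The
// signed-normalized 10:10:10:2 format is a common choice for packed vertex
// normals; XMVectorSet and XMLoadXDecN4 are declared elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreXDecN4()
{
    XMXDECN4 Packed;
    // A unit +Y normal: x,y,z in [-1,1] become 10-bit signed, w in [0,1] 2-bit
    XMStoreXDecN4(&Packed, XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f));
    // Packed.v == 511 << 10; expand back to approximately (0,1,0,0)
    return XMLoadXDecN4(&Packed);
}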
4879
 
4880
//------------------------------------------------------------------------------
4881
 
4882
XMFINLINE VOID XMStoreXDec4
4883
(
4884
    XMXDEC4* pDestination, 
4885
    FXMVECTOR  V
4886
)
4887
{
4888
#if defined(_XM_NO_INTRINSICS_)
4889
 
4890
    XMVECTOR               N;
4891
    static CONST XMVECTOR  Min = {-511.0f, -511.0f, -511.0f, 0.0f};
4892
    static CONST XMVECTOR  Max = {511.0f, 511.0f, 511.0f, 3.0f};
4893
 
4894
    XMASSERT(pDestination);
4895
 
4896
    N = XMVectorClamp(V, Min, Max);
4897
 
4898
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
4899
                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
4900
                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
4901
                       (((INT)N.vector4_f32[0] & 0x3FF));
4902
 
4903
#elif defined(_XM_SSE_INTRINSICS_)
4904
    XMASSERT(pDestination);
4905
    static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f};
4906
    static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f};
4907
    static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
4908
    static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
4909
    // Clamp to bounds
4910
    XMVECTOR vResult = _mm_max_ps(V,MinXDec4);
4911
    vResult = _mm_min_ps(vResult,MaxXDec4);
4912
    // Scale by multiplication
4913
    vResult = _mm_mul_ps(vResult,ScaleXDec4);
4914
    // Convert to int
4915
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4916
    // Mask off any fraction
4917
    vResulti = _mm_and_si128(vResulti,MaskXDec4);
4918
    // Do a horizontal or of 4 entries
4919
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
4920
    // x = x|z, y = y|w
4921
    vResulti = _mm_or_si128(vResulti,vResulti2);
4922
    // Move Z to the x position
4923
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
4924
    // Perform a single bit left shift on y|w
4925
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
4926
    // i = x|y|z|w
4927
    vResulti = _mm_or_si128(vResulti,vResulti2);
4928
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4929
#else // _XM_VMX128_INTRINSICS_
4930
#endif // _XM_VMX128_INTRINSICS_
4931
}
4932
 
4933
//------------------------------------------------------------------------------
4934
 
4935
XMFINLINE VOID XMStoreUDecN4
4936
(
4937
    XMUDECN4* pDestination, 
4938
    FXMVECTOR  V
4939
)
4940
{
4941
#if defined(_XM_NO_INTRINSICS_)
4942
 
4943
    XMVECTOR               N;
4944
    static CONST XMVECTORF32  Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f};
4945
 
4946
    XMASSERT(pDestination);
4947
 
4948
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
4949
    N = XMVectorMultiply(N, Scale.v);
4950
 
4951
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
4952
                       (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
4953
                       (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
4954
                       (((UINT)N.vector4_f32[0] & 0x3FF));
4955
 
4956
#elif defined(_XM_SSE_INTRINSICS_)
4957
    XMASSERT(pDestination);
4958
    static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f};
4959
    static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
4960
    // Clamp to bounds
4961
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4962
    vResult = _mm_min_ps(vResult,g_XMOne);
4963
    // Scale by multiplication
4964
    vResult = _mm_mul_ps(vResult,ScaleUDecN4);
4965
    // Convert to int
4966
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4967
    // Mask off any fraction
4968
    vResulti = _mm_and_si128(vResulti,MaskUDecN4);
4969
    // Do a horizontal or of 4 entries
4970
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
4971
    // x = x|z, y = y|w
4972
    vResulti = _mm_or_si128(vResulti,vResulti2);
4973
    // Move Z to the x position
4974
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
4975
    // Perform a left shift by one bit on y|w
4976
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
4977
    // i = x|y|z|w
4978
    vResulti = _mm_or_si128(vResulti,vResulti2);
4979
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4980
#else // _XM_VMX128_INTRINSICS_
4981
#endif // _XM_VMX128_INTRINSICS_
4982
}
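
// Usage sketch (illustrative only; the helper name is hypothetical).  The
// unsigned-normalized 10:10:10:2 layout is the same shape as the common
// R10G10B10A2 render-target packing; XMVectorSet and XMLoadUDecN4 are declared
// elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreUDecN4()
{
    XMUDECN4 Packed;
    XMStoreUDecN4(&Packed, XMVectorSet(1.0f, 0.0f, 1.0f, 1.0f));
    // Packed.v == 0xFFF003FF (w = 3, z = 1023, y = 0, x = 1023)
    return XMLoadUDecN4(&Packed);
}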
4983
 
4984
//------------------------------------------------------------------------------
4985
 
4986
XMFINLINE VOID XMStoreUDec4
4987
(
4988
    XMUDEC4* pDestination, 
4989
    FXMVECTOR  V
4990
)
4991
{
4992
#if defined(_XM_NO_INTRINSICS_)
4993
 
4994
    XMVECTOR               N;
4995
    static CONST XMVECTOR  Max = {1023.0f, 1023.0f, 1023.0f, 3.0f};
4996
 
4997
    XMASSERT(pDestination);
4998
 
4999
    N = XMVectorClamp(V, XMVectorZero(), Max);
5000
 
5001
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
5002
                       (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
5003
                       (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
5004
                       (((UINT)N.vector4_f32[0] & 0x3FF));
5005
 
5006
#elif defined(_XM_SSE_INTRINSICS_)
5007
	XMASSERT(pDestination);
5008
    static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f};
5009
    static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
5010
    static const XMVECTORI32 MaskUDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
5011
    // Clamp to bounds
5012
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5013
    vResult = _mm_min_ps(vResult,MaxUDec4);
5014
    // Scale by multiplication
5015
    vResult = _mm_mul_ps(vResult,ScaleUDec4);
5016
    // Convert to int
5017
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5018
    // Mask off any fraction
5019
    vResulti = _mm_and_si128(vResulti,MaskUDec4);
5020
    // Do a horizontal or of 4 entries
5021
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5022
    // x = x|z, y = y|w
5023
    vResulti = _mm_or_si128(vResulti,vResulti2);
5024
    // Move Z to the x position
5025
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5026
    // Perform a left shift by one bit on y|w
5027
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
5028
    // i = x|y|z|w
5029
    vResulti = _mm_or_si128(vResulti,vResulti2);
5030
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5031
#else // _XM_VMX128_INTRINSICS_
5032
#endif // _XM_VMX128_INTRINSICS_
5033
}
5034
 
5035
//------------------------------------------------------------------------------
5036
 
5037
XMFINLINE VOID XMStoreDecN4
5038
(
5039
    XMDECN4* pDestination, 
5040
    FXMVECTOR V
5041
)
5042
{
5043
#if defined(_XM_NO_INTRINSICS_)
5044
 
5045
    XMVECTOR               N;
5046
    static CONST XMVECTORF32  Scale = {511.0f, 511.0f, 511.0f, 1.0f};
5047
 
5048
    XMASSERT(pDestination);
5049
 
5050
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
5051
    N = XMVectorMultiply(N, Scale.v);
5052
 
5053
    pDestination->v = ((INT)N.vector4_f32[3] << 30) |
5054
                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
5055
                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
5056
                       (((INT)N.vector4_f32[0] & 0x3FF));
5057
 
5058
#elif defined(_XM_SSE_INTRINSICS_)
5059
    XMASSERT(pDestination);
5060
    static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f};
5061
    static const XMVECTORI32 MaskDecN4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
5062
    // Clamp to bounds
5063
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
5064
    vResult = _mm_min_ps(vResult,g_XMOne);
5065
    // Scale by multiplication
5066
    vResult = _mm_mul_ps(vResult,ScaleDecN4);
5067
    // Convert to int
5068
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5069
    // Mask off any fraction
5070
    vResulti = _mm_and_si128(vResulti,MaskDecN4);
5071
    // Do a horizontal or of 4 entries
5072
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5073
    // x = x|z, y = y|w
5074
    vResulti = _mm_or_si128(vResulti,vResulti2);
5075
    // Move Z to the x position
5076
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5077
    // i = x|y|z|w
5078
    vResulti = _mm_or_si128(vResulti,vResulti2);
5079
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5080
#else // _XM_VMX128_INTRINSICS_
5081
#endif // _XM_VMX128_INTRINSICS_
5082
}
5083
 
5084
//------------------------------------------------------------------------------
5085
 
5086
XMFINLINE VOID XMStoreDec4
5087
(
5088
    XMDEC4*  pDestination, 
5089
    FXMVECTOR V
5090
)
5091
{
5092
#if defined(_XM_NO_INTRINSICS_)
5093
 
5094
    XMVECTOR               N;
5095
    static CONST XMVECTOR  Min = {-511.0f, -511.0f, -511.0f, -1.0f};
5096
    static CONST XMVECTOR  Max = {511.0f, 511.0f, 511.0f, 1.0f};
5097
 
5098
    XMASSERT(pDestination);
5099
 
5100
    N = XMVectorClamp(V, Min, Max);
5101
 
5102
    pDestination->v = ((INT)N.vector4_f32[3] << 30) |
5103
                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
5104
                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
5105
                       (((INT)N.vector4_f32[0] & 0x3FF));
5106
 
5107
#elif defined(_XM_SSE_INTRINSICS_)
5108
    XMASSERT(pDestination);
5109
    static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f};
5110
    static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f};
5111
    static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f};
5112
    static const XMVECTORI32 MaskDec4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
5113
    // Clamp to bounds
5114
    XMVECTOR vResult = _mm_max_ps(V,MinDec4);
5115
    vResult = _mm_min_ps(vResult,MaxDec4);
5116
    // Scale by multiplication
5117
    vResult = _mm_mul_ps(vResult,ScaleDec4);
5118
    // Convert to int
5119
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5120
    // Mask off any fraction
5121
    vResulti = _mm_and_si128(vResulti,MaskDec4);
5122
    // Do a horizontal or of 4 entries
5123
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5124
    // x = x|z, y = y|w
5125
    vResulti = _mm_or_si128(vResulti,vResulti2);
5126
    // Move Z to the x position
5127
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5128
    // i = x|y|z|w
5129
    vResulti = _mm_or_si128(vResulti,vResulti2);
5130
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5131
#else // _XM_VMX128_INTRINSICS_
5132
#endif // _XM_VMX128_INTRINSICS_
5133
}
5134
 
5135
//------------------------------------------------------------------------------
5136
 
5137
XMFINLINE VOID XMStoreUByteN4
5138
(
5139
    XMUBYTEN4* pDestination, 
5140
    FXMVECTOR V
5141
)
5142
{
5143
#if defined(_XM_NO_INTRINSICS_)
5144
 
5145
    XMVECTOR               N;
5146
    static CONST XMVECTORF32  Scale = {255.0f, 255.0f, 255.0f, 255.0f};
5147
 
5148
    XMASSERT(pDestination);
5149
 
5150
    N = XMVectorSaturate(V);
5151
    N = XMVectorMultiply(N, Scale.v);
5152
    N = XMVectorRound(N);
5153
 
5154
    pDestination->x = (BYTE)N.vector4_f32[0];
5155
    pDestination->y = (BYTE)N.vector4_f32[1];
5156
    pDestination->z = (BYTE)N.vector4_f32[2];
5157
    pDestination->w = (BYTE)N.vector4_f32[3];
5158
 
5159
#elif defined(_XM_SSE_INTRINSICS_)
5160
    XMASSERT(pDestination);
5161
    static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f};
5162
    static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
5163
    // Clamp to bounds
5164
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5165
    vResult = _mm_min_ps(vResult,g_XMOne);
5166
    // Scale by multiplication
5167
    vResult = _mm_mul_ps(vResult,ScaleUByteN4);
5168
    // Convert to int
5169
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5170
    // Mask off any fraction
5171
    vResulti = _mm_and_si128(vResulti,MaskUByteN4);
5172
    // Do a horizontal or of 4 entries
5173
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5174
    // x = x|z, y = y|w
5175
    vResulti = _mm_or_si128(vResulti,vResulti2);
5176
    // Move Z to the x position
5177
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5178
    // Perform a single bit left shift to fix y|w 
5179
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
5180
    // i = x|y|z|w
5181
    vResulti = _mm_or_si128(vResulti,vResulti2);
5182
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5183
#else // _XM_VMX128_INTRINSICS_
5184
#endif // _XM_VMX128_INTRINSICS_
5185
}
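
// Usage sketch (illustrative only; the helper name is hypothetical).  Each
// component is saturated to [0,1] and scaled to an 8-bit unorm, the usual
// per-vertex RGBA color layout; XMVectorSet and XMLoadUByteN4 are declared
// elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreUByteN4()
{
    XMUBYTEN4 Packed;
    XMStoreUByteN4(&Packed, XMVectorSet(1.0f, 0.0f, 0.0f, 1.0f));
    // Packed.x == 255, Packed.y == 0, Packed.z == 0, Packed.w == 255
    return XMLoadUByteN4(&Packed);
}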
5186
 
5187
//------------------------------------------------------------------------------
5188
 
5189
XMFINLINE VOID XMStoreUByte4
5190
(
5191
    XMUBYTE4* pDestination, 
5192
    FXMVECTOR  V
5193
)
5194
{
5195
#if defined(_XM_NO_INTRINSICS_)
5196
 
5197
    XMVECTOR               N;
5198
    static CONST XMVECTOR  Max = {255.0f, 255.0f, 255.0f, 255.0f};
5199
 
5200
    XMASSERT(pDestination);
5201
 
5202
    N = XMVectorClamp(V, XMVectorZero(), Max);
5203
    N = XMVectorRound(N);
5204
 
5205
    pDestination->x = (BYTE)N.vector4_f32[0];
5206
    pDestination->y = (BYTE)N.vector4_f32[1];
5207
    pDestination->z = (BYTE)N.vector4_f32[2];
5208
    pDestination->w = (BYTE)N.vector4_f32[3];
5209
 
5210
#elif defined(_XM_SSE_INTRINSICS_)
5211
    XMASSERT(pDestination);
5212
    static const XMVECTORF32 MaxUByte4 = { 255.0f, 255.0f, 255.0f, 255.0f};
5213
    static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f};
5214
    static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
5215
    // Clamp to bounds
5216
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5217
    vResult = _mm_min_ps(vResult,MaxUByte4);
5218
    // Scale by multiplication
5219
    vResult = _mm_mul_ps(vResult,ScaleUByte4);
5220
    // Convert to int
5221
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5222
    // Mask off any fraction
5223
    vResulti = _mm_and_si128(vResulti,MaskUByte4);
5224
    // Do a horizontal or of 4 entries
5225
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5226
    // x = x|z, y = y|w
5227
    vResulti = _mm_or_si128(vResulti,vResulti2);
5228
    // Move Z to the x position
5229
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5230
    // Perform a single bit left shift to fix y|w 
5231
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
5232
    // i = x|y|z|w
5233
    vResulti = _mm_or_si128(vResulti,vResulti2);
5234
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5235
#else // _XM_VMX128_INTRINSICS_
5236
#endif // _XM_VMX128_INTRINSICS_
5237
}
5238
 
5239
//------------------------------------------------------------------------------
5240
 
5241
XMFINLINE VOID XMStoreByteN4
5242
(
5243
    XMBYTEN4* pDestination, 
5244
    FXMVECTOR  V
5245
)
5246
{
5247
#if defined(_XM_NO_INTRINSICS_)
5248
 
5249
    XMVECTOR               N;
5250
    static CONST XMVECTORF32  Scale = {127.0f, 127.0f, 127.0f, 127.0f};
5251
 
5252
    XMASSERT(pDestination);
5253
 
5254
    N = XMVectorMultiply(V, Scale.v);
5255
    N = XMVectorRound(N);
5256
 
5257
    pDestination->x = (CHAR)N.vector4_f32[0];
5258
    pDestination->y = (CHAR)N.vector4_f32[1];
5259
    pDestination->z = (CHAR)N.vector4_f32[2];
5260
    pDestination->w = (CHAR)N.vector4_f32[3];
5261
 
5262
#elif defined(_XM_SSE_INTRINSICS_)
5263
	XMASSERT(pDestination);
5264
    static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f};
5265
    static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
5266
    // Clamp to bounds
5267
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
5268
    vResult = _mm_min_ps(vResult,g_XMOne);
5269
    // Scale by multiplication
5270
    vResult = _mm_mul_ps(vResult,ScaleByteN4);
5271
    // Convert to int
5272
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5273
    // Mask off any fraction
5274
    vResulti = _mm_and_si128(vResulti,MaskByteN4);
5275
    // Do a horizontal or of 4 entries
5276
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5277
    // x = x|z, y = y|w
5278
    vResulti = _mm_or_si128(vResulti,vResulti2);
5279
    // Move Z to the x position
5280
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5281
    // i = x|y|z|w
5282
    vResulti = _mm_or_si128(vResulti,vResulti2);
5283
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5284
#else // _XM_VMX128_INTRINSICS_
5285
#endif // _XM_VMX128_INTRINSICS_
5286
}
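
// Usage sketch (illustrative only; the helper name is hypothetical).
// Components are scaled by 127 into signed bytes, so exactly representable
// inputs such as -1, 0 and 1 round-trip without error; XMVectorSet and
// XMLoadByteN4 are declared elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreByteN4()
{
    XMBYTEN4 Packed;
    XMStoreByteN4(&Packed, XMVectorSet(1.0f, -1.0f, 0.0f, 1.0f));
    // Packed.x == 127, Packed.y == -127, Packed.z == 0, Packed.w == 127
    return XMLoadByteN4(&Packed);
}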
5287
 
5288
//------------------------------------------------------------------------------
5289
 
5290
XMFINLINE VOID XMStoreByte4
5291
(
5292
    XMBYTE4*  pDestination, 
5293
    FXMVECTOR  V
5294
)
5295
{
5296
#if defined(_XM_NO_INTRINSICS_)
5297
 
5298
    XMVECTOR               N;
5299
    static CONST XMVECTOR  Min = {-127.0f, -127.0f, -127.0f, -127.0f};
5300
    static CONST XMVECTOR  Max = {127.0f, 127.0f, 127.0f, 127.0f};
5301
 
5302
    XMASSERT(pDestination);
5303
 
5304
    N = XMVectorClamp(V, Min, Max);
5305
    N = XMVectorRound(N);
5306
 
5307
    pDestination->x = (CHAR)N.vector4_f32[0];
5308
    pDestination->y = (CHAR)N.vector4_f32[1];
5309
    pDestination->z = (CHAR)N.vector4_f32[2];
5310
    pDestination->w = (CHAR)N.vector4_f32[3];
5311
 
5312
#elif defined(_XM_SSE_INTRINSICS_)
5313
	XMASSERT(pDestination);
5314
    static const XMVECTORF32 MinByte4 = {-127.0f,-127.0f,-127.0f,-127.0f};
5315
    static const XMVECTORF32 MaxByte4 = { 127.0f, 127.0f, 127.0f, 127.0f};
5316
    static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f};
5317
    static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
5318
    // Clamp to bounds
5319
    XMVECTOR vResult = _mm_max_ps(V,MinByte4);
5320
    vResult = _mm_min_ps(vResult,MaxByte4);
5321
    // Scale by multiplication
5322
    vResult = _mm_mul_ps(vResult,ScaleByte4);
5323
    // Convert to int
5324
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5325
    // Mask off any fraction
5326
    vResulti = _mm_and_si128(vResulti,MaskByte4);
5327
    // Do a horizontal or of 4 entries
5328
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5329
    // x = x|z, y = y|w
5330
    vResulti = _mm_or_si128(vResulti,vResulti2);
5331
    // Move Z to the x position
5332
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5333
    // i = x|y|z|w
5334
    vResulti = _mm_or_si128(vResulti,vResulti2);
5335
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5336
#else // _XM_VMX128_INTRINSICS_
5337
#endif // _XM_VMX128_INTRINSICS_
5338
}
5339
 
5340
//------------------------------------------------------------------------------
5341
 
5342
XMFINLINE VOID XMStoreUNibble4
5343
(
5344
     XMUNIBBLE4* pDestination,
5345
     FXMVECTOR V
5346
)
5347
{
5348
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
5349
    XMASSERT(pDestination);
5350
    static CONST XMVECTORF32  Max = {15.0f,15.0f,15.0f,15.0f};
5351
    // Bounds check
5352
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5353
    vResult = _mm_min_ps(vResult,Max);
5354
     // Convert to int with rounding
5355
    __m128i vInt = _mm_cvtps_epi32(vResult);
5356
    // No SSE operations will write to 16-bit values, so we have to extract them manually
5357
    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
5358
    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
5359
    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
5360
    USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
5361
    pDestination->v = ((w & 0xF) << 12) |
5362
                      ((z & 0xF) << 8) |
5363
                      ((y & 0xF) << 4) |
5364
                      ((x & 0xF));
5365
#else
5366
    XMVECTOR               N;
5367
    static CONST XMVECTORF32  Max = {15.0f,15.0f,15.0f,15.0f};
5368
 
5369
    XMASSERT(pDestination);
5370
 
5371
    N = XMVectorClamp(V, XMVectorZero(), Max.v);
5372
    N = XMVectorRound(N);
5373
 
5374
    pDestination->v = (((USHORT)N.vector4_f32[3] & 0xF) << 12) |
5375
                      (((USHORT)N.vector4_f32[2] & 0xF) << 8) |
5376
                      (((USHORT)N.vector4_f32[1] & 0xF) << 4) |
5377
                      (((USHORT)N.vector4_f32[0] & 0xF));
5378
#endif // !_XM_SSE_INTRINSICS_
5379
}
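
// Usage sketch (illustrative only; the helper name is hypothetical).
// XMStoreUNibble4 expects raw integer values in [0,15] per component and packs
// them four bits each, w in the high nibble down to x in the low one;
// XMVectorSet and XMLoadUNibble4 are declared elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreUNibble4()
{
    XMUNIBBLE4 Packed;
    XMStoreUNibble4(&Packed, XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f));
    // Packed.v == 0x4321
    return XMLoadUNibble4(&Packed);
}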
5380
 
5381
//------------------------------------------------------------------------------
5382
 
5383
XMFINLINE VOID XMStoreU555(
5384
     XMU555* pDestination,
5385
     FXMVECTOR V
5386
)
5387
{
5388
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
5389
    XMASSERT(pDestination);
5390
    static CONST XMVECTORF32  Max = {31.0f, 31.0f, 31.0f, 1.0f};
5391
    // Bounds check
5392
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5393
    vResult = _mm_min_ps(vResult,Max);
5394
     // Convert to int with rounding
5395
    __m128i vInt = _mm_cvtps_epi32(vResult);
5396
    // No SSE operations will write to 16-bit values, so we have to extract them manually
5397
    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
5398
    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
5399
    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
5400
    USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
5401
    pDestination->v = ((w) ? 0x8000 : 0) |
5402
                      ((z & 0x1F) << 10) |
5403
                      ((y & 0x1F) << 5) |
5404
                      ((x & 0x1F));
5405
#else
5406
    XMVECTOR               N;
5407
    static CONST XMVECTORF32  Max = {31.0f, 31.0f, 31.0f, 1.0f};
5408
 
5409
    XMASSERT(pDestination);
5410
 
5411
    N = XMVectorClamp(V, XMVectorZero(), Max.v);
5412
    N = XMVectorRound(N);
5413
 
5414
    pDestination->v = ((N.vector4_f32[3] > 0.f) ? 0x8000 : 0) |
5415
                      (((USHORT)N.vector4_f32[2] & 0x1F) << 10) |
5416
                      (((USHORT)N.vector4_f32[1] & 0x1F) << 5) |
5417
                      (((USHORT)N.vector4_f32[0] & 0x1F));
5418
#endif // !_XM_SSE_INTRINSICS_
5419
}
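
// Usage sketch (illustrative only; the helper name is hypothetical).
// XMStoreU555 also takes raw integer values: x,y,z in [0,31] fill the three
// 5-bit fields and a nonzero w sets the high bit; XMVectorSet and XMLoadU555
// are declared elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreU555()
{
    XMU555 Packed;
    XMStoreU555(&Packed, XMVectorSet(1.0f, 2.0f, 3.0f, 1.0f));
    // Packed.v == 0x8C41 (w bit set, z = 3, y = 2, x = 1)
    return XMLoadU555(&Packed);
}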
5420
 
5421
//------------------------------------------------------------------------------
5422
 
5423
XMFINLINE VOID XMStoreColor
5424
(
5425
    XMCOLOR* pDestination, 
5426
    FXMVECTOR V
5427
)
5428
{
5429
#if defined(_XM_NO_INTRINSICS_)
5430
 
5431
    XMVECTOR               N;
5432
    static CONST XMVECTORF32  Scale = {255.0f, 255.0f, 255.0f, 255.0f};
5433
 
5434
    XMASSERT(pDestination);
5435
 
5436
    N = XMVectorSaturate(V);
5437
    N = XMVectorMultiply(N, Scale.v);
5438
    N = XMVectorRound(N);
5439
 
5440
    pDestination->c = ((UINT)N.vector4_f32[3] << 24) |
5441
                      ((UINT)N.vector4_f32[0] << 16) |
5442
                      ((UINT)N.vector4_f32[1] <<  8) |
5443
                      ((UINT)N.vector4_f32[2]);
5444
 
5445
#elif defined(_XM_SSE_INTRINSICS_)
5446
    XMASSERT(pDestination);
5447
    static CONST XMVECTORF32  Scale = {255.0f,255.0f,255.0f,255.0f};
5448
    // Set <0 to 0
5449
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5450
    // Set >1 to 1
5451
    vResult = _mm_min_ps(vResult,g_XMOne);
5452
    // Convert to 0-255
5453
    vResult = _mm_mul_ps(vResult,Scale);
5454
    // Shuffle RGBA to ARGB
5455
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,1,0,3));
5456
    // Convert to int 
5457
    __m128i vInt = _mm_cvtps_epi32(vResult);
5458
    // Mash to shorts
5459
    vInt = _mm_packs_epi32(vInt,vInt);
5460
    // Mash to bytes
5461
    vInt = _mm_packs_epi16(vInt,vInt);
5462
    // Store the color
5463
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]);
5464
#else // _XM_VMX128_INTRINSICS_
5465
#endif // _XM_VMX128_INTRINSICS_
5466
}
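
// Usage sketch (illustrative only; the helper name is hypothetical).
// XMStoreColor saturates an R,G,B,A vector and packs it into a 32-bit
// A8R8G8B8 value; XMVectorSet and XMLoadColor are declared elsewhere in
// XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreColor()
{
    XMCOLOR Packed;
    XMStoreColor(&Packed, XMVectorSet(1.0f, 0.0f, 0.0f, 1.0f)); // opaque red
    // Packed.c == 0xFFFF0000
    return XMLoadColor(&Packed);
}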
5467
 
5468
//------------------------------------------------------------------------------
5469
 
5470
XMFINLINE VOID XMStoreFloat3x3
5471
(
5472
    XMFLOAT3X3*	pDestination, 
5473
    CXMMATRIX	M
5474
)
5475
{
5476
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
5477
 
5478
    XMStoreFloat3x3NC(pDestination, M);
5479
 
5480
#else // _XM_VMX128_INTRINSICS_
5481
#endif // _XM_VMX128_INTRINSICS_
5482
}
5483
 
5484
//------------------------------------------------------------------------------
5485
 
5486
XMFINLINE VOID XMStoreFloat3x3NC
5487
(
5488
    XMFLOAT3X3* pDestination, 
5489
    CXMMATRIX M
5490
)
5491
{
5492
#if defined(_XM_NO_INTRINSICS_)
5493
 
5494
    XMASSERT(pDestination);
5495
 
5496
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
5497
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
5498
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
5499
 
5500
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
5501
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
5502
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
5503
 
5504
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
5505
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
5506
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
5507
 
5508
#elif defined(_XM_SSE_INTRINSICS_)
5509
	XMASSERT(pDestination);
5510
    XMVECTOR vTemp1 = M.r[0];
5511
    XMVECTOR vTemp2 = M.r[1];
5512
    XMVECTOR vTemp3 = M.r[2];
5513
    XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
5514
    vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
5515
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
5516
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
5517
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
5518
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp3,_MM_SHUFFLE(2,2,2,2));
5519
    _mm_store_ss(&pDestination->m[2][2],vTemp3);
5520
#else // _XM_VMX128_INTRINSICS_
5521
#endif // _XM_VMX128_INTRINSICS_
5522
}
5523
 
5524
//------------------------------------------------------------------------------
5525
 
5526
XMFINLINE VOID XMStoreFloat4x3
5527
(
5528
    XMFLOAT4X3* pDestination, 
5529
    CXMMATRIX M
5530
)
5531
{
5532
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
5533
 
5534
    XMStoreFloat4x3NC(pDestination, M);
5535
 
5536
#else // _XM_VMX128_INTRINSICS_
5537
#endif // _XM_VMX128_INTRINSICS_
5538
}
5539
 
5540
//------------------------------------------------------------------------------
5541
 
5542
XMFINLINE VOID XMStoreFloat4x3A
5543
(
5544
    XMFLOAT4X3A*	pDestination, 
5545
    CXMMATRIX		M
5546
)
5547
{
5548
#if defined(_XM_NO_INTRINSICS_)
5549
 
5550
    XMASSERT(pDestination);
5551
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
5552
 
5553
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
5554
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
5555
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
5556
 
5557
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
5558
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
5559
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
5560
 
5561
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
5562
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
5563
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
5564
 
5565
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
5566
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
5567
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
5568
 
5569
#elif defined(_XM_SSE_INTRINSICS_)
5570
	XMASSERT(pDestination);
5571
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
5572
    // x1,y1,z1,w1
5573
    XMVECTOR vTemp1 = M.r[0];
5574
    // x2,y2,z2,w2
5575
    XMVECTOR vTemp2 = M.r[1];
5576
    // x3,y3,z3,w3
5577
    XMVECTOR vTemp3 = M.r[2];
5578
    // x4,y4,z4,w4
5579
    XMVECTOR vTemp4 = M.r[3];
5580
    // z1,z1,x2,y2
5581
    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
5582
    // y2,z2,x3,y3 (Final)
5583
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
5584
    // x1,y1,z1,x2 (Final)
5585
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
5586
    // z3,z3,x4,x4
5587
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
5588
    // z3,x4,y4,z4 (Final)
5589
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
5590
    // Store in 3 operations
5591
    _mm_store_ps(&pDestination->m[0][0],vTemp1);
5592
    _mm_store_ps(&pDestination->m[1][1],vTemp2);
5593
    _mm_store_ps(&pDestination->m[2][2],vTemp3);
5594
#else // _XM_VMX128_INTRINSICS_
5595
#endif // _XM_VMX128_INTRINSICS_
5596
}
5597
 
5598
//------------------------------------------------------------------------------
5599
 
5600
XMFINLINE VOID XMStoreFloat4x3NC
5601
(
5602
    XMFLOAT4X3* pDestination, 
5603
    CXMMATRIX M
5604
)
5605
{
5606
#if defined(_XM_NO_INTRINSICS_)
5607
 
5608
    XMASSERT(pDestination);
5609
 
5610
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
5611
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
5612
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
5613
 
5614
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
5615
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
5616
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
5617
 
5618
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
5619
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
5620
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
5621
 
5622
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
5623
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
5624
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
5625
 
5626
#elif defined(_XM_SSE_INTRINSICS_)
5627
	XMASSERT(pDestination);
5628
    XMVECTOR vTemp1 = M.r[0];
5629
    XMVECTOR vTemp2 = M.r[1];
5630
    XMVECTOR vTemp3 = M.r[2];
5631
    XMVECTOR vTemp4 = M.r[3];
5632
    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
5633
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
5634
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
5635
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
5636
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
5637
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
5638
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
5639
    _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
5640
#else // _XM_VMX128_INTRINSICS_
5641
#endif // _XM_VMX128_INTRINSICS_
5642
}
5643
 
5644
//------------------------------------------------------------------------------
5645
 
5646
XMFINLINE VOID XMStoreFloat4x4
5647
(
5648
    XMFLOAT4X4* pDestination, 
5649
    CXMMATRIX M
5650
)
5651
{
5652
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
5653
 
5654
    XMStoreFloat4x4NC(pDestination, M);
5655
 
5656
#elif defined(_XM_SSE_INTRINSICS_)
5657
	XMASSERT(pDestination);
5658
 
5659
	_mm_storeu_ps( &pDestination->_11, M.r[0] );
5660
	_mm_storeu_ps( &pDestination->_21, M.r[1] );
5661
	_mm_storeu_ps( &pDestination->_31, M.r[2] );
5662
	_mm_storeu_ps( &pDestination->_41, M.r[3] );
5663
#else // _XM_VMX128_INTRINSICS_
5664
#endif // _XM_VMX128_INTRINSICS_
5665
}
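
// Usage sketch (illustrative only; the helper name is hypothetical).
// XMStoreFloat4x4 writes a full 4x4 matrix to memory that need not be 16-byte
// aligned (the SSE path above uses unaligned stores); XMMatrixIdentity and
// XMLoadFloat4x4 are declared elsewhere in XNA math.
XMFINLINE XMMATRIX ExampleUsage_StoreFloat4x4(XMFLOAT4X4* pOut)
{
    XMStoreFloat4x4(pOut, XMMatrixIdentity()); // 16 floats written row by row
    return XMLoadFloat4x4(pOut);               // reload for further math
}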
5666
 
5667
//------------------------------------------------------------------------------
5668
 
5669
XMFINLINE VOID XMStoreFloat4x4A
5670
(
5671
    XMFLOAT4X4A*	pDestination, 
5672
    CXMMATRIX		M
5673
)
5674
{
5675
#if defined(_XM_NO_INTRINSICS_)
5676
 
5677
    XMASSERT(pDestination);
5678
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
5679
 
5680
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
5681
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
5682
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
5683
    pDestination->m[0][3] = M.r[0].vector4_f32[3];
5684
 
5685
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
5686
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
5687
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
5688
    pDestination->m[1][3] = M.r[1].vector4_f32[3];
5689
 
5690
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
5691
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
5692
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
5693
    pDestination->m[2][3] = M.r[2].vector4_f32[3];
5694
 
5695
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
5696
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
5697
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
5698
    pDestination->m[3][3] = M.r[3].vector4_f32[3];
5699
 
5700
#elif defined(_XM_SSE_INTRINSICS_)
5701
	XMASSERT(pDestination);
5702
 
5703
	_mm_store_ps( &pDestination->_11, M.r[0] );
5704
	_mm_store_ps( &pDestination->_21, M.r[1] );
5705
	_mm_store_ps( &pDestination->_31, M.r[2] );
5706
	_mm_store_ps( &pDestination->_41, M.r[3] );
5707
#else // _XM_VMX128_INTRINSICS_
5708
#endif // _XM_VMX128_INTRINSICS_
5709
}
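
// Usage sketch (illustrative only; the helper name is hypothetical).  The "A"
// variant requires a 16-byte aligned destination, which the XMFLOAT4X4A type
// guarantees by declaration, so the SSE path can use aligned stores;
// XMMatrixIdentity and XMLoadFloat4x4A are declared elsewhere in XNA math.
XMFINLINE XMMATRIX ExampleUsage_StoreFloat4x4A()
{
    XMFLOAT4X4A Aligned;                       // aligned by its declaration
    XMStoreFloat4x4A(&Aligned, XMMatrixIdentity());
    return XMLoadFloat4x4A(&Aligned);
}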
5710
 
5711
//------------------------------------------------------------------------------
5712
 
5713
XMFINLINE VOID XMStoreFloat4x4NC
5714
(
5715
    XMFLOAT4X4* pDestination, 
5716
    CXMMATRIX M
5717
)
5718
{
5719
#if defined(_XM_NO_INTRINSICS_)
5720
 
5721
    XMASSERT(pDestination);
5722
 
5723
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
5724
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
5725
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
5726
    pDestination->m[0][3] = M.r[0].vector4_f32[3];
5727
 
5728
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
5729
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
5730
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
5731
    pDestination->m[1][3] = M.r[1].vector4_f32[3];
5732
 
5733
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
5734
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
5735
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
5736
    pDestination->m[2][3] = M.r[2].vector4_f32[3];
5737
 
5738
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
5739
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
5740
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
5741
    pDestination->m[3][3] = M.r[3].vector4_f32[3];
5742
 
5743
#elif defined(_XM_SSE_INTRINSICS_)
5744
    XMASSERT(pDestination);
5745
    _mm_storeu_ps(&pDestination->m[0][0],M.r[0]);
5746
    _mm_storeu_ps(&pDestination->m[1][0],M.r[1]);
5747
    _mm_storeu_ps(&pDestination->m[2][0],M.r[2]);
5748
    _mm_storeu_ps(&pDestination->m[3][0],M.r[3]);
5749
#else // _XM_VMX128_INTRINSICS_
5750
#endif // _XM_VMX128_INTRINSICS_
5751
}
5752
 
5753
#endif // __XNAMATHCONVERT_INL__
5754