/*++

Copyright (c) Microsoft Corporation. All rights reserved.

Module Name:

    xnamathconvert.inl

Abstract:

    XNA math library for Windows and Xbox 360: Conversion, loading, and storing functions.
--*/

#if defined(_MSC_VER) && (_MSC_VER > 1000)
#pragma once
#endif

#ifndef __XNAMATHCONVERT_INL__
#define __XNAMATHCONVERT_INL__

#define XM_PACK_FACTOR                  (FLOAT)(1 << 22)
#define XM_UNPACK_FACTOR_UNSIGNED       (FLOAT)(1 << 23)
#define XM_UNPACK_FACTOR_SIGNED         XM_PACK_FACTOR

#define XM_UNPACK_UNSIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
                                        {-XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
                                         -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
                                         -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
                                         -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}

#define XM_UNPACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
                                        {XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
                                         XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
                                         XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
                                         XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}

#define XM_UNPACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
                                        {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1), \
                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1), \
                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1), \
                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1)}

//#define XM_UNPACK_SIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
//                                        {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1) * 3.0f, \
//                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1) * 3.0f, \
//                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1) * 3.0f, \
//                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1) * 3.0f}

#define XM_PACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
                                        {-(FLOAT)((1 << (BitsX)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << (BitsY)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << (BitsZ)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << (BitsW)) - 1) / XM_PACK_FACTOR}

#define XM_PACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
                                        {-(FLOAT)((1 << ((BitsX) - 1)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << ((BitsY) - 1)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << ((BitsZ) - 1)) - 1) / XM_PACK_FACTOR, \
                                         -(FLOAT)((1 << ((BitsW) - 1)) - 1) / XM_PACK_FACTOR}

#define XM_PACK_OFFSET                  XMVectorSplatConstant(3, 0)
//#define XM_UNPACK_OFFSET                XM_PACK_OFFSET

/****************************************************************************
 *
 * Data conversion
 *
 ****************************************************************************/

//------------------------------------------------------------------------------

XMFINLINE FLOAT XMConvertHalfToFloat
(
    HALF Value
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    UINT Mantissa;
    UINT Exponent;
    UINT Result;

    Mantissa = (UINT)(Value & 0x03FF);

    if ((Value & 0x7C00) != 0)  // The value is normalized
    {
        Exponent = (UINT)((Value >> 10) & 0x1F);
    }
    else if (Mantissa != 0)     // The value is denormalized
    {
        // Normalize the value in the resulting float
        Exponent = 1;

        do
        {
            Exponent--;
            Mantissa <<= 1;
        } while ((Mantissa & 0x0400) == 0);

        Mantissa &= 0x03FF;
    }
    else                        // The value is zero
    {
        Exponent = (UINT)-112;
    }

    Result = ((Value & 0x8000) << 16) | // Sign
             ((Exponent + 112) << 23) | // Exponent
             (Mantissa << 13);          // Mantissa

    return *(FLOAT*)&Result;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif
}
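
// Illustrative usage sketch (not part of the original XNA Math sources): the
// function maps a raw 16-bit half-precision bit pattern to a 32-bit float, so
// a round trip through XMConvertFloatToHalf (defined below) recovers the
// original encoding for representable values:
//
//     HALF  h = 0x3C00;                       // 1.0 encoded as an IEEE 754 half
//     FLOAT f = XMConvertHalfToFloat(h);      // f == 1.0f
//     HALF  r = XMConvertFloatToHalf(f);      // r == 0x3C00 again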

//------------------------------------------------------------------------------

XMINLINE FLOAT* XMConvertHalfToFloatStream
(
    FLOAT*      pOutputStream,
    UINT        OutputStride,
    CONST HALF* pInputStream,
    UINT        InputStride,
    UINT        HalfCount
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    UINT  i;
    BYTE* pHalf = (BYTE*)pInputStream;
    BYTE* pFloat = (BYTE*)pOutputStream;

    XMASSERT(pOutputStream);
    XMASSERT(pInputStream);

    for (i = 0; i < HalfCount; i++)
    {
        *(FLOAT*)pFloat = XMConvertHalfToFloat(*(HALF*)pHalf);
        pHalf += InputStride;
        pFloat += OutputStride;
    }

    return pOutputStream;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE HALF XMConvertFloatToHalf
(
    FLOAT Value
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
    UINT Result;

    UINT IValue = ((UINT *)(&Value))[0];
    UINT Sign = (IValue & 0x80000000U) >> 16U;
    IValue = IValue & 0x7FFFFFFFU;      // Hack off the sign

    if (IValue > 0x47FFEFFFU)
    {
        // The number is too large to be represented as a half.  Saturate to infinity.
        Result = 0x7FFFU;
    }
    else
    {
        if (IValue < 0x38800000U)
        {
            // The number is too small to be represented as a normalized half.
            // Convert it to a denormalized value.
            UINT Shift = 113U - (IValue >> 23U);
            IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
        }
        else
        {
            // Rebias the exponent to represent the value as a normalized half.
            IValue += 0xC8000000U;
        }

        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
    }
    return (HALF)(Result|Sign);

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif
}

//------------------------------------------------------------------------------

XMINLINE HALF* XMConvertFloatToHalfStream
(
    HALF*        pOutputStream,
    UINT         OutputStride,
    CONST FLOAT* pInputStream,
    UINT         InputStride,
    UINT         FloatCount
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)

    UINT  i;
    BYTE* pFloat = (BYTE*)pInputStream;
    BYTE* pHalf = (BYTE*)pOutputStream;

    XMASSERT(pOutputStream);
    XMASSERT(pInputStream);

    for (i = 0; i < FloatCount; i++)
    {
        *(HALF*)pHalf = XMConvertFloatToHalf(*(FLOAT*)pFloat);
        pFloat += InputStride;
        pHalf += OutputStride;
    }
    return pOutputStream;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
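
// Illustrative usage sketch (not part of the original XNA Math sources): the
// stream converters advance through both buffers by byte strides, so they also
// work on interleaved vertex data.  Converting a plain array of 4 halves:
//
//     HALF  halves[4];                        // filled in elsewhere
//     FLOAT floats[4];
//     XMConvertHalfToFloatStream(floats, sizeof(FLOAT), halves, sizeof(HALF), 4);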

//------------------------------------------------------------------------------

#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
// For VMX128, these routines are all defines in the main header

#pragma warning(push)
#pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized

XMINLINE XMVECTOR XMConvertVectorIntToFloat
(
    FXMVECTOR VInt,
    UINT     DivExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex;
    FLOAT fScale;
    XMVECTOR Result;
    XMASSERT(DivExponent<32);
    fScale = 1.0f / (FLOAT)(1U << DivExponent);
    ElementIndex = 0;
    do {
        INT iTemp = (INT)VInt.vector4_u32[ElementIndex];
        Result.vector4_f32[ElementIndex] = ((FLOAT)iTemp) * fScale;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(DivExponent<32);
    // Convert to floats
    XMVECTOR vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&VInt)[0]);
    // Convert DivExponent into 1.0f/(1<<DivExponent)
    UINT uScale = 0x3F800000U - (DivExponent << 23);
    // Splat the scalar value
    __m128i vScale = _mm_set1_epi32(uScale);
    vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

XMINLINE XMVECTOR XMConvertVectorFloatToInt
(
    FXMVECTOR VFloat,
    UINT     MulExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex;
    XMVECTOR Result;
    FLOAT fScale;
    XMASSERT(MulExponent<32);
    // Get the scalar factor.
    fScale = (FLOAT)(1U << MulExponent);
    ElementIndex = 0;
    do {
        INT iResult;
        FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
        if (fTemp <= -(65536.0f*32768.0f)) {
            iResult = (-0x7FFFFFFF)-1;
        } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
            iResult = 0x7FFFFFFF;
        } else {
            iResult = (INT)fTemp;
        }
        Result.vector4_u32[ElementIndex] = (UINT)iResult;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(MulExponent<32);
    static const XMVECTORF32 MaxInt = {65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f,65536.0f*32768.0f-128.0f};
    XMVECTOR vResult = _mm_set_ps1((FLOAT)(1U << MulExponent));
    vResult = _mm_mul_ps(vResult,VFloat);
    // In case of positive overflow, detect it
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,MaxInt);
    // Float to int conversion
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // If there was positive overflow, set to 0x7FFFFFFF
    vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
    vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
    vOverflow = _mm_or_ps(vOverflow,vResult);
    return vOverflow;
#endif
}
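
// Illustrative usage sketch (not part of the original XNA Math sources):
// DivExponent/MulExponent express a power-of-two fixed-point scale, i.e. the
// integer lanes are interpreted as value * 2^-DivExponent on load and scaled by
// 2^MulExponent before truncation on store.  For 16.16 fixed-point data
// (pFixedPointData is a hypothetical pointer to 4 packed 16.16 integers):
//
//     XMVECTOR vFixed  = XMLoadInt4(pFixedPointData);
//     XMVECTOR vFloats = XMConvertVectorIntToFloat(vFixed, 16);   // divide by 65536
//     XMVECTOR vBack   = XMConvertVectorFloatToInt(vFloats, 16);  // multiply by 65536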

//------------------------------------------------------------------------------

XMINLINE XMVECTOR XMConvertVectorUIntToFloat
(
    FXMVECTOR VUInt,
    UINT      DivExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex;
    FLOAT fScale;
    XMVECTOR Result;
    XMASSERT(DivExponent<32);
    fScale = 1.0f / (FLOAT)(1U << DivExponent);
    ElementIndex = 0;
    do {
        Result.vector4_f32[ElementIndex] = (FLOAT)VUInt.vector4_u32[ElementIndex] * fScale;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(DivExponent<32);
    static const XMVECTORF32 FixUnsigned = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
    // Determine which ones need the fix.
    XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
    // Force all values positive
    XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
    // Convert to floats
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert 0x80000000 -> 0xFFFFFFFF
    __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
    // For only the ones that are too big, add the fixup
    vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],FixUnsigned);
    vResult = _mm_add_ps(vResult,vMask);
    // Convert DivExponent into 1.0f/(1<<DivExponent)
    UINT uScale = 0x3F800000U - (DivExponent << 23);
    // Splat
    iMask = _mm_set1_epi32(uScale);
    vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&iMask)[0]);
    return vResult;
#endif
}

//------------------------------------------------------------------------------

XMINLINE XMVECTOR XMConvertVectorFloatToUInt
(
    FXMVECTOR VFloat,
    UINT      MulExponent
)
{
#if defined(_XM_NO_INTRINSICS_)
    UINT ElementIndex;
    XMVECTOR Result;
    FLOAT fScale;
    XMASSERT(MulExponent<32);
    // Get the scalar factor.
    fScale = (FLOAT)(1U << MulExponent);
    ElementIndex = 0;
    do {
        UINT uResult;
        FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
        if (fTemp <= 0.0f) {
            uResult = 0;
        } else if (fTemp >= (65536.0f*65536.0f)) {
            uResult = 0xFFFFFFFFU;
        } else {
            uResult = (UINT)fTemp;
        }
        Result.vector4_u32[ElementIndex] = uResult;
    } while (++ElementIndex<4);
    return Result;
#else // _XM_SSE_INTRINSICS_
    XMASSERT(MulExponent<32);
    static const XMVECTORF32 MaxUInt = {65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f,65536.0f*65536.0f-256.0f};
    static const XMVECTORF32 UnsignedFix = {32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f,32768.0f*65536.0f};
    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
    vResult = _mm_mul_ps(vResult,VFloat);
    // Clamp to >=0
    vResult = _mm_max_ps(vResult,g_XMZero);
    // Any numbers that are too big, set to 0xFFFFFFFFU
    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,MaxUInt);
    XMVECTOR vValue = UnsignedFix;
    // Too large for a signed integer?
    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
    vValue = _mm_and_ps(vValue,vMask);
    // Perform fixup only on numbers too large (Keeps low bit precision)
    vResult = _mm_sub_ps(vResult,vValue);
    __m128i vResulti = _mm_cvttps_epi32(vResult);
    // Convert from signed to unsigned only if greater than 0x80000000
    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
    vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
    // On those that are too large, set to 0xFFFFFFFF
    vResult = _mm_or_ps(vResult,vOverflow);
    return vResult;
#endif
}

#pragma warning(pop)

#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_

/****************************************************************************
 *
 * Vector and matrix load operations
 *
 ****************************************************************************/

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt(CONST UINT* pSource)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);

    V.vector4_u32[0] = *pSource;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);
    __m128i V = _mm_set_epi32( 0, 0, 0, *pSource );
    return reinterpret_cast<__m128 *>(&V)[0];
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat(CONST FLOAT* pSource)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);

    V.vector4_f32[0] = *pSource;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 3) == 0);

    return _mm_load_ss( pSource );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt2
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];

    return V;
#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    __m128i V = _mm_set_epi32( 0, 0, *(pSource+1), *pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt2A
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat2
(
    CONST XMFLOAT2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR V;
    XMASSERT(pSource);
    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
    V.vector4_f32[2] = V.vector4_f32[3] = 0.0f;
    return V;
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
#ifdef _XM_X86_
    __m128 x = _mm_load_ss( &pSource->x );
    __m128 y = _mm_load_ss( &pSource->y );
    return _mm_unpacklo_ps( x, y );
#else // _XM_X64_
    // This reads 2 floats past the memory that should be ignored.
    return _mm_loadu_ps( &pSource->x );
#endif
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
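
// Illustrative usage sketch (not part of the original XNA Math sources): the
// plain loaders accept arbitrarily aligned structures, while the 'A' variants
// (XMLoadInt2A, XMLoadFloat2A, ...) require 16-byte alignment and assert on it:
//
//     XMFLOAT2  uv = { 0.25f, 0.75f };        // any alignment
//     XMVECTOR  v  = XMLoadFloat2(&uv);       // treat the z and w lanes as undefined
//
//     XMFLOAT2A uvA = { 0.25f, 0.75f };       // XMFLOAT2A is declared 16-byte aligned
//     XMVECTOR  vA  = XMLoadFloat2A(&uvA);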

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat2A
(
    CONST XMFLOAT2A* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_f32[0] = pSource->x;
    V.vector4_f32[1] = pSource->y;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
#ifdef _XM_X86_
    __m128 x = _mm_load_ss( &pSource->x );
    __m128 y = _mm_load_ss( &pSource->y );
    return _mm_unpacklo_ps( x, y );
#else // _XM_X64_
    // This reads 2 floats past the memory that should be ignored.
    return _mm_load_ps( &pSource->x );
#endif
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadHalf2
(
    CONST XMHALF2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pSource);
    {
    XMVECTOR vResult = {
        XMConvertHalfToFloat(pSource->x),
        XMConvertHalfToFloat(pSource->y),
        0.0f,
        0.0f
    };
    return vResult;
    }
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMVECTOR vResult = {
        XMConvertHalfToFloat(pSource->x),
        XMConvertHalfToFloat(pSource->y),
        0.0f,
        0.0f
    };
    return vResult;

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadShortN2
(
    CONST XMSHORTN2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT(pSource->x != -32768);
    XMASSERT(pSource->y != -32768);
    {
    XMVECTOR vResult = {
        (FLOAT)pSource->x * (1.0f/32767.0f),
        (FLOAT)pSource->y * (1.0f/32767.0f),
        0.0f,
        0.0f
    };
    return vResult;
    }

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT(pSource->x != -32768);
    XMASSERT(pSource->y != -32768);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // x needs to be sign extended
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // x - 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
    // Convert -32767-32767 to -1.0f-1.0f
    return _mm_mul_ps(vTemp,g_XMNormalizeX16Y16);
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
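
// Illustrative usage sketch (not part of the original XNA Math sources): the
// 'N' loaders return signed-normalized values, so the 16-bit range maps onto
// -1.0f..1.0f (the asserts above reject -32768, which has no counterpart):
//
//     XMSHORTN2 sn;
//     sn.x = 32767;  sn.y = -16384;
//     XMVECTOR v = XMLoadShortN2(&sn);        // x == 1.0f, y is approximately -0.5f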

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadShort2
(
    CONST XMSHORT2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(pSource->x != -32768);
    XMASSERT(pSource->y != -32768);

    V.vector4_f32[0] = (FLOAT)pSource->x;
    V.vector4_f32[1] = (FLOAT)pSource->y;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT(pSource->x != -32768);
    XMASSERT(pSource->y != -32768);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // x needs to be sign extended
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // x - 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
    // Y is 65536 too large
    return _mm_mul_ps(vTemp,g_XMFixupY16);
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUShortN2
(
    CONST XMUSHORTN2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
    V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f};
    static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0};
    XMASSERT(pSource);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // y needs to be sign flipped
    vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // y + 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,FixaddY16);
    // Y is 65536 times too large
    vTemp = _mm_mul_ps(vTemp,FixupY16);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUShort2
(
    CONST XMUSHORT2* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_f32[0] = (FLOAT)pSource->x;
    V.vector4_f32[1] = (FLOAT)pSource->y;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0};
    XMASSERT(pSource);
    // Splat the two shorts in all four entries (WORD alignment okay,
    // DWORD alignment preferred)
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
    // y needs to be sign flipped
    vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
    // Convert to floating point numbers
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Y is 65536 times too large
    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16);
    // y + 0x8000 to undo the signed order.
    vTemp = _mm_add_ps(vTemp,FixaddY16);
    return vTemp;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt3
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    __m128i V = _mm_set_epi32( 0, *(pSource+2), *(pSource+1), *pSource );
    return reinterpret_cast<__m128 *>(&V)[0];
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt3A
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);

    // Reads an extra integer that is 'undefined'

    __m128i V = _mm_load_si128( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3
(
    CONST XMFLOAT3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR V;
    XMASSERT(pSource);
    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
    ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
    V.vector4_f32[3] = 0.0f;
    return V;
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // This reads 1 float past the memory that should be ignored.
    return _mm_loadu_ps( &pSource->x );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3A
(
    CONST XMFLOAT3A* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_f32[0] = pSource->x;
    V.vector4_f32[1] = pSource->y;
    V.vector4_f32[2] = pSource->z;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);

    // This reads 1 float past the memory that should be ignored.

    return _mm_load_ps( &pSource->x );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUHenDN3
(
    CONST XMUHENDN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)Element / 2047.0f;
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)Element / 1023.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 UHenDN3Mul = {1.0f/2047.0f,1.0f/(2047.0f*2048.0f),1.0f/(1023.0f*2048.0f*2048.0f),0};
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x, y and z to 0.0f-1.0f
    vResult = _mm_mul_ps(vResult,UHenDN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}
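
// Illustrative usage sketch (not part of the original XNA Math sources):
// XMUHENDN3 packs three unsigned normalized values into one DWORD as
// 11:11:10 bits (x in bits 0-10, y in bits 11-21, z in bits 22-31), so
// loading an all-ones word yields the maximum of every channel:
//
//     XMUHENDN3 packed;
//     packed.v = 0xFFFFFFFF;
//     XMVECTOR n = XMLoadUHenDN3(&packed);    // x == y == z == 1.0f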

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUHenD3
(
    CONST XMUHEND3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)Element;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x and y to 0.0f-2047.0f and z to 0.0f-1023.0f
    vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadHenDN3
(
    CONST XMHENDN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
    static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]) / 511.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 HenDN3Mul = {1.0f/1023.0f,1.0f/(1023.0f*2048.0f),1.0f/(511.0f*2048.0f*2048.0f),0};
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddHenD3);
    // Normalize x,y and z to -1.0f-1.0f
    vResult = _mm_mul_ps(vResult,HenDN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadHenD3
(
    CONST XMHEND3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
    static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);

    Element = pSource->v & 0x7FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
    Element = (pSource->v >> 11) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
    Element = (pSource->v >> 22) & 0x3FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddHenD3);
    // Normalize x and y to -1024-1023.0f and z to -512-511.0f
    vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUDHenN3
(
    CONST XMUDHENN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)Element / 2047.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 UDHenN3Mul = {1.0f/1023.0f,1.0f/(2047.0f*1024.0f),1.0f/(2047.0f*1024.0f*2048.0f),0};
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x, y and z to 0.0f-1.0f
    vResult = _mm_mul_ps(vResult,UDHenN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadUDHen3
(
    CONST XMUDHEN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)Element;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
    // Normalize x to 0-1023.0f and y and z to 0-2047.0f
    vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadDHenN3
(
    CONST XMDHENN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendX[] = {0x00000000, 0xFFFFFC00};
    static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]) / 511.0f;
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    static const XMVECTORF32 DHenN3Mul = {1.0f/511.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*2048.0f),0};
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddDHen3);
    // Normalize x,y and z to -1.0f-1.0f
    vResult = _mm_mul_ps(vResult,DHenN3Mul);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadDHen3
(
    CONST XMDHEN3* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR          V;
    UINT              Element;
    static CONST UINT SignExtendX[] = {0x00000000, 0xFFFFFC00};
    static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};

    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);

    Element = pSource->v & 0x3FF;
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]);
    Element = (pSource->v >> 10) & 0x7FF;
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);
    Element = (pSource->v >> 21) & 0x7FF;
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);

    return V;

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
    // Convert x and y to unsigned
    vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Convert x and y back to signed
    vResult = _mm_add_ps(vResult,g_XMAddDHen3);
    // Normalize x to -512-511.0f and y and z to -1024-1023.0f
    vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
    return vResult;
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadU565
(
    CONST XMU565* pSource
)
{
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
    static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0};
    static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0};
    XMASSERT(pSource);
    // Get the 32 bit value and splat it
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z
    vResult = _mm_and_ps(vResult,U565And);
    // Convert to float
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Normalize x, y, and z
    vResult = _mm_mul_ps(vResult,U565Mul);
    return vResult;
#else
    XMVECTOR          V;
    UINT              Element;

    XMASSERT(pSource);

    Element = pSource->v & 0x1F;
    V.vector4_f32[0] = (FLOAT)Element;
    Element = (pSource->v >> 5) & 0x3F;
    V.vector4_f32[1] = (FLOAT)Element;
    Element = (pSource->v >> 11) & 0x1F;
    V.vector4_f32[2] = (FLOAT)Element;

    return V;
#endif // !_XM_SSE_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3PK
(
    CONST XMFLOAT3PK* pSource
)
{
    UINT Mantissa;
    UINT Exponent;
    UINT Result[3];

    XMASSERT(pSource);

    // X Channel (6-bit mantissa)
    Mantissa = pSource->xm;

    if ( pSource->xe == 0x1f ) // INF or NAN
    {
        Result[0] = 0x7f800000 | (pSource->xm << 17);
    }
    else
    {
        if ( pSource->xe != 0 ) // The value is normalized
        {
            Exponent = pSource->xe;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x40) == 0);

            Mantissa &= 0x3F;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17);
    }

    // Y Channel (6-bit mantissa)
    Mantissa = pSource->ym;

    if ( pSource->ye == 0x1f ) // INF or NAN
    {
        Result[1] = 0x7f800000 | (pSource->ym << 17);
    }
    else
    {
        if ( pSource->ye != 0 ) // The value is normalized
        {
            Exponent = pSource->ye;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x40) == 0);

            Mantissa &= 0x3F;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17);
    }

    // Z Channel (5-bit mantissa)
    Mantissa = pSource->zm;

    if ( pSource->ze == 0x1f ) // INF or NAN
    {
        Result[2] = 0x7f800000 | (pSource->zm << 17);
    }
    else
    {
        if ( pSource->ze != 0 ) // The value is normalized
        {
            Exponent = pSource->ze;
        }
        else if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x20) == 0);

            Mantissa &= 0x1F;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18);
    }

    return XMLoadFloat3( (XMFLOAT3*)&Result );
}
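
// Illustrative usage sketch (not part of the original XNA Math sources):
// XMFLOAT3PK stores three small unsigned floats in one DWORD (11:11:10 bits,
// i.e. 6/6/5-bit mantissas with 5-bit exponents and no sign bits), and this
// loader expands each channel to a full 32-bit float:
//
//     XMFLOAT3PK packed;                      // filled in elsewhere (e.g. by XMStoreFloat3PK)
//     XMVECTOR   rgb = XMLoadFloat3PK(&packed);   // treat the w lane as undefined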

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat3SE
(
    CONST XMFLOAT3SE* pSource
)
{
    UINT Mantissa;
    UINT Exponent, ExpBits;
    UINT Result[3];

    XMASSERT(pSource);

    if ( pSource->e == 0x1f ) // INF or NAN
    {
        Result[0] = 0x7f800000 | (pSource->xm << 14);
        Result[1] = 0x7f800000 | (pSource->ym << 14);
        Result[2] = 0x7f800000 | (pSource->zm << 14);
    }
    else if ( pSource->e != 0 ) // The values are all normalized
    {
        Exponent = pSource->e;

        ExpBits = (Exponent + 112) << 23;

        Mantissa = pSource->xm;
        Result[0] = ExpBits | (Mantissa << 14);

        Mantissa = pSource->ym;
        Result[1] = ExpBits | (Mantissa << 14);

        Mantissa = pSource->zm;
        Result[2] = ExpBits | (Mantissa << 14);
    }
    else
    {
        // X Channel
        Mantissa = pSource->xm;

        if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x200) == 0);

            Mantissa &= 0x1FF;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[0] = ((Exponent + 112) << 23) | (Mantissa << 14);

        // Y Channel
        Mantissa = pSource->ym;

        if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x200) == 0);

            Mantissa &= 0x1FF;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[1] = ((Exponent + 112) << 23) | (Mantissa << 14);

        // Z Channel
        Mantissa = pSource->zm;

        if (Mantissa != 0) // The value is denormalized
        {
            // Normalize the value in the resulting float
            Exponent = 1;

            do
            {
                Exponent--;
                Mantissa <<= 1;
            } while ((Mantissa & 0x200) == 0);

            Mantissa &= 0x1FF;
        }
        else // The value is zero
        {
            Exponent = (UINT)-112;
        }

        Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14);
    }

    return XMLoadFloat3( (XMFLOAT3*)&Result );
}
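
// Illustrative note (not part of the original XNA Math sources): XMFLOAT3SE is
// a shared-exponent packing; the three 9-bit mantissas xm, ym and zm reuse the
// single 5-bit exponent e, trading per-component precision for a common scale
// (comparable to 9:9:9:5 shared-exponent texture formats):
//
//     XMFLOAT3SE packed;                      // filled in elsewhere (e.g. by XMStoreFloat3SE)
//     XMVECTOR   hdr = XMLoadFloat3SE(&packed);   // treat the w lane as undefined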

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt4
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];
    V.vector4_u32[3] = pSource[3];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadInt4A
(
    CONST UINT* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_u32[0] = pSource[0];
    V.vector4_u32[1] = pSource[1];
    V.vector4_u32[2] = pSource[2];
    V.vector4_u32[3] = pSource[3];

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    __m128i V = _mm_load_si128( (const __m128i*)pSource );
    return reinterpret_cast<__m128 *>(&V)[0];

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat4
(
    CONST XMFLOAT4* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)
    XMVECTOR V;
    XMASSERT(pSource);
    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
    ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
    ((UINT *)(&V.vector4_f32[3]))[0] = ((const UINT *)(&pSource->w))[0];
    return V;
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    return _mm_loadu_ps( &pSource->x );
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE XMVECTOR XMLoadFloat4A
(
    CONST XMFLOAT4A* pSource
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMVECTOR V;

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    V.vector4_f32[0] = pSource->x;
    V.vector4_f32[1] = pSource->y;
    V.vector4_f32[2] = pSource->z;
    V.vector4_f32[3] = pSource->w;

    return V;

#elif defined(_XM_SSE_INTRINSICS_)

    XMASSERT(pSource);
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);

    return _mm_load_ps( &pSource->x );

#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}
1638
 
1639
//------------------------------------------------------------------------------
1640
 
1641
XMFINLINE XMVECTOR XMLoadHalf4
1642
(
1643
    CONST XMHALF4* pSource
1644
)
1645
{
1646
#if defined(_XM_NO_INTRINSICS_)
1647
    XMASSERT(pSource);
1648
    {
1649
    XMVECTOR vResult = {
1650
        XMConvertHalfToFloat(pSource->x),
1651
        XMConvertHalfToFloat(pSource->y),
1652
        XMConvertHalfToFloat(pSource->z),
1653
        XMConvertHalfToFloat(pSource->w)
1654
    };
1655
    return vResult;
1656
    }
1657
#elif defined(_XM_SSE_INTRINSICS_)
1658
	XMASSERT(pSource);
1659
    XMVECTOR vResult = {
1660
        XMConvertHalfToFloat(pSource->x),
1661
        XMConvertHalfToFloat(pSource->y),
1662
        XMConvertHalfToFloat(pSource->z),
1663
        XMConvertHalfToFloat(pSource->w)
1664
    };
1665
    return vResult;
1666
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1667
#endif // _XM_VMX128_INTRINSICS_
1668
}
1669
 
1670
//------------------------------------------------------------------------------
1671
 
1672
XMFINLINE XMVECTOR XMLoadShortN4
1673
(
1674
    CONST XMSHORTN4* pSource
1675
)
1676
{
1677
#if defined(_XM_NO_INTRINSICS_)
1678
    XMASSERT(pSource);
1679
    XMASSERT(pSource->x != -32768);
1680
    XMASSERT(pSource->y != -32768);
1681
    XMASSERT(pSource->z != -32768);
1682
    XMASSERT(pSource->w != -32768);
1683
    {
1684
    XMVECTOR vResult = {
1685
        (FLOAT)pSource->x * (1.0f/32767.0f),
1686
        (FLOAT)pSource->y * (1.0f/32767.0f),
1687
        (FLOAT)pSource->z * (1.0f/32767.0f),
1688
        (FLOAT)pSource->w * (1.0f/32767.0f)
1689
    };
1690
    return vResult;
1691
    }
1692
#elif defined(_XM_SSE_INTRINSICS_)
1693
	XMASSERT(pSource);
1694
    XMASSERT(pSource->x != -32768);
1695
    XMASSERT(pSource->y != -32768);
1696
    XMASSERT(pSource->z != -32768);
1697
    XMASSERT(pSource->w != -32768);
1698
    // Splat the color in all four entries (x,z,y,w)
1699
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
1700
    // Mask x&0xffff, z&0xffff, y&0xffff0000, w&0xffff0000
1701
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
1702
    // x and z are unsigned! Flip the bits to convert the order to signed
1703
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
1704
    // Convert to floating point numbers
1705
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1706
    // x and z - 0x8000 to complete the conversion
1707
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
1708
    // Convert -32767-32767 to -1.0f-1.0f
1709
    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16);
1710
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
1711
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
1712
    return vTemp;
1713
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1714
#endif // _XM_VMX128_INTRINSICS_
1715
}
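// Scalar sketch of the conversion above (for reference): each signed 16-bit
// component is divided by 32767, so the legal range -32767..32767 maps to
// -1.0f..1.0f (the value -32768 is excluded by the asserts).
//
//     XMSHORTN4 packed;
//     packed.x = 32767;  packed.y = -32767;  packed.z = 0;  packed.w = 16384;
//     XMVECTOR v = XMLoadShortN4( &packed );
//     // v is approximately ( 1.0f, -1.0f, 0.0f, 0.5f )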
1716
 
1717
//------------------------------------------------------------------------------
1718
 
1719
XMFINLINE XMVECTOR XMLoadShort4
1720
(
1721
    CONST XMSHORT4* pSource
1722
)
1723
{
1724
#if defined(_XM_NO_INTRINSICS_)
1725
 
1726
    XMVECTOR V;
1727
 
1728
    XMASSERT(pSource);
1729
    XMASSERT(pSource->x != -32768);
1730
    XMASSERT(pSource->y != -32768);
1731
    XMASSERT(pSource->z != -32768);
1732
    XMASSERT(pSource->w != -32768);
1733
 
1734
    V.vector4_f32[0] = (FLOAT)pSource->x;
1735
    V.vector4_f32[1] = (FLOAT)pSource->y;
1736
    V.vector4_f32[2] = (FLOAT)pSource->z;
1737
    V.vector4_f32[3] = (FLOAT)pSource->w;
1738
 
1739
    return V;
1740
 
1741
#elif defined(_XM_SSE_INTRINSICS_)
1742
    XMASSERT(pSource);
1743
    XMASSERT(pSource->x != -32768);
1744
    XMASSERT(pSource->y != -32768);
1745
    XMASSERT(pSource->z != -32768);
1746
    XMASSERT(pSource->w != -32768);
1747
    // Splat the color in all four entries (x,z,y,w)
1748
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
1749
    // Mask x&0xffff, z&0xffff, y&0xffff0000, w&0xffff0000
1750
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
1751
    // x and z are unsigned! Flip the bits to convert the order to signed
1752
    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
1753
    // Convert to floating point numbers
1754
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1755
    // x and z - 0x8000 to complete the conversion
1756
    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
1757
    // Fix y and w because they are 65536 too large
1758
    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
1759
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
1760
    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
1761
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1762
#endif // _XM_VMX128_INTRINSICS_
1763
}
1764
 
1765
//------------------------------------------------------------------------------
1766
 
1767
XMFINLINE XMVECTOR XMLoadUShortN4
1768
(
1769
    CONST XMUSHORTN4* pSource
1770
)
1771
{
1772
#if defined(_XM_NO_INTRINSICS_)
1773
 
1774
    XMVECTOR V;
1775
 
1776
    XMASSERT(pSource);
1777
 
1778
    V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
1779
    V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;
1780
    V.vector4_f32[2] = (FLOAT)pSource->z / 65535.0f;
1781
    V.vector4_f32[3] = (FLOAT)pSource->w / 65535.0f;
1782
 
1783
    return V;
1784
 
1785
#elif defined(_XM_SSE_INTRINSICS_)
1786
	XMASSERT(pSource);
1787
    static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)};
1788
    static const XMVECTORF32 FixaddY16W16  = {0,0,32768.0f*65536.0f,32768.0f*65536.0f};
1789
1790
    // Splat the color in all four entries (x,z,y,w)
1791
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
1792
    // Mask x&0xffff, z&0xffff, y&0xffff0000, w&0xffff0000
1793
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
1794
    // y and w are signed! Flip the bits to convert the order to unsigned
1795
    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
1796
    // Convert to floating point numbers
1797
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1798
    // y and w + 0x8000 to complete the conversion
1799
    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
1800
    // Fix y and w because they are 65536 too large
1801
    vTemp = _mm_mul_ps(vTemp,FixupY16W16);
1802
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
1803
    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
1804
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1805
#endif // _XM_VMX128_INTRINSICS_
1806
}
1807
 
1808
//------------------------------------------------------------------------------
1809
 
1810
XMFINLINE XMVECTOR XMLoadUShort4
1811
(
1812
    CONST XMUSHORT4* pSource
1813
)
1814
{
1815
#if defined(_XM_NO_INTRINSICS_)
1816
 
1817
    XMVECTOR V;
1818
 
1819
    XMASSERT(pSource);
1820
 
1821
    V.vector4_f32[0] = (FLOAT)pSource->x;
1822
    V.vector4_f32[1] = (FLOAT)pSource->y;
1823
    V.vector4_f32[2] = (FLOAT)pSource->z;
1824
    V.vector4_f32[3] = (FLOAT)pSource->w;
1825
 
1826
    return V;
1827
 
1828
#elif defined(_XM_SSE_INTRINSICS_)
1829
    XMASSERT(pSource);
1830
    static const XMVECTORF32 FixaddY16W16  = {0,0,32768.0f,32768.0f};
1831
1832
    // Splat the color in all four entries (x,z,y,w)
1833
    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
1834
    // Mask x&0xffff, z&0xffff, y&0xffff0000, w&0xffff0000
1835
    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
1836
    // y and w are signed! Flip the bits to convert the order to unsigned
1837
    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
1838
    // Convert to floating point numbers
1839
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1840
    // Fix y and w because they are 65536 too large
1841
    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
1842
    // y and w + 0x8000 to complete the conversion
1843
    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
1844
    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
1845
    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
1846
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1847
#endif // _XM_VMX128_INTRINSICS_
1848
}
1849
 
1850
//------------------------------------------------------------------------------
1851
 
1852
XMFINLINE XMVECTOR XMLoadXIcoN4
1853
(
1854
    CONST XMXICON4* pSource
1855
)
1856
{
1857
#if defined(_XM_NO_INTRINSICS_)
1858
 
1859
    XMVECTOR          V;
1860
    UINT              Element;
1861
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
1862
 
1863
    XMASSERT(pSource);
1864
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
1865
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
1866
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
1867
 
1868
    Element = (UINT)pSource->v & 0xFFFFF;
1869
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
1870
    Element = (UINT)(pSource->v >> 20) & 0xFFFFF;
1871
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
1872
    Element = (UINT)(pSource->v >> 40) & 0xFFFFF;
1873
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
1874
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;
1875
 
1876
    return V;
1877
 
1878
#elif defined(_XM_SSE_INTRINSICS_)
1879
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
1880
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
1881
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
1882
    static const XMVECTORF32 LoadXIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(15.0f*4096.0f*65536.0f)};
1883
	XMASSERT(pSource);
1884
    // Grab the 64 bit structure
1885
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1886
    // By shifting down 8 bits, y and z land in separate 32-bit elements
1887
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
1888
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
1889
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
1890
    // Fix the entries to x,y,z,w
1891
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
1892
    // Mask x,y,z and w
1893
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
1894
    // x and z are unsigned! Flip the bits to convert the order to signed
1895
    vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
1896
    // Convert to floating point numbers
1897
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1898
    // x and z - 0x80 to complete the conversion
1899
    vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
1900
    // Fix y and w because they are too large
1901
    vTemp = _mm_mul_ps(vTemp,LoadXIcoN4Mul);
1902
    return vTemp;
1903
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1904
#endif // _XM_VMX128_INTRINSICS_
1905
}
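// Scalar sketch of the decode above (for reference): the 64-bit value packs
// x, y and z as signed 20-bit fields and w as an unsigned 4-bit field.
//
//     UINT64 v  = pSource->v;                    // packed 20:20:20:4 value
//     INT    ix = (INT)(v & 0xFFFFF);            // bits 0..19
//     if (ix & 0x80000) ix |= ~0xFFFFF;          // sign extend 20 -> 32 bits
//     FLOAT  x  = (FLOAT)ix / 524287.0f;         // normalize to -1.0f..1.0f
//     FLOAT  w  = (FLOAT)(v >> 60) / 15.0f;      // unsigned 4-bit w, 0.0f..1.0f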
1906
 
1907
//------------------------------------------------------------------------------
1908
 
1909
XMFINLINE XMVECTOR XMLoadXIco4
1910
(
1911
    CONST XMXICO4* pSource
1912
)
1913
{
1914
#if defined(_XM_NO_INTRINSICS_)
1915
 
1916
    XMVECTOR          V;
1917
    UINT              Element;
1918
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
1919
 
1920
    XMASSERT(pSource);
1921
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
1922
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
1923
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
1924
 
1925
    Element = (UINT)pSource->v & 0xFFFFF;
1926
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
1927
    Element = (UINT)(pSource->v >> 20) & 0xFFFFF;
1928
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
1929
    Element = (UINT)(pSource->v >> 40) & 0xFFFFF;
1930
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
1931
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);
1932
 
1933
    return V;
1934
 
1935
#elif defined(_XM_SSE_INTRINSICS_)
1936
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
1937
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
1938
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
1939
    XMASSERT(pSource);
1940
    // Grab the 64 bit structure
1941
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1942
    // By shifting down 8 bits, y and z land in separate 32-bit elements
1943
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
1944
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
1945
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
1946
    // Fix the entries to x,y,z,w
1947
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
1948
    // Mask x,y,z and w
1949
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
1950
    // x and z are unsigned! Flip the bits to convert the order to signed
1951
    vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
1952
    // Convert to floating point numbers
1953
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1954
    // x and z - 0x80 to complete the conversion
1955
    vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
1956
    // Fix y and w because they are too large
1957
    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
1958
    return vTemp;
1959
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1960
#endif // _XM_VMX128_INTRINSICS_
1961
}
1962
 
1963
//------------------------------------------------------------------------------
1964
 
1965
XMFINLINE XMVECTOR XMLoadUIcoN4
1966
(
1967
    CONST XMUICON4* pSource
1968
)
1969
{
1970
#if defined(_XM_NO_INTRINSICS_)
1971
 
1972
    XMVECTOR V;
1973
 
1974
    XMASSERT(pSource);
1975
 
1976
    V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF) / 1048575.0f;
1977
    V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF) / 1048575.0f;
1978
    V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF) / 1048575.0f;
1979
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;
1980
 
1981
    return V;
1982
 
1983
#elif defined(_XM_SSE_INTRINSICS_)
1984
    static const XMVECTORF32 LoadUIcoN4Mul = {1.0f/1048575.0f,1.0f/(1048575.0f*4096.0f),1.0f/1048575.0f,1.0f/(15.0f*4096.0f*65536.0f)};
1985
    XMASSERT(pSource);
1986
    // Grab the 64 bit structure
1987
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
1988
    // By shifting down 8 bits, y and z land in separate 32-bit elements
1989
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
1990
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
1991
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
1992
    // Fix the entries to x,y,z,w
1993
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
1994
    // Mask x,y,z and w
1995
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
1996
    // x and z are unsigned! Flip the bits to convert the order to signed
1997
    vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
1998
    // Convert to floating point numbers
1999
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2000
    // x and z - 0x80 to complete the conversion
2001
    vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
2002
    // Fix y and w because they are too large
2003
    vTemp = _mm_mul_ps(vTemp,LoadUIcoN4Mul);
2004
    return vTemp;
2005
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2006
#endif // _XM_VMX128_INTRINSICS_
2007
}
2008
 
2009
//------------------------------------------------------------------------------
2010
 
2011
XMFINLINE XMVECTOR XMLoadUIco4
2012
(
2013
    CONST XMUICO4* pSource
2014
)
2015
{
2016
#if defined(_XM_NO_INTRINSICS_)
2017
 
2018
    XMVECTOR V;
2019
 
2020
    XMASSERT(pSource);
2021
 
2022
    V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF);
2023
    V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF);
2024
    V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF);
2025
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);
2026
 
2027
    return V;
2028
 
2029
#elif defined(_XM_SSE_INTRINSICS_)
2030
    XMASSERT(pSource);
2031
    // Grab the 64 bit structure
2032
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
2033
    // By shifting down 8 bits, y and z land in separate 32-bit elements
2034
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2035
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2036
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2037
    // Fix the entries to x,y,z,w
2038
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2039
    // Mask x,y,z and w
2040
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
2041
    // x and z are unsigned! Flip the bits to convert the order to signed
2042
    vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
2043
    // Convert to floating point numbers
2044
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2045
    // x and z - 0x80 to complete the conversion
2046
    vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
2047
    // Fix y and w because they are too large
2048
    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
2049
    return vTemp;
2050
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2051
#endif // _XM_VMX128_INTRINSICS_
2052
}
2053
 
2054
//------------------------------------------------------------------------------
2055
 
2056
XMFINLINE XMVECTOR XMLoadIcoN4
2057
(
2058
    CONST XMICON4* pSource
2059
)
2060
{
2061
#if defined(_XM_NO_INTRINSICS_)
2062
 
2063
    XMVECTOR          V;
2064
    UINT              Element;
2065
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
2066
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};
2067
 
2068
    XMASSERT(pSource);
2069
 
2070
    Element = (UINT)pSource->v & 0xFFFFF;
2071
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2072
    Element = (UINT)(pSource->v >> 20) & 0xFFFFF;
2073
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2074
    Element = (UINT)(pSource->v >> 40) & 0xFFFFF;
2075
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2076
    Element = (UINT)(pSource->v >> 60);
2077
    V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]) / 7.0f;
2078
 
2079
    return V;
2080
 
2081
#elif defined(_XM_SSE_INTRINSICS_)
2082
    static const XMVECTORF32 LoadIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(7.0f*4096.0f*65536.0f)};
2083
    XMASSERT(pSource);
2084
    // Grab the 64 bit structure
2085
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
2086
    // By shifting down 8 bits, y and z land in separate 32-bit elements
2087
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2088
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2089
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2090
    // Fix the entries to x,y,z,w
2091
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2092
    // Mask x,y,z and w
2093
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
2094
    // x and z are unsigned! Flip the bits to convert the order to signed
2095
    vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
2096
    // Convert to floating point numbers
2097
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2098
    // x and z - 0x80 to complete the conversion
2099
    vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
2100
    // Fix y and w because they are too large
2101
    vTemp = _mm_mul_ps(vTemp,LoadIcoN4Mul);
2102
    return vTemp;
2103
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2104
#endif // _XM_VMX128_INTRINSICS_
2105
}
2106
 
2107
//------------------------------------------------------------------------------
2108
 
2109
XMFINLINE XMVECTOR XMLoadIco4
2110
(
2111
    CONST XMICO4* pSource
2112
)
2113
{
2114
#if defined(_XM_NO_INTRINSICS_)
2115
 
2116
    XMVECTOR          V;
2117
    UINT              Element;
2118
    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
2119
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};
2120
 
2121
    XMASSERT(pSource);
2122
 
2123
    Element = (UINT)pSource->v & 0xFFFFF;
2124
    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2125
    Element = (UINT)(pSource->v >> 20) & 0xFFFFF;
2126
    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2127
    Element = (UINT)(pSource->v >> 40) & 0xFFFFF;
2128
    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2129
    Element = (UINT)(pSource->v >> 60);
2130
    V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]);
2131
 
2132
    return V;
2133
 
2134
#elif defined(_XM_SSE_INTRINSICS_)
2135
    XMASSERT(pSource);
2136
    // Grab the 64 bit structure
2137
    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
2138
    // By shifting down 8 bits, y and z land in separate 32-bit elements
2139
    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2140
    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2141
    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2142
    // Fix the entries to x,y,z,w
2143
    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2144
    // Mask x,y,z and w
2145
    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
2146
    // x and z are unsigned! Flip the bits to convert the order to signed
2147
    vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
2148
    // Convert to floating point numbers
2149
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2150
    // x and z - 0x80 to complete the conversion
2151
    vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
2152
    // Fix y and w because they are too large
2153
    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
2154
    return vTemp;
2155
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2156
#endif // _XM_VMX128_INTRINSICS_
2157
}
2158
 
2159
 
2160
//------------------------------------------------------------------------------
2161
 
2162
XMFINLINE XMVECTOR XMLoadXDecN4
2163
(
2164
    CONST XMXDECN4* pSource
2165
)
2166
{
2167
#if defined(_XM_NO_INTRINSICS_)
2168
    XMVECTOR V;
2169
    UINT Element;
2170
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2171
 
2172
    XMASSERT(pSource);
2173
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2174
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2175
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2176
 
2177
    Element = pSource->v & 0x3FF;
2178
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2179
    Element = (pSource->v >> 10) & 0x3FF;
2180
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2181
    Element = (pSource->v >> 20) & 0x3FF;
2182
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2183
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;
2184
 
2185
    return V;
2186
 
2187
#elif defined(_XM_SSE_INTRINSICS_)
2188
	XMASSERT(pSource);
2189
    // Splat the color in all four entries
2190
    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2191
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2192
    vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10);
2193
    // a is unsigned! Flip the bit to convert the order to signed
2194
    vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10);
2195
    // Convert to floating point numbers
2196
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2197
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2198
    vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10);
2199
    // Normalize x, y and z to -1.0f..1.0f and w to 0.0f..1.0f
2200
    return _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10);
2201
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2202
#endif // _XM_VMX128_INTRINSICS_
2203
}
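// Scalar sketch of the 10:10:10:2 decode above (for reference): x, y and z are
// signed 10-bit fields normalized by 511, w is an unsigned 2-bit field
// normalized by 3.
//
//     UINT  v  = pSource->v;
//     INT   ix = (INT)(v & 0x3FF);               // bits 0..9
//     if (ix & 0x200) ix |= ~0x3FF;              // sign extend 10 -> 32 bits
//     FLOAT x  = (FLOAT)ix / 511.0f;             // -1.0f..1.0f
//     FLOAT w  = (FLOAT)(v >> 30) / 3.0f;        //  0.0f..1.0f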
2204
 
2205
//------------------------------------------------------------------------------
2206
 
2207
XMFINLINE XMVECTOR XMLoadXDec4
2208
(
2209
    CONST XMXDEC4* pSource
2210
)
2211
{
2212
#if defined(_XM_NO_INTRINSICS_)
2213
 
2214
    XMVECTOR          V;
2215
    UINT              Element;
2216
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2217
 
2218
    XMASSERT(pSource);
2219
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2220
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2221
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2222
 
2223
    Element = pSource->v & 0x3FF;
2224
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2225
    Element = (pSource->v >> 10) & 0x3FF;
2226
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2227
    Element = (pSource->v >> 20) & 0x3FF;
2228
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2229
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);
2230
 
2231
    return V;
2232
 
2233
#elif defined(_XM_SSE_INTRINSICS_)
2234
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2235
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2236
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2237
    static const XMVECTORI32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000};
2238
    static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f};
2239
    XMASSERT(pSource);
2240
    // Splat the color in all four entries
2241
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2242
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2243
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2244
    // a is unsigned! Flip the bit to convert the order to signed
2245
    vTemp = _mm_xor_ps(vTemp,XDec4Xor);
2246
    // Convert to floating point numbers
2247
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2248
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2249
    vTemp = _mm_add_ps(vTemp,XDec4Add);
2250
    // Scale y, z and w back down (they are 2^10, 2^20 and 2^30 too large)
2251
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
2252
    return vTemp;
2253
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2254
#endif // _XM_VMX128_INTRINSICS_
2255
}
2256
 
2257
//------------------------------------------------------------------------------
2258
 
2259
XMFINLINE XMVECTOR XMLoadUDecN4
2260
(
2261
    CONST XMUDECN4* pSource
2262
)
2263
{
2264
#if defined(_XM_NO_INTRINSICS_)
2265
 
2266
    XMVECTOR          V;
2267
    UINT              Element;
2268
 
2269
    XMASSERT(pSource);
2270
 
2271
    Element = pSource->v & 0x3FF;
2272
    V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
2273
    Element = (pSource->v >> 10) & 0x3FF;
2274
    V.vector4_f32[1] = (FLOAT)Element / 1023.0f;
2275
    Element = (pSource->v >> 20) & 0x3FF;
2276
    V.vector4_f32[2] = (FLOAT)Element / 1023.0f;
2277
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;
2278
 
2279
    return V;
2280
 
2281
#elif defined(_XM_SSE_INTRINSICS_)
2282
    XMASSERT(pSource);
2283
    static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
2284
    // Splat the color in all four entries
2285
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2286
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2287
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2288
    // a is unsigned! Flip the bit to convert the order to signed
2289
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2290
    // Convert to floating point numbers
2291
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2292
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2293
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
2294
    // Normalize x, y and z (0-1023) and w (0-3) to 0.0f..1.0f
2295
    vTemp = _mm_mul_ps(vTemp,UDecN4Mul);
2296
    return vTemp;
2297
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2298
#endif // _XM_VMX128_INTRINSICS_
2299
}
2300
 
2301
//------------------------------------------------------------------------------
2302
 
2303
XMFINLINE XMVECTOR XMLoadUDec4
2304
(
2305
    CONST XMUDEC4* pSource
2306
)
2307
{
2308
#if defined(_XM_NO_INTRINSICS_)
2309
 
2310
    XMVECTOR          V;
2311
    UINT              Element;
2312
 
2313
    XMASSERT(pSource);
2314
 
2315
    Element = pSource->v & 0x3FF;
2316
    V.vector4_f32[0] = (FLOAT)Element;
2317
    Element = (pSource->v >> 10) & 0x3FF;
2318
    V.vector4_f32[1] = (FLOAT)Element;
2319
    Element = (pSource->v >> 20) & 0x3FF;
2320
    V.vector4_f32[2] = (FLOAT)Element;
2321
    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);
2322
 
2323
    return V;
2324
 
2325
#elif defined(_XM_SSE_INTRINSICS_)
2326
    XMASSERT(pSource);
2327
    // Splat the color in all four entries
2328
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2329
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2330
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2331
    // a is unsigned! Flip the bit to convert the order to signed
2332
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2333
    // Convert to floating point numbers
2334
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2335
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2336
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
2337
    // Scale y, z and w back down (they are 2^10, 2^20 and 2^30 too large)
2338
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
2339
    return vTemp;
2340
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2341
#endif // _XM_VMX128_INTRINSICS_
2342
}
2343
 
2344
//------------------------------------------------------------------------------
2345
 
2346
XMFINLINE XMVECTOR XMLoadDecN4
2347
(
2348
    CONST XMDECN4* pSource
2349
)
2350
{
2351
#if defined(_XM_NO_INTRINSICS_)
2352
 
2353
    XMVECTOR          V;
2354
    UINT              Element;
2355
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2356
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};
2357
 
2358
    XMASSERT(pSource);
2359
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2360
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2361
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2362
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2363
 
2364
    Element = pSource->v & 0x3FF;
2365
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2366
    Element = (pSource->v >> 10) & 0x3FF;
2367
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2368
    Element = (pSource->v >> 20) & 0x3FF;
2369
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2370
    Element = pSource->v >> 30;
2371
    V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);
2372
 
2373
    return V;
2374
 
2375
#elif defined(_XM_SSE_INTRINSICS_)
2376
    XMASSERT(pSource);
2377
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2378
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2379
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2380
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2381
    static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
2382
    // Splat the color in all four entries
2383
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2384
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2385
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2386
    // a is unsigned! Flip the bit to convert the order to signed
2387
    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
2388
    // Convert to floating point numbers
2389
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2390
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2391
    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
2392
    // Normalize x, y and z to -1.0f..1.0f (w keeps its integer value)
2393
    vTemp = _mm_mul_ps(vTemp,DecN4Mul);
2394
    return vTemp;
2395
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2396
#endif // _XM_VMX128_INTRINSICS_
2397
}
2398
 
2399
//------------------------------------------------------------------------------
2400
 
2401
XMFINLINE XMVECTOR XMLoadDec4
2402
(
2403
    CONST XMDEC4* pSource
2404
)
2405
{
2406
#if defined(_XM_NO_INTRINSICS_)
2407
 
2408
    XMVECTOR          V;
2409
    UINT              Element;
2410
    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2411
    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};
2412
 
2413
    XMASSERT(pSource);
2414
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2415
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2416
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2417
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2418
 
2419
    Element = pSource->v & 0x3FF;
2420
    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2421
    Element = (pSource->v >> 10) & 0x3FF;
2422
    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2423
    Element = (pSource->v >> 20) & 0x3FF;
2424
    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2425
    Element = pSource->v >> 30;
2426
    V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);
2427
 
2428
    return V;
2429
 
2430
#elif defined(_XM_SSE_INTRINSICS_)
2431
    XMASSERT((pSource->v & 0x3FF) != 0x200);
2432
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2433
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2434
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2435
    XMASSERT(pSource);
2436
    // Splat the color in all four entries
2437
    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2438
    // Mask off the 10:10:10:2 fields (x, y, z and w)
2439
    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2440
    // a is unsigned! Flip the bit to convert the order to signed
2441
    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
2442
    // Convert to floating point numbers
2443
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2444
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2445
    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
2446
    // Scale y, z and w back down (they are 2^10, 2^20 and 2^30 too large)
2447
    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
2448
    return vTemp;
2449
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2450
#endif // _XM_VMX128_INTRINSICS_
2451
}
2452
 
2453
//------------------------------------------------------------------------------
2454
 
2455
XMFINLINE XMVECTOR XMLoadUByteN4
2456
(
2457
    CONST XMUBYTEN4* pSource
2458
)
2459
{
2460
#if defined(_XM_NO_INTRINSICS_)
2461
 
2462
    XMVECTOR V;
2463
 
2464
    XMASSERT(pSource);
2465
 
2466
    V.vector4_f32[0] = (FLOAT)pSource->x / 255.0f;
2467
    V.vector4_f32[1] = (FLOAT)pSource->y / 255.0f;
2468
    V.vector4_f32[2] = (FLOAT)pSource->z / 255.0f;
2469
    V.vector4_f32[3] = (FLOAT)pSource->w / 255.0f;
2470
 
2471
    return V;
2472
 
2473
#elif defined(_XM_SSE_INTRINSICS_)
2474
    static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)};
2475
	XMASSERT(pSource);
2476
    // Splat the color in all four entries (x,z,y,w)
2477
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2478
    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
2479
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2480
    // w is signed! Flip the bits to convert the order to unsigned
2481
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2482
    // Convert to floating point numbers
2483
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2484
    // w + 0x80 to complete the conversion
2485
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
2486
    // Fix y, z and w because they are too large
2487
    vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
2488
    return vTemp;
2489
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2490
#endif // _XM_VMX128_INTRINSICS_
2491
}
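// Illustrative usage sketch: unpack four unsigned bytes into 0.0f..1.0f floats,
// e.g. an RGBA8 texel stored as x=R, y=G, z=B, w=A.
//
//     XMUBYTEN4 texel;
//     texel.x = 255;  texel.y = 128;  texel.z = 0;  texel.w = 255;
//     XMVECTOR v = XMLoadUByteN4( &texel );
//     // v is approximately ( 1.0f, 0.502f, 0.0f, 1.0f )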
2492
 
2493
//------------------------------------------------------------------------------
2494
 
2495
XMFINLINE XMVECTOR XMLoadUByte4
2496
(
2497
    CONST XMUBYTE4* pSource
2498
)
2499
{
2500
#if defined(_XM_NO_INTRINSICS_)
2501
 
2502
    XMVECTOR V;
2503
 
2504
    XMASSERT(pSource);
2505
 
2506
    V.vector4_f32[0] = (FLOAT)pSource->x;
2507
    V.vector4_f32[1] = (FLOAT)pSource->y;
2508
    V.vector4_f32[2] = (FLOAT)pSource->z;
2509
    V.vector4_f32[3] = (FLOAT)pSource->w;
2510
 
2511
    return V;
2512
 
2513
#elif defined(_XM_SSE_INTRINSICS_)
2514
    static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
2515
	XMASSERT(pSource);
2516
    // Splat the color in all four entries (x,z,y,w)
2517
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2518
    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
2519
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2520
    // w is signed! Flip the bits to convert the order to unsigned
2521
    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2522
    // Convert to floating point numbers
2523
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2524
    // w + 0x80 to complete the conversion
2525
    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
2526
    // Fix y, z and w because they are too large
2527
    vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
2528
    return vTemp;
2529
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2530
#endif // _XM_VMX128_INTRINSICS_
2531
}
2532
 
2533
//------------------------------------------------------------------------------
2534
 
2535
XMFINLINE XMVECTOR XMLoadByteN4
2536
(
2537
    CONST XMBYTEN4* pSource
2538
)
2539
{
2540
#if defined(_XM_NO_INTRINSICS_)
2541
 
2542
    XMVECTOR V;
2543
 
2544
    XMASSERT(pSource);
2545
    XMASSERT(pSource->x != -128);
2546
    XMASSERT(pSource->y != -128);
2547
    XMASSERT(pSource->z != -128);
2548
    XMASSERT(pSource->w != -128);
2549
 
2550
    V.vector4_f32[0] = (FLOAT)pSource->x / 127.0f;
2551
    V.vector4_f32[1] = (FLOAT)pSource->y / 127.0f;
2552
    V.vector4_f32[2] = (FLOAT)pSource->z / 127.0f;
2553
    V.vector4_f32[3] = (FLOAT)pSource->w / 127.0f;
2554
 
2555
    return V;
2556
 
2557
#elif defined(_XM_SSE_INTRINSICS_)
2558
    static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)};
2559
    XMASSERT(pSource);
2560
    XMASSERT(pSource->x != -128);
2561
    XMASSERT(pSource->y != -128);
2562
    XMASSERT(pSource->z != -128);
2563
    XMASSERT(pSource->w != -128);
2564
    // Splat the color in all four entries (x,z,y,w)
2565
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2566
    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
2567
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2568
    // x,y and z are unsigned! Flip the bits to convert the order to signed
2569
    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
2570
    // Convert to floating point numbers
2571
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2572
    // x, y and z - 0x80 to complete the conversion
2573
    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
2574
    // Fix y, z and w because they are too large
2575
    vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
2576
    return vTemp;
2577
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2578
#endif // _XM_VMX128_INTRINSICS_
2579
}
2580
 
2581
//------------------------------------------------------------------------------
2582
 
2583
XMFINLINE XMVECTOR XMLoadByte4
2584
(
2585
    CONST XMBYTE4* pSource
2586
)
2587
{
2588
#if defined(_XM_NO_INTRINSICS_)
2589
 
2590
    XMVECTOR V;
2591
 
2592
    XMASSERT(pSource);
2593
    XMASSERT(pSource->x != -128);
2594
    XMASSERT(pSource->y != -128);
2595
    XMASSERT(pSource->z != -128);
2596
    XMASSERT(pSource->w != -128);
2597
 
2598
    V.vector4_f32[0] = (FLOAT)pSource->x;
2599
    V.vector4_f32[1] = (FLOAT)pSource->y;
2600
    V.vector4_f32[2] = (FLOAT)pSource->z;
2601
    V.vector4_f32[3] = (FLOAT)pSource->w;
2602
 
2603
    return V;
2604
 
2605
#elif defined(_XM_SSE_INTRINSICS_)
2606
    static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
2607
    XMASSERT(pSource);
2608
    XMASSERT(pSource->x != -128);
2609
    XMASSERT(pSource->y != -128);
2610
    XMASSERT(pSource->z != -128);
2611
    XMASSERT(pSource->w != -128);
2612
    // Splat the color in all four entries (x,z,y,w)
2613
    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
2614
    // Mask x&0xff, y&0xff00, z&0xff0000, w&0xff000000
2615
    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2616
    // x,y and z are unsigned! Flip the bits to convert the order to signed
2617
    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
2618
    // Convert to floating point numbers
2619
    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2620
    // x, y and z - 0x80 to complete the conversion
2621
    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
2622
    // Fix y, z and w because they are too large
2623
    vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
2624
    return vTemp;
2625
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2626
#endif // _XM_VMX128_INTRINSICS_
2627
}
2628
 
2629
//------------------------------------------------------------------------------
2630
 
2631
XMFINLINE XMVECTOR XMLoadUNibble4
2632
(
2633
     CONST XMUNIBBLE4* pSource
2634
)
2635
{
2636
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
2637
    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
2638
    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
2639
    XMASSERT(pSource);
2640
    // Get the 32 bit value and splat it
2641
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2642
    // Mask off x, y and z
2643
    vResult = _mm_and_ps(vResult,UNibble4And);
2644
    // Convert to float
2645
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
2646
    // Normalize x, y, and z
2647
    vResult = _mm_mul_ps(vResult,UNibble4Mul);
2648
    return vResult;
2649
#else
2650
    XMVECTOR          V;
2651
    UINT              Element;
2652
 
2653
    XMASSERT(pSource);
2654
 
2655
    Element = pSource->v & 0xF;
2656
    V.vector4_f32[0] = (FLOAT)Element;
2657
    Element = (pSource->v >> 4) & 0xF;
2658
    V.vector4_f32[1] = (FLOAT)Element;
2659
    Element = (pSource->v >> 8) & 0xF;
2660
    V.vector4_f32[2] = (FLOAT)Element;
2661
    Element = (pSource->v >> 12) & 0xF;
2662
    V.vector4_f32[3] = (FLOAT)Element;
2663
 
2664
    return V;
2665
#endif // !_XM_SSE_INTRISICS_
2666
}
2667
 
2668
//------------------------------------------------------------------------------
2669
 
2670
XMFINLINE XMVECTOR XMLoadU555
2671
(
2672
     CONST XMU555* pSource
2673
)
2674
{
2675
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
2676
    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
2677
    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
2678
    XMASSERT(pSource);
2679
    // Get the 32 bit value and splat it
2680
    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
2681
    // Mask off x, y and z
2682
    vResult = _mm_and_ps(vResult,U555And);
2683
    // Convert to float
2684
    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
2685
    // Normalize x, y, and z
2686
    vResult = _mm_mul_ps(vResult,U555Mul);
2687
    return vResult;
2688
#else
2689
    XMVECTOR          V;
2690
    UINT              Element;
2691
 
2692
    XMASSERT(pSource);
2693
 
2694
    Element = pSource->v & 0x1F;
2695
    V.vector4_f32[0] = (FLOAT)Element;
2696
    Element = (pSource->v >> 5) & 0x1F;
2697
    V.vector4_f32[1] = (FLOAT)Element;
2698
    Element = (pSource->v >> 10) & 0x1F;
2699
    V.vector4_f32[2] = (FLOAT)Element;
2700
    Element = (pSource->v >> 15) & 0x1;
2701
    V.vector4_f32[3] = (FLOAT)Element;
2702
 
2703
    return V;
2704
#endif // !_XM_SSE_INTRISICS_
2705
}
2706
 
2707
//------------------------------------------------------------------------------
2708
 
2709
XMFINLINE XMVECTOR XMLoadColor
2710
(
2711
    CONST XMCOLOR* pSource
2712
)
2713
{
2714
#if defined(_XM_NO_INTRINSICS_)
2715
    XMASSERT(pSource);
2716
    {
2717
    // INT -> Float conversions are done in one instruction.
2718
    // UINT -> Float calls a runtime function. Keep in INT
2719
    INT iColor = (INT)(pSource->c);
2720
    XMVECTOR vColor = {
2721
        (FLOAT)((iColor >> 16) & 0xFF) * (1.0f/255.0f),
2722
        (FLOAT)((iColor >> 8) & 0xFF) * (1.0f/255.0f),
2723
        (FLOAT)(iColor & 0xFF) * (1.0f/255.0f),
2724
        (FLOAT)((iColor >> 24) & 0xFF) * (1.0f/255.0f)
2725
    };
2726
    return vColor;
2727
    }
2728
#elif defined(_XM_SSE_INTRINSICS_)
2729
	XMASSERT(pSource);
2730
    // Splat the color in all four entries
2731
    __m128i vInt = _mm_set1_epi32(pSource->c);
2732
    // Mask R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
2733
    vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8);
2734
    // a is unsigned! Flip the bit to convert the order to signed
2735
    vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8);
2736
    // Convert to floating point numbers
2737
    XMVECTOR vTemp = _mm_cvtepi32_ps(vInt);
2738
    // RGB + 0, A + 0x80000000.f to undo the signed order.
2739
    vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8);
2740
    // Convert 0-255 to 0.0f-1.0f
2741
    return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8);
2742
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2743
#endif // _XM_VMX128_INTRINSICS_
2744
}
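// Illustrative usage sketch: XMCOLOR packs a color as 0xAARRGGBB in its 'c'
// member; the loader above returns the channels as (r, g, b, a) in 0.0f..1.0f.
//
//     XMCOLOR color;
//     color.c = 0xFF8000FF;                      // A=255, R=128, G=0, B=255
//     XMVECTOR v = XMLoadColor( &color );
//     // v is approximately ( 0.502f, 0.0f, 1.0f, 1.0f )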
2745
 
2746
//------------------------------------------------------------------------------
2747
 
2748
XMFINLINE XMMATRIX XMLoadFloat3x3
2749
(
2750
    CONST XMFLOAT3X3* pSource
2751
)
2752
{
2753
#if defined(_XM_NO_INTRINSICS_)
2754
 
2755
    XMMATRIX M;
2756
 
2757
    XMASSERT(pSource);
2758
 
2759
    M.r[0].vector4_f32[0] = pSource->m[0][0];
2760
    M.r[0].vector4_f32[1] = pSource->m[0][1];
2761
    M.r[0].vector4_f32[2] = pSource->m[0][2];
2762
    M.r[0].vector4_f32[3] = 0.0f;
2763
 
2764
    M.r[1].vector4_f32[0] = pSource->m[1][0];
2765
    M.r[1].vector4_f32[1] = pSource->m[1][1];
2766
    M.r[1].vector4_f32[2] = pSource->m[1][2];
2767
    M.r[1].vector4_f32[3] = 0.0f;
2768
 
2769
    M.r[2].vector4_f32[0] = pSource->m[2][0];
2770
    M.r[2].vector4_f32[1] = pSource->m[2][1];
2771
    M.r[2].vector4_f32[2] = pSource->m[2][2];
2772
    M.r[2].vector4_f32[3] = 0.0f;
2773
 
2774
    M.r[3].vector4_f32[0] = 0.0f;
2775
    M.r[3].vector4_f32[1] = 0.0f;
2776
    M.r[3].vector4_f32[2] = 0.0f;
2777
    M.r[3].vector4_f32[3] = 1.0f;
2778
 
2779
    return M;
2780
 
2781
#elif defined(_XM_SSE_INTRINSICS_)
2782
	XMMATRIX M;
2783
	XMVECTOR V1, V2, V3, Z, T1, T2, T3, T4, T5;
2784
 
2785
	Z = _mm_setzero_ps();
2786
 
2787
	XMASSERT(pSource);
2788
 
2789
	V1 = _mm_loadu_ps( &pSource->m[0][0] );
2790
	V2 = _mm_loadu_ps( &pSource->m[1][1] );
2791
	V3 = _mm_load_ss( &pSource->m[2][2] );
2792
 
2793
	T1 = _mm_unpackhi_ps( V1, Z );
2794
	T2 = _mm_unpacklo_ps( V2, Z );
2795
	T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
2796
	T4 = _mm_movehl_ps( T2, T3 );
2797
	T5 = _mm_movehl_ps( Z, T1 );  
2798
 
2799
	M.r[0] = _mm_movelh_ps( V1, T1 );
2800
	M.r[1] = _mm_add_ps( T4, T5 );
2801
	M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
2802
	M.r[3] = g_XMIdentityR3;
2803
 
2804
	return M;
2805
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2806
#endif // _XM_VMX128_INTRINSICS_
2807
}
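// Illustrative usage sketch: a row-major 3x3 rotation is promoted to a 4x4
// XMMATRIX; the fourth row and fourth column are taken from the identity, as
// the scalar path above shows.
//
//     XMFLOAT3X3 rot;                            // assume rot.m[][] has been filled in
//     XMMATRIX   M = XMLoadFloat3x3( &rot );
//     // M.r[3] == ( 0, 0, 0, 1 ); the w lane of the first three rows is 0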
2808
 
2809
//------------------------------------------------------------------------------
2810
 
2811
XMFINLINE XMMATRIX XMLoadFloat4x3
2812
(
2813
    CONST XMFLOAT4X3* pSource
2814
)
2815
{
2816
#if defined(_XM_NO_INTRINSICS_)
2817
    XMMATRIX M;
2818
    XMASSERT(pSource);
2819
 
2820
    ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
2821
    ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
2822
    ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
2823
    M.r[0].vector4_f32[3] = 0.0f;
2824
 
2825
    ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
2826
    ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
2827
    ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
2828
    M.r[1].vector4_f32[3] = 0.0f;
2829
 
2830
    ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
2831
    ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
2832
    ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
2833
    M.r[2].vector4_f32[3] = 0.0f;
2834
 
2835
    ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
2836
    ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
2837
    ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
2838
    M.r[3].vector4_f32[3] = 1.0f;
2839
 
2840
    return M;
2841
 
2842
#elif defined(_XM_SSE_INTRINSICS_)
2843
    XMASSERT(pSource);
2844
    // Use unaligned load instructions to 
2845
    // load the 12 floats
2846
    // vTemp1 = x1,y1,z1,x2
2847
    XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
2848
    // vTemp2 = y2,z2,x3,y3
2849
    XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
2850
    // vTemp4 = z3,x4,y4,z4
2851
    XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
2852
    // vTemp3 = x3,y3,z3,z3
2853
    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
2854
    // vTemp2 = y2,z2,x2,x2
2855
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
2856
    // vTemp2 = x2,y2,z2,z2
2857
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
2858
    // vTemp1 = x1,y1,z1,0
2859
    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
2860
    // vTemp2 = x2,y2,z2,0
2861
    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
2862
    // vTemp3 = x3,y3,z3,0
2863
    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
2864
    // vTemp4i = x4,y4,z4,0
2865
    __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
2866
    // vTemp4i = x4,y4,z4,1.0f
2867
    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
2868
    XMMATRIX M(vTemp1,
2869
            vTemp2,
2870
            vTemp3,
2871
            reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
2872
    return M;
2873
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2874
#endif // _XM_VMX128_INTRINSICS_
2875
}
2876
 
2877
//------------------------------------------------------------------------------
2878
 
2879
XMFINLINE XMMATRIX XMLoadFloat4x3A
2880
(
2881
    CONST XMFLOAT4X3A* pSource
2882
)
2883
{
2884
#if defined(_XM_NO_INTRINSICS_)
2885
 
2886
    XMMATRIX M;
2887
 
2888
    XMASSERT(pSource);
2889
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
2890
 
2891
    M.r[0].vector4_f32[0] = pSource->m[0][0];
2892
    M.r[0].vector4_f32[1] = pSource->m[0][1];
2893
    M.r[0].vector4_f32[2] = pSource->m[0][2];
2894
    M.r[0].vector4_f32[3] = 0.0f;
2895
 
2896
    M.r[1].vector4_f32[0] = pSource->m[1][0];
2897
    M.r[1].vector4_f32[1] = pSource->m[1][1];
2898
    M.r[1].vector4_f32[2] = pSource->m[1][2];
2899
    M.r[1].vector4_f32[3] = 0.0f;
2900
 
2901
    M.r[2].vector4_f32[0] = pSource->m[2][0];
2902
    M.r[2].vector4_f32[1] = pSource->m[2][1];
2903
    M.r[2].vector4_f32[2] = pSource->m[2][2];
2904
    M.r[2].vector4_f32[3] = 0.0f;
2905
 
2906
    M.r[3].vector4_f32[0] = pSource->m[3][0];
2907
    M.r[3].vector4_f32[1] = pSource->m[3][1];
2908
    M.r[3].vector4_f32[2] = pSource->m[3][2];
2909
    M.r[3].vector4_f32[3] = 1.0f;
2910
 
2911
    return M;
2912
 
2913
#elif defined(_XM_SSE_INTRINSICS_)
2914
	XMASSERT(pSource);
2915
    // Use aligned load instructions to 
2916
    // load the 12 floats
2917
    // vTemp1 = x1,y1,z1,x2
2918
    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
2919
    // vTemp2 = y2,z2,x3,y3
2920
    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
2921
    // vTemp4 = z3,x4,y4,z4
2922
    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
2923
    // vTemp3 = x3,y3,z3,z3
2924
    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
2925
    // vTemp2 = y2,z2,x2,x2
2926
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
2927
    // vTemp2 = x2,y2,z2,z2
2928
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
2929
    // vTemp1 = x1,y1,z1,0
2930
    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
2931
    // vTemp2 = x2,y2,z2,0
2932
    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
2933
    // vTemp3 = x3,y3,z3,0
2934
    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
2935
    // vTemp4i = x4,y4,z4,0
2936
    __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
2937
    // vTemp4i = x4,y4,z4,1.0f
2938
    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
2939
    XMMATRIX M(vTemp1,
2940
            vTemp2,
2941
            vTemp3,
2942
            reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
2943
    return M;
2944
#else // _XM_VMX128_INTRINSICS_
2945
#endif // _XM_VMX128_INTRINSICS_
2946
}
2947
 
2948
//------------------------------------------------------------------------------
2949
 
2950
XMFINLINE XMMATRIX XMLoadFloat4x4
2951
(
2952
    CONST XMFLOAT4X4* pSource
2953
)
2954
{
2955
#if defined(_XM_NO_INTRINSICS_)
2956
    XMMATRIX M;
2957
    XMASSERT(pSource);
2958
 
2959
    ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
2960
    ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
2961
    ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
2962
    ((UINT *)(&M.r[0].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[0][3]))[0];
2963
 
2964
    ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
2965
    ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
2966
    ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
2967
    ((UINT *)(&M.r[1].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[1][3]))[0];
2968
 
2969
    ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
2970
    ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
2971
    ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
2972
    ((UINT *)(&M.r[2].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[2][3]))[0];
2973
 
2974
    ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
2975
    ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
2976
    ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
2977
    ((UINT *)(&M.r[3].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[3][3]))[0];
2978
 
2979
    return M;
2980
 
2981
#elif defined(_XM_SSE_INTRINSICS_)
2982
    XMASSERT(pSource);
2983
    XMMATRIX M;
2984
 
2985
    M.r[0] = _mm_loadu_ps( &pSource->_11 );
2986
    M.r[1] = _mm_loadu_ps( &pSource->_21 );
2987
    M.r[2] = _mm_loadu_ps( &pSource->_31 );
2988
    M.r[3] = _mm_loadu_ps( &pSource->_41 );
2989
 
2990
    return M;
2991
#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2992
#endif // _XM_VMX128_INTRINSICS_
2993
}
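// Illustrative usage sketch: load a row-major 4x4 matrix from unaligned memory;
// XMLoadFloat4x4A below is the aligned counterpart.
//
//     XMFLOAT4X4 world;
//     XMStoreFloat4x4( &world, XMMatrixIdentity() );
//     XMMATRIX   W = XMLoadFloat4x4( &world );   // four unaligned row loads with SSE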
2994
 
2995
//------------------------------------------------------------------------------
2996
 
2997
XMFINLINE XMMATRIX XMLoadFloat4x4A
2998
(
2999
    CONST XMFLOAT4X4A* pSource
3000
)
3001
{
3002
#if defined(_XM_NO_INTRINSICS_)
3003
 
3004
    XMMATRIX M;
3005
 
3006
    XMASSERT(pSource);
3007
    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
3008
 
3009
    M.r[0].vector4_f32[0] = pSource->m[0][0];
3010
    M.r[0].vector4_f32[1] = pSource->m[0][1];
3011
    M.r[0].vector4_f32[2] = pSource->m[0][2];
3012
    M.r[0].vector4_f32[3] = pSource->m[0][3];
3013
 
3014
    M.r[1].vector4_f32[0] = pSource->m[1][0];
3015
    M.r[1].vector4_f32[1] = pSource->m[1][1];
3016
    M.r[1].vector4_f32[2] = pSource->m[1][2];
3017
    M.r[1].vector4_f32[3] = pSource->m[1][3];
3018
 
3019
    M.r[2].vector4_f32[0] = pSource->m[2][0];
3020
    M.r[2].vector4_f32[1] = pSource->m[2][1];
3021
    M.r[2].vector4_f32[2] = pSource->m[2][2];
3022
    M.r[2].vector4_f32[3] = pSource->m[2][3];
3023
 
3024
    M.r[3].vector4_f32[0] = pSource->m[3][0];
3025
    M.r[3].vector4_f32[1] = pSource->m[3][1];
3026
    M.r[3].vector4_f32[2] = pSource->m[3][2];
3027
    M.r[3].vector4_f32[3] = pSource->m[3][3];
3028
 
3029
    return M;
3030
 
3031
#elif defined(_XM_SSE_INTRINSICS_)
3032
	XMMATRIX M;
3033
 
3034
	XMASSERT(pSource);
3035
 
3036
	M.r[0] = _mm_load_ps( &pSource->_11 );
3037
	M.r[1] = _mm_load_ps( &pSource->_21 );
3038
	M.r[2] = _mm_load_ps( &pSource->_31 );
3039
	M.r[3] = _mm_load_ps( &pSource->_41 );
3040
 
3041
	return M;
3042
#else // _XM_VMX128_INTRINSICS_
3043
#endif // _XM_VMX128_INTRINSICS_
3044
}
3045
 
3046
/****************************************************************************
3047
 *
3048
 * Vector and matrix store operations
3049
 *
3050
 ****************************************************************************/
3051
 
3052
XMFINLINE VOID XMStoreInt
3053
(
3054
    UINT*    pDestination,
3055
    FXMVECTOR V
3056
)
3057
{
3058
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
3059
 
3060
    XMASSERT(pDestination);
3061
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3062
 
3063
    *pDestination = XMVectorGetIntX( V );
3064
 
3065
#else // _XM_VMX128_INTRINSICS_
3066
#endif // _XM_VMX128_INTRINSICS_
3067
}
3068
 
3069
//------------------------------------------------------------------------------
3070
 
3071
XMFINLINE VOID XMStoreFloat
3072
(
3073
    FLOAT*    pDestination,
3074
    FXMVECTOR V
3075
)
3076
{
3077
#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
3078
 
3079
    XMASSERT(pDestination);
3080
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3081
 
3082
    *pDestination = XMVectorGetX( V );
3083
 
3084
#else // _XM_VMX128_INTRINSICS_
3085
#endif // _XM_VMX128_INTRINSICS_
3086
}
3087
 
3088
//------------------------------------------------------------------------------
3089
 
3090
XMFINLINE VOID XMStoreInt2
3091
(
3092
    UINT*    pDestination, 
3093
    FXMVECTOR V
3094
)
3095
{
3096
#if defined(_XM_NO_INTRINSICS_)
3097
 
3098
    XMASSERT(pDestination);
3099
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3100
 
3101
    pDestination[0] = V.vector4_u32[0];
3102
    pDestination[1] = V.vector4_u32[1];
3103
 
3104
#elif defined(_XM_SSE_INTRINSICS_)
3105
 
3106
    XMASSERT(pDestination);
3107
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3108
    pDestination[0] = XMVectorGetIntX( V );
3109
    pDestination[1] = XMVectorGetIntY( V );
3110
 
3111
#else // _XM_VMX128_INTRINSICS_
3112
#endif // _XM_VMX128_INTRINSICS_
3113
}
3114
 
3115
//------------------------------------------------------------------------------
3116
 
3117
XMFINLINE VOID XMStoreInt2A
3118
(
3119
    UINT*    pDestination, 
3120
    FXMVECTOR V
3121
)
3122
{
3123
#if defined(_XM_NO_INTRINSICS_)
3124
 
3125
    XMASSERT(pDestination);
3126
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3127
 
3128
    pDestination[0] = V.vector4_u32[0];
3129
    pDestination[1] = V.vector4_u32[1];
3130
 
3131
#elif defined(_XM_SSE_INTRINSICS_)
3132
 
3133
    XMASSERT(pDestination);
3134
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3135
 
3136
    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
3137
 
3138
#else // _XM_VMX128_INTRINSICS_
3139
#endif // _XM_VMX128_INTRINSICS_
3140
}
3141
 
3142
//------------------------------------------------------------------------------
3143
 
3144
XMFINLINE VOID XMStoreFloat2
3145
(
3146
    XMFLOAT2* pDestination, 
3147
    FXMVECTOR  V
3148
)
3149
{
3150
#if defined(_XM_NO_INTRINSICS_)
3151
 
3152
    XMASSERT(pDestination);
3153
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3154
 
3155
    pDestination->x = V.vector4_f32[0];
3156
    pDestination->y = V.vector4_f32[1];
3157
 
3158
#elif defined(_XM_SSE_INTRINSICS_)
3159
 
3160
    XMASSERT(pDestination);
3161
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3162
 
3163
	XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3164
	_mm_store_ss( &pDestination->x, V );
3165
	_mm_store_ss( &pDestination->y, T );
3166
 
3167
#else // _XM_VMX128_INTRINSICS_
3168
#endif // _XM_VMX128_INTRINSICS_
3169
}
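
//------------------------------------------------------------------------------
// Example (editor's sketch, not part of the original library): XMStoreFloat2
// writes only the x and y components, which makes it the natural way to spill
// a 2D result such as a texture coordinate. "uv" is a hypothetical name.
//
//     XMFLOAT2 uv;
//     XMVECTOR V = XMVectorSet( 0.25f, 0.75f, 0.0f, 0.0f );
//     XMStoreFloat2( &uv, V );    // uv.x == 0.25f, uv.y == 0.75f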
3170
 
3171
//------------------------------------------------------------------------------
3172
 
3173
XMFINLINE VOID XMStoreFloat2A
3174
(
3175
    XMFLOAT2A*   pDestination, 
3176
    FXMVECTOR     V
3177
)
3178
{
3179
#if defined(_XM_NO_INTRINSICS_)
3180
 
3181
    XMASSERT(pDestination);
3182
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3183
 
3184
    pDestination->x = V.vector4_f32[0];
3185
    pDestination->y = V.vector4_f32[1];
3186
 
3187
#elif defined(_XM_SSE_INTRINSICS_)
3188
 
3189
    XMASSERT(pDestination);
3190
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3191
 
3192
	XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3193
	_mm_store_ss( &pDestination->x, V );
3194
	_mm_store_ss( &pDestination->y, T );
3195
 
3196
#else // _XM_VMX128_INTRINSICS_
3197
#endif // _XM_VMX128_INTRINSICS_
3198
}
3199
 
3200
//------------------------------------------------------------------------------
3201
 
3202
XMFINLINE VOID XMStoreHalf2
3203
(
3204
    XMHALF2* pDestination, 
3205
    FXMVECTOR V
3206
)
3207
{
3208
#if defined(_XM_NO_INTRINSICS_)
3209
 
3210
    XMASSERT(pDestination);
3211
 
3212
    pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
3213
    pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);
3214
 
3215
#elif defined(_XM_SSE_INTRINSICS_)
3216
    XMASSERT(pDestination);
3217
    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
3218
    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
3219
#else // _XM_VMX128_INTRINSICS_
3220
#endif // _XM_VMX128_INTRINSICS_
3221
}
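
//------------------------------------------------------------------------------
// Example (editor's sketch, not part of the original library): half storage is
// lossy, so a store/load round trip returns the nearest representable half
// values rather than the original floats. "XMExampleRoundTripHalf2" is a
// hypothetical helper name used only for illustration.

XMFINLINE XMVECTOR XMExampleRoundTripHalf2
(
    FXMVECTOR V
)
{
    XMHALF2 Packed;
    XMStoreHalf2( &Packed, V );     // convert x and y to 16-bit halves
    return XMLoadHalf2( &Packed );  // expand back to 32-bit floats
}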
3222
 
3223
//------------------------------------------------------------------------------
3224
 
3225
XMFINLINE VOID XMStoreShortN2
3226
(
3227
    XMSHORTN2* pDestination, 
3228
    FXMVECTOR   V
3229
)
3230
{
3231
#if defined(_XM_NO_INTRINSICS_)
3232
 
3233
    XMVECTOR N;
3234
    static CONST XMVECTORF32  Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3235
 
3236
    XMASSERT(pDestination);
3237
 
3238
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
3239
    N = XMVectorMultiply(N, Scale.v);
3240
    N = XMVectorRound(N);
3241
 
3242
    pDestination->x = (SHORT)N.vector4_f32[0];
3243
    pDestination->y = (SHORT)N.vector4_f32[1];
3244
 
3245
#elif defined(_XM_SSE_INTRINSICS_)
3246
	XMASSERT(pDestination);
3247
    static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3248
 
3249
	XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
3250
	vResult = _mm_min_ps(vResult,g_XMOne);
3251
    vResult = _mm_mul_ps(vResult,Scale);
3252
    __m128i vResulti = _mm_cvtps_epi32(vResult);
3253
    vResulti = _mm_packs_epi32(vResulti,vResulti);
3254
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3255
#else // _XM_VMX128_INTRINSICS_
3256
#endif // _XM_VMX128_INTRINSICS_
3257
}
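
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): the signed
// normalized path above maps [-1,1] onto [-32767,32767], so
//     +1.0f -> 32767,   -1.0f -> -32767,   0.0f -> 0,
// with inputs outside [-1,1] clamped before the scale and round.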
3258
 
3259
//------------------------------------------------------------------------------
3260
 
3261
XMFINLINE VOID XMStoreShort2
3262
(
3263
    XMSHORT2* pDestination, 
3264
    FXMVECTOR  V
3265
)
3266
{
3267
#if defined(_XM_NO_INTRINSICS_)
3268
 
3269
    XMVECTOR               N;
3270
    static CONST XMVECTOR  Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
3271
    static CONST XMVECTOR  Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3272
 
3273
    XMASSERT(pDestination);
3274
 
3275
    N = XMVectorClamp(V, Min, Max);
3276
    N = XMVectorRound(N);
3277
 
3278
    pDestination->x = (SHORT)N.vector4_f32[0];
3279
    pDestination->y = (SHORT)N.vector4_f32[1];
3280
 
3281
#elif defined(_XM_SSE_INTRINSICS_)
3282
    XMASSERT(pDestination);
3283
    static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
3284
    static CONST XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3285
    // Bounds check
3286
    XMVECTOR vResult = _mm_max_ps(V,Min);
3287
    vResult = _mm_min_ps(vResult,Max);
3288
     // Convert to int with rounding
3289
    __m128i vInt = _mm_cvtps_epi32(vResult);
3290
    // Pack the ints into shorts
3291
    vInt = _mm_packs_epi32(vInt,vInt);
3292
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vInt)[0]);
3293
#else // _XM_VMX128_INTRINSICS_
3294
#endif // _XM_VMX128_INTRINSICS_
3295
}
3296
 
3297
//------------------------------------------------------------------------------
3298
 
3299
XMFINLINE VOID XMStoreUShortN2
3300
(
3301
    XMUSHORTN2* pDestination, 
3302
    FXMVECTOR    V
3303
)
3304
{
3305
#if defined(_XM_NO_INTRINSICS_)
3306
 
3307
    XMVECTOR               N;
3308
    static CONST XMVECTORF32  Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3309
 
3310
    XMASSERT(pDestination);
3311
 
3312
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
3313
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
3314
    N = XMVectorTruncate(N);
3315
 
3316
    pDestination->x = (USHORT)N.vector4_f32[0];
3317
    pDestination->y = (USHORT)N.vector4_f32[1];
3318
 
3319
#elif defined(_XM_SSE_INTRINSICS_)
3320
    XMASSERT(pDestination);
3321
    static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3322
    // Bounds check
3323
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3324
    vResult = _mm_min_ps(vResult,g_XMOne);
3325
    vResult = _mm_mul_ps(vResult,Scale);
3326
     // Convert to int with rounding
3327
    __m128i vInt = _mm_cvtps_epi32(vResult);
3328
    // Since the SSE pack instruction clamps using signed rules,
3329
    // manually extract the values to store them to memory
3330
    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
3331
    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
3332
#else // _XM_VMX128_INTRINSICS_
3333
#endif // _XM_VMX128_INTRINSICS_
3334
}
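
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): the
// unsigned normalized path above clamps to [0,1], scales by 65535 and adds one
// half before truncating, so
//     1.0f -> 65535,   0.5f -> 32768,   0.0f -> 0.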
3335
 
3336
//------------------------------------------------------------------------------
3337
 
3338
XMFINLINE VOID XMStoreUShort2
3339
(
3340
    XMUSHORT2* pDestination, 
3341
    FXMVECTOR   V
3342
)
3343
{
3344
#if defined(_XM_NO_INTRINSICS_)
3345
 
3346
    XMVECTOR               N;
3347
    static CONST XMVECTOR  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3348
 
3349
    XMASSERT(pDestination);
3350
 
3351
    N = XMVectorClamp(V, XMVectorZero(), Max);
3352
    N = XMVectorRound(N);
3353
 
3354
    pDestination->x = (USHORT)N.vector4_f32[0];
3355
    pDestination->y = (USHORT)N.vector4_f32[1];
3356
 
3357
#elif defined(_XM_SSE_INTRINSICS_)
3358
    XMASSERT(pDestination);
3359
    static CONST XMVECTORF32  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3360
    // Bounds check
3361
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3362
    vResult = _mm_min_ps(vResult,Max);
3363
     // Convert to int with rounding
3364
    __m128i vInt = _mm_cvtps_epi32(vResult);
3365
    // Since the SSE pack instruction clamps using signed rules,
3366
    // manually extract the values to store them to memory
3367
    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
3368
    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
3369
#else // _XM_VMX128_INTRINSICS_
3370
#endif // _XM_VMX128_INTRINSICS_
3371
}
3372
 
3373
//------------------------------------------------------------------------------
3374
 
3375
XMFINLINE VOID XMStoreInt3
3376
(
3377
    UINT*    pDestination, 
3378
    FXMVECTOR V
3379
)
3380
{
3381
#if defined(_XM_NO_INTRINSICS_)
3382
 
3383
    XMASSERT(pDestination);
3384
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3385
 
3386
    pDestination[0] = V.vector4_u32[0];
3387
    pDestination[1] = V.vector4_u32[1];
3388
    pDestination[2] = V.vector4_u32[2];
3389
 
3390
#elif defined(_XM_SSE_INTRINSICS_)
3391
 
3392
    XMASSERT(pDestination);
3393
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3394
    pDestination[0] = XMVectorGetIntX( V );
3395
    pDestination[1] = XMVectorGetIntY( V );
3396
    pDestination[2] = XMVectorGetIntZ( V );
3397
 
3398
#else // _XM_VMX128_INTRINSICS_
3399
#endif // _XM_VMX128_INTRINSICS_
3400
}
3401
 
3402
//------------------------------------------------------------------------------
3403
 
3404
XMFINLINE VOID XMStoreInt3A
3405
(
3406
    UINT*    pDestination, 
3407
    FXMVECTOR V
3408
)
3409
{
3410
#if defined(_XM_NO_INTRINSICS_)
3411
 
3412
    XMASSERT(pDestination);
3413
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3414
 
3415
    pDestination[0] = V.vector4_u32[0];
3416
    pDestination[1] = V.vector4_u32[1];
3417
    pDestination[2] = V.vector4_u32[2];
3418
 
3419
#elif defined(_XM_SSE_INTRINSICS_)
3420
 
3421
    XMASSERT(pDestination);
3422
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3423
    pDestination[0] = XMVectorGetIntX( V );
3424
    pDestination[1] = XMVectorGetIntY( V );
3425
    pDestination[2] = XMVectorGetIntZ( V );
3426
 
3427
#else // _XM_VMX128_INTRINSICS_
3428
#endif // _XM_VMX128_INTRINSICS_
3429
}
3430
 
3431
//------------------------------------------------------------------------------
3432
 
3433
XMFINLINE VOID XMStoreFloat3
3434
(
3435
    XMFLOAT3* pDestination, 
3436
    FXMVECTOR V
3437
)
3438
{
3439
#if defined(_XM_NO_INTRINSICS_)
3440
 
3441
    XMASSERT(pDestination);
3442
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3443
 
3444
    pDestination->x = V.vector4_f32[0];
3445
    pDestination->y = V.vector4_f32[1];
3446
    pDestination->z = V.vector4_f32[2];
3447
 
3448
#elif defined(_XM_SSE_INTRINSICS_)
3449
 
3450
    XMASSERT(pDestination);
3451
    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3452
 
3453
	XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
3454
	XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
3455
	_mm_store_ss( &pDestination->x, V );
3456
	_mm_store_ss( &pDestination->y, T1 );
3457
	_mm_store_ss( &pDestination->z, T2 );
3458
 
3459
#else // _XM_VMX128_INTRINSICS_
3460
#endif // _XM_VMX128_INTRINSICS_
3461
}
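
//------------------------------------------------------------------------------
// Example (editor's sketch, not part of the original library): both paths
// above write exactly the 12 bytes of the destination (three scalar stores in
// the SSE case), so XMStoreFloat3 is safe for tightly packed arrays.
// "positions" and "i" are hypothetical names.
//
//     XMFLOAT3 positions[64];
//     XMStoreFloat3( &positions[i], V );  // touches only positions[i]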
3462
 
3463
//------------------------------------------------------------------------------
3464
 
3465
XMFINLINE VOID XMStoreFloat3A
3466
(
3467
    XMFLOAT3A*   pDestination, 
3468
    FXMVECTOR     V
3469
)
3470
{
3471
#if defined(_XM_NO_INTRINSICS_)
3472
 
3473
    XMASSERT(pDestination);
3474
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3475
 
3476
    pDestination->x = V.vector4_f32[0];
3477
    pDestination->y = V.vector4_f32[1];
3478
    pDestination->z = V.vector4_f32[2];
3479
 
3480
#elif defined(_XM_SSE_INTRINSICS_)
3481
 
3482
    XMASSERT(pDestination);
3483
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3484
 
3485
	XMVECTOR T1 = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3486
	XMVECTOR T2 = _mm_unpackhi_ps( V, V );
3487
	_mm_store_ss( &pDestination->x, V );
3488
	_mm_store_ss( &pDestination->y, T1 );
3489
	_mm_store_ss( &pDestination->z, T2 );
3490
 
3491
#else // _XM_VMX128_INTRINSICS_
3492
#endif // _XM_VMX128_INTRINSICS_
3493
}
3494
 
3495
//------------------------------------------------------------------------------
3496
 
3497
XMFINLINE VOID XMStoreUHenDN3
3498
(
3499
    XMUHENDN3* pDestination, 
3500
    FXMVECTOR   V
3501
)
3502
{
3503
#if defined(_XM_NO_INTRINSICS_)
3504
 
3505
    XMVECTOR               N;
3506
    static CONST XMVECTORF32  Scale = {2047.0f, 2047.0f, 1023.0f, 0.0f};
3507
 
3508
    XMASSERT(pDestination);
3509
 
3510
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
3511
    N = XMVectorMultiply(N, Scale.v);
3512
 
3513
    pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
3514
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
3515
                      (((UINT)N.vector4_f32[0] & 0x7FF));
3516
 
3517
#elif defined(_XM_SSE_INTRINSICS_)
3518
    XMASSERT(pDestination);
3519
    static const XMVECTORF32 ScaleUHenDN3 = {2047.0f, 2047.0f*2048.0f,1023.0f*(2048.0f*2048.0f)/2.0f,1.0f};
3520
    static const XMVECTORI32 MaskUHenDN3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
3521
    // Clamp to bounds
3522
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3523
    vResult = _mm_min_ps(vResult,g_XMOne);
3524
    // Scale by multiplication
3525
    vResult = _mm_mul_ps(vResult,ScaleUHenDN3);
3526
    // Convert to int
3527
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3528
    // Mask off any fraction
3529
    vResulti = _mm_and_si128(vResulti,MaskUHenDN3);
3530
    // Do a horizontal or of 3 entries
3531
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
3532
    // i = x|y
3533
    vResulti = _mm_or_si128(vResulti,vResulti2);
3534
    // Move Z to the x position
3535
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
3536
    // Add Z to itself to perform a single bit left shift
3537
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
3538
    // i = x|y|z
3539
    vResulti = _mm_or_si128(vResulti,vResulti2);
3540
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3541
#else // _XM_VMX128_INTRINSICS_
3542
#endif // _XM_VMX128_INTRINSICS_
3543
}
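
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): UHenDN3
// packs x into bits 0-10, y into bits 11-21 and z into bits 22-31, so storing
// (1.0f, 1.0f, 1.0f) produces
//     v = (0x3FF << 22) | (0x7FF << 11) | 0x7FF = 0xFFFFFFFF,
// while (0, 0, 0) produces 0.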
3544
 
3545
//------------------------------------------------------------------------------
3546
 
3547
XMFINLINE VOID XMStoreUHenD3
3548
(
3549
    XMUHEND3* pDestination, 
3550
    FXMVECTOR  V
3551
)
3552
{
3553
#if defined(_XM_NO_INTRINSICS_)
3554
 
3555
    XMVECTOR               N;
3556
    static CONST XMVECTOR  Max = {2047.0f, 2047.0f, 1023.0f, 0.0f};
3557
 
3558
    XMASSERT(pDestination);
3559
 
3560
    N = XMVectorClamp(V, XMVectorZero(), Max);
3561
 
3562
    pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
3563
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
3564
                      (((UINT)N.vector4_f32[0] & 0x7FF));
3565
 
3566
#elif defined(_XM_SSE_INTRINSICS_)
3567
    XMASSERT(pDestination);
3568
    static const XMVECTORF32 MaxUHenD3 = { 2047.0f, 2047.0f, 1023.0f, 1.0f};
3569
    static const XMVECTORF32 ScaleUHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f)/2.0f,1.0f};
3570
    static const XMVECTORI32 MaskUHenD3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
3571
    // Clamp to bounds
3572
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3573
    vResult = _mm_min_ps(vResult,MaxUHenD3);
3574
    // Scale by multiplication
3575
    vResult = _mm_mul_ps(vResult,ScaleUHenD3);
3576
    // Convert to int
3577
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3578
    // Mask off any fraction
3579
    vResulti = _mm_and_si128(vResulti,MaskUHenD3);
3580
    // Do a horizontal or of 3 entries
3581
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
3582
    // i = x|y
3583
    vResulti = _mm_or_si128(vResulti,vResulti2);
3584
    // Move Z to the x position
3585
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
3586
    // Add Z to itself to perform a single bit left shift
3587
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
3588
    // i = x|y|z
3589
    vResulti = _mm_or_si128(vResulti,vResulti2);
3590
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3591
#else // _XM_VMX128_INTRINSICS_
3592
#endif // _XM_VMX128_INTRINSICS_
3593
}
3594
 
3595
//------------------------------------------------------------------------------
3596
 
3597
XMFINLINE VOID XMStoreHenDN3
3598
(
3599
    XMHENDN3* pDestination, 
3600
    FXMVECTOR V
3601
)
3602
{
3603
#if defined(_XM_NO_INTRINSICS_)
3604
 
3605
    XMVECTOR               N;
3606
    static CONST XMVECTORF32  Scale = {1023.0f, 1023.0f, 511.0f, 1.0f};
3607
 
3608
    XMASSERT(pDestination);
3609
 
3610
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
3611
    N = XMVectorMultiply(N, Scale.v);
3612
 
3613
    pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
3614
                      (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
3615
                      (((INT)N.vector4_f32[0] & 0x7FF));
3616
 
3617
#elif defined(_XM_SSE_INTRINSICS_)
3618
    XMASSERT(pDestination);
3619
    static const XMVECTORF32 ScaleHenDN3 = {1023.0f, 1023.0f*2048.0f,511.0f*(2048.0f*2048.0f),1.0f};
3620
    // Clamp to bounds
3621
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
3622
    vResult = _mm_min_ps(vResult,g_XMOne);
3623
    // Scale by multiplication
3624
    vResult = _mm_mul_ps(vResult,ScaleHenDN3);
3625
    // Convert to int
3626
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3627
    // Mask off any fraction
3628
    vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
3629
    // Do a horizontal or of all 4 entries
3630
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
3631
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3632
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
3633
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3634
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3635
#else // _XM_VMX128_INTRINSICS_
3636
#endif // _XM_VMX128_INTRINSICS_
3637
}
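
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): the signed
// HenDN3 encoding scales x and y by 1023 and z by 511, so storing (1, 1, 1)
// writes 0x3FF into the x field (bits 0-10), 0x3FF into the y field
// (bits 11-21) and 0x1FF into the z field (bits 22-31), giving v = 0x7FDFFBFF;
// negative components are kept in two's complement within their fields.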
3638
 
3639
//------------------------------------------------------------------------------
3640
 
3641
XMFINLINE VOID XMStoreHenD3
3642
(
3643
    XMHEND3* pDestination, 
3644
    FXMVECTOR V
3645
)
3646
{
3647
#if defined(_XM_NO_INTRINSICS_)
3648
 
3649
    XMVECTOR               N;
3650
    static CONST XMVECTOR  Min = {-1023.0f, -1023.0f, -511.0f, -1.0f};
3651
    static CONST XMVECTOR  Max = {1023.0f, 1023.0f, 511.0f, 1.0f};
3652
 
3653
    XMASSERT(pDestination);
3654
 
3655
    N = XMVectorClamp(V, Min, Max);
3656
 
3657
    pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
3658
                      (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
3659
                      (((INT)N.vector4_f32[0] & 0x7FF));
3660
 
3661
#elif defined(_XM_SSE_INTRINSICS_)
3662
    XMASSERT(pDestination);
3663
    static const XMVECTORF32 MinHenD3 = {-1023.0f,-1023.0f,-511.0f,-1.0f};
3664
    static const XMVECTORF32 MaxHenD3 = { 1023.0f, 1023.0f, 511.0f, 1.0f};
3665
    static const XMVECTORF32 ScaleHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f),1.0f};
3666
    // Clamp to bounds
3667
    XMVECTOR vResult = _mm_max_ps(V,MinHenD3);
3668
    vResult = _mm_min_ps(vResult,MaxHenD3);
3669
    // Scale by multiplication
3670
    vResult = _mm_mul_ps(vResult,ScaleHenD3);
3671
    // Convert to int
3672
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3673
    // Mask off any fraction
3674
    vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
3675
    // Do a horizontal or of all 4 entries
3676
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
3677
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3678
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
3679
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3680
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3681
#else // _XM_VMX128_INTRINSICS_
3682
#endif // _XM_VMX128_INTRINSICS_
3683
}
3684
 
3685
//------------------------------------------------------------------------------
3686
 
3687
XMFINLINE VOID XMStoreUDHenN3
3688
(
3689
    XMUDHENN3* pDestination, 
3690
    FXMVECTOR   V
3691
)
3692
{
3693
#if defined(_XM_NO_INTRINSICS_)
3694
 
3695
    XMVECTOR               N;
3696
    static CONST XMVECTORF32  Scale = {1023.0f, 2047.0f, 2047.0f, 0.0f};
3697
 
3698
    XMASSERT(pDestination);
3699
 
3700
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
3701
    N = XMVectorMultiply(N, Scale.v);
3702
 
3703
    pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
3704
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
3705
                      (((UINT)N.vector4_f32[0] & 0x3FF));
3706
 
3707
#elif defined(_XM_SSE_INTRINSICS_)
3708
    XMASSERT(pDestination);
3709
    static const XMVECTORF32 ScaleUDHenN3 = {1023.0f,2047.0f*1024.0f,2047.0f*(1024.0f*2048.0f)/2.0f,1.0f};
3710
    static const XMVECTORI32 MaskUDHenN3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
3711
    // Clamp to bounds
3712
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3713
    vResult = _mm_min_ps(vResult,g_XMOne);
3714
    // Scale by multiplication
3715
    vResult = _mm_mul_ps(vResult,ScaleUDHenN3);
3716
    // Convert to int
3717
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3718
    // Mask off any fraction
3719
    vResulti = _mm_and_si128(vResulti,MaskUDHenN3);
3720
    // Do a horizontal or of 3 entries
3721
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
3722
    // i = x|y
3723
    vResulti = _mm_or_si128(vResulti,vResulti2);
3724
    // Move Z to the x position
3725
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
3726
    // Add Z to itself to perform a single bit left shift
3727
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
3728
    // i = x|y|z
3729
    vResulti = _mm_or_si128(vResulti,vResulti2);
3730
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3731
#else // _XM_VMX128_INTRINSICS_
3732
#endif // _XM_VMX128_INTRINSICS_
3733
}
3734
 
3735
//------------------------------------------------------------------------------
3736
 
3737
XMFINLINE VOID XMStoreUDHen3
3738
(
3739
    XMUDHEN3* pDestination, 
3740
    FXMVECTOR  V
3741
)
3742
{
3743
#if defined(_XM_NO_INTRINSICS_)
3744
 
3745
    XMVECTOR               N;
3746
    static CONST XMVECTOR  Max = {1023.0f, 2047.0f, 2047.0f, 0.0f};
3747
 
3748
    XMASSERT(pDestination);
3749
 
3750
    N = XMVectorClamp(V, XMVectorZero(), Max);
3751
 
3752
    pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
3753
                      (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
3754
                      (((UINT)N.vector4_f32[0] & 0x3FF));
3755
 
3756
#elif defined(_XM_SSE_INTRINSICS_)
3757
    XMASSERT(pDestination);
3758
    static const XMVECTORF32 MaxUDHen3 = { 1023.0f, 2047.0f, 2047.0f, 1.0f};
3759
    static const XMVECTORF32 ScaleUDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f)/2.0f,1.0f};
3760
    static const XMVECTORI32 MaskUDHen3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
3761
    // Clamp to bounds
3762
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3763
    vResult = _mm_min_ps(vResult,MaxUDHen3);
3764
    // Scale by multiplication
3765
    vResult = _mm_mul_ps(vResult,ScaleUDHen3);
3766
    // Convert to int
3767
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3768
    // Mask off any fraction
3769
    vResulti = _mm_and_si128(vResulti,MaskUDHen3);
3770
    // Do a horizontal or of 3 entries
3771
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
3772
    // i = x|y
3773
    vResulti = _mm_or_si128(vResulti,vResulti2);
3774
    // Move Z to the x position
3775
    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
3776
    // Add Z to itself to perform a single bit left shift
3777
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
3778
    // i = x|y|z
3779
    vResulti = _mm_or_si128(vResulti,vResulti2);
3780
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3781
#else // _XM_VMX128_INTRINSICS_
3782
#endif // _XM_VMX128_INTRINSICS_
3783
}
3784
 
3785
//------------------------------------------------------------------------------
3786
 
3787
XMFINLINE VOID XMStoreDHenN3
3788
(
3789
    XMDHENN3* pDestination, 
3790
    FXMVECTOR V
3791
)
3792
{
3793
#if defined(_XM_NO_INTRINSICS_)
3794
 
3795
    XMVECTOR               N;
3796
    static CONST XMVECTORF32  Scale = {511.0f, 1023.0f, 1023.0f, 1.0f};
3797
 
3798
    XMASSERT(pDestination);
3799
 
3800
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
3801
    N = XMVectorMultiply(N, Scale.v);
3802
 
3803
    pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
3804
                      (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
3805
                      (((INT)N.vector4_f32[0] & 0x3FF));
3806
 
3807
#elif defined(_XM_SSE_INTRINSICS_)
3808
    XMASSERT(pDestination);
3809
    static const XMVECTORF32 ScaleDHenN3 = {511.0f, 1023.0f*1024.0f,1023.0f*(1024.0f*2048.0f),1.0f};
3810
    // Clamp to bounds
3811
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
3812
    vResult = _mm_min_ps(vResult,g_XMOne);
3813
    // Scale by multiplication
3814
    vResult = _mm_mul_ps(vResult,ScaleDHenN3);
3815
    // Convert to int
3816
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3817
    // Mask off any fraction
3818
    vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
3819
    // Do a horizontal or of all 4 entries
3820
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
3821
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3822
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
3823
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3824
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3825
#else // _XM_VMX128_INTRINSICS_
3826
#endif // _XM_VMX128_INTRINSICS_
3827
}
3828
 
3829
//------------------------------------------------------------------------------
3830
 
3831
XMFINLINE VOID XMStoreDHen3
3832
(
3833
    XMDHEN3* pDestination, 
3834
    FXMVECTOR V
3835
)
3836
{
3837
#if defined(_XM_NO_INTRINSICS_)
3838
 
3839
    XMVECTOR               N;
3840
    static CONST XMVECTOR  Min = {-511.0f, -1023.0f, -1023.0f, -1.0f};
3841
    static CONST XMVECTOR  Max = {511.0f, 1023.0f, 1023.0f, 1.0f};
3842
 
3843
    XMASSERT(pDestination);
3844
 
3845
    N = XMVectorClamp(V, Min, Max);
3846
 
3847
    pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
3848
                      (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
3849
                      (((INT)N.vector4_f32[0] & 0x3FF));
3850
 
3851
#elif defined(_XM_SSE_INTRINSICS_)
3852
    XMASSERT(pDestination);
3853
    static const XMVECTORF32 MinDHen3 = {-511.0f,-1023.0f,-1023.0f,-1.0f};
3854
    static const XMVECTORF32 MaxDHen3 = { 511.0f, 1023.0f, 1023.0f, 1.0f};
3855
    static const XMVECTORF32 ScaleDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f),1.0f};
3856
    // Clamp to bounds
3857
    XMVECTOR vResult = _mm_max_ps(V,MinDHen3);
3858
    vResult = _mm_min_ps(vResult,MaxDHen3);
3859
    // Scale by multiplication
3860
    vResult = _mm_mul_ps(vResult,ScaleDHen3);
3861
    // Convert to int
3862
    __m128i vResulti = _mm_cvttps_epi32(vResult);
3863
    // Mask off any fraction
3864
    vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
3865
    // Do a horizontal or of all 4 entries
3866
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
3867
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3868
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
3869
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
3870
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3871
#else // _XM_VMX128_INTRINSICS_
3872
#endif // _XM_VMX128_INTRINSICS_
3873
}
3874
 
3875
//------------------------------------------------------------------------------
3876
 
3877
XMFINLINE VOID XMStoreU565
3878
(
3879
    XMU565* pDestination,
3880
    FXMVECTOR V
3881
)
3882
{
3883
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
3884
    XMASSERT(pDestination);
3885
    static CONST XMVECTORF32  Max = {31.0f, 63.0f, 31.0f, 0.0f};
3886
    // Bounds check
3887
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3888
    vResult = _mm_min_ps(vResult,Max);
3889
     // Convert to int with rounding
3890
    __m128i vInt = _mm_cvtps_epi32(vResult);
3891
    // No SSE operations will write to 16-bit values, so we have to extract them manually
3892
    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
3893
    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
3894
    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
3895
    pDestination->v = ((z & 0x1F) << 11) |
3896
                      ((y & 0x3F) << 5) |
3897
                      ((x & 0x1F));
3898
#else
3899
    XMVECTOR               N;
3900
    static CONST XMVECTORF32  Max = {31.0f, 63.0f, 31.0f, 0.0f};
3901
 
3902
    XMASSERT(pDestination);
3903
 
3904
    N = XMVectorClamp(V, XMVectorZero(), Max.v);
3905
    N = XMVectorRound(N);
3906
 
3907
    pDestination->v = (((USHORT)N.vector4_f32[2] & 0x1F) << 11) |
3908
                      (((USHORT)N.vector4_f32[1] & 0x3F) << 5) |
3909
                      (((USHORT)N.vector4_f32[0] & 0x1F));
3910
#endif // !_XM_SSE_INTRINSICS_
3911
}
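
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): XMStoreU565
// expects raw component values in [0,31], [0,63] and [0,31] rather than a
// normalized [0,1] range, so storing (31.0f, 63.0f, 31.0f) yields
//     v = (31 << 11) | (63 << 5) | 31 = 0xFFFF.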
3912
 
3913
//------------------------------------------------------------------------------
3914
 
3915
XMFINLINE VOID XMStoreFloat3PK
3916
(
3917
    XMFLOAT3PK* pDestination,
3918
    FXMVECTOR V
3919
)
3920
{
3921
    UINT I, Sign, j;
3922
    UINT IValue[3];
3923
    UINT Result[3];
3924
 
3925
    XMASSERT(pDestination);
3926
 
3927
    XMStoreFloat3( (XMFLOAT3*)&IValue, V );
3928
 
3929
    // X & Y Channels (5-bit exponent, 6-bit mantissa)
3930
    for(j=0; j < 2; ++j)
3931
    {
3932
        Sign = IValue[j] & 0x80000000;
3933
        I = IValue[j] & 0x7FFFFFFF;
3934
 
3935
        if ((I & 0x7F800000) == 0x7F800000)
3936
        {
3937
            // INF or NAN
3938
            Result[j] = 0x7c0;
3939
            if (( I & 0x7FFFFF ) != 0)
3940
            {
3941
                Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f);
3942
            }
3943
            else if ( Sign )
3944
            {
3945
                // -INF is clamped to 0 since 3PK is positive only
3946
                Result[j] = 0;
3947
            }
3948
        }
3949
        else if ( Sign )
3950
        {
3951
            // 3PK is positive only, so clamp to zero
3952
            Result[j] = 0;
3953
        }
3954
        else if (I > 0x477E0000U)
3955
        {
3956
            // The number is too large to be represented as a float11, set to max
3957
            Result[j] = 0x7BF;
3958
        }
3959
        else
3960
        {
3961
            if (I < 0x38800000U)
3962
            {
3963
                // The number is too small to be represented as a normalized float11
3964
                // Convert it to a denormalized value.
3965
                UINT Shift = 113U - (I >> 23U);
3966
                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
3967
            }
3968
            else
3969
            {
3970
                // Rebias the exponent to represent the value as a normalized float11
3971
                I += 0xC8000000U;
3972
            }
3973
 
3974
            Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU;
3975
        }
3976
    }
3977
 
3978
    // Z Channel (5-bit exponent, 5-bit mantissa)
3979
    Sign = IValue[2] & 0x80000000;
3980
    I = IValue[2] & 0x7FFFFFFF;
3981
 
3982
    if ((I & 0x7F800000) == 0x7F800000)
3983
    {
3984
        // INF or NAN
3985
        Result[2] = 0x3e0;
3986
        if ( I & 0x7FFFFF )
3987
        {
3988
            Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f);
3989
        }
3990
        else if ( Sign )
3991
        {
3992
            // -INF is clamped to 0 since 3PK is positive only
3993
            Result[2] = 0;
3994
        }
3995
    }
3996
    else if ( Sign )
3997
    {
3998
        // 3PK is positive only, so clamp to zero
3999
        Result[2] = 0;
4000
    }
4001
    else if (I > 0x477C0000U)
4002
    {
4003
        // The number is too large to be represented as a float10, set to max
4004
        Result[2] = 0x3df;
4005
    }
4006
    else
4007
    {
4008
        if (I < 0x38800000U)
4009
        {
4010
            // The number is too small to be represented as a normalized float10
4011
            // Convert it to a denormalized value.
4012
            UINT Shift = 113U - (I >> 23U);
4013
            I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
4014
        }
4015
        else
4016
        {
4017
            // Rebias the exponent to represent the value as a normalized float10
4018
            I += 0xC8000000U;
4019
        }
4020
 
4021
        Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU;
4022
    }
4023
 
4024
    // Pack Result into memory
4025
    pDestination->v = (Result[0] & 0x7ff)
4026
                      | ( (Result[1] & 0x7ff) << 11 )
4027
                      | ( (Result[2] & 0x3ff) << 22 );
4028
}
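
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): for the X
// and Y channels above, 1.0f (0x3F800000) gains 0xC8000000 during the rebias,
// wrapping to 0x07800000, and since the rounding term is zero here
//     ((0x07800000 + 0xFFFF) >> 17) & 0x7FF = 0x3C0,
// a float11 with exponent field 15 (the bias) and mantissa 0, which decodes
// back to 1.0.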
4029
 
4030
 
4031
//------------------------------------------------------------------------------
4032
 
4033
XMFINLINE VOID XMStoreFloat3SE
4034
(
4035
    XMFLOAT3SE* pDestination,
4036
    FXMVECTOR V
4037
)
4038
{
4039
    UINT I, Sign, j, T;
4040
    UINT IValue[3];
4041
    UINT Frac[3];
4042
    UINT Exp[3];
4043
 
4044
    XMASSERT(pDestination);
4045
 
4046
    XMStoreFloat3( (XMFLOAT3*)&IValue, V );
4047
 
4048
    // X, Y, Z Channels (5-bit exponent, 9-bit mantissa)
4049
    for(j=0; j < 3; ++j)
4050
    {
4051
        Sign = IValue[j] & 0x80000000;
4052
        I = IValue[j] & 0x7FFFFFFF;
4053
 
4054
        if ((I & 0x7F800000) == 0x7F800000)
4055
        {
4056
            // INF or NAN
4057
            Exp[j] = 0x1f;
4058
            if (( I & 0x7FFFFF ) != 0)
4059
            {
4060
                Frac[j] = ((I>>14)|(I>>5)|(I))&0x1ff;
4061
            }
4062
            else if ( Sign )
4063
            {
4064
                // -INF is clamped to 0 since 3SE is positive only
4065
                Exp[j] = Frac[j] = 0;
4066
            }
4067
        }
4068
        else if ( Sign )
4069
        {
4070
            // 3SE is positive only, so clamp to zero
4071
            Exp[j] = Frac[j] = 0;
4072
        }
4073
        else if (I > 0x477FC000U)
4074
        {
4075
            // The number is too large, set to max
4076
            Exp[j] = 0x1e;
4077
            Frac[j] = 0x1ff;
4078
        }
4079
        else
4080
        {
4081
            if (I < 0x38800000U)
4082
            {
4083
                // The number is too small to be stored as a normalized value in this format.
4084
                // Convert it to a denormalized value.
4085
                UINT Shift = 113U - (I >> 23U);
4086
                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
4087
            }
4088
            else
4089
            {
4090
                // Rebias the exponent to represent the value in normalized form for this format
4091
                I += 0xC8000000U;
4092
            }
4093
 
4094
            T = ((I + 0x1FFFU + ((I >> 14U) & 1U)) >> 14U)&0x3fffU;
4095
 
4096
            Exp[j] = (T & 0x3E00) >> 9;
4097
            Frac[j] = T & 0x1ff;
4098
        }
4099
    }
4100
 
4101
    // Adjust to a shared exponent
4102
    T = XMMax( Exp[0], XMMax( Exp[1], Exp[2] ) );
4103
 
4104
    Frac[0] = Frac[0] >> (T - Exp[0]);
4105
    Frac[1] = Frac[1] >> (T - Exp[1]);
4106
    Frac[2] = Frac[2] >> (T - Exp[2]);
4107
 
4108
    // Store packed into memory
4109
    pDestination->xm = Frac[0];
4110
    pDestination->ym = Frac[1];
4111
    pDestination->zm = Frac[2];
4112
    pDestination->e = T;
4113
}
4114
 
4115
//------------------------------------------------------------------------------
4116
 
4117
XMFINLINE VOID XMStoreInt4
4118
(
4119
    UINT*    pDestination, 
4120
    FXMVECTOR V
4121
)
4122
{
4123
#if defined(_XM_NO_INTRINSICS_)
4124
 
4125
    XMASSERT(pDestination);
4126
 
4127
    pDestination[0] = V.vector4_u32[0];
4128
    pDestination[1] = V.vector4_u32[1];
4129
    pDestination[2] = V.vector4_u32[2];
4130
    pDestination[3] = V.vector4_u32[3];
4131
 
4132
#elif defined(_XM_SSE_INTRINSICS_)
4133
    XMASSERT(pDestination);
4134
 
4135
    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
4136
 
4137
#else // _XM_VMX128_INTRINSICS_
4138
#endif // _XM_VMX128_INTRINSICS_
4139
}
4140
 
4141
//------------------------------------------------------------------------------
4142
 
4143
XMFINLINE VOID XMStoreInt4A
4144
(
4145
    UINT*    pDestination, 
4146
    FXMVECTOR V
4147
)
4148
{
4149
#if defined(_XM_NO_INTRINSICS_)
4150
 
4151
    XMASSERT(pDestination);
4152
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4153
 
4154
    pDestination[0] = V.vector4_u32[0];
4155
    pDestination[1] = V.vector4_u32[1];
4156
    pDestination[2] = V.vector4_u32[2];
4157
    pDestination[3] = V.vector4_u32[3];
4158
 
4159
#elif defined(_XM_SSE_INTRINSICS_)
4160
    XMASSERT(pDestination);
4161
 
4162
    _mm_store_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
4163
 
4164
#else // _XM_VMX128_INTRINSICS_
4165
#endif // _XM_VMX128_INTRINSICS_
4166
}
4167
 
4168
//------------------------------------------------------------------------------
4169
 
4170
XMFINLINE VOID XMStoreInt4NC
4171
(
4172
    UINT*    pDestination, 
4173
    FXMVECTOR V
4174
)
4175
{
4176
#if defined(_XM_NO_INTRINSICS_)
4177
 
4178
    XMASSERT(pDestination);
4179
 
4180
    pDestination[0] = V.vector4_u32[0];
4181
    pDestination[1] = V.vector4_u32[1];
4182
    pDestination[2] = V.vector4_u32[2];
4183
    pDestination[3] = V.vector4_u32[3];
4184
 
4185
#elif defined(_XM_SSE_INTRINSICS_)
4186
    XMASSERT(pDestination);
4187
 
4188
    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
4189
 
4190
#else // _XM_VMX128_INTRINSICS_
4191
#endif // _XM_VMX128_INTRINSICS_
4192
}
4193
 
4194
//------------------------------------------------------------------------------
4195
 
4196
XMFINLINE VOID XMStoreFloat4
4197
(
4198
    XMFLOAT4* pDestination, 
4199
    FXMVECTOR  V
4200
)
4201
{
4202
#if defined(_XM_NO_INTRINSICS_)
4203
 
4204
    XMASSERT(pDestination);
4205
 
4206
    pDestination->x = V.vector4_f32[0];
4207
    pDestination->y = V.vector4_f32[1];
4208
    pDestination->z = V.vector4_f32[2];
4209
    pDestination->w = V.vector4_f32[3];
4210
 
4211
#elif defined(_XM_SSE_INTRINSICS_)
4212
    XMASSERT(pDestination);
4213
 
4214
    _mm_storeu_ps( &pDestination->x, V );
4215
 
4216
#else // _XM_VMX128_INTRINSICS_
4217
#endif // _XM_VMX128_INTRINSICS_
4218
}
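
//------------------------------------------------------------------------------
// Example (editor's sketch, not part of the original library): XMStoreFloat4
// uses an unaligned store, so the destination only needs natural float
// alignment; prefer XMStoreFloat4A below when the target is known to be
// 16-byte aligned. "XMExampleStoreSaturated4" is a hypothetical helper name.

XMFINLINE VOID XMExampleStoreSaturated4
(
    XMFLOAT4* pDestination,
    FXMVECTOR V
)
{
    // Clamp to [0,1] before spilling, e.g. for a color about to be quantized.
    XMStoreFloat4( pDestination, XMVectorSaturate( V ) );
}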
4219
 
4220
//------------------------------------------------------------------------------
4221
 
4222
XMFINLINE VOID XMStoreFloat4A
4223
(
4224
    XMFLOAT4A*   pDestination, 
4225
    FXMVECTOR     V
4226
)
4227
{
4228
#if defined(_XM_NO_INTRINSICS_)
4229
 
4230
    XMASSERT(pDestination);
4231
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4232
 
4233
    pDestination->x = V.vector4_f32[0];
4234
    pDestination->y = V.vector4_f32[1];
4235
    pDestination->z = V.vector4_f32[2];
4236
    pDestination->w = V.vector4_f32[3];
4237
 
4238
#elif defined(_XM_SSE_INTRINSICS_)
4239
    XMASSERT(pDestination);
4240
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4241
 
4242
    _mm_store_ps( &pDestination->x, V );
4243
#else // _XM_VMX128_INTRINSICS_
4244
#endif // _XM_VMX128_INTRINSICS_
4245
}
4246
 
4247
//------------------------------------------------------------------------------
4248
 
4249
XMFINLINE VOID XMStoreFloat4NC
4250
(
4251
    XMFLOAT4* pDestination, 
4252
    FXMVECTOR  V
4253
)
4254
{
4255
#if defined(_XM_NO_INTRINSICS_)
4256
 
4257
    XMASSERT(pDestination);
4258
 
4259
    pDestination->x = V.vector4_f32[0];
4260
    pDestination->y = V.vector4_f32[1];
4261
    pDestination->z = V.vector4_f32[2];
4262
    pDestination->w = V.vector4_f32[3];
4263
 
4264
#elif defined(_XM_SSE_INTRINSICS_)
4265
    XMASSERT(pDestination);
4266
 
4267
    _mm_storeu_ps( &pDestination->x, V );
4268
 
4269
#else // _XM_VMX128_INTRINSICS_
4270
#endif // _XM_VMX128_INTRINSICS_
4271
}
4272
 
4273
//------------------------------------------------------------------------------
4274
 
4275
XMFINLINE VOID XMStoreHalf4
4276
(
4277
    XMHALF4* pDestination, 
4278
    FXMVECTOR V
4279
)
4280
{
4281
#if defined(_XM_NO_INTRINSICS_) 
4282
 
4283
    XMASSERT(pDestination);
4284
 
4285
    pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
4286
    pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);
4287
    pDestination->z = XMConvertFloatToHalf(V.vector4_f32[2]);
4288
    pDestination->w = XMConvertFloatToHalf(V.vector4_f32[3]);
4289
 
4290
#elif defined(_XM_SSE_INTRINSICS_)
4291
    XMASSERT(pDestination);
4292
    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
4293
    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
4294
    pDestination->z = XMConvertFloatToHalf(XMVectorGetZ(V));
4295
    pDestination->w = XMConvertFloatToHalf(XMVectorGetW(V));
4296
#else // _XM_VMX128_INTRINSICS_
4297
#endif // _XM_VMX128_INTRINSICS_
4298
}
4299
 
4300
//------------------------------------------------------------------------------
4301
 
4302
XMFINLINE VOID XMStoreShortN4
4303
(
4304
    XMSHORTN4* pDestination, 
4305
    FXMVECTOR   V
4306
)
4307
{
4308
#if defined(_XM_NO_INTRINSICS_)
4309
 
4310
    XMVECTOR               N;
4311
    static CONST XMVECTORF32  Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4312
 
4313
    XMASSERT(pDestination);
4314
 
4315
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
4316
    N = XMVectorMultiply(N, Scale.v);
4317
    N = XMVectorRound(N);
4318
 
4319
    pDestination->x = (SHORT)N.vector4_f32[0];
4320
    pDestination->y = (SHORT)N.vector4_f32[1];
4321
    pDestination->z = (SHORT)N.vector4_f32[2];
4322
    pDestination->w = (SHORT)N.vector4_f32[3];
4323
 
4324
#elif defined(_XM_SSE_INTRINSICS_)
4325
    XMASSERT(pDestination);
4326
    static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4327
 
4328
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
4329
    vResult = _mm_min_ps(vResult,g_XMOne);
4330
    vResult = _mm_mul_ps(vResult,Scale);
4331
    __m128i vResulti = _mm_cvtps_epi32(vResult);
4332
    vResulti = _mm_packs_epi32(vResulti,vResulti);
4333
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4334
#else // _XM_VMX128_INTRINSICS_
4335
#endif // _XM_VMX128_INTRINSICS_
4336
}
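
//------------------------------------------------------------------------------
// Example (editor's sketch, not part of the original library): quantizing a
// vector to XMSHORTN4 and expanding it again. Components survive to within
// 1/32767 of their original value; anything outside [-1,1] is clamped on the
// way in. "XMExampleRoundTripShortN4" is a hypothetical helper name.

XMFINLINE XMVECTOR XMExampleRoundTripShortN4
(
    FXMVECTOR V
)
{
    XMSHORTN4 Packed;
    XMStoreShortN4( &Packed, V );     // clamp, scale by 32767, round
    return XMLoadShortN4( &Packed );  // scale back by 1/32767
}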
4337
 
4338
//------------------------------------------------------------------------------
4339
 
4340
XMFINLINE VOID XMStoreShort4
4341
(
4342
    XMSHORT4* pDestination, 
4343
    FXMVECTOR  V
4344
)
4345
{
4346
#if defined(_XM_NO_INTRINSICS_)
4347
 
4348
    XMVECTOR               N;
4349
    static CONST XMVECTOR  Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
4350
    static CONST XMVECTOR  Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4351
 
4352
    XMASSERT(pDestination);
4353
 
4354
    N = XMVectorClamp(V, Min, Max);
4355
    N = XMVectorRound(N);
4356
 
4357
    pDestination->x = (SHORT)N.vector4_f32[0];
4358
    pDestination->y = (SHORT)N.vector4_f32[1];
4359
    pDestination->z = (SHORT)N.vector4_f32[2];
4360
    pDestination->w = (SHORT)N.vector4_f32[3];
4361
 
4362
#elif defined(_XM_SSE_INTRINSICS_)
4363
    XMASSERT(pDestination);
4364
    static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
4365
    static CONST XMVECTORF32  Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4366
    // Bounds check
4367
    XMVECTOR vResult = _mm_max_ps(V,Min);
4368
    vResult = _mm_min_ps(vResult,Max);
4369
     // Convert to int with rounding
4370
    __m128i vInt = _mm_cvtps_epi32(vResult);
4371
    // Pack the ints into shorts
4372
    vInt = _mm_packs_epi32(vInt,vInt);
4373
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vInt)[0]);
4374
#else // _XM_VMX128_INTRINSICS_
4375
#endif // _XM_VMX128_INTRINSICS_
4376
}
4377
 
4378
//------------------------------------------------------------------------------
4379
 
4380
XMFINLINE VOID XMStoreUShortN4
4381
(
4382
    XMUSHORTN4* pDestination, 
4383
    FXMVECTOR    V
4384
)
4385
{
4386
#if defined(_XM_NO_INTRINSICS_)
4387
 
4388
    XMVECTOR               N;
4389
    static CONST XMVECTORF32  Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
4390
 
4391
    XMASSERT(pDestination);
4392
 
4393
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
4394
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
4395
    N = XMVectorTruncate(N);
4396
 
4397
    pDestination->x = (USHORT)N.vector4_f32[0];
4398
    pDestination->y = (USHORT)N.vector4_f32[1];
4399
    pDestination->z = (USHORT)N.vector4_f32[2];
4400
    pDestination->w = (USHORT)N.vector4_f32[3];
4401
 
4402
#elif defined(_XM_SSE_INTRINSICS_)
4403
    XMASSERT(pDestination);
4404
    static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
4405
    // Bounds check
4406
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4407
    vResult = _mm_min_ps(vResult,g_XMOne);
4408
    vResult = _mm_mul_ps(vResult,Scale);
4409
    // Convert to int with rounding
4410
    __m128i vInt = _mm_cvtps_epi32(vResult);
4411
    // Since the SSE pack instruction clamps using signed rules,
4412
    // manually extract the values to store them to memory
4413
    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
4414
    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
4415
    pDestination->z = static_cast<SHORT>(_mm_extract_epi16(vInt,4));
4416
    pDestination->w = static_cast<SHORT>(_mm_extract_epi16(vInt,6));
4417
#else // _XM_VMX128_INTRINSICS_
4418
#endif // _XM_VMX128_INTRINSICS_
4419
}
4420
 
4421
//------------------------------------------------------------------------------
4422
 
4423
XMFINLINE VOID XMStoreUShort4
4424
(
4425
    XMUSHORT4* pDestination, 
4426
    FXMVECTOR   V
4427
)
4428
{
4429
#if defined(_XM_NO_INTRINSICS_)
4430
 
4431
    XMVECTOR               N;
4432
    static CONST XMVECTOR  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
4433
 
4434
    XMASSERT(pDestination);
4435
 
4436
    N = XMVectorClamp(V, XMVectorZero(), Max);
4437
    N = XMVectorRound(N);
4438
 
4439
    pDestination->x = (USHORT)N.vector4_f32[0];
4440
    pDestination->y = (USHORT)N.vector4_f32[1];
4441
    pDestination->z = (USHORT)N.vector4_f32[2];
4442
    pDestination->w = (USHORT)N.vector4_f32[3];
4443
 
4444
#elif defined(_XM_SSE_INTRINSICS_)
4445
    XMASSERT(pDestination);
4446
    static CONST XMVECTORF32  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
4447
    // Bounds check
4448
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4449
    vResult = _mm_min_ps(vResult,Max);
4450
     // Convert to int with rounding
4451
    __m128i vInt = _mm_cvtps_epi32(vResult);
4452
    // Since the SSE pack instruction clamps using signed rules,
4453
    // manually extract the values to store them to memory
4454
    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
4455
    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
4456
    pDestination->z = static_cast<SHORT>(_mm_extract_epi16(vInt,4));
4457
    pDestination->w = static_cast<SHORT>(_mm_extract_epi16(vInt,6));
4458
#else // _XM_VMX128_INTRINSICS_
4459
#endif // _XM_VMX128_INTRINSICS_
4460
}
4461
 
4462
//------------------------------------------------------------------------------
4463
 
4464
XMFINLINE VOID XMStoreXIcoN4
4465
(
4466
    XMXICON4*  pDestination, 
4467
    FXMVECTOR   V
4468
)
4469
{
4470
#if defined(_XM_NO_INTRINSICS_)
4471
 
4472
    XMVECTOR               N;
4473
    static CONST XMVECTORF32  Min = {-1.0f, -1.0f, -1.0f, 0.0f};
4474
    static CONST XMVECTORF32  Scale = {524287.0f, 524287.0f, 524287.0f, 15.0f};
4475
 
4476
    XMASSERT(pDestination);
4477
 
4478
    N = XMVectorClamp(V, Min.v, g_XMOne.v);
4479
    N = XMVectorMultiply(N, Scale.v);
4480
    N = XMVectorRound(N);
4481
 
4482
    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
4483
                       (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4484
                       (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4485
                       (((INT64)N.vector4_f32[0] & 0xFFFFF));
4486
 
4487
#elif defined(_XM_SSE_INTRINSICS_)
4488
    XMASSERT(pDestination);
4489
    // Note: Masks are x,w,y and z
4490
    static const XMVECTORF32 MinXIcoN4 = {-1.0f, 0.0f,-1.0f,-1.0f};
4491
    static const XMVECTORF32 ScaleXIcoN4 = {524287.0f,15.0f*4096.0f*65536.0f*0.5f,524287.0f*4096.0f,524287.0f};
4492
    static const XMVECTORI32 MaskXIcoN4 = {0xFFFFF,0xF<<((60-32)-1),0xFFFFF000,0xFFFFF};
4493
 
4494
    // Clamp to bounds
4495
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4496
    vResult = _mm_max_ps(vResult,MinXIcoN4);
4497
    vResult = _mm_min_ps(vResult,g_XMOne);
4498
    // Scale by multiplication
4499
    vResult = _mm_mul_ps(vResult,ScaleXIcoN4);
4500
    // Convert to integer (w is unsigned)
4501
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4502
    // Mask off unused bits
4503
    vResulti = _mm_and_si128(vResulti,MaskXIcoN4);
4504
    // Isolate Y
4505
    __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
4506
    // Double Y (Really W) to fixup for unsigned conversion
4507
    vResulti = _mm_add_epi32(vResulti,vResulti2);
4508
    // Shift y and z to straddle the 32-bit boundary
4509
    vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4510
    // Shift it into place
4511
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4512
    // i = x|y<<20|z<<40|w<<60
4513
    vResulti = _mm_or_si128(vResulti,vResulti2);
4514
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4515
#else // _XM_VMX128_INTRINSICS_
4516
#endif // _XM_VMX128_INTRINSICS_
4517
}
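
//------------------------------------------------------------------------------
// Worked example (editor's note, not part of the original library): XIcoN4
// packs three signed 20-bit normalized components and an unsigned 4-bit w into
// 64 bits as x | y << 20 | z << 40 | w << 60, so storing (1, 1, 1, 1) produces
//     v = (0xFULL << 60) | (0x7FFFFULL << 40) | (0x7FFFFULL << 20) | 0x7FFFF.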
4518
 
4519
//------------------------------------------------------------------------------
4520
 
4521
XMFINLINE VOID XMStoreXIco4
4522
(
4523
    XMXICO4*  pDestination, 
4524
    FXMVECTOR  V
4525
)
4526
{
4527
#if defined(_XM_NO_INTRINSICS_)
4528
 
4529
    XMVECTOR N;
4530
    static CONST XMVECTORF32 Min = {-524287.0f, -524287.0f, -524287.0f, 0.0f};
4531
    static CONST XMVECTORF32 Max = {524287.0f, 524287.0f, 524287.0f, 15.0f};
4532
 
4533
    XMASSERT(pDestination);
4534
    N = XMVectorClamp(V, Min.v, Max.v);
4535
    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
4536
                       (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4537
                       (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4538
                       (((INT64)N.vector4_f32[0] & 0xFFFFF));
4539
 
4540
#elif defined(_XM_SSE_INTRINSICS_)
4541
    XMASSERT(pDestination);
4542
    // Note: Masks are x,w,y and z
4543
    static const XMVECTORF32 MinXIco4 = {-524287.0f, 0.0f,-524287.0f,-524287.0f};
4544
    static const XMVECTORF32 MaxXIco4 = { 524287.0f,15.0f, 524287.0f, 524287.0f};
4545
    static const XMVECTORF32 ScaleXIco4 = {1.0f,4096.0f*65536.0f*0.5f,4096.0f,1.0f};
4546
    static const XMVECTORI32 MaskXIco4 = {0xFFFFF,0xF<<((60-1)-32),0xFFFFF000,0xFFFFF};
4547
    // Clamp to bounds
4548
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4549
    vResult = _mm_max_ps(vResult,MinXIco4);
4550
    vResult = _mm_min_ps(vResult,MaxXIco4);
4551
    // Scale by multiplication
4552
    vResult = _mm_mul_ps(vResult,ScaleXIco4);
4553
    // Convert to int
4554
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4555
    // Mask off any fraction
4556
    vResulti = _mm_and_si128(vResulti,MaskXIco4);
4557
    // Isolate Y
4558
    __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
4559
    // Double Y (Really W) to fixup for unsigned conversion
4560
    vResulti = _mm_add_epi32(vResulti,vResulti2);
4561
    // Shift y and z to straddle the 32-bit boundary
4562
    vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4563
    // Shift it into place
4564
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4565
    // i = x|y<<20|z<<40|w<<60
4566
    vResulti = _mm_or_si128(vResulti,vResulti2);
4567
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4568
#else // _XM_VMX128_INTRINSICS_
4569
#endif // _XM_VMX128_INTRINSICS_
4570
}
4571
 
4572
//------------------------------------------------------------------------------
4573
 
4574
XMFINLINE VOID XMStoreUIcoN4
4575
(
4576
    XMUICON4*  pDestination, 
4577
    FXMVECTOR   V
4578
)
4579
{
4580
    #define XM_URange       ((FLOAT)(1 << 20))
4581
    #define XM_URangeDiv2   ((FLOAT)(1 << 19))
4582
    #define XM_UMaxXYZ      ((FLOAT)((1 << 20) - 1))
4583
    #define XM_UMaxW        ((FLOAT)((1 << 4) - 1))
4584
    #define XM_ScaleXYZ     (-(FLOAT)((1 << 20) - 1) / XM_PACK_FACTOR)
4585
    #define XM_ScaleW       (-(FLOAT)((1 << 4) - 1) / XM_PACK_FACTOR)
4586
    #define XM_Scale        (-1.0f / XM_PACK_FACTOR)
4587
    #define XM_Offset       (3.0f)
4588
 
4589
#if defined(_XM_NO_INTRINSICS_)
4590
 
4591
    XMVECTOR               N;
4592
    static CONST XMVECTORF32 Scale = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};
4593
 
4594
    XMASSERT(pDestination);
4595
 
4596
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
4597
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
4598
 
4599
    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
4600
                       (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4601
                       (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4602
                       (((UINT64)N.vector4_f32[0] & 0xFFFFF));
4603
 
4604
#elif defined(_XM_SSE_INTRINSICS_)
4605
    XMASSERT(pDestination);
4606
    // Note: Masks are x,w,y and z
4607
    static const XMVECTORF32 ScaleUIcoN4 = {1048575.0f,15.0f*4096.0f*65536.0f,1048575.0f*4096.0f,1048575.0f};
4608
    static const XMVECTORI32 MaskUIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
4609
    static const XMVECTORF32 AddUIcoN4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};
4610
    // Clamp to bounds
4611
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4612
    vResult = _mm_max_ps(vResult,g_XMZero);
4613
    vResult = _mm_min_ps(vResult,g_XMOne);
4614
    // Scale by multiplication
4615
    vResult = _mm_mul_ps(vResult,ScaleUIcoN4);
4616
    // Adjust for unsigned entries
4617
    vResult = _mm_add_ps(vResult,AddUIcoN4);
4618
    // Convert to int
4619
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4620
    // Fix the signs on the unsigned entries
4621
    vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
4622
    // Mask off any fraction
4623
    vResulti = _mm_and_si128(vResulti,MaskUIcoN4);
4624
    // Shift y and z to straddle the 32-bit boundary
4625
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4626
    // Shift it into place
4627
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4628
    // i = x|y<<20|z<<40|w<<60
4629
    vResulti = _mm_or_si128(vResulti,vResulti2);
4630
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4631
#else // _XM_VMX128_INTRINSICS_
4632
#endif // _XM_VMX128_INTRINSICS_
4633
 
4634
    #undef XM_URange
4635
    #undef XM_URangeDiv2
4636
    #undef XM_UMaxXYZ
4637
    #undef XM_UMaxW
4638
    #undef XM_ScaleXYZ
4639
    #undef XM_ScaleW
4640
    #undef XM_Scale
4641
    #undef XM_Offset
4642
}
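
// Usage sketch (illustrative only; the helper name is hypothetical and not
// part of the library).  It round-trips a normalized vector through the
// 20:20:20:4 unsigned format; XMVectorSet and XMLoadUIcoN4 are declared
// elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreUIcoN4()
{
    XMUICON4 Packed;
    // x,y,z map to 20-bit unorm fields, w to a 4-bit unorm field
    XMStoreUIcoN4(&Packed, XMVectorSet(1.0f, 0.0f, 0.0f, 1.0f));
    // Packed.v == 0xF0000000000FFFFF; expand back to approximately (1,0,0,1)
    return XMLoadUIcoN4(&Packed);
}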
4643
 
4644
//------------------------------------------------------------------------------
4645
 
4646
XMFINLINE VOID XMStoreUIco4
4647
(
4648
    XMUICO4*  pDestination, 
4649
    FXMVECTOR  V
4650
)
4651
{
4652
    #define XM_Scale        (-1.0f / XM_PACK_FACTOR)
4653
    #define XM_URange       ((FLOAT)(1 << 20))
4654
    #define XM_URangeDiv2   ((FLOAT)(1 << 19))
4655
 
4656
#if defined(_XM_NO_INTRINSICS_)
4657
 
4658
    XMVECTOR               N;
4659
    static CONST XMVECTOR  Max = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};
4660
 
4661
    XMASSERT(pDestination);
4662
 
4663
    N = XMVectorClamp(V, XMVectorZero(), Max);
4664
    N = XMVectorRound(N);
4665
 
4666
    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
4667
                       (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4668
                       (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4669
                       (((UINT64)N.vector4_f32[0] & 0xFFFFF));
4670
 
4671
#elif defined(_XM_SSE_INTRINSICS_)
4672
    XMASSERT(pDestination);
4673
    // Note: Masks are x,w,y and z
4674
    static const XMVECTORF32 MaxUIco4 = { 1048575.0f, 15.0f, 1048575.0f, 1048575.0f};
4675
    static const XMVECTORF32 ScaleUIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
4676
    static const XMVECTORI32 MaskUIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
4677
    static const XMVECTORF32 AddUIco4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};
4678
    // Clamp to bounds
4679
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4680
    vResult = _mm_max_ps(vResult,g_XMZero);
4681
    vResult = _mm_min_ps(vResult,MaxUIco4);
4682
    // Scale by multiplication
4683
    vResult = _mm_mul_ps(vResult,ScaleUIco4);
4684
    vResult = _mm_add_ps(vResult,AddUIco4);
4685
    // Convert to int
4686
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4687
    vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
4688
    // Mask off any fraction
4689
    vResulti = _mm_and_si128(vResulti,MaskUIco4);
4690
    // Shift y and z to straddle the 32-bit boundary
4691
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4692
    // Shift it into place
4693
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4694
    // i = x|y<<20|z<<40|w<<60
4695
    vResulti = _mm_or_si128(vResulti,vResulti2);
4696
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4697
#else // _XM_VMX128_INTRINSICS_
4698
#endif // _XM_VMX128_INTRINSICS_
4699
 
4700
    #undef XM_Scale
4701
    #undef XM_URange
4702
    #undef XM_URangeDiv2
4703
}
4704
 
4705
//------------------------------------------------------------------------------
4706
 
4707
XMFINLINE VOID XMStoreIcoN4
4708
(
4709
    XMICON4*  pDestination, 
4710
    FXMVECTOR  V
4711
)
4712
{
4713
    #define XM_Scale    (-1.0f / XM_PACK_FACTOR)
4714
    #define XM_URange   ((FLOAT)(1 << 4))
4715
    #define XM_Offset   (3.0f)
4716
    #define XM_UMaxXYZ  ((FLOAT)((1 << (20 - 1)) - 1))
4717
    #define XM_UMaxW    ((FLOAT)((1 << (4 - 1)) - 1))
4718
 
4719
#if defined(_XM_NO_INTRINSICS_)
4720
 
4721
    XMVECTOR               N;
4722
    static CONST XMVECTORF32  Scale = {524287.0f, 524287.0f, 524287.0f, 7.0f};
4723
 
4724
    XMASSERT(pDestination);
4725
 
4726
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
4727
    N = XMVectorMultiplyAdd(N, Scale.v, g_XMNegativeZero.v);
4728
    N = XMVectorRound(N);
4729
 
4730
    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
4731
                       (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4732
                       (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4733
                       (((UINT64)N.vector4_f32[0] & 0xFFFFF));
4734
 
4735
#elif defined(_XM_SSE_INTRINSICS_)
4736
    XMASSERT(pDestination);
4737
    // Note: Masks are x,w,y and z
4738
    static const XMVECTORF32 ScaleIcoN4 = {524287.0f,7.0f*4096.0f*65536.0f,524287.0f*4096.0f,524287.0f};
4739
    static const XMVECTORI32 MaskIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
4740
    // Clamp to bounds
4741
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4742
    vResult = _mm_max_ps(vResult,g_XMNegativeOne);
4743
    vResult = _mm_min_ps(vResult,g_XMOne);
4744
    // Scale by multiplication
4745
    vResult = _mm_mul_ps(vResult,ScaleIcoN4);
4746
    // Convert to int
4747
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4748
    // Mask off any fraction
4749
    vResulti = _mm_and_si128(vResulti,MaskIcoN4);
4750
    // Shift y and z to straddle the 32-bit boundary
4751
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4752
    // Shift it into place
4753
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4754
    // i = x|y<<20|z<<40|w<<60
4755
    vResulti = _mm_or_si128(vResulti,vResulti2);
4756
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4757
#else // _XM_VMX128_INTRINSICS_
4758
#endif // _XM_VMX128_INTRINSICS_
4759
 
4760
    #undef XM_Scale
4761
    #undef XM_URange
4762
    #undef XM_Offset
4763
    #undef XM_UMaxXYZ
4764
    #undef XM_UMaxW
4765
}
4766
 
4767
//------------------------------------------------------------------------------
4768
 
4769
XMFINLINE VOID XMStoreIco4
4770
(
4771
    XMICO4*  pDestination, 
4772
    FXMVECTOR V
4773
)
4774
{
4775
    #define XM_Scale    (-1.0f / XM_PACK_FACTOR)
4776
    #define XM_URange   ((FLOAT)(1 << 4))
4777
    #define XM_Offset   (3.0f)
4778
 
4779
#if defined(_XM_NO_INTRINSICS_)
4780
 
4781
    XMVECTOR               N;
4782
    static CONST XMVECTOR  Min = {-524287.0f, -524287.0f, -524287.0f, -7.0f};
4783
    static CONST XMVECTOR  Max = {524287.0f, 524287.0f, 524287.0f, 7.0f};
4784
 
4785
    XMASSERT(pDestination);
4786
 
4787
    N = XMVectorClamp(V, Min, Max);
4788
    N = XMVectorRound(N);
4789
 
4790
    pDestination->v = ((INT64)N.vector4_f32[3] << 60) |
4791
                       (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
4792
                       (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
4793
                       (((INT64)N.vector4_f32[0] & 0xFFFFF));
4794
 
4795
#elif defined(_XM_SSE_INTRINSICS_)
4796
    XMASSERT(pDestination);
4797
    // Note: Masks are x,w,y and z
4798
    static const XMVECTORF32 MinIco4 = {-524287.0f,-7.0f,-524287.0f,-524287.0f};
4799
    static const XMVECTORF32 MaxIco4 = { 524287.0f, 7.0f, 524287.0f, 524287.0f};
4800
    static const XMVECTORF32 ScaleIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
4801
    static const XMVECTORI32 MaskIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
4802
    // Clamp to bounds
4803
    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
4804
    vResult = _mm_max_ps(vResult,MinIco4);
4805
    vResult = _mm_min_ps(vResult,MaxIco4);
4806
    // Scale by multiplication
4807
    vResult = _mm_mul_ps(vResult,ScaleIco4);
4808
    // Convert to int
4809
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4810
    // Mask off any fraction
4811
    vResulti = _mm_and_si128(vResulti,MaskIco4);
4812
    // Shift y and z to straddle the 32-bit boundary
4813
    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
4814
    // Shift it into place
4815
    vResulti2 = _mm_slli_si128(vResulti2,20/8);
4816
    // i = x|y<<20|z<<40|w<<60
4817
    vResulti = _mm_or_si128(vResulti,vResulti2);
4818
    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4819
#else // _XM_VMX128_INTRINSICS_
4820
#endif // _XM_VMX128_INTRINSICS_
4821
 
4822
    #undef XM_Scale
4823
    #undef XM_URange
4824
    #undef XM_Offset
4825
}
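
// Usage sketch (illustrative only; the helper name is hypothetical).
// XMStoreIco4 takes raw integer values rather than normalized ones: x,y,z are
// clamped to [-524287, 524287] and w to [-7, 7] before being packed into
// 20:20:20:4 signed fields; XMVectorSet and XMLoadIco4 are declared elsewhere
// in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreIco4()
{
    XMICO4 Packed;
    XMStoreIco4(&Packed, XMVectorSet(100.0f, -200.0f, 300.0f, 5.0f));
    // The fields hold 100, -200, 300 and 5; loading restores those values
    return XMLoadIco4(&Packed);
}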
4826
 
4827
//------------------------------------------------------------------------------
4828
 
4829
XMFINLINE VOID XMStoreXDecN4
4830
(
4831
    XMXDECN4* pDestination, 
4832
    FXMVECTOR  V
4833
)
4834
{
4835
#if defined(_XM_NO_INTRINSICS_)
4836
 
4837
    XMVECTOR               N;
4838
    static CONST XMVECTORF32  Min = {-1.0f, -1.0f, -1.0f, 0.0f};
4839
    static CONST XMVECTORF32  Scale = {511.0f, 511.0f, 511.0f, 3.0f};
4840
 
4841
    XMASSERT(pDestination);
4842
 
4843
    N = XMVectorClamp(V, Min.v, g_XMOne.v);
4844
    N = XMVectorMultiply(N, Scale.v);
4845
    N = XMVectorRound(N);
4846
 
4847
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
4848
                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
4849
                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
4850
                       (((INT)N.vector4_f32[0] & 0x3FF));
4851
 
4852
#elif defined(_XM_SSE_INTRINSICS_)
4853
    static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
4854
    static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f};
4855
    static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29};
4856
    XMASSERT(pDestination);
4857
    XMVECTOR vResult = _mm_max_ps(V,Min);
4858
    vResult = _mm_min_ps(vResult,g_XMOne);
4859
    // Scale by multiplication
4860
    vResult = _mm_mul_ps(vResult,Scale);
4861
    // Convert to int (W is unsigned)
4862
    __m128i vResulti = _mm_cvtps_epi32(vResult);
4863
    // Mask off any fraction
4864
    vResulti = _mm_and_si128(vResulti,ScaleMask);
4865
    // To fix W, add itself to shift it up to <<30 instead of <<29
4866
    __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW);
4867
    vResulti = _mm_add_epi32(vResulti,vResultw);
4868
    // Do a horizontal or of all 4 entries
4869
    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
4870
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4871
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
4872
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4873
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
4874
    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4875
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4876
#else // _XM_VMX128_INTRINSICS_
4877
#endif // _XM_VMX128_INTRINSICS_
4878
}
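
// Usage sketch (illustrative only; the helper name is hypothetical).  The
// signed-normalized 10:10:10:2 format is a common choice for packed vertex
// normals; XMVectorSet and XMLoadXDecN4 are declared elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreXDecN4()
{
    XMXDECN4 Packed;
    // A unit +Y normal: x,y,z in [-1,1] become 10-bit signed, w in [0,1] 2-bit
    XMStoreXDecN4(&Packed, XMVectorSet(0.0f, 1.0f, 0.0f, 0.0f));
    // Packed.v == 511 << 10; expand back to approximately (0,1,0,0)
    return XMLoadXDecN4(&Packed);
}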
4879
 
4880
//------------------------------------------------------------------------------
4881
 
4882
XMFINLINE VOID XMStoreXDec4
4883
(
4884
    XMXDEC4* pDestination, 
4885
    FXMVECTOR  V
4886
)
4887
{
4888
#if defined(_XM_NO_INTRINSICS_)
4889
 
4890
    XMVECTOR               N;
4891
    static CONST XMVECTOR  Min = {-511.0f, -511.0f, -511.0f, 0.0f};
4892
    static CONST XMVECTOR  Max = {511.0f, 511.0f, 511.0f, 3.0f};
4893
 
4894
    XMASSERT(pDestination);
4895
 
4896
    N = XMVectorClamp(V, Min, Max);
4897
 
4898
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
4899
                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
4900
                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
4901
                       (((INT)N.vector4_f32[0] & 0x3FF));
4902
 
4903
#elif defined(_XM_SSE_INTRINSICS_)
4904
    XMASSERT(pDestination);
4905
    static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f};
4906
    static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f};
4907
    static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
4908
    static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
4909
    // Clamp to bounds
4910
    XMVECTOR vResult = _mm_max_ps(V,MinXDec4);
4911
    vResult = _mm_min_ps(vResult,MaxXDec4);
4912
    // Scale by multiplication
4913
    vResult = _mm_mul_ps(vResult,ScaleXDec4);
4914
    // Convert to int
4915
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4916
    // Mask off any fraction
4917
    vResulti = _mm_and_si128(vResulti,MaskXDec4);
4918
    // Do a horizontal or of 4 entries
4919
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
4920
    // x = x|z, y = y|w
4921
    vResulti = _mm_or_si128(vResulti,vResulti2);
4922
    // Move Z to the x position
4923
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
4924
    // Perform a single bit left shift on y|w
4925
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
4926
    // i = x|y|z|w
4927
    vResulti = _mm_or_si128(vResulti,vResulti2);
4928
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4929
#else // _XM_VMX128_INTRINSICS_
4930
#endif // _XM_VMX128_INTRINSICS_
4931
}
4932
 
4933
//------------------------------------------------------------------------------
4934
 
4935
XMFINLINE VOID XMStoreUDecN4
4936
(
4937
    XMUDECN4* pDestination, 
4938
    FXMVECTOR  V
4939
)
4940
{
4941
#if defined(_XM_NO_INTRINSICS_)
4942
 
4943
    XMVECTOR               N;
4944
    static CONST XMVECTORF32  Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f};
4945
 
4946
    XMASSERT(pDestination);
4947
 
4948
    N = XMVectorClamp(V, XMVectorZero(), g_XMOne.v);
4949
    N = XMVectorMultiply(N, Scale.v);
4950
 
4951
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
4952
                       (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
4953
                       (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
4954
                       (((UINT)N.vector4_f32[0] & 0x3FF));
4955
 
4956
#elif defined(_XM_SSE_INTRINSICS_)
4957
    XMASSERT(pDestination);
4958
    static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f};
4959
    static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
4960
    // Clamp to bounds
4961
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4962
    vResult = _mm_min_ps(vResult,g_XMOne);
4963
    // Scale by multiplication
4964
    vResult = _mm_mul_ps(vResult,ScaleUDecN4);
4965
    // Convert to int
4966
    __m128i vResulti = _mm_cvttps_epi32(vResult);
4967
    // Mask off any fraction
4968
    vResulti = _mm_and_si128(vResulti,MaskUDecN4);
4969
    // Do a horizontal or of 4 entries
4970
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
4971
    // x = x|z, y = y|w
4972
    vResulti = _mm_or_si128(vResulti,vResulti2);
4973
    // Move Z to the x position
4974
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
4975
    // Perform a left shift by one bit on y|w
4976
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
4977
    // i = x|y|z|w
4978
    vResulti = _mm_or_si128(vResulti,vResulti2);
4979
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4980
#else // _XM_VMX128_INTRINSICS_
4981
#endif // _XM_VMX128_INTRINSICS_
4982
}
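
// Usage sketch (illustrative only; the helper name is hypothetical).  The
// unsigned-normalized 10:10:10:2 layout is the same shape as the common
// R10G10B10A2 render-target packing; XMVectorSet and XMLoadUDecN4 are declared
// elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreUDecN4()
{
    XMUDECN4 Packed;
    XMStoreUDecN4(&Packed, XMVectorSet(1.0f, 0.0f, 1.0f, 1.0f));
    // Packed.v == 0xFFF003FF (w = 3, z = 1023, y = 0, x = 1023)
    return XMLoadUDecN4(&Packed);
}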
4983
 
4984
//------------------------------------------------------------------------------
4985
 
4986
XMFINLINE VOID XMStoreUDec4
4987
(
4988
    XMUDEC4* pDestination, 
4989
    FXMVECTOR  V
4990
)
4991
{
4992
#if defined(_XM_NO_INTRINSICS_)
4993
 
4994
    XMVECTOR               N;
4995
    static CONST XMVECTOR  Max = {1023.0f, 1023.0f, 1023.0f, 3.0f};
4996
 
4997
    XMASSERT(pDestination);
4998
 
4999
    N = XMVectorClamp(V, XMVectorZero(), Max);
5000
 
5001
    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
5002
                       (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
5003
                       (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
5004
                       (((UINT)N.vector4_f32[0] & 0x3FF));
5005
 
5006
#elif defined(_XM_SSE_INTRINSICS_)
5007
	XMASSERT(pDestination);
5008
    static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f};
5009
    static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
5010
    static const XMVECTORI32 MaskUDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
5011
    // Clamp to bounds
5012
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5013
    vResult = _mm_min_ps(vResult,MaxUDec4);
5014
    // Scale by multiplication
5015
    vResult = _mm_mul_ps(vResult,ScaleUDec4);
5016
    // Convert to int
5017
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5018
    // Mask off any fraction
5019
    vResulti = _mm_and_si128(vResulti,MaskUDec4);
5020
    // Do a horizontal or of 4 entries
5021
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5022
    // x = x|z, y = y|w
5023
    vResulti = _mm_or_si128(vResulti,vResulti2);
5024
    // Move Z to the x position
5025
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5026
    // Perform a left shift by one bit on y|w
5027
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
5028
    // i = x|y|z|w
5029
    vResulti = _mm_or_si128(vResulti,vResulti2);
5030
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5031
#else // _XM_VMX128_INTRINSICS_
5032
#endif // _XM_VMX128_INTRINSICS_
5033
}
5034
 
5035
//------------------------------------------------------------------------------
5036
 
5037
XMFINLINE VOID XMStoreDecN4
5038
(
5039
    XMDECN4* pDestination, 
5040
    FXMVECTOR V
5041
)
5042
{
5043
#if defined(_XM_NO_INTRINSICS_)
5044
 
5045
    XMVECTOR               N;
5046
    static CONST XMVECTORF32  Scale = {511.0f, 511.0f, 511.0f, 1.0f};
5047
 
5048
    XMASSERT(pDestination);
5049
 
5050
    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
5051
    N = XMVectorMultiply(N, Scale.v);
5052
 
5053
    pDestination->v = ((INT)N.vector4_f32[3] << 30) |
5054
                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
5055
                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
5056
                       (((INT)N.vector4_f32[0] & 0x3FF));
5057
 
5058
#elif defined(_XM_SSE_INTRINSICS_)
5059
    XMASSERT(pDestination);
5060
    static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f};
5061
    static const XMVECTORI32 MaskDecN4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
5062
    // Clamp to bounds
5063
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
5064
    vResult = _mm_min_ps(vResult,g_XMOne);
5065
    // Scale by multiplication
5066
    vResult = _mm_mul_ps(vResult,ScaleDecN4);
5067
    // Convert to int
5068
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5069
    // Mask off any fraction
5070
    vResulti = _mm_and_si128(vResulti,MaskDecN4);
5071
    // Do a horizontal or of 4 entries
5072
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5073
    // x = x|z, y = y|w
5074
    vResulti = _mm_or_si128(vResulti,vResulti2);
5075
    // Move Z to the x position
5076
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5077
    // i = x|y|z|w
5078
    vResulti = _mm_or_si128(vResulti,vResulti2);
5079
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5080
#else // _XM_VMX128_INTRINSICS_
5081
#endif // _XM_VMX128_INTRINSICS_
5082
}
5083
 
5084
//------------------------------------------------------------------------------
5085
 
5086
XMFINLINE VOID XMStoreDec4
5087
(
5088
    XMDEC4*  pDestination, 
5089
    FXMVECTOR V
5090
)
5091
{
5092
#if defined(_XM_NO_INTRINSICS_)
5093
 
5094
    XMVECTOR               N;
5095
    static CONST XMVECTOR  Min = {-511.0f, -511.0f, -511.0f, -1.0f};
5096
    static CONST XMVECTOR  Max = {511.0f, 511.0f, 511.0f, 1.0f};
5097
 
5098
    XMASSERT(pDestination);
5099
 
5100
    N = XMVectorClamp(V, Min, Max);
5101
 
5102
    pDestination->v = ((INT)N.vector4_f32[3] << 30) |
5103
                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
5104
                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
5105
                       (((INT)N.vector4_f32[0] & 0x3FF));
5106
 
5107
#elif defined(_XM_SSE_INTRINSICS_)
5108
    XMASSERT(pDestination);
5109
    static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f};
5110
    static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f};
5111
    static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f};
5112
    static const XMVECTORI32 MaskDec4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
5113
    // Clamp to bounds
5114
    XMVECTOR vResult = _mm_max_ps(V,MinDec4);
5115
    vResult = _mm_min_ps(vResult,MaxDec4);
5116
    // Scale by multiplication
5117
    vResult = _mm_mul_ps(vResult,ScaleDec4);
5118
    // Convert to int
5119
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5120
    // Mask off any fraction
5121
    vResulti = _mm_and_si128(vResulti,MaskDec4);
5122
    // Do a horizontal or of 4 entries
5123
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5124
    // x = x|z, y = y|w
5125
    vResulti = _mm_or_si128(vResulti,vResulti2);
5126
    // Move Z to the x position
5127
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5128
    // i = x|y|z|w
5129
    vResulti = _mm_or_si128(vResulti,vResulti2);
5130
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5131
#else // _XM_VMX128_INTRINSICS_
5132
#endif // _XM_VMX128_INTRINSICS_
5133
}
5134
 
5135
//------------------------------------------------------------------------------
5136
 
5137
XMFINLINE VOID XMStoreUByteN4
5138
(
5139
    XMUBYTEN4* pDestination, 
5140
    FXMVECTOR V
5141
)
5142
{
5143
#if defined(_XM_NO_INTRINSICS_)
5144
 
5145
    XMVECTOR               N;
5146
    static CONST XMVECTORF32  Scale = {255.0f, 255.0f, 255.0f, 255.0f};
5147
 
5148
    XMASSERT(pDestination);
5149
 
5150
    N = XMVectorSaturate(V);
5151
    N = XMVectorMultiply(N, Scale.v);
5152
    N = XMVectorRound(N);
5153
 
5154
    pDestination->x = (BYTE)N.vector4_f32[0];
5155
    pDestination->y = (BYTE)N.vector4_f32[1];
5156
    pDestination->z = (BYTE)N.vector4_f32[2];
5157
    pDestination->w = (BYTE)N.vector4_f32[3];
5158
 
5159
#elif defined(_XM_SSE_INTRINSICS_)
5160
    XMASSERT(pDestination);
5161
    static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f};
5162
    static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
5163
    // Clamp to bounds
5164
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5165
    vResult = _mm_min_ps(vResult,g_XMOne);
5166
    // Scale by multiplication
5167
    vResult = _mm_mul_ps(vResult,ScaleUByteN4);
5168
    // Convert to int
5169
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5170
    // Mask off any fraction
5171
    vResulti = _mm_and_si128(vResulti,MaskUByteN4);
5172
    // Do a horizontal or of 4 entries
5173
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5174
    // x = x|z, y = y|w
5175
    vResulti = _mm_or_si128(vResulti,vResulti2);
5176
    // Move Z to the x position
5177
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5178
    // Perform a single bit left shift to fix y|w 
5179
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
5180
    // i = x|y|z|w
5181
    vResulti = _mm_or_si128(vResulti,vResulti2);
5182
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5183
#else // _XM_VMX128_INTRINSICS_
5184
#endif // _XM_VMX128_INTRINSICS_
5185
}
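
// Usage sketch (illustrative only; the helper name is hypothetical).  Each
// component is saturated to [0,1] and scaled to an 8-bit unorm, the usual
// per-vertex RGBA color layout; XMVectorSet and XMLoadUByteN4 are declared
// elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreUByteN4()
{
    XMUBYTEN4 Packed;
    XMStoreUByteN4(&Packed, XMVectorSet(1.0f, 0.0f, 0.0f, 1.0f));
    // Packed.x == 255, Packed.y == 0, Packed.z == 0, Packed.w == 255
    return XMLoadUByteN4(&Packed);
}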
5186
 
5187
//------------------------------------------------------------------------------
5188
 
5189
XMFINLINE VOID XMStoreUByte4
5190
(
5191
    XMUBYTE4* pDestination, 
5192
    FXMVECTOR  V
5193
)
5194
{
5195
#if defined(_XM_NO_INTRINSICS_)
5196
 
5197
    XMVECTOR               N;
5198
    static CONST XMVECTOR  Max = {255.0f, 255.0f, 255.0f, 255.0f};
5199
 
5200
    XMASSERT(pDestination);
5201
 
5202
    N = XMVectorClamp(V, XMVectorZero(), Max);
5203
    N = XMVectorRound(N);
5204
 
5205
    pDestination->x = (BYTE)N.vector4_f32[0];
5206
    pDestination->y = (BYTE)N.vector4_f32[1];
5207
    pDestination->z = (BYTE)N.vector4_f32[2];
5208
    pDestination->w = (BYTE)N.vector4_f32[3];
5209
 
5210
#elif defined(_XM_SSE_INTRINSICS_)
5211
    XMASSERT(pDestination);
5212
    static const XMVECTORF32 MaxUByte4 = { 255.0f, 255.0f, 255.0f, 255.0f};
5213
    static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f};
5214
    static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
5215
    // Clamp to bounds
5216
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5217
    vResult = _mm_min_ps(vResult,MaxUByte4);
5218
    // Scale by multiplication
5219
    vResult = _mm_mul_ps(vResult,ScaleUByte4);
5220
    // Convert to int
5221
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5222
    // Mask off any fraction
5223
    vResulti = _mm_and_si128(vResulti,MaskUByte4);
5224
    // Do a horizontal or of 4 entries
5225
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5226
    // x = x|z, y = y|w
5227
    vResulti = _mm_or_si128(vResulti,vResulti2);
5228
    // Move Z to the x position
5229
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5230
    // Perform a single bit left shift to fix y|w 
5231
    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
5232
    // i = x|y|z|w
5233
    vResulti = _mm_or_si128(vResulti,vResulti2);
5234
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5235
#else // _XM_VMX128_INTRINSICS_
5236
#endif // _XM_VMX128_INTRINSICS_
5237
}
5238
 
5239
//------------------------------------------------------------------------------
5240
 
5241
XMFINLINE VOID XMStoreByteN4
5242
(
5243
    XMBYTEN4* pDestination, 
5244
    FXMVECTOR  V
5245
)
5246
{
5247
#if defined(_XM_NO_INTRINSICS_)
5248
 
5249
    XMVECTOR               N;
5250
    static CONST XMVECTORF32  Scale = {127.0f, 127.0f, 127.0f, 127.0f};
5251
 
5252
    XMASSERT(pDestination);
5253
 
5254
    N = XMVectorMultiply(V, Scale.v);
5255
    N = XMVectorRound(N);
5256
 
5257
    pDestination->x = (CHAR)N.vector4_f32[0];
5258
    pDestination->y = (CHAR)N.vector4_f32[1];
5259
    pDestination->z = (CHAR)N.vector4_f32[2];
5260
    pDestination->w = (CHAR)N.vector4_f32[3];
5261
 
5262
#elif defined(_XM_SSE_INTRINSICS_)
5263
	XMASSERT(pDestination);
5264
    static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f};
5265
    static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
5266
    // Clamp to bounds
5267
    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
5268
    vResult = _mm_min_ps(vResult,g_XMOne);
5269
    // Scale by multiplication
5270
    vResult = _mm_mul_ps(vResult,ScaleByteN4);
5271
    // Convert to int
5272
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5273
    // Mask off any fraction
5274
    vResulti = _mm_and_si128(vResulti,MaskByteN4);
5275
    // Do a horizontal or of 4 entries
5276
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5277
    // x = x|z, y = y|w
5278
    vResulti = _mm_or_si128(vResulti,vResulti2);
5279
    // Move Z to the x position
5280
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5281
    // i = x|y|z|w
5282
    vResulti = _mm_or_si128(vResulti,vResulti2);
5283
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5284
#else // _XM_VMX128_INTRINSICS_
5285
#endif // _XM_VMX128_INTRINSICS_
5286
}
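
// Usage sketch (illustrative only; the helper name is hypothetical).
// Components are scaled by 127 into signed bytes, so exactly representable
// inputs such as -1, 0 and 1 round-trip without error; XMVectorSet and
// XMLoadByteN4 are declared elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreByteN4()
{
    XMBYTEN4 Packed;
    XMStoreByteN4(&Packed, XMVectorSet(1.0f, -1.0f, 0.0f, 1.0f));
    // Packed.x == 127, Packed.y == -127, Packed.z == 0, Packed.w == 127
    return XMLoadByteN4(&Packed);
}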
5287
 
5288
//------------------------------------------------------------------------------
5289
 
5290
XMFINLINE VOID XMStoreByte4
5291
(
5292
    XMBYTE4*  pDestination, 
5293
    FXMVECTOR  V
5294
)
5295
{
5296
#if defined(_XM_NO_INTRINSICS_)
5297
 
5298
    XMVECTOR               N;
5299
    static CONST XMVECTOR  Min = {-127.0f, -127.0f, -127.0f, -127.0f};
5300
    static CONST XMVECTOR  Max = {127.0f, 127.0f, 127.0f, 127.0f};
5301
 
5302
    XMASSERT(pDestination);
5303
 
5304
    N = XMVectorClamp(V, Min, Max);
5305
    N = XMVectorRound(N);
5306
 
5307
    pDestination->x = (CHAR)N.vector4_f32[0];
5308
    pDestination->y = (CHAR)N.vector4_f32[1];
5309
    pDestination->z = (CHAR)N.vector4_f32[2];
5310
    pDestination->w = (CHAR)N.vector4_f32[3];
5311
 
5312
#elif defined(_XM_SSE_INTRINSICS_)
5313
	XMASSERT(pDestination);
5314
    static const XMVECTORF32 MinByte4 = {-127.0f,-127.0f,-127.0f,-127.0f};
5315
    static const XMVECTORF32 MaxByte4 = { 127.0f, 127.0f, 127.0f, 127.0f};
5316
    static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f};
5317
    static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
5318
    // Clamp to bounds
5319
    XMVECTOR vResult = _mm_max_ps(V,MinByte4);
5320
    vResult = _mm_min_ps(vResult,MaxByte4);
5321
    // Scale by multiplication
5322
    vResult = _mm_mul_ps(vResult,ScaleByte4);
5323
    // Convert to int
5324
    __m128i vResulti = _mm_cvttps_epi32(vResult);
5325
    // Mask off any fraction
5326
    vResulti = _mm_and_si128(vResulti,MaskByte4);
5327
    // Do a horizontal or of 4 entries
5328
    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5329
    // x = x|z, y = y|w
5330
    vResulti = _mm_or_si128(vResulti,vResulti2);
5331
    // Move Z to the x position
5332
    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5333
    // i = x|y|z|w
5334
    vResulti = _mm_or_si128(vResulti,vResulti2);
5335
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5336
#else // _XM_VMX128_INTRINSICS_
5337
#endif // _XM_VMX128_INTRINSICS_
5338
}
5339
 
5340
//------------------------------------------------------------------------------
5341
 
5342
XMFINLINE VOID XMStoreUNibble4
5343
(
5344
     XMUNIBBLE4* pDestination,
5345
     FXMVECTOR V
5346
)
5347
{
5348
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
5349
    XMASSERT(pDestination);
5350
    static CONST XMVECTORF32  Max = {15.0f,15.0f,15.0f,15.0f};
5351
    // Bounds check
5352
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5353
    vResult = _mm_min_ps(vResult,Max);
5354
     // Convert to int with rounding
5355
    __m128i vInt = _mm_cvtps_epi32(vResult);
5356
    // No SSE operations will write to 16-bit values, so we have to extract them manually
5357
    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
5358
    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
5359
    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
5360
    USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
5361
    pDestination->v = ((w & 0xF) << 12) |
5362
                      ((z & 0xF) << 8) |
5363
                      ((y & 0xF) << 4) |
5364
                      ((x & 0xF));
5365
#else
5366
    XMVECTOR               N;
5367
    static CONST XMVECTORF32  Max = {15.0f,15.0f,15.0f,15.0f};
5368
 
5369
    XMASSERT(pDestination);
5370
 
5371
    N = XMVectorClamp(V, XMVectorZero(), Max.v);
5372
    N = XMVectorRound(N);
5373
 
5374
    pDestination->v = (((USHORT)N.vector4_f32[3] & 0xF) << 12) |
5375
                      (((USHORT)N.vector4_f32[2] & 0xF) << 8) |
5376
                      (((USHORT)N.vector4_f32[1] & 0xF) << 4) |
5377
                      (((USHORT)N.vector4_f32[0] & 0xF));
5378
#endif // !_XM_SSE_INTRINSICS_
5379
}
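
// Usage sketch (illustrative only; the helper name is hypothetical).
// XMStoreUNibble4 expects raw integer values in [0,15] per component and packs
// them four bits each, w in the high nibble down to x in the low one;
// XMVectorSet and XMLoadUNibble4 are declared elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreUNibble4()
{
    XMUNIBBLE4 Packed;
    XMStoreUNibble4(&Packed, XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f));
    // Packed.v == 0x4321
    return XMLoadUNibble4(&Packed);
}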
5380
 
5381
//------------------------------------------------------------------------------
5382
 
5383
XMFINLINE VOID XMStoreU555(
5384
     XMU555* pDestination,
5385
     FXMVECTOR V
5386
)
5387
{
5388
#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
5389
    XMASSERT(pDestination);
5390
    static CONST XMVECTORF32  Max = {31.0f, 31.0f, 31.0f, 1.0f};
5391
    // Bounds check
5392
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5393
    vResult = _mm_min_ps(vResult,Max);
5394
     // Convert to int with rounding
5395
    __m128i vInt = _mm_cvtps_epi32(vResult);
5396
    // No SSE operations will write to 16-bit values, so we have to extract them manually
5397
    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
5398
    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
5399
    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
5400
    USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
5401
    pDestination->v = ((w) ? 0x8000 : 0) |
5402
                      ((z & 0x1F) << 10) |
5403
                      ((y & 0x1F) << 5) |
5404
                      ((x & 0x1F));
5405
#else
5406
    XMVECTOR               N;
5407
    static CONST XMVECTORF32  Max = {31.0f, 31.0f, 31.0f, 1.0f};
5408
 
5409
    XMASSERT(pDestination);
5410
 
5411
    N = XMVectorClamp(V, XMVectorZero(), Max.v);
5412
    N = XMVectorRound(N);
5413
 
5414
    pDestination->v = ((N.vector4_f32[3] > 0.f) ? 0x8000 : 0) |
5415
                      (((USHORT)N.vector4_f32[2] & 0x1F) << 10) |
5416
                      (((USHORT)N.vector4_f32[1] & 0x1F) << 5) |
5417
                      (((USHORT)N.vector4_f32[0] & 0x1F));
5418
#endif // !_XM_SSE_INTRINSICS_
5419
}
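
// Usage sketch (illustrative only; the helper name is hypothetical).
// XMStoreU555 also takes raw integer values: x,y,z in [0,31] fill the three
// 5-bit fields and a nonzero w sets the high bit; XMVectorSet and XMLoadU555
// are declared elsewhere in XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreU555()
{
    XMU555 Packed;
    XMStoreU555(&Packed, XMVectorSet(1.0f, 2.0f, 3.0f, 1.0f));
    // Packed.v == 0x8C41 (w bit set, z = 3, y = 2, x = 1)
    return XMLoadU555(&Packed);
}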
5420
 
5421
//------------------------------------------------------------------------------
5422
 
5423
XMFINLINE VOID XMStoreColor
5424
(
5425
    XMCOLOR* pDestination, 
5426
    FXMVECTOR V
5427
)
5428
{
5429
#if defined(_XM_NO_INTRINSICS_)
5430
 
5431
    XMVECTOR               N;
5432
    static CONST XMVECTORF32  Scale = {255.0f, 255.0f, 255.0f, 255.0f};
5433
 
5434
    XMASSERT(pDestination);
5435
 
5436
    N = XMVectorSaturate(V);
5437
    N = XMVectorMultiply(N, Scale.v);
5438
    N = XMVectorRound(N);
5439
 
5440
    pDestination->c = ((UINT)N.vector4_f32[3] << 24) |
5441
                      ((UINT)N.vector4_f32[0] << 16) |
5442
                      ((UINT)N.vector4_f32[1] <<  8) |
5443
                      ((UINT)N.vector4_f32[2]);
5444
 
5445
#elif defined(_XM_SSE_INTRINSICS_)
5446
    XMASSERT(pDestination);
5447
    static CONST XMVECTORF32  Scale = {255.0f,255.0f,255.0f,255.0f};
5448
    // Set <0 to 0
5449
    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5450
    // Set >1 to 1
5451
    vResult = _mm_min_ps(vResult,g_XMOne);
5452
    // Convert to 0-255
5453
    vResult = _mm_mul_ps(vResult,Scale);
5454
    // Shuffle RGBA to ARGB
5455
    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,1,0,3));
5456
    // Convert to int 
5457
    __m128i vInt = _mm_cvtps_epi32(vResult);
5458
    // Mash to shorts
5459
    vInt = _mm_packs_epi32(vInt,vInt);
5460
    // Mash to bytes
5461
    vInt = _mm_packs_epi16(vInt,vInt);
5462
    // Store the color
5463
    _mm_store_ss(reinterpret_cast<float *>(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]);
5464
#else // _XM_VMX128_INTRINSICS_
5465
#endif // _XM_VMX128_INTRINSICS_
5466
}
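
// Usage sketch (illustrative only; the helper name is hypothetical).
// XMStoreColor saturates an R,G,B,A vector and packs it into a 32-bit
// A8R8G8B8 value; XMVectorSet and XMLoadColor are declared elsewhere in
// XNA math.
XMFINLINE XMVECTOR ExampleUsage_StoreColor()
{
    XMCOLOR Packed;
    XMStoreColor(&Packed, XMVectorSet(1.0f, 0.0f, 0.0f, 1.0f)); // opaque red
    // Packed.c == 0xFFFF0000
    return XMLoadColor(&Packed);
}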
5467
 
5468
//------------------------------------------------------------------------------
5469
 
5470
XMFINLINE VOID XMStoreFloat3x3
5471
(
5472
    XMFLOAT3X3*	pDestination, 
5473
    CXMMATRIX	M
5474
)
5475
{
5476
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
5477
 
5478
    XMStoreFloat3x3NC(pDestination, M);
5479
 
5480
#else // _XM_VMX128_INTRINSICS_
5481
#endif // _XM_VMX128_INTRINSICS_
5482
}
5483
 
5484
//------------------------------------------------------------------------------
5485
 
5486
XMFINLINE VOID XMStoreFloat3x3NC
5487
(
5488
    XMFLOAT3X3* pDestination, 
5489
    CXMMATRIX M
5490
)
5491
{
5492
#if defined(_XM_NO_INTRINSICS_)
5493
 
5494
    XMASSERT(pDestination);
5495
 
5496
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
5497
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
5498
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
5499
 
5500
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
5501
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
5502
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
5503
 
5504
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
5505
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
5506
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
5507
 
5508
#elif defined(_XM_SSE_INTRINSICS_)
5509
	XMASSERT(pDestination);
5510
    XMVECTOR vTemp1 = M.r[0];
5511
    XMVECTOR vTemp2 = M.r[1];
5512
    XMVECTOR vTemp3 = M.r[2];
5513
    XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
5514
    vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
5515
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
5516
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
5517
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
5518
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp3,_MM_SHUFFLE(2,2,2,2));
5519
    _mm_store_ss(&pDestination->m[2][2],vTemp3);
5520
#else // _XM_VMX128_INTRINSICS_
5521
#endif // _XM_VMX128_INTRINSICS_
5522
}
5523
 
5524
//------------------------------------------------------------------------------
5525
 
5526
XMFINLINE VOID XMStoreFloat4x3
5527
(
5528
    XMFLOAT4X3* pDestination, 
5529
    CXMMATRIX M
5530
)
5531
{
5532
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
5533
 
5534
    XMStoreFloat4x3NC(pDestination, M);
5535
 
5536
#else // _XM_VMX128_INTRINSICS_
5537
#endif // _XM_VMX128_INTRINSICS_
5538
}
5539
 
5540
//------------------------------------------------------------------------------
5541
 
5542
XMFINLINE VOID XMStoreFloat4x3A
5543
(
5544
    XMFLOAT4X3A*	pDestination, 
5545
    CXMMATRIX		M
5546
)
5547
{
5548
#if defined(_XM_NO_INTRINSICS_)
5549
 
5550
    XMASSERT(pDestination);
5551
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
5552
 
5553
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
5554
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
5555
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
5556
 
5557
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
5558
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
5559
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
5560
 
5561
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
5562
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
5563
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
5564
 
5565
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
5566
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
5567
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
5568
 
5569
#elif defined(_XM_SSE_INTRINSICS_)
5570
	XMASSERT(pDestination);
5571
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
5572
    // x1,y1,z1,w1
5573
    XMVECTOR vTemp1 = M.r[0];
5574
    // x2,y2,z2,w2
5575
    XMVECTOR vTemp2 = M.r[1];
5576
    // x3,y3,z3,w3
5577
    XMVECTOR vTemp3 = M.r[2];
5578
    // x4,y4,z4,w4
5579
    XMVECTOR vTemp4 = M.r[3];
5580
    // z1,z1,x2,y2
5581
    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
5582
    // y2,z2,x3,y3 (Final)
5583
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
5584
    // x1,y1,z1,x2 (Final)
5585
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
5586
    // z3,z3,x4,x4
5587
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
5588
    // z3,x4,y4,z4 (Final)
5589
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
5590
    // Store in 3 operations
5591
    _mm_store_ps(&pDestination->m[0][0],vTemp1);
5592
    _mm_store_ps(&pDestination->m[1][1],vTemp2);
5593
    _mm_store_ps(&pDestination->m[2][2],vTemp3);
5594
#else // _XM_VMX128_INTRINSICS_
5595
#endif // _XM_VMX128_INTRINSICS_
5596
}
5597
 
5598
//------------------------------------------------------------------------------
5599
 
5600
XMFINLINE VOID XMStoreFloat4x3NC
5601
(
5602
    XMFLOAT4X3* pDestination, 
5603
    CXMMATRIX M
5604
)
5605
{
5606
#if defined(_XM_NO_INTRINSICS_)
5607
 
5608
    XMASSERT(pDestination);
5609
 
5610
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
5611
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
5612
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
5613
 
5614
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
5615
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
5616
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
5617
 
5618
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
5619
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
5620
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
5621
 
5622
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
5623
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
5624
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
5625
 
5626
#elif defined(_XM_SSE_INTRINSICS_)
5627
	XMASSERT(pDestination);
5628
    XMVECTOR vTemp1 = M.r[0];
5629
    XMVECTOR vTemp2 = M.r[1];
5630
    XMVECTOR vTemp3 = M.r[2];
5631
    XMVECTOR vTemp4 = M.r[3];
5632
    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
5633
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
5634
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
5635
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
5636
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
5637
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
5638
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
5639
    _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
5640
#else // _XM_VMX128_INTRINSICS_
5641
#endif // _XM_VMX128_INTRINSICS_
5642
}
5643
 
5644
//------------------------------------------------------------------------------
5645
 
5646
XMFINLINE VOID XMStoreFloat4x4
5647
(
5648
    XMFLOAT4X4* pDestination, 
5649
    CXMMATRIX M
5650
)
5651
{
5652
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
5653
 
5654
    XMStoreFloat4x4NC(pDestination, M);
5655
 
5656
#elif defined(_XM_SSE_INTRINSICS_)
5657
	XMASSERT(pDestination);
5658
 
5659
	_mm_storeu_ps( &pDestination->_11, M.r[0] );
5660
	_mm_storeu_ps( &pDestination->_21, M.r[1] );
5661
	_mm_storeu_ps( &pDestination->_31, M.r[2] );
5662
	_mm_storeu_ps( &pDestination->_41, M.r[3] );
5663
#else // _XM_VMX128_INTRINSICS_
5664
#endif // _XM_VMX128_INTRINSICS_
5665
}
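
// Usage sketch (illustrative only; the helper name is hypothetical).
// XMStoreFloat4x4 writes a full 4x4 matrix to memory that need not be 16-byte
// aligned (the SSE path above uses unaligned stores); XMMatrixIdentity and
// XMLoadFloat4x4 are declared elsewhere in XNA math.
XMFINLINE XMMATRIX ExampleUsage_StoreFloat4x4(XMFLOAT4X4* pOut)
{
    XMStoreFloat4x4(pOut, XMMatrixIdentity()); // 16 floats written row by row
    return XMLoadFloat4x4(pOut);               // reload for further math
}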
5666
 
5667
//------------------------------------------------------------------------------
5668
 
5669
XMFINLINE VOID XMStoreFloat4x4A
5670
(
5671
    XMFLOAT4X4A*	pDestination, 
5672
    CXMMATRIX		M
5673
)
5674
{
5675
#if defined(_XM_NO_INTRINSICS_)
5676
 
5677
    XMASSERT(pDestination);
5678
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
5679
 
5680
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
5681
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
5682
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
5683
    pDestination->m[0][3] = M.r[0].vector4_f32[3];
5684
 
5685
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
5686
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
5687
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
5688
    pDestination->m[1][3] = M.r[1].vector4_f32[3];
5689
 
5690
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
5691
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
5692
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
5693
    pDestination->m[2][3] = M.r[2].vector4_f32[3];
5694
 
5695
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
5696
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
5697
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
5698
    pDestination->m[3][3] = M.r[3].vector4_f32[3];
5699
 
5700
#elif defined(_XM_SSE_INTRINSICS_)
5701
	XMASSERT(pDestination);
5702
 
5703
	_mm_store_ps( &pDestination->_11, M.r[0] );
5704
	_mm_store_ps( &pDestination->_21, M.r[1] );
5705
	_mm_store_ps( &pDestination->_31, M.r[2] );
5706
	_mm_store_ps( &pDestination->_41, M.r[3] );
5707
#else // _XM_VMX128_INTRINSICS_
5708
#endif // _XM_VMX128_INTRINSICS_
5709
}
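
// Usage sketch (illustrative only; the helper name is hypothetical).  The "A"
// variant requires a 16-byte aligned destination, which the XMFLOAT4X4A type
// guarantees by declaration, so the SSE path can use aligned stores;
// XMMatrixIdentity and XMLoadFloat4x4A are declared elsewhere in XNA math.
XMFINLINE XMMATRIX ExampleUsage_StoreFloat4x4A()
{
    XMFLOAT4X4A Aligned;                       // aligned by its declaration
    XMStoreFloat4x4A(&Aligned, XMMatrixIdentity());
    return XMLoadFloat4x4A(&Aligned);
}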
5710
 
5711
//------------------------------------------------------------------------------
5712
 
5713
XMFINLINE VOID XMStoreFloat4x4NC
5714
(
5715
    XMFLOAT4X4* pDestination, 
5716
    CXMMATRIX M
5717
)
5718
{
5719
#if defined(_XM_NO_INTRINSICS_)
5720
 
5721
    XMASSERT(pDestination);
5722
 
5723
    pDestination->m[0][0] = M.r[0].vector4_f32[0];
5724
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
5725
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
5726
    pDestination->m[0][3] = M.r[0].vector4_f32[3];
5727
 
5728
    pDestination->m[1][0] = M.r[1].vector4_f32[0];
5729
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
5730
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
5731
    pDestination->m[1][3] = M.r[1].vector4_f32[3];
5732
 
5733
    pDestination->m[2][0] = M.r[2].vector4_f32[0];
5734
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
5735
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
5736
    pDestination->m[2][3] = M.r[2].vector4_f32[3];
5737
 
5738
    pDestination->m[3][0] = M.r[3].vector4_f32[0];
5739
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
5740
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
5741
    pDestination->m[3][3] = M.r[3].vector4_f32[3];
5742
 
5743
#elif defined(_XM_SSE_INTRINSICS_)
5744
    XMASSERT(pDestination);
5745
    _mm_storeu_ps(&pDestination->m[0][0],M.r[0]);
5746
    _mm_storeu_ps(&pDestination->m[1][0],M.r[1]);
5747
    _mm_storeu_ps(&pDestination->m[2][0],M.r[2]);
5748
    _mm_storeu_ps(&pDestination->m[3][0],M.r[3]);
5749
#else // _XM_VMX128_INTRINSICS_
5750
#endif // _XM_VMX128_INTRINSICS_
5751
}
5752
 
5753
#endif // __XNAMATHCONVERT_INL__
5754