Subversion Repositories Games.Chess Giants

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
1 pmbaty 1
/*-========================================================================-_
2
 |                                 - XDSP -                                 |
3
 |        Copyright (c) Microsoft Corporation.  All rights reserved.        |
4
 |~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
5
 |PROJECT: XDSP                         MODEL:   Unmanaged User-mode        |
6
 |VERSION: 1.1                          EXCEPT:  No Exceptions              |
7
 |CLASS:   N / A                        MINREQ:  WinXP, Xbox360             |
8
 |BASE:    N / A                        DIALECT: MSC++ 14.00                |
9
 |>------------------------------------------------------------------------<|
10
 | DUTY: DSP functions with CPU extension specific optimizations            |
11
 ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
12
  NOTES:
13
    1.  Definition of terms:
14
            DSP: Digital Signal Processing.
15
            FFT: Fast Fourier Transform.
16
 
17
    2.  All buffer parameters must be 16-byte aligned.
18
 
19
    3.  All FFT functions support only FLOAT32 mono audio.                  */
20
 
21
#pragma once
22
//--------------<D-E-F-I-N-I-T-I-O-N-S>-------------------------------------//
23
#include <windef.h> // general windows types
24
#include <math.h>   // trigonometric functions
25
#if defined(_XBOX)  // SIMD intrinsics
26
    #include <ppcintrinsics.h>
27
#else
28
    #include <emmintrin.h>
29
#endif
30
 
31
 
32
//--------------<M-A-C-R-O-S>-----------------------------------------------//
33
// assertion
34
#if !defined(DSPASSERT)
35
    #if DBG
36
        #define DSPASSERT(exp) if (!(exp)) { OutputDebugStringA("XDSP ASSERT: " #exp ", {" __FUNCTION__ "}\n"); __debugbreak(); }
37
    #else
38
        #define DSPASSERT(exp) __assume(exp)
39
    #endif
40
#endif
41
 
42
// true if n is a power of 2
43
#if !defined(ISPOWEROF2)
44
    #define ISPOWEROF2(n) ( ((n)&((n)-1)) == 0 && (n) != 0 )
45
#endif
46
 
47
 
48
//--------------<H-E-L-P-E-R-S>---------------------------------------------//
49
namespace XDSP {
50
#pragma warning(push)
51
#pragma warning(disable: 4328 4640) // disable "indirection alignment of formal parameter", "construction of local static object is not thread-safe" compile warnings
52
 
53
 
54
// Helper functions, used by the FFT functions.
55
// The application need not call them directly.
56
 
57
    // primitive types
58
    typedef __m128 XVECTOR;
59
    typedef XVECTOR& XVECTORREF;
60
    typedef const XVECTOR& XVECTORREFC;
61
 
62
 
63
    // Parallel multiplication of four complex numbers, assuming
64
    // real and imaginary values are stored in separate vectors.
65
    __forceinline void vmulComplex (__out XVECTORREF rResult, __out XVECTORREF iResult, __in XVECTORREFC r1, __in XVECTORREFC i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
66
    {
67
        // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
68
        XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
69
        XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
70
        XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
71
        XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
72
        rResult = _mm_sub_ps(vr1r2, vi1i2); // real:      (r1*r2 - i1*i2)
73
        iResult = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1)
74
    }
75
    __forceinline void vmulComplex (__inout XVECTORREF r1, __inout XVECTORREF i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
76
    {
77
        // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
78
        XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
79
        XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
80
        XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
81
        XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
82
        r1 = _mm_sub_ps(vr1r2, vi1i2); // real:      (r1*r2 - i1*i2)
83
        i1 = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1)
84
    }
85
 
86
 
87
    // Radix-4 decimation-in-time FFT butterfly.
88
    // This version assumes that all four elements of the butterfly are
89
    // adjacent in a single vector.
90
    //
91
    // Compute the product of the complex input vector and the
92
    // 4-element DFT matrix:
93
    //     | 1  1  1  1 |    | (r1X,i1X) |
94
    //     | 1 -j -1  j |    | (r1Y,i1Y) |
95
    //     | 1 -1  1 -1 |    | (r1Z,i1Z) |
96
    //     | 1  j -1 -j |    | (r1W,i1W) |
97
    //
98
    // This matrix can be decomposed into two simpler ones to reduce the
99
    // number of additions needed. The decomposed matrices look like this:
100
    //     | 1  0  1  0 |    | 1  0  1  0 |
101
    //     | 0  1  0 -j |    | 1  0 -1  0 |
102
    //     | 1  0 -1  0 |    | 0  1  0  1 |
103
    //     | 0  1  0  j |    | 0  1  0 -1 |
104
    //
105
    // Combine as follows:
106
    //          | 1  0  1  0 |   | (r1X,i1X) |         | (r1X + r1Z, i1X + i1Z) |
107
    // Temp   = | 1  0 -1  0 | * | (r1Y,i1Y) |       = | (r1X - r1Z, i1X - i1Z) |
108
    //          | 0  1  0  1 |   | (r1Z,i1Z) |         | (r1Y + r1W, i1Y + i1W) |
109
    //          | 0  1  0 -1 |   | (r1W,i1W) |         | (r1Y - r1W, i1Y - i1W) |
110
    //
111
    //          | 1  0  1  0 |   | (rTempX,iTempX) |   | (rTempX + rTempZ, iTempX + iTempZ) |
112
    // Result = | 0  1  0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) |
113
    //          | 1  0 -1  0 |   | (rTempZ,iTempZ) |   | (rTempX - rTempZ, iTempX - iTempZ) |
114
    //          | 0  1  0  j |   | (rTempW,iTempW) |   | (rTempY - iTempW, iTempY + rTempW) |
115
    __forceinline void ButterflyDIT4_1 (__inout XVECTORREF r1, __inout XVECTORREF i1)
116
    {
117
        // sign constants for radix-4 butterflies
118
        const static XVECTOR vDFT4SignBits1 = { 0.0f, -0.0f,  0.0f, -0.0f };
119
        const static XVECTOR vDFT4SignBits2 = { 0.0f,  0.0f, -0.0f, -0.0f };
120
        const static XVECTOR vDFT4SignBits3 = { 0.0f, -0.0f, -0.0f,  0.0f };
121
 
122
 
123
        // calculating Temp
124
        XVECTOR rTemp = _mm_add_ps( _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(1, 1, 0, 0)),                               // [r1X| r1X|r1Y| r1Y] +
125
                                    _mm_xor_ps(_mm_shuffle_ps(r1, r1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [r1Z|-r1Z|r1W|-r1W]
126
        XVECTOR iTemp = _mm_add_ps( _mm_shuffle_ps(i1, i1, _MM_SHUFFLE(1, 1, 0, 0)),                               // [i1X| i1X|i1Y| i1Y] +
127
                                    _mm_xor_ps(_mm_shuffle_ps(i1, i1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [i1Z|-i1Z|i1W|-i1W]
128
 
129
        // calculating Result
130
        XVECTOR rZrWiZiW = _mm_shuffle_ps(rTemp, iTemp, _MM_SHUFFLE(3, 2, 3, 2));       // [rTempZ|rTempW|iTempZ|iTempW]
131
        XVECTOR rZiWrZiW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(3, 0, 3, 0)); // [rTempZ|iTempW|rTempZ|iTempW]
132
        XVECTOR iZrWiZrW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(1, 2, 1, 2)); // [rTempZ|iTempW|rTempZ|iTempW]
133
        r1 = _mm_add_ps( _mm_shuffle_ps(rTemp, rTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [rTempX| rTempY| rTempX| rTempY] +
134
                         _mm_xor_ps(rZiWrZiW, vDFT4SignBits2) );                // [rTempZ| iTempW|-rTempZ|-iTempW]
135
        i1 = _mm_add_ps( _mm_shuffle_ps(iTemp, iTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [iTempX| iTempY| iTempX| iTempY] +
136
                         _mm_xor_ps(iZrWiZrW, vDFT4SignBits3) );                // [iTempZ|-rTempW|-iTempZ| rTempW]
137
    }
138
 
139
    // Radix-4 decimation-in-time FFT butterfly.
140
    // This version assumes that elements of the butterfly are
141
    // in different vectors, so that each vector in the input
142
    // contains elements from four different butterflies.
143
    // The four separate butterflies are processed in parallel.
144
    //
145
    // The calculations here are the same as the ones in the single-vector
146
    // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W)
147
    // they are done in parallel on sixteen independent complex values.
148
    // There is no interdependence between the vector elements:
149
    // | 1  0  1  0 |    | (rIn0,iIn0) |               | (rIn0 + rIn2, iIn0 + iIn2) |
150
    // | 1  0 -1  0 | *  | (rIn1,iIn1) |  =   Temp   = | (rIn0 - rIn2, iIn0 - iIn2) |
151
    // | 0  1  0  1 |    | (rIn2,iIn2) |               | (rIn1 + rIn3, iIn1 + iIn3) |
152
    // | 0  1  0 -1 |    | (rIn3,iIn3) |               | (rIn1 - rIn3, iIn1 - iIn3) |
153
    //
154
    //          | 1  0  1  0 |   | (rTemp0,iTemp0) |   | (rTemp0 + rTemp2, iTemp0 + iTemp2) |
155
    // Result = | 0  1  0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) |
156
    //          | 1  0 -1  0 |   | (rTemp2,iTemp2) |   | (rTemp0 - rTemp2, iTemp0 - iTemp2) |
157
    //          | 0  1  0  j |   | (rTemp3,iTemp3) |   | (rTemp1 - iTemp3, iTemp1 + rTemp3) |
158
    __forceinline void ButterflyDIT4_4 (__inout XVECTORREF r0,
159
                                        __inout XVECTORREF r1,
160
                                        __inout XVECTORREF r2,
161
                                        __inout XVECTORREF r3,
162
                                        __inout XVECTORREF i0,
163
                                        __inout XVECTORREF i1,
164
                                        __inout XVECTORREF i2,
165
                                        __inout XVECTORREF i3,
166
                                        __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableReal,
167
                                        __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableImaginary,
168
                                        const UINT32 uStride, const BOOL fLast)
169
    {
170
        DSPASSERT(pUnityTableReal != NULL);
171
        DSPASSERT(pUnityTableImaginary != NULL);
172
        DSPASSERT((UINT_PTR)pUnityTableReal % 16 == 0);
173
        DSPASSERT((UINT_PTR)pUnityTableImaginary % 16 == 0);
174
        DSPASSERT(ISPOWEROF2(uStride));
175
 
176
        XVECTOR rTemp0, rTemp1, rTemp2, rTemp3, rTemp4, rTemp5, rTemp6, rTemp7;
177
        XVECTOR iTemp0, iTemp1, iTemp2, iTemp3, iTemp4, iTemp5, iTemp6, iTemp7;
178
 
179
 
180
        // calculating Temp
181
        rTemp0 = _mm_add_ps(r0, r2);          iTemp0 = _mm_add_ps(i0, i2);
182
        rTemp2 = _mm_add_ps(r1, r3);          iTemp2 = _mm_add_ps(i1, i3);
183
        rTemp1 = _mm_sub_ps(r0, r2);          iTemp1 = _mm_sub_ps(i0, i2);
184
        rTemp3 = _mm_sub_ps(r1, r3);          iTemp3 = _mm_sub_ps(i1, i3);
185
        rTemp4 = _mm_add_ps(rTemp0, rTemp2);  iTemp4 = _mm_add_ps(iTemp0, iTemp2);
186
        rTemp5 = _mm_add_ps(rTemp1, iTemp3);  iTemp5 = _mm_sub_ps(iTemp1, rTemp3);
187
        rTemp6 = _mm_sub_ps(rTemp0, rTemp2);  iTemp6 = _mm_sub_ps(iTemp0, iTemp2);
188
        rTemp7 = _mm_sub_ps(rTemp1, iTemp3);  iTemp7 = _mm_add_ps(iTemp1, rTemp3);
189
 
190
        // calculating Result
191
        // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial
192
        vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]);
193
        vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]);
194
        vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]);
195
        if (fLast) {
196
            ButterflyDIT4_1(rTemp4, iTemp4);
197
            ButterflyDIT4_1(rTemp5, iTemp5);
198
            ButterflyDIT4_1(rTemp6, iTemp6);
199
            ButterflyDIT4_1(rTemp7, iTemp7);
200
        }
201
 
202
 
203
        r0 = rTemp4;    i0 = iTemp4;
204
        r1 = rTemp5;    i1 = iTemp5;
205
        r2 = rTemp6;    i2 = iTemp6;
206
        r3 = rTemp7;    i3 = iTemp7;
207
    }
208
 
209
//--------------<F-U-N-C-T-I-O-N-S>-----------------------------------------//
210
 
211
      ////
212
      // DESCRIPTION:
213
      //  4-sample FFT.
214
      //
215
      // PARAMETERS:
216
      //  pReal      - [inout] real components, must have at least uCount elements
217
      //  pImaginary - [inout] imaginary components, must have at least uCount elements
218
      //  uCount     - [in]    number of FFT iterations
219
      //
220
      // RETURN VALUE:
221
      //  void
222
      ////
223
    __forceinline void FFT4 (__inout_ecount(uCount) XVECTOR* __restrict pReal, __inout_ecount(uCount) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
224
    {
225
        DSPASSERT(pReal != NULL);
226
        DSPASSERT(pImaginary != NULL);
227
        DSPASSERT((UINT_PTR)pReal % 16 == 0);
228
        DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
229
        DSPASSERT(ISPOWEROF2(uCount));
230
 
231
        for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
232
            ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]);
233
        }
234
    }
235
 
236
 
237
 
238
      ////
239
      // DESCRIPTION:
240
      //  8-sample FFT.
241
      //
242
      // PARAMETERS:
243
      //  pReal      - [inout] real components, must have at least uCount*2 elements
244
      //  pImaginary - [inout] imaginary components, must have at least uCount*2 elements
245
      //  uCount     - [in]    number of FFT iterations
246
      //
247
      // RETURN VALUE:
248
      //  void
249
      ////
250
    __forceinline void FFT8 (__inout_ecount(uCount*2) XVECTOR* __restrict pReal, __inout_ecount(uCount*2) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
251
    {
252
        DSPASSERT(pReal != NULL);
253
        DSPASSERT(pImaginary != NULL);
254
        DSPASSERT((UINT_PTR)pReal % 16 == 0);
255
        DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
256
        DSPASSERT(ISPOWEROF2(uCount));
257
 
258
        static XVECTOR wr1 = {  1.0f,  0.70710677f,  0.0f, -0.70710677f };
259
        static XVECTOR wi1 = {  0.0f, -0.70710677f, -1.0f, -0.70710677f };
260
        static XVECTOR wr2 = { -1.0f, -0.70710677f,  0.0f,  0.70710677f };
261
        static XVECTOR wi2 = {  0.0f,  0.70710677f,  1.0f,  0.70710677f };
262
 
263
 
264
        for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
265
            XVECTOR* __restrict pR = pReal      + uIndex*2;
266
            XVECTOR* __restrict pI = pImaginary + uIndex*2;
267
 
268
            XVECTOR oddsR  = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(3, 1, 3, 1));
269
            XVECTOR evensR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(2, 0, 2, 0));
270
            XVECTOR oddsI  = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(3, 1, 3, 1));
271
            XVECTOR evensI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(2, 0, 2, 0));
272
            ButterflyDIT4_1(oddsR, oddsI);
273
            ButterflyDIT4_1(evensR, evensI);
274
 
275
            XVECTOR r, i;
276
            vmulComplex(r, i, oddsR, oddsI, wr1, wi1);
277
            pR[0] = _mm_add_ps(evensR, r);
278
            pI[0] = _mm_add_ps(evensI, i);
279
 
280
            vmulComplex(r, i, oddsR, oddsI, wr2, wi2);
281
            pR[1] = _mm_add_ps(evensR, r);
282
            pI[1] = _mm_add_ps(evensI, i);
283
        }
284
    }
285
 
286
 
287
 
288
      ////
289
      // DESCRIPTION:
290
      //  16-sample FFT.
291
      //
292
      // PARAMETERS:
293
      //  pReal      - [inout] real components, must have at least uCount*4 elements
294
      //  pImaginary - [inout] imaginary components, must have at least uCount*4 elements
295
      //  uCount     - [in]    number of FFT iterations
296
      //
297
      // RETURN VALUE:
298
      //  void
299
      ////
300
    __forceinline void FFT16 (__inout_ecount(uCount*4) XVECTOR* __restrict pReal, __inout_ecount(uCount*4) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
301
    {
302
        DSPASSERT(pReal != NULL);
303
        DSPASSERT(pImaginary != NULL);
304
        DSPASSERT((UINT_PTR)pReal % 16 == 0);
305
        DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
306
        DSPASSERT(ISPOWEROF2(uCount));
307
 
308
        XVECTOR aUnityTableReal[4]      = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.92387950f, 0.70710677f, 0.38268343f, 1.0f, 0.70710677f, -4.3711388e-008f, -0.70710677f, 1.0f, 0.38268343f, -0.70710677f, -0.92387950f };
309
        XVECTOR aUnityTableImaginary[4] = { -0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.38268343f, -0.70710677f, -0.92387950f, -0.0f, -0.70710677f, -1.0f, -0.70710677f, -0.0f, -0.92387950f, -0.70710677f, 0.38268343f };
310
 
311
 
312
        for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
313
            ButterflyDIT4_4(pReal[uIndex*4],
314
                            pReal[uIndex*4 + 1],
315
                            pReal[uIndex*4 + 2],
316
                            pReal[uIndex*4 + 3],
317
                            pImaginary[uIndex*4],
318
                            pImaginary[uIndex*4 + 1],
319
                            pImaginary[uIndex*4 + 2],
320
                            pImaginary[uIndex*4 + 3],
321
                            aUnityTableReal,
322
                            aUnityTableImaginary,
323
                            1, TRUE);
324
        }
325
    }
326
 
327
 
328
 
329
      ////
330
      // DESCRIPTION:
331
      //  2^N-sample FFT.
332
      //
333
      // REMARKS:
334
      //  For FFTs length 16 and below, call FFT16(), FFT8(), or FFT4().
335
      //
336
      // PARAMETERS:
337
      //  pReal       - [inout] real components, must have at least (uLength*uCount)/4 elements
338
      //  pImaginary  - [inout] imaginary components, must have at least (uLength*uCount)/4 elements
339
      //  pUnityTable - [in]    unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable()
340
      //  uLength     - [in]    FFT length in samples, must be a power of 2 > 16
341
      //  uCount      - [in]    number of FFT iterations
342
      //
343
      // RETURN VALUE:
344
      //  void
345
      ////
346
    inline void FFT (__inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pReal, __inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pImaginary, __in_ecount(uLength*uCount) const XVECTOR* __restrict pUnityTable, const UINT32 uLength, const UINT32 uCount=1)
347
    {
348
        DSPASSERT(pReal != NULL);
349
        DSPASSERT(pImaginary != NULL);
350
        DSPASSERT(pUnityTable != NULL);
351
        DSPASSERT((UINT_PTR)pReal % 16 == 0);
352
        DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
353
        DSPASSERT((UINT_PTR)pUnityTable % 16 == 0);
354
        DSPASSERT(uLength > 16);
355
        DSPASSERT(ISPOWEROF2(uLength));
356
        DSPASSERT(ISPOWEROF2(uCount));
357
 
358
        const XVECTOR* __restrict pUnityTableReal      = pUnityTable;
359
        const XVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2);
360
        const UINT32 uTotal         = uCount * uLength;
361
        const UINT32 uTotal_vectors = uTotal >> 2;
362
        const UINT32 uStage_vectors = uLength >> 2;
363
        const UINT32 uStride        = uStage_vectors >> 2; // stride between butterfly elements
364
        const UINT32 uSkip          = uStage_vectors - uStride;
365
 
366
 
367
        for (UINT32 uIndex=0; uIndex<(uTotal_vectors>>2); ++uIndex) {
368
            UINT32 n = (uIndex/uStride) * (uStride + uSkip) + (uIndex % uStride);
369
            ButterflyDIT4_4(pReal[n],
370
                            pReal[n + uStride],
371
                            pReal[n + uStride*2],
372
                            pReal[n + uStride*3],
373
                            pImaginary[n ],
374
                            pImaginary[n + uStride],
375
                            pImaginary[n + uStride*2],
376
                            pImaginary[n + uStride*3],
377
                            pUnityTableReal      + n % uStage_vectors,
378
                            pUnityTableImaginary + n % uStage_vectors,
379
                            uStride, FALSE);
380
        }
381
 
382
 
383
        if (uLength > 16*4) {
384
            FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4);
385
        } else if (uLength == 16*4) {
386
            FFT16(pReal, pImaginary, uCount*4);
387
        } else if (uLength == 8*4) {
388
            FFT8(pReal, pImaginary, uCount*4);
389
        } else if (uLength == 4*4) {
390
            FFT4(pReal, pImaginary, uCount*4);
391
        }
392
    }
393
 
394
//--------------------------------------------------------------------------//
395
  ////
396
  // DESCRIPTION:
397
  //  Initializes unity roots lookup table used by FFT functions.
398
  //  Once initialized, the table need not be initialized again unless a
399
  //  different FFT length is desired.
400
  //
401
  // REMARKS:
402
  //  The unity tables of FFT length 16 and below are hard coded into the
403
  //  respective FFT functions and so need not be initialized.
404
  //
405
  // PARAMETERS:
406
  //  pUnityTable - [out] unity table, receives unity roots lookup table, must have at least uLength elements
407
  //  uLength     - [in]  FFT length in samples, must be a power of 2 > 16
408
  //
409
  // RETURN VALUE:
410
  //  void
411
  ////
412
inline void FFTInitializeUnityTable (__out_ecount(uLength) XVECTOR* __restrict pUnityTable, UINT32 uLength)
413
{
414
    DSPASSERT(pUnityTable != NULL);
415
    DSPASSERT(uLength > 16);
416
    DSPASSERT(ISPOWEROF2(uLength));
417
 
418
    FLOAT32* __restrict pfUnityTable = (FLOAT32* __restrict)pUnityTable;
419
 
420
 
421
    // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16... > 16
422
    do {
423
        FLOAT32 flStep = 6.283185307f / uLength; // 2PI / FFT length
424
        uLength >>= 2;
425
 
426
        // pUnityTable[0 to uLength*4-1] contains real components for current FFT length
427
        // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for current FFT length
428
        for (UINT32 i=0; i<4; ++i) {
429
            for (UINT32 j=0; j<uLength; ++j) {
430
                UINT32 uIndex = (i*uLength) + j;
431
                pfUnityTable[uIndex]             = cosf(FLOAT32(i)*FLOAT32(j)*flStep);  // real component
432
                pfUnityTable[uIndex + uLength*4] = -sinf(FLOAT32(i)*FLOAT32(j)*flStep); // imaginary component
433
            }
434
        }
435
        pfUnityTable += uLength*8;
436
    } while (uLength > 16);
437
}
438
 
439
 
440
  ////
441
  // DESCRIPTION:
442
  //  The FFT functions generate output in bit reversed order.
443
  //  Use this function to re-arrange them into order of increasing frequency.
444
  //
445
  // PARAMETERS:
446
  //  pOutput     - [out] output buffer, receives samples in order of increasing frequency, must have at least (1<<uLog2Length) elements
447
  //  pInput      - [in]  input buffer, samples in bit reversed order as generated by FFT functions, must have at least (1<<uLog2Length) elements
448
  //  uLog2Length - [in]  LOG (base 2) of FFT length in samples, must be > 0
449
  //
450
  // RETURN VALUE:
451
  //  void
452
  ////
453
inline void FFTUnswizzle (__out_ecount(1<<uLog2Length) FLOAT32* __restrict pOutput, __in_ecount(1<<uLog2Length) const FLOAT32* __restrict pInput, const UINT32 uLog2Length)
454
{
455
    DSPASSERT(pOutput != NULL);
456
    DSPASSERT(pInput != NULL);
457
    DSPASSERT(uLog2Length > 0);
458
 
459
    const UINT32 uLength = UINT32(1 << uLog2Length);
460
 
461
 
462
    if ((uLog2Length & 0x1) == 0) {
463
        // even powers of two
464
        for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
465
            UINT32 n = uIndex;
466
            n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
467
            n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
468
            n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
469
            n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
470
            n >>= (32 - uLog2Length);
471
            pOutput[n] = pInput[uIndex];
472
        }
473
    } else {
474
        // odd powers of two
475
        for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
476
            UINT32 n = (uIndex>>3);
477
            n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
478
            n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
479
            n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
480
            n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
481
            n >>= (32 - (uLog2Length-3));
482
            n |= ((uIndex & 0x7) << (uLog2Length - 3));
483
            pOutput[n] = pInput[uIndex];
484
        }
485
    }
486
}
487
 
488
 
489
  ////
490
  // DESCRIPTION:
491
  //  Convert complex components to polar form.
492
  //
493
  // PARAMETERS:
494
  //  pOutput         - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements
495
  //  pInputReal      - [in]  input buffer (real components), must have at least uLength/4 elements
496
  //  pInputImaginary - [in]  input buffer (imaginary components), must have at least uLength/4 elements
497
  //  uLength         - [in]  FFT length in samples, must be a power of 2 >= 4
498
  //
499
  // RETURN VALUE:
500
  //  void
501
  ////
502
inline void FFTPolar (__out_ecount(uLength/4) XVECTOR* __restrict pOutput, __in_ecount(uLength/4) const XVECTOR* __restrict pInputReal, __in_ecount(uLength/4) const XVECTOR* __restrict pInputImaginary, const UINT32 uLength)
503
{
504
    DSPASSERT(pOutput != NULL);
505
    DSPASSERT(pInputReal != NULL);
506
    DSPASSERT(pInputImaginary != NULL);
507
    DSPASSERT(uLength >= 4);
508
    DSPASSERT(ISPOWEROF2(uLength));
509
 
510
    FLOAT32 flOneOverLength = 1.0f / uLength;
511
 
512
 
513
    // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2
514
        XVECTOR vOneOverLength = _mm_set_ps1(flOneOverLength);
515
 
516
        for (UINT32 uIndex=0; uIndex<(uLength>>2); ++uIndex) {
517
            XVECTOR vReal      = _mm_mul_ps(pInputReal[uIndex], vOneOverLength);
518
            XVECTOR vImaginary = _mm_mul_ps(pInputImaginary[uIndex], vOneOverLength);
519
            XVECTOR vRR        = _mm_mul_ps(vReal, vReal);
520
            XVECTOR vII        = _mm_mul_ps(vImaginary, vImaginary);
521
            XVECTOR vRRplusII  = _mm_add_ps(vRR, vII);
522
            XVECTOR vTotal  = _mm_sqrt_ps(vRRplusII);
523
            pOutput[uIndex] = _mm_add_ps(vTotal, vTotal);
524
        }
525
}
526
 
527
 
528
#pragma warning(pop)
529
}; // namespace XDSP
530
//---------------------------------<-EOF->----------------------------------//
531