Details | Last modification | View Log | RSS feed
| Rev | Author | Line No. | Line |
|---|---|---|---|
| 1 | pmbaty | 1 | /*-========================================================================-_ |
| 2 | | - XDSP - | |
||
| 3 | | Copyright (c) Microsoft Corporation. All rights reserved. | |
||
| 4 | |~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| |
||
| 5 | |PROJECT: XDSP MODEL: Unmanaged User-mode | |
||
| 6 | |VERSION: 1.1 EXCEPT: No Exceptions | |
||
| 7 | |CLASS: N / A MINREQ: WinXP, Xbox360 | |
||
| 8 | |BASE: N / A DIALECT: MSC++ 14.00 | |
||
| 9 | |>------------------------------------------------------------------------<| |
||
| 10 | | DUTY: DSP functions with CPU extension specific optimizations | |
||
| 11 | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ |
||
| 12 | NOTES: |
||
| 13 | 1. Definition of terms: |
||
| 14 | DSP: Digital Signal Processing. |
||
| 15 | FFT: Fast Fourier Transform. |
||
| 16 | |||
| 17 | 2. All buffer parameters must be 16-byte aligned. |
||
| 18 | |||
| 19 | 3. All FFT functions support only FLOAT32 mono audio. */ |
||
| 20 | |||
| 21 | #pragma once |
||
| 22 | //--------------<D-E-F-I-N-I-T-I-O-N-S>-------------------------------------// |
||
| 23 | #include <windef.h> // general windows types |
||
| 24 | #include <math.h> // trigonometric functions |
||
| 25 | #if defined(_XBOX) // SIMD intrinsics |
||
| 26 | #include <ppcintrinsics.h> |
||
| 27 | #else |
||
| 28 | #include <emmintrin.h> |
||
| 29 | #endif |
||
| 30 | |||
| 31 | |||
| 32 | //--------------<M-A-C-R-O-S>-----------------------------------------------// |
||
| 33 | // assertion |
||
| 34 | #if !defined(DSPASSERT) |
||
| 35 | #if DBG |
||
| 36 | #define DSPASSERT(exp) if (!(exp)) { OutputDebugStringA("XDSP ASSERT: " #exp ", {" __FUNCTION__ "}\n"); __debugbreak(); } |
||
| 37 | #else |
||
| 38 | #define DSPASSERT(exp) __assume(exp) |
||
| 39 | #endif |
||
| 40 | #endif |
||
| 41 | |||
| 42 | // true if n is a power of 2 |
||
| 43 | #if !defined(ISPOWEROF2) |
||
| 44 | #define ISPOWEROF2(n) ( ((n)&((n)-1)) == 0 && (n) != 0 ) |
||
| 45 | #endif |
||
| 46 | |||
| 47 | |||
| 48 | //--------------<H-E-L-P-E-R-S>---------------------------------------------// |
||
| 49 | namespace XDSP { |
||
| 50 | #pragma warning(push) |
||
| 51 | #pragma warning(disable: 4328 4640) // disable "indirection alignment of formal parameter", "construction of local static object is not thread-safe" compile warnings |
||
| 52 | |||
| 53 | |||
| 54 | // Helper functions, used by the FFT functions. |
||
| 55 | // The application need not call them directly. |
||
| 56 | |||
| 57 | // primitive types |
||
| 58 | typedef __m128 XVECTOR; |
||
| 59 | typedef XVECTOR& XVECTORREF; |
||
| 60 | typedef const XVECTOR& XVECTORREFC; |
||
| 61 | |||
| 62 | |||
| 63 | // Parallel multiplication of four complex numbers, assuming |
||
| 64 | // real and imaginary values are stored in separate vectors. |
||
| 65 | __forceinline void vmulComplex (__out XVECTORREF rResult, __out XVECTORREF iResult, __in XVECTORREFC r1, __in XVECTORREFC i1, __in XVECTORREFC r2, __in XVECTORREFC i2) |
||
| 66 | { |
||
| 67 | // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) |
||
| 68 | XVECTOR vi1i2 = _mm_mul_ps(i1, i2); |
||
| 69 | XVECTOR vr1r2 = _mm_mul_ps(r1, r2); |
||
| 70 | XVECTOR vr1i2 = _mm_mul_ps(r1, i2); |
||
| 71 | XVECTOR vr2i1 = _mm_mul_ps(r2, i1); |
||
| 72 | rResult = _mm_sub_ps(vr1r2, vi1i2); // real: (r1*r2 - i1*i2) |
||
| 73 | iResult = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1) |
||
| 74 | } |
||
| 75 | __forceinline void vmulComplex (__inout XVECTORREF r1, __inout XVECTORREF i1, __in XVECTORREFC r2, __in XVECTORREFC i2) |
||
| 76 | { |
||
| 77 | // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) |
||
| 78 | XVECTOR vi1i2 = _mm_mul_ps(i1, i2); |
||
| 79 | XVECTOR vr1r2 = _mm_mul_ps(r1, r2); |
||
| 80 | XVECTOR vr1i2 = _mm_mul_ps(r1, i2); |
||
| 81 | XVECTOR vr2i1 = _mm_mul_ps(r2, i1); |
||
| 82 | r1 = _mm_sub_ps(vr1r2, vi1i2); // real: (r1*r2 - i1*i2) |
||
| 83 | i1 = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1) |
||
| 84 | } |
||
| 85 | |||
| 86 | |||
| 87 | // Radix-4 decimation-in-time FFT butterfly. |
||
| 88 | // This version assumes that all four elements of the butterfly are |
||
| 89 | // adjacent in a single vector. |
||
| 90 | // |
||
| 91 | // Compute the product of the complex input vector and the |
||
| 92 | // 4-element DFT matrix: |
||
| 93 | // | 1 1 1 1 | | (r1X,i1X) | |
||
| 94 | // | 1 -j -1 j | | (r1Y,i1Y) | |
||
| 95 | // | 1 -1 1 -1 | | (r1Z,i1Z) | |
||
| 96 | // | 1 j -1 -j | | (r1W,i1W) | |
||
| 97 | // |
||
| 98 | // This matrix can be decomposed into two simpler ones to reduce the |
||
| 99 | // number of additions needed. The decomposed matrices look like this: |
||
| 100 | // | 1 0 1 0 | | 1 0 1 0 | |
||
| 101 | // | 0 1 0 -j | | 1 0 -1 0 | |
||
| 102 | // | 1 0 -1 0 | | 0 1 0 1 | |
||
| 103 | // | 0 1 0 j | | 0 1 0 -1 | |
||
| 104 | // |
||
| 105 | // Combine as follows: |
||
| 106 | // | 1 0 1 0 | | (r1X,i1X) | | (r1X + r1Z, i1X + i1Z) | |
||
| 107 | // Temp = | 1 0 -1 0 | * | (r1Y,i1Y) | = | (r1X - r1Z, i1X - i1Z) | |
||
| 108 | // | 0 1 0 1 | | (r1Z,i1Z) | | (r1Y + r1W, i1Y + i1W) | |
||
| 109 | // | 0 1 0 -1 | | (r1W,i1W) | | (r1Y - r1W, i1Y - i1W) | |
||
| 110 | // |
||
| 111 | // | 1 0 1 0 | | (rTempX,iTempX) | | (rTempX + rTempZ, iTempX + iTempZ) | |
||
| 112 | // Result = | 0 1 0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) | |
||
| 113 | // | 1 0 -1 0 | | (rTempZ,iTempZ) | | (rTempX - rTempZ, iTempX - iTempZ) | |
||
| 114 | // | 0 1 0 j | | (rTempW,iTempW) | | (rTempY - iTempW, iTempY + rTempW) | |
||
| 115 | __forceinline void ButterflyDIT4_1 (__inout XVECTORREF r1, __inout XVECTORREF i1) |
||
| 116 | { |
||
| 117 | // sign constants for radix-4 butterflies |
||
| 118 | const static XVECTOR vDFT4SignBits1 = { 0.0f, -0.0f, 0.0f, -0.0f }; |
||
| 119 | const static XVECTOR vDFT4SignBits2 = { 0.0f, 0.0f, -0.0f, -0.0f }; |
||
| 120 | const static XVECTOR vDFT4SignBits3 = { 0.0f, -0.0f, -0.0f, 0.0f }; |
||
| 121 | |||
| 122 | |||
| 123 | // calculating Temp |
||
| 124 | XVECTOR rTemp = _mm_add_ps( _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(1, 1, 0, 0)), // [r1X| r1X|r1Y| r1Y] + |
||
| 125 | _mm_xor_ps(_mm_shuffle_ps(r1, r1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [r1Z|-r1Z|r1W|-r1W] |
||
| 126 | XVECTOR iTemp = _mm_add_ps( _mm_shuffle_ps(i1, i1, _MM_SHUFFLE(1, 1, 0, 0)), // [i1X| i1X|i1Y| i1Y] + |
||
| 127 | _mm_xor_ps(_mm_shuffle_ps(i1, i1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [i1Z|-i1Z|i1W|-i1W] |
||
| 128 | |||
| 129 | // calculating Result |
||
| 130 | XVECTOR rZrWiZiW = _mm_shuffle_ps(rTemp, iTemp, _MM_SHUFFLE(3, 2, 3, 2)); // [rTempZ|rTempW|iTempZ|iTempW] |
||
| 131 | XVECTOR rZiWrZiW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(3, 0, 3, 0)); // [rTempZ|iTempW|rTempZ|iTempW] |
||
| 132 | XVECTOR iZrWiZrW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(1, 2, 1, 2)); // [rTempZ|iTempW|rTempZ|iTempW] |
||
| 133 | r1 = _mm_add_ps( _mm_shuffle_ps(rTemp, rTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [rTempX| rTempY| rTempX| rTempY] + |
||
| 134 | _mm_xor_ps(rZiWrZiW, vDFT4SignBits2) ); // [rTempZ| iTempW|-rTempZ|-iTempW] |
||
| 135 | i1 = _mm_add_ps( _mm_shuffle_ps(iTemp, iTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [iTempX| iTempY| iTempX| iTempY] + |
||
| 136 | _mm_xor_ps(iZrWiZrW, vDFT4SignBits3) ); // [iTempZ|-rTempW|-iTempZ| rTempW] |
||
| 137 | } |
||
| 138 | |||
| 139 | // Radix-4 decimation-in-time FFT butterfly. |
||
| 140 | // This version assumes that elements of the butterfly are |
||
| 141 | // in different vectors, so that each vector in the input |
||
| 142 | // contains elements from four different butterflies. |
||
| 143 | // The four separate butterflies are processed in parallel. |
||
| 144 | // |
||
| 145 | // The calculations here are the same as the ones in the single-vector |
||
| 146 | // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W) |
||
| 147 | // they are done in parallel on sixteen independent complex values. |
||
| 148 | // There is no interdependence between the vector elements: |
||
| 149 | // | 1 0 1 0 | | (rIn0,iIn0) | | (rIn0 + rIn2, iIn0 + iIn2) | |
||
| 150 | // | 1 0 -1 0 | * | (rIn1,iIn1) | = Temp = | (rIn0 - rIn2, iIn0 - iIn2) | |
||
| 151 | // | 0 1 0 1 | | (rIn2,iIn2) | | (rIn1 + rIn3, iIn1 + iIn3) | |
||
| 152 | // | 0 1 0 -1 | | (rIn3,iIn3) | | (rIn1 - rIn3, iIn1 - iIn3) | |
||
| 153 | // |
||
| 154 | // | 1 0 1 0 | | (rTemp0,iTemp0) | | (rTemp0 + rTemp2, iTemp0 + iTemp2) | |
||
| 155 | // Result = | 0 1 0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) | |
||
| 156 | // | 1 0 -1 0 | | (rTemp2,iTemp2) | | (rTemp0 - rTemp2, iTemp0 - iTemp2) | |
||
| 157 | // | 0 1 0 j | | (rTemp3,iTemp3) | | (rTemp1 - iTemp3, iTemp1 + rTemp3) | |
||
| 158 | __forceinline void ButterflyDIT4_4 (__inout XVECTORREF r0, |
||
| 159 | __inout XVECTORREF r1, |
||
| 160 | __inout XVECTORREF r2, |
||
| 161 | __inout XVECTORREF r3, |
||
| 162 | __inout XVECTORREF i0, |
||
| 163 | __inout XVECTORREF i1, |
||
| 164 | __inout XVECTORREF i2, |
||
| 165 | __inout XVECTORREF i3, |
||
| 166 | __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableReal, |
||
| 167 | __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableImaginary, |
||
| 168 | const UINT32 uStride, const BOOL fLast) |
||
| 169 | { |
||
| 170 | DSPASSERT(pUnityTableReal != NULL); |
||
| 171 | DSPASSERT(pUnityTableImaginary != NULL); |
||
| 172 | DSPASSERT((UINT_PTR)pUnityTableReal % 16 == 0); |
||
| 173 | DSPASSERT((UINT_PTR)pUnityTableImaginary % 16 == 0); |
||
| 174 | DSPASSERT(ISPOWEROF2(uStride)); |
||
| 175 | |||
| 176 | XVECTOR rTemp0, rTemp1, rTemp2, rTemp3, rTemp4, rTemp5, rTemp6, rTemp7; |
||
| 177 | XVECTOR iTemp0, iTemp1, iTemp2, iTemp3, iTemp4, iTemp5, iTemp6, iTemp7; |
||
| 178 | |||
| 179 | |||
| 180 | // calculating Temp |
||
| 181 | rTemp0 = _mm_add_ps(r0, r2); iTemp0 = _mm_add_ps(i0, i2); |
||
| 182 | rTemp2 = _mm_add_ps(r1, r3); iTemp2 = _mm_add_ps(i1, i3); |
||
| 183 | rTemp1 = _mm_sub_ps(r0, r2); iTemp1 = _mm_sub_ps(i0, i2); |
||
| 184 | rTemp3 = _mm_sub_ps(r1, r3); iTemp3 = _mm_sub_ps(i1, i3); |
||
| 185 | rTemp4 = _mm_add_ps(rTemp0, rTemp2); iTemp4 = _mm_add_ps(iTemp0, iTemp2); |
||
| 186 | rTemp5 = _mm_add_ps(rTemp1, iTemp3); iTemp5 = _mm_sub_ps(iTemp1, rTemp3); |
||
| 187 | rTemp6 = _mm_sub_ps(rTemp0, rTemp2); iTemp6 = _mm_sub_ps(iTemp0, iTemp2); |
||
| 188 | rTemp7 = _mm_sub_ps(rTemp1, iTemp3); iTemp7 = _mm_add_ps(iTemp1, rTemp3); |
||
| 189 | |||
| 190 | // calculating Result |
||
| 191 | // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial |
||
| 192 | vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]); |
||
| 193 | vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]); |
||
| 194 | vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]); |
||
| 195 | if (fLast) { |
||
| 196 | ButterflyDIT4_1(rTemp4, iTemp4); |
||
| 197 | ButterflyDIT4_1(rTemp5, iTemp5); |
||
| 198 | ButterflyDIT4_1(rTemp6, iTemp6); |
||
| 199 | ButterflyDIT4_1(rTemp7, iTemp7); |
||
| 200 | } |
||
| 201 | |||
| 202 | |||
| 203 | r0 = rTemp4; i0 = iTemp4; |
||
| 204 | r1 = rTemp5; i1 = iTemp5; |
||
| 205 | r2 = rTemp6; i2 = iTemp6; |
||
| 206 | r3 = rTemp7; i3 = iTemp7; |
||
| 207 | } |
||
| 208 | |||
| 209 | //--------------<F-U-N-C-T-I-O-N-S>-----------------------------------------// |
||
| 210 | |||
| 211 | //// |
||
| 212 | // DESCRIPTION: |
||
| 213 | // 4-sample FFT. |
||
| 214 | // |
||
| 215 | // PARAMETERS: |
||
| 216 | // pReal - [inout] real components, must have at least uCount elements |
||
| 217 | // pImaginary - [inout] imaginary components, must have at least uCount elements |
||
| 218 | // uCount - [in] number of FFT iterations |
||
| 219 | // |
||
| 220 | // RETURN VALUE: |
||
| 221 | // void |
||
| 222 | //// |
||
| 223 | __forceinline void FFT4 (__inout_ecount(uCount) XVECTOR* __restrict pReal, __inout_ecount(uCount) XVECTOR* __restrict pImaginary, const UINT32 uCount=1) |
||
| 224 | { |
||
| 225 | DSPASSERT(pReal != NULL); |
||
| 226 | DSPASSERT(pImaginary != NULL); |
||
| 227 | DSPASSERT((UINT_PTR)pReal % 16 == 0); |
||
| 228 | DSPASSERT((UINT_PTR)pImaginary % 16 == 0); |
||
| 229 | DSPASSERT(ISPOWEROF2(uCount)); |
||
| 230 | |||
| 231 | for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) { |
||
| 232 | ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]); |
||
| 233 | } |
||
| 234 | } |
||
| 235 | |||
| 236 | |||
| 237 | |||
| 238 | //// |
||
| 239 | // DESCRIPTION: |
||
| 240 | // 8-sample FFT. |
||
| 241 | // |
||
| 242 | // PARAMETERS: |
||
| 243 | // pReal - [inout] real components, must have at least uCount*2 elements |
||
| 244 | // pImaginary - [inout] imaginary components, must have at least uCount*2 elements |
||
| 245 | // uCount - [in] number of FFT iterations |
||
| 246 | // |
||
| 247 | // RETURN VALUE: |
||
| 248 | // void |
||
| 249 | //// |
||
| 250 | __forceinline void FFT8 (__inout_ecount(uCount*2) XVECTOR* __restrict pReal, __inout_ecount(uCount*2) XVECTOR* __restrict pImaginary, const UINT32 uCount=1) |
||
| 251 | { |
||
| 252 | DSPASSERT(pReal != NULL); |
||
| 253 | DSPASSERT(pImaginary != NULL); |
||
| 254 | DSPASSERT((UINT_PTR)pReal % 16 == 0); |
||
| 255 | DSPASSERT((UINT_PTR)pImaginary % 16 == 0); |
||
| 256 | DSPASSERT(ISPOWEROF2(uCount)); |
||
| 257 | |||
| 258 | static XVECTOR wr1 = { 1.0f, 0.70710677f, 0.0f, -0.70710677f }; |
||
| 259 | static XVECTOR wi1 = { 0.0f, -0.70710677f, -1.0f, -0.70710677f }; |
||
| 260 | static XVECTOR wr2 = { -1.0f, -0.70710677f, 0.0f, 0.70710677f }; |
||
| 261 | static XVECTOR wi2 = { 0.0f, 0.70710677f, 1.0f, 0.70710677f }; |
||
| 262 | |||
| 263 | |||
| 264 | for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) { |
||
| 265 | XVECTOR* __restrict pR = pReal + uIndex*2; |
||
| 266 | XVECTOR* __restrict pI = pImaginary + uIndex*2; |
||
| 267 | |||
| 268 | XVECTOR oddsR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(3, 1, 3, 1)); |
||
| 269 | XVECTOR evensR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(2, 0, 2, 0)); |
||
| 270 | XVECTOR oddsI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(3, 1, 3, 1)); |
||
| 271 | XVECTOR evensI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(2, 0, 2, 0)); |
||
| 272 | ButterflyDIT4_1(oddsR, oddsI); |
||
| 273 | ButterflyDIT4_1(evensR, evensI); |
||
| 274 | |||
| 275 | XVECTOR r, i; |
||
| 276 | vmulComplex(r, i, oddsR, oddsI, wr1, wi1); |
||
| 277 | pR[0] = _mm_add_ps(evensR, r); |
||
| 278 | pI[0] = _mm_add_ps(evensI, i); |
||
| 279 | |||
| 280 | vmulComplex(r, i, oddsR, oddsI, wr2, wi2); |
||
| 281 | pR[1] = _mm_add_ps(evensR, r); |
||
| 282 | pI[1] = _mm_add_ps(evensI, i); |
||
| 283 | } |
||
| 284 | } |
||
| 285 | |||
| 286 | |||
| 287 | |||
| 288 | //// |
||
| 289 | // DESCRIPTION: |
||
| 290 | // 16-sample FFT. |
||
| 291 | // |
||
| 292 | // PARAMETERS: |
||
| 293 | // pReal - [inout] real components, must have at least uCount*4 elements |
||
| 294 | // pImaginary - [inout] imaginary components, must have at least uCount*4 elements |
||
| 295 | // uCount - [in] number of FFT iterations |
||
| 296 | // |
||
| 297 | // RETURN VALUE: |
||
| 298 | // void |
||
| 299 | //// |
||
| 300 | __forceinline void FFT16 (__inout_ecount(uCount*4) XVECTOR* __restrict pReal, __inout_ecount(uCount*4) XVECTOR* __restrict pImaginary, const UINT32 uCount=1) |
||
| 301 | { |
||
| 302 | DSPASSERT(pReal != NULL); |
||
| 303 | DSPASSERT(pImaginary != NULL); |
||
| 304 | DSPASSERT((UINT_PTR)pReal % 16 == 0); |
||
| 305 | DSPASSERT((UINT_PTR)pImaginary % 16 == 0); |
||
| 306 | DSPASSERT(ISPOWEROF2(uCount)); |
||
| 307 | |||
| 308 | XVECTOR aUnityTableReal[4] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.92387950f, 0.70710677f, 0.38268343f, 1.0f, 0.70710677f, -4.3711388e-008f, -0.70710677f, 1.0f, 0.38268343f, -0.70710677f, -0.92387950f }; |
||
| 309 | XVECTOR aUnityTableImaginary[4] = { -0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.38268343f, -0.70710677f, -0.92387950f, -0.0f, -0.70710677f, -1.0f, -0.70710677f, -0.0f, -0.92387950f, -0.70710677f, 0.38268343f }; |
||
| 310 | |||
| 311 | |||
| 312 | for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) { |
||
| 313 | ButterflyDIT4_4(pReal[uIndex*4], |
||
| 314 | pReal[uIndex*4 + 1], |
||
| 315 | pReal[uIndex*4 + 2], |
||
| 316 | pReal[uIndex*4 + 3], |
||
| 317 | pImaginary[uIndex*4], |
||
| 318 | pImaginary[uIndex*4 + 1], |
||
| 319 | pImaginary[uIndex*4 + 2], |
||
| 320 | pImaginary[uIndex*4 + 3], |
||
| 321 | aUnityTableReal, |
||
| 322 | aUnityTableImaginary, |
||
| 323 | 1, TRUE); |
||
| 324 | } |
||
| 325 | } |
||
| 326 | |||
| 327 | |||
| 328 | |||
| 329 | //// |
||
| 330 | // DESCRIPTION: |
||
| 331 | // 2^N-sample FFT. |
||
| 332 | // |
||
| 333 | // REMARKS: |
||
| 334 | // For FFTs length 16 and below, call FFT16(), FFT8(), or FFT4(). |
||
| 335 | // |
||
| 336 | // PARAMETERS: |
||
| 337 | // pReal - [inout] real components, must have at least (uLength*uCount)/4 elements |
||
| 338 | // pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements |
||
| 339 | // pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable() |
||
| 340 | // uLength - [in] FFT length in samples, must be a power of 2 > 16 |
||
| 341 | // uCount - [in] number of FFT iterations |
||
| 342 | // |
||
| 343 | // RETURN VALUE: |
||
| 344 | // void |
||
| 345 | //// |
||
| 346 | inline void FFT (__inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pReal, __inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pImaginary, __in_ecount(uLength*uCount) const XVECTOR* __restrict pUnityTable, const UINT32 uLength, const UINT32 uCount=1) |
||
| 347 | { |
||
| 348 | DSPASSERT(pReal != NULL); |
||
| 349 | DSPASSERT(pImaginary != NULL); |
||
| 350 | DSPASSERT(pUnityTable != NULL); |
||
| 351 | DSPASSERT((UINT_PTR)pReal % 16 == 0); |
||
| 352 | DSPASSERT((UINT_PTR)pImaginary % 16 == 0); |
||
| 353 | DSPASSERT((UINT_PTR)pUnityTable % 16 == 0); |
||
| 354 | DSPASSERT(uLength > 16); |
||
| 355 | DSPASSERT(ISPOWEROF2(uLength)); |
||
| 356 | DSPASSERT(ISPOWEROF2(uCount)); |
||
| 357 | |||
| 358 | const XVECTOR* __restrict pUnityTableReal = pUnityTable; |
||
| 359 | const XVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2); |
||
| 360 | const UINT32 uTotal = uCount * uLength; |
||
| 361 | const UINT32 uTotal_vectors = uTotal >> 2; |
||
| 362 | const UINT32 uStage_vectors = uLength >> 2; |
||
| 363 | const UINT32 uStride = uStage_vectors >> 2; // stride between butterfly elements |
||
| 364 | const UINT32 uSkip = uStage_vectors - uStride; |
||
| 365 | |||
| 366 | |||
| 367 | for (UINT32 uIndex=0; uIndex<(uTotal_vectors>>2); ++uIndex) { |
||
| 368 | UINT32 n = (uIndex/uStride) * (uStride + uSkip) + (uIndex % uStride); |
||
| 369 | ButterflyDIT4_4(pReal[n], |
||
| 370 | pReal[n + uStride], |
||
| 371 | pReal[n + uStride*2], |
||
| 372 | pReal[n + uStride*3], |
||
| 373 | pImaginary[n ], |
||
| 374 | pImaginary[n + uStride], |
||
| 375 | pImaginary[n + uStride*2], |
||
| 376 | pImaginary[n + uStride*3], |
||
| 377 | pUnityTableReal + n % uStage_vectors, |
||
| 378 | pUnityTableImaginary + n % uStage_vectors, |
||
| 379 | uStride, FALSE); |
||
| 380 | } |
||
| 381 | |||
| 382 | |||
| 383 | if (uLength > 16*4) { |
||
| 384 | FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4); |
||
| 385 | } else if (uLength == 16*4) { |
||
| 386 | FFT16(pReal, pImaginary, uCount*4); |
||
| 387 | } else if (uLength == 8*4) { |
||
| 388 | FFT8(pReal, pImaginary, uCount*4); |
||
| 389 | } else if (uLength == 4*4) { |
||
| 390 | FFT4(pReal, pImaginary, uCount*4); |
||
| 391 | } |
||
| 392 | } |
||
| 393 | |||
| 394 | //--------------------------------------------------------------------------// |
||
| 395 | //// |
||
| 396 | // DESCRIPTION: |
||
| 397 | // Initializes unity roots lookup table used by FFT functions. |
||
| 398 | // Once initialized, the table need not be initialized again unless a |
||
| 399 | // different FFT length is desired. |
||
| 400 | // |
||
| 401 | // REMARKS: |
||
| 402 | // The unity tables of FFT length 16 and below are hard coded into the |
||
| 403 | // respective FFT functions and so need not be initialized. |
||
| 404 | // |
||
| 405 | // PARAMETERS: |
||
| 406 | // pUnityTable - [out] unity table, receives unity roots lookup table, must have at least uLength elements |
||
| 407 | // uLength - [in] FFT length in samples, must be a power of 2 > 16 |
||
| 408 | // |
||
| 409 | // RETURN VALUE: |
||
| 410 | // void |
||
| 411 | //// |
||
| 412 | inline void FFTInitializeUnityTable (__out_ecount(uLength) XVECTOR* __restrict pUnityTable, UINT32 uLength) |
||
| 413 | { |
||
| 414 | DSPASSERT(pUnityTable != NULL); |
||
| 415 | DSPASSERT(uLength > 16); |
||
| 416 | DSPASSERT(ISPOWEROF2(uLength)); |
||
| 417 | |||
| 418 | FLOAT32* __restrict pfUnityTable = (FLOAT32* __restrict)pUnityTable; |
||
| 419 | |||
| 420 | |||
| 421 | // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16... > 16 |
||
| 422 | do { |
||
| 423 | FLOAT32 flStep = 6.283185307f / uLength; // 2PI / FFT length |
||
| 424 | uLength >>= 2; |
||
| 425 | |||
| 426 | // pUnityTable[0 to uLength*4-1] contains real components for current FFT length |
||
| 427 | // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for current FFT length |
||
| 428 | for (UINT32 i=0; i<4; ++i) { |
||
| 429 | for (UINT32 j=0; j<uLength; ++j) { |
||
| 430 | UINT32 uIndex = (i*uLength) + j; |
||
| 431 | pfUnityTable[uIndex] = cosf(FLOAT32(i)*FLOAT32(j)*flStep); // real component |
||
| 432 | pfUnityTable[uIndex + uLength*4] = -sinf(FLOAT32(i)*FLOAT32(j)*flStep); // imaginary component |
||
| 433 | } |
||
| 434 | } |
||
| 435 | pfUnityTable += uLength*8; |
||
| 436 | } while (uLength > 16); |
||
| 437 | } |
||
| 438 | |||
| 439 | |||
| 440 | //// |
||
| 441 | // DESCRIPTION: |
||
| 442 | // The FFT functions generate output in bit reversed order. |
||
| 443 | // Use this function to re-arrange them into order of increasing frequency. |
||
| 444 | // |
||
| 445 | // PARAMETERS: |
||
| 446 | // pOutput - [out] output buffer, receives samples in order of increasing frequency, must have at least (1<<uLog2Length) elements |
||
| 447 | // pInput - [in] input buffer, samples in bit reversed order as generated by FFT functions, must have at least (1<<uLog2Length) elements |
||
| 448 | // uLog2Length - [in] LOG (base 2) of FFT length in samples, must be > 0 |
||
| 449 | // |
||
| 450 | // RETURN VALUE: |
||
| 451 | // void |
||
| 452 | //// |
||
| 453 | inline void FFTUnswizzle (__out_ecount(1<<uLog2Length) FLOAT32* __restrict pOutput, __in_ecount(1<<uLog2Length) const FLOAT32* __restrict pInput, const UINT32 uLog2Length) |
||
| 454 | { |
||
| 455 | DSPASSERT(pOutput != NULL); |
||
| 456 | DSPASSERT(pInput != NULL); |
||
| 457 | DSPASSERT(uLog2Length > 0); |
||
| 458 | |||
| 459 | const UINT32 uLength = UINT32(1 << uLog2Length); |
||
| 460 | |||
| 461 | |||
| 462 | if ((uLog2Length & 0x1) == 0) { |
||
| 463 | // even powers of two |
||
| 464 | for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) { |
||
| 465 | UINT32 n = uIndex; |
||
| 466 | n = ( (n & 0xcccccccc) >> 2 ) | ( (n & 0x33333333) << 2 ); |
||
| 467 | n = ( (n & 0xf0f0f0f0) >> 4 ) | ( (n & 0x0f0f0f0f) << 4 ); |
||
| 468 | n = ( (n & 0xff00ff00) >> 8 ) | ( (n & 0x00ff00ff) << 8 ); |
||
| 469 | n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 ); |
||
| 470 | n >>= (32 - uLog2Length); |
||
| 471 | pOutput[n] = pInput[uIndex]; |
||
| 472 | } |
||
| 473 | } else { |
||
| 474 | // odd powers of two |
||
| 475 | for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) { |
||
| 476 | UINT32 n = (uIndex>>3); |
||
| 477 | n = ( (n & 0xcccccccc) >> 2 ) | ( (n & 0x33333333) << 2 ); |
||
| 478 | n = ( (n & 0xf0f0f0f0) >> 4 ) | ( (n & 0x0f0f0f0f) << 4 ); |
||
| 479 | n = ( (n & 0xff00ff00) >> 8 ) | ( (n & 0x00ff00ff) << 8 ); |
||
| 480 | n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 ); |
||
| 481 | n >>= (32 - (uLog2Length-3)); |
||
| 482 | n |= ((uIndex & 0x7) << (uLog2Length - 3)); |
||
| 483 | pOutput[n] = pInput[uIndex]; |
||
| 484 | } |
||
| 485 | } |
||
| 486 | } |
||
| 487 | |||
| 488 | |||
| 489 | //// |
||
| 490 | // DESCRIPTION: |
||
| 491 | // Convert complex components to polar form. |
||
| 492 | // |
||
| 493 | // PARAMETERS: |
||
| 494 | // pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements |
||
| 495 | // pInputReal - [in] input buffer (real components), must have at least uLength/4 elements |
||
| 496 | // pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements |
||
| 497 | // uLength - [in] FFT length in samples, must be a power of 2 >= 4 |
||
| 498 | // |
||
| 499 | // RETURN VALUE: |
||
| 500 | // void |
||
| 501 | //// |
||
| 502 | inline void FFTPolar (__out_ecount(uLength/4) XVECTOR* __restrict pOutput, __in_ecount(uLength/4) const XVECTOR* __restrict pInputReal, __in_ecount(uLength/4) const XVECTOR* __restrict pInputImaginary, const UINT32 uLength) |
||
| 503 | { |
||
| 504 | DSPASSERT(pOutput != NULL); |
||
| 505 | DSPASSERT(pInputReal != NULL); |
||
| 506 | DSPASSERT(pInputImaginary != NULL); |
||
| 507 | DSPASSERT(uLength >= 4); |
||
| 508 | DSPASSERT(ISPOWEROF2(uLength)); |
||
| 509 | |||
| 510 | FLOAT32 flOneOverLength = 1.0f / uLength; |
||
| 511 | |||
| 512 | |||
| 513 | // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2 |
||
| 514 | XVECTOR vOneOverLength = _mm_set_ps1(flOneOverLength); |
||
| 515 | |||
| 516 | for (UINT32 uIndex=0; uIndex<(uLength>>2); ++uIndex) { |
||
| 517 | XVECTOR vReal = _mm_mul_ps(pInputReal[uIndex], vOneOverLength); |
||
| 518 | XVECTOR vImaginary = _mm_mul_ps(pInputImaginary[uIndex], vOneOverLength); |
||
| 519 | XVECTOR vRR = _mm_mul_ps(vReal, vReal); |
||
| 520 | XVECTOR vII = _mm_mul_ps(vImaginary, vImaginary); |
||
| 521 | XVECTOR vRRplusII = _mm_add_ps(vRR, vII); |
||
| 522 | XVECTOR vTotal = _mm_sqrt_ps(vRRplusII); |
||
| 523 | pOutput[uIndex] = _mm_add_ps(vTotal, vTotal); |
||
| 524 | } |
||
| 525 | } |
||
| 526 | |||
| 527 | |||
| 528 | #pragma warning(pop) |
||
| 529 | }; // namespace XDSP |
||
| 530 | //---------------------------------<-EOF->----------------------------------// |
||
| 531 |