Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
1 | pmbaty | 1 | /*-========================================================================-_ |
2 | | - XDSP - | |
||
3 | | Copyright (c) Microsoft Corporation. All rights reserved. | |
||
4 | |~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~| |
||
5 | |PROJECT: XDSP MODEL: Unmanaged User-mode | |
||
6 | |VERSION: 1.1 EXCEPT: No Exceptions | |
||
7 | |CLASS: N / A MINREQ: WinXP, Xbox360 | |
||
8 | |BASE: N / A DIALECT: MSC++ 14.00 | |
||
9 | |>------------------------------------------------------------------------<| |
||
10 | | DUTY: DSP functions with CPU extension specific optimizations | |
||
11 | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^ |
||
12 | NOTES: |
||
13 | 1. Definition of terms: |
||
14 | DSP: Digital Signal Processing. |
||
15 | FFT: Fast Fourier Transform. |
||
16 | |||
17 | 2. All buffer parameters must be 16-byte aligned. |
||
18 | |||
19 | 3. All FFT functions support only FLOAT32 mono audio. */ |
||
20 | |||
21 | #pragma once |
||
22 | //--------------<D-E-F-I-N-I-T-I-O-N-S>-------------------------------------// |
||
23 | #include <windef.h> // general windows types |
||
24 | #include <math.h> // trigonometric functions |
||
25 | #if defined(_XBOX) // SIMD intrinsics |
||
26 | #include <ppcintrinsics.h> |
||
27 | #else |
||
28 | #include <emmintrin.h> |
||
29 | #endif |
||
30 | |||
31 | |||
32 | //--------------<M-A-C-R-O-S>-----------------------------------------------// |
||
33 | // assertion |
||
34 | #if !defined(DSPASSERT) |
||
35 | #if DBG |
||
36 | #define DSPASSERT(exp) if (!(exp)) { OutputDebugStringA("XDSP ASSERT: " #exp ", {" __FUNCTION__ "}\n"); __debugbreak(); } |
||
37 | #else |
||
38 | #define DSPASSERT(exp) __assume(exp) |
||
39 | #endif |
||
40 | #endif |
||
41 | |||
42 | // true if n is a power of 2 |
||
43 | #if !defined(ISPOWEROF2) |
||
44 | #define ISPOWEROF2(n) ( ((n)&((n)-1)) == 0 && (n) != 0 ) |
||
45 | #endif |
||
46 | |||
47 | |||
48 | //--------------<H-E-L-P-E-R-S>---------------------------------------------// |
||
49 | namespace XDSP { |
||
50 | #pragma warning(push) |
||
51 | #pragma warning(disable: 4328 4640) // disable "indirection alignment of formal parameter", "construction of local static object is not thread-safe" compile warnings |
||
52 | |||
53 | |||
54 | // Helper functions, used by the FFT functions. |
||
55 | // The application need not call them directly. |
||
56 | |||
57 | // primitive types |
||
58 | typedef __m128 XVECTOR; |
||
59 | typedef XVECTOR& XVECTORREF; |
||
60 | typedef const XVECTOR& XVECTORREFC; |
||
61 | |||
62 | |||
63 | // Parallel multiplication of four complex numbers, assuming |
||
64 | // real and imaginary values are stored in separate vectors. |
||
65 | __forceinline void vmulComplex (__out XVECTORREF rResult, __out XVECTORREF iResult, __in XVECTORREFC r1, __in XVECTORREFC i1, __in XVECTORREFC r2, __in XVECTORREFC i2) |
||
66 | { |
||
67 | // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) |
||
68 | XVECTOR vi1i2 = _mm_mul_ps(i1, i2); |
||
69 | XVECTOR vr1r2 = _mm_mul_ps(r1, r2); |
||
70 | XVECTOR vr1i2 = _mm_mul_ps(r1, i2); |
||
71 | XVECTOR vr2i1 = _mm_mul_ps(r2, i1); |
||
72 | rResult = _mm_sub_ps(vr1r2, vi1i2); // real: (r1*r2 - i1*i2) |
||
73 | iResult = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1) |
||
74 | } |
||
75 | __forceinline void vmulComplex (__inout XVECTORREF r1, __inout XVECTORREF i1, __in XVECTORREFC r2, __in XVECTORREFC i2) |
||
76 | { |
||
77 | // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1) |
||
78 | XVECTOR vi1i2 = _mm_mul_ps(i1, i2); |
||
79 | XVECTOR vr1r2 = _mm_mul_ps(r1, r2); |
||
80 | XVECTOR vr1i2 = _mm_mul_ps(r1, i2); |
||
81 | XVECTOR vr2i1 = _mm_mul_ps(r2, i1); |
||
82 | r1 = _mm_sub_ps(vr1r2, vi1i2); // real: (r1*r2 - i1*i2) |
||
83 | i1 = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1) |
||
84 | } |
||
85 | |||
86 | |||
87 | // Radix-4 decimation-in-time FFT butterfly. |
||
88 | // This version assumes that all four elements of the butterfly are |
||
89 | // adjacent in a single vector. |
||
90 | // |
||
91 | // Compute the product of the complex input vector and the |
||
92 | // 4-element DFT matrix: |
||
93 | // | 1 1 1 1 | | (r1X,i1X) | |
||
94 | // | 1 -j -1 j | | (r1Y,i1Y) | |
||
95 | // | 1 -1 1 -1 | | (r1Z,i1Z) | |
||
96 | // | 1 j -1 -j | | (r1W,i1W) | |
||
97 | // |
||
98 | // This matrix can be decomposed into two simpler ones to reduce the |
||
99 | // number of additions needed. The decomposed matrices look like this: |
||
100 | // | 1 0 1 0 | | 1 0 1 0 | |
||
101 | // | 0 1 0 -j | | 1 0 -1 0 | |
||
102 | // | 1 0 -1 0 | | 0 1 0 1 | |
||
103 | // | 0 1 0 j | | 0 1 0 -1 | |
||
104 | // |
||
105 | // Combine as follows: |
||
106 | // | 1 0 1 0 | | (r1X,i1X) | | (r1X + r1Z, i1X + i1Z) | |
||
107 | // Temp = | 1 0 -1 0 | * | (r1Y,i1Y) | = | (r1X - r1Z, i1X - i1Z) | |
||
108 | // | 0 1 0 1 | | (r1Z,i1Z) | | (r1Y + r1W, i1Y + i1W) | |
||
109 | // | 0 1 0 -1 | | (r1W,i1W) | | (r1Y - r1W, i1Y - i1W) | |
||
110 | // |
||
111 | // | 1 0 1 0 | | (rTempX,iTempX) | | (rTempX + rTempZ, iTempX + iTempZ) | |
||
112 | // Result = | 0 1 0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) | |
||
113 | // | 1 0 -1 0 | | (rTempZ,iTempZ) | | (rTempX - rTempZ, iTempX - iTempZ) | |
||
114 | // | 0 1 0 j | | (rTempW,iTempW) | | (rTempY - iTempW, iTempY + rTempW) | |
||
115 | __forceinline void ButterflyDIT4_1 (__inout XVECTORREF r1, __inout XVECTORREF i1) |
||
116 | { |
||
117 | // sign constants for radix-4 butterflies |
||
118 | const static XVECTOR vDFT4SignBits1 = { 0.0f, -0.0f, 0.0f, -0.0f }; |
||
119 | const static XVECTOR vDFT4SignBits2 = { 0.0f, 0.0f, -0.0f, -0.0f }; |
||
120 | const static XVECTOR vDFT4SignBits3 = { 0.0f, -0.0f, -0.0f, 0.0f }; |
||
121 | |||
122 | |||
123 | // calculating Temp |
||
124 | XVECTOR rTemp = _mm_add_ps( _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(1, 1, 0, 0)), // [r1X| r1X|r1Y| r1Y] + |
||
125 | _mm_xor_ps(_mm_shuffle_ps(r1, r1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [r1Z|-r1Z|r1W|-r1W] |
||
126 | XVECTOR iTemp = _mm_add_ps( _mm_shuffle_ps(i1, i1, _MM_SHUFFLE(1, 1, 0, 0)), // [i1X| i1X|i1Y| i1Y] + |
||
127 | _mm_xor_ps(_mm_shuffle_ps(i1, i1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [i1Z|-i1Z|i1W|-i1W] |
||
128 | |||
129 | // calculating Result |
||
130 | XVECTOR rZrWiZiW = _mm_shuffle_ps(rTemp, iTemp, _MM_SHUFFLE(3, 2, 3, 2)); // [rTempZ|rTempW|iTempZ|iTempW] |
||
131 | XVECTOR rZiWrZiW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(3, 0, 3, 0)); // [rTempZ|iTempW|rTempZ|iTempW] |
||
132 | XVECTOR iZrWiZrW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(1, 2, 1, 2)); // [rTempZ|iTempW|rTempZ|iTempW] |
||
133 | r1 = _mm_add_ps( _mm_shuffle_ps(rTemp, rTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [rTempX| rTempY| rTempX| rTempY] + |
||
134 | _mm_xor_ps(rZiWrZiW, vDFT4SignBits2) ); // [rTempZ| iTempW|-rTempZ|-iTempW] |
||
135 | i1 = _mm_add_ps( _mm_shuffle_ps(iTemp, iTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [iTempX| iTempY| iTempX| iTempY] + |
||
136 | _mm_xor_ps(iZrWiZrW, vDFT4SignBits3) ); // [iTempZ|-rTempW|-iTempZ| rTempW] |
||
137 | } |
||
138 | |||
139 | // Radix-4 decimation-in-time FFT butterfly. |
||
140 | // This version assumes that elements of the butterfly are |
||
141 | // in different vectors, so that each vector in the input |
||
142 | // contains elements from four different butterflies. |
||
143 | // The four separate butterflies are processed in parallel. |
||
144 | // |
||
145 | // The calculations here are the same as the ones in the single-vector |
||
146 | // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W) |
||
147 | // they are done in parallel on sixteen independent complex values. |
||
148 | // There is no interdependence between the vector elements: |
||
149 | // | 1 0 1 0 | | (rIn0,iIn0) | | (rIn0 + rIn2, iIn0 + iIn2) | |
||
150 | // | 1 0 -1 0 | * | (rIn1,iIn1) | = Temp = | (rIn0 - rIn2, iIn0 - iIn2) | |
||
151 | // | 0 1 0 1 | | (rIn2,iIn2) | | (rIn1 + rIn3, iIn1 + iIn3) | |
||
152 | // | 0 1 0 -1 | | (rIn3,iIn3) | | (rIn1 - rIn3, iIn1 - iIn3) | |
||
153 | // |
||
154 | // | 1 0 1 0 | | (rTemp0,iTemp0) | | (rTemp0 + rTemp2, iTemp0 + iTemp2) | |
||
155 | // Result = | 0 1 0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) | |
||
156 | // | 1 0 -1 0 | | (rTemp2,iTemp2) | | (rTemp0 - rTemp2, iTemp0 - iTemp2) | |
||
157 | // | 0 1 0 j | | (rTemp3,iTemp3) | | (rTemp1 - iTemp3, iTemp1 + rTemp3) | |
||
158 | __forceinline void ButterflyDIT4_4 (__inout XVECTORREF r0, |
||
159 | __inout XVECTORREF r1, |
||
160 | __inout XVECTORREF r2, |
||
161 | __inout XVECTORREF r3, |
||
162 | __inout XVECTORREF i0, |
||
163 | __inout XVECTORREF i1, |
||
164 | __inout XVECTORREF i2, |
||
165 | __inout XVECTORREF i3, |
||
166 | __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableReal, |
||
167 | __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableImaginary, |
||
168 | const UINT32 uStride, const BOOL fLast) |
||
169 | { |
||
170 | DSPASSERT(pUnityTableReal != NULL); |
||
171 | DSPASSERT(pUnityTableImaginary != NULL); |
||
172 | DSPASSERT((UINT_PTR)pUnityTableReal % 16 == 0); |
||
173 | DSPASSERT((UINT_PTR)pUnityTableImaginary % 16 == 0); |
||
174 | DSPASSERT(ISPOWEROF2(uStride)); |
||
175 | |||
176 | XVECTOR rTemp0, rTemp1, rTemp2, rTemp3, rTemp4, rTemp5, rTemp6, rTemp7; |
||
177 | XVECTOR iTemp0, iTemp1, iTemp2, iTemp3, iTemp4, iTemp5, iTemp6, iTemp7; |
||
178 | |||
179 | |||
180 | // calculating Temp |
||
181 | rTemp0 = _mm_add_ps(r0, r2); iTemp0 = _mm_add_ps(i0, i2); |
||
182 | rTemp2 = _mm_add_ps(r1, r3); iTemp2 = _mm_add_ps(i1, i3); |
||
183 | rTemp1 = _mm_sub_ps(r0, r2); iTemp1 = _mm_sub_ps(i0, i2); |
||
184 | rTemp3 = _mm_sub_ps(r1, r3); iTemp3 = _mm_sub_ps(i1, i3); |
||
185 | rTemp4 = _mm_add_ps(rTemp0, rTemp2); iTemp4 = _mm_add_ps(iTemp0, iTemp2); |
||
186 | rTemp5 = _mm_add_ps(rTemp1, iTemp3); iTemp5 = _mm_sub_ps(iTemp1, rTemp3); |
||
187 | rTemp6 = _mm_sub_ps(rTemp0, rTemp2); iTemp6 = _mm_sub_ps(iTemp0, iTemp2); |
||
188 | rTemp7 = _mm_sub_ps(rTemp1, iTemp3); iTemp7 = _mm_add_ps(iTemp1, rTemp3); |
||
189 | |||
190 | // calculating Result |
||
191 | // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial |
||
192 | vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]); |
||
193 | vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]); |
||
194 | vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]); |
||
195 | if (fLast) { |
||
196 | ButterflyDIT4_1(rTemp4, iTemp4); |
||
197 | ButterflyDIT4_1(rTemp5, iTemp5); |
||
198 | ButterflyDIT4_1(rTemp6, iTemp6); |
||
199 | ButterflyDIT4_1(rTemp7, iTemp7); |
||
200 | } |
||
201 | |||
202 | |||
203 | r0 = rTemp4; i0 = iTemp4; |
||
204 | r1 = rTemp5; i1 = iTemp5; |
||
205 | r2 = rTemp6; i2 = iTemp6; |
||
206 | r3 = rTemp7; i3 = iTemp7; |
||
207 | } |
||
208 | |||
209 | //--------------<F-U-N-C-T-I-O-N-S>-----------------------------------------// |
||
210 | |||
211 | //// |
||
212 | // DESCRIPTION: |
||
213 | // 4-sample FFT. |
||
214 | // |
||
215 | // PARAMETERS: |
||
216 | // pReal - [inout] real components, must have at least uCount elements |
||
217 | // pImaginary - [inout] imaginary components, must have at least uCount elements |
||
218 | // uCount - [in] number of FFT iterations |
||
219 | // |
||
220 | // RETURN VALUE: |
||
221 | // void |
||
222 | //// |
||
223 | __forceinline void FFT4 (__inout_ecount(uCount) XVECTOR* __restrict pReal, __inout_ecount(uCount) XVECTOR* __restrict pImaginary, const UINT32 uCount=1) |
||
224 | { |
||
225 | DSPASSERT(pReal != NULL); |
||
226 | DSPASSERT(pImaginary != NULL); |
||
227 | DSPASSERT((UINT_PTR)pReal % 16 == 0); |
||
228 | DSPASSERT((UINT_PTR)pImaginary % 16 == 0); |
||
229 | DSPASSERT(ISPOWEROF2(uCount)); |
||
230 | |||
231 | for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) { |
||
232 | ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]); |
||
233 | } |
||
234 | } |
||
235 | |||
236 | |||
237 | |||
238 | //// |
||
239 | // DESCRIPTION: |
||
240 | // 8-sample FFT. |
||
241 | // |
||
242 | // PARAMETERS: |
||
243 | // pReal - [inout] real components, must have at least uCount*2 elements |
||
244 | // pImaginary - [inout] imaginary components, must have at least uCount*2 elements |
||
245 | // uCount - [in] number of FFT iterations |
||
246 | // |
||
247 | // RETURN VALUE: |
||
248 | // void |
||
249 | //// |
||
250 | __forceinline void FFT8 (__inout_ecount(uCount*2) XVECTOR* __restrict pReal, __inout_ecount(uCount*2) XVECTOR* __restrict pImaginary, const UINT32 uCount=1) |
||
251 | { |
||
252 | DSPASSERT(pReal != NULL); |
||
253 | DSPASSERT(pImaginary != NULL); |
||
254 | DSPASSERT((UINT_PTR)pReal % 16 == 0); |
||
255 | DSPASSERT((UINT_PTR)pImaginary % 16 == 0); |
||
256 | DSPASSERT(ISPOWEROF2(uCount)); |
||
257 | |||
258 | static XVECTOR wr1 = { 1.0f, 0.70710677f, 0.0f, -0.70710677f }; |
||
259 | static XVECTOR wi1 = { 0.0f, -0.70710677f, -1.0f, -0.70710677f }; |
||
260 | static XVECTOR wr2 = { -1.0f, -0.70710677f, 0.0f, 0.70710677f }; |
||
261 | static XVECTOR wi2 = { 0.0f, 0.70710677f, 1.0f, 0.70710677f }; |
||
262 | |||
263 | |||
264 | for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) { |
||
265 | XVECTOR* __restrict pR = pReal + uIndex*2; |
||
266 | XVECTOR* __restrict pI = pImaginary + uIndex*2; |
||
267 | |||
268 | XVECTOR oddsR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(3, 1, 3, 1)); |
||
269 | XVECTOR evensR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(2, 0, 2, 0)); |
||
270 | XVECTOR oddsI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(3, 1, 3, 1)); |
||
271 | XVECTOR evensI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(2, 0, 2, 0)); |
||
272 | ButterflyDIT4_1(oddsR, oddsI); |
||
273 | ButterflyDIT4_1(evensR, evensI); |
||
274 | |||
275 | XVECTOR r, i; |
||
276 | vmulComplex(r, i, oddsR, oddsI, wr1, wi1); |
||
277 | pR[0] = _mm_add_ps(evensR, r); |
||
278 | pI[0] = _mm_add_ps(evensI, i); |
||
279 | |||
280 | vmulComplex(r, i, oddsR, oddsI, wr2, wi2); |
||
281 | pR[1] = _mm_add_ps(evensR, r); |
||
282 | pI[1] = _mm_add_ps(evensI, i); |
||
283 | } |
||
284 | } |
||
285 | |||
286 | |||
287 | |||
288 | //// |
||
289 | // DESCRIPTION: |
||
290 | // 16-sample FFT. |
||
291 | // |
||
292 | // PARAMETERS: |
||
293 | // pReal - [inout] real components, must have at least uCount*4 elements |
||
294 | // pImaginary - [inout] imaginary components, must have at least uCount*4 elements |
||
295 | // uCount - [in] number of FFT iterations |
||
296 | // |
||
297 | // RETURN VALUE: |
||
298 | // void |
||
299 | //// |
||
300 | __forceinline void FFT16 (__inout_ecount(uCount*4) XVECTOR* __restrict pReal, __inout_ecount(uCount*4) XVECTOR* __restrict pImaginary, const UINT32 uCount=1) |
||
301 | { |
||
302 | DSPASSERT(pReal != NULL); |
||
303 | DSPASSERT(pImaginary != NULL); |
||
304 | DSPASSERT((UINT_PTR)pReal % 16 == 0); |
||
305 | DSPASSERT((UINT_PTR)pImaginary % 16 == 0); |
||
306 | DSPASSERT(ISPOWEROF2(uCount)); |
||
307 | |||
308 | XVECTOR aUnityTableReal[4] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.92387950f, 0.70710677f, 0.38268343f, 1.0f, 0.70710677f, -4.3711388e-008f, -0.70710677f, 1.0f, 0.38268343f, -0.70710677f, -0.92387950f }; |
||
309 | XVECTOR aUnityTableImaginary[4] = { -0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.38268343f, -0.70710677f, -0.92387950f, -0.0f, -0.70710677f, -1.0f, -0.70710677f, -0.0f, -0.92387950f, -0.70710677f, 0.38268343f }; |
||
310 | |||
311 | |||
312 | for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) { |
||
313 | ButterflyDIT4_4(pReal[uIndex*4], |
||
314 | pReal[uIndex*4 + 1], |
||
315 | pReal[uIndex*4 + 2], |
||
316 | pReal[uIndex*4 + 3], |
||
317 | pImaginary[uIndex*4], |
||
318 | pImaginary[uIndex*4 + 1], |
||
319 | pImaginary[uIndex*4 + 2], |
||
320 | pImaginary[uIndex*4 + 3], |
||
321 | aUnityTableReal, |
||
322 | aUnityTableImaginary, |
||
323 | 1, TRUE); |
||
324 | } |
||
325 | } |
||
326 | |||
327 | |||
328 | |||
329 | //// |
||
330 | // DESCRIPTION: |
||
331 | // 2^N-sample FFT. |
||
332 | // |
||
333 | // REMARKS: |
||
334 | // For FFTs length 16 and below, call FFT16(), FFT8(), or FFT4(). |
||
335 | // |
||
336 | // PARAMETERS: |
||
337 | // pReal - [inout] real components, must have at least (uLength*uCount)/4 elements |
||
338 | // pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements |
||
339 | // pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable() |
||
340 | // uLength - [in] FFT length in samples, must be a power of 2 > 16 |
||
341 | // uCount - [in] number of FFT iterations |
||
342 | // |
||
343 | // RETURN VALUE: |
||
344 | // void |
||
345 | //// |
||
346 | inline void FFT (__inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pReal, __inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pImaginary, __in_ecount(uLength*uCount) const XVECTOR* __restrict pUnityTable, const UINT32 uLength, const UINT32 uCount=1) |
||
347 | { |
||
348 | DSPASSERT(pReal != NULL); |
||
349 | DSPASSERT(pImaginary != NULL); |
||
350 | DSPASSERT(pUnityTable != NULL); |
||
351 | DSPASSERT((UINT_PTR)pReal % 16 == 0); |
||
352 | DSPASSERT((UINT_PTR)pImaginary % 16 == 0); |
||
353 | DSPASSERT((UINT_PTR)pUnityTable % 16 == 0); |
||
354 | DSPASSERT(uLength > 16); |
||
355 | DSPASSERT(ISPOWEROF2(uLength)); |
||
356 | DSPASSERT(ISPOWEROF2(uCount)); |
||
357 | |||
358 | const XVECTOR* __restrict pUnityTableReal = pUnityTable; |
||
359 | const XVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2); |
||
360 | const UINT32 uTotal = uCount * uLength; |
||
361 | const UINT32 uTotal_vectors = uTotal >> 2; |
||
362 | const UINT32 uStage_vectors = uLength >> 2; |
||
363 | const UINT32 uStride = uStage_vectors >> 2; // stride between butterfly elements |
||
364 | const UINT32 uSkip = uStage_vectors - uStride; |
||
365 | |||
366 | |||
367 | for (UINT32 uIndex=0; uIndex<(uTotal_vectors>>2); ++uIndex) { |
||
368 | UINT32 n = (uIndex/uStride) * (uStride + uSkip) + (uIndex % uStride); |
||
369 | ButterflyDIT4_4(pReal[n], |
||
370 | pReal[n + uStride], |
||
371 | pReal[n + uStride*2], |
||
372 | pReal[n + uStride*3], |
||
373 | pImaginary[n ], |
||
374 | pImaginary[n + uStride], |
||
375 | pImaginary[n + uStride*2], |
||
376 | pImaginary[n + uStride*3], |
||
377 | pUnityTableReal + n % uStage_vectors, |
||
378 | pUnityTableImaginary + n % uStage_vectors, |
||
379 | uStride, FALSE); |
||
380 | } |
||
381 | |||
382 | |||
383 | if (uLength > 16*4) { |
||
384 | FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4); |
||
385 | } else if (uLength == 16*4) { |
||
386 | FFT16(pReal, pImaginary, uCount*4); |
||
387 | } else if (uLength == 8*4) { |
||
388 | FFT8(pReal, pImaginary, uCount*4); |
||
389 | } else if (uLength == 4*4) { |
||
390 | FFT4(pReal, pImaginary, uCount*4); |
||
391 | } |
||
392 | } |
||
393 | |||
394 | //--------------------------------------------------------------------------// |
||
395 | //// |
||
396 | // DESCRIPTION: |
||
397 | // Initializes unity roots lookup table used by FFT functions. |
||
398 | // Once initialized, the table need not be initialized again unless a |
||
399 | // different FFT length is desired. |
||
400 | // |
||
401 | // REMARKS: |
||
402 | // The unity tables of FFT length 16 and below are hard coded into the |
||
403 | // respective FFT functions and so need not be initialized. |
||
404 | // |
||
405 | // PARAMETERS: |
||
406 | // pUnityTable - [out] unity table, receives unity roots lookup table, must have at least uLength elements |
||
407 | // uLength - [in] FFT length in samples, must be a power of 2 > 16 |
||
408 | // |
||
409 | // RETURN VALUE: |
||
410 | // void |
||
411 | //// |
||
412 | inline void FFTInitializeUnityTable (__out_ecount(uLength) XVECTOR* __restrict pUnityTable, UINT32 uLength) |
||
413 | { |
||
414 | DSPASSERT(pUnityTable != NULL); |
||
415 | DSPASSERT(uLength > 16); |
||
416 | DSPASSERT(ISPOWEROF2(uLength)); |
||
417 | |||
418 | FLOAT32* __restrict pfUnityTable = (FLOAT32* __restrict)pUnityTable; |
||
419 | |||
420 | |||
421 | // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16... > 16 |
||
422 | do { |
||
423 | FLOAT32 flStep = 6.283185307f / uLength; // 2PI / FFT length |
||
424 | uLength >>= 2; |
||
425 | |||
426 | // pUnityTable[0 to uLength*4-1] contains real components for current FFT length |
||
427 | // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for current FFT length |
||
428 | for (UINT32 i=0; i<4; ++i) { |
||
429 | for (UINT32 j=0; j<uLength; ++j) { |
||
430 | UINT32 uIndex = (i*uLength) + j; |
||
431 | pfUnityTable[uIndex] = cosf(FLOAT32(i)*FLOAT32(j)*flStep); // real component |
||
432 | pfUnityTable[uIndex + uLength*4] = -sinf(FLOAT32(i)*FLOAT32(j)*flStep); // imaginary component |
||
433 | } |
||
434 | } |
||
435 | pfUnityTable += uLength*8; |
||
436 | } while (uLength > 16); |
||
437 | } |
||
438 | |||
439 | |||
440 | //// |
||
441 | // DESCRIPTION: |
||
442 | // The FFT functions generate output in bit reversed order. |
||
443 | // Use this function to re-arrange them into order of increasing frequency. |
||
444 | // |
||
445 | // PARAMETERS: |
||
446 | // pOutput - [out] output buffer, receives samples in order of increasing frequency, must have at least (1<<uLog2Length) elements |
||
447 | // pInput - [in] input buffer, samples in bit reversed order as generated by FFT functions, must have at least (1<<uLog2Length) elements |
||
448 | // uLog2Length - [in] LOG (base 2) of FFT length in samples, must be > 0 |
||
449 | // |
||
450 | // RETURN VALUE: |
||
451 | // void |
||
452 | //// |
||
453 | inline void FFTUnswizzle (__out_ecount(1<<uLog2Length) FLOAT32* __restrict pOutput, __in_ecount(1<<uLog2Length) const FLOAT32* __restrict pInput, const UINT32 uLog2Length) |
||
454 | { |
||
455 | DSPASSERT(pOutput != NULL); |
||
456 | DSPASSERT(pInput != NULL); |
||
457 | DSPASSERT(uLog2Length > 0); |
||
458 | |||
459 | const UINT32 uLength = UINT32(1 << uLog2Length); |
||
460 | |||
461 | |||
462 | if ((uLog2Length & 0x1) == 0) { |
||
463 | // even powers of two |
||
464 | for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) { |
||
465 | UINT32 n = uIndex; |
||
466 | n = ( (n & 0xcccccccc) >> 2 ) | ( (n & 0x33333333) << 2 ); |
||
467 | n = ( (n & 0xf0f0f0f0) >> 4 ) | ( (n & 0x0f0f0f0f) << 4 ); |
||
468 | n = ( (n & 0xff00ff00) >> 8 ) | ( (n & 0x00ff00ff) << 8 ); |
||
469 | n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 ); |
||
470 | n >>= (32 - uLog2Length); |
||
471 | pOutput[n] = pInput[uIndex]; |
||
472 | } |
||
473 | } else { |
||
474 | // odd powers of two |
||
475 | for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) { |
||
476 | UINT32 n = (uIndex>>3); |
||
477 | n = ( (n & 0xcccccccc) >> 2 ) | ( (n & 0x33333333) << 2 ); |
||
478 | n = ( (n & 0xf0f0f0f0) >> 4 ) | ( (n & 0x0f0f0f0f) << 4 ); |
||
479 | n = ( (n & 0xff00ff00) >> 8 ) | ( (n & 0x00ff00ff) << 8 ); |
||
480 | n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 ); |
||
481 | n >>= (32 - (uLog2Length-3)); |
||
482 | n |= ((uIndex & 0x7) << (uLog2Length - 3)); |
||
483 | pOutput[n] = pInput[uIndex]; |
||
484 | } |
||
485 | } |
||
486 | } |
||
487 | |||
488 | |||
489 | //// |
||
490 | // DESCRIPTION: |
||
491 | // Convert complex components to polar form. |
||
492 | // |
||
493 | // PARAMETERS: |
||
494 | // pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements |
||
495 | // pInputReal - [in] input buffer (real components), must have at least uLength/4 elements |
||
496 | // pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements |
||
497 | // uLength - [in] FFT length in samples, must be a power of 2 >= 4 |
||
498 | // |
||
499 | // RETURN VALUE: |
||
500 | // void |
||
501 | //// |
||
502 | inline void FFTPolar (__out_ecount(uLength/4) XVECTOR* __restrict pOutput, __in_ecount(uLength/4) const XVECTOR* __restrict pInputReal, __in_ecount(uLength/4) const XVECTOR* __restrict pInputImaginary, const UINT32 uLength) |
||
503 | { |
||
504 | DSPASSERT(pOutput != NULL); |
||
505 | DSPASSERT(pInputReal != NULL); |
||
506 | DSPASSERT(pInputImaginary != NULL); |
||
507 | DSPASSERT(uLength >= 4); |
||
508 | DSPASSERT(ISPOWEROF2(uLength)); |
||
509 | |||
510 | FLOAT32 flOneOverLength = 1.0f / uLength; |
||
511 | |||
512 | |||
513 | // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2 |
||
514 | XVECTOR vOneOverLength = _mm_set_ps1(flOneOverLength); |
||
515 | |||
516 | for (UINT32 uIndex=0; uIndex<(uLength>>2); ++uIndex) { |
||
517 | XVECTOR vReal = _mm_mul_ps(pInputReal[uIndex], vOneOverLength); |
||
518 | XVECTOR vImaginary = _mm_mul_ps(pInputImaginary[uIndex], vOneOverLength); |
||
519 | XVECTOR vRR = _mm_mul_ps(vReal, vReal); |
||
520 | XVECTOR vII = _mm_mul_ps(vImaginary, vImaginary); |
||
521 | XVECTOR vRRplusII = _mm_add_ps(vRR, vII); |
||
522 | XVECTOR vTotal = _mm_sqrt_ps(vRRplusII); |
||
523 | pOutput[uIndex] = _mm_add_ps(vTotal, vTotal); |
||
524 | } |
||
525 | } |
||
526 | |||
527 | |||
528 | #pragma warning(pop) |
||
529 | }; // namespace XDSP |
||
530 | //---------------------------------<-EOF->----------------------------------// |
||
531 |