Subversion Repositories Games.Chess Giants

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*-========================================================================-_
  2.  |                                 - XDSP -                                 |
  3.  |        Copyright (c) Microsoft Corporation.  All rights reserved.        |
  4.  |~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~|
  5.  |PROJECT: XDSP                         MODEL:   Unmanaged User-mode        |
  6.  |VERSION: 1.1                          EXCEPT:  No Exceptions              |
  7.  |CLASS:   N / A                        MINREQ:  WinXP, Xbox360             |
  8.  |BASE:    N / A                        DIALECT: MSC++ 14.00                |
  9.  |>------------------------------------------------------------------------<|
  10.  | DUTY: DSP functions with CPU extension specific optimizations            |
  11.  ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
  12.   NOTES:
  13.     1.  Definition of terms:
  14.             DSP: Digital Signal Processing.
  15.             FFT: Fast Fourier Transform.
  16.  
  17.     2.  All buffer parameters must be 16-byte aligned.
  18.  
  19.     3.  All FFT functions support only FLOAT32 mono audio.                  */
  20.  
  21. #pragma once
  22. //--------------<D-E-F-I-N-I-T-I-O-N-S>-------------------------------------//
  23. #include <windef.h> // general windows types
  24. #include <math.h>   // trigonometric functions
  25. #if defined(_XBOX)  // SIMD intrinsics
  26.     #include <ppcintrinsics.h>
  27. #else
  28.     #include <emmintrin.h>
  29. #endif
  30.  
  31.  
  32. //--------------<M-A-C-R-O-S>-----------------------------------------------//
  33. // assertion
  34. #if !defined(DSPASSERT)
  35.     #if DBG
  36.         #define DSPASSERT(exp) if (!(exp)) { OutputDebugStringA("XDSP ASSERT: " #exp ", {" __FUNCTION__ "}\n"); __debugbreak(); }
  37.     #else
  38.         #define DSPASSERT(exp) __assume(exp)
  39.     #endif
  40. #endif
  41.  
  42. // true if n is a power of 2
  43. #if !defined(ISPOWEROF2)
  44.     #define ISPOWEROF2(n) ( ((n)&((n)-1)) == 0 && (n) != 0 )
  45. #endif
  46.  
  47.  
  48. //--------------<H-E-L-P-E-R-S>---------------------------------------------//
  49. namespace XDSP {
  50. #pragma warning(push)
  51. #pragma warning(disable: 4328 4640) // disable "indirection alignment of formal parameter", "construction of local static object is not thread-safe" compile warnings
  52.  
  53.  
  54. // Helper functions, used by the FFT functions.
  55. // The application need not call them directly.
  56.  
  57.     // primitive types
  58.     typedef __m128 XVECTOR;
  59.     typedef XVECTOR& XVECTORREF;
  60.     typedef const XVECTOR& XVECTORREFC;
  61.  
  62.  
  63.     // Parallel multiplication of four complex numbers, assuming
  64.     // real and imaginary values are stored in separate vectors.
  65.     __forceinline void vmulComplex (__out XVECTORREF rResult, __out XVECTORREF iResult, __in XVECTORREFC r1, __in XVECTORREFC i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
  66.     {
  67.         // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
  68.         XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
  69.         XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
  70.         XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
  71.         XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
  72.         rResult = _mm_sub_ps(vr1r2, vi1i2); // real:      (r1*r2 - i1*i2)
  73.         iResult = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1)
  74.     }
  75.     __forceinline void vmulComplex (__inout XVECTORREF r1, __inout XVECTORREF i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
  76.     {
  77.         // (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
  78.         XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
  79.         XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
  80.         XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
  81.         XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
  82.         r1 = _mm_sub_ps(vr1r2, vi1i2); // real:      (r1*r2 - i1*i2)
  83.         i1 = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1*i2 + r2*i1)
  84.     }
  85.  
  86.  
  87.     // Radix-4 decimation-in-time FFT butterfly.
  88.     // This version assumes that all four elements of the butterfly are
  89.     // adjacent in a single vector.
  90.     //
  91.     // Compute the product of the complex input vector and the
  92.     // 4-element DFT matrix:
  93.     //     | 1  1  1  1 |    | (r1X,i1X) |
  94.     //     | 1 -j -1  j |    | (r1Y,i1Y) |
  95.     //     | 1 -1  1 -1 |    | (r1Z,i1Z) |
  96.     //     | 1  j -1 -j |    | (r1W,i1W) |
  97.     //
  98.     // This matrix can be decomposed into two simpler ones to reduce the
  99.     // number of additions needed. The decomposed matrices look like this:
  100.     //     | 1  0  1  0 |    | 1  0  1  0 |
  101.     //     | 0  1  0 -j |    | 1  0 -1  0 |
  102.     //     | 1  0 -1  0 |    | 0  1  0  1 |
  103.     //     | 0  1  0  j |    | 0  1  0 -1 |
  104.     //
  105.     // Combine as follows:
  106.     //          | 1  0  1  0 |   | (r1X,i1X) |         | (r1X + r1Z, i1X + i1Z) |
  107.     // Temp   = | 1  0 -1  0 | * | (r1Y,i1Y) |       = | (r1X - r1Z, i1X - i1Z) |
  108.     //          | 0  1  0  1 |   | (r1Z,i1Z) |         | (r1Y + r1W, i1Y + i1W) |
  109.     //          | 0  1  0 -1 |   | (r1W,i1W) |         | (r1Y - r1W, i1Y - i1W) |
  110.     //
  111.     //          | 1  0  1  0 |   | (rTempX,iTempX) |   | (rTempX + rTempZ, iTempX + iTempZ) |
  112.     // Result = | 0  1  0 -j | * | (rTempY,iTempY) | = | (rTempY + iTempW, iTempY - rTempW) |
  113.     //          | 1  0 -1  0 |   | (rTempZ,iTempZ) |   | (rTempX - rTempZ, iTempX - iTempZ) |
  114.     //          | 0  1  0  j |   | (rTempW,iTempW) |   | (rTempY - iTempW, iTempY + rTempW) |
  115.     __forceinline void ButterflyDIT4_1 (__inout XVECTORREF r1, __inout XVECTORREF i1)
  116.     {
  117.         // sign constants for radix-4 butterflies
  118.         const static XVECTOR vDFT4SignBits1 = { 0.0f, -0.0f,  0.0f, -0.0f };
  119.         const static XVECTOR vDFT4SignBits2 = { 0.0f,  0.0f, -0.0f, -0.0f };
  120.         const static XVECTOR vDFT4SignBits3 = { 0.0f, -0.0f, -0.0f,  0.0f };
  121.  
  122.  
  123.         // calculating Temp
  124.         XVECTOR rTemp = _mm_add_ps( _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(1, 1, 0, 0)),                               // [r1X| r1X|r1Y| r1Y] +
  125.                                     _mm_xor_ps(_mm_shuffle_ps(r1, r1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [r1Z|-r1Z|r1W|-r1W]
  126.         XVECTOR iTemp = _mm_add_ps( _mm_shuffle_ps(i1, i1, _MM_SHUFFLE(1, 1, 0, 0)),                               // [i1X| i1X|i1Y| i1Y] +
  127.                                     _mm_xor_ps(_mm_shuffle_ps(i1, i1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [i1Z|-i1Z|i1W|-i1W]
  128.  
  129.         // calculating Result
  130.         XVECTOR rZrWiZiW = _mm_shuffle_ps(rTemp, iTemp, _MM_SHUFFLE(3, 2, 3, 2));       // [rTempZ|rTempW|iTempZ|iTempW]
  131.         XVECTOR rZiWrZiW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(3, 0, 3, 0)); // [rTempZ|iTempW|rTempZ|iTempW]
  132.         XVECTOR iZrWiZrW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(1, 2, 1, 2)); // [rTempZ|iTempW|rTempZ|iTempW]
  133.         r1 = _mm_add_ps( _mm_shuffle_ps(rTemp, rTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [rTempX| rTempY| rTempX| rTempY] +
  134.                          _mm_xor_ps(rZiWrZiW, vDFT4SignBits2) );                // [rTempZ| iTempW|-rTempZ|-iTempW]
  135.         i1 = _mm_add_ps( _mm_shuffle_ps(iTemp, iTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [iTempX| iTempY| iTempX| iTempY] +
  136.                          _mm_xor_ps(iZrWiZrW, vDFT4SignBits3) );                // [iTempZ|-rTempW|-iTempZ| rTempW]
  137.     }
  138.  
  139.     // Radix-4 decimation-in-time FFT butterfly.
  140.     // This version assumes that elements of the butterfly are
  141.     // in different vectors, so that each vector in the input
  142.     // contains elements from four different butterflies.
  143.     // The four separate butterflies are processed in parallel.
  144.     //
  145.     // The calculations here are the same as the ones in the single-vector
  146.     // radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W)
  147.     // they are done in parallel on sixteen independent complex values.
  148.     // There is no interdependence between the vector elements:
  149.     // | 1  0  1  0 |    | (rIn0,iIn0) |               | (rIn0 + rIn2, iIn0 + iIn2) |
  150.     // | 1  0 -1  0 | *  | (rIn1,iIn1) |  =   Temp   = | (rIn0 - rIn2, iIn0 - iIn2) |
  151.     // | 0  1  0  1 |    | (rIn2,iIn2) |               | (rIn1 + rIn3, iIn1 + iIn3) |
  152.     // | 0  1  0 -1 |    | (rIn3,iIn3) |               | (rIn1 - rIn3, iIn1 - iIn3) |
  153.     //
  154.     //          | 1  0  1  0 |   | (rTemp0,iTemp0) |   | (rTemp0 + rTemp2, iTemp0 + iTemp2) |
  155.     // Result = | 0  1  0 -j | * | (rTemp1,iTemp1) | = | (rTemp1 + iTemp3, iTemp1 - rTemp3) |
  156.     //          | 1  0 -1  0 |   | (rTemp2,iTemp2) |   | (rTemp0 - rTemp2, iTemp0 - iTemp2) |
  157.     //          | 0  1  0  j |   | (rTemp3,iTemp3) |   | (rTemp1 - iTemp3, iTemp1 + rTemp3) |
  158.     __forceinline void ButterflyDIT4_4 (__inout XVECTORREF r0,
  159.                                         __inout XVECTORREF r1,
  160.                                         __inout XVECTORREF r2,
  161.                                         __inout XVECTORREF r3,
  162.                                         __inout XVECTORREF i0,
  163.                                         __inout XVECTORREF i1,
  164.                                         __inout XVECTORREF i2,
  165.                                         __inout XVECTORREF i3,
  166.                                         __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableReal,
  167.                                         __in_ecount(uStride*4) const XVECTOR* __restrict pUnityTableImaginary,
  168.                                         const UINT32 uStride, const BOOL fLast)
  169.     {
  170.         DSPASSERT(pUnityTableReal != NULL);
  171.         DSPASSERT(pUnityTableImaginary != NULL);
  172.         DSPASSERT((UINT_PTR)pUnityTableReal % 16 == 0);
  173.         DSPASSERT((UINT_PTR)pUnityTableImaginary % 16 == 0);
  174.         DSPASSERT(ISPOWEROF2(uStride));
  175.  
  176.         XVECTOR rTemp0, rTemp1, rTemp2, rTemp3, rTemp4, rTemp5, rTemp6, rTemp7;
  177.         XVECTOR iTemp0, iTemp1, iTemp2, iTemp3, iTemp4, iTemp5, iTemp6, iTemp7;
  178.  
  179.  
  180.         // calculating Temp
  181.         rTemp0 = _mm_add_ps(r0, r2);          iTemp0 = _mm_add_ps(i0, i2);
  182.         rTemp2 = _mm_add_ps(r1, r3);          iTemp2 = _mm_add_ps(i1, i3);
  183.         rTemp1 = _mm_sub_ps(r0, r2);          iTemp1 = _mm_sub_ps(i0, i2);
  184.         rTemp3 = _mm_sub_ps(r1, r3);          iTemp3 = _mm_sub_ps(i1, i3);
  185.         rTemp4 = _mm_add_ps(rTemp0, rTemp2);  iTemp4 = _mm_add_ps(iTemp0, iTemp2);
  186.         rTemp5 = _mm_add_ps(rTemp1, iTemp3);  iTemp5 = _mm_sub_ps(iTemp1, rTemp3);
  187.         rTemp6 = _mm_sub_ps(rTemp0, rTemp2);  iTemp6 = _mm_sub_ps(iTemp0, iTemp2);
  188.         rTemp7 = _mm_sub_ps(rTemp1, iTemp3);  iTemp7 = _mm_add_ps(iTemp1, rTemp3);
  189.  
  190.         // calculating Result
  191.         // vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial
  192.         vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]);
  193.         vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]);
  194.         vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]);
  195.         if (fLast) {
  196.             ButterflyDIT4_1(rTemp4, iTemp4);
  197.             ButterflyDIT4_1(rTemp5, iTemp5);
  198.             ButterflyDIT4_1(rTemp6, iTemp6);
  199.             ButterflyDIT4_1(rTemp7, iTemp7);
  200.         }
  201.  
  202.  
  203.         r0 = rTemp4;    i0 = iTemp4;
  204.         r1 = rTemp5;    i1 = iTemp5;
  205.         r2 = rTemp6;    i2 = iTemp6;
  206.         r3 = rTemp7;    i3 = iTemp7;
  207.     }
  208.  
  209. //--------------<F-U-N-C-T-I-O-N-S>-----------------------------------------//
  210.  
  211.       ////
  212.       // DESCRIPTION:
  213.       //  4-sample FFT.
  214.       //
  215.       // PARAMETERS:
  216.       //  pReal      - [inout] real components, must have at least uCount elements
  217.       //  pImaginary - [inout] imaginary components, must have at least uCount elements
  218.       //  uCount     - [in]    number of FFT iterations
  219.       //
  220.       // RETURN VALUE:
  221.       //  void
  222.       ////
  223.     __forceinline void FFT4 (__inout_ecount(uCount) XVECTOR* __restrict pReal, __inout_ecount(uCount) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
  224.     {
  225.         DSPASSERT(pReal != NULL);
  226.         DSPASSERT(pImaginary != NULL);
  227.         DSPASSERT((UINT_PTR)pReal % 16 == 0);
  228.         DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  229.         DSPASSERT(ISPOWEROF2(uCount));
  230.  
  231.         for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
  232.             ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]);
  233.         }
  234.     }
  235.  
  236.  
  237.  
  238.       ////
  239.       // DESCRIPTION:
  240.       //  8-sample FFT.
  241.       //
  242.       // PARAMETERS:
  243.       //  pReal      - [inout] real components, must have at least uCount*2 elements
  244.       //  pImaginary - [inout] imaginary components, must have at least uCount*2 elements
  245.       //  uCount     - [in]    number of FFT iterations
  246.       //
  247.       // RETURN VALUE:
  248.       //  void
  249.       ////
  250.     __forceinline void FFT8 (__inout_ecount(uCount*2) XVECTOR* __restrict pReal, __inout_ecount(uCount*2) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
  251.     {
  252.         DSPASSERT(pReal != NULL);
  253.         DSPASSERT(pImaginary != NULL);
  254.         DSPASSERT((UINT_PTR)pReal % 16 == 0);
  255.         DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  256.         DSPASSERT(ISPOWEROF2(uCount));
  257.  
  258.         static XVECTOR wr1 = {  1.0f,  0.70710677f,  0.0f, -0.70710677f };
  259.         static XVECTOR wi1 = {  0.0f, -0.70710677f, -1.0f, -0.70710677f };
  260.         static XVECTOR wr2 = { -1.0f, -0.70710677f,  0.0f,  0.70710677f };
  261.         static XVECTOR wi2 = {  0.0f,  0.70710677f,  1.0f,  0.70710677f };
  262.  
  263.  
  264.         for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
  265.             XVECTOR* __restrict pR = pReal      + uIndex*2;
  266.             XVECTOR* __restrict pI = pImaginary + uIndex*2;
  267.  
  268.             XVECTOR oddsR  = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(3, 1, 3, 1));
  269.             XVECTOR evensR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(2, 0, 2, 0));
  270.             XVECTOR oddsI  = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(3, 1, 3, 1));
  271.             XVECTOR evensI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(2, 0, 2, 0));
  272.             ButterflyDIT4_1(oddsR, oddsI);
  273.             ButterflyDIT4_1(evensR, evensI);
  274.  
  275.             XVECTOR r, i;
  276.             vmulComplex(r, i, oddsR, oddsI, wr1, wi1);
  277.             pR[0] = _mm_add_ps(evensR, r);
  278.             pI[0] = _mm_add_ps(evensI, i);
  279.  
  280.             vmulComplex(r, i, oddsR, oddsI, wr2, wi2);
  281.             pR[1] = _mm_add_ps(evensR, r);
  282.             pI[1] = _mm_add_ps(evensI, i);
  283.         }
  284.     }
  285.  
  286.  
  287.  
  288.       ////
  289.       // DESCRIPTION:
  290.       //  16-sample FFT.
  291.       //
  292.       // PARAMETERS:
  293.       //  pReal      - [inout] real components, must have at least uCount*4 elements
  294.       //  pImaginary - [inout] imaginary components, must have at least uCount*4 elements
  295.       //  uCount     - [in]    number of FFT iterations
  296.       //
  297.       // RETURN VALUE:
  298.       //  void
  299.       ////
  300.     __forceinline void FFT16 (__inout_ecount(uCount*4) XVECTOR* __restrict pReal, __inout_ecount(uCount*4) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
  301.     {
  302.         DSPASSERT(pReal != NULL);
  303.         DSPASSERT(pImaginary != NULL);
  304.         DSPASSERT((UINT_PTR)pReal % 16 == 0);
  305.         DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  306.         DSPASSERT(ISPOWEROF2(uCount));
  307.  
  308.         XVECTOR aUnityTableReal[4]      = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.92387950f, 0.70710677f, 0.38268343f, 1.0f, 0.70710677f, -4.3711388e-008f, -0.70710677f, 1.0f, 0.38268343f, -0.70710677f, -0.92387950f };
  309.         XVECTOR aUnityTableImaginary[4] = { -0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.38268343f, -0.70710677f, -0.92387950f, -0.0f, -0.70710677f, -1.0f, -0.70710677f, -0.0f, -0.92387950f, -0.70710677f, 0.38268343f };
  310.  
  311.  
  312.         for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
  313.             ButterflyDIT4_4(pReal[uIndex*4],
  314.                             pReal[uIndex*4 + 1],
  315.                             pReal[uIndex*4 + 2],
  316.                             pReal[uIndex*4 + 3],
  317.                             pImaginary[uIndex*4],
  318.                             pImaginary[uIndex*4 + 1],
  319.                             pImaginary[uIndex*4 + 2],
  320.                             pImaginary[uIndex*4 + 3],
  321.                             aUnityTableReal,
  322.                             aUnityTableImaginary,
  323.                             1, TRUE);
  324.         }
  325.     }
  326.  
  327.  
  328.  
  329.       ////
  330.       // DESCRIPTION:
  331.       //  2^N-sample FFT.
  332.       //
  333.       // REMARKS:
  334.       //  For FFTs length 16 and below, call FFT16(), FFT8(), or FFT4().
  335.       //
  336.       // PARAMETERS:
  337.       //  pReal       - [inout] real components, must have at least (uLength*uCount)/4 elements
  338.       //  pImaginary  - [inout] imaginary components, must have at least (uLength*uCount)/4 elements
  339.       //  pUnityTable - [in]    unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable()
  340.       //  uLength     - [in]    FFT length in samples, must be a power of 2 > 16
  341.       //  uCount      - [in]    number of FFT iterations
  342.       //
  343.       // RETURN VALUE:
  344.       //  void
  345.       ////
  346.     inline void FFT (__inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pReal, __inout_ecount((uLength*uCount)/4) XVECTOR* __restrict pImaginary, __in_ecount(uLength*uCount) const XVECTOR* __restrict pUnityTable, const UINT32 uLength, const UINT32 uCount=1)
  347.     {
  348.         DSPASSERT(pReal != NULL);
  349.         DSPASSERT(pImaginary != NULL);
  350.         DSPASSERT(pUnityTable != NULL);
  351.         DSPASSERT((UINT_PTR)pReal % 16 == 0);
  352.         DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
  353.         DSPASSERT((UINT_PTR)pUnityTable % 16 == 0);
  354.         DSPASSERT(uLength > 16);
  355.         DSPASSERT(ISPOWEROF2(uLength));
  356.         DSPASSERT(ISPOWEROF2(uCount));
  357.  
  358.         const XVECTOR* __restrict pUnityTableReal      = pUnityTable;
  359.         const XVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2);
  360.         const UINT32 uTotal         = uCount * uLength;
  361.         const UINT32 uTotal_vectors = uTotal >> 2;
  362.         const UINT32 uStage_vectors = uLength >> 2;
  363.         const UINT32 uStride        = uStage_vectors >> 2; // stride between butterfly elements
  364.         const UINT32 uSkip          = uStage_vectors - uStride;
  365.  
  366.  
  367.         for (UINT32 uIndex=0; uIndex<(uTotal_vectors>>2); ++uIndex) {
  368.             UINT32 n = (uIndex/uStride) * (uStride + uSkip) + (uIndex % uStride);
  369.             ButterflyDIT4_4(pReal[n],
  370.                             pReal[n + uStride],
  371.                             pReal[n + uStride*2],
  372.                             pReal[n + uStride*3],
  373.                             pImaginary[n ],
  374.                             pImaginary[n + uStride],
  375.                             pImaginary[n + uStride*2],
  376.                             pImaginary[n + uStride*3],
  377.                             pUnityTableReal      + n % uStage_vectors,
  378.                             pUnityTableImaginary + n % uStage_vectors,
  379.                             uStride, FALSE);
  380.         }
  381.  
  382.  
  383.         if (uLength > 16*4) {
  384.             FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4);
  385.         } else if (uLength == 16*4) {
  386.             FFT16(pReal, pImaginary, uCount*4);
  387.         } else if (uLength == 8*4) {
  388.             FFT8(pReal, pImaginary, uCount*4);
  389.         } else if (uLength == 4*4) {
  390.             FFT4(pReal, pImaginary, uCount*4);
  391.         }
  392.     }
  393.  
  394. //--------------------------------------------------------------------------//
  395.   ////
  396.   // DESCRIPTION:
  397.   //  Initializes unity roots lookup table used by FFT functions.
  398.   //  Once initialized, the table need not be initialized again unless a
  399.   //  different FFT length is desired.
  400.   //
  401.   // REMARKS:
  402.   //  The unity tables of FFT length 16 and below are hard coded into the
  403.   //  respective FFT functions and so need not be initialized.
  404.   //
  405.   // PARAMETERS:
  406.   //  pUnityTable - [out] unity table, receives unity roots lookup table, must have at least uLength elements
  407.   //  uLength     - [in]  FFT length in samples, must be a power of 2 > 16
  408.   //
  409.   // RETURN VALUE:
  410.   //  void
  411.   ////
  412. inline void FFTInitializeUnityTable (__out_ecount(uLength) XVECTOR* __restrict pUnityTable, UINT32 uLength)
  413. {
  414.     DSPASSERT(pUnityTable != NULL);
  415.     DSPASSERT(uLength > 16);
  416.     DSPASSERT(ISPOWEROF2(uLength));
  417.  
  418.     FLOAT32* __restrict pfUnityTable = (FLOAT32* __restrict)pUnityTable;
  419.  
  420.  
  421.     // initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16... > 16
  422.     do {
  423.         FLOAT32 flStep = 6.283185307f / uLength; // 2PI / FFT length
  424.         uLength >>= 2;
  425.  
  426.         // pUnityTable[0 to uLength*4-1] contains real components for current FFT length
  427.         // pUnityTable[uLength*4 to uLength*8-1] contains imaginary components for current FFT length
  428.         for (UINT32 i=0; i<4; ++i) {
  429.             for (UINT32 j=0; j<uLength; ++j) {
  430.                 UINT32 uIndex = (i*uLength) + j;
  431.                 pfUnityTable[uIndex]             = cosf(FLOAT32(i)*FLOAT32(j)*flStep);  // real component
  432.                 pfUnityTable[uIndex + uLength*4] = -sinf(FLOAT32(i)*FLOAT32(j)*flStep); // imaginary component
  433.             }
  434.         }
  435.         pfUnityTable += uLength*8;
  436.     } while (uLength > 16);
  437. }
  438.  
  439.  
  440.   ////
  441.   // DESCRIPTION:
  442.   //  The FFT functions generate output in bit reversed order.
  443.   //  Use this function to re-arrange them into order of increasing frequency.
  444.   //
  445.   // PARAMETERS:
  446.   //  pOutput     - [out] output buffer, receives samples in order of increasing frequency, must have at least (1<<uLog2Length) elements
  447.   //  pInput      - [in]  input buffer, samples in bit reversed order as generated by FFT functions, must have at least (1<<uLog2Length) elements
  448.   //  uLog2Length - [in]  LOG (base 2) of FFT length in samples, must be > 0
  449.   //
  450.   // RETURN VALUE:
  451.   //  void
  452.   ////
  453. inline void FFTUnswizzle (__out_ecount(1<<uLog2Length) FLOAT32* __restrict pOutput, __in_ecount(1<<uLog2Length) const FLOAT32* __restrict pInput, const UINT32 uLog2Length)
  454. {
  455.     DSPASSERT(pOutput != NULL);
  456.     DSPASSERT(pInput != NULL);
  457.     DSPASSERT(uLog2Length > 0);
  458.  
  459.     const UINT32 uLength = UINT32(1 << uLog2Length);
  460.  
  461.  
  462.     if ((uLog2Length & 0x1) == 0) {
  463.         // even powers of two
  464.         for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
  465.             UINT32 n = uIndex;
  466.             n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
  467.             n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
  468.             n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
  469.             n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
  470.             n >>= (32 - uLog2Length);
  471.             pOutput[n] = pInput[uIndex];
  472.         }
  473.     } else {
  474.         // odd powers of two
  475.         for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
  476.             UINT32 n = (uIndex>>3);
  477.             n = ( (n & 0xcccccccc) >> 2 )  | ( (n & 0x33333333) << 2 );
  478.             n = ( (n & 0xf0f0f0f0) >> 4 )  | ( (n & 0x0f0f0f0f) << 4 );
  479.             n = ( (n & 0xff00ff00) >> 8 )  | ( (n & 0x00ff00ff) << 8 );
  480.             n = ( (n & 0xffff0000) >> 16 ) | ( (n & 0x0000ffff) << 16 );
  481.             n >>= (32 - (uLog2Length-3));
  482.             n |= ((uIndex & 0x7) << (uLog2Length - 3));
  483.             pOutput[n] = pInput[uIndex];
  484.         }
  485.     }
  486. }
  487.  
  488.  
  489.   ////
  490.   // DESCRIPTION:
  491.   //  Convert complex components to polar form.
  492.   //
  493.   // PARAMETERS:
  494.   //  pOutput         - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements
  495.   //  pInputReal      - [in]  input buffer (real components), must have at least uLength/4 elements
  496.   //  pInputImaginary - [in]  input buffer (imaginary components), must have at least uLength/4 elements
  497.   //  uLength         - [in]  FFT length in samples, must be a power of 2 >= 4
  498.   //
  499.   // RETURN VALUE:
  500.   //  void
  501.   ////
  502. inline void FFTPolar (__out_ecount(uLength/4) XVECTOR* __restrict pOutput, __in_ecount(uLength/4) const XVECTOR* __restrict pInputReal, __in_ecount(uLength/4) const XVECTOR* __restrict pInputImaginary, const UINT32 uLength)
  503. {
  504.     DSPASSERT(pOutput != NULL);
  505.     DSPASSERT(pInputReal != NULL);
  506.     DSPASSERT(pInputImaginary != NULL);
  507.     DSPASSERT(uLength >= 4);
  508.     DSPASSERT(ISPOWEROF2(uLength));
  509.  
  510.     FLOAT32 flOneOverLength = 1.0f / uLength;
  511.  
  512.  
  513.     // result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2
  514.         XVECTOR vOneOverLength = _mm_set_ps1(flOneOverLength);
  515.  
  516.         for (UINT32 uIndex=0; uIndex<(uLength>>2); ++uIndex) {
  517.             XVECTOR vReal      = _mm_mul_ps(pInputReal[uIndex], vOneOverLength);
  518.             XVECTOR vImaginary = _mm_mul_ps(pInputImaginary[uIndex], vOneOverLength);
  519.             XVECTOR vRR        = _mm_mul_ps(vReal, vReal);
  520.             XVECTOR vII        = _mm_mul_ps(vImaginary, vImaginary);
  521.             XVECTOR vRRplusII  = _mm_add_ps(vRR, vII);
  522.             XVECTOR vTotal  = _mm_sqrt_ps(vRRplusII);
  523.             pOutput[uIndex] = _mm_add_ps(vTotal, vTotal);
  524.         }
  525. }
  526.  
  527.  
  528. #pragma warning(pop)
  529. }; // namespace XDSP
  530. //---------------------------------<-EOF->----------------------------------//
  531.  
  532.