WebSVN – Games.Chess Giants – Blame – /DirectX9/Include/XDSP.h

Rev	Author	Line No.	Line
1	pmbaty	1	/*-========================================================================-_
		2	\| - XDSP - \|
		3	\| Copyright (c) Microsoft Corporation. All rights reserved. \|
		4	\|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\|
		5	\|PROJECT: XDSP MODEL: Unmanaged User-mode \|
		6	\|VERSION: 1.1 EXCEPT: No Exceptions \|
		7	\|CLASS: N / A MINREQ: WinXP, Xbox360 \|
		8	\|BASE: N / A DIALECT: MSC++ 14.00 \|
		9	\|>------------------------------------------------------------------------<\|
		10	\| DUTY: DSP functions with CPU extension specific optimizations \|
		11	^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^
		12	NOTES:
		13	1. Definition of terms:
		14	DSP: Digital Signal Processing.
		15	FFT: Fast Fourier Transform.
		16
		17	2. All buffer parameters must be 16-byte aligned.
		18
		19	3. All FFT functions support only FLOAT32 mono audio. */
		20
		21	#pragma once
		22	//--------------<D-E-F-I-N-I-T-I-O-N-S>-------------------------------------//
		23	#include <windef.h> // general windows types
		24	#include <math.h> // trigonometric functions
		25	#if defined(_XBOX) // SIMD intrinsics
		26	#include <ppcintrinsics.h>
		27	#else
		28	#include <emmintrin.h>
		29	#endif
		30
		31
		32	//--------------<M-A-C-R-O-S>-----------------------------------------------//
		33	// assertion
		34	#if !defined(DSPASSERT)
		35	#if DBG
		36	#define DSPASSERT(exp) if (!(exp)) { OutputDebugStringA("XDSP ASSERT: " #exp ", {" __FUNCTION__ "}\n"); __debugbreak(); }
		37	#else
		38	#define DSPASSERT(exp) __assume(exp)
		39	#endif
		40	#endif
		41
		42	// true if n is a power of 2
		43	#if !defined(ISPOWEROF2)
		44	#define ISPOWEROF2(n) ( ((n)&((n)-1)) == 0 && (n) != 0 )
		45	#endif
		46
		47
		48	//--------------<H-E-L-P-E-R-S>---------------------------------------------//
		49	namespace XDSP {
		50	#pragma warning(push)
		51	#pragma warning(disable: 4328 4640) // disable "indirection alignment of formal parameter", "construction of local static object is not thread-safe" compile warnings
		52
		53
		54	// Helper functions, used by the FFT functions.
		55	// The application need not call them directly.
		56
		57	// primitive types
		58	typedef __m128 XVECTOR;
		59	typedef XVECTOR& XVECTORREF;
		60	typedef const XVECTOR& XVECTORREFC;
		61
		62
		63	// Parallel multiplication of four complex numbers, assuming
		64	// real and imaginary values are stored in separate vectors.
		65	__forceinline void vmulComplex (__out XVECTORREF rResult, __out XVECTORREF iResult, __in XVECTORREFC r1, __in XVECTORREFC i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
		66	{
		67	// (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
		68	XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
		69	XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
		70	XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
		71	XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
		72	rResult = _mm_sub_ps(vr1r2, vi1i2); // real: (r1r2 - i1i2)
		73	iResult = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1i2 + r2i1)
		74	}
		75	__forceinline void vmulComplex (__inout XVECTORREF r1, __inout XVECTORREF i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
		76	{
		77	// (r1, i1) * (r2, i2) = (r1r2 - i1i2, r1i2 + r2i1)
		78	XVECTOR vi1i2 = _mm_mul_ps(i1, i2);
		79	XVECTOR vr1r2 = _mm_mul_ps(r1, r2);
		80	XVECTOR vr1i2 = _mm_mul_ps(r1, i2);
		81	XVECTOR vr2i1 = _mm_mul_ps(r2, i1);
		82	r1 = _mm_sub_ps(vr1r2, vi1i2); // real: (r1r2 - i1i2)
		83	i1 = _mm_add_ps(vr1i2, vr2i1); // imaginary: (r1i2 + r2i1)
		84	}
		85
		86
		87	// Radix-4 decimation-in-time FFT butterfly.
		88	// This version assumes that all four elements of the butterfly are
		89	// adjacent in a single vector.
		90	//
		91	// Compute the product of the complex input vector and the
		92	// 4-element DFT matrix:
		93	// \| 1 1 1 1 \| \| (r1X,i1X) \|
		94	// \| 1 -j -1 j \| \| (r1Y,i1Y) \|
		95	// \| 1 -1 1 -1 \| \| (r1Z,i1Z) \|
		96	// \| 1 j -1 -j \| \| (r1W,i1W) \|
		97	//
		98	// This matrix can be decomposed into two simpler ones to reduce the
		99	// number of additions needed. The decomposed matrices look like this:
		100	// \| 1 0 1 0 \| \| 1 0 1 0 \|
		101	// \| 0 1 0 -j \| \| 1 0 -1 0 \|
		102	// \| 1 0 -1 0 \| \| 0 1 0 1 \|
		103	// \| 0 1 0 j \| \| 0 1 0 -1 \|
		104	//
		105	// Combine as follows:
		106	// \| 1 0 1 0 \| \| (r1X,i1X) \| \| (r1X + r1Z, i1X + i1Z) \|
		107	// Temp = \| 1 0 -1 0 \| * \| (r1Y,i1Y) \| = \| (r1X - r1Z, i1X - i1Z) \|
		108	// \| 0 1 0 1 \| \| (r1Z,i1Z) \| \| (r1Y + r1W, i1Y + i1W) \|
		109	// \| 0 1 0 -1 \| \| (r1W,i1W) \| \| (r1Y - r1W, i1Y - i1W) \|
		110	//
		111	// \| 1 0 1 0 \| \| (rTempX,iTempX) \| \| (rTempX + rTempZ, iTempX + iTempZ) \|
		112	// Result = \| 0 1 0 -j \| * \| (rTempY,iTempY) \| = \| (rTempY + iTempW, iTempY - rTempW) \|
		113	// \| 1 0 -1 0 \| \| (rTempZ,iTempZ) \| \| (rTempX - rTempZ, iTempX - iTempZ) \|
		114	// \| 0 1 0 j \| \| (rTempW,iTempW) \| \| (rTempY - iTempW, iTempY + rTempW) \|
		115	__forceinline void ButterflyDIT4_1 (__inout XVECTORREF r1, __inout XVECTORREF i1)
		116	{
		117	// sign constants for radix-4 butterflies
		118	const static XVECTOR vDFT4SignBits1 = { 0.0f, -0.0f, 0.0f, -0.0f };
		119	const static XVECTOR vDFT4SignBits2 = { 0.0f, 0.0f, -0.0f, -0.0f };
		120	const static XVECTOR vDFT4SignBits3 = { 0.0f, -0.0f, -0.0f, 0.0f };
		121
		122
		123	// calculating Temp
		124	XVECTOR rTemp = _mm_add_ps( _mm_shuffle_ps(r1, r1, _MM_SHUFFLE(1, 1, 0, 0)), // [r1X\| r1X\|r1Y\| r1Y] +
		125	_mm_xor_ps(_mm_shuffle_ps(r1, r1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [r1Z\|-r1Z\|r1W\|-r1W]
		126	XVECTOR iTemp = _mm_add_ps( _mm_shuffle_ps(i1, i1, _MM_SHUFFLE(1, 1, 0, 0)), // [i1X\| i1X\|i1Y\| i1Y] +
		127	_mm_xor_ps(_mm_shuffle_ps(i1, i1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) ); // [i1Z\|-i1Z\|i1W\|-i1W]
		128
		129	// calculating Result
		130	XVECTOR rZrWiZiW = _mm_shuffle_ps(rTemp, iTemp, _MM_SHUFFLE(3, 2, 3, 2)); // [rTempZ\|rTempW\|iTempZ\|iTempW]
		131	XVECTOR rZiWrZiW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(3, 0, 3, 0)); // [rTempZ\|iTempW\|rTempZ\|iTempW]
		132	XVECTOR iZrWiZrW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(1, 2, 1, 2)); // [rTempZ\|iTempW\|rTempZ\|iTempW]
		133	r1 = _mm_add_ps( _mm_shuffle_ps(rTemp, rTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [rTempX\| rTempY\| rTempX\| rTempY] +
		134	_mm_xor_ps(rZiWrZiW, vDFT4SignBits2) ); // [rTempZ\| iTempW\|-rTempZ\|-iTempW]
		135	i1 = _mm_add_ps( _mm_shuffle_ps(iTemp, iTemp, _MM_SHUFFLE(1, 0, 1, 0)), // [iTempX\| iTempY\| iTempX\| iTempY] +
		136	_mm_xor_ps(iZrWiZrW, vDFT4SignBits3) ); // [iTempZ\|-rTempW\|-iTempZ\| rTempW]
		137	}
		138
		139	// Radix-4 decimation-in-time FFT butterfly.
		140	// This version assumes that elements of the butterfly are
		141	// in different vectors, so that each vector in the input
		142	// contains elements from four different butterflies.
		143	// The four separate butterflies are processed in parallel.
		144	//
		145	// The calculations here are the same as the ones in the single-vector
		146	// radix-4 DFT, but instead of being done on a single vector (X,Y,Z,W)
		147	// they are done in parallel on sixteen independent complex values.
		148	// There is no interdependence between the vector elements:
		149	// \| 1 0 1 0 \| \| (rIn0,iIn0) \| \| (rIn0 + rIn2, iIn0 + iIn2) \|
		150	// \| 1 0 -1 0 \| * \| (rIn1,iIn1) \| = Temp = \| (rIn0 - rIn2, iIn0 - iIn2) \|
		151	// \| 0 1 0 1 \| \| (rIn2,iIn2) \| \| (rIn1 + rIn3, iIn1 + iIn3) \|
		152	// \| 0 1 0 -1 \| \| (rIn3,iIn3) \| \| (rIn1 - rIn3, iIn1 - iIn3) \|
		153	//
		154	// \| 1 0 1 0 \| \| (rTemp0,iTemp0) \| \| (rTemp0 + rTemp2, iTemp0 + iTemp2) \|
		155	// Result = \| 0 1 0 -j \| * \| (rTemp1,iTemp1) \| = \| (rTemp1 + iTemp3, iTemp1 - rTemp3) \|
		156	// \| 1 0 -1 0 \| \| (rTemp2,iTemp2) \| \| (rTemp0 - rTemp2, iTemp0 - iTemp2) \|
		157	// \| 0 1 0 j \| \| (rTemp3,iTemp3) \| \| (rTemp1 - iTemp3, iTemp1 + rTemp3) \|
		158	__forceinline void ButterflyDIT4_4 (__inout XVECTORREF r0,
		159	__inout XVECTORREF r1,
		160	__inout XVECTORREF r2,
		161	__inout XVECTORREF r3,
		162	__inout XVECTORREF i0,
		163	__inout XVECTORREF i1,
		164	__inout XVECTORREF i2,
		165	__inout XVECTORREF i3,
		166	__in_ecount(uStride4) const XVECTOR __restrict pUnityTableReal,
		167	__in_ecount(uStride4) const XVECTOR __restrict pUnityTableImaginary,
		168	const UINT32 uStride, const BOOL fLast)
		169	{
		170	DSPASSERT(pUnityTableReal != NULL);
		171	DSPASSERT(pUnityTableImaginary != NULL);
		172	DSPASSERT((UINT_PTR)pUnityTableReal % 16 == 0);
		173	DSPASSERT((UINT_PTR)pUnityTableImaginary % 16 == 0);
		174	DSPASSERT(ISPOWEROF2(uStride));
		175
		176	XVECTOR rTemp0, rTemp1, rTemp2, rTemp3, rTemp4, rTemp5, rTemp6, rTemp7;
		177	XVECTOR iTemp0, iTemp1, iTemp2, iTemp3, iTemp4, iTemp5, iTemp6, iTemp7;
		178
		179
		180	// calculating Temp
		181	rTemp0 = _mm_add_ps(r0, r2); iTemp0 = _mm_add_ps(i0, i2);
		182	rTemp2 = _mm_add_ps(r1, r3); iTemp2 = _mm_add_ps(i1, i3);
		183	rTemp1 = _mm_sub_ps(r0, r2); iTemp1 = _mm_sub_ps(i0, i2);
		184	rTemp3 = _mm_sub_ps(r1, r3); iTemp3 = _mm_sub_ps(i1, i3);
		185	rTemp4 = _mm_add_ps(rTemp0, rTemp2); iTemp4 = _mm_add_ps(iTemp0, iTemp2);
		186	rTemp5 = _mm_add_ps(rTemp1, iTemp3); iTemp5 = _mm_sub_ps(iTemp1, rTemp3);
		187	rTemp6 = _mm_sub_ps(rTemp0, rTemp2); iTemp6 = _mm_sub_ps(iTemp0, iTemp2);
		188	rTemp7 = _mm_sub_ps(rTemp1, iTemp3); iTemp7 = _mm_add_ps(iTemp1, rTemp3);
		189
		190	// calculating Result
		191	// vmulComplex(rTemp0, iTemp0, rTemp0, iTemp0, pUnityTableReal[0], pUnityTableImaginary[0]); // first one is always trivial
		192	vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]);
		193	vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride2], pUnityTableImaginary[uStride2]);
		194	vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride3], pUnityTableImaginary[uStride3]);
		195	if (fLast) {
		196	ButterflyDIT4_1(rTemp4, iTemp4);
		197	ButterflyDIT4_1(rTemp5, iTemp5);
		198	ButterflyDIT4_1(rTemp6, iTemp6);
		199	ButterflyDIT4_1(rTemp7, iTemp7);
		200	}
		201
		202
		203	r0 = rTemp4; i0 = iTemp4;
		204	r1 = rTemp5; i1 = iTemp5;
		205	r2 = rTemp6; i2 = iTemp6;
		206	r3 = rTemp7; i3 = iTemp7;
		207	}
		208
		209	//--------------<F-U-N-C-T-I-O-N-S>-----------------------------------------//
		210
		211	////
		212	// DESCRIPTION:
		213	// 4-sample FFT.
		214	//
		215	// PARAMETERS:
		216	// pReal - [inout] real components, must have at least uCount elements
		217	// pImaginary - [inout] imaginary components, must have at least uCount elements
		218	// uCount - [in] number of FFT iterations
		219	//
		220	// RETURN VALUE:
		221	// void
		222	////
		223	__forceinline void FFT4 (__inout_ecount(uCount) XVECTOR* __restrict pReal, __inout_ecount(uCount) XVECTOR* __restrict pImaginary, const UINT32 uCount=1)
		224	{
		225	DSPASSERT(pReal != NULL);
		226	DSPASSERT(pImaginary != NULL);
		227	DSPASSERT((UINT_PTR)pReal % 16 == 0);
		228	DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
		229	DSPASSERT(ISPOWEROF2(uCount));
		230
		231	for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
		232	ButterflyDIT4_1(pReal[uIndex], pImaginary[uIndex]);
		233	}
		234	}
		235
		236
		237
		238	////
		239	// DESCRIPTION:
		240	// 8-sample FFT.
		241	//
		242	// PARAMETERS:
		243	// pReal - [inout] real components, must have at least uCount*2 elements
		244	// pImaginary - [inout] imaginary components, must have at least uCount*2 elements
		245	// uCount - [in] number of FFT iterations
		246	//
		247	// RETURN VALUE:
		248	// void
		249	////
		250	__forceinline void FFT8 (__inout_ecount(uCount2) XVECTOR __restrict pReal, __inout_ecount(uCount2) XVECTOR __restrict pImaginary, const UINT32 uCount=1)
		251	{
		252	DSPASSERT(pReal != NULL);
		253	DSPASSERT(pImaginary != NULL);
		254	DSPASSERT((UINT_PTR)pReal % 16 == 0);
		255	DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
		256	DSPASSERT(ISPOWEROF2(uCount));
		257
		258	static XVECTOR wr1 = { 1.0f, 0.70710677f, 0.0f, -0.70710677f };
		259	static XVECTOR wi1 = { 0.0f, -0.70710677f, -1.0f, -0.70710677f };
		260	static XVECTOR wr2 = { -1.0f, -0.70710677f, 0.0f, 0.70710677f };
		261	static XVECTOR wi2 = { 0.0f, 0.70710677f, 1.0f, 0.70710677f };
		262
		263
		264	for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
		265	XVECTOR* __restrict pR = pReal + uIndex*2;
		266	XVECTOR* __restrict pI = pImaginary + uIndex*2;
		267
		268	XVECTOR oddsR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(3, 1, 3, 1));
		269	XVECTOR evensR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(2, 0, 2, 0));
		270	XVECTOR oddsI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(3, 1, 3, 1));
		271	XVECTOR evensI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(2, 0, 2, 0));
		272	ButterflyDIT4_1(oddsR, oddsI);
		273	ButterflyDIT4_1(evensR, evensI);
		274
		275	XVECTOR r, i;
		276	vmulComplex(r, i, oddsR, oddsI, wr1, wi1);
		277	pR[0] = _mm_add_ps(evensR, r);
		278	pI[0] = _mm_add_ps(evensI, i);
		279
		280	vmulComplex(r, i, oddsR, oddsI, wr2, wi2);
		281	pR[1] = _mm_add_ps(evensR, r);
		282	pI[1] = _mm_add_ps(evensI, i);
		283	}
		284	}
		285
		286
		287
		288	////
		289	// DESCRIPTION:
		290	// 16-sample FFT.
		291	//
		292	// PARAMETERS:
		293	// pReal - [inout] real components, must have at least uCount*4 elements
		294	// pImaginary - [inout] imaginary components, must have at least uCount*4 elements
		295	// uCount - [in] number of FFT iterations
		296	//
		297	// RETURN VALUE:
		298	// void
		299	////
		300	__forceinline void FFT16 (__inout_ecount(uCount4) XVECTOR __restrict pReal, __inout_ecount(uCount4) XVECTOR __restrict pImaginary, const UINT32 uCount=1)
		301	{
		302	DSPASSERT(pReal != NULL);
		303	DSPASSERT(pImaginary != NULL);
		304	DSPASSERT((UINT_PTR)pReal % 16 == 0);
		305	DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
		306	DSPASSERT(ISPOWEROF2(uCount));
		307
		308	XVECTOR aUnityTableReal[4] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.92387950f, 0.70710677f, 0.38268343f, 1.0f, 0.70710677f, -4.3711388e-008f, -0.70710677f, 1.0f, 0.38268343f, -0.70710677f, -0.92387950f };
		309	XVECTOR aUnityTableImaginary[4] = { -0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.38268343f, -0.70710677f, -0.92387950f, -0.0f, -0.70710677f, -1.0f, -0.70710677f, -0.0f, -0.92387950f, -0.70710677f, 0.38268343f };
		310
		311
		312	for (UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
		313	ButterflyDIT4_4(pReal[uIndex*4],
		314	pReal[uIndex*4 + 1],
		315	pReal[uIndex*4 + 2],
		316	pReal[uIndex*4 + 3],
		317	pImaginary[uIndex*4],
		318	pImaginary[uIndex*4 + 1],
		319	pImaginary[uIndex*4 + 2],
		320	pImaginary[uIndex*4 + 3],
		321	aUnityTableReal,
		322	aUnityTableImaginary,
		323	1, TRUE);
		324	}
		325	}
		326
		327
		328
		329	////
		330	// DESCRIPTION:
		331	// 2^N-sample FFT.
		332	//
		333	// REMARKS:
		334	// For FFTs length 16 and below, call FFT16(), FFT8(), or FFT4().
		335	//
		336	// PARAMETERS:
		337	// pReal - [inout] real components, must have at least (uLength*uCount)/4 elements
		338	// pImaginary - [inout] imaginary components, must have at least (uLength*uCount)/4 elements
		339	// pUnityTable - [in] unity table, must have at least uLength*uCount elements, see FFTInitializeUnityTable()
		340	// uLength - [in] FFT length in samples, must be a power of 2 > 16
		341	// uCount - [in] number of FFT iterations
		342	//
		343	// RETURN VALUE:
		344	// void
		345	////
		346	inline void FFT (__inout_ecount((uLengthuCount)/4) XVECTOR __restrict pReal, __inout_ecount((uLengthuCount)/4) XVECTOR __restrict pImaginary, __in_ecount(uLengthuCount) const XVECTOR __restrict pUnityTable, const UINT32 uLength, const UINT32 uCount=1)
		347	{
		348	DSPASSERT(pReal != NULL);
		349	DSPASSERT(pImaginary != NULL);
		350	DSPASSERT(pUnityTable != NULL);
		351	DSPASSERT((UINT_PTR)pReal % 16 == 0);
		352	DSPASSERT((UINT_PTR)pImaginary % 16 == 0);
		353	DSPASSERT((UINT_PTR)pUnityTable % 16 == 0);
		354	DSPASSERT(uLength > 16);
		355	DSPASSERT(ISPOWEROF2(uLength));
		356	DSPASSERT(ISPOWEROF2(uCount));
		357
		358	const XVECTOR* __restrict pUnityTableReal = pUnityTable;
		359	const XVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2);
		360	const UINT32 uTotal = uCount * uLength;
		361	const UINT32 uTotal_vectors = uTotal >> 2;
		362	const UINT32 uStage_vectors = uLength >> 2;
		363	const UINT32 uStride = uStage_vectors >> 2; // stride between butterfly elements
		364	const UINT32 uSkip = uStage_vectors - uStride;
		365
		366
		367	for (UINT32 uIndex=0; uIndex<(uTotal_vectors>>2); ++uIndex) {
		368	UINT32 n = (uIndex/uStride) * (uStride + uSkip) + (uIndex % uStride);
		369	ButterflyDIT4_4(pReal[n],
		370	pReal[n + uStride],
		371	pReal[n + uStride*2],
		372	pReal[n + uStride*3],
		373	pImaginary[n ],
		374	pImaginary[n + uStride],
		375	pImaginary[n + uStride*2],
		376	pImaginary[n + uStride*3],
		377	pUnityTableReal + n % uStage_vectors,
		378	pUnityTableImaginary + n % uStage_vectors,
		379	uStride, FALSE);
		380	}
		381
		382
		383	if (uLength > 16*4) {
		384	FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4);
		385	} else if (uLength == 16*4) {
		386	FFT16(pReal, pImaginary, uCount*4);
		387	} else if (uLength == 8*4) {
		388	FFT8(pReal, pImaginary, uCount*4);
		389	} else if (uLength == 4*4) {
		390	FFT4(pReal, pImaginary, uCount*4);
		391	}
		392	}
		393
		394	//--------------------------------------------------------------------------//
		395	////
		396	// DESCRIPTION:
		397	// Initializes unity roots lookup table used by FFT functions.
		398	// Once initialized, the table need not be initialized again unless a
		399	// different FFT length is desired.
		400	//
		401	// REMARKS:
		402	// The unity tables of FFT length 16 and below are hard coded into the
		403	// respective FFT functions and so need not be initialized.
		404	//
		405	// PARAMETERS:
		406	// pUnityTable - [out] unity table, receives unity roots lookup table, must have at least uLength elements
		407	// uLength - [in] FFT length in samples, must be a power of 2 > 16
		408	//
		409	// RETURN VALUE:
		410	// void
		411	////
		412	inline void FFTInitializeUnityTable (__out_ecount(uLength) XVECTOR* __restrict pUnityTable, UINT32 uLength)
		413	{
		414	DSPASSERT(pUnityTable != NULL);
		415	DSPASSERT(uLength > 16);
		416	DSPASSERT(ISPOWEROF2(uLength));
		417
		418	FLOAT32* __restrict pfUnityTable = (FLOAT32* __restrict)pUnityTable;
		419
		420
		421	// initialize unity table for recursive FFT lengths: uLength, uLength/4, uLength/16... > 16
		422	do {
		423	FLOAT32 flStep = 6.283185307f / uLength; // 2PI / FFT length
		424	uLength >>= 2;
		425
		426	// pUnityTable[0 to uLength*4-1] contains real components for current FFT length
		427	// pUnityTable[uLength4 to uLength8-1] contains imaginary components for current FFT length
		428	for (UINT32 i=0; i<4; ++i) {
		429	for (UINT32 j=0; j<uLength; ++j) {
		430	UINT32 uIndex = (i*uLength) + j;
		431	pfUnityTable[uIndex] = cosf(FLOAT32(i)FLOAT32(j)flStep); // real component
		432	pfUnityTable[uIndex + uLength4] = -sinf(FLOAT32(i)FLOAT32(j)*flStep); // imaginary component
		433	}
		434	}
		435	pfUnityTable += uLength*8;
		436	} while (uLength > 16);
		437	}
		438
		439
		440	////
		441	// DESCRIPTION:
		442	// The FFT functions generate output in bit reversed order.
		443	// Use this function to re-arrange them into order of increasing frequency.
		444	//
		445	// PARAMETERS:
		446	// pOutput - [out] output buffer, receives samples in order of increasing frequency, must have at least (1<<uLog2Length) elements
		447	// pInput - [in] input buffer, samples in bit reversed order as generated by FFT functions, must have at least (1<<uLog2Length) elements
		448	// uLog2Length - [in] LOG (base 2) of FFT length in samples, must be > 0
		449	//
		450	// RETURN VALUE:
		451	// void
		452	////
		453	inline void FFTUnswizzle (__out_ecount(1<<uLog2Length) FLOAT32* __restrict pOutput, __in_ecount(1<<uLog2Length) const FLOAT32* __restrict pInput, const UINT32 uLog2Length)
		454	{
		455	DSPASSERT(pOutput != NULL);
		456	DSPASSERT(pInput != NULL);
		457	DSPASSERT(uLog2Length > 0);
		458
		459	const UINT32 uLength = UINT32(1 << uLog2Length);
		460
		461
		462	if ((uLog2Length & 0x1) == 0) {
		463	// even powers of two
		464	for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
		465	UINT32 n = uIndex;
		466	n = ( (n & 0xcccccccc) >> 2 ) \| ( (n & 0x33333333) << 2 );
		467	n = ( (n & 0xf0f0f0f0) >> 4 ) \| ( (n & 0x0f0f0f0f) << 4 );
		468	n = ( (n & 0xff00ff00) >> 8 ) \| ( (n & 0x00ff00ff) << 8 );
		469	n = ( (n & 0xffff0000) >> 16 ) \| ( (n & 0x0000ffff) << 16 );
		470	n >>= (32 - uLog2Length);
		471	pOutput[n] = pInput[uIndex];
		472	}
		473	} else {
		474	// odd powers of two
		475	for (UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
		476	UINT32 n = (uIndex>>3);
		477	n = ( (n & 0xcccccccc) >> 2 ) \| ( (n & 0x33333333) << 2 );
		478	n = ( (n & 0xf0f0f0f0) >> 4 ) \| ( (n & 0x0f0f0f0f) << 4 );
		479	n = ( (n & 0xff00ff00) >> 8 ) \| ( (n & 0x00ff00ff) << 8 );
		480	n = ( (n & 0xffff0000) >> 16 ) \| ( (n & 0x0000ffff) << 16 );
		481	n >>= (32 - (uLog2Length-3));
		482	n \|= ((uIndex & 0x7) << (uLog2Length - 3));
		483	pOutput[n] = pInput[uIndex];
		484	}
		485	}
		486	}
		487
		488
		489	////
		490	// DESCRIPTION:
		491	// Convert complex components to polar form.
		492	//
		493	// PARAMETERS:
		494	// pOutput - [out] output buffer, receives samples in polar form, must have at least uLength/4 elements
		495	// pInputReal - [in] input buffer (real components), must have at least uLength/4 elements
		496	// pInputImaginary - [in] input buffer (imaginary components), must have at least uLength/4 elements
		497	// uLength - [in] FFT length in samples, must be a power of 2 >= 4
		498	//
		499	// RETURN VALUE:
		500	// void
		501	////
		502	inline void FFTPolar (__out_ecount(uLength/4) XVECTOR* __restrict pOutput, __in_ecount(uLength/4) const XVECTOR* __restrict pInputReal, __in_ecount(uLength/4) const XVECTOR* __restrict pInputImaginary, const UINT32 uLength)
		503	{
		504	DSPASSERT(pOutput != NULL);
		505	DSPASSERT(pInputReal != NULL);
		506	DSPASSERT(pInputImaginary != NULL);
		507	DSPASSERT(uLength >= 4);
		508	DSPASSERT(ISPOWEROF2(uLength));
		509
		510	FLOAT32 flOneOverLength = 1.0f / uLength;
		511
		512
		513	// result = sqrtf((real/uLength)^2 + (imaginary/uLength)^2) * 2
		514	XVECTOR vOneOverLength = _mm_set_ps1(flOneOverLength);
		515
		516	for (UINT32 uIndex=0; uIndex<(uLength>>2); ++uIndex) {
		517	XVECTOR vReal = _mm_mul_ps(pInputReal[uIndex], vOneOverLength);
		518	XVECTOR vImaginary = _mm_mul_ps(pInputImaginary[uIndex], vOneOverLength);
		519	XVECTOR vRR = _mm_mul_ps(vReal, vReal);
		520	XVECTOR vII = _mm_mul_ps(vImaginary, vImaginary);
		521	XVECTOR vRRplusII = _mm_add_ps(vRR, vII);
		522	XVECTOR vTotal = _mm_sqrt_ps(vRRplusII);
		523	pOutput[uIndex] = _mm_add_ps(vTotal, vTotal);
		524	}
		525	}
		526
		527
		528	#pragma warning(pop)
		529	}; // namespace XDSP
		530	//---------------------------------<-EOF->----------------------------------//
		531

Subversion Repositories Games.Chess Giants

Games.Chess Giants/DirectX9/Include/XDSP.h – Rev 1