WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – /llvm-build/x86_64/lib/clang/16/include/xmmintrin.h

Rev	Author	Line No.	Line
14	pmbaty	1	/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
		2	*
		3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		4	* See https://llvm.org/LICENSE.txt for license information.
		5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		6	*
		7	*===-----------------------------------------------------------------------===
		8	*/
		9
		10	#ifndef __XMMINTRIN_H
		11	#define __XMMINTRIN_H
		12
		13	#if !defined(__i386__) && !defined(__x86_64__)
		14	#error "This header is only meant to be used on x86 and x64 architecture"
		15	#endif
		16
		17	#include <mmintrin.h>
		18
		19	typedef int __v4si __attribute__((__vector_size__(16)));
		20	typedef float __v4sf __attribute__((__vector_size__(16)));
		21	typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16)));
		22
		23	typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1)));
		24
		25	/* Unsigned types */
		26	typedef unsigned int __v4su __attribute__((__vector_size__(16)));
		27
		28	/* This header should only be included in a hosted environment as it depends on
		29	* a standard library to provide allocation routines. */
		30	#if __STDC_HOSTED__
		31	#include <mm_malloc.h>
		32	#endif
		33
		34	/* Define the default attributes for the functions in this file. */
		35	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128)))
		36	#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64)))
		37
		38	/// Adds the 32-bit float values in the low-order bits of the operands.
		39	///
		40	/// \headerfile <x86intrin.h>
		41	///
		42	/// This intrinsic corresponds to the <c> VADDSS / ADDSS </c> instructions.
		43	///
		44	/// \param __a
		45	/// A 128-bit vector of [4 x float] containing one of the source operands.
		46	/// The lower 32 bits of this operand are used in the calculation.
		47	/// \param __b
		48	/// A 128-bit vector of [4 x float] containing one of the source operands.
		49	/// The lower 32 bits of this operand are used in the calculation.
		50	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum
		51	/// of the lower 32 bits of both operands. The upper 96 bits are copied from
		52	/// the upper 96 bits of the first source operand.
		53	static __inline__ __m128 __DEFAULT_FN_ATTRS
		54	_mm_add_ss(__m128 __a, __m128 __b)
		55	{
		56	__a[0] += __b[0];
		57	return __a;
		58	}
		59
		60	/// Adds two 128-bit vectors of [4 x float], and returns the results of
		61	/// the addition.
		62	///
		63	/// \headerfile <x86intrin.h>
		64	///
		65	/// This intrinsic corresponds to the <c> VADDPS / ADDPS </c> instructions.
		66	///
		67	/// \param __a
		68	/// A 128-bit vector of [4 x float] containing one of the source operands.
		69	/// \param __b
		70	/// A 128-bit vector of [4 x float] containing one of the source operands.
		71	/// \returns A 128-bit vector of [4 x float] containing the sums of both
		72	/// operands.
		73	static __inline__ __m128 __DEFAULT_FN_ATTRS
		74	_mm_add_ps(__m128 __a, __m128 __b)
		75	{
		76	return (__m128)((__v4sf)__a + (__v4sf)__b);
		77	}
		78
		79	/// Subtracts the 32-bit float value in the low-order bits of the second
		80	/// operand from the corresponding value in the first operand.
		81	///
		82	/// \headerfile <x86intrin.h>
		83	///
		84	/// This intrinsic corresponds to the <c> VSUBSS / SUBSS </c> instructions.
		85	///
		86	/// \param __a
		87	/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits
		88	/// of this operand are used in the calculation.
		89	/// \param __b
		90	/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32
		91	/// bits of this operand are used in the calculation.
		92	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
		93	/// difference of the lower 32 bits of both operands. The upper 96 bits are
		94	/// copied from the upper 96 bits of the first source operand.
		95	static __inline__ __m128 __DEFAULT_FN_ATTRS
		96	_mm_sub_ss(__m128 __a, __m128 __b)
		97	{
		98	__a[0] -= __b[0];
		99	return __a;
		100	}
		101
		102	/// Subtracts each of the values of the second operand from the first
		103	/// operand, both of which are 128-bit vectors of [4 x float] and returns
		104	/// the results of the subtraction.
		105	///
		106	/// \headerfile <x86intrin.h>
		107	///
		108	/// This intrinsic corresponds to the <c> VSUBPS / SUBPS </c> instructions.
		109	///
		110	/// \param __a
		111	/// A 128-bit vector of [4 x float] containing the minuend.
		112	/// \param __b
		113	/// A 128-bit vector of [4 x float] containing the subtrahend.
		114	/// \returns A 128-bit vector of [4 x float] containing the differences between
		115	/// both operands.
		116	static __inline__ __m128 __DEFAULT_FN_ATTRS
		117	_mm_sub_ps(__m128 __a, __m128 __b)
		118	{
		119	return (__m128)((__v4sf)__a - (__v4sf)__b);
		120	}
		121
		122	/// Multiplies two 32-bit float values in the low-order bits of the
		123	/// operands.
		124	///
		125	/// \headerfile <x86intrin.h>
		126	///
		127	/// This intrinsic corresponds to the <c> VMULSS / MULSS </c> instructions.
		128	///
		129	/// \param __a
		130	/// A 128-bit vector of [4 x float] containing one of the source operands.
		131	/// The lower 32 bits of this operand are used in the calculation.
		132	/// \param __b
		133	/// A 128-bit vector of [4 x float] containing one of the source operands.
		134	/// The lower 32 bits of this operand are used in the calculation.
		135	/// \returns A 128-bit vector of [4 x float] containing the product of the lower
		136	/// 32 bits of both operands. The upper 96 bits are copied from the upper 96
		137	/// bits of the first source operand.
		138	static __inline__ __m128 __DEFAULT_FN_ATTRS
		139	_mm_mul_ss(__m128 __a, __m128 __b)
		140	{
		141	__a[0] *= __b[0];
		142	return __a;
		143	}
		144
		145	/// Multiplies two 128-bit vectors of [4 x float] and returns the
		146	/// results of the multiplication.
		147	///
		148	/// \headerfile <x86intrin.h>
		149	///
		150	/// This intrinsic corresponds to the <c> VMULPS / MULPS </c> instructions.
		151	///
		152	/// \param __a
		153	/// A 128-bit vector of [4 x float] containing one of the source operands.
		154	/// \param __b
		155	/// A 128-bit vector of [4 x float] containing one of the source operands.
		156	/// \returns A 128-bit vector of [4 x float] containing the products of both
		157	/// operands.
		158	static __inline__ __m128 __DEFAULT_FN_ATTRS
		159	_mm_mul_ps(__m128 __a, __m128 __b)
		160	{
		161	return (__m128)((__v4sf)__a * (__v4sf)__b);
		162	}
		163
		164	/// Divides the value in the low-order 32 bits of the first operand by
		165	/// the corresponding value in the second operand.
		166	///
		167	/// \headerfile <x86intrin.h>
		168	///
		169	/// This intrinsic corresponds to the <c> VDIVSS / DIVSS </c> instructions.
		170	///
		171	/// \param __a
		172	/// A 128-bit vector of [4 x float] containing the dividend. The lower 32
		173	/// bits of this operand are used in the calculation.
		174	/// \param __b
		175	/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits
		176	/// of this operand are used in the calculation.
		177	/// \returns A 128-bit vector of [4 x float] containing the quotients of the
		178	/// lower 32 bits of both operands. The upper 96 bits are copied from the
		179	/// upper 96 bits of the first source operand.
		180	static __inline__ __m128 __DEFAULT_FN_ATTRS
		181	_mm_div_ss(__m128 __a, __m128 __b)
		182	{
		183	__a[0] /= __b[0];
		184	return __a;
		185	}
		186
		187	/// Divides two 128-bit vectors of [4 x float].
		188	///
		189	/// \headerfile <x86intrin.h>
		190	///
		191	/// This intrinsic corresponds to the <c> VDIVPS / DIVPS </c> instructions.
		192	///
		193	/// \param __a
		194	/// A 128-bit vector of [4 x float] containing the dividend.
		195	/// \param __b
		196	/// A 128-bit vector of [4 x float] containing the divisor.
		197	/// \returns A 128-bit vector of [4 x float] containing the quotients of both
		198	/// operands.
		199	static __inline__ __m128 __DEFAULT_FN_ATTRS
		200	_mm_div_ps(__m128 __a, __m128 __b)
		201	{
		202	return (__m128)((__v4sf)__a / (__v4sf)__b);
		203	}
		204
		205	/// Calculates the square root of the value stored in the low-order bits
		206	/// of a 128-bit vector of [4 x float].
		207	///
		208	/// \headerfile <x86intrin.h>
		209	///
		210	/// This intrinsic corresponds to the <c> VSQRTSS / SQRTSS </c> instructions.
		211	///
		212	/// \param __a
		213	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		214	/// used in the calculation.
		215	/// \returns A 128-bit vector of [4 x float] containing the square root of the
		216	/// value in the low-order bits of the operand.
		217	static __inline__ __m128 __DEFAULT_FN_ATTRS
		218	_mm_sqrt_ss(__m128 __a)
		219	{
		220	return (__m128)__builtin_ia32_sqrtss((__v4sf)__a);
		221	}
		222
		223	/// Calculates the square roots of the values stored in a 128-bit vector
		224	/// of [4 x float].
		225	///
		226	/// \headerfile <x86intrin.h>
		227	///
		228	/// This intrinsic corresponds to the <c> VSQRTPS / SQRTPS </c> instructions.
		229	///
		230	/// \param __a
		231	/// A 128-bit vector of [4 x float].
		232	/// \returns A 128-bit vector of [4 x float] containing the square roots of the
		233	/// values in the operand.
		234	static __inline__ __m128 __DEFAULT_FN_ATTRS
		235	_mm_sqrt_ps(__m128 __a)
		236	{
		237	return __builtin_ia32_sqrtps((__v4sf)__a);
		238	}
		239
		240	/// Calculates the approximate reciprocal of the value stored in the
		241	/// low-order bits of a 128-bit vector of [4 x float].
		242	///
		243	/// \headerfile <x86intrin.h>
		244	///
		245	/// This intrinsic corresponds to the <c> VRCPSS / RCPSS </c> instructions.
		246	///
		247	/// \param __a
		248	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		249	/// used in the calculation.
		250	/// \returns A 128-bit vector of [4 x float] containing the approximate
		251	/// reciprocal of the value in the low-order bits of the operand.
		252	static __inline__ __m128 __DEFAULT_FN_ATTRS
		253	_mm_rcp_ss(__m128 __a)
		254	{
		255	return (__m128)__builtin_ia32_rcpss((__v4sf)__a);
		256	}
		257
		258	/// Calculates the approximate reciprocals of the values stored in a
		259	/// 128-bit vector of [4 x float].
		260	///
		261	/// \headerfile <x86intrin.h>
		262	///
		263	/// This intrinsic corresponds to the <c> VRCPPS / RCPPS </c> instructions.
		264	///
		265	/// \param __a
		266	/// A 128-bit vector of [4 x float].
		267	/// \returns A 128-bit vector of [4 x float] containing the approximate
		268	/// reciprocals of the values in the operand.
		269	static __inline__ __m128 __DEFAULT_FN_ATTRS
		270	_mm_rcp_ps(__m128 __a)
		271	{
		272	return (__m128)__builtin_ia32_rcpps((__v4sf)__a);
		273	}
		274
		275	/// Calculates the approximate reciprocal of the square root of the value
		276	/// stored in the low-order bits of a 128-bit vector of [4 x float].
		277	///
		278	/// \headerfile <x86intrin.h>
		279	///
		280	/// This intrinsic corresponds to the <c> VRSQRTSS / RSQRTSS </c> instructions.
		281	///
		282	/// \param __a
		283	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		284	/// used in the calculation.
		285	/// \returns A 128-bit vector of [4 x float] containing the approximate
		286	/// reciprocal of the square root of the value in the low-order bits of the
		287	/// operand.
		288	static __inline__ __m128 __DEFAULT_FN_ATTRS
		289	_mm_rsqrt_ss(__m128 __a)
		290	{
		291	return __builtin_ia32_rsqrtss((__v4sf)__a);
		292	}
		293
		294	/// Calculates the approximate reciprocals of the square roots of the
		295	/// values stored in a 128-bit vector of [4 x float].
		296	///
		297	/// \headerfile <x86intrin.h>
		298	///
		299	/// This intrinsic corresponds to the <c> VRSQRTPS / RSQRTPS </c> instructions.
		300	///
		301	/// \param __a
		302	/// A 128-bit vector of [4 x float].
		303	/// \returns A 128-bit vector of [4 x float] containing the approximate
		304	/// reciprocals of the square roots of the values in the operand.
		305	static __inline__ __m128 __DEFAULT_FN_ATTRS
		306	_mm_rsqrt_ps(__m128 __a)
		307	{
		308	return __builtin_ia32_rsqrtps((__v4sf)__a);
		309	}
		310
		311	/// Compares two 32-bit float values in the low-order bits of both
		312	/// operands and returns the lesser value in the low-order bits of the
		313	/// vector of [4 x float].
		314	///
		315	/// \headerfile <x86intrin.h>
		316	///
		317	/// This intrinsic corresponds to the <c> VMINSS / MINSS </c> instructions.
		318	///
		319	/// \param __a
		320	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		321	/// 32 bits of this operand are used in the comparison.
		322	/// \param __b
		323	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		324	/// 32 bits of this operand are used in the comparison.
		325	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
		326	/// minimum value between both operands. The upper 96 bits are copied from
		327	/// the upper 96 bits of the first source operand.
		328	static __inline__ __m128 __DEFAULT_FN_ATTRS
		329	_mm_min_ss(__m128 __a, __m128 __b)
		330	{
		331	return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b);
		332	}
		333
		334	/// Compares two 128-bit vectors of [4 x float] and returns the lesser
		335	/// of each pair of values.
		336	///
		337	/// \headerfile <x86intrin.h>
		338	///
		339	/// This intrinsic corresponds to the <c> VMINPS / MINPS </c> instructions.
		340	///
		341	/// \param __a
		342	/// A 128-bit vector of [4 x float] containing one of the operands.
		343	/// \param __b
		344	/// A 128-bit vector of [4 x float] containing one of the operands.
		345	/// \returns A 128-bit vector of [4 x float] containing the minimum values
		346	/// between both operands.
		347	static __inline__ __m128 __DEFAULT_FN_ATTRS
		348	_mm_min_ps(__m128 __a, __m128 __b)
		349	{
		350	return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b);
		351	}
		352
		353	/// Compares two 32-bit float values in the low-order bits of both
		354	/// operands and returns the greater value in the low-order bits of a 128-bit
		355	/// vector of [4 x float].
		356	///
		357	/// \headerfile <x86intrin.h>
		358	///
		359	/// This intrinsic corresponds to the <c> VMAXSS / MAXSS </c> instructions.
		360	///
		361	/// \param __a
		362	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		363	/// 32 bits of this operand are used in the comparison.
		364	/// \param __b
		365	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		366	/// 32 bits of this operand are used in the comparison.
		367	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
		368	/// maximum value between both operands. The upper 96 bits are copied from
		369	/// the upper 96 bits of the first source operand.
		370	static __inline__ __m128 __DEFAULT_FN_ATTRS
		371	_mm_max_ss(__m128 __a, __m128 __b)
		372	{
		373	return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b);
		374	}
		375
		376	/// Compares two 128-bit vectors of [4 x float] and returns the greater
		377	/// of each pair of values.
		378	///
		379	/// \headerfile <x86intrin.h>
		380	///
		381	/// This intrinsic corresponds to the <c> VMAXPS / MAXPS </c> instructions.
		382	///
		383	/// \param __a
		384	/// A 128-bit vector of [4 x float] containing one of the operands.
		385	/// \param __b
		386	/// A 128-bit vector of [4 x float] containing one of the operands.
		387	/// \returns A 128-bit vector of [4 x float] containing the maximum values
		388	/// between both operands.
		389	static __inline__ __m128 __DEFAULT_FN_ATTRS
		390	_mm_max_ps(__m128 __a, __m128 __b)
		391	{
		392	return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b);
		393	}
		394
		395	/// Performs a bitwise AND of two 128-bit vectors of [4 x float].
		396	///
		397	/// \headerfile <x86intrin.h>
		398	///
		399	/// This intrinsic corresponds to the <c> VANDPS / ANDPS </c> instructions.
		400	///
		401	/// \param __a
		402	/// A 128-bit vector containing one of the source operands.
		403	/// \param __b
		404	/// A 128-bit vector containing one of the source operands.
		405	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
		406	/// values between both operands.
		407	static __inline__ __m128 __DEFAULT_FN_ATTRS
		408	_mm_and_ps(__m128 __a, __m128 __b)
		409	{
		410	return (__m128)((__v4su)__a & (__v4su)__b);
		411	}
		412
		413	/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using
		414	/// the one's complement of the values contained in the first source
		415	/// operand.
		416	///
		417	/// \headerfile <x86intrin.h>
		418	///
		419	/// This intrinsic corresponds to the <c> VANDNPS / ANDNPS </c> instructions.
		420	///
		421	/// \param __a
		422	/// A 128-bit vector of [4 x float] containing the first source operand. The
		423	/// one's complement of this value is used in the bitwise AND.
		424	/// \param __b
		425	/// A 128-bit vector of [4 x float] containing the second source operand.
		426	/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the
		427	/// one's complement of the first operand and the values in the second
		428	/// operand.
		429	static __inline__ __m128 __DEFAULT_FN_ATTRS
		430	_mm_andnot_ps(__m128 __a, __m128 __b)
		431	{
		432	return (__m128)(~(__v4su)__a & (__v4su)__b);
		433	}
		434
		435	/// Performs a bitwise OR of two 128-bit vectors of [4 x float].
		436	///
		437	/// \headerfile <x86intrin.h>
		438	///
		439	/// This intrinsic corresponds to the <c> VORPS / ORPS </c> instructions.
		440	///
		441	/// \param __a
		442	/// A 128-bit vector of [4 x float] containing one of the source operands.
		443	/// \param __b
		444	/// A 128-bit vector of [4 x float] containing one of the source operands.
		445	/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the
		446	/// values between both operands.
		447	static __inline__ __m128 __DEFAULT_FN_ATTRS
		448	_mm_or_ps(__m128 __a, __m128 __b)
		449	{
		450	return (__m128)((__v4su)__a \| (__v4su)__b);
		451	}
		452
		453	/// Performs a bitwise exclusive OR of two 128-bit vectors of
		454	/// [4 x float].
		455	///
		456	/// \headerfile <x86intrin.h>
		457	///
		458	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instructions.
		459	///
		460	/// \param __a
		461	/// A 128-bit vector of [4 x float] containing one of the source operands.
		462	/// \param __b
		463	/// A 128-bit vector of [4 x float] containing one of the source operands.
		464	/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR
		465	/// of the values between both operands.
		466	static __inline__ __m128 __DEFAULT_FN_ATTRS
		467	_mm_xor_ps(__m128 __a, __m128 __b)
		468	{
		469	return (__m128)((__v4su)__a ^ (__v4su)__b);
		470	}
		471
		472	/// Compares two 32-bit float values in the low-order bits of both
		473	/// operands for equality and returns the result of the comparison in the
		474	/// low-order bits of a vector [4 x float].
		475	///
		476	/// \headerfile <x86intrin.h>
		477	///
		478	/// This intrinsic corresponds to the <c> VCMPEQSS / CMPEQSS </c> instructions.
		479	///
		480	/// \param __a
		481	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		482	/// 32 bits of this operand are used in the comparison.
		483	/// \param __b
		484	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		485	/// 32 bits of this operand are used in the comparison.
		486	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		487	/// in the low-order bits.
		488	static __inline__ __m128 __DEFAULT_FN_ATTRS
		489	_mm_cmpeq_ss(__m128 __a, __m128 __b)
		490	{
		491	return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b);
		492	}
		493
		494	/// Compares each of the corresponding 32-bit float values of the
		495	/// 128-bit vectors of [4 x float] for equality.
		496	///
		497	/// \headerfile <x86intrin.h>
		498	///
		499	/// This intrinsic corresponds to the <c> VCMPEQPS / CMPEQPS </c> instructions.
		500	///
		501	/// \param __a
		502	/// A 128-bit vector of [4 x float].
		503	/// \param __b
		504	/// A 128-bit vector of [4 x float].
		505	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		506	static __inline__ __m128 __DEFAULT_FN_ATTRS
		507	_mm_cmpeq_ps(__m128 __a, __m128 __b)
		508	{
		509	return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b);
		510	}
		511
		512	/// Compares two 32-bit float values in the low-order bits of both
		513	/// operands to determine if the value in the first operand is less than the
		514	/// corresponding value in the second operand and returns the result of the
		515	/// comparison in the low-order bits of a vector of [4 x float].
		516	///
		517	/// \headerfile <x86intrin.h>
		518	///
		519	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
		520	///
		521	/// \param __a
		522	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		523	/// 32 bits of this operand are used in the comparison.
		524	/// \param __b
		525	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		526	/// 32 bits of this operand are used in the comparison.
		527	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		528	/// in the low-order bits.
		529	static __inline__ __m128 __DEFAULT_FN_ATTRS
		530	_mm_cmplt_ss(__m128 __a, __m128 __b)
		531	{
		532	return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b);
		533	}
		534
		535	/// Compares each of the corresponding 32-bit float values of the
		536	/// 128-bit vectors of [4 x float] to determine if the values in the first
		537	/// operand are less than those in the second operand.
		538	///
		539	/// \headerfile <x86intrin.h>
		540	///
		541	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
		542	///
		543	/// \param __a
		544	/// A 128-bit vector of [4 x float].
		545	/// \param __b
		546	/// A 128-bit vector of [4 x float].
		547	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		548	static __inline__ __m128 __DEFAULT_FN_ATTRS
		549	_mm_cmplt_ps(__m128 __a, __m128 __b)
		550	{
		551	return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b);
		552	}
		553
		554	/// Compares two 32-bit float values in the low-order bits of both
		555	/// operands to determine if the value in the first operand is less than or
		556	/// equal to the corresponding value in the second operand and returns the
		557	/// result of the comparison in the low-order bits of a vector of
		558	/// [4 x float].
		559	///
		560	/// \headerfile <x86intrin.h>
		561	///
		562	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
		563	///
		564	/// \param __a
		565	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		566	/// 32 bits of this operand are used in the comparison.
		567	/// \param __b
		568	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		569	/// 32 bits of this operand are used in the comparison.
		570	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		571	/// in the low-order bits.
		572	static __inline__ __m128 __DEFAULT_FN_ATTRS
		573	_mm_cmple_ss(__m128 __a, __m128 __b)
		574	{
		575	return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b);
		576	}
		577
		578	/// Compares each of the corresponding 32-bit float values of the
		579	/// 128-bit vectors of [4 x float] to determine if the values in the first
		580	/// operand are less than or equal to those in the second operand.
		581	///
		582	/// \headerfile <x86intrin.h>
		583	///
		584	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
		585	///
		586	/// \param __a
		587	/// A 128-bit vector of [4 x float].
		588	/// \param __b
		589	/// A 128-bit vector of [4 x float].
		590	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		591	static __inline__ __m128 __DEFAULT_FN_ATTRS
		592	_mm_cmple_ps(__m128 __a, __m128 __b)
		593	{
		594	return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b);
		595	}
		596
		597	/// Compares two 32-bit float values in the low-order bits of both
		598	/// operands to determine if the value in the first operand is greater than
		599	/// the corresponding value in the second operand and returns the result of
		600	/// the comparison in the low-order bits of a vector of [4 x float].
		601	///
		602	/// \headerfile <x86intrin.h>
		603	///
		604	/// This intrinsic corresponds to the <c> VCMPLTSS / CMPLTSS </c> instructions.
		605	///
		606	/// \param __a
		607	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		608	/// 32 bits of this operand are used in the comparison.
		609	/// \param __b
		610	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		611	/// 32 bits of this operand are used in the comparison.
		612	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		613	/// in the low-order bits.
		614	static __inline__ __m128 __DEFAULT_FN_ATTRS
		615	_mm_cmpgt_ss(__m128 __a, __m128 __b)
		616	{
		617	return (__m128)__builtin_shufflevector((__v4sf)__a,
		618	(__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a),
		619	4, 1, 2, 3);
		620	}
		621
		622	/// Compares each of the corresponding 32-bit float values of the
		623	/// 128-bit vectors of [4 x float] to determine if the values in the first
		624	/// operand are greater than those in the second operand.
		625	///
		626	/// \headerfile <x86intrin.h>
		627	///
		628	/// This intrinsic corresponds to the <c> VCMPLTPS / CMPLTPS </c> instructions.
		629	///
		630	/// \param __a
		631	/// A 128-bit vector of [4 x float].
		632	/// \param __b
		633	/// A 128-bit vector of [4 x float].
		634	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		635	static __inline__ __m128 __DEFAULT_FN_ATTRS
		636	_mm_cmpgt_ps(__m128 __a, __m128 __b)
		637	{
		638	return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a);
		639	}
		640
		641	/// Compares two 32-bit float values in the low-order bits of both
		642	/// operands to determine if the value in the first operand is greater than
		643	/// or equal to the corresponding value in the second operand and returns
		644	/// the result of the comparison in the low-order bits of a vector of
		645	/// [4 x float].
		646	///
		647	/// \headerfile <x86intrin.h>
		648	///
		649	/// This intrinsic corresponds to the <c> VCMPLESS / CMPLESS </c> instructions.
		650	///
		651	/// \param __a
		652	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		653	/// 32 bits of this operand are used in the comparison.
		654	/// \param __b
		655	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		656	/// 32 bits of this operand are used in the comparison.
		657	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		658	/// in the low-order bits.
		659	static __inline__ __m128 __DEFAULT_FN_ATTRS
		660	_mm_cmpge_ss(__m128 __a, __m128 __b)
		661	{
		662	return (__m128)__builtin_shufflevector((__v4sf)__a,
		663	(__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a),
		664	4, 1, 2, 3);
		665	}
		666
		667	/// Compares each of the corresponding 32-bit float values of the
		668	/// 128-bit vectors of [4 x float] to determine if the values in the first
		669	/// operand are greater than or equal to those in the second operand.
		670	///
		671	/// \headerfile <x86intrin.h>
		672	///
		673	/// This intrinsic corresponds to the <c> VCMPLEPS / CMPLEPS </c> instructions.
		674	///
		675	/// \param __a
		676	/// A 128-bit vector of [4 x float].
		677	/// \param __b
		678	/// A 128-bit vector of [4 x float].
		679	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		680	static __inline__ __m128 __DEFAULT_FN_ATTRS
		681	_mm_cmpge_ps(__m128 __a, __m128 __b)
		682	{
		683	return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a);
		684	}
		685
		686	/// Compares two 32-bit float values in the low-order bits of both
		687	/// operands for inequality and returns the result of the comparison in the
		688	/// low-order bits of a vector of [4 x float].
		689	///
		690	/// \headerfile <x86intrin.h>
		691	///
		692	/// This intrinsic corresponds to the <c> VCMPNEQSS / CMPNEQSS </c>
		693	/// instructions.
		694	///
		695	/// \param __a
		696	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		697	/// 32 bits of this operand are used in the comparison.
		698	/// \param __b
		699	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		700	/// 32 bits of this operand are used in the comparison.
		701	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		702	/// in the low-order bits.
		703	static __inline__ __m128 __DEFAULT_FN_ATTRS
		704	_mm_cmpneq_ss(__m128 __a, __m128 __b)
		705	{
		706	return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b);
		707	}
		708
		709	/// Compares each of the corresponding 32-bit float values of the
		710	/// 128-bit vectors of [4 x float] for inequality.
		711	///
		712	/// \headerfile <x86intrin.h>
		713	///
		714	/// This intrinsic corresponds to the <c> VCMPNEQPS / CMPNEQPS </c>
		715	/// instructions.
		716	///
		717	/// \param __a
		718	/// A 128-bit vector of [4 x float].
		719	/// \param __b
		720	/// A 128-bit vector of [4 x float].
		721	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		722	static __inline__ __m128 __DEFAULT_FN_ATTRS
		723	_mm_cmpneq_ps(__m128 __a, __m128 __b)
		724	{
		725	return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b);
		726	}
		727
		728	/// Compares two 32-bit float values in the low-order bits of both
		729	/// operands to determine if the value in the first operand is not less than
		730	/// the corresponding value in the second operand and returns the result of
		731	/// the comparison in the low-order bits of a vector of [4 x float].
		732	///
		733	/// \headerfile <x86intrin.h>
		734	///
		735	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
		736	/// instructions.
		737	///
		738	/// \param __a
		739	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		740	/// 32 bits of this operand are used in the comparison.
		741	/// \param __b
		742	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		743	/// 32 bits of this operand are used in the comparison.
		744	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		745	/// in the low-order bits.
		746	static __inline__ __m128 __DEFAULT_FN_ATTRS
		747	_mm_cmpnlt_ss(__m128 __a, __m128 __b)
		748	{
		749	return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b);
		750	}
		751
		752	/// Compares each of the corresponding 32-bit float values of the
		753	/// 128-bit vectors of [4 x float] to determine if the values in the first
		754	/// operand are not less than those in the second operand.
		755	///
		756	/// \headerfile <x86intrin.h>
		757	///
		758	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
		759	/// instructions.
		760	///
		761	/// \param __a
		762	/// A 128-bit vector of [4 x float].
		763	/// \param __b
		764	/// A 128-bit vector of [4 x float].
		765	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		766	static __inline__ __m128 __DEFAULT_FN_ATTRS
		767	_mm_cmpnlt_ps(__m128 __a, __m128 __b)
		768	{
			/* Operands go to the cmpnltps builtin in (__a, __b) order.
			 * _mm_cmpngt_ps calls the same builtin with the operands reversed. */
		769	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b);
		770	}
		771
		772	/// Compares two 32-bit float values in the low-order bits of both
		773	/// operands to determine if the value in the first operand is not less than
		774	/// or equal to the corresponding value in the second operand and returns
		775	/// the result of the comparison in the low-order bits of a vector of
		776	/// [4 x float].
		777	///
		778	/// \headerfile <x86intrin.h>
		779	///
		780	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
		781	/// instructions.
		782	///
		783	/// \param __a
		784	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		785	/// 32 bits of this operand are used in the comparison.
		786	/// \param __b
		787	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		788	/// 32 bits of this operand are used in the comparison.
		789	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		790	/// in the low-order bits.
		791	static __inline__ __m128 __DEFAULT_FN_ATTRS
		792	_mm_cmpnle_ss(__m128 __a, __m128 __b)
		793	{
			/* Operands go to the cmpnless builtin in (__a, __b) order.
			 * _mm_cmpnge_ss calls the same builtin with the operands reversed. */
		794	return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b);
		795	}
		796
		797	/// Compares each of the corresponding 32-bit float values of the
		798	/// 128-bit vectors of [4 x float] to determine if the values in the first
		799	/// operand are not less than or equal to those in the second operand.
		800	///
		801	/// \headerfile <x86intrin.h>
		802	///
		803	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
		804	/// instructions.
		805	///
		806	/// \param __a
		807	/// A 128-bit vector of [4 x float].
		808	/// \param __b
		809	/// A 128-bit vector of [4 x float].
		810	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		811	static __inline__ __m128 __DEFAULT_FN_ATTRS
		812	_mm_cmpnle_ps(__m128 __a, __m128 __b)
		813	{
			/* Operands go to the cmpnleps builtin in (__a, __b) order.
			 * _mm_cmpnge_ps calls the same builtin with the operands reversed. */
		814	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b);
		815	}
		816
		817	/// Compares two 32-bit float values in the low-order bits of both
		818	/// operands to determine if the value in the first operand is not greater
		819	/// than the corresponding value in the second operand and returns the
		820	/// result of the comparison in the low-order bits of a vector of
		821	/// [4 x float].
		822	///
		823	/// \headerfile <x86intrin.h>
		824	///
		825	/// This intrinsic corresponds to the <c> VCMPNLTSS / CMPNLTSS </c>
		826	/// instructions.
		827	///
		828	/// \param __a
		829	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		830	/// 32 bits of this operand are used in the comparison.
		831	/// \param __b
		832	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		833	/// 32 bits of this operand are used in the comparison.
		834	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		835	/// in the low-order bits.
		836	static __inline__ __m128 __DEFAULT_FN_ATTRS
		837	_mm_cmpngt_ss(__m128 __a, __m128 __b)
		838	{
			/* "__a not greater than __b" is evaluated as "__b not less than __a":
			 * the cmpnltss builtin receives the operands reversed, (__b, __a).
			 * Its result is not returned directly. The shuffle builds the return
			 * value from element 0 of the comparison result (index 4, i.e. the
			 * first element of the second shuffle input) and elements 1-3 of __a
			 * (indices 1, 2, 3). */
		839	return (__m128)__builtin_shufflevector((__v4sf)__a,
		840	(__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a),
		841	4, 1, 2, 3);
		842	}
		843
		844	/// Compares each of the corresponding 32-bit float values of the
		845	/// 128-bit vectors of [4 x float] to determine if the values in the first
		846	/// operand are not greater than those in the second operand.
		847	///
		848	/// \headerfile <x86intrin.h>
		849	///
		850	/// This intrinsic corresponds to the <c> VCMPNLTPS / CMPNLTPS </c>
		851	/// instructions.
		852	///
		853	/// \param __a
		854	/// A 128-bit vector of [4 x float].
		855	/// \param __b
		856	/// A 128-bit vector of [4 x float].
		857	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		858	static __inline__ __m128 __DEFAULT_FN_ATTRS
		859	_mm_cmpngt_ps(__m128 __a, __m128 __b)
		860	{
			/* "__a not greater than __b" is evaluated as "__b not less than __a":
			 * the cmpnltps builtin is called with the operands reversed,
			 * (__b, __a) instead of (__a, __b). */
		861	return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a);
		862	}
		863
		864	/// Compares two 32-bit float values in the low-order bits of both
		865	/// operands to determine if the value in the first operand is not greater
		866	/// than or equal to the corresponding value in the second operand and
		867	/// returns the result of the comparison in the low-order bits of a vector
		868	/// of [4 x float].
		869	///
		870	/// \headerfile <x86intrin.h>
		871	///
		872	/// This intrinsic corresponds to the <c> VCMPNLESS / CMPNLESS </c>
		873	/// instructions.
		874	///
		875	/// \param __a
		876	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		877	/// 32 bits of this operand are used in the comparison.
		878	/// \param __b
		879	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		880	/// 32 bits of this operand are used in the comparison.
		881	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		882	/// in the low-order bits.
		883	static __inline__ __m128 __DEFAULT_FN_ATTRS
		884	_mm_cmpnge_ss(__m128 __a, __m128 __b)
		885	{
			/* "__a not greater than or equal to __b" is evaluated as "__b not less
			 * than or equal to __a": the cmpnless builtin receives the operands
			 * reversed, (__b, __a). The shuffle then builds the return value from
			 * element 0 of the comparison result (index 4, i.e. the first element
			 * of the second shuffle input) and elements 1-3 of __a
			 * (indices 1, 2, 3). */
		886	return (__m128)__builtin_shufflevector((__v4sf)__a,
		887	(__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a),
		888	4, 1, 2, 3);
		889	}
		890
		891	/// Compares each of the corresponding 32-bit float values of the
		892	/// 128-bit vectors of [4 x float] to determine if the values in the first
		893	/// operand are not greater than or equal to those in the second operand.
		894	///
		895	/// \headerfile <x86intrin.h>
		896	///
		897	/// This intrinsic corresponds to the <c> VCMPNLEPS / CMPNLEPS </c>
		898	/// instructions.
		899	///
		900	/// \param __a
		901	/// A 128-bit vector of [4 x float].
		902	/// \param __b
		903	/// A 128-bit vector of [4 x float].
		904	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		905	static __inline__ __m128 __DEFAULT_FN_ATTRS
		906	_mm_cmpnge_ps(__m128 __a, __m128 __b)
		907	{
			/* "__a not greater than or equal to __b" is evaluated as "__b not less
			 * than or equal to __a": the cmpnleps builtin is called with the
			 * operands reversed, (__b, __a) instead of (__a, __b). */
		908	return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a);
		909	}
		910
		911	/// Compares two 32-bit float values in the low-order bits of both
		912	/// operands to determine if the value in the first operand is ordered with
		913	/// respect to the corresponding value in the second operand and returns the
		914	/// result of the comparison in the low-order bits of a vector of
		915	/// [4 x float].
		916	///
		917	/// \headerfile <x86intrin.h>
		918	///
		919	/// This intrinsic corresponds to the <c> VCMPORDSS / CMPORDSS </c>
		920	/// instructions.
		921	///
		922	/// \param __a
		923	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		924	/// 32 bits of this operand are used in the comparison.
		925	/// \param __b
		926	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		927	/// 32 bits of this operand are used in the comparison.
		928	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		929	/// in the low-order bits.
		930	static __inline__ __m128 __DEFAULT_FN_ATTRS
		931	_mm_cmpord_ss(__m128 __a, __m128 __b)
		932	{
			/* Forwards (__a, __b) to the cmpordss builtin. The casts only switch
			 * between the __m128 and __v4sf typedefs of the same 16-byte float
			 * vector. */
		933	return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b);
		934	}
		935
		936	/// Compares each of the corresponding 32-bit float values of the
		937	/// 128-bit vectors of [4 x float] to determine if the values in the first
		938	/// operand are ordered with respect to those in the second operand.
		939	///
		940	/// \headerfile <x86intrin.h>
		941	///
		942	/// This intrinsic corresponds to the <c> VCMPORDPS / CMPORDPS </c>
		943	/// instructions.
		944	///
		945	/// \param __a
		946	/// A 128-bit vector of [4 x float].
		947	/// \param __b
		948	/// A 128-bit vector of [4 x float].
		949	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		950	static __inline__ __m128 __DEFAULT_FN_ATTRS
		951	_mm_cmpord_ps(__m128 __a, __m128 __b)
		952	{
			/* Forwards (__a, __b) to the cmpordps builtin. The casts only switch
			 * between the __m128 and __v4sf typedefs of the same 16-byte float
			 * vector. */
		953	return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b);
		954	}
		955
		956	/// Compares two 32-bit float values in the low-order bits of both
		957	/// operands to determine if the value in the first operand is unordered
		958	/// with respect to the corresponding value in the second operand and
		959	/// returns the result of the comparison in the low-order bits of a vector
		960	/// of [4 x float].
		961	///
		962	/// \headerfile <x86intrin.h>
		963	///
		964	/// This intrinsic corresponds to the <c> VCMPUNORDSS / CMPUNORDSS </c>
		965	/// instructions.
		966	///
		967	/// \param __a
		968	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		969	/// 32 bits of this operand are used in the comparison.
		970	/// \param __b
		971	/// A 128-bit vector of [4 x float] containing one of the operands. The lower
		972	/// 32 bits of this operand are used in the comparison.
		973	/// \returns A 128-bit vector of [4 x float] containing the comparison results
		974	/// in the low-order bits.
		975	static __inline__ __m128 __DEFAULT_FN_ATTRS
		976	_mm_cmpunord_ss(__m128 __a, __m128 __b)
		977	{
			/* Forwards (__a, __b) to the cmpunordss builtin; counterpart of
			 * _mm_cmpord_ss, which calls the cmpordss builtin instead. */
		978	return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b);
		979	}
		980
		981	/// Compares each of the corresponding 32-bit float values of the
		982	/// 128-bit vectors of [4 x float] to determine if the values in the first
		983	/// operand are unordered with respect to those in the second operand.
		984	///
		985	/// \headerfile <x86intrin.h>
		986	///
		987	/// This intrinsic corresponds to the <c> VCMPUNORDPS / CMPUNORDPS </c>
		988	/// instructions.
		989	///
		990	/// \param __a
		991	/// A 128-bit vector of [4 x float].
		992	/// \param __b
		993	/// A 128-bit vector of [4 x float].
		994	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		995	static __inline__ __m128 __DEFAULT_FN_ATTRS
		996	_mm_cmpunord_ps(__m128 __a, __m128 __b)
		997	{
			/* Forwards (__a, __b) to the cmpunordps builtin; counterpart of
			 * _mm_cmpord_ps, which calls the cmpordps builtin instead. */
		998	return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b);
		999	}
		1000
		1001	/// Compares two 32-bit float values in the low-order bits of both
		1002	/// operands for equality and returns the result of the comparison.
		1003	///
		1004	/// If either of the two lower 32-bit values is NaN, 0 is returned.
		1005	///
		1006	/// \headerfile <x86intrin.h>
		1007	///
		1008	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
		1009	/// instructions.
		1010	///
		1011	/// \param __a
		1012	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1013	/// used in the comparison.
		1014	/// \param __b
		1015	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1016	/// used in the comparison.
		1017	/// \returns An integer containing the comparison results. If either of the
		1018	/// two lower 32-bit values is NaN, 0 is returned.
		1019	static __inline__ int __DEFAULT_FN_ATTRS
		1020	_mm_comieq_ss(__m128 __a, __m128 __b)
		1021	{
			/* No post-processing: the value of the comieq builtin for (__a, __b)
			 * is returned directly. _mm_ucomieq_ss has the same shape but calls
			 * the ucomieq builtin. */
		1022	return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b);
		1023	}
		1024
		1025	/// Compares two 32-bit float values in the low-order bits of both
		1026	/// operands to determine if the first operand is less than the second
		1027	/// operand and returns the result of the comparison.
		1028	///
		1029	/// If either of the two lower 32-bit values is NaN, 0 is returned.
		1030	///
		1031	/// \headerfile <x86intrin.h>
		1032	///
		1033	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c>
		1034	/// instructions.
		1035	///
		1036	/// \param __a
		1037	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1038	/// used in the comparison.
		1039	/// \param __b
		1040	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1041	/// used in the comparison.
		1042	/// \returns An integer containing the comparison results. If either of the two
		1043	/// lower 32-bit values is NaN, 0 is returned.
		1044	static __inline__ int __DEFAULT_FN_ATTRS
		1045	_mm_comilt_ss(__m128 __a, __m128 __b)
		1046	{
			/* No post-processing: the value of the comilt builtin for (__a, __b)
			 * is returned directly. _mm_ucomilt_ss has the same shape but calls
			 * the ucomilt builtin. */
		1047	return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b);
		1048	}
		1049
		1050	/// Compares two 32-bit float values in the low-order bits of both
		1051	/// operands to determine if the first operand is less than or equal to the
		1052	/// second operand and returns the result of the comparison.
		1053	///
		1054	/// If either of the two lower 32-bit values is NaN, 0 is returned.
		1055	///
		1056	/// \headerfile <x86intrin.h>
		1057	///
		1058	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
		1059	///
		1060	/// \param __a
		1061	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1062	/// used in the comparison.
		1063	/// \param __b
		1064	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1065	/// used in the comparison.
		1066	/// \returns An integer containing the comparison results. If either of the two
		1067	/// lower 32-bit values is NaN, 0 is returned.
		1068	static __inline__ int __DEFAULT_FN_ATTRS
		1069	_mm_comile_ss(__m128 __a, __m128 __b)
		1070	{
			/* No post-processing: the value of the comile builtin for (__a, __b)
			 * is returned directly. _mm_ucomile_ss has the same shape but calls
			 * the ucomile builtin. */
		1071	return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b);
		1072	}
		1073
		1074	/// Compares two 32-bit float values in the low-order bits of both
		1075	/// operands to determine if the first operand is greater than the second
		1076	/// operand and returns the result of the comparison.
		1077	///
		1078	/// If either of the two lower 32-bit values is NaN, 0 is returned.
		1079	///
		1080	/// \headerfile <x86intrin.h>
		1081	///
		1082	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
		1083	///
		1084	/// \param __a
		1085	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1086	/// used in the comparison.
		1087	/// \param __b
		1088	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1089	/// used in the comparison.
		1090	/// \returns An integer containing the comparison results. If either of the
		1091	/// two lower 32-bit values is NaN, 0 is returned.
		1092	static __inline__ int __DEFAULT_FN_ATTRS
		1093	_mm_comigt_ss(__m128 __a, __m128 __b)
		1094	{
			/* No post-processing: the value of the comigt builtin for (__a, __b)
			 * is returned directly. _mm_ucomigt_ss has the same shape but calls
			 * the ucomigt builtin. */
		1095	return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b);
		1096	}
		1097
		1098	/// Compares two 32-bit float values in the low-order bits of both
		1099	/// operands to determine if the first operand is greater than or equal to
		1100	/// the second operand and returns the result of the comparison.
		1101	///
		1102	/// If either of the two lower 32-bit values is NaN, 0 is returned.
		1103	///
		1104	/// \headerfile <x86intrin.h>
		1105	///
		1106	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
		1107	///
		1108	/// \param __a
		1109	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1110	/// used in the comparison.
		1111	/// \param __b
		1112	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1113	/// used in the comparison.
		1114	/// \returns An integer containing the comparison results. If either of the two
		1115	/// lower 32-bit values is NaN, 0 is returned.
		1116	static __inline__ int __DEFAULT_FN_ATTRS
		1117	_mm_comige_ss(__m128 __a, __m128 __b)
		1118	{
			/* No post-processing: the value of the comige builtin for (__a, __b)
			 * is returned directly. _mm_ucomige_ss has the same shape but calls
			 * the ucomige builtin. */
		1119	return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b);
		1120	}
		1121
		1122	/// Compares two 32-bit float values in the low-order bits of both
		1123	/// operands to determine if the first operand is not equal to the second
		1124	/// operand and returns the result of the comparison.
		1125	///
		1126	/// If either of the two lower 32-bit values is NaN, 1 is returned.
		1127	///
		1128	/// \headerfile <x86intrin.h>
		1129	///
		1130	/// This intrinsic corresponds to the <c> VCOMISS / COMISS </c> instructions.
		1131	///
		1132	/// \param __a
		1133	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1134	/// used in the comparison.
		1135	/// \param __b
		1136	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1137	/// used in the comparison.
		1138	/// \returns An integer containing the comparison results. If either of the
		1139	/// two lower 32-bit values is NaN, 1 is returned.
		1140	static __inline__ int __DEFAULT_FN_ATTRS
		1141	_mm_comineq_ss(__m128 __a, __m128 __b)
		1142	{
			/* No post-processing: the value of the comineq builtin for (__a, __b)
			 * is returned directly. _mm_ucomineq_ss has the same shape but calls
			 * the ucomineq builtin. */
		1143	return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b);
		1144	}
		1145
		1146	/// Performs an unordered comparison of two 32-bit float values using
		1147	/// the low-order bits of both operands to determine equality and returns
		1148	/// the result of the comparison.
		1149	///
		1150	/// If either of the two lower 32-bit values is NaN, 0 is returned.
		1151	///
		1152	/// \headerfile <x86intrin.h>
		1153	///
		1154	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
		1155	///
		1156	/// \param __a
		1157	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1158	/// used in the comparison.
		1159	/// \param __b
		1160	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1161	/// used in the comparison.
		1162	/// \returns An integer containing the comparison results. If either of the two
		1163	/// lower 32-bit values is NaN, 0 is returned.
		1164	static __inline__ int __DEFAULT_FN_ATTRS
		1165	_mm_ucomieq_ss(__m128 __a, __m128 __b)
		1166	{
			/* No post-processing: the value of the ucomieq builtin for (__a, __b)
			 * is returned directly. */
		1167	return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b);
		1168	}
		1169
		1170	/// Performs an unordered comparison of two 32-bit float values using
		1171	/// the low-order bits of both operands to determine if the first operand is
		1172	/// less than the second operand and returns the result of the comparison.
		1173	///
		1174	/// If either of the two lower 32-bit values is NaN, 0 is returned.
		1175	///
		1176	/// \headerfile <x86intrin.h>
		1177	///
		1178	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
		1179	///
		1180	/// \param __a
		1181	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1182	/// used in the comparison.
		1183	/// \param __b
		1184	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1185	/// used in the comparison.
		1186	/// \returns An integer containing the comparison results. If either of the two
		1187	/// lower 32-bit values is NaN, 0 is returned.
		1188	static __inline__ int __DEFAULT_FN_ATTRS
		1189	_mm_ucomilt_ss(__m128 __a, __m128 __b)
		1190	{
			/* No post-processing: the value of the ucomilt builtin for (__a, __b)
			 * is returned directly. */
		1191	return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b);
		1192	}
		1193
		1194	/// Performs an unordered comparison of two 32-bit float values using
		1195	/// the low-order bits of both operands to determine if the first operand is
		1196	/// less than or equal to the second operand and returns the result of the
		1197	/// comparison.
		1198	///
		1199	/// If either of the two lower 32-bit values is NaN, 0 is returned.
		1200	///
		1201	/// \headerfile <x86intrin.h>
		1202	///
		1203	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
		1204	///
		1205	/// \param __a
		1206	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1207	/// used in the comparison.
		1208	/// \param __b
		1209	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1210	/// used in the comparison.
		1211	/// \returns An integer containing the comparison results. If either of the two
		1212	/// lower 32-bit values is NaN, 0 is returned.
		1213	static __inline__ int __DEFAULT_FN_ATTRS
		1214	_mm_ucomile_ss(__m128 __a, __m128 __b)
		1215	{
			/* No post-processing: the value of the ucomile builtin for (__a, __b)
			 * is returned directly. */
		1216	return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b);
		1217	}
		1218
		1219	/// Performs an unordered comparison of two 32-bit float values using
		1220	/// the low-order bits of both operands to determine if the first operand is
		1221	/// greater than the second operand and returns the result of the
		1222	/// comparison.
		1223	///
		1224	/// If either of the two lower 32-bit values is NaN, 0 is returned.
		1225	///
		1226	/// \headerfile <x86intrin.h>
		1227	///
		1228	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
		1229	///
		1230	/// \param __a
		1231	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1232	/// used in the comparison.
		1233	/// \param __b
		1234	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1235	/// used in the comparison.
		1236	/// \returns An integer containing the comparison results. If either of the two
		1237	/// lower 32-bit values is NaN, 0 is returned.
		1238	static __inline__ int __DEFAULT_FN_ATTRS
		1239	_mm_ucomigt_ss(__m128 __a, __m128 __b)
		1240	{
			/* No post-processing: the value of the ucomigt builtin for (__a, __b)
			 * is returned directly. */
		1241	return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b);
		1242	}
		1243
		1244	/// Performs an unordered comparison of two 32-bit float values using
		1245	/// the low-order bits of both operands to determine if the first operand is
		1246	/// greater than or equal to the second operand and returns the result of
		1247	/// the comparison.
		1248	///
		1249	/// If either of the two lower 32-bit values is NaN, 0 is returned.
		1250	///
		1251	/// \headerfile <x86intrin.h>
		1252	///
		1253	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
		1254	///
		1255	/// \param __a
		1256	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1257	/// used in the comparison.
		1258	/// \param __b
		1259	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1260	/// used in the comparison.
		1261	/// \returns An integer containing the comparison results. If either of the two
		1262	/// lower 32-bit values is NaN, 0 is returned.
		1263	static __inline__ int __DEFAULT_FN_ATTRS
		1264	_mm_ucomige_ss(__m128 __a, __m128 __b)
		1265	{
			/* No post-processing: the value of the ucomige builtin for (__a, __b)
			 * is returned directly. */
		1266	return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b);
		1267	}
		1268
		1269	/// Performs an unordered comparison of two 32-bit float values using
		1270	/// the low-order bits of both operands to determine inequality and returns
		1271	/// the result of the comparison.
		1272	///
		1273	/// If either of the two lower 32-bit values is NaN, 1 is returned.
		1274	///
		1275	/// \headerfile <x86intrin.h>
		1276	///
		1277	/// This intrinsic corresponds to the <c> VUCOMISS / UCOMISS </c> instructions.
		1278	///
		1279	/// \param __a
		1280	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1281	/// used in the comparison.
		1282	/// \param __b
		1283	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1284	/// used in the comparison.
		1285	/// \returns An integer containing the comparison results. If either of the two
		1286	/// lower 32-bit values is NaN, 1 is returned.
		1287	static __inline__ int __DEFAULT_FN_ATTRS
		1288	_mm_ucomineq_ss(__m128 __a, __m128 __b)
		1289	{
			/* No post-processing: the value of the ucomineq builtin for (__a, __b)
			 * is returned directly. */
		1290	return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b);
		1291	}
		1292
		1293	/// Converts a float value contained in the lower 32 bits of a vector of
		1294	/// [4 x float] into a 32-bit integer.
		1295	///
		1296	/// \headerfile <x86intrin.h>
		1297	///
		1298	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
		1299	/// instructions.
		1300	///
		1301	/// \param __a
		1302	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1303	/// used in the conversion.
		1304	/// \returns A 32-bit integer containing the converted value.
		1305	static __inline__ int __DEFAULT_FN_ATTRS
		1306	_mm_cvtss_si32(__m128 __a)
		1307	{
			/* Direct call to the cvtss2si builtin on __a viewed as __v4sf.
			 * _mm_cvt_ss2si is an alternate name that forwards here. */
		1308	return __builtin_ia32_cvtss2si((__v4sf)__a);
		1309	}
		1310
		1311	/// Converts a float value contained in the lower 32 bits of a vector of
		1312	/// [4 x float] into a 32-bit integer.
		1313	///
		1314	/// \headerfile <x86intrin.h>
		1315	///
		1316	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
		1317	/// instructions.
		1318	///
		1319	/// \param __a
		1320	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1321	/// used in the conversion.
		1322	/// \returns A 32-bit integer containing the converted value.
		1323	static __inline__ int __DEFAULT_FN_ATTRS
		1324	_mm_cvt_ss2si(__m128 __a)
		1325	{
			/* Alternate name for _mm_cvtss_si32: forwards __a unchanged and
			 * returns that function's result. */
		1326	return _mm_cvtss_si32(__a);
		1327	}
		1328
		1329	#ifdef __x86_64__
		1330
		1331	/// Converts a float value contained in the lower 32 bits of a vector of
		1332	/// [4 x float] into a 64-bit integer.
		1333	///
		1334	/// \headerfile <x86intrin.h>
		1335	///
		1336	/// This intrinsic corresponds to the <c> VCVTSS2SI / CVTSS2SI </c>
		1337	/// instructions.
		1338	///
		1339	/// \param __a
		1340	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1341	/// used in the conversion.
		1342	/// \returns A 64-bit integer containing the converted value.
		1343	static __inline__ long long __DEFAULT_FN_ATTRS
		1344	_mm_cvtss_si64(__m128 __a)
		1345	{
			/* 64-bit counterpart of _mm_cvtss_si32, calling the cvtss2si64
			 * builtin. Only compiled when __x86_64__ is defined (see the
			 * enclosing #ifdef). */
		1346	return __builtin_ia32_cvtss2si64((__v4sf)__a);
		1347	}
		1348
		1349	#endif
		1350
		1351	/// Converts two low-order float values in a 128-bit vector of
		1352	/// [4 x float] into a 64-bit vector of [2 x i32].
		1353	///
		1354	/// \headerfile <x86intrin.h>
		1355	///
		1356	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
		1357	///
		1358	/// \param __a
		1359	/// A 128-bit vector of [4 x float].
		1360	/// \returns A 64-bit integer vector containing the converted values.
		1361	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		1362	_mm_cvtps_pi32(__m128 __a)
		1363	{
			/* Declared with __DEFAULT_FN_ATTRS_MMX, i.e. target("mmx,sse") and a
			 * 64-bit minimum vector width, rather than the SSE-only attribute
			 * set. The result of the cvtps2pi builtin is cast to __m64. */
		1364	return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a);
		1365	}
		1366
		1367	/// Converts two low-order float values in a 128-bit vector of
		1368	/// [4 x float] into a 64-bit vector of [2 x i32].
		1369	///
		1370	/// \headerfile <x86intrin.h>
		1371	///
		1372	/// This intrinsic corresponds to the <c> CVTPS2PI </c> instruction.
		1373	///
		1374	/// \param __a
		1375	/// A 128-bit vector of [4 x float].
		1376	/// \returns A 64-bit integer vector containing the converted values.
		1377	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		1378	_mm_cvt_ps2pi(__m128 __a)
		1379	{
			/* Alternate name for _mm_cvtps_pi32: forwards __a unchanged and
			 * returns that function's result. */
		1380	return _mm_cvtps_pi32(__a);
		1381	}
		1382
		1383	/// Converts a float value contained in the lower 32 bits of a vector of
		1384	/// [4 x float] into a 32-bit integer, truncating the result when it is
		1385	/// inexact.
		1386	///
		1387	/// \headerfile <x86intrin.h>
		1388	///
		1389	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
		1390	/// instructions.
		1391	///
		1392	/// \param __a
		1393	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1394	/// used in the conversion.
		1395	/// \returns A 32-bit integer containing the converted value.
		1396	static __inline__ int __DEFAULT_FN_ATTRS
		1397	_mm_cvttss_si32(__m128 __a)
		1398	{
			/* Direct call to the cvttss2si builtin on __a viewed as __v4sf.
			 * _mm_cvtt_ss2si is an alternate name that forwards here. */
		1399	return __builtin_ia32_cvttss2si((__v4sf)__a);
		1400	}
		1401
		1402	/// Converts a float value contained in the lower 32 bits of a vector of
		1403	/// [4 x float] into a 32-bit integer, truncating the result when it is
		1404	/// inexact.
		1405	///
		1406	/// \headerfile <x86intrin.h>
		1407	///
		1408	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
		1409	/// instructions.
		1410	///
		1411	/// \param __a
		1412	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1413	/// used in the conversion.
		1414	/// \returns A 32-bit integer containing the converted value.
		1415	static __inline__ int __DEFAULT_FN_ATTRS
		1416	_mm_cvtt_ss2si(__m128 __a)
		1417	{
			/* Alternate name for _mm_cvttss_si32: forwards __a unchanged and
			 * returns that function's result. */
		1418	return _mm_cvttss_si32(__a);
		1419	}
		1420
		1421	#ifdef __x86_64__
		1422	/// Converts a float value contained in the lower 32 bits of a vector of
		1423	/// [4 x float] into a 64-bit integer, truncating the result when it is
		1424	/// inexact.
		1425	///
		1426	/// \headerfile <x86intrin.h>
		1427	///
		1428	/// This intrinsic corresponds to the <c> VCVTTSS2SI / CVTTSS2SI </c>
		1429	/// instructions.
		1430	///
		1431	/// \param __a
		1432	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1433	/// used in the conversion.
		1434	/// \returns A 64-bit integer containing the converted value.
		1435	static __inline__ long long __DEFAULT_FN_ATTRS
		1436	_mm_cvttss_si64(__m128 __a)
		1437	{
			/* 64-bit counterpart of _mm_cvttss_si32, calling the cvttss2si64
			 * builtin. Only compiled when __x86_64__ is defined (see the
			 * enclosing #ifdef). */
		1438	return __builtin_ia32_cvttss2si64((__v4sf)__a);
		1439	}
		1440	#endif
		1441
		1442	/// Converts two low-order float values in a 128-bit vector of
		1443	/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result
		1444	/// when it is inexact.
		1445	///
		1446	/// \headerfile <x86intrin.h>
		1447	///
		1448	/// This intrinsic corresponds to the <c> CVTTPS2PI </c>
		1449	/// instruction.
		1450	///
		1451	/// \param __a
		1452	/// A 128-bit vector of [4 x float].
		1453	/// \returns A 64-bit integer vector containing the converted values.
		1454	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		1455	_mm_cvttps_pi32(__m128 __a)
		1456	{
			/* Declared with __DEFAULT_FN_ATTRS_MMX, i.e. target("mmx,sse") and a
			 * 64-bit minimum vector width, rather than the SSE-only attribute
			 * set. The result of the cvttps2pi builtin is cast to __m64. */
		1457	return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a);
		1458	}
		1459
		1460	/// Converts two low-order float values in a 128-bit vector of [4 x
		1461	/// float] into a 64-bit vector of [2 x i32], truncating the result when it
		1462	/// is inexact.
		1463	///
		1464	/// \headerfile <x86intrin.h>
		1465	///
		1466	/// This intrinsic corresponds to the <c> CVTTPS2PI </c> instruction.
		1467	///
		1468	/// \param __a
		1469	/// A 128-bit vector of [4 x float].
		1470	/// \returns A 64-bit integer vector containing the converted values.
		1471	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		1472	_mm_cvtt_ps2pi(__m128 __a)
		1473	{
			/* Alternate name for _mm_cvttps_pi32: forwards __a unchanged and
			 * returns that function's result. */
		1474	return _mm_cvttps_pi32(__a);
		1475	}
		1476
		1477	/// Converts a 32-bit signed integer value into a floating point value
		1478	/// and writes it to the lower 32 bits of the destination. The remaining
		1479	/// higher order elements of the destination vector are copied from the
		1480	/// corresponding elements in the first operand.
		1481	///
		1482	/// \headerfile <x86intrin.h>
		1483	///
		1484	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instructions.
		1485	///
		1486	/// \param __a
		1487	/// A 128-bit vector of [4 x float].
		1488	/// \param __b
		1489	/// A 32-bit signed integer operand containing the value to be converted.
		1490	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
		1491	/// converted value of the second operand. The upper 96 bits are copied from
		1492	/// the upper 96 bits of the first operand.
		1493	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1494	_mm_cvtsi32_ss(__m128 __a, int __b)
		1495	{
			/* No builtin is used here. __a is a by-value copy, so only the local
			 * vector is modified. Assigning the int to element 0 converts it to
			 * float implicitly; elements 1-3 keep the values that were passed in. */
		1496	__a[0] = __b;
		1497	return __a;
		1498	}
		1499
		1500	/// Converts a 32-bit signed integer value into a floating point value
		1501	/// and writes it to the lower 32 bits of the destination. The remaining
		1502	/// higher order elements of the destination are copied from the
		1503	/// corresponding elements in the first operand.
		1504	///
		1505	/// \headerfile <x86intrin.h>
		1506	///
		1507	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instructions.
		1508	///
		1509	/// \param __a
		1510	/// A 128-bit vector of [4 x float].
		1511	/// \param __b
		1512	/// A 32-bit signed integer operand containing the value to be converted.
		1513	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
		1514	/// converted value of the second operand. The upper 96 bits are copied from
		1515	/// the upper 96 bits of the first operand.
		1516	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1517	_mm_cvt_si2ss(__m128 __a, int __b)
		1518	{
			/* Alternate name for _mm_cvtsi32_ss: forwards both arguments
			 * unchanged and returns that function's result. */
		1519	return _mm_cvtsi32_ss(__a, __b);
		1520	}
		1521
		1522	#ifdef __x86_64__
		1523
		1524	/// Converts a 64-bit signed integer value into a floating point value
		1525	/// and writes it to the lower 32 bits of the destination. The remaining
		1526	/// higher order elements of the destination are copied from the
		1527	/// corresponding elements in the first operand.
		1528	///
		1529	/// \headerfile <x86intrin.h>
		1530	///
		1531	/// This intrinsic corresponds to the <c> VCVTSI2SS / CVTSI2SS </c> instructions.
		1532	///
		1533	/// \param __a
		1534	/// A 128-bit vector of [4 x float].
		1535	/// \param __b
		1536	/// A 64-bit signed integer operand containing the value to be converted.
		1537	/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the
		1538	/// converted value of the second operand. The upper 96 bits are copied from
		1539	/// the upper 96 bits of the first operand.
		1540	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1541	_mm_cvtsi64_ss(__m128 __a, long long __b)
		1542	{
		1543	__a[0] = __b;
		1544	return __a;
		1545	}
		1546
		1547	#endif
		1548
		1549	/// Converts two elements of a 64-bit vector of [2 x i32] into two
		1550	/// floating point values and writes them to the lower 64-bits of the
		1551	/// destination. The remaining higher order elements of the destination are
		1552	/// copied from the corresponding elements in the first operand.
		1553	///
		1554	/// \headerfile <x86intrin.h>
		1555	///
		1556	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
		1557	///
		1558	/// \param __a
		1559	/// A 128-bit vector of [4 x float].
		1560	/// \param __b
		1561	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
		1562	/// and written to the corresponding low-order elements in the destination.
		1563	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
		1564	/// converted value of the second operand. The upper 64 bits are copied from
		1565	/// the upper 64 bits of the first operand.
		1566	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
		1567	_mm_cvtpi32_ps(__m128 __a, __m64 __b)
		1568	{
		1569	return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b);
		1570	}
		1571
		1572	/// Converts two elements of a 64-bit vector of [2 x i32] into two
		1573	/// floating point values and writes them to the lower 64-bits of the
		1574	/// destination. The remaining higher order elements of the destination are
		1575	/// copied from the corresponding elements in the first operand.
		1576	///
		1577	/// \headerfile <x86intrin.h>
		1578	///
		1579	/// This intrinsic corresponds to the <c> CVTPI2PS </c> instruction.
		1580	///
		1581	/// \param __a
		1582	/// A 128-bit vector of [4 x float].
		1583	/// \param __b
		1584	/// A 64-bit vector of [2 x i32]. The elements in this vector are converted
		1585	/// and written to the corresponding low-order elements in the destination.
		1586	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
		1587	/// converted value from the second operand. The upper 64 bits are copied
		1588	/// from the upper 64 bits of the first operand.
		1589	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
		1590	_mm_cvt_pi2ps(__m128 __a, __m64 __b)
		1591	{
		1592	return _mm_cvtpi32_ps(__a, __b);
		1593	}
		1594
		1595	/// Extracts a float value contained in the lower 32 bits of a vector of
		1596	/// [4 x float].
		1597	///
		1598	/// \headerfile <x86intrin.h>
		1599	///
		1600	/// This intrinsic has no corresponding instruction.
		1601	///
		1602	/// \param __a
		1603	/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are
		1604	/// used in the extraction.
		1605	/// \returns A 32-bit float containing the extracted value.
		1606	static __inline__ float __DEFAULT_FN_ATTRS
		1607	_mm_cvtss_f32(__m128 __a)
		1608	{
		1609	return __a[0];
		1610	}
		1611
		1612	/// Loads two packed float values from the address \a __p into the
		1613	/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits
		1614	/// are copied from the low-order bits of the first operand.
		1615	///
		1616	/// \headerfile <x86intrin.h>
		1617	///
		1618	/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
		1619	///
		1620	/// \param __a
		1621	/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0]
		1622	/// of the destination.
		1623	/// \param __p
		1624	/// A pointer to two packed float values. Bits [63:0] are written to bits
		1625	/// [127:64] of the destination.
		1626	/// \returns A 128-bit vector of [4 x float] containing the moved values.
		1627	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1628	_mm_loadh_pi(__m128 __a, const __m64 *__p)
		1629	{
		1630	typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8)));
		1631	struct __mm_loadh_pi_struct {
		1632	__mm_loadh_pi_v2f32 __u;
		1633	} __attribute__((__packed__, __may_alias__));
		1634	__mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u;
		1635	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
		1636	return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5);
		1637	}
		1638
		1639	/// Loads two packed float values from the address \a __p into the
		1640	/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits
		1641	/// are copied from the high-order bits of the first operand.
		1642	///
		1643	/// \headerfile <x86intrin.h>
		1644	///
		1645	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
		1646	///
		1647	/// \param __a
		1648	/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits
		1649	/// [127:64] of the destination.
		1650	/// \param __p
		1651	/// A pointer to two packed float values. Bits [63:0] are written to bits
		1652	/// [63:0] of the destination.
		1653	/// \returns A 128-bit vector of [4 x float] containing the moved values.
		1654	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1655	_mm_loadl_pi(__m128 __a, const __m64 *__p)
		1656	{
		1657	typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8)));
		1658	struct __mm_loadl_pi_struct {
		1659	__mm_loadl_pi_v2f32 __u;
		1660	} __attribute__((__packed__, __may_alias__));
		1661	__mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u;
		1662	__m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1);
		1663	return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3);
		1664	}
		1665
		1666	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
		1667	/// 32 bits of the vector are initialized with the single-precision
		1668	/// floating-point value loaded from a specified memory location. The upper
		1669	/// 96 bits are set to zero.
		1670	///
		1671	/// \headerfile <x86intrin.h>
		1672	///
		1673	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
		1674	///
		1675	/// \param __p
		1676	/// A pointer to a 32-bit memory location containing a single-precision
		1677	/// floating-point value.
		1678	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
		1679	/// lower 32 bits contain the value loaded from the memory location. The
		1680	/// upper 96 bits are set to zero.
		1681	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1682	_mm_load_ss(const float *__p)
		1683	{
		1684	struct __mm_load_ss_struct {
		1685	float __u;
		1686	} __attribute__((__packed__, __may_alias__));
		1687	float __u = ((const struct __mm_load_ss_struct*)__p)->__u;
		1688	return __extension__ (__m128){ __u, 0, 0, 0 };
		1689	}
		1690
		1691	/// Loads a 32-bit float value and duplicates it to all four vector
		1692	/// elements of a 128-bit vector of [4 x float].
		1693	///
		1694	/// \headerfile <x86intrin.h>
		1695	///
		1696	/// This intrinsic corresponds to the <c> VBROADCASTSS / MOVSS + shuffling </c>
		1697	/// instruction.
		1698	///
		1699	/// \param __p
		1700	/// A pointer to a float value to be loaded and duplicated.
		1701	/// \returns A 128-bit vector of [4 x float] containing the loaded and
		1702	/// duplicated values.
		1703	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1704	_mm_load1_ps(const float *__p)
		1705	{
		1706	struct __mm_load1_ps_struct {
		1707	float __u;
		1708	} __attribute__((__packed__, __may_alias__));
		1709	float __u = ((const struct __mm_load1_ps_struct*)__p)->__u;
		1710	return __extension__ (__m128){ __u, __u, __u, __u };
		1711	}
		1712
		/* Alternate spelling that expands to _mm_load1_ps. */
		#define _mm_load_ps1(p) _mm_load1_ps((p))
		1714
		1715	/// Loads a 128-bit floating-point vector of [4 x float] from an aligned
		1716	/// memory location.
		1717	///
		1718	/// \headerfile <x86intrin.h>
		1719	///
		1720	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
		1721	///
		1722	/// \param __p
		1723	/// A pointer to a 128-bit memory location. The address of the memory
		1724	/// location has to be 128-bit aligned.
		1725	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
		1726	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1727	_mm_load_ps(const float *__p)
		1728	{
		1729	return (const __m128)__p;
		1730	}
		1731
		1732	/// Loads a 128-bit floating-point vector of [4 x float] from an
		1733	/// unaligned memory location.
		1734	///
		1735	/// \headerfile <x86intrin.h>
		1736	///
		1737	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
		1738	///
		1739	/// \param __p
		1740	/// A pointer to a 128-bit memory location. The address of the memory
		1741	/// location does not have to be aligned.
		1742	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
		1743	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1744	_mm_loadu_ps(const float *__p)
		1745	{
		1746	struct __loadu_ps {
		1747	__m128_u __v;
		1748	} __attribute__((__packed__, __may_alias__));
		1749	return ((const struct __loadu_ps*)__p)->__v;
		1750	}
		1751
		1752	/// Loads four packed float values, in reverse order, from an aligned
		1753	/// memory location to 32-bit elements in a 128-bit vector of [4 x float].
		1754	///
		1755	/// \headerfile <x86intrin.h>
		1756	///
		1757	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
		1758	/// instruction.
		1759	///
		1760	/// \param __p
		1761	/// A pointer to a 128-bit memory location. The address of the memory
		1762	/// location has to be 128-bit aligned.
		1763	/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded
		1764	/// in reverse order.
		1765	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1766	_mm_loadr_ps(const float *__p)
		1767	{
		1768	__m128 __a = _mm_load_ps(__p);
		1769	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
		1770	}
		1771
		1772	/// Create a 128-bit vector of [4 x float] with undefined values.
		1773	///
		1774	/// \headerfile <x86intrin.h>
		1775	///
		1776	/// This intrinsic has no corresponding instruction.
		1777	///
		1778	/// \returns A 128-bit vector of [4 x float] containing undefined values.
		1779	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1780	_mm_undefined_ps(void)
		1781	{
		1782	return (__m128)__builtin_ia32_undef128();
		1783	}
		1784
		1785	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
		1786	/// 32 bits of the vector are initialized with the specified single-precision
		1787	/// floating-point value. The upper 96 bits are set to zero.
		1788	///
		1789	/// \headerfile <x86intrin.h>
		1790	///
		1791	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
		1792	///
		1793	/// \param __w
		1794	/// A single-precision floating-point value used to initialize the lower 32
		1795	/// bits of the result.
		1796	/// \returns An initialized 128-bit floating-point vector of [4 x float]. The
		1797	/// lower 32 bits contain the value provided in the source operand. The
		1798	/// upper 96 bits are set to zero.
		1799	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1800	_mm_set_ss(float __w)
		1801	{
		1802	return __extension__ (__m128){ __w, 0, 0, 0 };
		1803	}
		1804
		1805	/// Constructs a 128-bit floating-point vector of [4 x float], with each
		1806	/// of the four single-precision floating-point vector elements set to the
		1807	/// specified single-precision floating-point value.
		1808	///
		1809	/// \headerfile <x86intrin.h>
		1810	///
		1811	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
		1812	///
		1813	/// \param __w
		1814	/// A single-precision floating-point value used to initialize each vector
		1815	/// element of the result.
		1816	/// \returns An initialized 128-bit floating-point vector of [4 x float].
		1817	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1818	_mm_set1_ps(float __w)
		1819	{
		1820	return __extension__ (__m128){ __w, __w, __w, __w };
		1821	}
		1822
		1823	/* Microsoft specific. */
		1824	/// Constructs a 128-bit floating-point vector of [4 x float], with each
		1825	/// of the four single-precision floating-point vector elements set to the
		1826	/// specified single-precision floating-point value.
		1827	///
		1828	/// \headerfile <x86intrin.h>
		1829	///
		1830	/// This intrinsic corresponds to the <c> VPERMILPS / PERMILPS </c> instruction.
		1831	///
		1832	/// \param __w
		1833	/// A single-precision floating-point value used to initialize each vector
		1834	/// element of the result.
		1835	/// \returns An initialized 128-bit floating-point vector of [4 x float].
		1836	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1837	_mm_set_ps1(float __w)
		1838	{
		1839	return _mm_set1_ps(__w);
		1840	}
		1841
		1842	/// Constructs a 128-bit floating-point vector of [4 x float]
		1843	/// initialized with the specified single-precision floating-point values.
		1844	///
		1845	/// \headerfile <x86intrin.h>
		1846	///
		1847	/// This intrinsic is a utility function and does not correspond to a specific
		1848	/// instruction.
		1849	///
		1850	/// \param __z
		1851	/// A single-precision floating-point value used to initialize bits [127:96]
		1852	/// of the result.
		1853	/// \param __y
		1854	/// A single-precision floating-point value used to initialize bits [95:64]
		1855	/// of the result.
		1856	/// \param __x
		1857	/// A single-precision floating-point value used to initialize bits [63:32]
		1858	/// of the result.
		1859	/// \param __w
		1860	/// A single-precision floating-point value used to initialize bits [31:0]
		1861	/// of the result.
		1862	/// \returns An initialized 128-bit floating-point vector of [4 x float].
		1863	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1864	_mm_set_ps(float __z, float __y, float __x, float __w)
		1865	{
		1866	return __extension__ (__m128){ __w, __x, __y, __z };
		1867	}
		1868
		1869	/// Constructs a 128-bit floating-point vector of [4 x float],
		1870	/// initialized in reverse order with the specified 32-bit single-precision
		1871	/// float-point values.
		1872	///
		1873	/// \headerfile <x86intrin.h>
		1874	///
		1875	/// This intrinsic is a utility function and does not correspond to a specific
		1876	/// instruction.
		1877	///
		1878	/// \param __z
		1879	/// A single-precision floating-point value used to initialize bits [31:0]
		1880	/// of the result.
		1881	/// \param __y
		1882	/// A single-precision floating-point value used to initialize bits [63:32]
		1883	/// of the result.
		1884	/// \param __x
		1885	/// A single-precision floating-point value used to initialize bits [95:64]
		1886	/// of the result.
		1887	/// \param __w
		1888	/// A single-precision floating-point value used to initialize bits [127:96]
		1889	/// of the result.
		1890	/// \returns An initialized 128-bit floating-point vector of [4 x float].
		1891	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1892	_mm_setr_ps(float __z, float __y, float __x, float __w)
		1893	{
		1894	return __extension__ (__m128){ __z, __y, __x, __w };
		1895	}
		1896
		1897	/// Constructs a 128-bit floating-point vector of [4 x float] initialized
		1898	/// to zero.
		1899	///
		1900	/// \headerfile <x86intrin.h>
		1901	///
		1902	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
		1903	///
		1904	/// \returns An initialized 128-bit floating-point vector of [4 x float] with
		1905	/// all elements set to zero.
		1906	static __inline__ __m128 __DEFAULT_FN_ATTRS
		1907	_mm_setzero_ps(void)
		1908	{
		1909	return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
		1910	}
		1911
		1912	/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
		1913	/// memory location.
		1914	///
		1915	/// \headerfile <x86intrin.h>
		1916	///
		1917	/// This intrinsic corresponds to the <c> VMOVHPS / MOVHPS </c> instruction.
		1918	///
		1919	/// \param __p
		1920	/// A pointer to a 64-bit memory location.
		1921	/// \param __a
		1922	/// A 128-bit vector of [4 x float] containing the values to be stored.
		1923	static __inline__ void __DEFAULT_FN_ATTRS
		1924	_mm_storeh_pi(__m64 *__p, __m128 __a)
		1925	{
		1926	typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
		1927	struct __mm_storeh_pi_struct {
		1928	__mm_storeh_pi_v2f32 __u;
		1929	} __attribute__((__packed__, __may_alias__));
		1930	((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3);
		1931	}
		1932
		1933	/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a
		1934	/// memory location.
		1935	///
		1936	/// \headerfile <x86intrin.h>
		1937	///
		1938	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
		1939	///
		1940	/// \param __p
		1941	/// A pointer to a memory location that will receive the float values.
		1942	/// \param __a
		1943	/// A 128-bit vector of [4 x float] containing the values to be stored.
		1944	static __inline__ void __DEFAULT_FN_ATTRS
		1945	_mm_storel_pi(__m64 *__p, __m128 __a)
		1946	{
		1947	typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8)));
		1948	struct __mm_storeh_pi_struct {
		1949	__mm_storeh_pi_v2f32 __u;
		1950	} __attribute__((__packed__, __may_alias__));
		1951	((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1);
		1952	}
		1953
		1954	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a
		1955	/// memory location.
		1956	///
		1957	/// \headerfile <x86intrin.h>
		1958	///
		1959	/// This intrinsic corresponds to the <c> VMOVSS / MOVSS </c> instruction.
		1960	///
		1961	/// \param __p
		1962	/// A pointer to a 32-bit memory location.
		1963	/// \param __a
		1964	/// A 128-bit vector of [4 x float] containing the value to be stored.
		1965	static __inline__ void __DEFAULT_FN_ATTRS
		1966	_mm_store_ss(float *__p, __m128 __a)
		1967	{
		1968	struct __mm_store_ss_struct {
		1969	float __u;
		1970	} __attribute__((__packed__, __may_alias__));
		1971	((struct __mm_store_ss_struct*)__p)->__u = __a[0];
		1972	}
		1973
		1974	/// Stores a 128-bit vector of [4 x float] to an unaligned memory
		1975	/// location.
		1976	///
		1977	/// \headerfile <x86intrin.h>
		1978	///
		1979	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
		1980	///
		1981	/// \param __p
		1982	/// A pointer to a 128-bit memory location. The address of the memory
		1983	/// location does not have to be aligned.
		1984	/// \param __a
		1985	/// A 128-bit vector of [4 x float] containing the values to be stored.
		1986	static __inline__ void __DEFAULT_FN_ATTRS
		1987	_mm_storeu_ps(float *__p, __m128 __a)
		1988	{
		1989	struct __storeu_ps {
		1990	__m128_u __v;
		1991	} __attribute__((__packed__, __may_alias__));
		1992	((struct __storeu_ps*)__p)->__v = __a;
		1993	}
		1994
		1995	/// Stores a 128-bit vector of [4 x float] into an aligned memory
		1996	/// location.
		1997	///
		1998	/// \headerfile <x86intrin.h>
		1999	///
		2000	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
		2001	///
		2002	/// \param __p
		2003	/// A pointer to a 128-bit memory location. The address of the memory
		2004	/// location has to be 16-byte aligned.
		2005	/// \param __a
		2006	/// A 128-bit vector of [4 x float] containing the values to be stored.
		2007	static __inline__ void __DEFAULT_FN_ATTRS
		2008	_mm_store_ps(float *__p, __m128 __a)
		2009	{
		2010	(__m128)__p = __a;
		2011	}
		2012
		2013	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
		2014	/// four contiguous elements in an aligned memory location.
		2015	///
		2016	/// \headerfile <x86intrin.h>
		2017	///
		2018	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
		2019	/// instruction.
		2020	///
		2021	/// \param __p
		2022	/// A pointer to a 128-bit memory location.
		2023	/// \param __a
		2024	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
		2025	/// of the four contiguous elements pointed by \a __p.
		2026	static __inline__ void __DEFAULT_FN_ATTRS
		2027	_mm_store1_ps(float *__p, __m128 __a)
		2028	{
		2029	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0);
		2030	_mm_store_ps(__p, __a);
		2031	}
		2032
		2033	/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into
		2034	/// four contiguous elements in an aligned memory location.
		2035	///
		2036	/// \headerfile <x86intrin.h>
		2037	///
		2038	/// This intrinsic corresponds to <c> VMOVAPS / MOVAPS + shuffling </c>
		2039	/// instruction.
		2040	///
		2041	/// \param __p
		2042	/// A pointer to a 128-bit memory location.
		2043	/// \param __a
		2044	/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each
		2045	/// of the four contiguous elements pointed by \a __p.
		2046	static __inline__ void __DEFAULT_FN_ATTRS
		2047	_mm_store_ps1(float *__p, __m128 __a)
		2048	{
		2049	_mm_store1_ps(__p, __a);
		2050	}
		2051
		2052	/// Stores float values from a 128-bit vector of [4 x float] to an
		2053	/// aligned memory location in reverse order.
		2054	///
		2055	/// \headerfile <x86intrin.h>
		2056	///
		2057	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS + shuffling </c>
		2058	/// instruction.
		2059	///
		2060	/// \param __p
		2061	/// A pointer to a 128-bit memory location. The address of the memory
		2062	/// location has to be 128-bit aligned.
		2063	/// \param __a
		2064	/// A 128-bit vector of [4 x float] containing the values to be stored.
		2065	static __inline__ void __DEFAULT_FN_ATTRS
		2066	_mm_storer_ps(float *__p, __m128 __a)
		2067	{
		2068	__a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0);
		2069	_mm_store_ps(__p, __a);
		2070	}
		2071
		/* Prefetch hint selectors, listed in ascending numeric order. The
		 * _mm_prefetch macro passes bits [1:0] of the selector as the third
		 * argument of __builtin_prefetch and bit 2 as the second argument. */
		#define _MM_HINT_NTA 0
		#define _MM_HINT_T2  1
		#define _MM_HINT_T1  2
		#define _MM_HINT_T0  3
		#define _MM_HINT_ET1 6 /* bit 2 set; low two bits equal _MM_HINT_T1 */
		#define _MM_HINT_ET0 7 /* bit 2 set; low two bits equal _MM_HINT_T0 */
		2078
		2079	#ifndef _MSC_VER
		2080	/* FIXME: We have to #define this because "sel" must be a constant integer, and
		2081	Sema doesn't do any form of constant propagation yet. */
		2082
		2083	/// Loads one cache line of data from the specified address to a location
		2084	/// closer to the processor.
		2085	///
		2086	/// \headerfile <x86intrin.h>
		2087	///
		2088	/// \code
		2089	/// void _mm_prefetch(const void *a, const int sel);
		2090	/// \endcode
		2091	///
		2092	/// This intrinsic corresponds to the <c> PREFETCHNTA </c> instruction.
		2093	///
		2094	/// \param a
		2095	/// A pointer to a memory location containing a cache line of data.
		2096	/// \param sel
		2097	/// A predefined integer constant specifying the type of prefetch
		2098	/// operation: \n
		2099	/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The
		2100	/// PREFETCHNTA instruction will be generated. \n
		2101	/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will
		2102	/// be generated. \n
		2103	/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will
		2104	/// be generated. \n
		2105	/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will
		2106	/// be generated.
		/* Bit 2 of sel becomes the second __builtin_prefetch argument and
		 * bits [1:0] the third. */
		#define _mm_prefetch(a, sel) \
		  (__builtin_prefetch((const void *)(a), (((sel) >> 2) & 1), ((sel) & 3)))
		2109	#endif
		2110
		2111	/// Stores a 64-bit integer in the specified aligned memory location. To
		2112	/// minimize caching, the data is flagged as non-temporal (unlikely to be
		2113	/// used again soon).
		2114	///
		2115	/// \headerfile <x86intrin.h>
		2116	///
		2117	/// This intrinsic corresponds to the <c> MOVNTQ </c> instruction.
		2118	///
		2119	/// \param __p
		2120	/// A pointer to an aligned memory location used to store the register value.
		2121	/// \param __a
		2122	/// A 64-bit integer containing the value to be stored.
		2123	static __inline__ void __DEFAULT_FN_ATTRS_MMX
		2124	_mm_stream_pi(__m64 *__p, __m64 __a)
		2125	{
		2126	__builtin_ia32_movntq(__p, __a);
		2127	}
		2128
		2129	/// Moves packed float values from a 128-bit vector of [4 x float] to a
		2130	/// 128-bit aligned memory location. To minimize caching, the data is flagged
		2131	/// as non-temporal (unlikely to be used again soon).
		2132	///
		2133	/// \headerfile <x86intrin.h>
		2134	///
		2135	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
		2136	///
		2137	/// \param __p
		2138	/// A pointer to a 128-bit aligned memory location that will receive the
		2139	/// single-precision floating-point values.
		2140	/// \param __a
		2141	/// A 128-bit vector of [4 x float] containing the values to be moved.
		2142	static __inline__ void __DEFAULT_FN_ATTRS
		2143	_mm_stream_ps(float *__p, __m128 __a)
		2144	{
		2145	__builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
		2146	}
		2147
		#ifdef __cplusplus
		extern "C" {
		#endif

		/// Store fence: all store instructions issued before this call are
		/// completed before any store instruction issued after it is executed.
		///
		/// \headerfile <x86intrin.h>
		///
		/// This intrinsic corresponds to the <c> SFENCE </c> instruction.
		///
		/// Only declared here; the declaration is given C linkage when the
		/// header is compiled as C++.
		void _mm_sfence(void);

		#ifdef __cplusplus
		} // extern "C"
		#endif
		2166
		2167	/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and
		2168	/// returns it, as specified by the immediate integer operand.
		2169	///
		2170	/// \headerfile <x86intrin.h>
		2171	///
		2172	/// \code
		2173	/// int _mm_extract_pi16(__m64 a, int n);
		2174	/// \endcode
		2175	///
		2176	/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
		2177	///
		2178	/// \param a
		2179	/// A 64-bit vector of [4 x i16].
		2180	/// \param n
		2181	/// An immediate integer operand that determines which bits are extracted: \n
		2182	/// 0: Bits [15:0] are copied to the destination. \n
		2183	/// 1: Bits [31:16] are copied to the destination. \n
		2184	/// 2: Bits [47:32] are copied to the destination. \n
		2185	/// 3: Bits [63:48] are copied to the destination.
		2186	/// \returns A 16-bit integer containing the extracted 16 bits of packed data.
		/* Arguments are parenthesized so the casts apply to the whole argument
		 * expression rather than only to its first operand. */
		#define _mm_extract_pi16(a, n) \
		  ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)(a), (int)(n)))
		2189
		/// Copies data from the 64-bit vector of [4 x i16] to the destination,
		/// and inserts the lower 16-bits of an integer operand at the 16-bit offset
		/// specified by the immediate operand \a n.
		///
		/// \headerfile <x86intrin.h>
		///
		/// \code
		/// __m64 _mm_insert_pi16(__m64 a, int d, int n);
		/// \endcode
		///
		/// This intrinsic corresponds to the <c> PINSRW </c> instruction.
		///
		/// \param a
		/// A 64-bit vector of [4 x i16].
		/// \param d
		/// An integer. The lower 16-bit value from this operand is written to the
		/// destination at the offset specified by operand \a n.
		/// \param n
		/// An immediate integer operand that determines which bits of the
		/// destination receive the lower 16 bits of \a d: \n
		/// 0: Bits [15:0] of the destination are written. \n
		/// 1: Bits [31:16] of the destination are written. \n
		/// 2: Bits [47:32] of the destination are written. \n
		/// 3: Bits [63:48] of the destination are written. \n
		/// The remaining bits in the destination are copied from the corresponding
		/// bits in operand \a a.
		/// \returns A 64-bit integer vector containing the copied packed data from the
		/// operands.
		/* Arguments are parenthesized so the casts apply to the whole argument
		 * expression rather than only to its first operand. */
		#define _mm_insert_pi16(a, d, n) \
		  ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)(a), (int)(d), (int)(n)))
		2220
		2221	/// Compares each of the corresponding packed 16-bit integer values of
		2222	/// the 64-bit integer vectors, and writes the greater value to the
		2223	/// corresponding bits in the destination.
		2224	///
		2225	/// \headerfile <x86intrin.h>
		2226	///
		2227	/// This intrinsic corresponds to the <c> PMAXSW </c> instruction.
		2228	///
		2229	/// \param __a
		2230	/// A 64-bit integer vector containing one of the source operands.
		2231	/// \param __b
		2232	/// A 64-bit integer vector containing one of the source operands.
		2233	/// \returns A 64-bit integer vector containing the comparison results.
		2234	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		2235	_mm_max_pi16(__m64 __a, __m64 __b)
		2236	{
		2237	return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b);
		2238	}
		2239
		2240	/// Compares each of the corresponding packed 8-bit unsigned integer
		2241	/// values of the 64-bit integer vectors, and writes the greater value to the
		2242	/// corresponding bits in the destination.
		2243	///
		2244	/// \headerfile <x86intrin.h>
		2245	///
		2246	/// This intrinsic corresponds to the <c> PMAXUB </c> instruction.
		2247	///
		2248	/// \param __a
		2249	/// A 64-bit integer vector containing one of the source operands.
		2250	/// \param __b
		2251	/// A 64-bit integer vector containing one of the source operands.
		2252	/// \returns A 64-bit integer vector containing the comparison results.
		2253	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		2254	_mm_max_pu8(__m64 __a, __m64 __b)
		2255	{
		2256	return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b);
		2257	}
		2258
		2259	/// Compares each of the corresponding packed 16-bit integer values of
		2260	/// the 64-bit integer vectors, and writes the lesser value to the
		2261	/// corresponding bits in the destination.
		2262	///
		2263	/// \headerfile <x86intrin.h>
		2264	///
		2265	/// This intrinsic corresponds to the <c> PMINSW </c> instruction.
		2266	///
		2267	/// \param __a
		2268	/// A 64-bit integer vector containing one of the source operands.
		2269	/// \param __b
		2270	/// A 64-bit integer vector containing one of the source operands.
		2271	/// \returns A 64-bit integer vector containing the comparison results.
		2272	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		2273	_mm_min_pi16(__m64 __a, __m64 __b)
		2274	{
		2275	return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b);
		2276	}
		2277
		2278	/// Compares each of the corresponding packed 8-bit unsigned integer
		2279	/// values of the 64-bit integer vectors, and writes the lesser value to the
		2280	/// corresponding bits in the destination.
		2281	///
		2282	/// \headerfile <x86intrin.h>
		2283	///
		2284	/// This intrinsic corresponds to the <c> PMINUB </c> instruction.
		2285	///
		2286	/// \param __a
		2287	/// A 64-bit integer vector containing one of the source operands.
		2288	/// \param __b
		2289	/// A 64-bit integer vector containing one of the source operands.
		2290	/// \returns A 64-bit integer vector containing the comparison results.
		2291	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		2292	_mm_min_pu8(__m64 __a, __m64 __b)
		2293	{
		2294	return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b);
		2295	}
		2296
		2297	/// Takes the most significant bit from each 8-bit element in a 64-bit
		2298	/// integer vector to create an 8-bit mask value. Zero-extends the value to
		2299	/// 32-bit integer and writes it to the destination.
		2300	///
		2301	/// \headerfile <x86intrin.h>
		2302	///
		2303	/// This intrinsic corresponds to the <c> PMOVMSKB </c> instruction.
		2304	///
		2305	/// \param __a
		2306	/// A 64-bit integer vector containing the values with bits to be extracted.
		2307	/// \returns The most significant bit from each 8-bit element in \a __a,
		2308	/// written to bits [7:0].
		2309	static __inline__ int __DEFAULT_FN_ATTRS_MMX
		2310	_mm_movemask_pi8(__m64 __a)
		2311	{
		2312	return __builtin_ia32_pmovmskb((__v8qi)__a);
		2313	}
		2314
		2315	/// Multiplies packed 16-bit unsigned integer values and writes the
		2316	/// high-order 16 bits of each 32-bit product to the corresponding bits in
		2317	/// the destination.
		2318	///
		2319	/// \headerfile <x86intrin.h>
		2320	///
		2321	/// This intrinsic corresponds to the <c> PMULHUW </c> instruction.
		2322	///
		2323	/// \param __a
		2324	/// A 64-bit integer vector containing one of the source operands.
		2325	/// \param __b
		2326	/// A 64-bit integer vector containing one of the source operands.
		2327	/// \returns A 64-bit integer vector containing the products of both operands.
		2328	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		2329	_mm_mulhi_pu16(__m64 __a, __m64 __b)
		2330	{
			  /* View each operand as __v4hi before handing it to the PMULHUW builtin. */
			  __v4hi __lhs = (__v4hi)__a;
			  __v4hi __rhs = (__v4hi)__b;
			  return (__m64)__builtin_ia32_pmulhuw(__lhs, __rhs);
		2332	}
		2333
		2334	/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the
		2335	/// destination, as specified by the immediate value operand.
		2336	///
		2337	/// \headerfile <x86intrin.h>
		2338	///
		2339	/// \code
		2340	/// __m64 _mm_shuffle_pi16(__m64 a, const int n);
		2341	/// \endcode
		2342	///
		2343	/// This intrinsic corresponds to the <c> PSHUFW </c> instruction.
		2344	///
		2345	/// \param a
		2346	/// A 64-bit integer vector containing the values to be shuffled.
		2347	/// \param n
		2348	/// An immediate value containing an 8-bit value specifying which elements to
		2349	/// copy from \a a. The destinations within the 64-bit destination are
		2350	/// assigned values as follows: \n
		2351	/// Bits [1:0] are used to assign values to bits [15:0] in the
		2352	/// destination. \n
		2353	/// Bits [3:2] are used to assign values to bits [31:16] in the
		2354	/// destination. \n
		2355	/// Bits [5:4] are used to assign values to bits [47:32] in the
		2356	/// destination. \n
		2357	/// Bits [7:6] are used to assign values to bits [63:48] in the
		2358	/// destination. \n
		2359	/// Bit value assignments: \n
		2360	/// 00: assigned from bits [15:0] of \a a. \n
		2361	/// 01: assigned from bits [31:16] of \a a. \n
		2362	/// 10: assigned from bits [47:32] of \a a. \n
		2363	/// 11: assigned from bits [63:48] of \a a. \n
		2364	/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
		2365	/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
		2366	/// <c>[b6, b4, b2, b0]</c>.
		2367	/// \returns A 64-bit integer vector containing the shuffled values.
			/// \note Implemented as a macro: \a a is converted to __m64 and then to
			/// __v4hi, and \a n is handed to the builtin exactly as written.
		2368	#define _mm_shuffle_pi16(a, n) \
		2369	((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n)))
		2370
		2371	/// Conditionally copies the values from each 8-bit element in the first
		2372	/// 64-bit integer vector operand to the specified memory location, as
		2373	/// specified by the most significant bit in the corresponding element in the
		2374	/// second 64-bit integer vector operand.
		2375	///
		2376	/// To minimize caching, the data is flagged as non-temporal
		2377	/// (unlikely to be used again soon).
		2378	///
		2379	/// \headerfile <x86intrin.h>
		2380	///
		2381	/// This intrinsic corresponds to the <c> MASKMOVQ </c> instruction.
		2382	///
		2383	/// \param __d
		2384	/// A 64-bit integer vector containing the values with elements to be copied.
		2385	/// \param __n
		2386	/// A 64-bit integer vector operand. The most significant bit from each 8-bit
		2387	/// element determines whether the corresponding element in operand \a __d
		2388	/// is copied. If the most significant bit of a given element is 1, the
		2389	/// corresponding element in operand \a __d is copied.
		2390	/// \param __p
		2391	/// A pointer to a 64-bit memory location that will receive the conditionally
		2392	/// copied integer values. The address of the memory location does not have
		2393	/// to be aligned.
		2394	static __inline__ void __DEFAULT_FN_ATTRS_MMX
		2395	_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p)
		2396	{
			  /* Name the two vector operands by role: the data to copy and the
			   * per-element selector; the destination pointer is passed unchanged. */
			  __v8qi __data = (__v8qi)__d;
			  __v8qi __select = (__v8qi)__n;
			  __builtin_ia32_maskmovq(__data, __select, __p);
		2398	}
		2399
		2400	/// Computes the rounded averages of the packed unsigned 8-bit integer
		2401	/// values and writes the averages to the corresponding bits in the
		2402	/// destination.
		2403	///
		2404	/// \headerfile <x86intrin.h>
		2405	///
		2406	/// This intrinsic corresponds to the <c> PAVGB </c> instruction.
		2407	///
		2408	/// \param __a
		2409	/// A 64-bit integer vector containing one of the source operands.
		2410	/// \param __b
		2411	/// A 64-bit integer vector containing one of the source operands.
		2412	/// \returns A 64-bit integer vector containing the averages of both operands.
		2413	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		2414	_mm_avg_pu8(__m64 __a, __m64 __b)
		2415	{
			  /* View each operand as __v8qi before handing it to the PAVGB builtin. */
			  __v8qi __lhs = (__v8qi)__a;
			  __v8qi __rhs = (__v8qi)__b;
			  return (__m64)__builtin_ia32_pavgb(__lhs, __rhs);
		2417	}
		2418
		2419	/// Computes the rounded averages of the packed unsigned 16-bit integer
		2420	/// values and writes the averages to the corresponding bits in the
		2421	/// destination.
		2422	///
		2423	/// \headerfile <x86intrin.h>
		2424	///
		2425	/// This intrinsic corresponds to the <c> PAVGW </c> instruction.
		2426	///
		2427	/// \param __a
		2428	/// A 64-bit integer vector containing one of the source operands.
		2429	/// \param __b
		2430	/// A 64-bit integer vector containing one of the source operands.
		2431	/// \returns A 64-bit integer vector containing the averages of both operands.
		2432	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		2433	_mm_avg_pu16(__m64 __a, __m64 __b)
		2434	{
			  /* View each operand as __v4hi before handing it to the PAVGW builtin. */
			  __v4hi __lhs = (__v4hi)__a;
			  __v4hi __rhs = (__v4hi)__b;
			  return (__m64)__builtin_ia32_pavgw(__lhs, __rhs);
		2436	}
		2437
		2438	/// Subtracts the corresponding 8-bit unsigned integer values of the two
		2439	/// 64-bit vector operands and computes the absolute value of each
		2440	/// difference. The sum of the 8 absolute differences is then written to
		2441	/// bits [15:0] of the destination; the remaining bits [63:16] are cleared.
		2442	///
		2443	/// \headerfile <x86intrin.h>
		2444	///
		2445	/// This intrinsic corresponds to the <c> PSADBW </c> instruction.
		2446	///
		2447	/// \param __a
		2448	/// A 64-bit integer vector containing one of the source operands.
		2449	/// \param __b
		2450	/// A 64-bit integer vector containing one of the source operands.
		2451	/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the
		2452	/// sets of absolute differences between both operands. The upper bits are
		2453	/// cleared.
		2454	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		2455	_mm_sad_pu8(__m64 __a, __m64 __b)
		2456	{
			  /* View each operand as __v8qi before handing it to the PSADBW builtin. */
			  __v8qi __lhs = (__v8qi)__a;
			  __v8qi __rhs = (__v8qi)__b;
			  return (__m64)__builtin_ia32_psadbw(__lhs, __rhs);
		2458	}
		2459
		2460	#if defined(__cplusplus)
		2461	extern "C" {
		2462	#endif
		2463
		2464	/// Returns the contents of the MXCSR register as a 32-bit unsigned
		2465	/// integer value.
		2466	///
		2467	/// There are several groups of macros associated with this
		2468	/// intrinsic, including:
		2469	/// <ul>
		2470	/// <li>
		2471	/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
		2472	/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
		2473	/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
		2474	/// _MM_GET_EXCEPTION_STATE().
		2475	/// </li>
		2476	/// <li>
		2477	/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
		2478	/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
		2479	/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
		2480	/// </li>
		2481	/// <li>
		2482	/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
		2483	/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
		2484	/// _MM_GET_ROUNDING_MODE().
		2485	/// </li>
		2486	/// <li>
		2487	/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
		2488	/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
		2489	/// </li>
		2490	/// <li>
		2491	/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
		2492	/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
		2493	/// _MM_GET_DENORMALS_ZERO_MODE().
		2494	/// </li>
		2495	/// </ul>
		2496	///
		2497	/// For example, the following expression checks if an overflow exception has
		2498	/// occurred:
		2499	/// \code
		2500	/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW )
		2501	/// \endcode
		2502	///
		2503	/// The following expression gets the current rounding mode:
		2504	/// \code
		2505	/// _MM_GET_ROUNDING_MODE()
		2506	/// \endcode
		2507	///
		2508	/// \headerfile <x86intrin.h>
		2509	///
		2510	/// This intrinsic corresponds to the <c> VSTMXCSR / STMXCSR </c> instruction.
		2511	///
		2512	/// \returns A 32-bit unsigned integer containing the contents of the MXCSR
		2513	/// register.
		2514	unsigned int _mm_getcsr(void);
		2515
		2516	/// Sets the MXCSR register with the 32-bit unsigned integer value.
		2517	///
		2518	/// There are several groups of macros associated with this intrinsic,
		2519	/// including:
		2520	/// <ul>
		2521	/// <li>
		2522	/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
		2523	/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
		2524	/// _MM_EXCEPT_INEXACT. There is a convenience wrapper
		2525	/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
		2526	/// </li>
		2527	/// <li>
		2528	/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
		2529	/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
		2530	/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
		2531	/// of these macros.
		2532	/// </li>
		2533	/// <li>
		2534	/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
		2535	/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
		2536	/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
		2537	/// </li>
		2538	/// <li>
		2539	/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
		2540	/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
		2541	/// one of these macros.
		2542	/// </li>
		2543	/// <li>
		2544	/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
		2545	/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
		2546	/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
		2547	/// </li>
		2548	/// </ul>
		2549	///
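		2550	/// For example, the following expression causes subsequent floating-point
		2551	/// operations to round up:
			/// \code
		2552	/// _mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | _MM_ROUND_UP)
			/// \endcode
			/// The rounding-control bits are cleared first because OR-ing alone would
			/// combine _MM_ROUND_UP with any rounding bits that are already set.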
		2553	///
		2554	/// The following example sets the DAZ and FTZ flags:
		2555	/// \code
		2556	/// void setFlags() {
		2557	/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
		2558	/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
		2559	/// }
		2560	/// \endcode
		2561	///
		2562	/// \headerfile <x86intrin.h>
		2563	///
		2564	/// This intrinsic corresponds to the <c> VLDMXCSR / LDMXCSR </c> instruction.
		2565	///
		2566	/// \param __i
		2567	/// A 32-bit unsigned integer value to be written to the MXCSR register.
		2568	void _mm_setcsr(unsigned int __i);
		2569
		2570	#if defined(__cplusplus)
		2571	} // extern "C"
		2572	#endif
		2573
		2574	/// Selects 4 float values from the 128-bit operands of [4 x float], as
		2575	/// specified by the immediate value operand.
		2576	///
		2577	/// \headerfile <x86intrin.h>
		2578	///
		2579	/// \code
		2580	/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask);
		2581	/// \endcode
		2582	///
		2583	/// This intrinsic corresponds to the <c> VSHUFPS / SHUFPS </c> instruction.
		2584	///
		2585	/// \param a
		2586	/// A 128-bit vector of [4 x float].
		2587	/// \param b
		2588	/// A 128-bit vector of [4 x float].
		2589	/// \param mask
		2590	/// An immediate value containing an 8-bit value specifying which elements to
		2591	/// copy from \a a and \a b. \n
		2592	/// Bits [3:0] specify the values copied from operand \a a. \n
		2593	/// Bits [7:4] specify the values copied from operand \a b. \n
		2594	/// The destinations within the 128-bit destination are assigned values as
		2595	/// follows: \n
		2596	/// Bits [1:0] are used to assign values to bits [31:0] in the
		2597	/// destination. \n
		2598	/// Bits [3:2] are used to assign values to bits [63:32] in the
		2599	/// destination. \n
		2600	/// Bits [5:4] are used to assign values to bits [95:64] in the
		2601	/// destination. \n
		2602	/// Bits [7:6] are used to assign values to bits [127:96] in the
		2603	/// destination. \n
		2604	/// Bit value assignments: \n
		2605	/// 00: Bits [31:0] copied from the specified operand. \n
		2606	/// 01: Bits [63:32] copied from the specified operand. \n
		2607	/// 10: Bits [95:64] copied from the specified operand. \n
		2608	/// 11: Bits [127:96] copied from the specified operand. \n
		2609	/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
		2610	/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
		2611	/// <c>[b6, b4, b2, b0]</c>.
		2612	/// \returns A 128-bit vector of [4 x float] containing the shuffled values.
			/// \note Implemented as a macro: \a a and \a b are each converted to __m128
			/// and then to __v4sf, and \a mask is converted to int before the call.
		2613	#define _mm_shuffle_ps(a, b, mask) \
		2614	((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \
		2615	(int)(mask)))
		2616
		2617	/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
		2618	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
		2619	///
		2620	/// \headerfile <x86intrin.h>
		2621	///
		2622	/// This intrinsic corresponds to the <c> VUNPCKHPS / UNPCKHPS </c> instruction.
		2623	///
		2624	/// \param __a
		2625	/// A 128-bit vector of [4 x float]. \n
		2626	/// Bits [95:64] are written to bits [31:0] of the destination. \n
		2627	/// Bits [127:96] are written to bits [95:64] of the destination.
		2628	/// \param __b
		2629	/// A 128-bit vector of [4 x float]. \n
		2630	/// Bits [95:64] are written to bits [63:32] of the destination. \n
		2631	/// Bits [127:96] are written to bits [127:96] of the destination.
		2632	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
		2633	static __inline__ __m128 __DEFAULT_FN_ATTRS
		2634	_mm_unpackhi_ps(__m128 __a, __m128 __b)
		2635	{
			  __v4sf __lhs = (__v4sf)__a;
			  __v4sf __rhs = (__v4sf)__b;
			  /* Indices 0-3 pick from __lhs and 4-7 from __rhs, so the result is
			   * { a[2], b[2], a[3], b[3] }. */
			  return __builtin_shufflevector(__lhs, __rhs, 2, 6, 3, 7);
		2637	}
		2638
		2639	/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
		2640	/// [4 x float] and interleaves them into a 128-bit vector of [4 x float].
		2641	///
		2642	/// \headerfile <x86intrin.h>
		2643	///
		2644	/// This intrinsic corresponds to the <c> VUNPCKLPS / UNPCKLPS </c> instruction.
		2645	///
		2646	/// \param __a
		2647	/// A 128-bit vector of [4 x float]. \n
		2648	/// Bits [31:0] are written to bits [31:0] of the destination. \n
		2649	/// Bits [63:32] are written to bits [95:64] of the destination.
		2650	/// \param __b
		2651	/// A 128-bit vector of [4 x float]. \n
		2652	/// Bits [31:0] are written to bits [63:32] of the destination. \n
		2653	/// Bits [63:32] are written to bits [127:96] of the destination.
		2654	/// \returns A 128-bit vector of [4 x float] containing the interleaved values.
		2655	static __inline__ __m128 __DEFAULT_FN_ATTRS
		2656	_mm_unpacklo_ps(__m128 __a, __m128 __b)
		2657	{
			  __v4sf __lhs = (__v4sf)__a;
			  __v4sf __rhs = (__v4sf)__b;
			  /* Indices 0-3 pick from __lhs and 4-7 from __rhs, so the result is
			   * { a[0], b[0], a[1], b[1] }. */
			  return __builtin_shufflevector(__lhs, __rhs, 0, 4, 1, 5);
		2659	}
		2660
		2661	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
		2662	/// 32 bits are set to the lower 32 bits of the second parameter. The upper
		2663	/// 96 bits are set to the upper 96 bits of the first parameter.
		2664	///
		2665	/// \headerfile <x86intrin.h>
		2666	///
		2667	/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS / MOVSS </c>
		2668	/// instruction.
		2669	///
		2670	/// \param __a
		2671	/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are
		2672	/// written to the upper 96 bits of the result.
		2673	/// \param __b
		2674	/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are
		2675	/// written to the lower 32 bits of the result.
		2676	/// \returns A 128-bit floating-point vector of [4 x float].
		2677	static __inline__ __m128 __DEFAULT_FN_ATTRS
		2678	_mm_move_ss(__m128 __a, __m128 __b)
		2679	{
			  /* Start from a copy of __a; only element 0 is replaced, by element 0 of
			   * __b. */
			  __m128 __res = __a;
			  __res[0] = __b[0];
			  return __res;
		2682	}
		2683
		2684	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
		2685	/// 64 bits are set to the upper 64 bits of the second parameter. The upper
		2686	/// 64 bits are set to the upper 64 bits of the first parameter.
		2687	///
		2688	/// \headerfile <x86intrin.h>
		2689	///
		2690	/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
		2691	///
		2692	/// \param __a
		2693	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
		2694	/// written to the upper 64 bits of the result.
		2695	/// \param __b
		2696	/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are
		2697	/// written to the lower 64 bits of the result.
		2698	/// \returns A 128-bit floating-point vector of [4 x float].
		2699	static __inline__ __m128 __DEFAULT_FN_ATTRS
		2700	_mm_movehl_ps(__m128 __a, __m128 __b)
		2701	{
			  __v4sf __lhs = (__v4sf)__a;
			  __v4sf __rhs = (__v4sf)__b;
			  /* Indices 0-3 pick from __lhs and 4-7 from __rhs, so the result is
			   * { b[2], b[3], a[2], a[3] }. */
			  return __builtin_shufflevector(__lhs, __rhs, 6, 7, 2, 3);
		2703	}
		2704
		2705	/// Constructs a 128-bit floating-point vector of [4 x float]. The lower
		2706	/// 64 bits are set to the lower 64 bits of the first parameter. The upper
		2707	/// 64 bits are set to the lower 64 bits of the second parameter.
		2708	///
		2709	/// \headerfile <x86intrin.h>
		2710	///
		2711	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
		2712	///
		2713	/// \param __a
		2714	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
		2715	/// written to the lower 64 bits of the result.
		2716	/// \param __b
		2717	/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are
		2718	/// written to the upper 64 bits of the result.
		2719	/// \returns A 128-bit floating-point vector of [4 x float].
		2720	static __inline__ __m128 __DEFAULT_FN_ATTRS
		2721	_mm_movelh_ps(__m128 __a, __m128 __b)
		2722	{
			  __v4sf __lhs = (__v4sf)__a;
			  __v4sf __rhs = (__v4sf)__b;
			  /* Indices 0-3 pick from __lhs and 4-7 from __rhs, so the result is
			   * { a[0], a[1], b[0], b[1] }. */
			  return __builtin_shufflevector(__lhs, __rhs, 0, 1, 4, 5);
		2724	}
		2725
		2726	/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x
		2727	/// float].
		2728	///
		2729	/// \headerfile <x86intrin.h>
		2730	///
		2731	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
		2732	///
		2733	/// \param __a
		2734	/// A 64-bit vector of [4 x i16]. The elements of the destination are copied
		2735	/// from the corresponding elements in this operand.
		2736	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
		2737	/// values from the operand.
		2738	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
		2739	_mm_cvtpi16_ps(__m64 __a)
		2740	{
			  /* Result of comparing an all-zero vector against __a lane by lane; it is
			   * interleaved with __a below to widen each 16-bit lane to 32 bits.
			   * NOTE(review): presumably all-ones exactly where the lane is negative,
			   * i.e. the sign extension -- confirm against _mm_cmpgt_pi16. */
			  const __m64 __ext = _mm_cmpgt_pi16(_mm_setzero_si64(), __a);
			  const __m64 __hi_pair = _mm_unpackhi_pi16(__a, __ext);
			  const __m64 __lo_pair = _mm_unpacklo_pi16(__a, __ext);
			  /* Convert the high pair, duplicate it into the upper 64 bits with
			   * _mm_movelh_ps, then convert the low pair on top of that vector. */
			  __m128 __res = _mm_cvtpi32_ps(_mm_setzero_ps(), __hi_pair);
			  __res = _mm_movelh_ps(__res, __res);
			  return _mm_cvtpi32_ps(__res, __lo_pair);
		2754	}
		2755
		2756	/// Converts a 64-bit vector of 16-bit unsigned integer values into a
		2757	/// 128-bit vector of [4 x float].
		2758	///
		2759	/// \headerfile <x86intrin.h>
		2760	///
		2761	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
		2762	///
		2763	/// \param __a
		2764	/// A 64-bit vector of 16-bit unsigned integer values. The elements of the
		2765	/// destination are copied from the corresponding elements in this operand.
		2766	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
		2767	/// values from the operand.
		2768	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
		2769	_mm_cvtpu16_ps(__m64 __a)
		2770	{
			  /* Interleave __a with an all-zero vector so each 16-bit lane is widened
			   * to 32 bits with zero upper halves. */
			  const __m64 __zero = _mm_setzero_si64();
			  const __m64 __hi_pair = _mm_unpackhi_pi16(__a, __zero);
			  const __m64 __lo_pair = _mm_unpacklo_pi16(__a, __zero);
			  /* Convert the high pair, duplicate it into the upper 64 bits with
			   * _mm_movelh_ps, then convert the low pair on top of that vector. */
			  __m128 __res = _mm_cvtpi32_ps(_mm_setzero_ps(), __hi_pair);
			  __res = _mm_movelh_ps(__res, __res);
			  return _mm_cvtpi32_ps(__res, __lo_pair);
		2783	}
		2784
		2785	/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8]
		2786	/// into a 128-bit vector of [4 x float].
		2787	///
		2788	/// \headerfile <x86intrin.h>
		2789	///
		2790	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
		2791	///
		2792	/// \param __a
		2793	/// A 64-bit vector of [8 x i8]. The elements of the destination are copied
		2794	/// from the corresponding lower 4 elements in this operand.
		2795	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
		2796	/// values from the operand.
		2797	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
		2798	_mm_cvtpi8_ps(__m64 __a)
		2799	{
			  /* Compare an all-zero vector against __a byte by byte, interleave the
			   * low bytes of __a with that result, and reuse the 16-bit conversion.
			   * NOTE(review): the compare result presumably acts as the sign
			   * extension -- confirm against _mm_cmpgt_pi8. */
			  const __m64 __ext = _mm_cmpgt_pi8(_mm_setzero_si64(), __a);
			  return _mm_cvtpi16_ps(_mm_unpacklo_pi8(__a, __ext));
		2807	}
		2808
		2809	/// Converts the lower four unsigned 8-bit integer values from a 64-bit
		2810	/// vector of [8 x u8] into a 128-bit vector of [4 x float].
		2811	///
		2812	/// \headerfile <x86intrin.h>
		2813	///
		2814	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
		2815	///
		2816	/// \param __a
		2817	/// A 64-bit vector of unsigned 8-bit integer values. The elements of the
		2818	/// destination are copied from the corresponding lower 4 elements in this
		2819	/// operand.
		2820	/// \returns A 128-bit vector of [4 x float] containing the copied and converted
		2821	/// values from the source operand.
		2822	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
		2823	_mm_cvtpu8_ps(__m64 __a)
		2824	{
			  /* Interleave the low bytes of __a with zero bytes, then reuse the
			   * 16-bit conversion on the widened lanes. */
			  const __m64 __widened = _mm_unpacklo_pi8(__a, _mm_setzero_si64());
			  return _mm_cvtpi16_ps(__widened);
		2831	}
		2832
		2833	/// Converts the two 32-bit signed integer values from each 64-bit vector
		2834	/// operand of [2 x i32] into a 128-bit vector of [4 x float].
		2835	///
		2836	/// \headerfile <x86intrin.h>
		2837	///
		2838	/// This intrinsic corresponds to the <c> CVTPI2PS + COMPOSITE </c> instruction.
		2839	///
		2840	/// \param __a
		2841	/// A 64-bit vector of [2 x i32]. The lower elements of the destination are
		2842	/// copied from the elements in this operand.
		2843	/// \param __b
		2844	/// A 64-bit vector of [2 x i32]. The upper elements of the destination are
		2845	/// copied from the elements in this operand.
		2846	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
		2847	/// copied and converted values from the first operand. The upper 64 bits
		2848	/// contain the copied and converted values from the second operand.
		2849	static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX
		2850	_mm_cvtpi32x2_ps(__m64 __a, __m64 __b)
		2851	{
			  /* Convert __b first and copy it into the upper 64 bits with
			   * _mm_movelh_ps; converting __a afterwards supplies the lower half. */
			  __m128 __res = _mm_cvtpi32_ps(_mm_setzero_ps(), __b);
			  __res = _mm_movelh_ps(__res, __res);
			  return _mm_cvtpi32_ps(__res, __a);
		2859	}
		2860
		2861	/// Converts each single-precision floating-point element of a 128-bit
		2862	/// floating-point vector of [4 x float] into a 16-bit signed integer, and
		2863	/// packs the results into a 64-bit integer vector of [4 x i16].
		2864	///
		2865	/// If the floating-point element is NaN or infinity, or if the
		2866	/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000,
		2867	/// it is converted to 0x8000. Otherwise if the floating-point element is
		2868	/// greater than 0x7FFF, it is converted to 0x7FFF.
		2869	///
		2870	/// \headerfile <x86intrin.h>
		2871	///
		2872	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
		2873	///
		2874	/// \param __a
		2875	/// A 128-bit floating-point vector of [4 x float].
		2876	/// \returns A 64-bit integer vector of [4 x i16] containing the converted
		2877	/// values.
		2878	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		2879	_mm_cvtps_pi16(__m128 __a)
		2880	{
			  /* Convert __a as given, then again after _mm_movehl_ps has copied its
			   * upper 64 bits into the lower 64 bits; pack the two results together. */
			  const __m64 __lo_pair = _mm_cvtps_pi32(__a);
			  const __m64 __hi_pair = _mm_cvtps_pi32(_mm_movehl_ps(__a, __a));
			  return _mm_packs_pi32(__lo_pair, __hi_pair);
		2888	}
		2889
		2890	/// Converts each single-precision floating-point element of a 128-bit
		2891	/// floating-point vector of [4 x float] into an 8-bit signed integer, and
		2892	/// packs the results into the lower 32 bits of a 64-bit integer vector of
		2893	/// [8 x i8]. The upper 32 bits of the vector are set to 0.
		2894	///
		2895	/// If the floating-point element is NaN or infinity, or if the
		2896	/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it
		2897	/// is converted to 0x80. Otherwise if the floating-point element is greater
		2898	/// than 0x7F, it is converted to 0x7F.
		2899	///
		2900	/// \headerfile <x86intrin.h>
		2901	///
		2902	/// This intrinsic corresponds to the <c> CVTPS2PI + COMPOSITE </c> instruction.
		2903	///
		2904	/// \param __a
		2905	/// A 128-bit floating-point vector of [4 x float].
		2906	/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the
		2907	/// converted values and the upper 32 bits are set to zero.
		2908	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
		2909	_mm_cvtps_pi8(__m128 __a)
		2910	{
			  /* Reuse the 16-bit conversion, then pack it with an all-zero vector as
			   * the second operand. */
			  const __m64 __words = _mm_cvtps_pi16(__a);
			  return _mm_packs_pi16(__words, _mm_setzero_si64());
		2917	}
		2918
		2919	/// Extracts the sign bits from each single-precision floating-point
		2920	/// element of a 128-bit floating-point vector of [4 x float] and returns the
		2921	/// sign bits in bits [3:0] of the result. Bits [31:4] of the result are set
		2922	/// to zero.
		2923	///
		2924	/// \headerfile <x86intrin.h>
		2925	///
		2926	/// This intrinsic corresponds to the <c> VMOVMSKPS / MOVMSKPS </c> instruction.
		2927	///
		2928	/// \param __a
		2929	/// A 128-bit floating-point vector of [4 x float].
		2930	/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each
		2931	/// single-precision floating-point element of the parameter. Bits [31:4] are
		2932	/// set to zero.
		2933	static __inline__ int __DEFAULT_FN_ATTRS
		2934	_mm_movemask_ps(__m128 __a)
		2935	{
			  /* The builtin takes the operand as __v4sf and yields the int mask directly. */
			  __v4sf __lanes = (__v4sf)__a;
			  return __builtin_ia32_movmskps(__lanes);
		2937	}
		2938
		2939
		2940	#define _MM_ALIGN16 __attribute__((aligned(16)))
		2941
		2942	#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) \| ((y) << 4) \| ((x) << 2) \| (w))
		2943
		2944	#define _MM_EXCEPT_INVALID (0x0001U)
		2945	#define _MM_EXCEPT_DENORM (0x0002U)
		2946	#define _MM_EXCEPT_DIV_ZERO (0x0004U)
		2947	#define _MM_EXCEPT_OVERFLOW (0x0008U)
		2948	#define _MM_EXCEPT_UNDERFLOW (0x0010U)
		2949	#define _MM_EXCEPT_INEXACT (0x0020U)
		2950	#define _MM_EXCEPT_MASK (0x003fU)
		2951
		2952	#define _MM_MASK_INVALID (0x0080U)
		2953	#define _MM_MASK_DENORM (0x0100U)
		2954	#define _MM_MASK_DIV_ZERO (0x0200U)
		2955	#define _MM_MASK_OVERFLOW (0x0400U)
		2956	#define _MM_MASK_UNDERFLOW (0x0800U)
		2957	#define _MM_MASK_INEXACT (0x1000U)
		2958	#define _MM_MASK_MASK (0x1f80U)
		2959
		2960	#define _MM_ROUND_NEAREST (0x0000U)
		2961	#define _MM_ROUND_DOWN (0x2000U)
		2962	#define _MM_ROUND_UP (0x4000U)
		2963	#define _MM_ROUND_TOWARD_ZERO (0x6000U)
		2964	#define _MM_ROUND_MASK (0x6000U)
		2965
		2966	#define _MM_FLUSH_ZERO_MASK (0x8000U)
		2967	#define _MM_FLUSH_ZERO_ON (0x8000U)
		2968	#define _MM_FLUSH_ZERO_OFF (0x0000U)
		2969
		2970	#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
		2971	#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
		2972	#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
		2973	#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)
		2974
		2975	#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) \| (x)))
		2976	#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) \| (x)))
		2977	#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) \| (x)))
		2978	#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) \| (x)))
		2979
		2980	#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
		2981	do { \
		2982	__m128 tmp3, tmp2, tmp1, tmp0; \
		2983	tmp0 = _mm_unpacklo_ps((row0), (row1)); \
		2984	tmp2 = _mm_unpacklo_ps((row2), (row3)); \
		2985	tmp1 = _mm_unpackhi_ps((row0), (row1)); \
		2986	tmp3 = _mm_unpackhi_ps((row2), (row3)); \
		2987	(row0) = _mm_movelh_ps(tmp0, tmp2); \
		2988	(row1) = _mm_movehl_ps(tmp2, tmp0); \
		2989	(row2) = _mm_movelh_ps(tmp1, tmp3); \
		2990	(row3) = _mm_movehl_ps(tmp3, tmp1); \
		2991	} while (0)
		2992
		2993	/* Aliases for compatibility: each _m_<mnemonic> name below expands to the
			 * correspondingly named _mm_ intrinsic. */
		2994	#define _m_pextrw _mm_extract_pi16
		2995	#define _m_pinsrw _mm_insert_pi16
		2996	#define _m_pmaxsw _mm_max_pi16
		2997	#define _m_pmaxub _mm_max_pu8
		2998	#define _m_pminsw _mm_min_pi16
		2999	#define _m_pminub _mm_min_pu8
		3000	#define _m_pmovmskb _mm_movemask_pi8
		3001	#define _m_pmulhuw _mm_mulhi_pu16
		3002	#define _m_pshufw _mm_shuffle_pi16
		3003	#define _m_maskmovq _mm_maskmove_si64
		3004	#define _m_pavgb _mm_avg_pu8
		3005	#define _m_pavgw _mm_avg_pu16
		3006	#define _m_psadbw _mm_sad_pu8
			/* NOTE(review): as an object-like macro this only replaces the bare
			 * identifier _m_; it does not rewrite the prefix of longer names --
			 * confirm this is intended. */
		3007	#define _m_ _mm_
		3008
		3009	#undef __DEFAULT_FN_ATTRS
		3010	#undef __DEFAULT_FN_ATTRS_MMX
		3011
		3012	/* Backwards-compatibility hack (matches gcc): when __SSE2__ is defined,
			 * including this header also pulls in <emmintrin.h>, except while the
			 * _Builtin_intrinsics module is being built. */
		3013	#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics)
		3014	#include <emmintrin.h>
		3015	#endif
		3016
		3017	#endif /* __XMMINTRIN_H */

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite/llvm-build/x86_64/lib/clang/16/include/xmmintrin.h – Rev 14