WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – /llvm-build/x86_64/lib/clang/16/include/emmintrin.h

Rev	Author	Line No.	Line
14	pmbaty	1	/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
		2	*
		3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		4	* See https://llvm.org/LICENSE.txt for license information.
		5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		6	*
		7	*===-----------------------------------------------------------------------===
		8	*/
		9
		10	#ifndef __EMMINTRIN_H
		11	#define __EMMINTRIN_H
		12
		13	#if !defined(__i386__) && !defined(__x86_64__)
		14	#error "This header is only meant to be used on x86 and x64 architecture"
		15	#endif
		16
		17	#include <xmmintrin.h>
		18
		19	typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16)));
		20	typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16)));
		21
		22	typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1)));
		23	typedef long long __m128i_u
		24	__attribute__((__vector_size__(16), __aligned__(1)));
		25
		26	/* Type defines. */
		27	typedef double __v2df __attribute__((__vector_size__(16)));
		28	typedef long long __v2di __attribute__((__vector_size__(16)));
		29	typedef short __v8hi __attribute__((__vector_size__(16)));
		30	typedef char __v16qi __attribute__((__vector_size__(16)));
		31
		32	/* Unsigned types */
		33	typedef unsigned long long __v2du __attribute__((__vector_size__(16)));
		34	typedef unsigned short __v8hu __attribute__((__vector_size__(16)));
		35	typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
		36
		37	/* We need an explicitly signed variant for char. Note that this shouldn't
		38	* appear in the interface though. */
		39	typedef signed char __v16qs __attribute__((__vector_size__(16)));
		40
		41	#ifdef __SSE2__
		42	/* Both _Float16 and __bf16 require SSE2 being enabled. */
		43	typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
		44	typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
		45	typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
		46
		47	typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
		48	typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
		49	#endif
		50
		51	/* Define the default attributes for the functions in this file. */
		52	#define __DEFAULT_FN_ATTRS \
		53	__attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
		54	__min_vector_width__(128)))
		55	#define __DEFAULT_FN_ATTRS_MMX \
		56	__attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), \
		57	__min_vector_width__(64)))
		58
		59	/// Adds lower double-precision values in both operands and returns the
		60	/// sum in the lower 64 bits of the result. The upper 64 bits of the result
		61	/// are copied from the upper double-precision value of the first operand.
		62	///
		63	/// \headerfile <x86intrin.h>
		64	///
		65	/// This intrinsic corresponds to the <c> VADDSD / ADDSD </c> instruction.
		66	///
		67	/// \param __a
		68	/// A 128-bit vector of [2 x double] containing one of the source operands.
		69	/// \param __b
		70	/// A 128-bit vector of [2 x double] containing one of the source operands.
		71	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
		72	/// sum of the lower 64 bits of both operands. The upper 64 bits are copied
		73	/// from the upper 64 bits of the first source operand.
		74	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_sd(__m128d __a,
		75	__m128d __b) {
		76	__a[0] += __b[0];
		77	return __a;
		78	}
		79
		80	/// Adds two 128-bit vectors of [2 x double].
		81	///
		82	/// \headerfile <x86intrin.h>
		83	///
		84	/// This intrinsic corresponds to the <c> VADDPD / ADDPD </c> instruction.
		85	///
		86	/// \param __a
		87	/// A 128-bit vector of [2 x double] containing one of the source operands.
		88	/// \param __b
		89	/// A 128-bit vector of [2 x double] containing one of the source operands.
		90	/// \returns A 128-bit vector of [2 x double] containing the sums of both
		91	/// operands.
		92	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_add_pd(__m128d __a,
		93	__m128d __b) {
		94	return (__m128d)((__v2df)__a + (__v2df)__b);
		95	}
		96
		97	/// Subtracts the lower double-precision value of the second operand
		98	/// from the lower double-precision value of the first operand and returns
		99	/// the difference in the lower 64 bits of the result. The upper 64 bits of
		100	/// the result are copied from the upper double-precision value of the first
		101	/// operand.
		102	///
		103	/// \headerfile <x86intrin.h>
		104	///
		105	/// This intrinsic corresponds to the <c> VSUBSD / SUBSD </c> instruction.
		106	///
		107	/// \param __a
		108	/// A 128-bit vector of [2 x double] containing the minuend.
		109	/// \param __b
		110	/// A 128-bit vector of [2 x double] containing the subtrahend.
		111	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
		112	/// difference of the lower 64 bits of both operands. The upper 64 bits are
		113	/// copied from the upper 64 bits of the first source operand.
		114	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_sd(__m128d __a,
		115	__m128d __b) {
		116	__a[0] -= __b[0];
		117	return __a;
		118	}
		119
		120	/// Subtracts two 128-bit vectors of [2 x double].
		121	///
		122	/// \headerfile <x86intrin.h>
		123	///
		124	/// This intrinsic corresponds to the <c> VSUBPD / SUBPD </c> instruction.
		125	///
		126	/// \param __a
		127	/// A 128-bit vector of [2 x double] containing the minuend.
		128	/// \param __b
		129	/// A 128-bit vector of [2 x double] containing the subtrahend.
		130	/// \returns A 128-bit vector of [2 x double] containing the differences between
		131	/// both operands.
		132	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sub_pd(__m128d __a,
		133	__m128d __b) {
		134	return (__m128d)((__v2df)__a - (__v2df)__b);
		135	}
		136
		137	/// Multiplies lower double-precision values in both operands and returns
		138	/// the product in the lower 64 bits of the result. The upper 64 bits of the
		139	/// result are copied from the upper double-precision value of the first
		140	/// operand.
		141	///
		142	/// \headerfile <x86intrin.h>
		143	///
		144	/// This intrinsic corresponds to the <c> VMULSD / MULSD </c> instruction.
		145	///
		146	/// \param __a
		147	/// A 128-bit vector of [2 x double] containing one of the source operands.
		148	/// \param __b
		149	/// A 128-bit vector of [2 x double] containing one of the source operands.
		150	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
		151	/// product of the lower 64 bits of both operands. The upper 64 bits are
		152	/// copied from the upper 64 bits of the first source operand.
		153	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_sd(__m128d __a,
		154	__m128d __b) {
		155	__a[0] *= __b[0];
		156	return __a;
		157	}
		158
		159	/// Multiplies two 128-bit vectors of [2 x double].
		160	///
		161	/// \headerfile <x86intrin.h>
		162	///
		163	/// This intrinsic corresponds to the <c> VMULPD / MULPD </c> instruction.
		164	///
		165	/// \param __a
		166	/// A 128-bit vector of [2 x double] containing one of the operands.
		167	/// \param __b
		168	/// A 128-bit vector of [2 x double] containing one of the operands.
		169	/// \returns A 128-bit vector of [2 x double] containing the products of both
		170	/// operands.
		171	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_mul_pd(__m128d __a,
		172	__m128d __b) {
		173	return (__m128d)((__v2df)__a * (__v2df)__b);
		174	}
		175
		176	/// Divides the lower double-precision value of the first operand by the
		177	/// lower double-precision value of the second operand and returns the
		178	/// quotient in the lower 64 bits of the result. The upper 64 bits of the
		179	/// result are copied from the upper double-precision value of the first
		180	/// operand.
		181	///
		182	/// \headerfile <x86intrin.h>
		183	///
		184	/// This intrinsic corresponds to the <c> VDIVSD / DIVSD </c> instruction.
		185	///
		186	/// \param __a
		187	/// A 128-bit vector of [2 x double] containing the dividend.
		188	/// \param __b
		189	/// A 128-bit vector of [2 x double] containing divisor.
		190	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
		191	/// quotient of the lower 64 bits of both operands. The upper 64 bits are
		192	/// copied from the upper 64 bits of the first source operand.
		193	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_sd(__m128d __a,
		194	__m128d __b) {
		195	__a[0] /= __b[0];
		196	return __a;
		197	}
		198
		199	/// Performs an element-by-element division of two 128-bit vectors of
		200	/// [2 x double].
		201	///
		202	/// \headerfile <x86intrin.h>
		203	///
		204	/// This intrinsic corresponds to the <c> VDIVPD / DIVPD </c> instruction.
		205	///
		206	/// \param __a
		207	/// A 128-bit vector of [2 x double] containing the dividend.
		208	/// \param __b
		209	/// A 128-bit vector of [2 x double] containing the divisor.
		210	/// \returns A 128-bit vector of [2 x double] containing the quotients of both
		211	/// operands.
		212	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_div_pd(__m128d __a,
		213	__m128d __b) {
		214	return (__m128d)((__v2df)__a / (__v2df)__b);
		215	}
		216
		217	/// Calculates the square root of the lower double-precision value of
		218	/// the second operand and returns it in the lower 64 bits of the result.
		219	/// The upper 64 bits of the result are copied from the upper
		220	/// double-precision value of the first operand.
		221	///
		222	/// \headerfile <x86intrin.h>
		223	///
		224	/// This intrinsic corresponds to the <c> VSQRTSD / SQRTSD </c> instruction.
		225	///
		226	/// \param __a
		227	/// A 128-bit vector of [2 x double] containing one of the operands. The
		228	/// upper 64 bits of this operand are copied to the upper 64 bits of the
		229	/// result.
		230	/// \param __b
		231	/// A 128-bit vector of [2 x double] containing one of the operands. The
		232	/// square root is calculated using the lower 64 bits of this operand.
		233	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
		234	/// square root of the lower 64 bits of operand \a __b, and whose upper 64
		235	/// bits are copied from the upper 64 bits of operand \a __a.
		236	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_sd(__m128d __a,
		237	__m128d __b) {
		238	__m128d __c = __builtin_ia32_sqrtsd((__v2df)__b);
		239	return __extension__(__m128d){__c[0], __a[1]};
		240	}
		241
		242	/// Calculates the square root of the each of two values stored in a
		243	/// 128-bit vector of [2 x double].
		244	///
		245	/// \headerfile <x86intrin.h>
		246	///
		247	/// This intrinsic corresponds to the <c> VSQRTPD / SQRTPD </c> instruction.
		248	///
		249	/// \param __a
		250	/// A 128-bit vector of [2 x double].
		251	/// \returns A 128-bit vector of [2 x double] containing the square roots of the
		252	/// values in the operand.
		253	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_sqrt_pd(__m128d __a) {
		254	return __builtin_ia32_sqrtpd((__v2df)__a);
		255	}
		256
		257	/// Compares lower 64-bit double-precision values of both operands, and
		258	/// returns the lesser of the pair of values in the lower 64-bits of the
		259	/// result. The upper 64 bits of the result are copied from the upper
		260	/// double-precision value of the first operand.
		261	///
		262	/// \headerfile <x86intrin.h>
		263	///
		264	/// This intrinsic corresponds to the <c> VMINSD / MINSD </c> instruction.
		265	///
		266	/// \param __a
		267	/// A 128-bit vector of [2 x double] containing one of the operands. The
		268	/// lower 64 bits of this operand are used in the comparison.
		269	/// \param __b
		270	/// A 128-bit vector of [2 x double] containing one of the operands. The
		271	/// lower 64 bits of this operand are used in the comparison.
		272	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
		273	/// minimum value between both operands. The upper 64 bits are copied from
		274	/// the upper 64 bits of the first source operand.
		275	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_sd(__m128d __a,
		276	__m128d __b) {
		277	return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b);
		278	}
		279
		280	/// Performs element-by-element comparison of the two 128-bit vectors of
		281	/// [2 x double] and returns the vector containing the lesser of each pair of
		282	/// values.
		283	///
		284	/// \headerfile <x86intrin.h>
		285	///
		286	/// This intrinsic corresponds to the <c> VMINPD / MINPD </c> instruction.
		287	///
		288	/// \param __a
		289	/// A 128-bit vector of [2 x double] containing one of the operands.
		290	/// \param __b
		291	/// A 128-bit vector of [2 x double] containing one of the operands.
		292	/// \returns A 128-bit vector of [2 x double] containing the minimum values
		293	/// between both operands.
		294	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_min_pd(__m128d __a,
		295	__m128d __b) {
		296	return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b);
		297	}
		298
		299	/// Compares lower 64-bit double-precision values of both operands, and
		300	/// returns the greater of the pair of values in the lower 64-bits of the
		301	/// result. The upper 64 bits of the result are copied from the upper
		302	/// double-precision value of the first operand.
		303	///
		304	/// \headerfile <x86intrin.h>
		305	///
		306	/// This intrinsic corresponds to the <c> VMAXSD / MAXSD </c> instruction.
		307	///
		308	/// \param __a
		309	/// A 128-bit vector of [2 x double] containing one of the operands. The
		310	/// lower 64 bits of this operand are used in the comparison.
		311	/// \param __b
		312	/// A 128-bit vector of [2 x double] containing one of the operands. The
		313	/// lower 64 bits of this operand are used in the comparison.
		314	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
		315	/// maximum value between both operands. The upper 64 bits are copied from
		316	/// the upper 64 bits of the first source operand.
		317	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_sd(__m128d __a,
		318	__m128d __b) {
		319	return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b);
		320	}
		321
		322	/// Performs element-by-element comparison of the two 128-bit vectors of
		323	/// [2 x double] and returns the vector containing the greater of each pair
		324	/// of values.
		325	///
		326	/// \headerfile <x86intrin.h>
		327	///
		328	/// This intrinsic corresponds to the <c> VMAXPD / MAXPD </c> instruction.
		329	///
		330	/// \param __a
		331	/// A 128-bit vector of [2 x double] containing one of the operands.
		332	/// \param __b
		333	/// A 128-bit vector of [2 x double] containing one of the operands.
		334	/// \returns A 128-bit vector of [2 x double] containing the maximum values
		335	/// between both operands.
		336	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_max_pd(__m128d __a,
		337	__m128d __b) {
		338	return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b);
		339	}
		340
		341	/// Performs a bitwise AND of two 128-bit vectors of [2 x double].
		342	///
		343	/// \headerfile <x86intrin.h>
		344	///
		345	/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
		346	///
		347	/// \param __a
		348	/// A 128-bit vector of [2 x double] containing one of the source operands.
		349	/// \param __b
		350	/// A 128-bit vector of [2 x double] containing one of the source operands.
		351	/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
		352	/// values between both operands.
		353	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_and_pd(__m128d __a,
		354	__m128d __b) {
		355	return (__m128d)((__v2du)__a & (__v2du)__b);
		356	}
		357
		358	/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using
		359	/// the one's complement of the values contained in the first source operand.
		360	///
		361	/// \headerfile <x86intrin.h>
		362	///
		363	/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
		364	///
		365	/// \param __a
		366	/// A 128-bit vector of [2 x double] containing the left source operand. The
		367	/// one's complement of this value is used in the bitwise AND.
		368	/// \param __b
		369	/// A 128-bit vector of [2 x double] containing the right source operand.
		370	/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the
		371	/// values in the second operand and the one's complement of the first
		372	/// operand.
		373	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_andnot_pd(__m128d __a,
		374	__m128d __b) {
		375	return (__m128d)(~(__v2du)__a & (__v2du)__b);
		376	}
		377
		378	/// Performs a bitwise OR of two 128-bit vectors of [2 x double].
		379	///
		380	/// \headerfile <x86intrin.h>
		381	///
		382	/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
		383	///
		384	/// \param __a
		385	/// A 128-bit vector of [2 x double] containing one of the source operands.
		386	/// \param __b
		387	/// A 128-bit vector of [2 x double] containing one of the source operands.
		388	/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the
		389	/// values between both operands.
		390	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_or_pd(__m128d __a,
		391	__m128d __b) {
		392	return (__m128d)((__v2du)__a \| (__v2du)__b);
		393	}
		394
		395	/// Performs a bitwise XOR of two 128-bit vectors of [2 x double].
		396	///
		397	/// \headerfile <x86intrin.h>
		398	///
		399	/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
		400	///
		401	/// \param __a
		402	/// A 128-bit vector of [2 x double] containing one of the source operands.
		403	/// \param __b
		404	/// A 128-bit vector of [2 x double] containing one of the source operands.
		405	/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the
		406	/// values between both operands.
		407	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_xor_pd(__m128d __a,
		408	__m128d __b) {
		409	return (__m128d)((__v2du)__a ^ (__v2du)__b);
		410	}
		411
		412	/// Compares each of the corresponding double-precision values of the
		413	/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0
		414	/// for false, 0xFFFFFFFFFFFFFFFF for true.
		415	///
		416	/// \headerfile <x86intrin.h>
		417	///
		418	/// This intrinsic corresponds to the <c> VCMPEQPD / CMPEQPD </c> instruction.
		419	///
		420	/// \param __a
		421	/// A 128-bit vector of [2 x double].
		422	/// \param __b
		423	/// A 128-bit vector of [2 x double].
		424	/// \returns A 128-bit vector containing the comparison results.
		425	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_pd(__m128d __a,
		426	__m128d __b) {
		427	return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b);
		428	}
		429
		430	/// Compares each of the corresponding double-precision values of the
		431	/// 128-bit vectors of [2 x double] to determine if the values in the first
		432	/// operand are less than those in the second operand. Each comparison
		433	/// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		434	///
		435	/// \headerfile <x86intrin.h>
		436	///
		437	/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
		438	///
		439	/// \param __a
		440	/// A 128-bit vector of [2 x double].
		441	/// \param __b
		442	/// A 128-bit vector of [2 x double].
		443	/// \returns A 128-bit vector containing the comparison results.
		444	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_pd(__m128d __a,
		445	__m128d __b) {
		446	return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b);
		447	}
		448
		449	/// Compares each of the corresponding double-precision values of the
		450	/// 128-bit vectors of [2 x double] to determine if the values in the first
		451	/// operand are less than or equal to those in the second operand.
		452	///
		453	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		454	///
		455	/// \headerfile <x86intrin.h>
		456	///
		457	/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
		458	///
		459	/// \param __a
		460	/// A 128-bit vector of [2 x double].
		461	/// \param __b
		462	/// A 128-bit vector of [2 x double].
		463	/// \returns A 128-bit vector containing the comparison results.
		464	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_pd(__m128d __a,
		465	__m128d __b) {
		466	return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b);
		467	}
		468
		469	/// Compares each of the corresponding double-precision values of the
		470	/// 128-bit vectors of [2 x double] to determine if the values in the first
		471	/// operand are greater than those in the second operand.
		472	///
		473	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		474	///
		475	/// \headerfile <x86intrin.h>
		476	///
		477	/// This intrinsic corresponds to the <c> VCMPLTPD / CMPLTPD </c> instruction.
		478	///
		479	/// \param __a
		480	/// A 128-bit vector of [2 x double].
		481	/// \param __b
		482	/// A 128-bit vector of [2 x double].
		483	/// \returns A 128-bit vector containing the comparison results.
		484	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_pd(__m128d __a,
		485	__m128d __b) {
		486	return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a);
		487	}
		488
		489	/// Compares each of the corresponding double-precision values of the
		490	/// 128-bit vectors of [2 x double] to determine if the values in the first
		491	/// operand are greater than or equal to those in the second operand.
		492	///
		493	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		494	///
		495	/// \headerfile <x86intrin.h>
		496	///
		497	/// This intrinsic corresponds to the <c> VCMPLEPD / CMPLEPD </c> instruction.
		498	///
		499	/// \param __a
		500	/// A 128-bit vector of [2 x double].
		501	/// \param __b
		502	/// A 128-bit vector of [2 x double].
		503	/// \returns A 128-bit vector containing the comparison results.
		504	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_pd(__m128d __a,
		505	__m128d __b) {
		506	return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a);
		507	}
		508
		509	/// Compares each of the corresponding double-precision values of the
		510	/// 128-bit vectors of [2 x double] to determine if the values in the first
		511	/// operand are ordered with respect to those in the second operand.
		512	///
		513	/// A pair of double-precision values are "ordered" with respect to each
		514	/// other if neither value is a NaN. Each comparison yields 0x0 for false,
		515	/// 0xFFFFFFFFFFFFFFFF for true.
		516	///
		517	/// \headerfile <x86intrin.h>
		518	///
		519	/// This intrinsic corresponds to the <c> VCMPORDPD / CMPORDPD </c> instruction.
		520	///
		521	/// \param __a
		522	/// A 128-bit vector of [2 x double].
		523	/// \param __b
		524	/// A 128-bit vector of [2 x double].
		525	/// \returns A 128-bit vector containing the comparison results.
		526	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_pd(__m128d __a,
		527	__m128d __b) {
		528	return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b);
		529	}
		530
		531	/// Compares each of the corresponding double-precision values of the
		532	/// 128-bit vectors of [2 x double] to determine if the values in the first
		533	/// operand are unordered with respect to those in the second operand.
		534	///
		535	/// A pair of double-precision values are "unordered" with respect to each
		536	/// other if one or both values are NaN. Each comparison yields 0x0 for
		537	/// false, 0xFFFFFFFFFFFFFFFF for true.
		538	///
		539	/// \headerfile <x86intrin.h>
		540	///
		541	/// This intrinsic corresponds to the <c> VCMPUNORDPD / CMPUNORDPD </c>
		542	/// instruction.
		543	///
		544	/// \param __a
		545	/// A 128-bit vector of [2 x double].
		546	/// \param __b
		547	/// A 128-bit vector of [2 x double].
		548	/// \returns A 128-bit vector containing the comparison results.
		549	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_pd(__m128d __a,
		550	__m128d __b) {
		551	return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b);
		552	}
		553
		554	/// Compares each of the corresponding double-precision values of the
		555	/// 128-bit vectors of [2 x double] to determine if the values in the first
		556	/// operand are unequal to those in the second operand.
		557	///
		558	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		559	///
		560	/// \headerfile <x86intrin.h>
		561	///
		562	/// This intrinsic corresponds to the <c> VCMPNEQPD / CMPNEQPD </c> instruction.
		563	///
		564	/// \param __a
		565	/// A 128-bit vector of [2 x double].
		566	/// \param __b
		567	/// A 128-bit vector of [2 x double].
		568	/// \returns A 128-bit vector containing the comparison results.
		569	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_pd(__m128d __a,
		570	__m128d __b) {
		571	return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b);
		572	}
		573
		574	/// Compares each of the corresponding double-precision values of the
		575	/// 128-bit vectors of [2 x double] to determine if the values in the first
		576	/// operand are not less than those in the second operand.
		577	///
		578	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		579	///
		580	/// \headerfile <x86intrin.h>
		581	///
		582	/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
		583	///
		584	/// \param __a
		585	/// A 128-bit vector of [2 x double].
		586	/// \param __b
		587	/// A 128-bit vector of [2 x double].
		588	/// \returns A 128-bit vector containing the comparison results.
		589	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_pd(__m128d __a,
		590	__m128d __b) {
		591	return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b);
		592	}
		593
		594	/// Compares each of the corresponding double-precision values of the
		595	/// 128-bit vectors of [2 x double] to determine if the values in the first
		596	/// operand are not less than or equal to those in the second operand.
		597	///
		598	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		599	///
		600	/// \headerfile <x86intrin.h>
		601	///
		602	/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
		603	///
		604	/// \param __a
		605	/// A 128-bit vector of [2 x double].
		606	/// \param __b
		607	/// A 128-bit vector of [2 x double].
		608	/// \returns A 128-bit vector containing the comparison results.
		609	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_pd(__m128d __a,
		610	__m128d __b) {
		611	return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b);
		612	}
		613
		614	/// Compares each of the corresponding double-precision values of the
		615	/// 128-bit vectors of [2 x double] to determine if the values in the first
		616	/// operand are not greater than those in the second operand.
		617	///
		618	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		619	///
		620	/// \headerfile <x86intrin.h>
		621	///
		622	/// This intrinsic corresponds to the <c> VCMPNLTPD / CMPNLTPD </c> instruction.
		623	///
		624	/// \param __a
		625	/// A 128-bit vector of [2 x double].
		626	/// \param __b
		627	/// A 128-bit vector of [2 x double].
		628	/// \returns A 128-bit vector containing the comparison results.
		629	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_pd(__m128d __a,
		630	__m128d __b) {
		631	return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a);
		632	}
		633
		634	/// Compares each of the corresponding double-precision values of the
		635	/// 128-bit vectors of [2 x double] to determine if the values in the first
		636	/// operand are not greater than or equal to those in the second operand.
		637	///
		638	/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		639	///
		640	/// \headerfile <x86intrin.h>
		641	///
		642	/// This intrinsic corresponds to the <c> VCMPNLEPD / CMPNLEPD </c> instruction.
		643	///
		644	/// \param __a
		645	/// A 128-bit vector of [2 x double].
		646	/// \param __b
		647	/// A 128-bit vector of [2 x double].
		648	/// \returns A 128-bit vector containing the comparison results.
		649	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_pd(__m128d __a,
		650	__m128d __b) {
		651	return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a);
		652	}
		653
		654	/// Compares the lower double-precision floating-point values in each of
		655	/// the two 128-bit floating-point vectors of [2 x double] for equality.
		656	///
		657	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		658	///
		659	/// \headerfile <x86intrin.h>
		660	///
		661	/// This intrinsic corresponds to the <c> VCMPEQSD / CMPEQSD </c> instruction.
		662	///
		663	/// \param __a
		664	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		665	/// compared to the lower double-precision value of \a __b.
		666	/// \param __b
		667	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		668	/// compared to the lower double-precision value of \a __a.
		669	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		670	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		671	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpeq_sd(__m128d __a,
		672	__m128d __b) {
		673	return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b);
		674	}
		675
		676	/// Compares the lower double-precision floating-point values in each of
		677	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		678	/// the value in the first parameter is less than the corresponding value in
		679	/// the second parameter.
		680	///
		681	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		682	///
		683	/// \headerfile <x86intrin.h>
		684	///
		685	/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
		686	///
		687	/// \param __a
		688	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		689	/// compared to the lower double-precision value of \a __b.
		690	/// \param __b
		691	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		692	/// compared to the lower double-precision value of \a __a.
		693	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		694	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		695	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmplt_sd(__m128d __a,
		696	__m128d __b) {
		697	return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b);
		698	}
		699
		700	/// Compares the lower double-precision floating-point values in each of
		701	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		702	/// the value in the first parameter is less than or equal to the
		703	/// corresponding value in the second parameter.
		704	///
		705	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		706	///
		707	/// \headerfile <x86intrin.h>
		708	///
		709	/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
		710	///
		711	/// \param __a
		712	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		713	/// compared to the lower double-precision value of \a __b.
		714	/// \param __b
		715	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		716	/// compared to the lower double-precision value of \a __a.
		717	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		718	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		719	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmple_sd(__m128d __a,
		720	__m128d __b) {
		721	return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b);
		722	}
		723
		724	/// Compares the lower double-precision floating-point values in each of
		725	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		726	/// the value in the first parameter is greater than the corresponding value
		727	/// in the second parameter.
		728	///
		729	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		730	///
		731	/// \headerfile <x86intrin.h>
		732	///
		733	/// This intrinsic corresponds to the <c> VCMPLTSD / CMPLTSD </c> instruction.
		734	///
		735	/// \param __a
		736	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		737	/// compared to the lower double-precision value of \a __b.
		738	/// \param __b
		739	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		740	/// compared to the lower double-precision value of \a __a.
		741	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		742	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		743	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpgt_sd(__m128d __a,
		744	__m128d __b) {
		745	__m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a);
		746	return __extension__(__m128d){__c[0], __a[1]};
		747	}
		748
		749	/// Compares the lower double-precision floating-point values in each of
		750	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		751	/// the value in the first parameter is greater than or equal to the
		752	/// corresponding value in the second parameter.
		753	///
		754	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		755	///
		756	/// \headerfile <x86intrin.h>
		757	///
		758	/// This intrinsic corresponds to the <c> VCMPLESD / CMPLESD </c> instruction.
		759	///
		760	/// \param __a
		761	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		762	/// compared to the lower double-precision value of \a __b.
		763	/// \param __b
		764	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		765	/// compared to the lower double-precision value of \a __a.
		766	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		767	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		768	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpge_sd(__m128d __a,
		769	__m128d __b) {
		770	__m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a);
		771	return __extension__(__m128d){__c[0], __a[1]};
		772	}
		773
		774	/// Compares the lower double-precision floating-point values in each of
		775	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		776	/// the value in the first parameter is "ordered" with respect to the
		777	/// corresponding value in the second parameter.
		778	///
		779	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
		780	/// of double-precision values are "ordered" with respect to each other if
		781	/// neither value is a NaN.
		782	///
		783	/// \headerfile <x86intrin.h>
		784	///
		785	/// This intrinsic corresponds to the <c> VCMPORDSD / CMPORDSD </c> instruction.
		786	///
		787	/// \param __a
		788	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		789	/// compared to the lower double-precision value of \a __b.
		790	/// \param __b
		791	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		792	/// compared to the lower double-precision value of \a __a.
		793	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		794	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		795	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpord_sd(__m128d __a,
		796	__m128d __b) {
		797	return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b);
		798	}
		799
		800	/// Compares the lower double-precision floating-point values in each of
		801	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		802	/// the value in the first parameter is "unordered" with respect to the
		803	/// corresponding value in the second parameter.
		804	///
		805	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair
		806	/// of double-precision values are "unordered" with respect to each other if
		807	/// one or both values are NaN.
		808	///
		809	/// \headerfile <x86intrin.h>
		810	///
		811	/// This intrinsic corresponds to the <c> VCMPUNORDSD / CMPUNORDSD </c>
		812	/// instruction.
		813	///
		814	/// \param __a
		815	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		816	/// compared to the lower double-precision value of \a __b.
		817	/// \param __b
		818	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		819	/// compared to the lower double-precision value of \a __a.
		820	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		821	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		822	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpunord_sd(__m128d __a,
		823	__m128d __b) {
		824	return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b);
		825	}
		826
		827	/// Compares the lower double-precision floating-point values in each of
		828	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		829	/// the value in the first parameter is unequal to the corresponding value in
		830	/// the second parameter.
		831	///
		832	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		833	///
		834	/// \headerfile <x86intrin.h>
		835	///
		836	/// This intrinsic corresponds to the <c> VCMPNEQSD / CMPNEQSD </c> instruction.
		837	///
		838	/// \param __a
		839	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		840	/// compared to the lower double-precision value of \a __b.
		841	/// \param __b
		842	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		843	/// compared to the lower double-precision value of \a __a.
		844	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		845	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		846	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpneq_sd(__m128d __a,
		847	__m128d __b) {
		848	return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b);
		849	}
		850
		851	/// Compares the lower double-precision floating-point values in each of
		852	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		853	/// the value in the first parameter is not less than the corresponding
		854	/// value in the second parameter.
		855	///
		856	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		857	///
		858	/// \headerfile <x86intrin.h>
		859	///
		860	/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
		861	///
		862	/// \param __a
		863	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		864	/// compared to the lower double-precision value of \a __b.
		865	/// \param __b
		866	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		867	/// compared to the lower double-precision value of \a __a.
		868	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		869	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		870	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnlt_sd(__m128d __a,
		871	__m128d __b) {
		872	return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b);
		873	}
		874
		875	/// Compares the lower double-precision floating-point values in each of
		876	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		877	/// the value in the first parameter is not less than or equal to the
		878	/// corresponding value in the second parameter.
		879	///
		880	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		881	///
		882	/// \headerfile <x86intrin.h>
		883	///
		884	/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
		885	///
		886	/// \param __a
		887	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		888	/// compared to the lower double-precision value of \a __b.
		889	/// \param __b
		890	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		891	/// compared to the lower double-precision value of \a __a.
		892	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		893	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		894	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnle_sd(__m128d __a,
		895	__m128d __b) {
		896	return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b);
		897	}
		898
		899	/// Compares the lower double-precision floating-point values in each of
		900	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		901	/// the value in the first parameter is not greater than the corresponding
		902	/// value in the second parameter.
		903	///
		904	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		905	///
		906	/// \headerfile <x86intrin.h>
		907	///
		908	/// This intrinsic corresponds to the <c> VCMPNLTSD / CMPNLTSD </c> instruction.
		909	///
		910	/// \param __a
		911	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		912	/// compared to the lower double-precision value of \a __b.
		913	/// \param __b
		914	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		915	/// compared to the lower double-precision value of \a __a.
		916	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		917	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		918	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpngt_sd(__m128d __a,
		919	__m128d __b) {
		920	__m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a);
		921	return __extension__(__m128d){__c[0], __a[1]};
		922	}
		923
		924	/// Compares the lower double-precision floating-point values in each of
		925	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		926	/// the value in the first parameter is not greater than or equal to the
		927	/// corresponding value in the second parameter.
		928	///
		929	/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true.
		930	///
		931	/// \headerfile <x86intrin.h>
		932	///
		933	/// This intrinsic corresponds to the <c> VCMPNLESD / CMPNLESD </c> instruction.
		934	///
		935	/// \param __a
		936	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		937	/// compared to the lower double-precision value of \a __b.
		938	/// \param __b
		939	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		940	/// compared to the lower double-precision value of \a __a.
		941	/// \returns A 128-bit vector. The lower 64 bits contains the comparison
		942	/// results. The upper 64 bits are copied from the upper 64 bits of \a __a.
		943	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cmpnge_sd(__m128d __a,
		944	__m128d __b) {
		945	__m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a);
		946	return __extension__(__m128d){__c[0], __a[1]};
		947	}
		948
		949	/// Compares the lower double-precision floating-point values in each of
		950	/// the two 128-bit floating-point vectors of [2 x double] for equality.
		951	///
		952	/// The comparison yields 0 for false, 1 for true. If either of the two
		953	/// lower double-precision values is NaN, 0 is returned.
		954	///
		955	/// \headerfile <x86intrin.h>
		956	///
		957	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
		958	///
		959	/// \param __a
		960	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		961	/// compared to the lower double-precision value of \a __b.
		962	/// \param __b
		963	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		964	/// compared to the lower double-precision value of \a __a.
		965	/// \returns An integer containing the comparison results. If either of the two
		966	/// lower double-precision values is NaN, 0 is returned.
		967	static __inline__ int __DEFAULT_FN_ATTRS _mm_comieq_sd(__m128d __a,
		968	__m128d __b) {
		969	return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b);
		970	}
		971
		972	/// Compares the lower double-precision floating-point values in each of
		973	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		974	/// the value in the first parameter is less than the corresponding value in
		975	/// the second parameter.
		976	///
		977	/// The comparison yields 0 for false, 1 for true. If either of the two
		978	/// lower double-precision values is NaN, 0 is returned.
		979	///
		980	/// \headerfile <x86intrin.h>
		981	///
		982	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
		983	///
		984	/// \param __a
		985	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		986	/// compared to the lower double-precision value of \a __b.
		987	/// \param __b
		988	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		989	/// compared to the lower double-precision value of \a __a.
		990	/// \returns An integer containing the comparison results. If either of the two
		991	/// lower double-precision values is NaN, 0 is returned.
		992	static __inline__ int __DEFAULT_FN_ATTRS _mm_comilt_sd(__m128d __a,
		993	__m128d __b) {
		994	return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b);
		995	}
		996
		997	/// Compares the lower double-precision floating-point values in each of
		998	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		999	/// the value in the first parameter is less than or equal to the
		1000	/// corresponding value in the second parameter.
		1001	///
		1002	/// The comparison yields 0 for false, 1 for true. If either of the two
		1003	/// lower double-precision values is NaN, 0 is returned.
		1004	///
		1005	/// \headerfile <x86intrin.h>
		1006	///
		1007	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
		1008	///
		1009	/// \param __a
		1010	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1011	/// compared to the lower double-precision value of \a __b.
		1012	/// \param __b
		1013	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1014	/// compared to the lower double-precision value of \a __a.
		1015	/// \returns An integer containing the comparison results. If either of the two
		1016	/// lower double-precision values is NaN, 0 is returned.
		1017	static __inline__ int __DEFAULT_FN_ATTRS _mm_comile_sd(__m128d __a,
		1018	__m128d __b) {
		1019	return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b);
		1020	}
		1021
		1022	/// Compares the lower double-precision floating-point values in each of
		1023	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		1024	/// the value in the first parameter is greater than the corresponding value
		1025	/// in the second parameter.
		1026	///
		1027	/// The comparison yields 0 for false, 1 for true. If either of the two
		1028	/// lower double-precision values is NaN, 0 is returned.
		1029	///
		1030	/// \headerfile <x86intrin.h>
		1031	///
		1032	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
		1033	///
		1034	/// \param __a
		1035	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1036	/// compared to the lower double-precision value of \a __b.
		1037	/// \param __b
		1038	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1039	/// compared to the lower double-precision value of \a __a.
		1040	/// \returns An integer containing the comparison results. If either of the two
		1041	/// lower double-precision values is NaN, 0 is returned.
		1042	static __inline__ int __DEFAULT_FN_ATTRS _mm_comigt_sd(__m128d __a,
		1043	__m128d __b) {
		1044	return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b);
		1045	}
		1046
		1047	/// Compares the lower double-precision floating-point values in each of
		1048	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		1049	/// the value in the first parameter is greater than or equal to the
		1050	/// corresponding value in the second parameter.
		1051	///
		1052	/// The comparison yields 0 for false, 1 for true. If either of the two
		1053	/// lower double-precision values is NaN, 0 is returned.
		1054	///
		1055	/// \headerfile <x86intrin.h>
		1056	///
		1057	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
		1058	///
		1059	/// \param __a
		1060	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1061	/// compared to the lower double-precision value of \a __b.
		1062	/// \param __b
		1063	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1064	/// compared to the lower double-precision value of \a __a.
		1065	/// \returns An integer containing the comparison results. If either of the two
		1066	/// lower double-precision values is NaN, 0 is returned.
		1067	static __inline__ int __DEFAULT_FN_ATTRS _mm_comige_sd(__m128d __a,
		1068	__m128d __b) {
		1069	return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b);
		1070	}
		1071
		1072	/// Compares the lower double-precision floating-point values in each of
		1073	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		1074	/// the value in the first parameter is unequal to the corresponding value in
		1075	/// the second parameter.
		1076	///
		1077	/// The comparison yields 0 for false, 1 for true. If either of the two
		1078	/// lower double-precision values is NaN, 1 is returned.
		1079	///
		1080	/// \headerfile <x86intrin.h>
		1081	///
		1082	/// This intrinsic corresponds to the <c> VCOMISD / COMISD </c> instruction.
		1083	///
		1084	/// \param __a
		1085	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1086	/// compared to the lower double-precision value of \a __b.
		1087	/// \param __b
		1088	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1089	/// compared to the lower double-precision value of \a __a.
		1090	/// \returns An integer containing the comparison results. If either of the two
		1091	/// lower double-precision values is NaN, 1 is returned.
		1092	static __inline__ int __DEFAULT_FN_ATTRS _mm_comineq_sd(__m128d __a,
		1093	__m128d __b) {
		1094	return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b);
		1095	}
		1096
		1097	/// Compares the lower double-precision floating-point values in each of
		1098	/// the two 128-bit floating-point vectors of [2 x double] for equality. The
		1099	/// comparison yields 0 for false, 1 for true.
		1100	///
		1101	/// If either of the two lower double-precision values is NaN, 0 is returned.
		1102	///
		1103	/// \headerfile <x86intrin.h>
		1104	///
		1105	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
		1106	///
		1107	/// \param __a
		1108	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1109	/// compared to the lower double-precision value of \a __b.
		1110	/// \param __b
		1111	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1112	/// compared to the lower double-precision value of \a __a.
		1113	/// \returns An integer containing the comparison results. If either of the two
		1114	/// lower double-precision values is NaN, 0 is returned.
		1115	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomieq_sd(__m128d __a,
		1116	__m128d __b) {
		1117	return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b);
		1118	}
		1119
		1120	/// Compares the lower double-precision floating-point values in each of
		1121	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		1122	/// the value in the first parameter is less than the corresponding value in
		1123	/// the second parameter.
		1124	///
		1125	/// The comparison yields 0 for false, 1 for true. If either of the two lower
		1126	/// double-precision values is NaN, 0 is returned.
		1127	///
		1128	/// \headerfile <x86intrin.h>
		1129	///
		1130	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
		1131	///
		1132	/// \param __a
		1133	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1134	/// compared to the lower double-precision value of \a __b.
		1135	/// \param __b
		1136	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1137	/// compared to the lower double-precision value of \a __a.
		1138	/// \returns An integer containing the comparison results. If either of the two
		1139	/// lower double-precision values is NaN, 0 is returned.
		1140	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomilt_sd(__m128d __a,
		1141	__m128d __b) {
		1142	return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b);
		1143	}
		1144
		1145	/// Compares the lower double-precision floating-point values in each of
		1146	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		1147	/// the value in the first parameter is less than or equal to the
		1148	/// corresponding value in the second parameter.
		1149	///
		1150	/// The comparison yields 0 for false, 1 for true. If either of the two lower
		1151	/// double-precision values is NaN, 0 is returned.
		1152	///
		1153	/// \headerfile <x86intrin.h>
		1154	///
		1155	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
		1156	///
		1157	/// \param __a
		1158	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1159	/// compared to the lower double-precision value of \a __b.
		1160	/// \param __b
		1161	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1162	/// compared to the lower double-precision value of \a __a.
		1163	/// \returns An integer containing the comparison results. If either of the two
		1164	/// lower double-precision values is NaN, 0 is returned.
		1165	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomile_sd(__m128d __a,
		1166	__m128d __b) {
		1167	return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b);
		1168	}
		1169
		1170	/// Compares the lower double-precision floating-point values in each of
		1171	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		1172	/// the value in the first parameter is greater than the corresponding value
		1173	/// in the second parameter.
		1174	///
		1175	/// The comparison yields 0 for false, 1 for true. If either of the two lower
		1176	/// double-precision values is NaN, 0 is returned.
		1177	///
		1178	/// \headerfile <x86intrin.h>
		1179	///
		1180	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
		1181	///
		1182	/// \param __a
		1183	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1184	/// compared to the lower double-precision value of \a __b.
		1185	/// \param __b
		1186	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1187	/// compared to the lower double-precision value of \a __a.
		1188	/// \returns An integer containing the comparison results. If either of the two
		1189	/// lower double-precision values is NaN, 0 is returned.
		1190	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomigt_sd(__m128d __a,
		1191	__m128d __b) {
		1192	return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b);
		1193	}
		1194
		1195	/// Compares the lower double-precision floating-point values in each of
		1196	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		1197	/// the value in the first parameter is greater than or equal to the
		1198	/// corresponding value in the second parameter.
		1199	///
		1200	/// The comparison yields 0 for false, 1 for true. If either of the two
		1201	/// lower double-precision values is NaN, 0 is returned.
		1202	///
		1203	/// \headerfile <x86intrin.h>
		1204	///
		1205	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
		1206	///
		1207	/// \param __a
		1208	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1209	/// compared to the lower double-precision value of \a __b.
		1210	/// \param __b
		1211	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1212	/// compared to the lower double-precision value of \a __a.
		1213	/// \returns An integer containing the comparison results. If either of the two
		1214	/// lower double-precision values is NaN, 0 is returned.
		1215	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomige_sd(__m128d __a,
		1216	__m128d __b) {
		1217	return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b);
		1218	}
		1219
		1220	/// Compares the lower double-precision floating-point values in each of
		1221	/// the two 128-bit floating-point vectors of [2 x double] to determine if
		1222	/// the value in the first parameter is unequal to the corresponding value in
		1223	/// the second parameter.
		1224	///
		1225	/// The comparison yields 0 for false, 1 for true. If either of the two lower
		1226	/// double-precision values is NaN, 1 is returned.
		1227	///
		1228	/// \headerfile <x86intrin.h>
		1229	///
		1230	/// This intrinsic corresponds to the <c> VUCOMISD / UCOMISD </c> instruction.
		1231	///
		1232	/// \param __a
		1233	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1234	/// compared to the lower double-precision value of \a __b.
		1235	/// \param __b
		1236	/// A 128-bit vector of [2 x double]. The lower double-precision value is
		1237	/// compared to the lower double-precision value of \a __a.
		1238	/// \returns An integer containing the comparison result. If either of the two
		1239	/// lower double-precision values is NaN, 1 is returned.
		1240	static __inline__ int __DEFAULT_FN_ATTRS _mm_ucomineq_sd(__m128d __a,
		1241	__m128d __b) {
		1242	return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b);
		1243	}
		1244
		1245	/// Converts the two double-precision floating-point elements of a
		1246	/// 128-bit vector of [2 x double] into two single-precision floating-point
		1247	/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float].
		1248	/// The upper 64 bits of the result vector are set to zero.
		1249	///
		1250	/// \headerfile <x86intrin.h>
		1251	///
		1252	/// This intrinsic corresponds to the <c> VCVTPD2PS / CVTPD2PS </c> instruction.
		1253	///
		1254	/// \param __a
		1255	/// A 128-bit vector of [2 x double].
		1256	/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the
		1257	/// converted values. The upper 64 bits are set to zero.
		1258	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtpd_ps(__m128d __a) {
		1259	return __builtin_ia32_cvtpd2ps((__v2df)__a);
		1260	}
		1261
		1262	/// Converts the lower two single-precision floating-point elements of a
		1263	/// 128-bit vector of [4 x float] into two double-precision floating-point
		1264	/// values, returned in a 128-bit vector of [2 x double]. The upper two
		1265	/// elements of the input vector are unused.
		1266	///
		1267	/// \headerfile <x86intrin.h>
		1268	///
		1269	/// This intrinsic corresponds to the <c> VCVTPS2PD / CVTPS2PD </c> instruction.
		1270	///
		1271	/// \param __a
		1272	/// A 128-bit vector of [4 x float]. The lower two single-precision
		1273	/// floating-point elements are converted to double-precision values. The
		1274	/// upper two elements are unused.
		1275	/// \returns A 128-bit vector of [2 x double] containing the converted values.
		1276	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtps_pd(__m128 __a) {
		1277	return (__m128d) __builtin_convertvector(
		1278	__builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df);
		1279	}
		1280
		1281	/// Converts the lower two integer elements of a 128-bit vector of
		1282	/// [4 x i32] into two double-precision floating-point values, returned in a
		1283	/// 128-bit vector of [2 x double].
		1284	///
		1285	/// The upper two elements of the input vector are unused.
		1286	///
		1287	/// \headerfile <x86intrin.h>
		1288	///
		1289	/// This intrinsic corresponds to the <c> VCVTDQ2PD / CVTDQ2PD </c> instruction.
		1290	///
		1291	/// \param __a
		1292	/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are
		1293	/// converted to double-precision values.
		1294	///
		1295	/// The upper two elements are unused.
		1296	/// \returns A 128-bit vector of [2 x double] containing the converted values.
		1297	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtepi32_pd(__m128i __a) {
		1298	return (__m128d) __builtin_convertvector(
		1299	__builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df);
		1300	}
		1301
		1302	/// Converts the two double-precision floating-point elements of a
		1303	/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
		1304	/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper
		1305	/// 64 bits of the result vector are set to zero.
		1306	///
		1307	/// \headerfile <x86intrin.h>
		1308	///
		1309	/// This intrinsic corresponds to the <c> VCVTPD2DQ / CVTPD2DQ </c> instruction.
		1310	///
		1311	/// \param __a
		1312	/// A 128-bit vector of [2 x double].
		1313	/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
		1314	/// converted values. The upper 64 bits are set to zero.
		1315	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtpd_epi32(__m128d __a) {
		1316	return __builtin_ia32_cvtpd2dq((__v2df)__a);
		1317	}
		1318
		1319	/// Converts the low-order element of a 128-bit vector of [2 x double]
		1320	/// into a 32-bit signed integer value.
		1321	///
		1322	/// \headerfile <x86intrin.h>
		1323	///
		1324	/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
		1325	///
		1326	/// \param __a
		1327	/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
		1328	/// conversion.
		1329	/// \returns A 32-bit signed integer containing the converted value.
		1330	static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsd_si32(__m128d __a) {
		1331	return __builtin_ia32_cvtsd2si((__v2df)__a);
		1332	}
		1333
		1334	/// Converts the lower double-precision floating-point element of a
		1335	/// 128-bit vector of [2 x double], in the second parameter, into a
		1336	/// single-precision floating-point value, returned in the lower 32 bits of a
		1337	/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are
		1338	/// copied from the upper 96 bits of the first parameter.
		1339	///
		1340	/// \headerfile <x86intrin.h>
		1341	///
		1342	/// This intrinsic corresponds to the <c> VCVTSD2SS / CVTSD2SS </c> instruction.
		1343	///
		1344	/// \param __a
		1345	/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are
		1346	/// copied to the upper 96 bits of the result.
		1347	/// \param __b
		1348	/// A 128-bit vector of [2 x double]. The lower double-precision
		1349	/// floating-point element is used in the conversion.
		1350	/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the
		1351	/// converted value from the second parameter. The upper 96 bits are copied
		1352	/// from the upper 96 bits of the first parameter.
		1353	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtsd_ss(__m128 __a,
		1354	__m128d __b) {
		1355	return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b);
		1356	}
		1357
		1358	/// Converts a 32-bit signed integer value, in the second parameter, into
		1359	/// a double-precision floating-point value, returned in the lower 64 bits of
		1360	/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
		1361	/// are copied from the upper 64 bits of the first parameter.
		1362	///
		1363	/// \headerfile <x86intrin.h>
		1364	///
		1365	/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
		1366	///
		1367	/// \param __a
		1368	/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
		1369	/// copied to the upper 64 bits of the result.
		1370	/// \param __b
		1371	/// A 32-bit signed integer containing the value to be converted.
		1372	/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
		1373	/// converted value from the second parameter. The upper 64 bits are copied
		1374	/// from the upper 64 bits of the first parameter.
		1375	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi32_sd(__m128d __a,
		1376	int __b) {
		1377	__a[0] = __b;
		1378	return __a;
		1379	}
		1380
		1381	/// Converts the lower single-precision floating-point element of a
		1382	/// 128-bit vector of [4 x float], in the second parameter, into a
		1383	/// double-precision floating-point value, returned in the lower 64 bits of
		1384	/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector
		1385	/// are copied from the upper 64 bits of the first parameter.
		1386	///
		1387	/// \headerfile <x86intrin.h>
		1388	///
		1389	/// This intrinsic corresponds to the <c> VCVTSS2SD / CVTSS2SD </c> instruction.
		1390	///
		1391	/// \param __a
		1392	/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are
		1393	/// copied to the upper 64 bits of the result.
		1394	/// \param __b
		1395	/// A 128-bit vector of [4 x float]. The lower single-precision
		1396	/// floating-point element is used in the conversion.
		1397	/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the
		1398	/// converted value from the second parameter. The upper 64 bits are copied
		1399	/// from the upper 64 bits of the first parameter.
		1400	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtss_sd(__m128d __a,
		1401	__m128 __b) {
		1402	__a[0] = __b[0];
		1403	return __a;
		1404	}
		1405
		1406	/// Converts the two double-precision floating-point elements of a
		1407	/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
		1408	/// returned in the lower 64 bits of a 128-bit vector of [4 x i32].
		1409	///
		1410	/// If the result of either conversion is inexact, the result is truncated
		1411	/// (rounded towards zero) regardless of the current MXCSR setting. The upper
		1412	/// 64 bits of the result vector are set to zero.
		1413	///
		1414	/// \headerfile <x86intrin.h>
		1415	///
		1416	/// This intrinsic corresponds to the <c> VCVTTPD2DQ / CVTTPD2DQ </c>
		1417	/// instruction.
		1418	///
		1419	/// \param __a
		1420	/// A 128-bit vector of [2 x double].
		1421	/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the
		1422	/// converted values. The upper 64 bits are set to zero.
		1423	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttpd_epi32(__m128d __a) {
		1424	return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a);
		1425	}
		1426
		1427	/// Converts the low-order element of a [2 x double] vector into a 32-bit
		1428	/// signed integer value, truncating the result when it is inexact.
		1429	///
		1430	/// \headerfile <x86intrin.h>
		1431	///
		1432	/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
		1433	/// instruction.
		1434	///
		1435	/// \param __a
		1436	/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
		1437	/// conversion.
		1438	/// \returns A 32-bit signed integer containing the converted value.
		1439	static __inline__ int __DEFAULT_FN_ATTRS _mm_cvttsd_si32(__m128d __a) {
		1440	return __builtin_ia32_cvttsd2si((__v2df)__a);
		1441	}
		1442
		1443	/// Converts the two double-precision floating-point elements of a
		1444	/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
		1445	/// returned in a 64-bit vector of [2 x i32].
		1446	///
		1447	/// \headerfile <x86intrin.h>
		1448	///
		1449	/// This intrinsic corresponds to the <c> CVTPD2PI </c> instruction.
		1450	///
		1451	/// \param __a
		1452	/// A 128-bit vector of [2 x double].
		1453	/// \returns A 64-bit vector of [2 x i32] containing the converted values.
		1454	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvtpd_pi32(__m128d __a) {
		1455	return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a);
		1456	}
		1457
		1458	/// Converts the two double-precision floating-point elements of a
		1459	/// 128-bit vector of [2 x double] into two signed 32-bit integer values,
		1460	/// returned in a 64-bit vector of [2 x i32].
		1461	///
		1462	/// If the result of either conversion is inexact, the result is truncated
		1463	/// (rounded towards zero) regardless of the current MXCSR setting.
		1464	///
		1465	/// \headerfile <x86intrin.h>
		1466	///
		1467	/// This intrinsic corresponds to the <c> CVTTPD2PI </c> instruction.
		1468	///
		1469	/// \param __a
		1470	/// A 128-bit vector of [2 x double].
		1471	/// \returns A 64-bit vector of [2 x i32] containing the converted values.
		1472	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_cvttpd_pi32(__m128d __a) {
		1473	return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a);
		1474	}
		1475
		1476	/// Converts the two signed 32-bit integer elements of a 64-bit vector of
		1477	/// [2 x i32] into two double-precision floating-point values, returned in a
		1478	/// 128-bit vector of [2 x double].
		1479	///
		1480	/// \headerfile <x86intrin.h>
		1481	///
		1482	/// This intrinsic corresponds to the <c> CVTPI2PD </c> instruction.
		1483	///
		1484	/// \param __a
		1485	/// A 64-bit vector of [2 x i32].
		1486	/// \returns A 128-bit vector of [2 x double] containing the converted values.
		1487	static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX _mm_cvtpi32_pd(__m64 __a) {
		1488	return __builtin_ia32_cvtpi2pd((__v2si)__a);
		1489	}
		1490
		1491	/// Returns the low-order element of a 128-bit vector of [2 x double] as
		1492	/// a double-precision floating-point value.
		1493	///
		1494	/// \headerfile <x86intrin.h>
		1495	///
		1496	/// This intrinsic has no corresponding instruction.
		1497	///
		1498	/// \param __a
		1499	/// A 128-bit vector of [2 x double]. The lower 64 bits are returned.
		1500	/// \returns A double-precision floating-point value copied from the lower 64
		1501	/// bits of \a __a.
		1502	static __inline__ double __DEFAULT_FN_ATTRS _mm_cvtsd_f64(__m128d __a) {
		1503	return __a[0];
		1504	}
		1505
		1506	/// Loads a 128-bit floating-point vector of [2 x double] from an aligned
		1507	/// memory location.
		1508	///
		1509	/// \headerfile <x86intrin.h>
		1510	///
		1511	/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction.
		1512	///
		1513	/// \param __dp
		1514	/// A pointer to a 128-bit memory location. The address of the memory
		1515	/// location has to be 16-byte aligned.
		1516	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
		1517	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_pd(double const *__dp) {
		1518	return (const __m128d )__dp;
		1519	}
		1520
		1521	/// Loads a double-precision floating-point value from a specified memory
		1522	/// location and duplicates it to both vector elements of a 128-bit vector of
		1523	/// [2 x double].
		1524	///
		1525	/// \headerfile <x86intrin.h>
		1526	///
		1527	/// This intrinsic corresponds to the <c> VMOVDDUP / MOVDDUP </c> instruction.
		1528	///
		1529	/// \param __dp
		1530	/// A pointer to a memory location containing a double-precision value.
		1531	/// \returns A 128-bit vector of [2 x double] containing the loaded and
		1532	/// duplicated values.
		1533	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load1_pd(double const *__dp) {
		1534	struct __mm_load1_pd_struct {
		1535	double __u;
		1536	} __attribute__((__packed__, __may_alias__));
		1537	double __u = ((const struct __mm_load1_pd_struct *)__dp)->__u;
		1538	return __extension__(__m128d){__u, __u};
		1539	}
		1540
		/* Alternate spelling of _mm_load1_pd; the argument is forwarded unchanged. */
		#define _mm_load_pd1(__p) _mm_load1_pd(__p)
		1542
		1543	/// Loads two double-precision values, in reverse order, from an aligned
		1544	/// memory location into a 128-bit vector of [2 x double].
		1545	///
		1546	/// \headerfile <x86intrin.h>
		1547	///
		1548	/// This intrinsic corresponds to the <c> VMOVAPD / MOVAPD </c> instruction +
		1549	/// needed shuffling instructions. In AVX mode, the shuffling may be combined
		1550	/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction.
		1551	///
		1552	/// \param __dp
		1553	/// A 16-byte aligned pointer to an array of double-precision values to be
		1554	/// loaded in reverse order.
		1555	/// \returns A 128-bit vector of [2 x double] containing the reversed loaded
		1556	/// values.
		1557	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadr_pd(double const *__dp) {
		1558	__m128d __u = (const __m128d )__dp;
		1559	return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0);
		1560	}
		1561
		1562	/// Loads a 128-bit floating-point vector of [2 x double] from an
		1563	/// unaligned memory location.
		1564	///
		1565	/// \headerfile <x86intrin.h>
		1566	///
		1567	/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
		1568	///
		1569	/// \param __dp
		1570	/// A pointer to a 128-bit memory location. The address of the memory
		1571	/// location does not have to be aligned.
		1572	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
		1573	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadu_pd(double const *__dp) {
		1574	struct __loadu_pd {
		1575	__m128d_u __v;
		1576	} __attribute__((__packed__, __may_alias__));
		1577	return ((const struct __loadu_pd *)__dp)->__v;
		1578	}
		1579
		1580	/// Loads a 64-bit integer value to the low element of a 128-bit integer
		1581	/// vector and clears the upper element.
		1582	///
		1583	/// \headerfile <x86intrin.h>
		1584	///
		1585	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
		1586	///
		1587	/// \param __a
		1588	/// A pointer to a 64-bit memory location. The address of the memory
		1589	/// location does not have to be aligned.
		1590	/// \returns A 128-bit vector of [2 x i64] containing the loaded value.
		1591	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si64(void const *__a) {
		1592	struct __loadu_si64 {
		1593	long long __v;
		1594	} __attribute__((__packed__, __may_alias__));
		1595	long long __u = ((const struct __loadu_si64 *)__a)->__v;
		1596	return __extension__(__m128i)(__v2di){__u, 0LL};
		1597	}
		1598
		1599	/// Loads a 32-bit integer value to the low element of a 128-bit integer
		1600	/// vector and clears the upper elements.
		1601	///
		1602	/// \headerfile <x86intrin.h>
		1603	///
		1604	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
		1605	///
		1606	/// \param __a
		1607	/// A pointer to a 32-bit memory location. The address of the memory
		1608	/// location does not have to be aligned.
		1609	/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
		1610	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si32(void const *__a) {
		1611	struct __loadu_si32 {
		1612	int __v;
		1613	} __attribute__((__packed__, __may_alias__));
		1614	int __u = ((const struct __loadu_si32 *)__a)->__v;
		1615	return __extension__(__m128i)(__v4si){__u, 0, 0, 0};
		1616	}
		1617
		1618	/// Loads a 16-bit integer value to the low element of a 128-bit integer
		1619	/// vector and clears the upper elements.
		1620	///
		1621	/// \headerfile <x86intrin.h>
		1622	///
		1623	/// This intrinsic does not correspond to a specific instruction.
		1624	///
		1625	/// \param __a
		1626	/// A pointer to a 16-bit memory location. The address of the memory
		1627	/// location does not have to be aligned.
		1628	/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
		1629	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_loadu_si16(void const *__a) {
		1630	struct __loadu_si16 {
		1631	short __v;
		1632	} __attribute__((__packed__, __may_alias__));
		1633	short __u = ((const struct __loadu_si16 *)__a)->__v;
		1634	return __extension__(__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
		1635	}
		1636
		1637	/// Loads a 64-bit double-precision value to the low element of a
		1638	/// 128-bit vector of [2 x double] and clears the upper element.
		1639	///
		1640	/// \headerfile <x86intrin.h>
		1641	///
		1642	/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
		1643	///
		1644	/// \param __dp
		1645	/// A pointer to a memory location containing a double-precision value.
		1646	/// The address of the memory location does not have to be aligned.
		1647	/// \returns A 128-bit vector of [2 x double] containing the loaded value.
		1648	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_load_sd(double const *__dp) {
		1649	struct __mm_load_sd_struct {
		1650	double __u;
		1651	} __attribute__((__packed__, __may_alias__));
		1652	double __u = ((const struct __mm_load_sd_struct *)__dp)->__u;
		1653	return __extension__(__m128d){__u, 0};
		1654	}
		1655
		1656	/// Loads a double-precision value into the high-order bits of a 128-bit
		1657	/// vector of [2 x double]. The low-order bits are copied from the low-order
		1658	/// bits of the first operand.
		1659	///
		1660	/// \headerfile <x86intrin.h>
		1661	///
		1662	/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
		1663	///
		1664	/// \param __a
		1665	/// A 128-bit vector of [2 x double]. \n
		1666	/// Bits [63:0] are written to bits [63:0] of the result.
		1667	/// \param __dp
		1668	/// A pointer to a 64-bit memory location containing a double-precision
		1669	/// floating-point value that is loaded. The loaded value is written to bits
		1670	/// [127:64] of the result. The address of the memory location does not have
		1671	/// to be aligned.
		1672	/// \returns A 128-bit vector of [2 x double] containing the moved values.
		1673	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadh_pd(__m128d __a,
		1674	double const *__dp) {
		1675	struct __mm_loadh_pd_struct {
		1676	double __u;
		1677	} __attribute__((__packed__, __may_alias__));
		1678	double __u = ((const struct __mm_loadh_pd_struct *)__dp)->__u;
		1679	return __extension__(__m128d){__a[0], __u};
		1680	}
		1681
		1682	/// Loads a double-precision value into the low-order bits of a 128-bit
		1683	/// vector of [2 x double]. The high-order bits are copied from the
		1684	/// high-order bits of the first operand.
		1685	///
		1686	/// \headerfile <x86intrin.h>
		1687	///
		1688	/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
		1689	///
		1690	/// \param __a
		1691	/// A 128-bit vector of [2 x double]. \n
		1692	/// Bits [127:64] are written to bits [127:64] of the result.
		1693	/// \param __dp
		1694	/// A pointer to a 64-bit memory location containing a double-precision
		1695	/// floating-point value that is loaded. The loaded value is written to bits
		1696	/// [63:0] of the result. The address of the memory location does not have to
		1697	/// be aligned.
		1698	/// \returns A 128-bit vector of [2 x double] containing the moved values.
		1699	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_loadl_pd(__m128d __a,
		1700	double const *__dp) {
		1701	struct __mm_loadl_pd_struct {
		1702	double __u;
		1703	} __attribute__((__packed__, __may_alias__));
		1704	double __u = ((const struct __mm_loadl_pd_struct *)__dp)->__u;
		1705	return __extension__(__m128d){__u, __a[1]};
		1706	}
		1707
		1708	/// Constructs a 128-bit floating-point vector of [2 x double] with
		1709	/// unspecified content. This could be used as an argument to another
		1710	/// intrinsic function where the argument is required but the value is not
		1711	/// actually used.
		1712	///
		1713	/// \headerfile <x86intrin.h>
		1714	///
		1715	/// This intrinsic has no corresponding instruction.
		1716	///
		1717	/// \returns A 128-bit floating-point vector of [2 x double] with unspecified
		1718	/// content.
		1719	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_undefined_pd(void) {
		1720	return (__m128d)__builtin_ia32_undef128();
		1721	}
		1722
		1723	/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
		1724	/// 64 bits of the vector are initialized with the specified double-precision
		1725	/// floating-point value. The upper 64 bits are set to zero.
		1726	///
		1727	/// \headerfile <x86intrin.h>
		1728	///
		1729	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
		1730	///
		1731	/// \param __w
		1732	/// A double-precision floating-point value used to initialize the lower 64
		1733	/// bits of the result.
		1734	/// \returns An initialized 128-bit floating-point vector of [2 x double]. The
		1735	/// lower 64 bits contain the value of the parameter. The upper 64 bits are
		1736	/// set to zero.
		1737	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_sd(double __w) {
		1738	return __extension__(__m128d){__w, 0};
		1739	}
		1740
		1741	/// Constructs a 128-bit floating-point vector of [2 x double], with each
		1742	/// of the two double-precision floating-point vector elements set to the
		1743	/// specified double-precision floating-point value.
		1744	///
		1745	/// \headerfile <x86intrin.h>
		1746	///
		1747	/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
		1748	///
		1749	/// \param __w
		1750	/// A double-precision floating-point value used to initialize each vector
		1751	/// element of the result.
		1752	/// \returns An initialized 128-bit floating-point vector of [2 x double].
		1753	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set1_pd(double __w) {
		1754	return __extension__(__m128d){__w, __w};
		1755	}
		1756
		1757	/// Constructs a 128-bit floating-point vector of [2 x double], with each
		1758	/// of the two double-precision floating-point vector elements set to the
		1759	/// specified double-precision floating-point value.
		1760	///
		1761	/// \headerfile <x86intrin.h>
		1762	///
		1763	/// This intrinsic corresponds to the <c> VMOVDDUP / MOVLHPS </c> instruction.
		1764	///
		1765	/// \param __w
		1766	/// A double-precision floating-point value used to initialize each vector
		1767	/// element of the result.
		1768	/// \returns An initialized 128-bit floating-point vector of [2 x double].
		1769	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd1(double __w) {
		1770	return _mm_set1_pd(__w);
		1771	}
		1772
		1773	/// Constructs a 128-bit floating-point vector of [2 x double]
		1774	/// initialized with the specified double-precision floating-point values.
		1775	///
		1776	/// \headerfile <x86intrin.h>
		1777	///
		1778	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
		1779	///
		1780	/// \param __w
		1781	/// A double-precision floating-point value used to initialize the upper 64
		1782	/// bits of the result.
		1783	/// \param __x
		1784	/// A double-precision floating-point value used to initialize the lower 64
		1785	/// bits of the result.
		1786	/// \returns An initialized 128-bit floating-point vector of [2 x double].
		1787	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_set_pd(double __w,
		1788	double __x) {
		1789	return __extension__(__m128d){__x, __w};
		1790	}
		1791
		1792	/// Constructs a 128-bit floating-point vector of [2 x double],
		1793	/// initialized in reverse order with the specified double-precision
		1794	/// floating-point values.
		1795	///
		1796	/// \headerfile <x86intrin.h>
		1797	///
		1798	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
		1799	///
		1800	/// \param __w
		1801	/// A double-precision floating-point value used to initialize the lower 64
		1802	/// bits of the result.
		1803	/// \param __x
		1804	/// A double-precision floating-point value used to initialize the upper 64
		1805	/// bits of the result.
		1806	/// \returns An initialized 128-bit floating-point vector of [2 x double].
		1807	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
		1808	double __x) {
		1809	return __extension__(__m128d){__w, __x};
		1810	}
		1811
		1812	/// Constructs a 128-bit floating-point vector of [2 x double]
		1813	/// initialized to zero.
		1814	///
		1815	/// \headerfile <x86intrin.h>
		1816	///
		1817	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
		1818	///
		1819	/// \returns An initialized 128-bit floating-point vector of [2 x double] with
		1820	/// all elements set to zero.
		1821	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
		1822	return __extension__(__m128d){0.0, 0.0};
		1823	}
		1824
		1825	/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
		1826	/// 64 bits are set to the lower 64 bits of the second parameter. The upper
		1827	/// 64 bits are set to the upper 64 bits of the first parameter.
		1828	///
		1829	/// \headerfile <x86intrin.h>
		1830	///
		1831	/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
		1832	///
		1833	/// \param __a
		1834	/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the
		1835	/// upper 64 bits of the result.
		1836	/// \param __b
		1837	/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the
		1838	/// lower 64 bits of the result.
		1839	/// \returns A 128-bit vector of [2 x double] containing the moved values.
		1840	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_move_sd(__m128d __a,
		1841	__m128d __b) {
		1842	__a[0] = __b[0];
		1843	return __a;
		1844	}
		1845
		1846	/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
		1847	/// memory location.
		1848	///
		1849	/// \headerfile <x86intrin.h>
		1850	///
		1851	/// This intrinsic corresponds to the <c> VMOVSD / MOVSD </c> instruction.
		1852	///
		1853	/// \param __dp
		1854	/// A pointer to a 64-bit memory location.
		1855	/// \param __a
		1856	/// A 128-bit vector of [2 x double] containing the value to be stored.
		1857	static __inline__ void __DEFAULT_FN_ATTRS _mm_store_sd(double *__dp,
		1858	__m128d __a) {
		1859	struct __mm_store_sd_struct {
		1860	double __u;
		1861	} __attribute__((__packed__, __may_alias__));
		1862	((struct __mm_store_sd_struct *)__dp)->__u = __a[0];
		1863	}
		1864
		1865	/// Moves packed double-precision values from a 128-bit vector of
		1866	/// [2 x double] to a memory location.
		1867	///
		1868	/// \headerfile <x86intrin.h>
		1869	///
		1870	/// This intrinsic corresponds to the <c>VMOVAPD / MOVAPS</c> instruction.
		1871	///
		1872	/// \param __dp
		1873	/// A pointer to an aligned memory location that can store two
		1874	/// double-precision values.
		1875	/// \param __a
		1876	/// A packed 128-bit vector of [2 x double] containing the values to be
		1877	/// moved.
		1878	static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd(double *__dp,
		1879	__m128d __a) {
		1880	(__m128d )__dp = __a;
		1881	}
		1882
		1883	/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
		1884	/// the upper and lower 64 bits of a memory location.
		1885	///
		1886	/// \headerfile <x86intrin.h>
		1887	///
		1888	/// This intrinsic corresponds to the
		1889	/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
		1890	///
		1891	/// \param __dp
		1892	/// A pointer to a memory location that can store two double-precision
		1893	/// values.
		1894	/// \param __a
		1895	/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
		1896	/// of the values in \a __dp.
		1897	static __inline__ void __DEFAULT_FN_ATTRS _mm_store1_pd(double *__dp,
		1898	__m128d __a) {
		1899	__a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0);
		1900	_mm_store_pd(__dp, __a);
		1901	}
		1902
		1903	/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to
		1904	/// the upper and lower 64 bits of a memory location.
		1905	///
		1906	/// \headerfile <x86intrin.h>
		1907	///
		1908	/// This intrinsic corresponds to the
		1909	/// <c> VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS </c> instruction.
		1910	///
		1911	/// \param __dp
		1912	/// A pointer to a memory location that can store two double-precision
		1913	/// values.
		1914	/// \param __a
		1915	/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each
		1916	/// of the values in \a __dp.
		1917	static __inline__ void __DEFAULT_FN_ATTRS _mm_store_pd1(double *__dp,
		1918	__m128d __a) {
		1919	_mm_store1_pd(__dp, __a);
		1920	}
		1921
		1922	/// Stores a 128-bit vector of [2 x double] into an unaligned memory
		1923	/// location.
		1924	///
		1925	/// \headerfile <x86intrin.h>
		1926	///
		1927	/// This intrinsic corresponds to the <c> VMOVUPD / MOVUPD </c> instruction.
		1928	///
		1929	/// \param __dp
		1930	/// A pointer to a 128-bit memory location. The address of the memory
		1931	/// location does not have to be aligned.
		1932	/// \param __a
		1933	/// A 128-bit vector of [2 x double] containing the values to be stored.
		1934	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_pd(double *__dp,
		1935	__m128d __a) {
		1936	struct __storeu_pd {
		1937	__m128d_u __v;
		1938	} __attribute__((__packed__, __may_alias__));
		1939	((struct __storeu_pd *)__dp)->__v = __a;
		1940	}
		1941
		1942	/// Stores two double-precision values, in reverse order, from a 128-bit
		1943	/// vector of [2 x double] to a 16-byte aligned memory location.
		1944	///
		1945	/// \headerfile <x86intrin.h>
		1946	///
		1947	/// This intrinsic corresponds to a shuffling instruction followed by a
		1948	/// <c> VMOVAPD / MOVAPD </c> instruction.
		1949	///
		1950	/// \param __dp
		1951	/// A pointer to a 16-byte aligned memory location that can store two
		1952	/// double-precision values.
		1953	/// \param __a
		1954	/// A 128-bit vector of [2 x double] containing the values to be reversed and
		1955	/// stored.
		1956	static __inline__ void __DEFAULT_FN_ATTRS _mm_storer_pd(double *__dp,
		1957	__m128d __a) {
		1958	__a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0);
		1959	(__m128d )__dp = __a;
		1960	}
		1961
		1962	/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a
		1963	/// memory location.
		1964	///
		1965	/// \headerfile <x86intrin.h>
		1966	///
		1967	/// This intrinsic corresponds to the <c> VMOVHPD / MOVHPD </c> instruction.
		1968	///
		1969	/// \param __dp
		1970	/// A pointer to a 64-bit memory location.
		1971	/// \param __a
		1972	/// A 128-bit vector of [2 x double] containing the value to be stored.
		1973	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeh_pd(double *__dp,
		1974	__m128d __a) {
		1975	struct __mm_storeh_pd_struct {
		1976	double __u;
		1977	} __attribute__((__packed__, __may_alias__));
		1978	((struct __mm_storeh_pd_struct *)__dp)->__u = __a[1];
		1979	}
		1980
		1981	/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a
		1982	/// memory location.
		1983	///
		1984	/// \headerfile <x86intrin.h>
		1985	///
		1986	/// This intrinsic corresponds to the <c> VMOVLPD / MOVLPD </c> instruction.
		1987	///
		1988	/// \param __dp
		1989	/// A pointer to a 64-bit memory location.
		1990	/// \param __a
		1991	/// A 128-bit vector of [2 x double] containing the value to be stored.
		1992	static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_pd(double *__dp,
		1993	__m128d __a) {
		1994	struct __mm_storeh_pd_struct {
		1995	double __u;
		1996	} __attribute__((__packed__, __may_alias__));
		1997	((struct __mm_storeh_pd_struct *)__dp)->__u = __a[0];
		1998	}
		1999
		2000	/// Adds the corresponding elements of two 128-bit vectors of [16 x i8],
		2001	/// saving the lower 8 bits of each sum in the corresponding element of a
		2002	/// 128-bit result vector of [16 x i8].
		2003	///
		2004	/// The integer elements of both parameters can be either signed or unsigned.
		2005	///
		2006	/// \headerfile <x86intrin.h>
		2007	///
		2008	/// This intrinsic corresponds to the <c> VPADDB / PADDB </c> instruction.
		2009	///
		2010	/// \param __a
		2011	/// A 128-bit vector of [16 x i8].
		2012	/// \param __b
		2013	/// A 128-bit vector of [16 x i8].
		2014	/// \returns A 128-bit vector of [16 x i8] containing the sums of both
		2015	/// parameters.
		2016	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi8(__m128i __a,
		2017	__m128i __b) {
		2018	return (__m128i)((__v16qu)__a + (__v16qu)__b);
		2019	}
		2020
		2021	/// Adds the corresponding elements of two 128-bit vectors of [8 x i16],
		2022	/// saving the lower 16 bits of each sum in the corresponding element of a
		2023	/// 128-bit result vector of [8 x i16].
		2024	///
		2025	/// The integer elements of both parameters can be either signed or unsigned.
		2026	///
		2027	/// \headerfile <x86intrin.h>
		2028	///
		2029	/// This intrinsic corresponds to the <c> VPADDW / PADDW </c> instruction.
		2030	///
		2031	/// \param __a
		2032	/// A 128-bit vector of [8 x i16].
		2033	/// \param __b
		2034	/// A 128-bit vector of [8 x i16].
		2035	/// \returns A 128-bit vector of [8 x i16] containing the sums of both
		2036	/// parameters.
		2037	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi16(__m128i __a,
		2038	__m128i __b) {
		2039	return (__m128i)((__v8hu)__a + (__v8hu)__b);
		2040	}
		2041
		2042	/// Adds the corresponding elements of two 128-bit vectors of [4 x i32],
		2043	/// saving the lower 32 bits of each sum in the corresponding element of a
		2044	/// 128-bit result vector of [4 x i32].
		2045	///
		2046	/// The integer elements of both parameters can be either signed or unsigned.
		2047	///
		2048	/// \headerfile <x86intrin.h>
		2049	///
		2050	/// This intrinsic corresponds to the <c> VPADDD / PADDD </c> instruction.
		2051	///
		2052	/// \param __a
		2053	/// A 128-bit vector of [4 x i32].
		2054	/// \param __b
		2055	/// A 128-bit vector of [4 x i32].
		2056	/// \returns A 128-bit vector of [4 x i32] containing the sums of both
		2057	/// parameters.
		2058	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi32(__m128i __a,
		2059	__m128i __b) {
		2060	return (__m128i)((__v4su)__a + (__v4su)__b);
		2061	}
		2062
		2063	/// Adds two signed or unsigned 64-bit integer values, returning the
		2064	/// lower 64 bits of the sum.
		2065	///
		2066	/// \headerfile <x86intrin.h>
		2067	///
		2068	/// This intrinsic corresponds to the <c> PADDQ </c> instruction.
		2069	///
		2070	/// \param __a
		2071	/// A 64-bit integer.
		2072	/// \param __b
		2073	/// A 64-bit integer.
		2074	/// \returns A 64-bit integer containing the sum of both parameters.
		2075	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_add_si64(__m64 __a,
		2076	__m64 __b) {
		2077	return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b);
		2078	}
		2079
		2080	/// Adds the corresponding elements of two 128-bit vectors of [2 x i64],
		2081	/// saving the lower 64 bits of each sum in the corresponding element of a
		2082	/// 128-bit result vector of [2 x i64].
		2083	///
		2084	/// The integer elements of both parameters can be either signed or unsigned.
		2085	///
		2086	/// \headerfile <x86intrin.h>
		2087	///
		2088	/// This intrinsic corresponds to the <c> VPADDQ / PADDQ </c> instruction.
		2089	///
		2090	/// \param __a
		2091	/// A 128-bit vector of [2 x i64].
		2092	/// \param __b
		2093	/// A 128-bit vector of [2 x i64].
		2094	/// \returns A 128-bit vector of [2 x i64] containing the sums of both
		2095	/// parameters.
		2096	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_add_epi64(__m128i __a,
		2097	__m128i __b) {
		2098	return (__m128i)((__v2du)__a + (__v2du)__b);
		2099	}
		2100
		2101	/// Adds, with saturation, the corresponding elements of two 128-bit
		2102	/// signed [16 x i8] vectors, saving each sum in the corresponding element of
		2103	/// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are
		2104	/// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80.
		2105	///
		2106	/// \headerfile <x86intrin.h>
		2107	///
		2108	/// This intrinsic corresponds to the <c> VPADDSB / PADDSB </c> instruction.
		2109	///
		2110	/// \param __a
		2111	/// A 128-bit signed [16 x i8] vector.
		2112	/// \param __b
		2113	/// A 128-bit signed [16 x i8] vector.
		2114	/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of
		2115	/// both parameters.
		2116	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi8(__m128i __a,
		2117	__m128i __b) {
		2118	return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b);
		2119	}
		2120
		2121	/// Adds, with saturation, the corresponding elements of two 128-bit
		2122	/// signed [8 x i16] vectors, saving each sum in the corresponding element of
		2123	/// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF
		2124	/// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
		2125	/// 0x8000.
		2126	///
		2127	/// \headerfile <x86intrin.h>
		2128	///
		2129	/// This intrinsic corresponds to the <c> VPADDSW / PADDSW </c> instruction.
		2130	///
		2131	/// \param __a
		2132	/// A 128-bit signed [8 x i16] vector.
		2133	/// \param __b
		2134	/// A 128-bit signed [8 x i16] vector.
		2135	/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of
		2136	/// both parameters.
		2137	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epi16(__m128i __a,
		2138	__m128i __b) {
		2139	return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b);
		2140	}
		2141
		2142	/// Adds, with saturation, the corresponding elements of two 128-bit
		2143	/// unsigned [16 x i8] vectors, saving each sum in the corresponding element
		2144	/// of a 128-bit result vector of [16 x i8]. Sums greater than 0xFF are
		2145	/// saturated to 0xFF. The operands are unsigned, so no sum is negative.
		2146	///
		2147	/// \headerfile <x86intrin.h>
		2148	///
		2149	/// This intrinsic corresponds to the <c> VPADDUSB / PADDUSB </c> instruction.
		2150	///
		2151	/// \param __a
		2152	/// A 128-bit unsigned [16 x i8] vector.
		2153	/// \param __b
		2154	/// A 128-bit unsigned [16 x i8] vector.
		2155	/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums
		2156	/// of both parameters.
		2157	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu8(__m128i __a,
		2158	__m128i __b) {
		2159	return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b);
		2160	}
		2161
		2162	/// Adds, with saturation, the corresponding elements of two 128-bit
		2163	/// unsigned [8 x i16] vectors, saving each sum in the corresponding element
		2164	/// of a 128-bit result vector of [8 x i16]. Sums greater than 0xFFFF are
		2165	/// saturated to 0xFFFF. The operands are unsigned, so no sum is negative.
		2166	///
		2167	/// \headerfile <x86intrin.h>
		2168	///
		2169	/// This intrinsic corresponds to the <c> VPADDUSW / PADDUSW </c> instruction.
		2170	///
		2171	/// \param __a
		2172	/// A 128-bit unsigned [8 x i16] vector.
		2173	/// \param __b
		2174	/// A 128-bit unsigned [8 x i16] vector.
		2175	/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums
		2176	/// of both parameters.
		2177	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_adds_epu16(__m128i __a,
		2178	__m128i __b) {
		2179	return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b);
		2180	}
		2181
		2182	/// Computes the rounded averages of corresponding elements of two
		2183	/// 128-bit unsigned [16 x i8] vectors, saving each result in the
		2184	/// corresponding element of a 128-bit result vector of [16 x i8].
		2185	///
		2186	/// \headerfile <x86intrin.h>
		2187	///
		2188	/// This intrinsic corresponds to the <c> VPAVGB / PAVGB </c> instruction.
		2189	///
		2190	/// \param __a
		2191	/// A 128-bit unsigned [16 x i8] vector.
		2192	/// \param __b
		2193	/// A 128-bit unsigned [16 x i8] vector.
		2194	/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded
		2195	/// averages of both parameters.
		2196	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a,
		2197	__m128i __b) {
		2198	return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
		2199	}
		2200
		2201	/// Computes the rounded averages of corresponding elements of two
		2202	/// 128-bit unsigned [8 x i16] vectors, saving each result in the
		2203	/// corresponding element of a 128-bit result vector of [8 x i16].
		2204	///
		2205	/// \headerfile <x86intrin.h>
		2206	///
		2207	/// This intrinsic corresponds to the <c> VPAVGW / PAVGW </c> instruction.
		2208	///
		2209	/// \param __a
		2210	/// A 128-bit unsigned [8 x i16] vector.
		2211	/// \param __b
		2212	/// A 128-bit unsigned [8 x i16] vector.
		2213	/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded
		2214	/// averages of both parameters.
		2215	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu16(__m128i __a,
		2216	__m128i __b) {
		2217	return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
		2218	}
		2219
		2220	/// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
		2221	/// vectors, producing eight intermediate 32-bit signed integer products, and
		2222	/// adds the consecutive pairs of 32-bit products to form a 128-bit signed
		2223	/// [4 x i32] vector.
		2224	///
		2225	/// For example, bits [15:0] of both parameters are multiplied producing a
		2226	/// 32-bit product, bits [31:16] of both parameters are multiplied producing
		2227	/// a 32-bit product, and the sum of those two products becomes bits [31:0]
		2228	/// of the result.
		2229	///
		2230	/// \headerfile <x86intrin.h>
		2231	///
		2232	/// This intrinsic corresponds to the <c> VPMADDWD / PMADDWD </c> instruction.
		2233	///
		2234	/// \param __a
		2235	/// A 128-bit signed [8 x i16] vector.
		2236	/// \param __b
		2237	/// A 128-bit signed [8 x i16] vector.
		2238	/// \returns A 128-bit signed [4 x i32] vector containing the sums of products
		2239	/// of both parameters.
		2240	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_madd_epi16(__m128i __a,
		2241	__m128i __b) {
		2242	return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
		2243	}
		2244
		2245	/// Compares corresponding elements of two 128-bit signed [8 x i16]
		2246	/// vectors, saving the greater value from each comparison in the
		2247	/// corresponding element of a 128-bit result vector of [8 x i16].
		2248	///
		2249	/// \headerfile <x86intrin.h>
		2250	///
		2251	/// This intrinsic corresponds to the <c> VPMAXSW / PMAXSW </c> instruction.
		2252	///
		2253	/// \param __a
		2254	/// A 128-bit signed [8 x i16] vector.
		2255	/// \param __b
		2256	/// A 128-bit signed [8 x i16] vector.
		2257	/// \returns A 128-bit signed [8 x i16] vector containing the greater value of
		2258	/// each comparison.
		2259	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi16(__m128i __a,
		2260	__m128i __b) {
		2261	return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b);
		2262	}
		2263
		2264	/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
		2265	/// vectors, saving the greater value from each comparison in the
		2266	/// corresponding element of a 128-bit result vector of [16 x i8].
		2267	///
		2268	/// \headerfile <x86intrin.h>
		2269	///
		2270	/// This intrinsic corresponds to the <c> VPMAXUB / PMAXUB </c> instruction.
		2271	///
		2272	/// \param __a
		2273	/// A 128-bit unsigned [16 x i8] vector.
		2274	/// \param __b
		2275	/// A 128-bit unsigned [16 x i8] vector.
		2276	/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of
		2277	/// each comparison.
		2278	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu8(__m128i __a,
		2279	__m128i __b) {
		2280	return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b);
		2281	}
		2282
		2283	/// Compares corresponding elements of two 128-bit signed [8 x i16]
		2284	/// vectors, saving the smaller value from each comparison in the
		2285	/// corresponding element of a 128-bit result vector of [8 x i16].
		2286	///
		2287	/// \headerfile <x86intrin.h>
		2288	///
		2289	/// This intrinsic corresponds to the <c> VPMINSW / PMINSW </c> instruction.
		2290	///
		2291	/// \param __a
		2292	/// A 128-bit signed [8 x i16] vector.
		2293	/// \param __b
		2294	/// A 128-bit signed [8 x i16] vector.
		2295	/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of
		2296	/// each comparison.
		2297	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi16(__m128i __a,
		2298	__m128i __b) {
		2299	return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b);
		2300	}
		2301
		2302	/// Compares corresponding elements of two 128-bit unsigned [16 x i8]
		2303	/// vectors, saving the smaller value from each comparison in the
		2304	/// corresponding element of a 128-bit result vector of [16 x i8].
		2305	///
		2306	/// \headerfile <x86intrin.h>
		2307	///
		2308	/// This intrinsic corresponds to the <c> VPMINUB / PMINUB </c> instruction.
		2309	///
		2310	/// \param __a
		2311	/// A 128-bit unsigned [16 x i8] vector.
		2312	/// \param __b
		2313	/// A 128-bit unsigned [16 x i8] vector.
		2314	/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of
		2315	/// each comparison.
		2316	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu8(__m128i __a,
		2317	__m128i __b) {
		2318	return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b);
		2319	}
		2320
		2321	/// Multiplies the corresponding elements of two signed [8 x i16]
		2322	/// vectors, saving the upper 16 bits of each 32-bit product in the
		2323	/// corresponding element of a 128-bit signed [8 x i16] result vector.
		2324	///
		2325	/// \headerfile <x86intrin.h>
		2326	///
		2327	/// This intrinsic corresponds to the <c> VPMULHW / PMULHW </c> instruction.
		2328	///
		2329	/// \param __a
		2330	/// A 128-bit signed [8 x i16] vector.
		2331	/// \param __b
		2332	/// A 128-bit signed [8 x i16] vector.
		2333	/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of
		2334	/// each of the eight 32-bit products.
		2335	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epi16(__m128i __a,
		2336	__m128i __b) {
		2337	return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
		2338	}
		2339
		2340	/// Multiplies the corresponding elements of two unsigned [8 x i16]
		2341	/// vectors, saving the upper 16 bits of each 32-bit product in the
		2342	/// corresponding element of a 128-bit unsigned [8 x i16] result vector.
		2343	///
		2344	/// \headerfile <x86intrin.h>
		2345	///
		2346	/// This intrinsic corresponds to the <c> VPMULHUW / PMULHUW </c> instruction.
		2347	///
		2348	/// \param __a
		2349	/// A 128-bit unsigned [8 x i16] vector.
		2350	/// \param __b
		2351	/// A 128-bit unsigned [8 x i16] vector.
		2352	/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits
		2353	/// of each of the eight 32-bit products.
		2354	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mulhi_epu16(__m128i __a,
		2355	__m128i __b) {
		2356	return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
		2357	}
		2358
		2359	/// Multiplies the corresponding elements of two signed [8 x i16]
		2360	/// vectors, saving the lower 16 bits of each 32-bit product in the
		2361	/// corresponding element of a 128-bit signed [8 x i16] result vector.
		2362	///
		2363	/// \headerfile <x86intrin.h>
		2364	///
		2365	/// This intrinsic corresponds to the <c> VPMULLW / PMULLW </c> instruction.
		2366	///
		2367	/// \param __a
		2368	/// A 128-bit signed [8 x i16] vector.
		2369	/// \param __b
		2370	/// A 128-bit signed [8 x i16] vector.
		2371	/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of
		2372	/// each of the eight 32-bit products.
		2373	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi16(__m128i __a,
		2374	__m128i __b) {
		2375	return (__m128i)((__v8hu)__a * (__v8hu)__b);
		2376	}
		2377
		2378	/// Multiplies 32-bit unsigned integer values contained in the lower bits
		2379	/// of the two 64-bit integer vectors and returns the 64-bit unsigned
		2380	/// product.
		2381	///
		2382	/// \headerfile <x86intrin.h>
		2383	///
		2384	/// This intrinsic corresponds to the <c> PMULUDQ </c> instruction.
		2385	///
		2386	/// \param __a
		2387	/// A 64-bit integer containing one of the source operands.
		2388	/// \param __b
		2389	/// A 64-bit integer containing one of the source operands.
		2390	/// \returns A 64-bit integer vector containing the product of both operands.
		2391	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_mul_su32(__m64 __a,
		2392	__m64 __b) {
		2393	return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
		2394	}
		2395
		2396	/// Multiplies 32-bit unsigned integer values contained in the lower
		2397	/// bits of the corresponding elements of two [2 x i64] vectors, and returns
		2398	/// the 64-bit products in the corresponding elements of a [2 x i64] vector.
		2399	///
		2400	/// \headerfile <x86intrin.h>
		2401	///
		2402	/// This intrinsic corresponds to the <c> VPMULUDQ / PMULUDQ </c> instruction.
		2403	///
		2404	/// \param __a
		2405	/// A [2 x i64] vector containing one of the source operands.
		2406	/// \param __b
		2407	/// A [2 x i64] vector containing one of the source operands.
		2408	/// \returns A [2 x i64] vector containing the product of both operands.
		2409	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epu32(__m128i __a,
		2410	__m128i __b) {
		2411	return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
		2412	}
		2413
		2414	/// Computes the absolute differences of corresponding 8-bit integer
		2415	/// values in two 128-bit vectors. Sums the first 8 absolute differences, and
		2416	/// separately sums the second 8 absolute differences. Packs these two
		2417	/// unsigned 16-bit integer sums into the upper and lower elements of a
		2418	/// [2 x i64] vector.
		2419	///
		2420	/// \headerfile <x86intrin.h>
		2421	///
		2422	/// This intrinsic corresponds to the <c> VPSADBW / PSADBW </c> instruction.
		2423	///
		2424	/// \param __a
		2425	/// A 128-bit integer vector containing one of the source operands.
		2426	/// \param __b
		2427	/// A 128-bit integer vector containing one of the source operands.
		2428	/// \returns A [2 x i64] vector containing the sums of the sets of absolute
		2429	/// differences between both operands.
		2430	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sad_epu8(__m128i __a,
		2431	__m128i __b) {
		2432	return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
		2433	}
		2434
		2435	/// Subtracts the corresponding 8-bit integer values in the operands.
		2436	///
		2437	/// \headerfile <x86intrin.h>
		2438	///
		2439	/// This intrinsic corresponds to the <c> VPSUBB / PSUBB </c> instruction.
		2440	///
		2441	/// \param __a
		2442	/// A 128-bit integer vector containing the minuends.
		2443	/// \param __b
		2444	/// A 128-bit integer vector containing the subtrahends.
		2445	/// \returns A 128-bit integer vector containing the differences of the values
		2446	/// in the operands.
		2447	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi8(__m128i __a,
		2448	__m128i __b) {
		2449	return (__m128i)((__v16qu)__a - (__v16qu)__b);
		2450	}
		2451
		2452	/// Subtracts the corresponding 16-bit integer values in the operands.
		2453	///
		2454	/// \headerfile <x86intrin.h>
		2455	///
		2456	/// This intrinsic corresponds to the <c> VPSUBW / PSUBW </c> instruction.
		2457	///
		2458	/// \param __a
		2459	/// A 128-bit integer vector containing the minuends.
		2460	/// \param __b
		2461	/// A 128-bit integer vector containing the subtrahends.
		2462	/// \returns A 128-bit integer vector containing the differences of the values
		2463	/// in the operands.
		2464	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi16(__m128i __a,
		2465	__m128i __b) {
		2466	return (__m128i)((__v8hu)__a - (__v8hu)__b);
		2467	}
		2468
		2469	/// Subtracts the corresponding 32-bit integer values in the operands.
		2470	///
		2471	/// \headerfile <x86intrin.h>
		2472	///
		2473	/// This intrinsic corresponds to the <c> VPSUBD / PSUBD </c> instruction.
		2474	///
		2475	/// \param __a
		2476	/// A 128-bit integer vector containing the minuends.
		2477	/// \param __b
		2478	/// A 128-bit integer vector containing the subtrahends.
		2479	/// \returns A 128-bit integer vector containing the differences of the values
		2480	/// in the operands.
		2481	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi32(__m128i __a,
		2482	__m128i __b) {
		2483	return (__m128i)((__v4su)__a - (__v4su)__b);
		2484	}
		2485
		2486	/// Subtracts signed or unsigned 64-bit integer values and writes the
		2487	/// difference to the corresponding bits in the destination.
		2488	///
		2489	/// \headerfile <x86intrin.h>
		2490	///
		2491	/// This intrinsic corresponds to the <c> PSUBQ </c> instruction.
		2492	///
		2493	/// \param __a
		2494	/// A 64-bit integer vector containing the minuend.
		2495	/// \param __b
		2496	/// A 64-bit integer vector containing the subtrahend.
		2497	/// \returns A 64-bit integer vector containing the difference of the values in
		2498	/// the operands.
		2499	static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX _mm_sub_si64(__m64 __a,
		2500	__m64 __b) {
		2501	return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b);
		2502	}
		2503
		2504	/// Subtracts the corresponding elements of two [2 x i64] vectors.
		2505	///
		2506	/// \headerfile <x86intrin.h>
		2507	///
		2508	/// This intrinsic corresponds to the <c> VPSUBQ / PSUBQ </c> instruction.
		2509	///
		2510	/// \param __a
		2511	/// A 128-bit integer vector containing the minuends.
		2512	/// \param __b
		2513	/// A 128-bit integer vector containing the subtrahends.
		2514	/// \returns A 128-bit integer vector containing the differences of the values
		2515	/// in the operands.
		2516	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sub_epi64(__m128i __a,
		2517	__m128i __b) {
		2518	return (__m128i)((__v2du)__a - (__v2du)__b);
		2519	}
		2520
		2521	/// Subtracts corresponding 8-bit signed integer values in the input and
		2522	/// returns the differences in the corresponding bytes in the destination.
		2523	/// Differences greater than 0x7F are saturated to 0x7F, and differences less
		2524	/// than 0x80 are saturated to 0x80.
		2525	///
		2526	/// \headerfile <x86intrin.h>
		2527	///
		2528	/// This intrinsic corresponds to the <c> VPSUBSB / PSUBSB </c> instruction.
		2529	///
		2530	/// \param __a
		2531	/// A 128-bit integer vector containing the minuends.
		2532	/// \param __b
		2533	/// A 128-bit integer vector containing the subtrahends.
		2534	/// \returns A 128-bit integer vector containing the differences of the values
		2535	/// in the operands.
		2536	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi8(__m128i __a,
		2537	__m128i __b) {
		2538	return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b);
		2539	}
		2540
		2541	/// Subtracts corresponding 16-bit signed integer values in the input and
		2542	/// returns the differences in the corresponding 16-bit elements of the
		2543	/// destination. Differences greater than 0x7FFF are saturated to 0x7FFF, and
		2544	/// differences less than 0x8000 are saturated to 0x8000.
		2545	///
		2546	/// \headerfile <x86intrin.h>
		2547	///
		2548	/// This intrinsic corresponds to the <c> VPSUBSW / PSUBSW </c> instruction.
		2549	///
		2550	/// \param __a
		2551	/// A 128-bit integer vector containing the minuends.
		2552	/// \param __b
		2553	/// A 128-bit integer vector containing the subtrahends.
		2554	/// \returns A 128-bit integer vector containing the differences of the values
		2555	/// in the operands.
		2556	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epi16(__m128i __a,
		2557	__m128i __b) {
		2558	return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b);
		2559	}
		2560
		2561	/// Subtracts corresponding 8-bit unsigned integer values in the input
		2562	/// and returns the differences in the corresponding bytes in the
		2563	/// destination. Differences less than 0x00 are saturated to 0x00.
		2564	///
		2565	/// \headerfile <x86intrin.h>
		2566	///
		2567	/// This intrinsic corresponds to the <c> VPSUBUSB / PSUBUSB </c> instruction.
		2568	///
		2569	/// \param __a
		2570	/// A 128-bit integer vector containing the minuends.
		2571	/// \param __b
		2572	/// A 128-bit integer vector containing the subtrahends.
		2573	/// \returns A 128-bit integer vector containing the unsigned integer
		2574	/// differences of the values in the operands.
		2575	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu8(__m128i __a,
		2576	__m128i __b) {
		2577	return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b);
		2578	}
		2579
		2580	/// Subtracts corresponding 16-bit unsigned integer values in the input
		2581	/// and returns the differences in the corresponding 16-bit elements of the
		2582	/// destination. Differences less than 0x0000 are saturated to 0x0000.
		2583	///
		2584	/// \headerfile <x86intrin.h>
		2585	///
		2586	/// This intrinsic corresponds to the <c> VPSUBUSW / PSUBUSW </c> instruction.
		2587	///
		2588	/// \param __a
		2589	/// A 128-bit integer vector containing the minuends.
		2590	/// \param __b
		2591	/// A 128-bit integer vector containing the subtrahends.
		2592	/// \returns A 128-bit integer vector containing the unsigned integer
		2593	/// differences of the values in the operands.
		2594	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_subs_epu16(__m128i __a,
		2595	__m128i __b) {
		2596	return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b);
		2597	}
		2598
		2599	/// Performs a bitwise AND of two 128-bit integer vectors.
		2600	///
		2601	/// \headerfile <x86intrin.h>
		2602	///
		2603	/// This intrinsic corresponds to the <c> VPAND / PAND </c> instruction.
		2604	///
		2605	/// \param __a
		2606	/// A 128-bit integer vector containing one of the source operands.
		2607	/// \param __b
		2608	/// A 128-bit integer vector containing one of the source operands.
		2609	/// \returns A 128-bit integer vector containing the bitwise AND of the values
		2610	/// in both operands.
		2611	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_and_si128(__m128i __a,
		2612	__m128i __b) {
		2613	return (__m128i)((__v2du)__a & (__v2du)__b);
		2614	}
		2615
		2616	/// Performs a bitwise AND of two 128-bit integer vectors, using the
		2617	/// one's complement of the values contained in the first source operand.
		2618	///
		2619	/// \headerfile <x86intrin.h>
		2620	///
		2621	/// This intrinsic corresponds to the <c> VPANDN / PANDN </c> instruction.
		2622	///
		2623	/// \param __a
		2624	/// A 128-bit vector containing the left source operand. The one's complement
		2625	/// of this value is used in the bitwise AND.
		2626	/// \param __b
		2627	/// A 128-bit vector containing the right source operand.
		2628	/// \returns A 128-bit integer vector containing the bitwise AND of the one's
		2629	/// complement of the first operand and the values in the second operand.
		2630	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_andnot_si128(__m128i __a,
		2631	__m128i __b) {
		2632	return (__m128i)(~(__v2du)__a & (__v2du)__b);
		2633	}
		2634	/// Performs a bitwise OR of two 128-bit integer vectors.
		2635	///
		2636	/// \headerfile <x86intrin.h>
		2637	///
		2638	/// This intrinsic corresponds to the <c> VPOR / POR </c> instruction.
		2639	///
		2640	/// \param __a
		2641	/// A 128-bit integer vector containing one of the source operands.
		2642	/// \param __b
		2643	/// A 128-bit integer vector containing one of the source operands.
		2644	/// \returns A 128-bit integer vector containing the bitwise OR of the values
		2645	/// in both operands.
		2646	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_or_si128(__m128i __a,
		2647	__m128i __b) {
		2648	return (__m128i)((__v2du)__a | (__v2du)__b);
		2649	}
		2650
		2651	/// Performs a bitwise exclusive OR of two 128-bit integer vectors.
		2652	///
		2653	/// \headerfile <x86intrin.h>
		2654	///
		2655	/// This intrinsic corresponds to the <c> VPXOR / PXOR </c> instruction.
		2656	///
		2657	/// \param __a
		2658	/// A 128-bit integer vector containing one of the source operands.
		2659	/// \param __b
		2660	/// A 128-bit integer vector containing one of the source operands.
		2661	/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the
		2662	/// values in both operands.
		2663	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_xor_si128(__m128i __a,
		2664	__m128i __b) {
		2665	return (__m128i)((__v2du)__a ^ (__v2du)__b);
		2666	}
		2667
		2668	/// Left-shifts the 128-bit integer vector operand by the specified
		2669	/// number of bytes. Low-order bits are cleared.
		2670	///
		2671	/// \headerfile <x86intrin.h>
		2672	///
		2673	/// \code
		2674	/// __m128i _mm_slli_si128(__m128i a, const int imm);
		2675	/// \endcode
		2676	///
		2677	/// This intrinsic corresponds to the <c> VPSLLDQ / PSLLDQ </c> instruction.
		2678	///
		2679	/// \param a
		2680	/// A 128-bit integer vector containing the source operand.
		2681	/// \param imm
		2682	/// An immediate value specifying the number of bytes to left-shift operand
		2683	/// \a a.
		2684	/// \returns A 128-bit integer vector containing the left-shifted value.
		2685	#define _mm_slli_si128(a, imm) \
		2686	((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
		2687	(int)(imm)))
		2688
		2689	#define _mm_bslli_si128(a, imm) \
		2690	((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), \
		2691	(int)(imm)))
		2692
		2693	/// Left-shifts each 16-bit value in the 128-bit integer vector operand
		2694	/// by the specified number of bits. Low-order bits are cleared.
		2695	///
		2696	/// \headerfile <x86intrin.h>
		2697	///
		2698	/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
		2699	///
		2700	/// \param __a
		2701	/// A 128-bit integer vector containing the source operand.
		2702	/// \param __count
		2703	/// An integer value specifying the number of bits to left-shift each value
		2704	/// in operand \a __a.
		2705	/// \returns A 128-bit integer vector containing the left-shifted values.
		2706	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi16(__m128i __a,
		2707	int __count) {
		2708	return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
		2709	}
		2710
		2711	/// Left-shifts each 16-bit value in the 128-bit integer vector operand
		2712	/// by the specified number of bits. Low-order bits are cleared.
		2713	///
		2714	/// \headerfile <x86intrin.h>
		2715	///
		2716	/// This intrinsic corresponds to the <c> VPSLLW / PSLLW </c> instruction.
		2717	///
		2718	/// \param __a
		2719	/// A 128-bit integer vector containing the source operand.
		2720	/// \param __count
		2721	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
		2722	/// to left-shift each value in operand \a __a.
		2723	/// \returns A 128-bit integer vector containing the left-shifted values.
		2724	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi16(__m128i __a,
		2725	__m128i __count) {
		2726	return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
		2727	}
		2728
		2729	/// Left-shifts each 32-bit value in the 128-bit integer vector operand
		2730	/// by the specified number of bits. Low-order bits are cleared.
		2731	///
		2732	/// \headerfile <x86intrin.h>
		2733	///
		2734	/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
		2735	///
		2736	/// \param __a
		2737	/// A 128-bit integer vector containing the source operand.
		2738	/// \param __count
		2739	/// An integer value specifying the number of bits to left-shift each value
		2740	/// in operand \a __a.
		2741	/// \returns A 128-bit integer vector containing the left-shifted values.
		2742	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi32(__m128i __a,
		2743	int __count) {
		2744	return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
		2745	}
		2746
		2747	/// Left-shifts each 32-bit value in the 128-bit integer vector operand
		2748	/// by the specified number of bits. Low-order bits are cleared.
		2749	///
		2750	/// \headerfile <x86intrin.h>
		2751	///
		2752	/// This intrinsic corresponds to the <c> VPSLLD / PSLLD </c> instruction.
		2753	///
		2754	/// \param __a
		2755	/// A 128-bit integer vector containing the source operand.
		2756	/// \param __count
		2757	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
		2758	/// to left-shift each value in operand \a __a.
		2759	/// \returns A 128-bit integer vector containing the left-shifted values.
		2760	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi32(__m128i __a,
		2761	__m128i __count) {
		2762	return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
		2763	}
		2764
		2765	/// Left-shifts each 64-bit value in the 128-bit integer vector operand
		2766	/// by the specified number of bits. Low-order bits are cleared.
		2767	///
		2768	/// \headerfile <x86intrin.h>
		2769	///
		2770	/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
		2771	///
		2772	/// \param __a
		2773	/// A 128-bit integer vector containing the source operand.
		2774	/// \param __count
		2775	/// An integer value specifying the number of bits to left-shift each value
		2776	/// in operand \a __a.
		2777	/// \returns A 128-bit integer vector containing the left-shifted values.
		2778	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_slli_epi64(__m128i __a,
		2779	int __count) {
		2780	return __builtin_ia32_psllqi128((__v2di)__a, __count);
		2781	}
		2782
		2783	/// Left-shifts each 64-bit value in the 128-bit integer vector operand
		2784	/// by the specified number of bits. Low-order bits are cleared.
		2785	///
		2786	/// \headerfile <x86intrin.h>
		2787	///
		2788	/// This intrinsic corresponds to the <c> VPSLLQ / PSLLQ </c> instruction.
		2789	///
		2790	/// \param __a
		2791	/// A 128-bit integer vector containing the source operand.
		2792	/// \param __count
		2793	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
		2794	/// to left-shift each value in operand \a __a.
		2795	/// \returns A 128-bit integer vector containing the left-shifted values.
		2796	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sll_epi64(__m128i __a,
		2797	__m128i __count) {
		2798	return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count);
		2799	}
		2800
		2801	/// Right-shifts each 16-bit value in the 128-bit integer vector operand
		2802	/// by the specified number of bits. High-order bits are filled with the sign
		2803	/// bit of the initial value.
		2804	///
		2805	/// \headerfile <x86intrin.h>
		2806	///
		2807	/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
		2808	///
		2809	/// \param __a
		2810	/// A 128-bit integer vector containing the source operand.
		2811	/// \param __count
		2812	/// An integer value specifying the number of bits to right-shift each value
		2813	/// in operand \a __a.
		2814	/// \returns A 128-bit integer vector containing the right-shifted values.
		2815	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi16(__m128i __a,
		2816	int __count) {
		2817	return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
		2818	}
		2819
		2820	/// Right-shifts each 16-bit value in the 128-bit integer vector operand
		2821	/// by the specified number of bits. High-order bits are filled with the sign
		2822	/// bit of the initial value.
		2823	///
		2824	/// \headerfile <x86intrin.h>
		2825	///
		2826	/// This intrinsic corresponds to the <c> VPSRAW / PSRAW </c> instruction.
		2827	///
		2828	/// \param __a
		2829	/// A 128-bit integer vector containing the source operand.
		2830	/// \param __count
		2831	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
		2832	/// to right-shift each value in operand \a __a.
		2833	/// \returns A 128-bit integer vector containing the right-shifted values.
		2834	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi16(__m128i __a,
		2835	__m128i __count) {
		2836	return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
		2837	}
		2838
		2839	/// Right-shifts each 32-bit value in the 128-bit integer vector operand
		2840	/// by the specified number of bits. High-order bits are filled with the sign
		2841	/// bit of the initial value.
		2842	///
		2843	/// \headerfile <x86intrin.h>
		2844	///
		2845	/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
		2846	///
		2847	/// \param __a
		2848	/// A 128-bit integer vector containing the source operand.
		2849	/// \param __count
		2850	/// An integer value specifying the number of bits to right-shift each value
		2851	/// in operand \a __a.
		2852	/// \returns A 128-bit integer vector containing the right-shifted values.
		2853	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srai_epi32(__m128i __a,
		2854	int __count) {
		2855	return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
		2856	}
		2857
		2858	/// Right-shifts each 32-bit value in the 128-bit integer vector operand
		2859	/// by the specified number of bits. High-order bits are filled with the sign
		2860	/// bit of the initial value.
		2861	///
		2862	/// \headerfile <x86intrin.h>
		2863	///
		2864	/// This intrinsic corresponds to the <c> VPSRAD / PSRAD </c> instruction.
		2865	///
		2866	/// \param __a
		2867	/// A 128-bit integer vector containing the source operand.
		2868	/// \param __count
		2869	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
		2870	/// to right-shift each value in operand \a __a.
		2871	/// \returns A 128-bit integer vector containing the right-shifted values.
		2872	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_sra_epi32(__m128i __a,
		2873	__m128i __count) {
		2874	return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
		2875	}
		2876
		2877	/// Right-shifts the 128-bit integer vector operand by the specified
		2878	/// number of bytes. High-order bits are cleared.
		2879	///
		2880	/// \headerfile <x86intrin.h>
		2881	///
		2882	/// \code
		2883	/// __m128i _mm_srli_si128(__m128i a, const int imm);
		2884	/// \endcode
		2885	///
		2886	/// This intrinsic corresponds to the <c> VPSRLDQ / PSRLDQ </c> instruction.
		2887	///
		2888	/// \param a
		2889	/// A 128-bit integer vector containing the source operand.
		2890	/// \param imm
		2891	/// An immediate value specifying the number of bytes to right-shift operand
		2892	/// \a a.
		2893	/// \returns A 128-bit integer vector containing the right-shifted value.
		2894	#define _mm_srli_si128(a, imm) \
		2895	((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
		2896	(int)(imm)))
		2897
		2898	#define _mm_bsrli_si128(a, imm) \
		2899	((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), \
		2900	(int)(imm)))
		2901
		2902	/// Right-shifts each of 16-bit values in the 128-bit integer vector
		2903	/// operand by the specified number of bits. High-order bits are cleared.
		2904	///
		2905	/// \headerfile <x86intrin.h>
		2906	///
		2907	/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
		2908	///
		2909	/// \param __a
		2910	/// A 128-bit integer vector containing the source operand.
		2911	/// \param __count
		2912	/// An integer value specifying the number of bits to right-shift each value
		2913	/// in operand \a __a.
		2914	/// \returns A 128-bit integer vector containing the right-shifted values.
		2915	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi16(__m128i __a,
		2916	int __count) {
		2917	return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
		2918	}
		2919
		2920	/// Right-shifts each of 16-bit values in the 128-bit integer vector
		2921	/// operand by the specified number of bits. High-order bits are cleared.
		2922	///
		2923	/// \headerfile <x86intrin.h>
		2924	///
		2925	/// This intrinsic corresponds to the <c> VPSRLW / PSRLW </c> instruction.
		2926	///
		2927	/// \param __a
		2928	/// A 128-bit integer vector containing the source operand.
		2929	/// \param __count
		2930	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
		2931	/// to right-shift each value in operand \a __a.
		2932	/// \returns A 128-bit integer vector containing the right-shifted values.
		2933	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi16(__m128i __a,
		2934	__m128i __count) {
		2935	return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
		2936	}
		2937
		2938	/// Right-shifts each of 32-bit values in the 128-bit integer vector
		2939	/// operand by the specified number of bits. High-order bits are cleared.
		2940	///
		2941	/// \headerfile <x86intrin.h>
		2942	///
		2943	/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
		2944	///
		2945	/// \param __a
		2946	/// A 128-bit integer vector containing the source operand.
		2947	/// \param __count
		2948	/// An integer value specifying the number of bits to right-shift each value
		2949	/// in operand \a __a.
		2950	/// \returns A 128-bit integer vector containing the right-shifted values.
		2951	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi32(__m128i __a,
		2952	int __count) {
		2953	return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
		2954	}
		2955
		2956	/// Right-shifts each of 32-bit values in the 128-bit integer vector
		2957	/// operand by the specified number of bits. High-order bits are cleared.
		2958	///
		2959	/// \headerfile <x86intrin.h>
		2960	///
		2961	/// This intrinsic corresponds to the <c> VPSRLD / PSRLD </c> instruction.
		2962	///
		2963	/// \param __a
		2964	/// A 128-bit integer vector containing the source operand.
		2965	/// \param __count
		2966	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
		2967	/// to right-shift each value in operand \a __a.
		2968	/// \returns A 128-bit integer vector containing the right-shifted values.
		2969	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi32(__m128i __a,
		2970	__m128i __count) {
		2971	return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
		2972	}
		2973
		2974	/// Right-shifts each of 64-bit values in the 128-bit integer vector
		2975	/// operand by the specified number of bits. High-order bits are cleared.
		2976	///
		2977	/// \headerfile <x86intrin.h>
		2978	///
		2979	/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
		2980	///
		2981	/// \param __a
		2982	/// A 128-bit integer vector containing the source operand.
		2983	/// \param __count
		2984	/// An integer value specifying the number of bits to right-shift each value
		2985	/// in operand \a __a.
		2986	/// \returns A 128-bit integer vector containing the right-shifted values.
		2987	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srli_epi64(__m128i __a,
		2988	int __count) {
		2989	return __builtin_ia32_psrlqi128((__v2di)__a, __count);
		2990	}
		2991
		2992	/// Right-shifts each of 64-bit values in the 128-bit integer vector
		2993	/// operand by the specified number of bits. High-order bits are cleared.
		2994	///
		2995	/// \headerfile <x86intrin.h>
		2996	///
		2997	/// This intrinsic corresponds to the <c> VPSRLQ / PSRLQ </c> instruction.
		2998	///
		2999	/// \param __a
		3000	/// A 128-bit integer vector containing the source operand.
		3001	/// \param __count
		3002	/// A 128-bit integer vector in which bits [63:0] specify the number of bits
		3003	/// to right-shift each value in operand \a __a.
		3004	/// \returns A 128-bit integer vector containing the right-shifted values.
		3005	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
		3006	__m128i __count) {
		3007	return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count);
		3008	}
		3009
		3010	/// Compares each of the corresponding 8-bit values of the 128-bit
		3011	/// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF
		3012	/// for true.
		3013	///
		3014	/// \headerfile <x86intrin.h>
		3015	///
		3016	/// This intrinsic corresponds to the <c> VPCMPEQB / PCMPEQB </c> instruction.
		3017	///
		3018	/// \param __a
		3019	/// A 128-bit integer vector.
		3020	/// \param __b
		3021	/// A 128-bit integer vector.
		3022	/// \returns A 128-bit integer vector containing the comparison results.
		3023	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
		3024	__m128i __b) {
		3025	return (__m128i)((__v16qi)__a == (__v16qi)__b);
		3026	}
		3027
		3028	/// Compares each of the corresponding 16-bit values of the 128-bit
		3029	/// integer vectors for equality. Each comparison yields 0x0 for false,
		3030	/// 0xFFFF for true.
		3031	///
		3032	/// \headerfile <x86intrin.h>
		3033	///
		3034	/// This intrinsic corresponds to the <c> VPCMPEQW / PCMPEQW </c> instruction.
		3035	///
		3036	/// \param __a
		3037	/// A 128-bit integer vector.
		3038	/// \param __b
		3039	/// A 128-bit integer vector.
		3040	/// \returns A 128-bit integer vector containing the comparison results.
		3041	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
		3042	__m128i __b) {
		3043	return (__m128i)((__v8hi)__a == (__v8hi)__b);
		3044	}
		3045
		3046	/// Compares each of the corresponding 32-bit values of the 128-bit
		3047	/// integer vectors for equality. Each comparison yields 0x0 for false,
		3048	/// 0xFFFFFFFF for true.
		3049	///
		3050	/// \headerfile <x86intrin.h>
		3051	///
		3052	/// This intrinsic corresponds to the <c> VPCMPEQD / PCMPEQD </c> instruction.
		3053	///
		3054	/// \param __a
		3055	/// A 128-bit integer vector.
		3056	/// \param __b
		3057	/// A 128-bit integer vector.
		3058	/// \returns A 128-bit integer vector containing the comparison results.
		3059	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
		3060	__m128i __b) {
		3061	return (__m128i)((__v4si)__a == (__v4si)__b);
		3062	}
		3063
		3064	/// Compares each of the corresponding signed 8-bit values of the 128-bit
		3065	/// integer vectors to determine if the values in the first operand are
		3066	/// greater than those in the second operand. Each comparison yields 0x0 for
		3067	/// false, 0xFF for true.
		3068	///
		3069	/// \headerfile <x86intrin.h>
		3070	///
		3071	/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
		3072	///
		3073	/// \param __a
		3074	/// A 128-bit integer vector.
		3075	/// \param __b
		3076	/// A 128-bit integer vector.
		3077	/// \returns A 128-bit integer vector containing the comparison results.
		3078	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
		3079	__m128i __b) {
		3080	/* This function always performs a signed comparison, but __v16qi is a char
		3081	which may be signed or unsigned, so use __v16qs. */
		3082	return (__m128i)((__v16qs)__a > (__v16qs)__b);
		3083	}
		3084
		3085	/// Compares each of the corresponding signed 16-bit values of the
		3086	/// 128-bit integer vectors to determine if the values in the first operand
		3087	/// are greater than those in the second operand.
		3088	///
		3089	/// Each comparison yields 0x0 for false, 0xFFFF for true.
		3090	///
		3091	/// \headerfile <x86intrin.h>
		3092	///
		3093	/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
		3094	///
		3095	/// \param __a
		3096	/// A 128-bit integer vector.
		3097	/// \param __b
		3098	/// A 128-bit integer vector.
		3099	/// \returns A 128-bit integer vector containing the comparison results.
		3100	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
		3101	__m128i __b) {
		3102	return (__m128i)((__v8hi)__a > (__v8hi)__b);
		3103	}
		3104
		3105	/// Compares each of the corresponding signed 32-bit values of the
		3106	/// 128-bit integer vectors to determine if the values in the first operand
		3107	/// are greater than those in the second operand.
		3108	///
		3109	/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
		3110	///
		3111	/// \headerfile <x86intrin.h>
		3112	///
		3113	/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
		3114	///
		3115	/// \param __a
		3116	/// A 128-bit integer vector.
		3117	/// \param __b
		3118	/// A 128-bit integer vector.
		3119	/// \returns A 128-bit integer vector containing the comparison results.
		3120	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
		3121	__m128i __b) {
		3122	return (__m128i)((__v4si)__a > (__v4si)__b);
		3123	}
		3124
		3125	/// Compares each of the corresponding signed 8-bit values of the 128-bit
		3126	/// integer vectors to determine if the values in the first operand are less
		3127	/// than those in the second operand.
		3128	///
		3129	/// Each comparison yields 0x0 for false, 0xFF for true.
		3130	///
		3131	/// \headerfile <x86intrin.h>
		3132	///
		3133	/// This intrinsic corresponds to the <c> VPCMPGTB / PCMPGTB </c> instruction.
		3134	///
		3135	/// \param __a
		3136	/// A 128-bit integer vector.
		3137	/// \param __b
		3138	/// A 128-bit integer vector.
		3139	/// \returns A 128-bit integer vector containing the comparison results.
		3140	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
		3141	__m128i __b) {
		3142	return _mm_cmpgt_epi8(__b, __a);
		3143	}
		3144
		3145	/// Compares each of the corresponding signed 16-bit values of the
		3146	/// 128-bit integer vectors to determine if the values in the first operand
		3147	/// are less than those in the second operand.
		3148	///
		3149	/// Each comparison yields 0x0 for false, 0xFFFF for true.
		3150	///
		3151	/// \headerfile <x86intrin.h>
		3152	///
		3153	/// This intrinsic corresponds to the <c> VPCMPGTW / PCMPGTW </c> instruction.
		3154	///
		3155	/// \param __a
		3156	/// A 128-bit integer vector.
		3157	/// \param __b
		3158	/// A 128-bit integer vector.
		3159	/// \returns A 128-bit integer vector containing the comparison results.
		3160	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
		3161	__m128i __b) {
		3162	return _mm_cmpgt_epi16(__b, __a);
		3163	}
		3164
		3165	/// Compares each of the corresponding signed 32-bit values of the
		3166	/// 128-bit integer vectors to determine if the values in the first operand
		3167	/// are less than those in the second operand.
		3168	///
		3169	/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true.
		3170	///
		3171	/// \headerfile <x86intrin.h>
		3172	///
		3173	/// This intrinsic corresponds to the <c> VPCMPGTD / PCMPGTD </c> instruction.
		3174	///
		3175	/// \param __a
		3176	/// A 128-bit integer vector.
		3177	/// \param __b
		3178	/// A 128-bit integer vector.
		3179	/// \returns A 128-bit integer vector containing the comparison results.
		3180	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
		3181	__m128i __b) {
		3182	return _mm_cmpgt_epi32(__b, __a);
		3183	}
		3184
		3185	#ifdef __x86_64__
		3186	/// Converts a 64-bit signed integer value from the second operand into a
		3187	/// double-precision value and returns it in the lower element of a [2 x
		3188	/// double] vector; the upper element of the returned vector is copied from
		3189	/// the upper element of the first operand.
		3190	///
		3191	/// \headerfile <x86intrin.h>
		3192	///
		3193	/// This intrinsic corresponds to the <c> VCVTSI2SD / CVTSI2SD </c> instruction.
		3194	///
		3195	/// \param __a
		3196	/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are
		3197	/// copied to the upper 64 bits of the destination.
		3198	/// \param __b
		3199	/// A 64-bit signed integer operand containing the value to be converted.
		3200	/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the
		3201	/// converted value of the second operand. The upper 64 bits are copied from
		3202	/// the upper 64 bits of the first operand.
		3203	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_cvtsi64_sd(__m128d __a,
		3204	long long __b) {
		3205	__a[0] = __b;
		3206	return __a;
		3207	}
		3208
		3209	/// Converts the first (lower) element of a vector of [2 x double] into a
		3210	/// 64-bit signed integer value, according to the current rounding mode.
		3211	///
		3212	/// \headerfile <x86intrin.h>
		3213	///
		3214	/// This intrinsic corresponds to the <c> VCVTSD2SI / CVTSD2SI </c> instruction.
		3215	///
		3216	/// \param __a
		3217	/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
		3218	/// conversion.
		3219	/// \returns A 64-bit signed integer containing the converted value.
		3220	static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsd_si64(__m128d __a) {
		3221	return __builtin_ia32_cvtsd2si64((__v2df)__a);
		3222	}
		3223
		3224	/// Converts the first (lower) element of a vector of [2 x double] into a
		3225	/// 64-bit signed integer value, truncating the result when it is inexact.
		3226	///
		3227	/// \headerfile <x86intrin.h>
		3228	///
		3229	/// This intrinsic corresponds to the <c> VCVTTSD2SI / CVTTSD2SI </c>
		3230	/// instruction.
		3231	///
		3232	/// \param __a
		3233	/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the
		3234	/// conversion.
		3235	/// \returns A 64-bit signed integer containing the converted value.
		3236	static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvttsd_si64(__m128d __a) {
		3237	return __builtin_ia32_cvttsd2si64((__v2df)__a);
		3238	}
		3239	#endif
		3240
		3241	/// Converts a vector of [4 x i32] into a vector of [4 x float].
		3242	///
		3243	/// \headerfile <x86intrin.h>
		3244	///
		3245	/// This intrinsic corresponds to the <c> VCVTDQ2PS / CVTDQ2PS </c> instruction.
		3246	///
		3247	/// \param __a
		3248	/// A 128-bit integer vector.
		3249	/// \returns A 128-bit vector of [4 x float] containing the converted values.
		3250	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_cvtepi32_ps(__m128i __a) {
		3251	return (__m128) __builtin_convertvector((__v4si)__a, __v4sf);
		3252	}
		3253
		3254	/// Converts a vector of [4 x float] into a vector of [4 x i32].
		3255	///
		3256	/// \headerfile <x86intrin.h>
		3257	///
		3258	/// This intrinsic corresponds to the <c> VCVTPS2DQ / CVTPS2DQ </c> instruction.
		3259	///
		3260	/// \param __a
		3261	/// A 128-bit vector of [4 x float].
		3262	/// \returns A 128-bit integer vector of [4 x i32] containing the converted
		3263	/// values.
		3264	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtps_epi32(__m128 __a) {
		3265	return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a);
		3266	}
		3267
		3268	/// Converts a vector of [4 x float] into a vector of [4 x i32],
		3269	/// truncating the result when it is inexact.
		3270	///
		3271	/// \headerfile <x86intrin.h>
		3272	///
		3273	/// This intrinsic corresponds to the <c> VCVTTPS2DQ / CVTTPS2DQ </c>
		3274	/// instruction.
		3275	///
		3276	/// \param __a
		3277	/// A 128-bit vector of [4 x float].
		3278	/// \returns A 128-bit vector of [4 x i32] containing the converted values.
		3279	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvttps_epi32(__m128 __a) {
		3280	return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a);
		3281	}
		3282
		3283	/// Returns a vector of [4 x i32] where the lowest element is the input
		3284	/// operand and the remaining elements are zero.
		3285	///
		3286	/// \headerfile <x86intrin.h>
		3287	///
		3288	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
		3289	///
		3290	/// \param __a
		3291	/// A 32-bit signed integer operand.
		3292	/// \returns A 128-bit vector of [4 x i32].
		3293	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi32_si128(int __a) {
		3294	return __extension__(__m128i)(__v4si){__a, 0, 0, 0};
		3295	}
		3296
		3297	/// Returns a vector of [2 x i64] where the lower element is the input
		3298	/// operand and the upper element is zero.
		3299	///
		3300	/// \headerfile <x86intrin.h>
		3301	///
		3302	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction
		3303	/// in 64-bit mode.
		3304	///
		3305	/// \param __a
		3306	/// A 64-bit signed integer operand containing the value to be converted.
		3307	/// \returns A 128-bit vector of [2 x i64] containing the converted value.
		3308	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtsi64_si128(long long __a) {
		3309	return __extension__(__m128i)(__v2di){__a, 0};
		3310	}
		3311
		3312	/// Moves the least significant 32 bits of a vector of [4 x i32] to a
		3313	/// 32-bit signed integer value.
		3314	///
		3315	/// \headerfile <x86intrin.h>
		3316	///
		3317	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
		3318	///
		3319	/// \param __a
		3320	/// A vector of [4 x i32]. The least significant 32 bits are moved to the
		3321	/// destination.
		3322	/// \returns A 32-bit signed integer containing the moved value.
		3323	static __inline__ int __DEFAULT_FN_ATTRS _mm_cvtsi128_si32(__m128i __a) {
		3324	__v4si __b = (__v4si)__a;
		3325	return __b[0];
		3326	}
		3327
		3328	/// Moves the least significant 64 bits of a vector of [2 x i64] to a
		3329	/// 64-bit signed integer value.
		3330	///
		3331	/// \headerfile <x86intrin.h>
		3332	///
		3333	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
		3334	///
		3335	/// \param __a
		3336	/// A vector of [2 x i64]. The least significant 64 bits are moved to the
		3337	/// destination.
		3338	/// \returns A 64-bit signed integer containing the moved value.
		3339	static __inline__ long long __DEFAULT_FN_ATTRS _mm_cvtsi128_si64(__m128i __a) {
		3340	return __a[0];
		3341	}
		3342
		3343	/// Moves packed integer values from an aligned 128-bit memory location
		3344	/// to elements in a 128-bit integer vector.
		3345	///
		3346	/// \headerfile <x86intrin.h>
		3347	///
		3348	/// This intrinsic corresponds to the <c> VMOVDQA / MOVDQA </c> instruction.
		3349	///
		3350	/// \param __p
		3351	/// An aligned pointer to a memory location containing integer values.
		3352	/// \returns A 128-bit integer vector containing the moved values.
		3353	static __inline__ __m128i __DEFAULT_FN_ATTRS
		3354	_mm_load_si128(__m128i const *__p) {
		3355	return *__p;
		3356	}
		3357
		3358	/// Moves packed integer values from an unaligned 128-bit memory location
		3359	/// to elements in a 128-bit integer vector.
		3360	///
		3361	/// \headerfile <x86intrin.h>
		3362	///
		3363	/// This intrinsic corresponds to the <c> VMOVDQU / MOVDQU </c> instruction.
		3364	///
		3365	/// \param __p
		3366	/// A pointer to a memory location containing integer values.
		3367	/// \returns A 128-bit integer vector containing the moved values.
		3368	static __inline__ __m128i __DEFAULT_FN_ATTRS
		3369	_mm_loadu_si128(__m128i_u const *__p) {
		3370	struct __loadu_si128 {
		3371	__m128i_u __v;
		3372	} __attribute__((__packed__, __may_alias__));
		3373	return ((const struct __loadu_si128 *)__p)->__v;
		3374	}
		3375
		3376	/// Returns a vector of [2 x i64] where the lower element is taken from
		3377	/// the lower element of the operand, and the upper element is zero.
		3378	///
		3379	/// \headerfile <x86intrin.h>
		3380	///
		3381	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
		3382	///
		3383	/// \param __p
		3384	/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of
		3385	/// the destination.
		3386	/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the
		3387	/// moved value. The higher order bits are cleared.
		3388	static __inline__ __m128i __DEFAULT_FN_ATTRS
		3389	_mm_loadl_epi64(__m128i_u const *__p) {
		3390	struct __mm_loadl_epi64_struct {
		3391	long long __u;
		3392	} __attribute__((__packed__, __may_alias__));
		3393	return __extension__(__m128i){
		3394	((const struct __mm_loadl_epi64_struct *)__p)->__u, 0};
		3395	}
		3396
		3397	/// Generates a 128-bit vector of [4 x i32] with unspecified content.
		3398	/// This could be used as an argument to another intrinsic function where the
		3399	/// argument is required but the value is not actually used.
		3400	///
		3401	/// \headerfile <x86intrin.h>
		3402	///
		3403	/// This intrinsic has no corresponding instruction.
		3404	///
		3405	/// \returns A 128-bit vector of [4 x i32] with unspecified content.
		3406	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_undefined_si128(void) {
		3407	return (__m128i)__builtin_ia32_undef128();
		3408	}
		3409
		3410	/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
		3411	/// the specified 64-bit integer values.
		3412	///
		3413	/// \headerfile <x86intrin.h>
		3414	///
		3415	/// This intrinsic is a utility function and does not correspond to a specific
		3416	/// instruction.
		3417	///
		3418	/// \param __q1
		3419	/// A 64-bit integer value used to initialize the upper 64 bits of the
		3420	/// destination vector of [2 x i64].
		3421	/// \param __q0
		3422	/// A 64-bit integer value used to initialize the lower 64 bits of the
		3423	/// destination vector of [2 x i64].
		3424	/// \returns An initialized 128-bit vector of [2 x i64] containing the values
		3425	/// provided in the operands.
		3426	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64x(long long __q1,
		3427	long long __q0) {
		3428	return __extension__(__m128i)(__v2di){__q0, __q1};
		3429	}
		3430
		3431	/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with
		3432	/// the specified 64-bit integer values.
		3433	///
		3434	/// \headerfile <x86intrin.h>
		3435	///
		3436	/// This intrinsic is a utility function and does not correspond to a specific
		3437	/// instruction.
		3438	///
		3439	/// \param __q1
		3440	/// A 64-bit integer value used to initialize the upper 64 bits of the
		3441	/// destination vector of [2 x i64].
		3442	/// \param __q0
		3443	/// A 64-bit integer value used to initialize the lower 64 bits of the
		3444	/// destination vector of [2 x i64].
		3445	/// \returns An initialized 128-bit vector of [2 x i64] containing the values
		3446	/// provided in the operands.
		3447	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi64(__m64 __q1,
		3448	__m64 __q0) {
		3449	return _mm_set_epi64x((long long)__q1, (long long)__q0);
		3450	}
		3451
		3452	/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with
		3453	/// the specified 32-bit integer values.
		3454	///
		3455	/// \headerfile <x86intrin.h>
		3456	///
		3457	/// This intrinsic is a utility function and does not correspond to a specific
		3458	/// instruction.
		3459	///
		3460	/// \param __i3
		3461	/// A 32-bit integer value used to initialize bits [127:96] of the
		3462	/// destination vector.
		3463	/// \param __i2
		3464	/// A 32-bit integer value used to initialize bits [95:64] of the destination
		3465	/// vector.
		3466	/// \param __i1
		3467	/// A 32-bit integer value used to initialize bits [63:32] of the destination
		3468	/// vector.
		3469	/// \param __i0
		3470	/// A 32-bit integer value used to initialize bits [31:0] of the destination
		3471	/// vector.
		3472	/// \returns An initialized 128-bit vector of [4 x i32] containing the values
		3473	/// provided in the operands.
		3474	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set_epi32(int __i3, int __i2,
		3475	int __i1, int __i0) {
		3476	return __extension__(__m128i)(__v4si){__i0, __i1, __i2, __i3};
		3477	}
		3478
		3479	/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with
		3480	/// the specified 16-bit integer values.
		3481	///
		3482	/// \headerfile <x86intrin.h>
		3483	///
		3484	/// This intrinsic is a utility function and does not correspond to a specific
		3485	/// instruction.
		3486	///
		3487	/// \param __w7
		3488	/// A 16-bit integer value used to initialize bits [127:112] of the
		3489	/// destination vector.
		3490	/// \param __w6
		3491	/// A 16-bit integer value used to initialize bits [111:96] of the
		3492	/// destination vector.
		3493	/// \param __w5
		3494	/// A 16-bit integer value used to initialize bits [95:80] of the destination
		3495	/// vector.
		3496	/// \param __w4
		3497	/// A 16-bit integer value used to initialize bits [79:64] of the destination
		3498	/// vector.
		3499	/// \param __w3
		3500	/// A 16-bit integer value used to initialize bits [63:48] of the destination
		3501	/// vector.
		3502	/// \param __w2
		3503	/// A 16-bit integer value used to initialize bits [47:32] of the destination
		3504	/// vector.
		3505	/// \param __w1
		3506	/// A 16-bit integer value used to initialize bits [31:16] of the destination
		3507	/// vector.
		3508	/// \param __w0
		3509	/// A 16-bit integer value used to initialize bits [15:0] of the destination
		3510	/// vector.
		3511	/// \returns An initialized 128-bit vector of [8 x i16] containing the values
		3512	/// provided in the operands.
		3513	static __inline__ __m128i __DEFAULT_FN_ATTRS
		3514	_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3,
		3515	short __w2, short __w1, short __w0) {
		3516	return __extension__(__m128i)(__v8hi){__w0, __w1, __w2, __w3,
		3517	__w4, __w5, __w6, __w7};
		3518	}
		3519
		3520	/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with
		3521	/// the specified 8-bit integer values.
		3522	///
		3523	/// \headerfile <x86intrin.h>
		3524	///
		3525	/// This intrinsic is a utility function and does not correspond to a specific
		3526	/// instruction.
		3527	///
		3528	/// \param __b15
		3529	/// Initializes bits [127:120] of the destination vector.
		3530	/// \param __b14
		3531	/// Initializes bits [119:112] of the destination vector.
		3532	/// \param __b13
		3533	/// Initializes bits [111:104] of the destination vector.
		3534	/// \param __b12
		3535	/// Initializes bits [103:96] of the destination vector.
		3536	/// \param __b11
		3537	/// Initializes bits [95:88] of the destination vector.
		3538	/// \param __b10
		3539	/// Initializes bits [87:80] of the destination vector.
		3540	/// \param __b9
		3541	/// Initializes bits [79:72] of the destination vector.
		3542	/// \param __b8
		3543	/// Initializes bits [71:64] of the destination vector.
		3544	/// \param __b7
		3545	/// Initializes bits [63:56] of the destination vector.
		3546	/// \param __b6
		3547	/// Initializes bits [55:48] of the destination vector.
		3548	/// \param __b5
		3549	/// Initializes bits [47:40] of the destination vector.
		3550	/// \param __b4
		3551	/// Initializes bits [39:32] of the destination vector.
		3552	/// \param __b3
		3553	/// Initializes bits [31:24] of the destination vector.
		3554	/// \param __b2
		3555	/// Initializes bits [23:16] of the destination vector.
		3556	/// \param __b1
		3557	/// Initializes bits [15:8] of the destination vector.
		3558	/// \param __b0
		3559	/// Initializes bits [7:0] of the destination vector.
		3560	/// \returns An initialized 128-bit vector of [16 x i8] containing the values
		3561	/// provided in the operands.
		3562	static __inline__ __m128i __DEFAULT_FN_ATTRS
		3563	_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11,
		3564	char __b10, char __b9, char __b8, char __b7, char __b6, char __b5,
		3565	char __b4, char __b3, char __b2, char __b1, char __b0) {
		3566	return __extension__(__m128i)(__v16qi){
		3567	__b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7,
		3568	__b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15};
		3569	}
		3570
		3571	/// Broadcasts a single 64-bit integer into both elements of a 128-bit
		3572	/// integer vector.
		3573	///
		3574	/// \headerfile <x86intrin.h>
		3575	///
		3576	/// This is a utility function; it does not correspond to a specific
		3577	/// instruction.
		3578	///
		3579	/// \param __q
		3580	/// The 64-bit integer copied into each element of the result vector.
		3581	/// \returns A 128-bit integer vector of [2 x i64] whose two elements both
		3582	/// hold the value of \a __q.
		3583	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64x(long long __q) {
		3584	  __m128i __splat = _mm_set_epi64x(__q, __q);
		3585	  return __splat;
		3586	}
		3587
		3588	/// Broadcasts a single 64-bit value into both elements of a 128-bit
		3589	/// vector of [2 x i64].
		3590	///
		3591	/// \headerfile <x86intrin.h>
		3592	///
		3593	/// This is a utility function; it does not correspond to a specific
		3594	/// instruction.
		3595	///
		3596	/// \param __q
		3597	/// The 64-bit value copied into each element of the result vector.
		3598	/// \returns A 128-bit vector of [2 x i64] whose two elements both hold the
		3599	/// value of \a __q.
		3600	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi64(__m64 __q) {
		3601	  __m128i __splat = _mm_set_epi64(__q, __q);
		3602	  return __splat;
		3603	}
		3604
		3605	/// Broadcasts a single 32-bit value into all four elements of a 128-bit
		3606	/// vector of [4 x i32].
		3607	///
		3608	/// \headerfile <x86intrin.h>
		3609	///
		3610	/// This is a utility function; it does not correspond to a specific
		3611	/// instruction.
		3612	///
		3613	/// \param __i
		3614	/// The 32-bit value copied into each element of the result vector.
		3615	/// \returns A 128-bit vector of [4 x i32] whose four elements all hold the
		3616	/// value of \a __i.
		3617	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi32(int __i) {
		3618	  __m128i __splat = _mm_set_epi32(__i, __i, __i, __i);
		3619	  return __splat;
		3620	}
		3621
		3622	/// Broadcasts a single 16-bit value into all eight elements of a 128-bit
		3623	/// vector of [8 x i16].
		3624	///
		3625	/// \headerfile <x86intrin.h>
		3626	///
		3627	/// This is a utility function; it does not correspond to a specific
		3628	/// instruction.
		3629	///
		3630	/// \param __w
		3631	/// The 16-bit value copied into each element of the result vector.
		3632	/// \returns A 128-bit vector of [8 x i16] whose eight elements all hold the
		3633	/// value of \a __w.
		3634	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi16(short __w) {
		3635	  __m128i __splat = _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w);
		3636	  return __splat;
		3637	}
		3638
		3639	/// Broadcasts a single 8-bit value into all sixteen elements of a 128-bit
		3640	/// vector of [16 x i8].
		3641	///
		3642	/// \headerfile <x86intrin.h>
		3643	///
		3644	/// This is a utility function; it does not correspond to a specific
		3645	/// instruction.
		3646	///
		3647	/// \param __b
		3648	/// The 8-bit value copied into each element of the result vector.
		3649	/// \returns A 128-bit vector of [16 x i8] whose sixteen elements all hold
		3650	/// the value of \a __b.
		3651	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_set1_epi8(char __b) {
		3652	  __m128i __splat = _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
		3653	                                 __b, __b, __b, __b, __b, __b, __b, __b);
		3654	  return __splat;
		3655	}
		3656
		3657	/// Builds a 128-bit integer vector from two 64-bit values given in reverse
		3658	/// order, i.e. with the lower half listed first.
		3659	///
		3660	/// \headerfile <x86intrin.h>
		3661	///
		3662	/// This intrinsic does not correspond to a specific instruction.
		3663	///
		3664	/// \param __q0
		3665	/// The 64-bit value placed in the lower 64 bits of the result.
		3666	/// \param __q1
		3667	/// The 64-bit value placed in the upper 64 bits of the result.
		3668	/// \returns A 128-bit integer vector holding \a __q0 in its lower half and
		3669	/// \a __q1 in its upper half.
		3670	static __inline__ __m128i __DEFAULT_FN_ATTRS
		3671	_mm_setr_epi64(__m64 __q0, __m64 __q1) {
		3672	  __m128i __vec = _mm_set_epi64(__q1, __q0);
		3673	  return __vec;
		3674	}
		3675
		3676	/// Builds a 128-bit integer vector from four 32-bit values given in reverse
		3677	/// order, i.e. with the lowest element listed first.
		3678	///
		3679	/// \headerfile <x86intrin.h>
		3680	///
		3681	/// This is a utility function; it does not correspond to a specific
		3682	/// instruction.
		3683	///
		3684	/// \param __i0
		3685	/// The 32-bit value placed in bits [31:0] of the result.
		3686	/// \param __i1
		3687	/// The 32-bit value placed in bits [63:32] of the result.
		3688	/// \param __i2
		3689	/// The 32-bit value placed in bits [95:64] of the result.
		3690	/// \param __i3
		3691	/// The 32-bit value placed in bits [127:96] of the result.
		3692	/// \returns The initialized 128-bit integer vector.
		3693	static __inline__ __m128i __DEFAULT_FN_ATTRS
		3694	_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) {
		3695	  __m128i __vec = _mm_set_epi32(__i3, __i2, __i1, __i0);
		3696	  return __vec;
		3697	}
		3698
		3699	/// Builds a 128-bit integer vector from eight 16-bit values given in
		3700	/// reverse order, i.e. with the lowest element listed first.
		3701	///
		3702	/// \headerfile <x86intrin.h>
		3703	///
		3704	/// This is a utility function with no specific corresponding instruction.
		3705	///
		3706	/// \param __w0
		3707	/// The 16-bit value placed in bits [15:0] of the result.
		3708	/// \param __w1
		3709	/// The 16-bit value placed in bits [31:16] of the result.
		3710	/// \param __w2
		3711	/// The 16-bit value placed in bits [47:32] of the result.
		3712	/// \param __w3
		3713	/// The 16-bit value placed in bits [63:48] of the result.
		3714	/// \param __w4
		3715	/// The 16-bit value placed in bits [79:64] of the result.
		3716	/// \param __w5
		3717	/// The 16-bit value placed in bits [95:80] of the result.
		3718	/// \param __w6
		3719	/// The 16-bit value placed in bits [111:96] of the result.
		3720	/// \param __w7
		3721	/// The 16-bit value placed in bits [127:112] of the result.
		3722	/// \returns The initialized 128-bit integer vector.
		3723	static __inline__ __m128i __DEFAULT_FN_ATTRS
		3724	_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4,
		3725	               short __w5, short __w6, short __w7) {
		3726	  __m128i __vec = _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0);
		3727	  return __vec;
		3728	}
		3729
		3730	/// Builds a 128-bit integer vector from sixteen 8-bit values, lowest first.
		3731	///
		3732	/// \headerfile <x86intrin.h>
		3733	///
		3734	/// This is a utility function with no specific corresponding instruction.
		3735	///
		3736	/// \param __b0
		3737	/// The 8-bit value placed in bits [7:0] of the result.
		3738	/// \param __b1
		3739	/// The 8-bit value placed in bits [15:8] of the result.
		3740	/// \param __b2
		3741	/// The 8-bit value placed in bits [23:16] of the result.
		3742	/// \param __b3
		3743	/// The 8-bit value placed in bits [31:24] of the result.
		3744	/// \param __b4
		3745	/// The 8-bit value placed in bits [39:32] of the result.
		3746	/// \param __b5
		3747	/// The 8-bit value placed in bits [47:40] of the result.
		3748	/// \param __b6
		3749	/// The 8-bit value placed in bits [55:48] of the result.
		3750	/// \param __b7
		3751	/// The 8-bit value placed in bits [63:56] of the result.
		3752	/// \param __b8
		3753	/// The 8-bit value placed in bits [71:64] of the result.
		3754	/// \param __b9
		3755	/// The 8-bit value placed in bits [79:72] of the result.
		3756	/// \param __b10
		3757	/// The 8-bit value placed in bits [87:80] of the result.
		3758	/// \param __b11
		3759	/// The 8-bit value placed in bits [95:88] of the result.
		3760	/// \param __b12
		3761	/// The 8-bit value placed in bits [103:96] of the result.
		3762	/// \param __b13
		3763	/// The 8-bit value placed in bits [111:104] of the result.
		3764	/// \param __b14
		3765	/// The 8-bit value placed in bits [119:112] of the result.
		3766	/// \param __b15
		3767	/// The 8-bit value placed in bits [127:120] of the result.
		3768	/// \returns The initialized 128-bit integer vector.
		3769	static __inline__ __m128i __DEFAULT_FN_ATTRS
		3770	_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
		3771	              char __b6, char __b7, char __b8, char __b9, char __b10,
		3772	              char __b11, char __b12, char __b13, char __b14, char __b15) {
		3773	  __m128i __vec =
		3774	      _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8,
		3775	                   __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
		3776	  return __vec;
		3777	}
		3778
		3779	/// Returns a 128-bit integer vector whose elements are all zero.
		3780	///
		3781	/// \headerfile <x86intrin.h>
		3782	///
		3783	/// This intrinsic corresponds to the <c> VXORPS / XORPS </c> instruction.
		3784	///
		3785	/// \returns A 128-bit integer vector with every element set to zero.
		3786	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_setzero_si128(void) {
		3787	  __v2di __zero = {0LL, 0LL};
		3788	  return (__m128i)__zero;
		3789	}
		3790
		3791	/// Writes a 128-bit integer vector to a memory location that is aligned on
		3792	/// a 128-bit boundary.
		3793	///
		3794	/// \headerfile <x86intrin.h>
		3795	///
		3796	/// This intrinsic corresponds to the <c> VMOVAPS / MOVAPS </c> instruction.
		3797	///
		3798	/// \param __p
		3799	/// Pointer to the aligned memory location that receives the integer
		3800	/// values.
		3801	/// \param __b
		3802	/// The 128-bit integer vector to be stored.
		3803	static __inline__ void __DEFAULT_FN_ATTRS
		3804	_mm_store_si128(__m128i *__p, __m128i __b) {
		3805	  __p[0] = __b;
		3806	}
		3807
		3808	/// Stores a 128-bit integer vector to a possibly unaligned memory location.
		3809	///
		3810	/// \headerfile <x86intrin.h>
		3811	///
		3812	/// This intrinsic corresponds to the <c> VMOVUPS / MOVUPS </c> instruction.
		3813	///
		3814	/// \param __p
		3815	/// A pointer to the destination memory. The address need not be aligned.
		3816	/// \param __b
		3817	/// A 128-bit integer vector containing the values to be stored.
		3818	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si128(__m128i_u *__p,
		3819	__m128i __b) {
		3820	struct __storeu_si128 { // packed + may_alias wrapper for the unaligned store
		3821	__m128i_u __v;
		3822	} __attribute__((__packed__, __may_alias__));
		3823	((struct __storeu_si128 *)__p)->__v = __b;
		3824	}
		3825
		3826	/// Stores the low 64-bit element (bits [63:0]) of a 128-bit integer vector
		3827	/// to memory.
		3828	///
		3829	/// \headerfile <x86intrin.h>
		3830	///
		3831	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
		3832	///
		3833	/// \param __p
		3834	/// A pointer to the 64-bit memory location that receives the value. The
		3835	/// address does not have to be aligned.
		3836	/// \param __b
		3837	/// A 128-bit integer vector whose low 64-bit element is stored.
		3838	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si64(void *__p,
		3839	__m128i __b) {
		3840	struct __storeu_si64 { // packed + may_alias wrapper for the unaligned store
		3841	long long __v;
		3842	} __attribute__((__packed__, __may_alias__));
		3843	((struct __storeu_si64 *)__p)->__v = ((__v2di)__b)[0];
		3844	}
		3845
		3846	/// Stores the low 32-bit element of a 128-bit integer vector to
		3847	/// memory.
		3848	///
		3849	/// \headerfile <x86intrin.h>
		3850	///
		3851	/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
		3852	///
		3853	/// \param __p
		3854	/// A pointer to the 32-bit memory location that receives the value. The
		3855	/// address does not have to be aligned.
		3856	/// \param __b
		3857	/// A 128-bit integer vector whose low 32-bit element is stored.
		3858	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si32(void *__p,
		3859	__m128i __b) {
		3860	struct __storeu_si32 { // packed + may_alias wrapper for the unaligned store
		3861	int __v;
		3862	} __attribute__((__packed__, __may_alias__));
		3863	((struct __storeu_si32 *)__p)->__v = ((__v4si)__b)[0];
		3864	}
		3865
		3866	/// Stores the low 16-bit element (bits [15:0]) of a 128-bit integer vector
		3867	/// to memory.
		3868	///
		3869	/// \headerfile <x86intrin.h>
		3870	///
		3871	/// This intrinsic does not correspond to a specific instruction.
		3872	///
		3873	/// \param __p
		3874	/// A pointer to the 16-bit memory location that receives the value. The
		3875	/// address does not have to be aligned.
		3876	/// \param __b
		3877	/// A 128-bit integer vector whose low 16-bit element is stored.
		3878	static __inline__ void __DEFAULT_FN_ATTRS _mm_storeu_si16(void *__p,
		3879	__m128i __b) {
		3880	struct __storeu_si16 { // packed + may_alias wrapper for the unaligned store
		3881	short __v;
		3882	} __attribute__((__packed__, __may_alias__));
		3883	((struct __storeu_si16 *)__p)->__v = ((__v8hi)__b)[0];
		3884	}
		3885
		3886	/// Conditionally writes bytes of the first operand to the specified
		3887	/// unaligned memory location, as selected by a mask. A byte is written when
		3888	/// its mask bit is 1 and is left unwritten otherwise.
		3889	///
		3890	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
		3891	/// used again soon). Exception and trap behavior for elements not selected
		3892	/// for storage to memory are implementation dependent.
		3893	///
		3894	/// \headerfile <x86intrin.h>
		3895	///
		3896	/// This intrinsic corresponds to the <c> VMASKMOVDQU / MASKMOVDQU </c>
		3897	/// instruction.
		3898	///
		3899	/// \param __d
		3900	/// A 128-bit integer vector containing the values to be moved.
		3901	/// \param __n
		3902	/// A 128-bit integer vector containing the mask. The most significant bit of
		3903	/// each byte is that byte's mask bit.
		3904	/// \param __p
		3905	/// A pointer to the unaligned 128-bit memory location that is written.
		3906	static __inline__ void __DEFAULT_FN_ATTRS
		3907	_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) {
		3908	  __v16qi __src = (__v16qi)__d;
		3909	  __v16qi __msk = (__v16qi)__n;
		3910	  __builtin_ia32_maskmovdqu(__src, __msk, __p);
		3911	}
		3912
		3913	/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
		3914	/// a memory location.
		3915	///
		3916	/// \headerfile <x86intrin.h>
		3917	///
		3918	/// This intrinsic corresponds to the <c> VMOVLPS / MOVLPS </c> instruction.
		3919	///
		3920	/// \param __p
		3921	/// A pointer to the 64-bit memory location that receives the lower 64 bits
		3922	/// of \a __a. Only 64 bits are written; the address need not be aligned.
		3923	/// \param __a
		3924	/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
		3925	/// value to be stored.
		3926	static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
		3927	__m128i __a) {
		3928	struct __mm_storel_epi64_struct { // packed + may_alias: one unaligned i64
		3929	long long __u;
		3930	} __attribute__((__packed__, __may_alias__));
		3931	((struct __mm_storel_epi64_struct *)__p)->__u = __a[0];
		3932	}
		3933
		3934	/// Writes a vector of [2 x double] to a 128-bit aligned memory location.
		3935	///
		3936	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
		3937	/// used again soon).
		3938	///
		3939	/// \headerfile <x86intrin.h>
		3940	///
		3941	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
		3942	///
		3943	/// \param __p
		3944	/// A pointer to the 128-bit aligned memory location that receives the value.
		3945	/// \param __a
		3946	/// A vector of [2 x double] holding the two 64-bit values to be stored.
		3947	static __inline__ void __DEFAULT_FN_ATTRS
		3948	_mm_stream_pd(double *__p, __m128d __a) {
		3949	  __v2df *__dst = (__v2df *)__p;
		3950	  __builtin_nontemporal_store((__v2df)__a, __dst);
		3951	}
		3952
		3953	/// Writes a 128-bit integer vector to a 128-bit aligned memory location.
		3954	///
		3955	/// The data is flagged as non-temporal to minimize caching.
		3956	///
		3957	/// \headerfile <x86intrin.h>
		3958	///
		3959	/// This intrinsic corresponds to the <c> VMOVNTPS / MOVNTPS </c> instruction.
		3960	///
		3961	/// \param __p
		3962	/// A pointer to the 128-bit aligned memory location that receives the
		3963	/// value.
		3964	/// \param __a
		3965	/// The 128-bit integer vector to be stored.
		3966	static __inline__ void __DEFAULT_FN_ATTRS
		3967	_mm_stream_si128(__m128i *__p, __m128i __a) {
		3968	  __v2di *__dst = (__v2di *)__p;
		3969	  __builtin_nontemporal_store((__v2di)__a, __dst);
		3970	}
		3970
		3971	/// Stores a 32-bit integer value to the specified memory location.
		3972	///
		3973	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
		3974	/// used again soon).
		3975	///
		3976	/// \headerfile <x86intrin.h>
		3977	///
		3978	/// This intrinsic corresponds to the <c> MOVNTI </c> instruction.
		3979	///
		3980	/// \param __p
		3981	/// A pointer to the 32-bit memory location that receives the value.
		3982	/// \param __a
		3983	/// A 32-bit integer containing the value to be stored.
		3984	static __inline__ void // explicit attribute list; not __DEFAULT_FN_ATTRS
		3985	__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
		3986	_mm_stream_si32(int *__p, int __a) {
		3987	__builtin_ia32_movnti(__p, __a);
		3988	}
		3989
		3990	#ifdef __x86_64__
		3991	/// Stores a 64-bit integer value to the given memory location (x86-64 only).
		3992	///
		3993	/// To minimize caching, the data is flagged as non-temporal (unlikely to be
		3994	/// used again soon).
		3995	///
		3996	/// \headerfile <x86intrin.h>
		3997	///
		3998	/// This intrinsic corresponds to the <c> MOVNTIQ </c> instruction.
		3999	///
		4000	/// \param __p
		4001	/// A pointer to the 64-bit memory location that receives the value.
		4002	/// \param __a
		4003	/// A 64-bit integer containing the value to be stored.
		4004	static __inline__ void // explicit attribute list; not __DEFAULT_FN_ATTRS
		4005	__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
		4006	_mm_stream_si64(long long *__p, long long __a) {
		4007	__builtin_ia32_movnti64(__p, __a);
		4008	}
		4009	#endif /* __x86_64__ */
		4010
		4011	#if defined(__cplusplus)
		4012	extern "C" { // prototypes only: no function bodies appear in this block
		4013	#endif
		4014	
		4015	/// Flushes and invalidates the cache line containing \a __p from all
		4016	/// caches in the coherency domain.
		4017	///
		4018	/// \headerfile <x86intrin.h>
		4019	///
		4020	/// This intrinsic corresponds to the <c> CLFLUSH </c> instruction.
		4021	///
		4022	/// \param __p
		4023	/// A pointer to the memory location used to identify the cache line to be
		4024	/// flushed.
		4025	void _mm_clflush(void const *__p);
		4026	
		4027	/// Forces strong memory ordering (serialization) between load
		4028	/// instructions preceding this instruction and load instructions following
		4029	/// this instruction, ensuring the system completes all previous loads before
		4030	/// executing subsequent loads.
		4031	///
		4032	/// \headerfile <x86intrin.h>
		4033	///
		4034	/// This intrinsic corresponds to the <c> LFENCE </c> instruction.
		4035	///
		4036	void _mm_lfence(void);
		4037	
		4038	/// Forces strong memory ordering (serialization) between load and store
		4039	/// instructions preceding this instruction and load and store instructions
		4040	/// following this instruction, ensuring that the system completes all
		4041	/// previous memory accesses before executing subsequent memory accesses.
		4042	///
		4043	/// \headerfile <x86intrin.h>
		4044	///
		4045	/// This intrinsic corresponds to the <c> MFENCE </c> instruction.
		4046	///
		4047	void _mm_mfence(void);
		4048	
		4049	#if defined(__cplusplus)
		4050	} // extern "C"
		4051	#endif
		4052
		4053	/// Narrows the 16-bit signed integers of both 128-bit integer vector
		4054	/// operands to 8-bit signed integers using signed saturation, and packs the
		4055	/// results into the destination. Values greater than 0x7F saturate to 0x7F;
		4056	/// values less than -128 saturate to 0x80 (the 8-bit encoding of -128).
		4057	///
		4058	/// \headerfile <x86intrin.h>
		4059	///
		4060	/// This intrinsic corresponds to the <c> VPACKSSWB / PACKSSWB </c> instruction.
		4061	///
		4062	/// \param __a
		4063	/// A 128-bit integer vector of [8 x i16]. Each element is treated as a
		4064	/// signed integer and converted to an 8-bit signed integer with saturation.
		4065	/// The converted [8 x i8] values are written to the lower 64 bits of the
		4066	/// result.
		4067	/// \param __b
		4068	/// A 128-bit integer vector of [8 x i16]. Each element is treated as a
		4069	/// signed integer and converted to an 8-bit signed integer with saturation.
		4070	/// The converted [8 x i8] values are written to the higher 64 bits of the
		4071	/// result.
		4072	/// \returns A 128-bit vector of [16 x i8] containing the converted values.
		4073	static __inline__ __m128i __DEFAULT_FN_ATTRS
		4074	_mm_packs_epi16(__m128i __a, __m128i __b) {
		4075	  __v8hi __lo = (__v8hi)__a;
		4076	  __v8hi __hi = (__v8hi)__b;
		4077	  return (__m128i)__builtin_ia32_packsswb128(__lo, __hi);
		4078	}
		4079
		4080	/// Narrows the 32-bit signed integers of both 128-bit integer vector
		4081	/// operands to 16-bit signed integers using signed saturation, and packs the
		4082	/// results into the destination. Values above 0x7FFF saturate to 0x7FFF;
		4083	/// values below -32768 saturate to 0x8000 (the 16-bit encoding of -32768).
		4084	///
		4085	/// \headerfile <x86intrin.h>
		4086	///
		4087	/// This intrinsic corresponds to the <c> VPACKSSDW / PACKSSDW </c> instruction.
		4088	///
		4089	/// \param __a
		4090	/// A 128-bit integer vector of [4 x i32]. Each element is treated as a
		4091	/// signed integer and converted to a 16-bit signed integer with saturation.
		4092	/// The converted [4 x i16] values are written to the lower 64 bits of the
		4093	/// result.
		4094	/// \param __b
		4095	/// A 128-bit integer vector of [4 x i32]. Each element is treated as a
		4096	/// signed integer and converted to a 16-bit signed integer with saturation.
		4097	/// The converted [4 x i16] values are written to the higher 64 bits of the
		4098	/// result.
		4099	/// \returns A 128-bit vector of [8 x i16] containing the converted values.
		4100	static __inline__ __m128i __DEFAULT_FN_ATTRS
		4101	_mm_packs_epi32(__m128i __a, __m128i __b) {
		4102	  __v4si __lo = (__v4si)__a;
		4103	  __v4si __hi = (__v4si)__b;
		4104	  return (__m128i)__builtin_ia32_packssdw128(__lo, __hi);
		4105	}
		4106
		4107	/// Narrows the 16-bit signed integers of both 128-bit integer vector
		4108	/// operands to 8-bit unsigned integers using unsigned saturation, and packs
		4109	/// the results into the destination. Values greater than 0xFF saturate to
		4110	/// 0xFF; values less than 0x00 saturate to 0x00.
		4111	///
		4112	/// \headerfile <x86intrin.h>
		4113	///
		4114	/// This intrinsic corresponds to the <c> VPACKUSWB / PACKUSWB </c> instruction.
		4115	///
		4116	/// \param __a
		4117	/// A 128-bit integer vector of [8 x i16]. Each element is treated as a
		4118	/// signed integer and converted to an 8-bit unsigned integer with
		4119	/// saturation. The converted [8 x i8] values are written to the lower 64
		4120	/// bits of the result.
		4121	/// \param __b
		4122	/// A 128-bit integer vector of [8 x i16]. Each element is treated as a
		4123	/// signed integer and converted to an 8-bit unsigned integer with
		4124	/// saturation. The converted [8 x i8] values are written to the higher 64
		4125	/// bits of the result.
		4126	/// \returns A 128-bit vector of [16 x i8] containing the converted values.
		4127	static __inline__ __m128i __DEFAULT_FN_ATTRS
		4128	_mm_packus_epi16(__m128i __a, __m128i __b) {
		4129	  __v8hi __lo = (__v8hi)__a;
		4130	  __v8hi __hi = (__v8hi)__b;
		4131	  return (__m128i)__builtin_ia32_packuswb128(__lo, __hi);
		4132	}
		4133
		4134	/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
		4135	/// the immediate-value parameter as a selector.
		4136	///
		4137	/// \headerfile <x86intrin.h>
		4138	///
		4139	/// \code
		4140	/// int _mm_extract_epi16(__m128i a, const int imm);
		4141	/// \endcode
		4142	///
		4143	/// This intrinsic corresponds to the <c> VPEXTRW / PEXTRW </c> instruction.
		4144	///
		4145	/// \param a
		4146	/// A 128-bit integer vector.
		4147	/// \param imm
		4148	/// An immediate value. Bits [2:0] select the value from \a a that is assigned
		4149	/// to bits [15:0] of the result. \n
		4150	/// 000: assign values from bits [15:0] of \a a. \n
		4151	/// 001: assign values from bits [31:16] of \a a. \n
		4152	/// 010: assign values from bits [47:32] of \a a. \n
		4153	/// 011: assign values from bits [63:48] of \a a. \n
		4154	/// 100: assign values from bits [79:64] of \a a. \n
		4155	/// 101: assign values from bits [95:80] of \a a. \n
		4156	/// 110: assign values from bits [111:96] of \a a. \n
		4157	/// 111: assign values from bits [127:112] of \a a.
		4158	/// \returns An integer, whose lower 16 bits are selected from the 128-bit
		4159	/// integer vector parameter and the remaining bits are assigned zeros.
		4160	#define _mm_extract_epi16(a, imm) \
		4161	((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \
		4162	(int)(imm)))
		4163
		4164	/// Constructs a 128-bit integer vector by first making a copy of the
		4165	/// 128-bit integer vector parameter, and then inserting the lower 16 bits
		4166	/// of an integer parameter into the element selected by the immediate-value
		4167	/// parameter.
		4168	///
		4169	/// \headerfile <x86intrin.h>
		4170	///
		4171	/// \code
		4172	/// __m128i _mm_insert_epi16(__m128i a, int b, const int imm);
		4173	/// \endcode
		4174	///
		4175	/// This intrinsic corresponds to the <c> VPINSRW / PINSRW </c> instruction.
		4176	///
		4177	/// \param a
		4178	/// A 128-bit integer vector of [8 x i16]. This vector is copied to the
		4179	/// result and then one of the eight elements in the result is replaced by
		4180	/// the lower 16 bits of \a b.
		4181	/// \param b
		4182	/// An integer. The lower 16 bits of this parameter are written to the
		4183	/// element of the result selected by \a imm.
		4184	/// \param imm
		4185	/// An immediate value selecting which of the eight 16-bit elements of the
		4186	/// result receives the lower 16 bits of \a b.
		4187	/// \returns A 128-bit integer vector containing the constructed values.
		4188	#define _mm_insert_epi16(a, b, imm) \
		4189	((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \
		4190	(int)(imm)))
		4191
		4192	/// Gathers the most significant bit of each 8-bit element of a 128-bit
		4193	/// integer vector of [16 x i8] into a zero-extended 16-bit mask.
		4194	///
		4195	/// \headerfile <x86intrin.h>
		4196	///
		4197	/// This intrinsic corresponds to the <c> VPMOVMSKB / PMOVMSKB </c> instruction.
		4198	///
		4199	/// \param __a
		4200	/// A 128-bit integer vector containing the values with bits to be extracted.
		4201	/// \returns The most significant bits from each 8-bit element in \a __a,
		4202	/// written to bits [15:0]. The other bits are assigned zeros.
		4203	static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_epi8(__m128i __a) {
		4204	  int __msk = __builtin_ia32_pmovmskb128((__v16qi)__a);
		4205	  return __msk;
		4206	}
		4207
		4208	/// Constructs a 128-bit integer vector by shuffling four 32-bit
		4209	/// elements of a 128-bit integer vector parameter, using the immediate-value
		4210	/// parameter as a specifier.
		4211	///
		4212	/// \headerfile <x86intrin.h>
		4213	///
		4214	/// \code
		4215	/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
		4216	/// \endcode
		4217	///
		4218	/// This intrinsic corresponds to the <c> VPSHUFD / PSHUFD </c> instruction.
		4219	///
		4220	/// \param a
		4221	/// A 128-bit integer vector containing the values to be copied.
		4222	/// \param imm
		4223	/// An 8-bit immediate value specifying which elements to copy from \a a.
		4224	/// The destinations within the 128-bit result are assigned
		4225	/// values as follows: \n
		4226	/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n
		4227	/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n
		4228	/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n
		4229	/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n
		4230	/// Bit value assignments: \n
		4231	/// 00: assign values from bits [31:0] of \a a. \n
		4232	/// 01: assign values from bits [63:32] of \a a. \n
		4233	/// 10: assign values from bits [95:64] of \a a. \n
		4234	/// 11: assign values from bits [127:96] of \a a. \n
		4235	/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
		4236	/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
		4237	/// <c>[b6, b4, b2, b0]</c>.
		4238	/// \returns A 128-bit integer vector containing the shuffled values.
		4239	#define _mm_shuffle_epi32(a, imm) \
		4240	((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm)))
		4241
		4242	/// Constructs a 128-bit integer vector by shuffling the four lower 16-bit
		4243	/// elements of a 128-bit integer vector of [8 x i16], using the immediate
		4244	/// value parameter as a specifier.
		4245	///
		4246	/// \headerfile <x86intrin.h>
		4247	///
		4248	/// \code
		4249	/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
		4250	/// \endcode
		4251	///
		4252	/// This intrinsic corresponds to the <c> VPSHUFLW / PSHUFLW </c> instruction.
		4253	///
		4254	/// \param a
		4255	/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
		4256	/// [127:64] of the result.
		4257	/// \param imm
		4258	/// An 8-bit immediate value specifying which elements to copy from \a a. \n
		4259	/// Bits [1:0] are used to assign values to bits [15:0] of the result. \n
		4260	/// Bits [3:2] are used to assign values to bits [31:16] of the result. \n
		4261	/// Bits [5:4] are used to assign values to bits [47:32] of the result. \n
		4262	/// Bits [7:6] are used to assign values to bits [63:48] of the result. \n
		4263	/// Bit value assignments: \n
		4264	/// 00: assign values from bits [15:0] of \a a. \n
		4265	/// 01: assign values from bits [31:16] of \a a. \n
		4266	/// 10: assign values from bits [47:32] of \a a. \n
		4267	/// 11: assign values from bits [63:48] of \a a. \n
		4268	/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
		4269	/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
		4270	/// <c>[b6, b4, b2, b0]</c>.
		4271	/// \returns A 128-bit integer vector containing the shuffled values.
		4272	#define _mm_shufflelo_epi16(a, imm) \
		4273	((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm)))
		4274
		4275	/// Constructs a 128-bit integer vector by shuffling the four upper 16-bit
		4276	/// elements of a 128-bit integer vector of [8 x i16], using the immediate
		4277	/// value parameter as a specifier.
		4278	///
		4279	/// \headerfile <x86intrin.h>
		4280	///
		4281	/// \code
		4282	/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
		4283	/// \endcode
		4284	///
		4285	/// This intrinsic corresponds to the <c> VPSHUFHW / PSHUFHW </c> instruction.
		4286	///
		4287	/// \param a
		4288	/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
		4289	/// [63:0] of the result.
		4290	/// \param imm
		4291	/// An 8-bit immediate value specifying which elements to copy from \a a. \n
		4292	/// Bits [1:0] are used to assign values to bits [79:64] of the result. \n
		4293	/// Bits [3:2] are used to assign values to bits [95:80] of the result. \n
		4294	/// Bits [5:4] are used to assign values to bits [111:96] of the result. \n
		4295	/// Bits [7:6] are used to assign values to bits [127:112] of the result. \n
		4296	/// Bit value assignments: \n
		4297	/// 00: assign values from bits [79:64] of \a a. \n
		4298	/// 01: assign values from bits [95:80] of \a a. \n
		4299	/// 10: assign values from bits [111:96] of \a a. \n
		4300	/// 11: assign values from bits [127:112] of \a a. \n
		4301	/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
		4302	/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
		4303	/// <c>[b6, b4, b2, b0]</c>.
		4304	/// \returns A 128-bit integer vector containing the shuffled values.
		4305	#define _mm_shufflehi_epi16(a, imm) \
		4306	((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm)))
		4307
		4308	/// Unpacks the high-order (index 8-15) values from two 128-bit vectors
		4309	/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
		4310	///
		4311	/// \headerfile <x86intrin.h>
		4312	///
		4313	/// This intrinsic corresponds to the <c> VPUNPCKHBW / PUNPCKHBW </c>
		4314	/// instruction.
		4315	///
		4316	/// \param __a
		4317	/// A 128-bit vector of [16 x i8]. \n
		4318	/// Bits [71:64] are written to bits [7:0] of the result. \n
		4319	/// Bits [79:72] are written to bits [23:16] of the result. \n
		4320	/// Bits [87:80] are written to bits [39:32] of the result. \n
		4321	/// Bits [95:88] are written to bits [55:48] of the result. \n
		4322	/// Bits [103:96] are written to bits [71:64] of the result. \n
		4323	/// Bits [111:104] are written to bits [87:80] of the result. \n
		4324	/// Bits [119:112] are written to bits [103:96] of the result. \n
		4325	/// Bits [127:120] are written to bits [119:112] of the result.
		4326	/// \param __b
		4327	/// A 128-bit vector of [16 x i8]. \n
		4328	/// Bits [71:64] are written to bits [15:8] of the result. \n
		4329	/// Bits [79:72] are written to bits [31:24] of the result. \n
		4330	/// Bits [87:80] are written to bits [47:40] of the result. \n
		4331	/// Bits [95:88] are written to bits [63:56] of the result. \n
		4332	/// Bits [103:96] are written to bits [79:72] of the result. \n
		4333	/// Bits [111:104] are written to bits [95:88] of the result. \n
		4334	/// Bits [119:112] are written to bits [111:104] of the result. \n
		4335	/// Bits [127:120] are written to bits [127:120] of the result.
		4336	/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
		4337	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi8(__m128i __a,
		4338	__m128i __b) {
		4339	return (__m128i)__builtin_shufflevector( // indices 0-15 select from __a, 16-31 from __b
		4340	(__v16qi)__a, (__v16qi)__b, 8, 16 + 8, 9, 16 + 9, 10, 16 + 10, 11,
		4341	16 + 11, 12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
		4342	}
		4343
		4344	/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of
		4345	/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
		4346	///
		4347	/// \headerfile <x86intrin.h>
		4348	///
		4349	/// This intrinsic corresponds to the <c> VPUNPCKHWD / PUNPCKHWD </c>
		4350	/// instruction.
		4351	///
		4352	/// \param __a
		4353	/// A 128-bit vector of [8 x i16]. \n
		4354	/// Bits [79:64] are written to bits [15:0] of the result. \n
		4355	/// Bits [95:80] are written to bits [47:32] of the result. \n
		4356	/// Bits [111:96] are written to bits [79:64] of the result. \n
		4357	/// Bits [127:112] are written to bits [111:96] of the result.
		4358	/// \param __b
		4359	/// A 128-bit vector of [8 x i16]. \n
		4360	/// Bits [79:64] are written to bits [31:16] of the result. \n
		4361	/// Bits [95:80] are written to bits [63:48] of the result. \n
		4362	/// Bits [111:96] are written to bits [95:80] of the result. \n
		4363	/// Bits [127:112] are written to bits [127:112] of the result.
		4364	/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
		4365	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi16(__m128i __a,
		4366	__m128i __b) {
		4367	return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8 + 4, 5,
		4368	8 + 5, 6, 8 + 6, 7, 8 + 7); // indices 0-7 select from __a, 8-15 from __b
		4369	}
		4370
		4371	/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of
		4372	/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
		4373	///
		4374	/// \headerfile <x86intrin.h>
		4375	///
		4376	/// This intrinsic corresponds to the <c> VPUNPCKHDQ / PUNPCKHDQ </c>
		4377	/// instruction.
		4378	///
		4379	/// \param __a
		4380	/// A 128-bit vector of [4 x i32]. \n
		4381	/// Bits [95:64] are written to bits [31:0] of the destination. \n
		4382	/// Bits [127:96] are written to bits [95:64] of the destination.
		4383	/// \param __b
		4384	/// A 128-bit vector of [4 x i32]. \n
		4385	/// Bits [95:64] are written to bits [63:32] of the destination. \n
		4386	/// Bits [127:96] are written to bits [127:96] of the destination.
		4387	/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
		4388	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi32(__m128i __a,
		4389	__m128i __b) {
		4390	return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4 + 2, 3,
		4391	4 + 3); // indices 0-3 select from __a, 4-7 from __b
		4392	}
		4393
		4394	/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
		4395	/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
		4396	///
		4397	/// \headerfile <x86intrin.h>
		4398	///
		4399	/// This intrinsic corresponds to the <c> VPUNPCKHQDQ / PUNPCKHQDQ </c>
		4400	/// instruction.
		4401	///
		4402	/// \param __a
		4403	/// A 128-bit vector of [2 x i64]. \n
		4404	/// Bits [127:64] are written to bits [63:0] of the destination.
		4405	/// \param __b
		4406	/// A 128-bit vector of [2 x i64]. \n
		4407	/// Bits [127:64] are written to bits [127:64] of the destination.
		4408	/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
		4409	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpackhi_epi64(__m128i __a,
		4410	__m128i __b) {
		4411	return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2 + 1); // index 1: upper element of __a; 2 + 1: upper element of __b
		4412	}
		4413
		4414	/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of
		4415	/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
		4416	///
		4417	/// \headerfile <x86intrin.h>
		4418	///
		4419	/// This intrinsic corresponds to the <c> VPUNPCKLBW / PUNPCKLBW </c>
		4420	/// instruction.
		4421	///
		4422	/// \param __a
		4423	/// A 128-bit vector of [16 x i8]. \n
		4424	/// Bits [7:0] are written to bits [7:0] of the result. \n
		4425	/// Bits [15:8] are written to bits [23:16] of the result. \n
		4426	/// Bits [23:16] are written to bits [39:32] of the result. \n
		4427	/// Bits [31:24] are written to bits [55:48] of the result. \n
		4428	/// Bits [39:32] are written to bits [71:64] of the result. \n
		4429	/// Bits [47:40] are written to bits [87:80] of the result. \n
		4430	/// Bits [55:48] are written to bits [103:96] of the result. \n
		4431	/// Bits [63:56] are written to bits [119:112] of the result.
		4432	/// \param __b
		4433	/// A 128-bit vector of [16 x i8]. \n
		4434	/// Bits [7:0] are written to bits [15:8] of the result. \n
		4435	/// Bits [15:8] are written to bits [31:24] of the result. \n
		4436	/// Bits [23:16] are written to bits [47:40] of the result. \n
		4437	/// Bits [31:24] are written to bits [63:56] of the result. \n
		4438	/// Bits [39:32] are written to bits [79:72] of the result. \n
		4439	/// Bits [47:40] are written to bits [95:88] of the result. \n
		4440	/// Bits [55:48] are written to bits [111:104] of the result. \n
		4441	/// Bits [63:56] are written to bits [127:120] of the result.
		4442	/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
		4443	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi8(__m128i __a,
		4444	__m128i __b) {
		4445	return (__m128i)__builtin_shufflevector( // indices 0-15 select from __a, 16-31 from __b
		4446	(__v16qi)__a, (__v16qi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 4,
		4447	16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7);
		4448	}
		4449
		4450	/// Unpacks the low-order (index 0-3) values from each of the two 128-bit
		4451	/// vectors of [8 x i16] and interleaves them into a 128-bit vector of
		4452	/// [8 x i16].
		4453	///
		4454	/// \headerfile <x86intrin.h>
		4455	///
		4456	/// This intrinsic corresponds to the <c> VPUNPCKLWD / PUNPCKLWD </c>
		4457	/// instruction.
		4458	///
		4459	/// \param __a
		4460	/// A 128-bit vector of [8 x i16]. \n
		4461	/// Bits [15:0] are written to bits [15:0] of the result. \n
		4462	/// Bits [31:16] are written to bits [47:32] of the result. \n
		4463	/// Bits [47:32] are written to bits [79:64] of the result. \n
		4464	/// Bits [63:48] are written to bits [111:96] of the result.
		4465	/// \param __b
		4466	/// A 128-bit vector of [8 x i16]. \n
		4467	/// Bits [15:0] are written to bits [31:16] of the result. \n
		4468	/// Bits [31:16] are written to bits [63:48] of the result. \n
		4469	/// Bits [47:32] are written to bits [95:80] of the result. \n
		4470	/// Bits [63:48] are written to bits [127:112] of the result.
		4471	/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
		4472	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi16(__m128i __a,
		4473	__m128i __b) {
		4474	return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8 + 0, 1,
		4475	8 + 1, 2, 8 + 2, 3, 8 + 3); // indices 0-7 select from __a, 8-15 from __b
		4476	}
		4477
		4478	/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of
		4479	/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
		4480	///
		4481	/// \headerfile <x86intrin.h>
		4482	///
		4483	/// This intrinsic corresponds to the <c> VPUNPCKLDQ / PUNPCKLDQ </c>
		4484	/// instruction.
		4485	///
		4486	/// \param __a
		4487	/// A 128-bit vector of [4 x i32]. \n
		4488	/// Bits [31:0] are written to bits [31:0] of the destination. \n
		4489	/// Bits [63:32] are written to bits [95:64] of the destination.
		4490	/// \param __b
		4491	/// A 128-bit vector of [4 x i32]. \n
		4492	/// Bits [31:0] are written to bits [63:32] of the destination. \n
		4493	/// Bits [63:32] are written to bits [127:96] of the destination.
		4494	/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
		4495	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi32(__m128i __a,
		4496	__m128i __b) {
		4497	return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4 + 0, 1,
		4498	4 + 1); // indices 0-3 select from __a, 4-7 from __b
		4499	}
		4500
		4501	/// Unpacks the low-order 64-bit elements from two 128-bit vectors of
		4502	/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
		4503	///
		4504	/// \headerfile <x86intrin.h>
		4505	///
		4506	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ / PUNPCKLQDQ </c>
		4507	/// instruction.
		4508	///
		4509	/// \param __a
		4510	/// A 128-bit vector of [2 x i64]. \n
		4511	/// Bits [63:0] are written to bits [63:0] of the destination.
		4512	/// \param __b
		4513	/// A 128-bit vector of [2 x i64]. \n
		4514	/// Bits [63:0] are written to bits [127:64] of the destination.
		4515	/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
		4516	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_unpacklo_epi64(__m128i __a,
		4517	__m128i __b) {
		4518	return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2 + 0); // index 0: lower element of __a; 2 + 0: lower element of __b
		4519	}
		4520
		4521	/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
		4522	/// integer.
		4523	///
		4524	/// \headerfile <x86intrin.h>
		4525	///
		4526	/// This intrinsic corresponds to the <c> MOVDQ2Q </c> instruction.
		4527	///
		4528	/// \param __a
		4529	/// A 128-bit integer vector operand. The lower 64 bits are moved to the
		4530	/// destination.
		4531	/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
		4532	static __inline__ __m64 __DEFAULT_FN_ATTRS _mm_movepi64_pi64(__m128i __a) {
		4533	return (__m64)__a[0]; // element 0 of the two 64-bit lanes is the lower 64 bits
		4534	}
		4535
		4536	/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the
		4537	/// upper bits.
		4538	///
		4539	/// \headerfile <x86intrin.h>
		4540	///
		4541	/// This intrinsic corresponds to the <c> MOVD+VMOVQ </c> instruction.
		4542	///
		4543	/// \param __a
		4544	/// A 64-bit value.
		4545	/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
		4546	/// the operand. The upper 64 bits are assigned zeros.
		4547	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_movpi64_epi64(__m64 __a) {
		4548	return __extension__(__m128i)(__v2di){(long long)__a, 0}; // element 0 = __a, element 1 = 0
		4549	}
		4550
		4551	/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
		4552	/// integer vector, zeroing the upper bits.
		4553	///
		4554	/// \headerfile <x86intrin.h>
		4555	///
		4556	/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
		4557	///
		4558	/// \param __a
		4559	/// A 128-bit integer vector operand. The lower 64 bits are moved to the
		4560	/// destination.
		4561	/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
		4562	/// the operand. The upper 64 bits are assigned zeros.
		4563	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_move_epi64(__m128i __a) {
		4564	return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); // index 0: lower element of __a; index 2: element 0 of the zero vector
		4565	}
		4566
		4567	/// Unpacks the high-order 64-bit elements from two 128-bit vectors of
		4568	/// [2 x double] and interleaves them into a 128-bit vector of [2 x
		4569	/// double].
		4570	///
		4571	/// \headerfile <x86intrin.h>
		4572	///
		4573	/// This intrinsic corresponds to the <c> VUNPCKHPD / UNPCKHPD </c> instruction.
		4574	///
		4575	/// \param __a
		4576	/// A 128-bit vector of [2 x double]. \n
		4577	/// Bits [127:64] are written to bits [63:0] of the destination.
		4578	/// \param __b
		4579	/// A 128-bit vector of [2 x double]. \n
		4580	/// Bits [127:64] are written to bits [127:64] of the destination.
		4581	/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
		4582	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpackhi_pd(__m128d __a,
		4583	__m128d __b) {
		4584	return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2 + 1); // index 1: upper element of __a; 2 + 1: upper element of __b
		4585	}
		4586
		4587	/// Unpacks the low-order 64-bit elements from two 128-bit vectors
		4588	/// of [2 x double] and interleaves them into a 128-bit vector of [2 x
		4589	/// double].
		4590	///
		4591	/// \headerfile <x86intrin.h>
		4592	///
		4593	/// This intrinsic corresponds to the <c> VUNPCKLPD / UNPCKLPD </c> instruction.
		4594	///
		4595	/// \param __a
		4596	/// A 128-bit vector of [2 x double]. \n
		4597	/// Bits [63:0] are written to bits [63:0] of the destination.
		4598	/// \param __b
		4599	/// A 128-bit vector of [2 x double]. \n
		4600	/// Bits [63:0] are written to bits [127:64] of the destination.
		4601	/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
		4602	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_unpacklo_pd(__m128d __a,
		4603	__m128d __b) {
		4604	return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2 + 0); // index 0: lower element of __a; 2 + 0: lower element of __b
		4605	}
		4606
		4607	/// Extracts the sign bits of the double-precision values in the 128-bit
		4608	/// vector of [2 x double], zero-extends the value, and writes it to the
		4609	/// low-order bits of the destination.
		4610	///
		4611	/// \headerfile <x86intrin.h>
		4612	///
		4613	/// This intrinsic corresponds to the <c> VMOVMSKPD / MOVMSKPD </c> instruction.
		4614	///
		4615	/// \param __a
		4616	/// A 128-bit vector of [2 x double] containing the values with sign bits to
		4617	/// be extracted.
		4618	/// \returns The sign bits from each of the double-precision elements in \a __a,
		4619	/// written to bits [1:0]. The remaining bits are assigned values of zero.
		4620	static __inline__ int __DEFAULT_FN_ATTRS _mm_movemask_pd(__m128d __a) {
		4621	return __builtin_ia32_movmskpd((__v2df)__a);
		4622	}
		4623
		4624	/// Constructs a 128-bit floating-point vector of [2 x double] from two
		4625	/// 128-bit vector parameters of [2 x double], using the immediate-value
		4626	/// parameter as a specifier.
		4627	///
		4628	/// \headerfile <x86intrin.h>
		4629	///
		4630	/// \code
		4631	/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
		4632	/// \endcode
		4633	///
		4634	/// This intrinsic corresponds to the <c> VSHUFPD / SHUFPD </c> instruction.
		4635	///
		4636	/// \param a
		4637	/// A 128-bit vector of [2 x double].
		4638	/// \param b
		4639	/// A 128-bit vector of [2 x double].
		4640	/// \param i
		4641	/// An 8-bit immediate value. The least significant two bits specify which
		4642	/// elements to copy from \a a and \a b: \n
		4643	/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n
		4644	/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n
		4645	/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n
		4646	/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n
		4647	/// Note: To generate a mask, you can use the \c _MM_SHUFFLE2 macro.
		4648	/// <c>_MM_SHUFFLE2(b1, b0)</c> can create a 2-bit mask of the form
		4649	/// <c>[b1, b0]</c>.
		4650	/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
		4651	#define _mm_shuffle_pd(a, b, i) \
		4652	((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
		4653	(int)(i)))
		4654
		4655	/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
		4656	/// floating-point vector of [4 x float].
		4657	///
		4658	/// \headerfile <x86intrin.h>
		4659	///
		4660	/// This intrinsic has no corresponding instruction.
		4661	///
		4662	/// \param __a
		4663	/// A 128-bit floating-point vector of [2 x double].
		4664	/// \returns A 128-bit floating-point vector of [4 x float] containing the same
		4665	/// bitwise pattern as the parameter.
		4666	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castpd_ps(__m128d __a) {
		4667	return (__m128)__a; // same-size vector cast: bits are reinterpreted, values are not converted
		4668	}
		4669
		4670	/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
		4671	/// integer vector.
		4672	///
		4673	/// \headerfile <x86intrin.h>
		4674	///
		4675	/// This intrinsic has no corresponding instruction.
		4676	///
		4677	/// \param __a
		4678	/// A 128-bit floating-point vector of [2 x double].
		4679	/// \returns A 128-bit integer vector containing the same bitwise pattern as the
		4680	/// parameter.
		4681	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castpd_si128(__m128d __a) {
		4682	return (__m128i)__a; // same-size vector cast: bits are reinterpreted, values are not converted
		4683	}
		4684
		4685	/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
		4686	/// floating-point vector of [2 x double].
		4687	///
		4688	/// \headerfile <x86intrin.h>
		4689	///
		4690	/// This intrinsic has no corresponding instruction.
		4691	///
		4692	/// \param __a
		4693	/// A 128-bit floating-point vector of [4 x float].
		4694	/// \returns A 128-bit floating-point vector of [2 x double] containing the same
		4695	/// bitwise pattern as the parameter.
		4696	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castps_pd(__m128 __a) {
		4697	return (__m128d)__a; // same-size vector cast: bits are reinterpreted, values are not converted
		4698	}
		4699
		4700	/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
		4701	/// integer vector.
		4702	///
		4703	/// \headerfile <x86intrin.h>
		4704	///
		4705	/// This intrinsic has no corresponding instruction.
		4706	///
		4707	/// \param __a
		4708	/// A 128-bit floating-point vector of [4 x float].
		4709	/// \returns A 128-bit integer vector containing the same bitwise pattern as the
		4710	/// parameter.
		4711	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_castps_si128(__m128 __a) {
		4712	return (__m128i)__a; // same-size vector cast: bits are reinterpreted, values are not converted
		4713	}
		4714
		4715	/// Casts a 128-bit integer vector into a 128-bit floating-point vector
		4716	/// of [4 x float].
		4717	///
		4718	/// \headerfile <x86intrin.h>
		4719	///
		4720	/// This intrinsic has no corresponding instruction.
		4721	///
		4722	/// \param __a
		4723	/// A 128-bit integer vector.
		4724	/// \returns A 128-bit floating-point vector of [4 x float] containing the same
		4725	/// bitwise pattern as the parameter.
		4726	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_castsi128_ps(__m128i __a) {
		4727	return (__m128)__a; // same-size vector cast: bits are reinterpreted, values are not converted
		4728	}
		4729
		4730	/// Casts a 128-bit integer vector into a 128-bit floating-point vector
		4731	/// of [2 x double].
		4732	///
		4733	/// \headerfile <x86intrin.h>
		4734	///
		4735	/// This intrinsic has no corresponding instruction.
		4736	///
		4737	/// \param __a
		4738	/// A 128-bit integer vector.
		4739	/// \returns A 128-bit floating-point vector of [2 x double] containing the same
		4740	/// bitwise pattern as the parameter.
		4741	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_castsi128_pd(__m128i __a) {
		4742	return (__m128d)__a; // same-size vector cast: bits are reinterpreted, values are not converted
		4743	}
		4744
		4745	#if defined(__cplusplus)
		4746	extern "C" {
		4747	#endif
		4748
		4749	/// Indicates that a spin loop is being executed for the purposes of
		4750	/// optimizing power consumption during the loop.
		4751	///
		4752	/// \headerfile <x86intrin.h>
		4753	///
		4754	/// This intrinsic corresponds to the <c> PAUSE </c> instruction.
		4755	///
		4756	void _mm_pause(void); // plain prototype with C linkage; no inline body is given at this point
		4757
		4758	#if defined(__cplusplus)
		4759	} // extern "C"
		4760	#endif
		4761	#undef __DEFAULT_FN_ATTRS
		4762	#undef __DEFAULT_FN_ATTRS_MMX
		4763
		4764	#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) /* 2-bit selector for _mm_shuffle_pd: x -> bit 1, y -> bit 0 */
		4765
		4766	#define _MM_DENORMALS_ZERO_ON (0x0040U) /* mode value with the denormals-zero bit set */
		4767	#define _MM_DENORMALS_ZERO_OFF (0x0000U) /* mode value with the denormals-zero bit clear */
		4768
		4769	#define _MM_DENORMALS_ZERO_MASK (0x0040U) /* isolates the denormals-zero bit in the value returned by _mm_getcsr() */
		4770
		4771	#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) /* yields _MM_DENORMALS_ZERO_ON or _MM_DENORMALS_ZERO_OFF */
		4772	#define _MM_SET_DENORMALS_ZERO_MODE(x) \
		4773	(_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) /* clear the mode bit, then OR in x */
		4774
		4775	#endif /* __EMMINTRIN_H */

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite/llvm-build/x86_64/lib/clang/16/include/emmintrin.h – Rev 14