WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – //llvm-build/x86_64/lib/clang/16/include/smmintrin.h

Rev	Author	Line No.	Line
14	pmbaty	1	/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------===
		2	*
		3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		4	* See https://llvm.org/LICENSE.txt for license information.
		5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		6	*
		7	*===-----------------------------------------------------------------------===
		8	*/
		9
		10	#ifndef __SMMINTRIN_H
		11	#define __SMMINTRIN_H
		12
		13	#if !defined(__i386__) && !defined(__x86_64__)
		14	#error "This header is only meant to be used on x86 and x64 architecture"
		15	#endif
		16
		17	#include <tmmintrin.h>
		18
		19	/* Define the default attributes for the functions in this file. */
		20	#define __DEFAULT_FN_ATTRS \
		21	__attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), \
		22	__min_vector_width__(128)))
		23
		24	/* SSE4 Rounding macros. */
		25	#define _MM_FROUND_TO_NEAREST_INT 0x00
		26	#define _MM_FROUND_TO_NEG_INF 0x01
		27	#define _MM_FROUND_TO_POS_INF 0x02
		28	#define _MM_FROUND_TO_ZERO 0x03
		29	#define _MM_FROUND_CUR_DIRECTION 0x04
		30
		31	#define _MM_FROUND_RAISE_EXC 0x00
		32	#define _MM_FROUND_NO_EXC 0x08
		33
		34	#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT)
		35	#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF)
		36	#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF)
		37	#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO)
		38	#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
		39	#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
		40
		41	/// Rounds up each element of the 128-bit vector of [4 x float] to an
		42	/// integer and returns the rounded values in a 128-bit vector of
		43	/// [4 x float].
		44	///
		45	/// \headerfile <x86intrin.h>
		46	///
		47	/// \code
		48	/// __m128 _mm_ceil_ps(__m128 X);
		49	/// \endcode
		50	///
		51	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
		52	///
		53	/// \param X
		54	/// A 128-bit vector of [4 x float] values to be rounded up.
		55	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
		56	#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL)
		57
		58	/// Rounds up each element of the 128-bit vector of [2 x double] to an
		59	/// integer and returns the rounded values in a 128-bit vector of
		60	/// [2 x double].
		61	///
		62	/// \headerfile <x86intrin.h>
		63	///
		64	/// \code
		65	/// __m128d _mm_ceil_pd(__m128d X);
		66	/// \endcode
		67	///
		68	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
		69	///
		70	/// \param X
		71	/// A 128-bit vector of [2 x double] values to be rounded up.
		72	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
		73	#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL)
		74
		75	/// Copies three upper elements of the first 128-bit vector operand to
		76	/// the corresponding three upper elements of the 128-bit result vector of
		77	/// [4 x float]. Rounds up the lowest element of the second 128-bit vector
		78	/// operand to an integer and copies it to the lowest element of the 128-bit
		79	/// result vector of [4 x float].
		80	///
		81	/// \headerfile <x86intrin.h>
		82	///
		83	/// \code
		84	/// __m128 _mm_ceil_ss(__m128 X, __m128 Y);
		85	/// \endcode
		86	///
		87	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
		88	///
		89	/// \param X
		90	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
		91	/// copied to the corresponding bits of the result.
		92	/// \param Y
		93	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
		94	/// rounded up to the nearest integer and copied to the corresponding bits
		95	/// of the result.
		96	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
		97	/// values.
		98	#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
		99
		100	/// Copies the upper element of the first 128-bit vector operand to the
		101	/// corresponding upper element of the 128-bit result vector of [2 x double].
		102	/// Rounds up the lower element of the second 128-bit vector operand to an
		103	/// integer and copies it to the lower element of the 128-bit result vector
		104	/// of [2 x double].
		105	///
		106	/// \headerfile <x86intrin.h>
		107	///
		108	/// \code
		109	/// __m128d _mm_ceil_sd(__m128d X, __m128d Y);
		110	/// \endcode
		111	///
		112	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
		113	///
		114	/// \param X
		115	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
		116	/// copied to the corresponding bits of the result.
		117	/// \param Y
		118	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
		119	/// rounded up to the nearest integer and copied to the corresponding bits
		120	/// of the result.
		121	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
		122	/// values.
		123	#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
		124
		125	/// Rounds down each element of the 128-bit vector of [4 x float] to an
		126	/// integer and returns the rounded values in a 128-bit vector of
		127	/// [4 x float].
		128	///
		129	/// \headerfile <x86intrin.h>
		130	///
		131	/// \code
		132	/// __m128 _mm_floor_ps(__m128 X);
		133	/// \endcode
		134	///
		135	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
		136	///
		137	/// \param X
		138	/// A 128-bit vector of [4 x float] values to be rounded down.
		139	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
		140	#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR)
		141
		142	/// Rounds down each element of the 128-bit vector of [2 x double] to an
		143	/// integer and returns the rounded values in a 128-bit vector of
		144	/// [2 x double].
		145	///
		146	/// \headerfile <x86intrin.h>
		147	///
		148	/// \code
		149	/// __m128d _mm_floor_pd(__m128d X);
		150	/// \endcode
		151	///
		152	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
		153	///
		154	/// \param X
		155	/// A 128-bit vector of [2 x double] values to be rounded down.
		156	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
		157	#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR)
		158
		159	/// Copies three upper elements of the first 128-bit vector operand to
		160	/// the corresponding three upper elements of the 128-bit result vector of
		161	/// [4 x float]. Rounds down the lowest element of the second 128-bit vector
		162	/// operand to an integer and copies it to the lowest element of the 128-bit
		163	/// result vector of [4 x float].
		164	///
		165	/// \headerfile <x86intrin.h>
		166	///
		167	/// \code
		168	/// __m128 _mm_floor_ss(__m128 X, __m128 Y);
		169	/// \endcode
		170	///
		171	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
		172	///
		173	/// \param X
		174	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
		175	/// copied to the corresponding bits of the result.
		176	/// \param Y
		177	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
		178	/// rounded down to the nearest integer and copied to the corresponding bits
		179	/// of the result.
		180	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
		181	/// values.
		182	#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
		183
		184	/// Copies the upper element of the first 128-bit vector operand to the
		185	/// corresponding upper element of the 128-bit result vector of [2 x double].
		186	/// Rounds down the lower element of the second 128-bit vector operand to an
		187	/// integer and copies it to the lower element of the 128-bit result vector
		188	/// of [2 x double].
		189	///
		190	/// \headerfile <x86intrin.h>
		191	///
		192	/// \code
		193	/// __m128d _mm_floor_sd(__m128d X, __m128d Y);
		194	/// \endcode
		195	///
		196	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
		197	///
		198	/// \param X
		199	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
		200	/// copied to the corresponding bits of the result.
		201	/// \param Y
		202	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
		203	/// rounded down to the nearest integer and copied to the corresponding bits
		204	/// of the result.
		205	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
		206	/// values.
		207	#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
		208
		209	/// Rounds each element of the 128-bit vector of [4 x float] to an
		210	/// integer value according to the rounding control specified by the second
		211	/// argument and returns the rounded values in a 128-bit vector of
		212	/// [4 x float].
		213	///
		214	/// \headerfile <x86intrin.h>
		215	///
		216	/// \code
		217	/// __m128 _mm_round_ps(__m128 X, const int M);
		218	/// \endcode
		219	///
		220	/// This intrinsic corresponds to the <c> VROUNDPS / ROUNDPS </c> instruction.
		221	///
		222	/// \param X
		223	/// A 128-bit vector of [4 x float].
		224	/// \param M
		225	/// An integer value that specifies the rounding operation. \n
		226	/// Bits [7:4] are reserved. \n
		227	/// Bit [3] is a precision exception value: \n
		228	/// 0: A normal PE exception is used \n
		229	/// 1: The PE field is not updated \n
		230	/// Bit [2] is the rounding control source: \n
		231	/// 0: Use bits [1:0] of \a M \n
		232	/// 1: Use the current MXCSR setting \n
		233	/// Bits [1:0] contain the rounding control definition: \n
		234	/// 00: Nearest \n
		235	/// 01: Downward (toward negative infinity) \n
		236	/// 10: Upward (toward positive infinity) \n
		237	/// 11: Truncated
		238	/// \returns A 128-bit vector of [4 x float] containing the rounded values.
		239	#define _mm_round_ps(X, M) \
		240	((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)))
		241
		242	/// Copies three upper elements of the first 128-bit vector operand to
		243	/// the corresponding three upper elements of the 128-bit result vector of
		244	/// [4 x float]. Rounds the lowest element of the second 128-bit vector
		245	/// operand to an integer value according to the rounding control specified
		246	/// by the third argument and copies it to the lowest element of the 128-bit
		247	/// result vector of [4 x float].
		248	///
		249	/// \headerfile <x86intrin.h>
		250	///
		251	/// \code
		252	/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M);
		253	/// \endcode
		254	///
		255	/// This intrinsic corresponds to the <c> VROUNDSS / ROUNDSS </c> instruction.
		256	///
		257	/// \param X
		258	/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are
		259	/// copied to the corresponding bits of the result.
		260	/// \param Y
		261	/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is
		262	/// rounded to the nearest integer using the specified rounding control and
		263	/// copied to the corresponding bits of the result.
		264	/// \param M
		265	/// An integer value that specifies the rounding operation. \n
		266	/// Bits [7:4] are reserved. \n
		267	/// Bit [3] is a precision exception value: \n
		268	/// 0: A normal PE exception is used \n
		269	/// 1: The PE field is not updated \n
		270	/// Bit [2] is the rounding control source: \n
		271	/// 0: Use bits [1:0] of \a M \n
		272	/// 1: Use the current MXCSR setting \n
		273	/// Bits [1:0] contain the rounding control definition: \n
		274	/// 00: Nearest \n
		275	/// 01: Downward (toward negative infinity) \n
		276	/// 10: Upward (toward positive infinity) \n
		277	/// 11: Truncated
		278	/// \returns A 128-bit vector of [4 x float] containing the copied and rounded
		279	/// values.
		280	#define _mm_round_ss(X, Y, M) \
		281	((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \
		282	(M)))
		283
		284	/// Rounds each element of the 128-bit vector of [2 x double] to an
		285	/// integer value according to the rounding control specified by the second
		286	/// argument and returns the rounded values in a 128-bit vector of
		287	/// [2 x double].
		288	///
		289	/// \headerfile <x86intrin.h>
		290	///
		291	/// \code
		292	/// __m128d _mm_round_pd(__m128d X, const int M);
		293	/// \endcode
		294	///
		295	/// This intrinsic corresponds to the <c> VROUNDPD / ROUNDPD </c> instruction.
		296	///
		297	/// \param X
		298	/// A 128-bit vector of [2 x double].
		299	/// \param M
		300	/// An integer value that specifies the rounding operation. \n
		301	/// Bits [7:4] are reserved. \n
		302	/// Bit [3] is a precision exception value: \n
		303	/// 0: A normal PE exception is used \n
		304	/// 1: The PE field is not updated \n
		305	/// Bit [2] is the rounding control source: \n
		306	/// 0: Use bits [1:0] of \a M \n
		307	/// 1: Use the current MXCSR setting \n
		308	/// Bits [1:0] contain the rounding control definition: \n
		309	/// 00: Nearest \n
		310	/// 01: Downward (toward negative infinity) \n
		311	/// 10: Upward (toward positive infinity) \n
		312	/// 11: Truncated
		313	/// \returns A 128-bit vector of [2 x double] containing the rounded values.
		314	#define _mm_round_pd(X, M) \
		315	((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)))
		316
		317	/// Copies the upper element of the first 128-bit vector operand to the
		318	/// corresponding upper element of the 128-bit result vector of [2 x double].
		319	/// Rounds the lower element of the second 128-bit vector operand to an
		320	/// integer value according to the rounding control specified by the third
		321	/// argument and copies it to the lower element of the 128-bit result vector
		322	/// of [2 x double].
		323	///
		324	/// \headerfile <x86intrin.h>
		325	///
		326	/// \code
		327	/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M);
		328	/// \endcode
		329	///
		330	/// This intrinsic corresponds to the <c> VROUNDSD / ROUNDSD </c> instruction.
		331	///
		332	/// \param X
		333	/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is
		334	/// copied to the corresponding bits of the result.
		335	/// \param Y
		336	/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is
		337	/// rounded to the nearest integer using the specified rounding control and
		338	/// copied to the corresponding bits of the result.
		339	/// \param M
		340	/// An integer value that specifies the rounding operation. \n
		341	/// Bits [7:4] are reserved. \n
		342	/// Bit [3] is a precision exception value: \n
		343	/// 0: A normal PE exception is used \n
		344	/// 1: The PE field is not updated \n
		345	/// Bit [2] is the rounding control source: \n
		346	/// 0: Use bits [1:0] of \a M \n
		347	/// 1: Use the current MXCSR setting \n
		348	/// Bits [1:0] contain the rounding control definition: \n
		349	/// 00: Nearest \n
		350	/// 01: Downward (toward negative infinity) \n
		351	/// 10: Upward (toward positive infinity) \n
		352	/// 11: Truncated
		353	/// \returns A 128-bit vector of [2 x double] containing the copied and rounded
		354	/// values.
		355	#define _mm_round_sd(X, Y, M) \
		356	((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
		357	(M)))
		358
		359	/* SSE4 Packed Blending Intrinsics. */
		360	/// Returns a 128-bit vector of [2 x double] where the values are
		361	/// selected from either the first or second operand as specified by the
		362	/// third operand, the control mask.
		363	///
		364	/// \headerfile <x86intrin.h>
		365	///
		366	/// \code
		367	/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M);
		368	/// \endcode
		369	///
		370	/// This intrinsic corresponds to the <c> VBLENDPD / BLENDPD </c> instruction.
		371	///
		372	/// \param V1
		373	/// A 128-bit vector of [2 x double].
		374	/// \param V2
		375	/// A 128-bit vector of [2 x double].
		376	/// \param M
		377	/// An immediate integer operand, with mask bits [1:0] specifying how the
		378	/// values are to be copied. The position of the mask bit corresponds to the
		379	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
		380	/// element in operand \a V1 is copied to the same position in the result.
		381	/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2
		382	/// is copied to the same position in the result.
		383	/// \returns A 128-bit vector of [2 x double] containing the copied values.
		384	#define _mm_blend_pd(V1, V2, M) \
		385	((__m128d)__builtin_ia32_blendpd((__v2df)(__m128d)(V1), \
		386	(__v2df)(__m128d)(V2), (int)(M)))
		387
		388	/// Returns a 128-bit vector of [4 x float] where the values are selected
		389	/// from either the first or second operand as specified by the third
		390	/// operand, the control mask.
		391	///
		392	/// \headerfile <x86intrin.h>
		393	///
		394	/// \code
		395	/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M);
		396	/// \endcode
		397	///
		398	/// This intrinsic corresponds to the <c> VBLENDPS / BLENDPS </c> instruction.
		399	///
		400	/// \param V1
		401	/// A 128-bit vector of [4 x float].
		402	/// \param V2
		403	/// A 128-bit vector of [4 x float].
		404	/// \param M
		405	/// An immediate integer operand, with mask bits [3:0] specifying how the
		406	/// values are to be copied. The position of the mask bit corresponds to the
		407	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
		408	/// element in operand \a V1 is copied to the same position in the result.
		409	/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2
		410	/// is copied to the same position in the result.
		411	/// \returns A 128-bit vector of [4 x float] containing the copied values.
		412	#define _mm_blend_ps(V1, V2, M) \
		413	((__m128)__builtin_ia32_blendps((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
		414	(int)(M)))
		415
		416	/// Returns a 128-bit vector of [2 x double] where the values are
		417	/// selected from either the first or second operand as specified by the
		418	/// third operand, the control mask.
		419	///
		420	/// \headerfile <x86intrin.h>
		421	///
		422	/// This intrinsic corresponds to the <c> VBLENDVPD / BLENDVPD </c> instruction.
		423	///
		424	/// \param __V1
		425	/// A 128-bit vector of [2 x double].
		426	/// \param __V2
		427	/// A 128-bit vector of [2 x double].
		428	/// \param __M
		429	/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the
		430	/// values are to be copied. The position of the mask bit corresponds to the
		431	/// most significant bit of a copied value. When a mask bit is 0, the
		432	/// corresponding 64-bit element in operand \a __V1 is copied to the same
		433	/// position in the result. When a mask bit is 1, the corresponding 64-bit
		434	/// element in operand \a __V2 is copied to the same position in the result.
		435	/// \returns A 128-bit vector of [2 x double] containing the copied values.
		436	static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_blendv_pd(__m128d __V1,
		437	__m128d __V2,
		438	__m128d __M) {
		439	return (__m128d)__builtin_ia32_blendvpd((__v2df)__V1, (__v2df)__V2,
		440	(__v2df)__M);
		441	}
		442
		443	/// Returns a 128-bit vector of [4 x float] where the values are
		444	/// selected from either the first or second operand as specified by the
		445	/// third operand, the control mask.
		446	///
		447	/// \headerfile <x86intrin.h>
		448	///
		449	/// This intrinsic corresponds to the <c> VBLENDVPS / BLENDVPS </c> instruction.
		450	///
		451	/// \param __V1
		452	/// A 128-bit vector of [4 x float].
		453	/// \param __V2
		454	/// A 128-bit vector of [4 x float].
		455	/// \param __M
		456	/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying
		457	/// how the values are to be copied. The position of the mask bit corresponds
		458	/// to the most significant bit of a copied value. When a mask bit is 0, the
		459	/// corresponding 32-bit element in operand \a __V1 is copied to the same
		460	/// position in the result. When a mask bit is 1, the corresponding 32-bit
		461	/// element in operand \a __V2 is copied to the same position in the result.
		462	/// \returns A 128-bit vector of [4 x float] containing the copied values.
		463	static __inline__ __m128 __DEFAULT_FN_ATTRS _mm_blendv_ps(__m128 __V1,
		464	__m128 __V2,
		465	__m128 __M) {
		466	return (__m128)__builtin_ia32_blendvps((__v4sf)__V1, (__v4sf)__V2,
		467	(__v4sf)__M);
		468	}
		469
		470	/// Returns a 128-bit vector of [16 x i8] where the values are selected
		471	/// from either of the first or second operand as specified by the third
		472	/// operand, the control mask.
		473	///
		474	/// \headerfile <x86intrin.h>
		475	///
		476	/// This intrinsic corresponds to the <c> VPBLENDVB / PBLENDVB </c> instruction.
		477	///
		478	/// \param __V1
		479	/// A 128-bit vector of [16 x i8].
		480	/// \param __V2
		481	/// A 128-bit vector of [16 x i8].
		482	/// \param __M
		483	/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
		484	/// how the values are to be copied. The position of the mask bit corresponds
		485	/// to the most significant bit of a copied value. When a mask bit is 0, the
		486	/// corresponding 8-bit element in operand \a __V1 is copied to the same
		487	/// position in the result. When a mask bit is 1, the corresponding 8-bit
		488	/// element in operand \a __V2 is copied to the same position in the result.
		489	/// \returns A 128-bit vector of [16 x i8] containing the copied values.
		490	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_blendv_epi8(__m128i __V1,
		491	__m128i __V2,
		492	__m128i __M) {
		493	return (__m128i)__builtin_ia32_pblendvb128((__v16qi)__V1, (__v16qi)__V2,
		494	(__v16qi)__M);
		495	}
		496
		497	/// Returns a 128-bit vector of [8 x i16] where the values are selected
		498	/// from either of the first or second operand as specified by the third
		499	/// operand, the control mask.
		500	///
		501	/// \headerfile <x86intrin.h>
		502	///
		503	/// \code
		504	/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M);
		505	/// \endcode
		506	///
		507	/// This intrinsic corresponds to the <c> VPBLENDW / PBLENDW </c> instruction.
		508	///
		509	/// \param V1
		510	/// A 128-bit vector of [8 x i16].
		511	/// \param V2
		512	/// A 128-bit vector of [8 x i16].
		513	/// \param M
		514	/// An immediate integer operand, with mask bits [7:0] specifying how the
		515	/// values are to be copied. The position of the mask bit corresponds to the
		516	/// index of a copied value. When a mask bit is 0, the corresponding 16-bit
		517	/// element in operand \a V1 is copied to the same position in the result.
		518	/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2
		519	/// is copied to the same position in the result.
		520	/// \returns A 128-bit vector of [8 x i16] containing the copied values.
		521	#define _mm_blend_epi16(V1, V2, M) \
		522	((__m128i)__builtin_ia32_pblendw128((__v8hi)(__m128i)(V1), \
		523	(__v8hi)(__m128i)(V2), (int)(M)))
		524
		525	/* SSE4 Dword Multiply Instructions. */
		526	/// Multiplies corresponding elements of two 128-bit vectors of [4 x i32]
		527	/// and returns the lower 32 bits of each product in a 128-bit vector of
		528	/// [4 x i32].
		529	///
		530	/// \headerfile <x86intrin.h>
		531	///
		532	/// This intrinsic corresponds to the <c> VPMULLD / PMULLD </c> instruction.
		533	///
		534	/// \param __V1
		535	/// A 128-bit integer vector.
		536	/// \param __V2
		537	/// A 128-bit integer vector.
		538	/// \returns A 128-bit integer vector containing the products of both operands.
		539	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mullo_epi32(__m128i __V1,
		540	__m128i __V2) {
		541	return (__m128i)((__v4su)__V1 * (__v4su)__V2);
		542	}
		543
		544	/// Multiplies corresponding even-indexed elements of two 128-bit
		545	/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
		546	/// containing the products.
		547	///
		548	/// \headerfile <x86intrin.h>
		549	///
		550	/// This intrinsic corresponds to the <c> VPMULDQ / PMULDQ </c> instruction.
		551	///
		552	/// \param __V1
		553	/// A 128-bit vector of [4 x i32].
		554	/// \param __V2
		555	/// A 128-bit vector of [4 x i32].
		556	/// \returns A 128-bit vector of [2 x i64] containing the products of both
		557	/// operands.
		558	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
		559	__m128i __V2) {
		560	return (__m128i)__builtin_ia32_pmuldq128((__v4si)__V1, (__v4si)__V2);
		561	}
		562
		563	/* SSE4 Floating Point Dot Product Instructions. */
		564	/// Computes the dot product of the two 128-bit vectors of [4 x float]
		565	/// and returns it in the elements of the 128-bit result vector of
		566	/// [4 x float].
		567	///
		568	/// The immediate integer operand controls which input elements
		569	/// will contribute to the dot product, and where the final results are
		570	/// returned.
		571	///
		572	/// \headerfile <x86intrin.h>
		573	///
		574	/// \code
		575	/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M);
		576	/// \endcode
		577	///
		578	/// This intrinsic corresponds to the <c> VDPPS / DPPS </c> instruction.
		579	///
		580	/// \param X
		581	/// A 128-bit vector of [4 x float].
		582	/// \param Y
		583	/// A 128-bit vector of [4 x float].
		584	/// \param M
		585	/// An immediate integer operand. Mask bits [7:4] determine which elements
		586	/// of the input vectors are used, with bit [4] corresponding to the lowest
		587	/// element and bit [7] corresponding to the highest element of each [4 x
		588	/// float] vector. If a bit is set, the corresponding elements from the two
		589	/// input vectors are used as an input for dot product; otherwise that input
		590	/// is treated as zero. Bits [3:0] determine which elements of the result
		591	/// will receive a copy of the final dot product, with bit [0] corresponding
		592	/// to the lowest element and bit [3] corresponding to the highest element of
		593	/// each [4 x float] subvector. If a bit is set, the dot product is returned
		594	/// in the corresponding element; otherwise that element is set to zero.
		595	/// \returns A 128-bit vector of [4 x float] containing the dot product.
		596	#define _mm_dp_ps(X, Y, M) \
		597	((__m128)__builtin_ia32_dpps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), (M)))
		598
		599	/// Computes the dot product of the two 128-bit vectors of [2 x double]
		600	/// and returns it in the elements of the 128-bit result vector of
		601	/// [2 x double].
		602	///
		603	/// The immediate integer operand controls which input
		604	/// elements will contribute to the dot product, and where the final results
		605	/// are returned.
		606	///
		607	/// \headerfile <x86intrin.h>
		608	///
		609	/// \code
		610	/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M);
		611	/// \endcode
		612	///
		613	/// This intrinsic corresponds to the <c> VDPPD / DPPD </c> instruction.
		614	///
		615	/// \param X
		616	/// A 128-bit vector of [2 x double].
		617	/// \param Y
		618	/// A 128-bit vector of [2 x double].
		619	/// \param M
		620	/// An immediate integer operand. Mask bits [5:4] determine which elements
		621	/// of the input vectors are used, with bit [4] corresponding to the lowest
		622	/// element and bit [5] corresponding to the highest element of each [2 x
		623	/// double] vector. If a bit is set, the corresponding elements from the two
		624	/// input vectors are used as an input for dot product; otherwise that input
		625	/// is treated as zero. Bits [1:0] determine which elements of the result
		626	/// will receive a copy of the final dot product, with bit [0] corresponding
		627	/// to the lowest element and bit [1] corresponding to the highest element of
		628	/// each [2 x double] vector. If a bit is set, the dot product is returned in
		629	/// the corresponding element; otherwise that element is set to zero.
		630	#define _mm_dp_pd(X, Y, M) \
		631	((__m128d)__builtin_ia32_dppd((__v2df)(__m128d)(X), (__v2df)(__m128d)(Y), \
		632	(M)))
		633
		634	/* SSE4 Streaming Load Hint Instruction. */
		635	/// Loads integer values from a 128-bit aligned memory location to a
		636	/// 128-bit integer vector.
		637	///
		638	/// \headerfile <x86intrin.h>
		639	///
		640	/// This intrinsic corresponds to the <c> VMOVNTDQA / MOVNTDQA </c> instruction.
		641	///
		642	/// \param __V
		643	/// A pointer to a 128-bit aligned memory location that contains the integer
		644	/// values.
		645	/// \returns A 128-bit integer vector containing the data stored at the
		646	/// specified memory location.
		647	static __inline__ __m128i __DEFAULT_FN_ATTRS
		648	_mm_stream_load_si128(__m128i const *__V) {
		649	return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
		650	}
		651
		652	/* SSE4 Packed Integer Min/Max Instructions. */
		653	/// Compares the corresponding elements of two 128-bit vectors of
		654	/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
		655	/// of the two values.
		656	///
		657	/// \headerfile <x86intrin.h>
		658	///
		659	/// This intrinsic corresponds to the <c> VPMINSB / PMINSB </c> instruction.
		660	///
		661	/// \param __V1
		662	/// A 128-bit vector of [16 x i8].
		663	/// \param __V2
		664	/// A 128-bit vector of [16 x i8].
		665	/// \returns A 128-bit vector of [16 x i8] containing the lesser values.
		666	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi8(__m128i __V1,
		667	__m128i __V2) {
		668	return (__m128i)__builtin_elementwise_min((__v16qs)__V1, (__v16qs)__V2);
		669	}
		670
		671	/// Compares the corresponding elements of two 128-bit vectors of
		672	/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
		673	/// greater value of the two.
		674	///
		675	/// \headerfile <x86intrin.h>
		676	///
		677	/// This intrinsic corresponds to the <c> VPMAXSB / PMAXSB </c> instruction.
		678	///
		679	/// \param __V1
		680	/// A 128-bit vector of [16 x i8].
		681	/// \param __V2
		682	/// A 128-bit vector of [16 x i8].
		683	/// \returns A 128-bit vector of [16 x i8] containing the greater values.
		684	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi8(__m128i __V1,
		685	__m128i __V2) {
		686	return (__m128i)__builtin_elementwise_max((__v16qs)__V1, (__v16qs)__V2);
		687	}
		688
		689	/// Compares the corresponding elements of two 128-bit vectors of
		690	/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
		691	/// value of the two.
		692	///
		693	/// \headerfile <x86intrin.h>
		694	///
		695	/// This intrinsic corresponds to the <c> VPMINUW / PMINUW </c> instruction.
		696	///
		697	/// \param __V1
		698	/// A 128-bit vector of [8 x u16].
		699	/// \param __V2
		700	/// A 128-bit vector of [8 x u16].
		701	/// \returns A 128-bit vector of [8 x u16] containing the lesser values.
		702	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu16(__m128i __V1,
		703	__m128i __V2) {
		704	return (__m128i)__builtin_elementwise_min((__v8hu)__V1, (__v8hu)__V2);
		705	}
		706
		707	/// Compares the corresponding elements of two 128-bit vectors of
		708	/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
		709	/// greater value of the two.
		710	///
		711	/// \headerfile <x86intrin.h>
		712	///
		713	/// This intrinsic corresponds to the <c> VPMAXUW / PMAXUW </c> instruction.
		714	///
		715	/// \param __V1
		716	/// A 128-bit vector of [8 x u16].
		717	/// \param __V2
		718	/// A 128-bit vector of [8 x u16].
		719	/// \returns A 128-bit vector of [8 x u16] in which each element is the greater of the corresponding elements of \a __V1 and \a __V2, compared as unsigned values.
		720	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu16(__m128i __V1,
		721	__m128i __V2) {
		722	return (__m128i)__builtin_elementwise_max((__v8hu)__V1, (__v8hu)__V2);
		723	}
		724
		725	/// Compares the corresponding elements of two 128-bit vectors of
		726	/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
		727	/// value of the two.
		728	///
		729	/// \headerfile <x86intrin.h>
		730	///
		731	/// This intrinsic corresponds to the <c> VPMINSD / PMINSD </c> instruction.
		732	///
		733	/// \param __V1
		734	/// A 128-bit vector of [4 x i32].
		735	/// \param __V2
		736	/// A 128-bit vector of [4 x i32].
		737	/// \returns A 128-bit vector of [4 x i32] in which each element is the lesser of the corresponding elements of \a __V1 and \a __V2, compared as signed values.
		738	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epi32(__m128i __V1,
		739	__m128i __V2) {
		740	return (__m128i)__builtin_elementwise_min((__v4si)__V1, (__v4si)__V2);
		741	}
		742
		743	/// Compares the corresponding elements of two 128-bit vectors of
		744	/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
		745	/// greater value of the two.
		746	///
		747	/// \headerfile <x86intrin.h>
		748	///
		749	/// This intrinsic corresponds to the <c> VPMAXSD / PMAXSD </c> instruction.
		750	///
		751	/// \param __V1
		752	/// A 128-bit vector of [4 x i32].
		753	/// \param __V2
		754	/// A 128-bit vector of [4 x i32].
		755	/// \returns A 128-bit vector of [4 x i32] in which each element is the greater of the corresponding elements of \a __V1 and \a __V2, compared as signed values.
		756	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epi32(__m128i __V1,
		757	__m128i __V2) {
		758	return (__m128i)__builtin_elementwise_max((__v4si)__V1, (__v4si)__V2);
		759	}
		760
		761	/// Compares the corresponding elements of two 128-bit vectors of
		762	/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
		763	/// value of the two.
		764	///
		765	/// \headerfile <x86intrin.h>
		766	///
		767	/// This intrinsic corresponds to the <c> VPMINUD / PMINUD </c> instruction.
		768	///
		769	/// \param __V1
		770	/// A 128-bit vector of [4 x u32].
		771	/// \param __V2
		772	/// A 128-bit vector of [4 x u32].
		773	/// \returns A 128-bit vector of [4 x u32] in which each element is the lesser of the corresponding elements of \a __V1 and \a __V2, compared as unsigned values.
		774	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_min_epu32(__m128i __V1,
		775	__m128i __V2) {
		776	return (__m128i)__builtin_elementwise_min((__v4su)__V1, (__v4su)__V2);
		777	}
		778
		779	/// Compares the corresponding elements of two 128-bit vectors of
		780	/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
		781	/// greater value of the two.
		782	///
		783	/// \headerfile <x86intrin.h>
		784	///
		785	/// This intrinsic corresponds to the <c> VPMAXUD / PMAXUD </c> instruction.
		786	///
		787	/// \param __V1
		788	/// A 128-bit vector of [4 x u32].
		789	/// \param __V2
		790	/// A 128-bit vector of [4 x u32].
		791	/// \returns A 128-bit vector of [4 x u32] in which each element is the greater of the corresponding elements of \a __V1 and \a __V2, compared as unsigned values.
		792	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
		793	__m128i __V2) {
		794	return (__m128i)__builtin_elementwise_max((__v4su)__V1, (__v4su)__V2);
		795	}
		796
		797	/* SSE4 Insertion and Extraction from XMM Register Instructions. */
		798	/// Takes the first argument \a X and inserts an element from the second
		799	/// argument \a Y as selected by the third argument \a N. That result then
		800	/// has elements zeroed out also as selected by the third argument \a N. The
		801	/// resulting 128-bit vector of [4 x float] is then returned.
		802	///
		803	/// \headerfile <x86intrin.h>
		804	///
		805	/// \code
		806	/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N);
		807	/// \endcode
		808	///
		809	/// This intrinsic corresponds to the <c> VINSERTPS </c> instruction.
		810	///
		811	/// \param X
		812	/// A 128-bit vector source operand of [4 x float]. With the exception of
		813	/// those bits in the result copied from parameter \a Y and zeroed by bits
		814	/// [3:0] of \a N, all bits from this parameter are copied to the result.
		815	/// \param Y
		816	/// A 128-bit vector source operand of [4 x float]. One single-precision
		817	/// floating-point element from this source, as determined by the immediate
		818	/// parameter, is copied to the result.
		819	/// \param N
		820	/// Specifies which bits from operand \a Y will be copied, which bits in the
		821	/// result they will be copied to, and which bits in the result will be
		822	/// cleared. The following assignments are made: \n
		823	/// Bits [7:6] specify the bits to copy from operand \a Y: \n
		824	/// 00: Selects bits [31:0] from operand \a Y. \n
		825	/// 01: Selects bits [63:32] from operand \a Y. \n
		826	/// 10: Selects bits [95:64] from operand \a Y. \n
		827	/// 11: Selects bits [127:96] from operand \a Y. \n
		828	/// Bits [5:4] specify the bits in the result to which the selected bits
		829	/// from operand \a Y are copied: \n
		830	/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n
		831	/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n
		832	/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n
		833	/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n
		834	/// Bits [3:0]: Zeroing mask. If any of these bits are set, the
		835	/// corresponding result element is cleared.
		836	/// \returns A 128-bit vector of [4 x float] containing the copied
		837	/// single-precision floating-point elements from the operands.
		838	#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
		839
		840	/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
		841	/// returns it, using the immediate value parameter \a N as a selector.
		842	///
		843	/// \headerfile <x86intrin.h>
		844	///
		845	/// \code
		846	/// int _mm_extract_ps(__m128 X, const int N);
		847	/// \endcode
		848	///
		849	/// This intrinsic corresponds to the <c> VEXTRACTPS / EXTRACTPS </c>
		850	/// instruction.
		851	///
		852	/// \param X
		853	/// A 128-bit vector of [4 x float].
		854	/// \param N
		855	/// An immediate value. Bits [1:0] determine which bits from the argument
		856	/// \a X are extracted and returned: \n
		857	/// 00: Bits [31:0] of parameter \a X are returned. \n
		858	/// 01: Bits [63:32] of parameter \a X are returned. \n
		859	/// 10: Bits [95:64] of parameter \a X are returned. \n
		860	/// 11: Bits [127:96] of parameter \a X are returned.
		861	/// \returns A 32-bit integer holding the raw bit pattern of the selected float element (a bit cast, not a numeric conversion).
		862	#define _mm_extract_ps(X, N) \
		863	__builtin_bit_cast( \
		864	int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)))
		865
		866	/* Miscellaneous insert and extract macros. */
		867	/* Extract the single-precision float at index N of X and assign it to the lvalue D. Expands to a do { } while (0) statement, so it cannot be used as an expression. */
		868	#define _MM_EXTRACT_FLOAT(D, X, N) \
		869	do { \
		870	(D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
		871	} while (0)
		872
		873	/* Build the immediate for _mm_insert_ps: X (source element index) is placed in bits [7:6],
		874	Y (destination element index) in bits [5:4], and Z (the zeroing mask) is OR-ed in unshifted. */
		875	#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) \| ((Y) << 4) \| (Z))
		876
		877	/* Copy element N of X into element 0 of an all-zero vector; the zeroing mask 0x0e additionally clears result elements 1 through 3. */
		878	#define _MM_PICK_OUT_PS(X, N) \
		879	_mm_insert_ps(_mm_setzero_ps(), (X), _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
		880
		881	/* Insert int into packed integer array at index. */
		882	/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
		883	/// the 128-bit integer vector parameter, and then inserting the lower 8 bits
		884	/// of an integer parameter \a I into an offset specified by the immediate
		885	/// value parameter \a N.
		886	///
		887	/// \headerfile <x86intrin.h>
		888	///
		889	/// \code
		890	/// __m128i _mm_insert_epi8(__m128i X, int I, const int N);
		891	/// \endcode
		892	///
		893	/// This intrinsic corresponds to the <c> VPINSRB / PINSRB </c> instruction.
		894	///
		895	/// \param X
		896	/// A 128-bit integer vector of [16 x i8]. This vector is copied to the
		897	/// result and then one of the sixteen elements in the result vector is
		898	/// replaced by the lower 8 bits of \a I.
		899	/// \param I
		900	/// An integer. The lower 8 bits of this operand are written to the result
		901	/// beginning at the offset specified by \a N.
		902	/// \param N
		903	/// An immediate value. Bits [3:0] specify the bit offset in the result at
		904	/// which the lower 8 bits of \a I are written. \n
		905	/// 0000: Bits [7:0] of the result are used for insertion. \n
		906	/// 0001: Bits [15:8] of the result are used for insertion. \n
		907	/// 0010: Bits [23:16] of the result are used for insertion. \n
		908	/// 0011: Bits [31:24] of the result are used for insertion. \n
		909	/// 0100: Bits [39:32] of the result are used for insertion. \n
		910	/// 0101: Bits [47:40] of the result are used for insertion. \n
		911	/// 0110: Bits [55:48] of the result are used for insertion. \n
		912	/// 0111: Bits [63:56] of the result are used for insertion. \n
		913	/// 1000: Bits [71:64] of the result are used for insertion. \n
		914	/// 1001: Bits [79:72] of the result are used for insertion. \n
		915	/// 1010: Bits [87:80] of the result are used for insertion. \n
		916	/// 1011: Bits [95:88] of the result are used for insertion. \n
		917	/// 1100: Bits [103:96] of the result are used for insertion. \n
		918	/// 1101: Bits [111:104] of the result are used for insertion. \n
		919	/// 1110: Bits [119:112] of the result are used for insertion. \n
		920	/// 1111: Bits [127:120] of the result are used for insertion.
		921	/// \returns A 128-bit vector of [16 x i8]: a copy of \a X in which the element selected by \a N is replaced by the lower 8 bits of \a I.
		922	#define _mm_insert_epi8(X, I, N) \
		923	((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), (int)(I), \
		924	(int)(N)))
		925
		926	/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
		927	/// the 128-bit integer vector parameter, and then inserting the 32-bit
		928	/// integer parameter \a I at the offset specified by the immediate value
		929	/// parameter \a N.
		930	///
		931	/// \headerfile <x86intrin.h>
		932	///
		933	/// \code
		934	/// __m128i _mm_insert_epi32(__m128i X, int I, const int N);
		935	/// \endcode
		936	///
		937	/// This intrinsic corresponds to the <c> VPINSRD / PINSRD </c> instruction.
		938	///
		939	/// \param X
		940	/// A 128-bit integer vector of [4 x i32]. This vector is copied to the
		941	/// result and then one of the four elements in the result vector is
		942	/// replaced by \a I.
		943	/// \param I
		944	/// A 32-bit integer that is written to the result beginning at the offset
		945	/// specified by \a N.
		946	/// \param N
		947	/// An immediate value. Bits [1:0] specify the bit offset in the result at
		948	/// which the integer \a I is written. \n
		949	/// 00: Bits [31:0] of the result are used for insertion. \n
		950	/// 01: Bits [63:32] of the result are used for insertion. \n
		951	/// 10: Bits [95:64] of the result are used for insertion. \n
		952	/// 11: Bits [127:96] of the result are used for insertion.
		953	/// \returns A 128-bit vector of [4 x i32]: a copy of \a X in which the element selected by \a N is replaced by \a I.
		954	#define _mm_insert_epi32(X, I, N) \
		955	((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), (int)(I), \
		956	(int)(N)))
		957
		958	#ifdef __x86_64__
		959	/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
		960	/// the 128-bit integer vector parameter, and then inserting the 64-bit
		961	/// integer parameter \a I, using the immediate value parameter \a N as an
		962	/// insertion location selector.
		963	///
		964	/// \headerfile <x86intrin.h>
		965	///
		966	/// \code
		967	/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N);
		968	/// \endcode
		969	///
		970	/// This intrinsic corresponds to the <c> VPINSRQ / PINSRQ </c> instruction.
		971	///
		972	/// \param X
		973	/// A 128-bit integer vector of [2 x i64]. This vector is copied to the
		974	/// result and then one of the two elements in the result vector is replaced
		975	/// by \a I.
		976	/// \param I
		977	/// A 64-bit integer that is written to the result beginning at the offset
		978	/// specified by \a N.
		979	/// \param N
		980	/// An immediate value. Bit [0] specifies the bit offset in the result at
		981	/// which the integer \a I is written. \n
		982	/// 0: Bits [63:0] of the result are used for insertion. \n
		983	/// 1: Bits [127:64] of the result are used for insertion.
		984	/// \returns A 128-bit vector of [2 x i64]: a copy of \a X in which the element selected by \a N is replaced by \a I.
		985	#define _mm_insert_epi64(X, I, N) \
		986	((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), (long long)(I), \
		987	(int)(N)))
		988	#endif /* __x86_64__ */
		989
		990	/* Extract int from packed integer array at index. This returns the element
		991	* as a zero-extended value, so it is unsigned.
		992	*/
		993	/// Extracts an 8-bit element from the 128-bit integer vector of
		994	/// [16 x i8], using the immediate value parameter \a N as a selector.
		995	///
		996	/// \headerfile <x86intrin.h>
		997	///
		998	/// \code
		999	/// int _mm_extract_epi8(__m128i X, const int N);
		1000	/// \endcode
		1001	///
		1002	/// This intrinsic corresponds to the <c> VPEXTRB / PEXTRB </c> instruction.
		1003	///
		1004	/// \param X
		1005	/// A 128-bit integer vector.
		1006	/// \param N
		1007	/// An immediate value. Bits [3:0] specify which 8-bit vector element from
		1008	/// the argument \a X to extract and copy to the result. \n
		1009	/// 0000: Bits [7:0] of parameter \a X are extracted. \n
		1010	/// 0001: Bits [15:8] of the parameter \a X are extracted. \n
		1011	/// 0010: Bits [23:16] of the parameter \a X are extracted. \n
		1012	/// 0011: Bits [31:24] of the parameter \a X are extracted. \n
		1013	/// 0100: Bits [39:32] of the parameter \a X are extracted. \n
		1014	/// 0101: Bits [47:40] of the parameter \a X are extracted. \n
		1015	/// 0110: Bits [55:48] of the parameter \a X are extracted. \n
		1016	/// 0111: Bits [63:56] of the parameter \a X are extracted. \n
		1017	/// 1000: Bits [71:64] of the parameter \a X are extracted. \n
		1018	/// 1001: Bits [79:72] of the parameter \a X are extracted. \n
		1019	/// 1010: Bits [87:80] of the parameter \a X are extracted. \n
		1020	/// 1011: Bits [95:88] of the parameter \a X are extracted. \n
		1021	/// 1100: Bits [103:96] of the parameter \a X are extracted. \n
		1022	/// 1101: Bits [111:104] of the parameter \a X are extracted. \n
		1023	/// 1110: Bits [119:112] of the parameter \a X are extracted. \n
		1024	/// 1111: Bits [127:120] of the parameter \a X are extracted.
		1025	/// \returns An integer in the range [0, 255]: the selected 8-bit element of
		1026	/// \a X, zero-extended to \c int (the element is cast to unsigned char
		1027	/// before widening, so the upper bits of the result are zero).
		1028	#define _mm_extract_epi8(X, N) \
		1029	((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
		1030	(int)(N)))
		1031
		1032	/// Extracts a 32-bit element from the 128-bit integer vector of
		1033	/// [4 x i32], using the immediate value parameter \a N as a selector.
		1034	///
		1035	/// \headerfile <x86intrin.h>
		1036	///
		1037	/// \code
		1038	/// int _mm_extract_epi32(__m128i X, const int N);
		1039	/// \endcode
		1040	///
		1041	/// This intrinsic corresponds to the <c> VPEXTRD / PEXTRD </c> instruction.
		1042	///
		1043	/// \param X
		1044	/// A 128-bit integer vector.
		1045	/// \param N
		1046	/// An immediate value. Bits [1:0] specify which 32-bit vector element from
		1047	/// the argument \a X to extract and copy to the result. \n
		1048	/// 00: Bits [31:0] of the parameter \a X are extracted. \n
		1049	/// 01: Bits [63:32] of the parameter \a X are extracted. \n
		1050	/// 10: Bits [95:64] of the parameter \a X are extracted. \n
		1051	/// 11: Bits [127:96] of the parameter \a X are extracted.
		1052	/// \returns A 32-bit integer containing the element of \a X selected by
		1053	/// \a N.
		1054	#define _mm_extract_epi32(X, N) \
		1055	((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)))
		1056
		1057	/// Extracts a 64-bit element from the 128-bit integer vector of
		1058	/// [2 x i64], using the immediate value parameter \a N as a selector.
		1059	///
		1060	/// \headerfile <x86intrin.h>
		1061	///
		1062	/// \code
		1063	/// long long _mm_extract_epi64(__m128i X, const int N);
		1064	/// \endcode
		1065	///
		1066	/// This intrinsic corresponds to the <c> VPEXTRQ / PEXTRQ </c> instruction
		1067	/// in 64-bit mode.
		1068	///
		1069	/// \param X
		1070	/// A 128-bit integer vector.
		1071	/// \param N
		1072	/// An immediate value. Bit [0] specifies which 64-bit vector element from
		1073	/// the argument \a X to return. \n
		1074	/// 0: Bits [63:0] are returned. \n
		1075	/// 1: Bits [127:64] are returned.
		1076	/// \returns A 64-bit integer containing the element of \a X selected by \a N.
		1077	#define _mm_extract_epi64(X, N) \
		1078	((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)))
		1079
		1080	/* SSE4 128-bit Packed Integer Comparisons. */
		1081	/// Tests whether the specified bits in a 128-bit integer vector are all
		1082	/// zeros.
		1083	///
		1084	/// \headerfile <x86intrin.h>
		1085	///
		1086	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
		1087	///
		1088	/// \param __M
		1089	/// A 128-bit integer vector containing the bits to be tested.
		1090	/// \param __V
		1091	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
		1092	/// \returns TRUE (nonzero) if all bits of \a __M selected by \a __V are zeros; FALSE (zero) otherwise.
		1093	static __inline__ int __DEFAULT_FN_ATTRS _mm_testz_si128(__m128i __M,
		1094	__m128i __V) {
		1095	return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
		1096	}
		1097
		1098	/// Tests whether the specified bits in a 128-bit integer vector are all
		1099	/// ones.
		1100	///
		1101	/// \headerfile <x86intrin.h>
		1102	///
		1103	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
		1104	///
		1105	/// \param __M
		1106	/// A 128-bit integer vector containing the bits to be tested.
		1107	/// \param __V
		1108	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
		1109	/// \returns TRUE (nonzero) if all bits of \a __M selected by \a __V are ones; FALSE (zero) otherwise.
		1110	static __inline__ int __DEFAULT_FN_ATTRS _mm_testc_si128(__m128i __M,
		1111	__m128i __V) {
		1112	return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
		1113	}
		1114
		1115	/// Tests whether the specified bits in a 128-bit integer vector are
		1116	/// neither all zeros nor all ones.
		1117	///
		1118	/// \headerfile <x86intrin.h>
		1119	///
		1120	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
		1121	///
		1122	/// \param __M
		1123	/// A 128-bit integer vector containing the bits to be tested.
		1124	/// \param __V
		1125	/// A 128-bit integer vector selecting which bits to test in operand \a __M.
		1126	/// \returns TRUE (nonzero) if the bits of \a __M selected by \a __V are
		1127	/// neither all zeros nor all ones; FALSE (zero) otherwise.
		1128	static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
		1129	__m128i __V) {
		1130	return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
		1131	}
		1132
		1133	/// Tests whether the specified bits in a 128-bit integer vector are all
		1134	/// ones.
		1135	///
		1136	/// \headerfile <x86intrin.h>
		1137	///
		1138	/// \code
		1139	/// int _mm_test_all_ones(__m128i V);
		1140	/// \endcode
		1141	///
		1142	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
		1143	///
		1144	/// \param V
		1145	/// A 128-bit integer vector containing the bits to be tested.
		1146	/// \returns TRUE if every bit of \a V is set to 1; FALSE otherwise. Expands
		1147	/// to _mm_testc_si128() with \a V tested against an all-ones mask.
		1148	#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_set1_epi32(-1))
		1149
		1150	/// Tests whether the specified bits in a 128-bit integer vector are
		1151	/// neither all zeros nor all ones.
		1152	///
		1153	/// \headerfile <x86intrin.h>
		1154	///
		1155	/// \code
		1156	/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V);
		1157	/// \endcode
		1158	///
		1159	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
		1160	///
		1161	/// \param M
		1162	/// A 128-bit integer vector containing the bits to be tested.
		1163	/// \param V
		1164	/// A 128-bit integer vector selecting which bits to test in operand \a M.
		1165	/// \returns TRUE if the bits of \a M selected by \a V are neither all zeros
		1166	/// nor all ones; FALSE otherwise. Expands to _mm_testnzc_si128(M, V).
		1167	#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
		1168
		1169	/// Tests whether the specified bits in a 128-bit integer vector are all
		1170	/// zeros.
		1171	///
		1172	/// \headerfile <x86intrin.h>
		1173	///
		1174	/// \code
		1175	/// int _mm_test_all_zeros(__m128i M, __m128i V);
		1176	/// \endcode
		1177	///
		1178	/// This intrinsic corresponds to the <c> VPTEST / PTEST </c> instruction.
		1179	///
		1180	/// \param M
		1181	/// A 128-bit integer vector containing the bits to be tested.
		1182	/// \param V
		1183	/// A 128-bit integer vector selecting which bits to test in operand \a M.
		1184	/// \returns TRUE if all bits of \a M selected by \a V are zeros; FALSE otherwise. Expands to _mm_testz_si128(M, V).
		1185	#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))
		1186
		1187	/* SSE4 64-bit Packed Integer Comparisons. */
		1188	/// Compares each of the corresponding 64-bit values of the 128-bit
		1189	/// integer vectors for equality.
		1190	///
		1191	/// \headerfile <x86intrin.h>
		1192	///
		1193	/// This intrinsic corresponds to the <c> VPCMPEQQ / PCMPEQQ </c> instruction.
		1194	///
		1195	/// \param __V1
		1196	/// A 128-bit integer vector.
		1197	/// \param __V2
		1198	/// A 128-bit integer vector.
		1199	/// \returns A 128-bit vector of [2 x i64] in which each element has all bits set if the corresponding 64-bit elements of \a __V1 and \a __V2 are equal, and is zero otherwise.
		1200	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
		1201	__m128i __V2) {
		1202	return (__m128i)((__v2di)__V1 == (__v2di)__V2);
		1203	}
		1204
		1205	/* SSE4 Packed Integer Sign-Extension. */
		1206	/// Sign-extends each of the lower eight 8-bit integer elements of a
		1207	/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
		1208	/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
		1209	/// are unused.
		1210	///
		1211	/// \headerfile <x86intrin.h>
		1212	///
		1213	/// This intrinsic corresponds to the <c> VPMOVSXBW / PMOVSXBW </c> instruction.
		1214	///
		1215	/// \param __V
		1216	/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
		1217	/// sign-extended to 16-bit values.
		1218	/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values.
		1219	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi16(__m128i __V) {
		1220	/* The widening must always be a sign extension. __v16qi has plain char elements,
		1221	whose signedness may vary, so cast through __v16qs instead; elements 0-7 are then widened. */
		1222	return (__m128i) __builtin_convertvector(
		1223	__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
		1224	7),
		1225	__v8hi);
		1226	}
		1227
		1228	/// Sign-extends each of the lower four 8-bit integer elements of a
		1229	/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
		1230	/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
		1231	/// vector are unused.
		1232	///
		1233	/// \headerfile <x86intrin.h>
		1234	///
		1235	/// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
		1236	///
		1237	/// \param __V
		1238	/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
		1239	/// sign-extended to 32-bit values.
		1240	/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
		1241	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) {
		1242	/* The widening must always be a sign extension. __v16qi has plain char elements,
		1243	whose signedness may vary, so cast through __v16qs instead; elements 0-3 are then widened. */
		1244	return (__m128i) __builtin_convertvector(
		1245	__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
		1246	}
		1247
		1248	/// Sign-extends each of the lower two 8-bit integer elements of a
		1249	/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
		1250	/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
		1251	/// vector are unused.
		1252	///
		1253	/// \headerfile <x86intrin.h>
		1254	///
		1255	/// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
		1256	///
		1257	/// \param __V
		1258	/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
		1259	/// sign-extended to 64-bit values.
		1260	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
		1261	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) {
		1262	/* The widening must always be a sign extension. __v16qi has plain char elements,
		1263	whose signedness may vary, so cast through __v16qs instead; elements 0-1 are then widened. */
		1264	return (__m128i) __builtin_convertvector(
		1265	__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
		1266	}
		1267
		1268	/// Sign-extends each of the lower four 16-bit integer elements of a
		1269	/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
		1270	/// a 128-bit vector of [4 x i32]. The upper four elements of the input
		1271	/// vector are unused.
		1272	///
		1273	/// \headerfile <x86intrin.h>
		1274	///
		1275	/// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
		1276	///
		1277	/// \param __V
		1278	/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
		1279	/// sign-extended to 32-bit values.
		1280	/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values of elements 0 through 3 of \a __V.
		1281	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) {
		1282	return (__m128i) __builtin_convertvector(
		1283	__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
		1284	}
		1285
		1286	/// Sign-extends each of the lower two 16-bit integer elements of a
		1287	/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
		1288	/// a 128-bit vector of [2 x i64]. The upper six elements of the input
		1289	/// vector are unused.
		1290	///
		1291	/// \headerfile <x86intrin.h>
		1292	///
		1293	/// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
		1294	///
		1295	/// \param __V
		1296	/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
		1297	/// sign-extended to 64-bit values.
		1298	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values of elements 0 and 1 of \a __V.
		1299	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) {
		1300	return (__m128i) __builtin_convertvector(
		1301	__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
		1302	}
		1303
		1304	/// Sign-extends each of the lower two 32-bit integer elements of a
		1305	/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in
		1306	/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector
		1307	/// are unused.
		1308	///
		1309	/// \headerfile <x86intrin.h>
		1310	///
		1311	/// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
		1312	///
		1313	/// \param __V
		1314	/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
		1315	/// sign-extended to 64-bit values.
		1316	/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values of elements 0 and 1 of \a __V.
		1317	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) {
		1318	return (__m128i) __builtin_convertvector(
		1319	__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di);
		1320	}
		1321
		1322	/* SSE4 Packed Integer Zero-Extension. */
		1323	/// Zero-extends each of the lower eight 8-bit integer elements of a
		1324	/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a
		1325	/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector
		1326	/// are unused.
		1327	///
		1328	/// \headerfile <x86intrin.h>
		1329	///
		1330	/// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
		1331	///
		1332	/// \param __V
		1333	/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
		1334	/// zero-extended to 16-bit values.
		1335	/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values of elements 0 through 7 of \a __V.
		1336	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) {
		1337	return (__m128i) __builtin_convertvector(
		1338	__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
		1339	7),
		1340	__v8hi);
		1341	}
		1342
		1343	/// Zero-extends each of the lower four 8-bit integer elements of a
		1344	/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a
		1345	/// 128-bit vector of [4 x i32]. The upper twelve elements of the input
		1346	/// vector are unused.
		1347	///
		1348	/// \headerfile <x86intrin.h>
		1349	///
		1350	/// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
		1351	///
		1352	/// \param __V
		1353	/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
		1354	/// zero-extended to 32-bit values.
		1355	/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values of elements 0 through 3 of \a __V.
		1356	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) {
		1357	return (__m128i) __builtin_convertvector(
		1358	__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
		1359	}
		1360
		1361	/// Zero-extends each of the lower two 8-bit integer elements of a
		1362	/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in
		1363	/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
		1364	/// vector are unused.
		1365	///
		1366	/// \headerfile <x86intrin.h>
		1367	///
		1368	/// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
		1369	///
		1370	/// \param __V
		1371	/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
		1372	/// zero-extended to 64-bit values.
		1373	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values of elements 0 and 1 of \a __V.
		1374	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) {
		1375	return (__m128i) __builtin_convertvector(
		1376	__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
		1377	}
		1378
		1379	/// Zero-extends each of the lower four 16-bit integer elements of a
		1380	/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in
		1381	/// a 128-bit vector of [4 x i32]. The upper four elements of the input
		1382	/// vector are unused.
		1383	///
		1384	/// \headerfile <x86intrin.h>
		1385	///
		1386	/// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
		1387	///
		1388	/// \param __V
		1389	/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
		1390	/// zero-extended to 32-bit values.
		1391	/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values of elements 0 through 3 of \a __V.
		1392	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) {
		1393	return (__m128i) __builtin_convertvector(
		1394	__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
		1395	}
		1396
		1397	/// Zero-extends each of the lower two 16-bit integer elements of a
		1398	/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in
		1399	/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector
		1400	/// are unused.
		1401	///
		1402	/// \headerfile <x86intrin.h>
		1403	///
		1404	/// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
		1405	///
		1406	/// \param __V
		1407	/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
		1408	/// zero-extended to 64-bit values.
		1409	/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values of elements 0 and 1 of \a __V.
		1410	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) {
		1411	return (__m128i) __builtin_convertvector(
		1412	__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
		1413	}
		1414
		1415	/// Widens the two low-order unsigned 32-bit elements of a 128-bit vector
		1416	/// of [4 x i32] to 64 bits by zero-extension and returns them as a
		1417	/// 128-bit vector of [2 x i64]. The two high-order input elements are
		1418	/// ignored.
		1419	///
		1420	/// \headerfile <x86intrin.h>
		1421	///
		1422	/// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
		1423	///
		1424	/// \param __V
		1425	/// A 128-bit vector of [4 x i32]; only its two low-order elements are read.
		1426	/// \returns A 128-bit vector of [2 x i64] holding the zero-extended values.
		1427	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) {
		1428	__v4su __dwords = (__v4su)__V;
		1429	return (__m128i)__builtin_convertvector(
		1430	__builtin_shufflevector(__dwords, __dwords, 0, 1), __v2di);
		1431	}
		1432
		1433	/* SSE4 Pack with Unsigned Saturation. */
		1434	/// Packs the signed 32-bit integers of two 128-bit vector operands into
		1435	/// unsigned 16-bit integers using unsigned saturation and returns the
		1436	/// packed result. Inputs greater than 0xFFFF become 0xFFFF; inputs less
		1437	/// than 0x0000 become 0x0000.
		1438	///
		1439	/// \headerfile <x86intrin.h>
		1440	///
		1441	/// This intrinsic corresponds to the <c> VPACKUSDW / PACKUSDW </c> instruction.
		1442	///
		1443	/// \param __V1
		1444	/// A 128-bit vector of [4 x i32]. Every element is interpreted as a signed
		1445	/// integer and narrowed to an unsigned 16-bit integer with saturation
		1446	/// (clamped to the range 0x0000 through 0xFFFF). The resulting [4 x i16]
		1447	/// values occupy the lower 64 bits of the result.
		1448	/// \param __V2
		1449	/// A 128-bit vector of [4 x i32]. Every element is interpreted as a signed
		1450	/// integer and narrowed to an unsigned 16-bit integer with saturation
		1451	/// (clamped to the range 0x0000 through 0xFFFF). The resulting [4 x i16]
		1452	/// values occupy the higher 64 bits of the result.
		1453	/// \returns A 128-bit vector of [8 x i16] holding the packed values.
		1454	static __inline__ __m128i __DEFAULT_FN_ATTRS
		1455	_mm_packus_epi32(__m128i __V1, __m128i __V2) {
		1456	__v4si __lo = (__v4si)__V1;
		1457	__v4si __hi = (__v4si)__V2;
		1458	return (__m128i)__builtin_ia32_packusdw128(__lo, __hi);
		1459	}
		1460
		1461	/* SSE4 Multiple Packed Sums of Absolute Difference. */
		1462	/// Computes eight sums of absolute differences between groups of four
		1463	/// unsigned 8-bit integers taken from the two source operands. Bit fields
		1464	/// in the immediate operand select the starting offsets of the groups
		1465	/// that are compared (see the description of \a M).
		1466	///
		1467	/// \headerfile <x86intrin.h>
		1468	///
		1469	/// \code
		1470	/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M);
		1471	/// \endcode
		1472	///
		1473	/// This intrinsic corresponds to the <c> VMPSADBW / MPSADBW </c> instruction.
		1474	///
		1475	/// \param X
		1476	/// A 128-bit vector of [16 x i8].
		1477	/// \param Y
		1478	/// A 128-bit vector of [16 x i8].
		1479	/// \param M
		1480	/// An 8-bit immediate operand specifying how the absolute differences are to
		1481	/// be calculated, according to the following algorithm:
		1482	/// \code
		1483	/// // M2 represents bit 2 of the immediate operand
		1484	/// // M10 represents bits [1:0] of the immediate operand
		1485	/// i = M2 * 4;
		1486	/// j = M10 * 4;
		1487	/// for (k = 0; k < 8; k = k + 1) {
		1488	/// d0 = abs(X[i + k + 0] - Y[j + 0]);
		1489	/// d1 = abs(X[i + k + 1] - Y[j + 1]);
		1490	/// d2 = abs(X[i + k + 2] - Y[j + 2]);
		1491	/// d3 = abs(X[i + k + 3] - Y[j + 3]);
		1492	/// r[k] = d0 + d1 + d2 + d3;
		1493	/// }
		1494	/// \endcode
		1495	/// \returns A 128-bit integer vector containing the eight sums of absolute
		1496	/// differences computed from both operands.
		1497	#define _mm_mpsadbw_epu8(X, Y, M) \
		1498	((__m128i)__builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
		1499	(__v16qi)(__m128i)(Y), (M)))
		1500
		1501	/// Locates the smallest unsigned 16-bit element of the 128-bit input
		1502	/// vector of [8 x u16] and returns that value together with its index.
		1503	///
		1504	/// \headerfile <x86intrin.h>
		1505	///
		1506	/// This intrinsic corresponds to the <c> VPHMINPOSUW / PHMINPOSUW </c> instruction.
		1507	///
		1508	/// \param __V
		1509	/// A 128-bit vector of [8 x u16].
		1510	/// \returns A 128-bit value in which bits [15:0] hold the minimum value found
		1511	/// in \a __V, bits [18:16] hold the index of that value, and all remaining
		1512	/// bits are 0.
		1513	static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
		1514	__v8hi __words = (__v8hi)__V;
		1515	return (__m128i)__builtin_ia32_phminposuw128(__words);
		1516	}
		1517
		1518	/* SSE4.2 definitions follow; __DEFAULT_FN_ATTRS is redefined to target "sse4.2". */
		1519
		1520	/* These definitions normally live in nmmintrin.h, but GCC places them in this
		1521	header, so we do the same. */
		1522
		1523	#undef __DEFAULT_FN_ATTRS
		1524	#define __DEFAULT_FN_ATTRS \
		1525	__attribute__((__always_inline__, __nodebug__, __target__("sse4.2")))
		1526
		1527	/* These specify the type of data being compared (bits [1:0] of the immediate). */
		1528	#define _SIDD_UBYTE_OPS 0x00
		1529	#define _SIDD_UWORD_OPS 0x01
		1530	#define _SIDD_SBYTE_OPS 0x02
		1531	#define _SIDD_SWORD_OPS 0x03
		1532
		1533	/* These specify the type of comparison operation (bits [3:2] of the immediate). */
		1534	#define _SIDD_CMP_EQUAL_ANY 0x00
		1535	#define _SIDD_CMP_RANGES 0x04
		1536	#define _SIDD_CMP_EQUAL_EACH 0x08
		1537	#define _SIDD_CMP_EQUAL_ORDERED 0x0c
		1538
		1539	/* These macros specify the polarity of the operation (bits [5:4] of the immediate). */
		1540	#define _SIDD_POSITIVE_POLARITY 0x00
		1541	#define _SIDD_NEGATIVE_POLARITY 0x10
		1542	#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
		1543	#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30
		1544
		1545	/* These macros are used in _mm_cmpXstri() to select which set-bit index is returned (bit 6). */
		1546	#define _SIDD_LEAST_SIGNIFICANT 0x00
		1547	#define _SIDD_MOST_SIGNIFICANT 0x40
		1548
		1549	/* These macros are used in _mm_cmpXstrm() to select the format of the returned mask (bit 6). */
		1550	#define _SIDD_BIT_MASK 0x00
		1551	#define _SIDD_UNIT_MASK 0x40
		1552
		1553	/* SSE4.2 Packed Comparison Intrinsics. */
		1554	/// Uses the immediate operand \a M to perform a comparison of string
		1555	/// data with implicitly defined lengths that is contained in source operands
		1556	/// \a A and \a B. Returns a 128-bit integer vector representing the result
		1557	/// mask of the comparison.
		1558	///
		1559	/// \headerfile <x86intrin.h>
		1560	///
		1561	/// \code
		1562	/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M);
		1563	/// \endcode
		1564	///
		1565	/// This intrinsic corresponds to the <c> VPCMPISTRM / PCMPISTRM </c>
		1566	/// instruction.
		1567	///
		1568	/// \param A
		1569	/// A 128-bit integer vector containing one of the source operands to be
		1570	/// compared.
		1571	/// \param B
		1572	/// A 128-bit integer vector containing one of the source operands to be
		1573	/// compared.
		1574	/// \param M
		1575	/// An 8-bit immediate operand specifying whether the characters are bytes or
		1576	/// words, the type of comparison to perform, and the format of the return
		1577	/// value. \n
		1578	/// Bits [1:0]: Determine source data format. \n
		1579	/// 00: 16 unsigned bytes \n
		1580	/// 01: 8 unsigned words \n
		1581	/// 10: 16 signed bytes \n
		1582	/// 11: 8 signed words \n
		1583	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		1584	/// 00: Subset: Each character in \a B is compared for equality with all
		1585	/// the characters in \a A. \n
		1586	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		1587	/// basis is greater than or equal for even-indexed elements in \a A,
		1588	/// and less than or equal for odd-indexed elements in \a A. \n
		1589	/// 10: Match: Compare each pair of corresponding characters in \a A and
		1590	/// \a B for equality. \n
		1591	/// 11: Substring: Search \a B for substring matches of \a A. \n
		1592	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		1593	/// mask of the comparison results. \n
		1594	/// 00: No effect. \n
		1595	/// 01: Negate the bit mask. \n
		1596	/// 10: No effect. \n
		1597	/// 11: Negate the bit mask only for bits with an index less than or equal
		1598	/// to the size of \a A or \a B. \n
		1599	/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
		1600	/// bytes. \n
		1601	/// 0: The result is zero-extended to 16 bytes. \n
		1602	/// 1: The result is expanded to 16 bytes (this expansion is performed by
		1603	/// repeating each bit 8 or 16 times).
		1604	/// \returns A 128-bit integer vector representing the result mask of the
		1605	/// comparison.
		1606	#define _mm_cmpistrm(A, B, M) \
		1607	((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
		1608	(__v16qi)(__m128i)(B), (int)(M)))
		1609
		1610	/// Uses the immediate operand \a M to perform a comparison of string
		1611	/// data with implicitly defined lengths that is contained in source operands
		1612	/// \a A and \a B. Returns an integer representing the result index of the
		1613	/// comparison.
		1614	///
		1615	/// \headerfile <x86intrin.h>
		1616	///
		1617	/// \code
		1618	/// int _mm_cmpistri(__m128i A, __m128i B, const int M);
		1619	/// \endcode
		1620	///
		1621	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
		1622	/// instruction.
		1623	///
		1624	/// \param A
		1625	/// A 128-bit integer vector containing one of the source operands to be
		1626	/// compared.
		1627	/// \param B
		1628	/// A 128-bit integer vector containing one of the source operands to be
		1629	/// compared.
		1630	/// \param M
		1631	/// An 8-bit immediate operand specifying whether the characters are bytes or
		1632	/// words, the type of comparison to perform, and the format of the return
		1633	/// value. \n
		1634	/// Bits [1:0]: Determine source data format. \n
		1635	/// 00: 16 unsigned bytes \n
		1636	/// 01: 8 unsigned words \n
		1637	/// 10: 16 signed bytes \n
		1638	/// 11: 8 signed words \n
		1639	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		1640	/// 00: Subset: Each character in \a B is compared for equality with all
		1641	/// the characters in \a A. \n
		1642	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		1643	/// basis is greater than or equal for even-indexed elements in \a A,
		1644	/// and less than or equal for odd-indexed elements in \a A. \n
		1645	/// 10: Match: Compare each pair of corresponding characters in \a A and
		1646	/// \a B for equality. \n
		1647	/// 11: Substring: Search \a B for substring matches of \a A. \n
		1648	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		1649	/// mask of the comparison results. \n
		1650	/// 00: No effect. \n
		1651	/// 01: Negate the bit mask. \n
		1652	/// 10: No effect. \n
		1653	/// 11: Negate the bit mask only for bits with an index less than or equal
		1654	/// to the size of \a A or \a B. \n
		1655	/// Bit [6]: Determines whether the index of the lowest set bit or the
		1656	/// highest set bit is returned. \n
		1657	/// 0: The index of the least significant set bit. \n
		1658	/// 1: The index of the most significant set bit. \n
		1659	/// \returns An integer representing the result index of the comparison.
		1660	#define _mm_cmpistri(A, B, M) \
		1661	((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
		1662	(__v16qi)(__m128i)(B), (int)(M)))
		1663
		1664	/// Uses the immediate operand \a M to perform a comparison of string
		1665	/// data with explicitly defined lengths that is contained in source operands
		1666	/// \a A and \a B. Returns a 128-bit integer vector representing the result
		1667	/// mask of the comparison.
		1668	///
		1669	/// \headerfile <x86intrin.h>
		1670	///
		1671	/// \code
		1672	/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M);
		1673	/// \endcode
		1674	///
		1675	/// This intrinsic corresponds to the <c> VPCMPESTRM / PCMPESTRM </c>
		1676	/// instruction.
		1677	///
		1678	/// \param A
		1679	/// A 128-bit integer vector containing one of the source operands to be
		1680	/// compared.
		1681	/// \param LA
		1682	/// An integer that specifies the length of the string in \a A.
		1683	/// \param B
		1684	/// A 128-bit integer vector containing one of the source operands to be
		1685	/// compared.
		1686	/// \param LB
		1687	/// An integer that specifies the length of the string in \a B.
		1688	/// \param M
		1689	/// An 8-bit immediate operand specifying whether the characters are bytes or
		1690	/// words, the type of comparison to perform, and the format of the return
		1691	/// value. \n
		1692	/// Bits [1:0]: Determine source data format. \n
		1693	/// 00: 16 unsigned bytes \n
		1694	/// 01: 8 unsigned words \n
		1695	/// 10: 16 signed bytes \n
		1696	/// 11: 8 signed words \n
		1697	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		1698	/// 00: Subset: Each character in \a B is compared for equality with all
		1699	/// the characters in \a A. \n
		1700	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		1701	/// basis is greater than or equal for even-indexed elements in \a A,
		1702	/// and less than or equal for odd-indexed elements in \a A. \n
		1703	/// 10: Match: Compare each pair of corresponding characters in \a A and
		1704	/// \a B for equality. \n
		1705	/// 11: Substring: Search \a B for substring matches of \a A. \n
		1706	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		1707	/// mask of the comparison results. \n
		1708	/// 00: No effect. \n
		1709	/// 01: Negate the bit mask. \n
		1710	/// 10: No effect. \n
		1711	/// 11: Negate the bit mask only for bits with an index less than or equal
		1712	/// to the size of \a A or \a B. \n
		1713	/// Bit [6]: Determines whether the result is zero-extended or expanded to 16
		1714	/// bytes. \n
		1715	/// 0: The result is zero-extended to 16 bytes. \n
		1716	/// 1: The result is expanded to 16 bytes (this expansion is performed by
		1717	/// repeating each bit 8 or 16 times). \n
		1718	/// \returns A 128-bit integer vector representing the result mask of the
		1719	/// comparison.
		1720	#define _mm_cmpestrm(A, LA, B, LB, M) \
		1721	((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \
		1722	(__v16qi)(__m128i)(B), (int)(LB), \
		1723	(int)(M)))
		1724
		1725	/// Uses the immediate operand \a M to perform a comparison of string
		1726	/// data with explicitly defined lengths that is contained in source operands
		1727	/// \a A and \a B. Returns an integer representing the result index of the
		1728	/// comparison.
		1729	///
		1730	/// \headerfile <x86intrin.h>
		1731	///
		1732	/// \code
		1733	/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M);
		1734	/// \endcode
		1735	///
		1736	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
		1737	/// instruction.
		1738	///
		1739	/// \param A
		1740	/// A 128-bit integer vector containing one of the source operands to be
		1741	/// compared.
		1742	/// \param LA
		1743	/// An integer that specifies the length of the string in \a A.
		1744	/// \param B
		1745	/// A 128-bit integer vector containing one of the source operands to be
		1746	/// compared.
		1747	/// \param LB
		1748	/// An integer that specifies the length of the string in \a B.
		1749	/// \param M
		1750	/// An 8-bit immediate operand specifying whether the characters are bytes or
		1751	/// words, the type of comparison to perform, and the format of the return
		1752	/// value. \n
		1753	/// Bits [1:0]: Determine source data format. \n
		1754	/// 00: 16 unsigned bytes \n
		1755	/// 01: 8 unsigned words \n
		1756	/// 10: 16 signed bytes \n
		1757	/// 11: 8 signed words \n
		1758	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		1759	/// 00: Subset: Each character in \a B is compared for equality with all
		1760	/// the characters in \a A. \n
		1761	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		1762	/// basis is greater than or equal for even-indexed elements in \a A,
		1763	/// and less than or equal for odd-indexed elements in \a A. \n
		1764	/// 10: Match: Compare each pair of corresponding characters in \a A and
		1765	/// \a B for equality. \n
		1766	/// 11: Substring: Search \a B for substring matches of \a A. \n
		1767	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		1768	/// mask of the comparison results. \n
		1769	/// 00: No effect. \n
		1770	/// 01: Negate the bit mask. \n
		1771	/// 10: No effect. \n
		1772	/// 11: Negate the bit mask only for bits with an index less than or equal
		1773	/// to the size of \a A or \a B. \n
		1774	/// Bit [6]: Determines whether the index of the lowest set bit or the
		1775	/// highest set bit is returned. \n
		1776	/// 0: The index of the least significant set bit. \n
		1777	/// 1: The index of the most significant set bit. \n
		1778	/// \returns An integer representing the result index of the comparison.
		1779	#define _mm_cmpestri(A, LA, B, LB, M) \
		1780	((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \
		1781	(__v16qi)(__m128i)(B), (int)(LB), \
		1782	(int)(M)))
		1783
		1784	/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */
		1785	/// Uses the immediate operand \a M to perform a comparison of string
		1786	/// data with implicitly defined lengths that is contained in source operands
		1787	/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
		1788	/// string in \a B is the maximum, otherwise, returns 0.
		1789	///
		1790	/// \headerfile <x86intrin.h>
		1791	///
		1792	/// \code
		1793	/// int _mm_cmpistra(__m128i A, __m128i B, const int M);
		1794	/// \endcode
		1795	///
		1796	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
		1797	/// instruction.
		1798	///
		1799	/// \param A
		1800	/// A 128-bit integer vector containing one of the source operands to be
		1801	/// compared.
		1802	/// \param B
		1803	/// A 128-bit integer vector containing one of the source operands to be
		1804	/// compared.
		1805	/// \param M
		1806	/// An 8-bit immediate operand specifying whether the characters are bytes or
		1807	/// words and the type of comparison to perform. \n
		1808	/// Bits [1:0]: Determine source data format. \n
		1809	/// 00: 16 unsigned bytes \n
		1810	/// 01: 8 unsigned words \n
		1811	/// 10: 16 signed bytes \n
		1812	/// 11: 8 signed words \n
		1813	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		1814	/// 00: Subset: Each character in \a B is compared for equality with all
		1815	/// the characters in \a A. \n
		1816	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		1817	/// basis is greater than or equal for even-indexed elements in \a A,
		1818	/// and less than or equal for odd-indexed elements in \a A. \n
		1819	/// 10: Match: Compare each pair of corresponding characters in \a A and
		1820	/// \a B for equality. \n
		1821	/// 11: Substring: Search \a B for substring matches of \a A. \n
		1822	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		1823	/// mask of the comparison results. \n
		1824	/// 00: No effect. \n
		1825	/// 01: Negate the bit mask. \n
		1826	/// 10: No effect. \n
		1827	/// 11: Negate the bit mask only for bits with an index less than or equal
		1828	/// to the size of \a A or \a B. \n
		1829	/// \returns 1 if the bit mask is zero and the length of the string in \a B
		1830	/// is the maximum; otherwise, returns 0.
		1831	#define _mm_cmpistra(A, B, M) \
		1832	((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
		1833	(__v16qi)(__m128i)(B), (int)(M)))
		1834
		1835	/// Uses the immediate operand \a M to perform a comparison of string
		1836	/// data with implicitly defined lengths that is contained in source operands
		1837	/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
		1838	/// 0.
		1839	///
		1840	/// \headerfile <x86intrin.h>
		1841	///
		1842	/// \code
		1843	/// int _mm_cmpistrc(__m128i A, __m128i B, const int M);
		1844	/// \endcode
		1845	///
		1846	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
		1847	/// instruction.
		1848	///
		1849	/// \param A
		1850	/// A 128-bit integer vector containing one of the source operands to be
		1851	/// compared.
		1852	/// \param B
		1853	/// A 128-bit integer vector containing one of the source operands to be
		1854	/// compared.
		1855	/// \param M
		1856	/// An 8-bit immediate operand specifying whether the characters are bytes or
		1857	/// words and the type of comparison to perform. \n
		1858	/// Bits [1:0]: Determine source data format. \n
		1859	/// 00: 16 unsigned bytes \n
		1860	/// 01: 8 unsigned words \n
		1861	/// 10: 16 signed bytes \n
		1862	/// 11: 8 signed words \n
		1863	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		1864	/// 00: Subset: Each character in \a B is compared for equality with all
		1865	/// the characters in \a A. \n
		1866	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		1867	/// basis is greater than or equal for even-indexed elements in \a A,
		1868	/// and less than or equal for odd-indexed elements in \a A. \n
		1869	/// 10: Match: Compare each pair of corresponding characters in \a A and
		1870	/// \a B for equality. \n
		1871	/// 11: Substring: Search \a B for substring matches of \a A. \n
		1872	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		1873	/// mask of the comparison results. \n
		1874	/// 00: No effect. \n
		1875	/// 01: Negate the bit mask. \n
		1876	/// 10: No effect. \n
		1877	/// 11: Negate the bit mask only for bits with an index less than or equal
		1878	/// to the size of \a A or \a B.
		1879	/// \returns 1 if the bit mask is non-zero; otherwise, returns 0.
		1880	#define _mm_cmpistrc(A, B, M) \
		1881	((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
		1882	(__v16qi)(__m128i)(B), (int)(M)))
		1883
		1884	/// Uses the immediate operand \a M to perform a comparison of string
		1885	/// data with implicitly defined lengths that is contained in source operands
		1886	/// \a A and \a B. Returns bit 0 of the resulting bit mask.
		1887	///
		1888	/// \headerfile <x86intrin.h>
		1889	///
		1890	/// \code
		1891	/// int _mm_cmpistro(__m128i A, __m128i B, const int M);
		1892	/// \endcode
		1893	///
		1894	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
		1895	/// instruction.
		1896	///
		1897	/// \param A
		1898	/// A 128-bit integer vector containing one of the source operands to be
		1899	/// compared.
		1900	/// \param B
		1901	/// A 128-bit integer vector containing one of the source operands to be
		1902	/// compared.
		1903	/// \param M
		1904	/// An 8-bit immediate operand specifying whether the characters are bytes or
		1905	/// words and the type of comparison to perform. \n
		1906	/// Bits [1:0]: Determine source data format. \n
		1907	/// 00: 16 unsigned bytes \n
		1908	/// 01: 8 unsigned words \n
		1909	/// 10: 16 signed bytes \n
		1910	/// 11: 8 signed words \n
		1911	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		1912	/// 00: Subset: Each character in \a B is compared for equality with all
		1913	/// the characters in \a A. \n
		1914	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		1915	/// basis is greater than or equal for even-indexed elements in \a A,
		1916	/// and less than or equal for odd-indexed elements in \a A. \n
		1917	/// 10: Match: Compare each pair of corresponding characters in \a A and
		1918	/// \a B for equality. \n
		1919	/// 11: Substring: Search \a B for substring matches of \a A. \n
		1920	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		1921	/// mask of the comparison results. \n
		1922	/// 00: No effect. \n
		1923	/// 01: Negate the bit mask. \n
		1924	/// 10: No effect. \n
		1925	/// 11: Negate the bit mask only for bits with an index less than or equal
		1926	/// to the size of \a A or \a B. \n
		1927	/// \returns Bit 0 of the resulting bit mask.
		1928	#define _mm_cmpistro(A, B, M) \
		1929	((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
		1930	(__v16qi)(__m128i)(B), (int)(M)))
		1931
		1932	/// Uses the immediate operand \a M to perform a comparison of string
		1933	/// data with implicitly defined lengths that is contained in source operands
		1934	/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
		1935	/// the maximum, otherwise, returns 0.
		1936	///
		1937	/// \headerfile <x86intrin.h>
		1938	///
		1939	/// \code
		1940	/// int _mm_cmpistrs(__m128i A, __m128i B, const int M);
		1941	/// \endcode
		1942	///
		1943	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
		1944	/// instruction.
		1945	///
		1946	/// \param A
		1947	/// A 128-bit integer vector containing one of the source operands to be
		1948	/// compared.
		1949	/// \param B
		1950	/// A 128-bit integer vector containing one of the source operands to be
		1951	/// compared.
		1952	/// \param M
		1953	/// An 8-bit immediate operand specifying whether the characters are bytes or
		1954	/// words and the type of comparison to perform. \n
		1955	/// Bits [1:0]: Determine source data format. \n
		1956	/// 00: 16 unsigned bytes \n
		1957	/// 01: 8 unsigned words \n
		1958	/// 10: 16 signed bytes \n
		1959	/// 11: 8 signed words \n
		1960	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		1961	/// 00: Subset: Each character in \a B is compared for equality with all
		1962	/// the characters in \a A. \n
		1963	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		1964	/// basis is greater than or equal for even-indexed elements in \a A,
		1965	/// and less than or equal for odd-indexed elements in \a A. \n
		1966	/// 10: Match: Compare each pair of corresponding characters in \a A and
		1967	/// \a B for equality. \n
		1968	/// 11: Substring: Search \a B for substring matches of \a A. \n
		1969	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		1970	/// mask of the comparison results. \n
		1971	/// 00: No effect. \n
		1972	/// 01: Negate the bit mask. \n
		1973	/// 10: No effect. \n
		1974	/// 11: Negate the bit mask only for bits with an index less than or equal
		1975	/// to the size of \a A or \a B. \n
		1976	/// \returns 1 if the length of the string in \a A is less than the maximum;
		1977	/// otherwise, returns 0.
		1978	#define _mm_cmpistrs(A, B, M) \
		1979	((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
		1980	(__v16qi)(__m128i)(B), (int)(M)))
		1981
		1982	/// Uses the immediate operand \a M to perform a comparison of string
		1983	/// data with implicitly defined lengths that is contained in source operands
		1984	/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
		1985	/// the maximum, otherwise, returns 0.
		1986	///
		1987	/// \headerfile <x86intrin.h>
		1988	///
		1989	/// \code
		1990	/// int _mm_cmpistrz(__m128i A, __m128i B, const int M);
		1991	/// \endcode
		1992	///
		1993	/// This intrinsic corresponds to the <c> VPCMPISTRI / PCMPISTRI </c>
		1994	/// instruction.
		1995	///
		1996	/// \param A
		1997	/// A 128-bit integer vector containing one of the source operands to be
		1998	/// compared.
		1999	/// \param B
		2000	/// A 128-bit integer vector containing one of the source operands to be
		2001	/// compared.
		2002	/// \param M
		2003	/// An 8-bit immediate operand specifying whether the characters are bytes or
		2004	/// words and the type of comparison to perform. \n
		2005	/// Bits [1:0]: Determine source data format. \n
		2006	/// 00: 16 unsigned bytes \n
		2007	/// 01: 8 unsigned words \n
		2008	/// 10: 16 signed bytes \n
		2009	/// 11: 8 signed words \n
		2010	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		2011	/// 00: Subset: Each character in \a B is compared for equality with all
		2012	/// the characters in \a A. \n
		2013	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		2014	/// basis is greater than or equal for even-indexed elements in \a A,
		2015	/// and less than or equal for odd-indexed elements in \a A. \n
		2016	/// 10: Match: Compare each pair of corresponding characters in \a A and
		2017	/// \a B for equality. \n
		2018	/// 11: Substring: Search \a B for substring matches of \a A. \n
		2019	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		2020	/// mask of the comparison results. \n
		2021	/// 00: No effect. \n
		2022	/// 01: Negate the bit mask. \n
		2023	/// 10: No effect. \n
		2024	/// 11: Negate the bit mask only for bits with an index less than or equal
		2025	/// to the size of \a A or \a B.
		2026	/// \returns 1 if the length of the string in \a B is less than the maximum;
		2027	/// otherwise, returns 0.
		2028	#define _mm_cmpistrz(A, B, M) \
		2029	((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
		2030	(__v16qi)(__m128i)(B), (int)(M)))
		2031
		2032	/// Uses the immediate operand \a M to perform a comparison of string
		2033	/// data with explicitly defined lengths that is contained in source operands
		2034	/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the
		2035	/// string in \a B is the maximum, otherwise, returns 0.
		2036	///
		2037	/// \headerfile <x86intrin.h>
		2038	///
		2039	/// \code
		2040	/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M);
		2041	/// \endcode
		2042	///
		2043	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
		2044	/// instruction.
		2045	///
		2046	/// \param A
		2047	/// A 128-bit integer vector containing one of the source operands to be
		2048	/// compared.
		2049	/// \param LA
		2050	/// An integer that specifies the length of the string in \a A.
		2051	/// \param B
		2052	/// A 128-bit integer vector containing one of the source operands to be
		2053	/// compared.
		2054	/// \param LB
		2055	/// An integer that specifies the length of the string in \a B.
		2056	/// \param M
		2057	/// An 8-bit immediate operand specifying whether the characters are bytes or
		2058	/// words and the type of comparison to perform. \n
		2059	/// Bits [1:0]: Determine source data format. \n
		2060	/// 00: 16 unsigned bytes \n
		2061	/// 01: 8 unsigned words \n
		2062	/// 10: 16 signed bytes \n
		2063	/// 11: 8 signed words \n
		2064	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		2065	/// 00: Subset: Each character in \a B is compared for equality with all
		2066	/// the characters in \a A. \n
		2067	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		2068	/// basis is greater than or equal for even-indexed elements in \a A,
		2069	/// and less than or equal for odd-indexed elements in \a A. \n
		2070	/// 10: Match: Compare each pair of corresponding characters in \a A and
		2071	/// \a B for equality. \n
		2072	/// 11: Substring: Search \a B for substring matches of \a A. \n
		2073	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		2074	/// mask of the comparison results. \n
		2075	/// 00: No effect. \n
		2076	/// 01: Negate the bit mask. \n
		2077	/// 10: No effect. \n
		2078	/// 11: Negate the bit mask only for bits with an index less than or equal
		2079	/// to the size of \a A or \a B.
		2080	/// \returns Returns 1 if the bit mask is zero and the length of the string in
		2081	/// \a B is the maximum, otherwise, returns 0.
		2082	#define _mm_cmpestra(A, LA, B, LB, M) \
		2083	((int)__builtin_ia32_pcmpestria128( \
		2084	(__v16qi)(__m128i)(A), (int)(LA), \
		2085	(__v16qi)(__m128i)(B), (int)(LB), (int)(M)))
		2086
		2087	/// Uses the immediate operand \a M to perform a comparison of string
		2088	/// data with explicitly defined lengths that is contained in source operands
		2089	/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
		2090	/// returns 0.
		2091	///
		2092	/// \headerfile <x86intrin.h>
		2093	///
		2094	/// \code
		2095	/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M);
		2096	/// \endcode
		2097	///
		2098	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
		2099	/// instruction.
		2100	///
		2101	/// \param A
		2102	/// A 128-bit integer vector containing one of the source operands to be
		2103	/// compared.
		2104	/// \param LA
		2105	/// An integer that specifies the length of the string in \a A.
		2106	/// \param B
		2107	/// A 128-bit integer vector containing one of the source operands to be
		2108	/// compared.
		2109	/// \param LB
		2110	/// An integer that specifies the length of the string in \a B.
		2111	/// \param M
		2112	/// An 8-bit immediate operand specifying whether the characters are bytes or
		2113	/// words and the type of comparison to perform. \n
		2114	/// Bits [1:0]: Determine source data format. \n
		2115	/// 00: 16 unsigned bytes \n
		2116	/// 01: 8 unsigned words \n
		2117	/// 10: 16 signed bytes \n
		2118	/// 11: 8 signed words \n
		2119	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		2120	/// 00: Subset: Each character in \a B is compared for equality with all
		2121	/// the characters in \a A. \n
		2122	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		2123	/// basis is greater than or equal for even-indexed elements in \a A,
		2124	/// and less than or equal for odd-indexed elements in \a A. \n
		2125	/// 10: Match: Compare each pair of corresponding characters in \a A and
		2126	/// \a B for equality. \n
		2127	/// 11: Substring: Search \a B for substring matches of \a A. \n
		2128	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		2129	/// mask of the comparison results. \n
		2130	/// 00: No effect. \n
		2131	/// 01: Negate the bit mask. \n
		2132	/// 10: No effect. \n
		2133	/// 11: Negate the bit mask only for bits with an index less than or equal
		2134	/// to the size of \a A or \a B.
		2135	/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0.
		2136	#define _mm_cmpestrc(A, LA, B, LB, M) \
		2137	((int)__builtin_ia32_pcmpestric128( \
		2138	(__v16qi)(__m128i)(A), (int)(LA), \
		2139	(__v16qi)(__m128i)(B), (int)(LB), (int)(M)))
		2140
		2141	/// Uses the immediate operand \a M to perform a comparison of string
		2142	/// data with explicitly defined lengths that is contained in source operands
		2143	/// \a A and \a B. Returns bit 0 of the resulting bit mask.
		2144	///
		2145	/// \headerfile <x86intrin.h>
		2146	///
		2147	/// \code
		2148	/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M);
		2149	/// \endcode
		2150	///
		2151	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
		2152	/// instruction.
		2153	///
		2154	/// \param A
		2155	/// A 128-bit integer vector containing one of the source operands to be
		2156	/// compared.
		2157	/// \param LA
		2158	/// An integer that specifies the length of the string in \a A.
		2159	/// \param B
		2160	/// A 128-bit integer vector containing one of the source operands to be
		2161	/// compared.
		2162	/// \param LB
		2163	/// An integer that specifies the length of the string in \a B.
		2164	/// \param M
		2165	/// An 8-bit immediate operand specifying whether the characters are bytes or
		2166	/// words and the type of comparison to perform. \n
		2167	/// Bits [1:0]: Determine source data format. \n
		2168	/// 00: 16 unsigned bytes \n
		2169	/// 01: 8 unsigned words \n
		2170	/// 10: 16 signed bytes \n
		2171	/// 11: 8 signed words \n
		2172	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		2173	/// 00: Subset: Each character in \a B is compared for equality with all
		2174	/// the characters in \a A. \n
		2175	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		2176	/// basis is greater than or equal for even-indexed elements in \a A,
		2177	/// and less than or equal for odd-indexed elements in \a A. \n
		2178	/// 10: Match: Compare each pair of corresponding characters in \a A and
		2179	/// \a B for equality. \n
		2180	/// 11: Substring: Search \a B for substring matches of \a A. \n
		2181	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		2182	/// mask of the comparison results. \n
		2183	/// 00: No effect. \n
		2184	/// 01: Negate the bit mask. \n
		2185	/// 10: No effect. \n
		2186	/// 11: Negate the bit mask only for bits with an index less than or equal
		2187	/// to the size of \a A or \a B.
		2188	/// \returns Returns bit 0 of the resulting bit mask.
		2189	#define _mm_cmpestro(A, LA, B, LB, M) \
		2190	((int)__builtin_ia32_pcmpestrio128( \
		2191	(__v16qi)(__m128i)(A), (int)(LA), \
		2192	(__v16qi)(__m128i)(B), (int)(LB), (int)(M)))
		2193
		2194	/// Uses the immediate operand \a M to perform a comparison of string
		2195	/// data with explicitly defined lengths that is contained in source operands
		2196	/// \a A and \a B. Returns 1 if the length of the string in \a A is less than
		2197	/// the maximum, otherwise, returns 0.
		2198	///
		2199	/// \headerfile <x86intrin.h>
		2200	///
		2201	/// \code
		2202	/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M);
		2203	/// \endcode
		2204	///
		2205	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c>
		2206	/// instruction.
		2207	///
		2208	/// \param A
		2209	/// A 128-bit integer vector containing one of the source operands to be
		2210	/// compared.
		2211	/// \param LA
		2212	/// An integer that specifies the length of the string in \a A.
		2213	/// \param B
		2214	/// A 128-bit integer vector containing one of the source operands to be
		2215	/// compared.
		2216	/// \param LB
		2217	/// An integer that specifies the length of the string in \a B.
		2218	/// \param M
		2219	/// An 8-bit immediate operand specifying whether the characters are bytes or
		2220	/// words and the type of comparison to perform. \n
		2221	/// Bits [1:0]: Determine source data format. \n
		2222	/// 00: 16 unsigned bytes \n
		2223	/// 01: 8 unsigned words \n
		2224	/// 10: 16 signed bytes \n
		2225	/// 11: 8 signed words \n
		2226	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		2227	/// 00: Subset: Each character in \a B is compared for equality with all
		2228	/// the characters in \a A. \n
		2229	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		2230	/// basis is greater than or equal for even-indexed elements in \a A,
		2231	/// and less than or equal for odd-indexed elements in \a A. \n
		2232	/// 10: Match: Compare each pair of corresponding characters in \a A and
		2233	/// \a B for equality. \n
		2234	/// 11: Substring: Search \a B for substring matches of \a A. \n
		2235	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		2236	/// mask of the comparison results. \n
		2237	/// 00: No effect. \n
		2238	/// 01: Negate the bit mask. \n
		2239	/// 10: No effect. \n
		2240	/// 11: Negate the bit mask only for bits with an index less than or equal
		2241	/// to the size of \a A or \a B.
		2242	/// \returns Returns 1 if the length of the string in \a A is less than the
		2243	/// maximum, otherwise, returns 0.
		2244	#define _mm_cmpestrs(A, LA, B, LB, M) \
		2245	((int)__builtin_ia32_pcmpestris128( \
		2246	(__v16qi)(__m128i)(A), (int)(LA), \
		2247	(__v16qi)(__m128i)(B), (int)(LB), (int)(M)))
		2248
		2249	/// Uses the immediate operand \a M to perform a comparison of string
		2250	/// data with explicitly defined lengths that is contained in source operands
		2251	/// \a A and \a B. Returns 1 if the length of the string in \a B is less than
		2252	/// the maximum, otherwise, returns 0.
		2253	///
		2254	/// \headerfile <x86intrin.h>
		2255	///
		2256	/// \code
		2257	/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M);
		2258	/// \endcode
		2259	///
		2260	/// This intrinsic corresponds to the <c> VPCMPESTRI / PCMPESTRI </c> instruction.
		2261	///
		2262	/// \param A
		2263	/// A 128-bit integer vector containing one of the source operands to be
		2264	/// compared.
		2265	/// \param LA
		2266	/// An integer that specifies the length of the string in \a A.
		2267	/// \param B
		2268	/// A 128-bit integer vector containing one of the source operands to be
		2269	/// compared.
		2270	/// \param LB
		2271	/// An integer that specifies the length of the string in \a B.
		2272	/// \param M
		2273	/// An 8-bit immediate operand specifying whether the characters are bytes or
		2274	/// words and the type of comparison to perform. \n
		2275	/// Bits [1:0]: Determine source data format. \n
		2276	/// 00: 16 unsigned bytes \n
		2277	/// 01: 8 unsigned words \n
		2278	/// 10: 16 signed bytes \n
		2279	/// 11: 8 signed words \n
		2280	/// Bits [3:2]: Determine comparison type and aggregation method. \n
		2281	/// 00: Subset: Each character in \a B is compared for equality with all
		2282	/// the characters in \a A. \n
		2283	/// 01: Ranges: Each character in \a B is compared to \a A. The comparison
		2284	/// basis is greater than or equal for even-indexed elements in \a A,
		2285	/// and less than or equal for odd-indexed elements in \a A. \n
		2286	/// 10: Match: Compare each pair of corresponding characters in \a A and
		2287	/// \a B for equality. \n
		2288	/// 11: Substring: Search \a B for substring matches of \a A. \n
		2289	/// Bits [5:4]: Determine whether to perform a one's complement on the bit
		2290	/// mask of the comparison results. \n
		2291	/// 00: No effect. \n
		2292	/// 01: Negate the bit mask. \n
		2293	/// 10: No effect. \n
		2294	/// 11: Negate the bit mask only for bits with an index less than or equal
		2295	/// to the size of \a A or \a B.
		2296	/// \returns Returns 1 if the length of the string in \a B is less than the
		2297	/// maximum, otherwise, returns 0.
		2298	#define _mm_cmpestrz(A, LA, B, LB, M) \
		2299	((int)__builtin_ia32_pcmpestriz128( \
		2300	(__v16qi)(__m128i)(A), (int)(LA), \
		2301	(__v16qi)(__m128i)(B), (int)(LB), (int)(M)))
		2302
		2303	/* SSE4.2 Compare Packed Data -- Greater Than. */
		2304	/// Compares corresponding 64-bit elements of two 128-bit integer vectors;
		2305	/// each result element is all ones if the element of the first operand is
		2306	/// greater than that of the second operand, and zero otherwise.
		2307	///
		2308	/// \headerfile <x86intrin.h>
		2309	///
		2310	/// This intrinsic corresponds to the <c> VPCMPGTQ / PCMPGTQ </c> instruction.
		2311	///
		2312	/// \param __V1
		2313	/// A 128-bit integer vector supplying the left-hand 64-bit elements.
		2314	/// \param __V2
		2315	/// A 128-bit integer vector supplying the right-hand 64-bit elements.
		2316	/// \returns A 128-bit integer vector holding the per-element results.
		2317	static __inline__ __m128i __DEFAULT_FN_ATTRS
		2318	_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) {
		2319	return (__m128i)((__v2di)__V1 > (__v2di)__V2);
		2320	}
		2321
		2322	#undef __DEFAULT_FN_ATTRS
		2323
		2324	#include <popcntintrin.h>
		2325
		2326	#include <crc32intrin.h>
		2327
		2328	#endif /* __SMMINTRIN_H */

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite//llvm-build/x86_64/lib/clang/16/include/smmintrin.h – Rev 14