WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – //llvm-build/x86_64/lib/clang/16/include/avxintrin.h

Rev	Author	Line No.	Line
14	pmbaty	1	/*===---- avxintrin.h - AVX intrinsics -------------------------------------===
		2	*
		3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		4	* See https://llvm.org/LICENSE.txt for license information.
		5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		6	*
		7	*===-----------------------------------------------------------------------===
		8	*/
		9
		10	#ifndef __IMMINTRIN_H
		11	#error "Never use <avxintrin.h> directly; include <immintrin.h> instead."
		12	#endif
		13
		14	#ifndef __AVXINTRIN_H
		15	#define __AVXINTRIN_H
		16
		17	typedef double __v4df __attribute__ ((__vector_size__ (32)));
		18	typedef float __v8sf __attribute__ ((__vector_size__ (32)));
		19	typedef long long __v4di __attribute__ ((__vector_size__ (32)));
		20	typedef int __v8si __attribute__ ((__vector_size__ (32)));
		21	typedef short __v16hi __attribute__ ((__vector_size__ (32)));
		22	typedef char __v32qi __attribute__ ((__vector_size__ (32)));
		23
		24	/* Unsigned types */
		25	typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32)));
		26	typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
		27	typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32)));
		28	typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32)));
		29
		30	/* We need an explicitly signed variant for char. Note that this shouldn't
		31	* appear in the interface though. */
		32	typedef signed char __v32qs __attribute__((__vector_size__(32)));
		33
		34	typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32)));
		35	typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32)));
		36	typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32)));
		37
		38	typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
		39	typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
		40	typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
		41
		42	#ifdef __SSE2__
		43	/* Both _Float16 and __bf16 require SSE2 being enabled. */
		44	typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
		45	typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
		46	typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
		47
		48	typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
		49	typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
		50	#endif
		51
		52	/* Define the default attributes for the functions in this file. */
		53	#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
		54	#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
		55
		56	/* Arithmetic */
		57	/// Adds two 256-bit vectors of [4 x double].
		58	///
		59	/// \headerfile <x86intrin.h>
		60	///
		61	/// This intrinsic corresponds to the <c> VADDPD </c> instruction.
		62	///
		63	/// \param __a
		64	/// A 256-bit vector of [4 x double] containing one of the source operands.
		65	/// \param __b
		66	/// A 256-bit vector of [4 x double] containing one of the source operands.
		67	/// \returns A 256-bit vector of [4 x double] containing the sums of both
		68	/// operands.
		69	static __inline __m256d __DEFAULT_FN_ATTRS
		70	_mm256_add_pd(__m256d __a, __m256d __b)
		71	{
		72	return (__m256d)((__v4df)__a+(__v4df)__b);
		73	}
		74
		75	/// Adds two 256-bit vectors of [8 x float].
		76	///
		77	/// \headerfile <x86intrin.h>
		78	///
		79	/// This intrinsic corresponds to the <c> VADDPS </c> instruction.
		80	///
		81	/// \param __a
		82	/// A 256-bit vector of [8 x float] containing one of the source operands.
		83	/// \param __b
		84	/// A 256-bit vector of [8 x float] containing one of the source operands.
		85	/// \returns A 256-bit vector of [8 x float] containing the sums of both
		86	/// operands.
		87	static __inline __m256 __DEFAULT_FN_ATTRS
		88	_mm256_add_ps(__m256 __a, __m256 __b)
		89	{
		90	return (__m256)((__v8sf)__a+(__v8sf)__b);
		91	}
		92
		93	/// Subtracts two 256-bit vectors of [4 x double].
		94	///
		95	/// \headerfile <x86intrin.h>
		96	///
		97	/// This intrinsic corresponds to the <c> VSUBPD </c> instruction.
		98	///
		99	/// \param __a
		100	/// A 256-bit vector of [4 x double] containing the minuend.
		101	/// \param __b
		102	/// A 256-bit vector of [4 x double] containing the subtrahend.
		103	/// \returns A 256-bit vector of [4 x double] containing the differences between
		104	/// both operands.
		105	static __inline __m256d __DEFAULT_FN_ATTRS
		106	_mm256_sub_pd(__m256d __a, __m256d __b)
		107	{
		108	return (__m256d)((__v4df)__a-(__v4df)__b);
		109	}
		110
		111	/// Subtracts two 256-bit vectors of [8 x float].
		112	///
		113	/// \headerfile <x86intrin.h>
		114	///
		115	/// This intrinsic corresponds to the <c> VSUBPS </c> instruction.
		116	///
		117	/// \param __a
		118	/// A 256-bit vector of [8 x float] containing the minuend.
		119	/// \param __b
		120	/// A 256-bit vector of [8 x float] containing the subtrahend.
		121	/// \returns A 256-bit vector of [8 x float] containing the differences between
		122	/// both operands.
		123	static __inline __m256 __DEFAULT_FN_ATTRS
		124	_mm256_sub_ps(__m256 __a, __m256 __b)
		125	{
		126	return (__m256)((__v8sf)__a-(__v8sf)__b);
		127	}
		128
		129	/// Subtracts the even-indexed values and adds the odd-indexed values of
		130	/// two 256-bit vectors of [4 x double].
		131	///
		132	/// \headerfile <x86intrin.h>
		133	///
		134	/// This intrinsic corresponds to the <c> VADDSUBPD </c> instruction.
		135	///
		136	/// \param __a
		137	/// A 256-bit vector of [4 x double] containing the left source operand.
		138	/// \param __b
		139	/// A 256-bit vector of [4 x double] containing the right source operand.
		140	/// \returns A 256-bit vector of [4 x double] containing the alternating sums
		141	/// and differences between both operands.
		142	static __inline __m256d __DEFAULT_FN_ATTRS
		143	_mm256_addsub_pd(__m256d __a, __m256d __b)
		144	{
		145	return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b);
		146	}
		147
		148	/// Subtracts the even-indexed values and adds the odd-indexed values of
		149	/// two 256-bit vectors of [8 x float].
		150	///
		151	/// \headerfile <x86intrin.h>
		152	///
		153	/// This intrinsic corresponds to the <c> VADDSUBPS </c> instruction.
		154	///
		155	/// \param __a
		156	/// A 256-bit vector of [8 x float] containing the left source operand.
		157	/// \param __b
		158	/// A 256-bit vector of [8 x float] containing the right source operand.
		159	/// \returns A 256-bit vector of [8 x float] containing the alternating sums and
		160	/// differences between both operands.
		161	static __inline __m256 __DEFAULT_FN_ATTRS
		162	_mm256_addsub_ps(__m256 __a, __m256 __b)
		163	{
		164	return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b);
		165	}
		166
		167	/// Divides two 256-bit vectors of [4 x double].
		168	///
		169	/// \headerfile <x86intrin.h>
		170	///
		171	/// This intrinsic corresponds to the <c> VDIVPD </c> instruction.
		172	///
		173	/// \param __a
		174	/// A 256-bit vector of [4 x double] containing the dividend.
		175	/// \param __b
		176	/// A 256-bit vector of [4 x double] containing the divisor.
		177	/// \returns A 256-bit vector of [4 x double] containing the quotients of both
		178	/// operands.
		179	static __inline __m256d __DEFAULT_FN_ATTRS
		180	_mm256_div_pd(__m256d __a, __m256d __b)
		181	{
		182	return (__m256d)((__v4df)__a/(__v4df)__b);
		183	}
		184
		185	/// Divides two 256-bit vectors of [8 x float].
		186	///
		187	/// \headerfile <x86intrin.h>
		188	///
		189	/// This intrinsic corresponds to the <c> VDIVPS </c> instruction.
		190	///
		191	/// \param __a
		192	/// A 256-bit vector of [8 x float] containing the dividend.
		193	/// \param __b
		194	/// A 256-bit vector of [8 x float] containing the divisor.
		195	/// \returns A 256-bit vector of [8 x float] containing the quotients of both
		196	/// operands.
		197	static __inline __m256 __DEFAULT_FN_ATTRS
		198	_mm256_div_ps(__m256 __a, __m256 __b)
		199	{
		200	return (__m256)((__v8sf)__a/(__v8sf)__b);
		201	}
		202
		203	/// Compares two 256-bit vectors of [4 x double] and returns the greater
		204	/// of each pair of values.
		205	///
		206	/// \headerfile <x86intrin.h>
		207	///
		208	/// This intrinsic corresponds to the <c> VMAXPD </c> instruction.
		209	///
		210	/// \param __a
		211	/// A 256-bit vector of [4 x double] containing one of the operands.
		212	/// \param __b
		213	/// A 256-bit vector of [4 x double] containing one of the operands.
		214	/// \returns A 256-bit vector of [4 x double] containing the maximum values
		215	/// between both operands.
		216	static __inline __m256d __DEFAULT_FN_ATTRS
		217	_mm256_max_pd(__m256d __a, __m256d __b)
		218	{
		219	return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b);
		220	}
		221
		222	/// Compares two 256-bit vectors of [8 x float] and returns the greater
		223	/// of each pair of values.
		224	///
		225	/// \headerfile <x86intrin.h>
		226	///
		227	/// This intrinsic corresponds to the <c> VMAXPS </c> instruction.
		228	///
		229	/// \param __a
		230	/// A 256-bit vector of [8 x float] containing one of the operands.
		231	/// \param __b
		232	/// A 256-bit vector of [8 x float] containing one of the operands.
		233	/// \returns A 256-bit vector of [8 x float] containing the maximum values
		234	/// between both operands.
		235	static __inline __m256 __DEFAULT_FN_ATTRS
		236	_mm256_max_ps(__m256 __a, __m256 __b)
		237	{
		238	return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b);
		239	}
		240
		241	/// Compares two 256-bit vectors of [4 x double] and returns the lesser
		242	/// of each pair of values.
		243	///
		244	/// \headerfile <x86intrin.h>
		245	///
		246	/// This intrinsic corresponds to the <c> VMINPD </c> instruction.
		247	///
		248	/// \param __a
		249	/// A 256-bit vector of [4 x double] containing one of the operands.
		250	/// \param __b
		251	/// A 256-bit vector of [4 x double] containing one of the operands.
		252	/// \returns A 256-bit vector of [4 x double] containing the minimum values
		253	/// between both operands.
		254	static __inline __m256d __DEFAULT_FN_ATTRS
		255	_mm256_min_pd(__m256d __a, __m256d __b)
		256	{
		257	return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b);
		258	}
		259
		260	/// Compares two 256-bit vectors of [8 x float] and returns the lesser
		261	/// of each pair of values.
		262	///
		263	/// \headerfile <x86intrin.h>
		264	///
		265	/// This intrinsic corresponds to the <c> VMINPS </c> instruction.
		266	///
		267	/// \param __a
		268	/// A 256-bit vector of [8 x float] containing one of the operands.
		269	/// \param __b
		270	/// A 256-bit vector of [8 x float] containing one of the operands.
		271	/// \returns A 256-bit vector of [8 x float] containing the minimum values
		272	/// between both operands.
		273	static __inline __m256 __DEFAULT_FN_ATTRS
		274	_mm256_min_ps(__m256 __a, __m256 __b)
		275	{
		276	return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b);
		277	}
		278
		279	/// Multiplies two 256-bit vectors of [4 x double].
		280	///
		281	/// \headerfile <x86intrin.h>
		282	///
		283	/// This intrinsic corresponds to the <c> VMULPD </c> instruction.
		284	///
		285	/// \param __a
		286	/// A 256-bit vector of [4 x double] containing one of the operands.
		287	/// \param __b
		288	/// A 256-bit vector of [4 x double] containing one of the operands.
		289	/// \returns A 256-bit vector of [4 x double] containing the products of both
		290	/// operands.
		291	static __inline __m256d __DEFAULT_FN_ATTRS
		292	_mm256_mul_pd(__m256d __a, __m256d __b)
		293	{
		294	return (__m256d)((__v4df)__a * (__v4df)__b);
		295	}
		296
		297	/// Multiplies two 256-bit vectors of [8 x float].
		298	///
		299	/// \headerfile <x86intrin.h>
		300	///
		301	/// This intrinsic corresponds to the <c> VMULPS </c> instruction.
		302	///
		303	/// \param __a
		304	/// A 256-bit vector of [8 x float] containing one of the operands.
		305	/// \param __b
		306	/// A 256-bit vector of [8 x float] containing one of the operands.
		307	/// \returns A 256-bit vector of [8 x float] containing the products of both
		308	/// operands.
		309	static __inline __m256 __DEFAULT_FN_ATTRS
		310	_mm256_mul_ps(__m256 __a, __m256 __b)
		311	{
		312	return (__m256)((__v8sf)__a * (__v8sf)__b);
		313	}
		314
		315	/// Calculates the square roots of the values in a 256-bit vector of
		316	/// [4 x double].
		317	///
		318	/// \headerfile <x86intrin.h>
		319	///
		320	/// This intrinsic corresponds to the <c> VSQRTPD </c> instruction.
		321	///
		322	/// \param __a
		323	/// A 256-bit vector of [4 x double].
		324	/// \returns A 256-bit vector of [4 x double] containing the square roots of the
		325	/// values in the operand.
		326	static __inline __m256d __DEFAULT_FN_ATTRS
		327	_mm256_sqrt_pd(__m256d __a)
		328	{
		329	return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a);
		330	}
		331
		332	/// Calculates the square roots of the values in a 256-bit vector of
		333	/// [8 x float].
		334	///
		335	/// \headerfile <x86intrin.h>
		336	///
		337	/// This intrinsic corresponds to the <c> VSQRTPS </c> instruction.
		338	///
		339	/// \param __a
		340	/// A 256-bit vector of [8 x float].
		341	/// \returns A 256-bit vector of [8 x float] containing the square roots of the
		342	/// values in the operand.
		343	static __inline __m256 __DEFAULT_FN_ATTRS
		344	_mm256_sqrt_ps(__m256 __a)
		345	{
		346	return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a);
		347	}
		348
		349	/// Calculates the reciprocal square roots of the values in a 256-bit
		350	/// vector of [8 x float].
		351	///
		352	/// \headerfile <x86intrin.h>
		353	///
		354	/// This intrinsic corresponds to the <c> VRSQRTPS </c> instruction.
		355	///
		356	/// \param __a
		357	/// A 256-bit vector of [8 x float].
		358	/// \returns A 256-bit vector of [8 x float] containing the reciprocal square
		359	/// roots of the values in the operand.
		360	static __inline __m256 __DEFAULT_FN_ATTRS
		361	_mm256_rsqrt_ps(__m256 __a)
		362	{
		363	return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a);
		364	}
		365
		366	/// Calculates the reciprocals of the values in a 256-bit vector of
		367	/// [8 x float].
		368	///
		369	/// \headerfile <x86intrin.h>
		370	///
		371	/// This intrinsic corresponds to the <c> VRCPPS </c> instruction.
		372	///
		373	/// \param __a
		374	/// A 256-bit vector of [8 x float].
		375	/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the
		376	/// values in the operand.
		377	static __inline __m256 __DEFAULT_FN_ATTRS
		378	_mm256_rcp_ps(__m256 __a)
		379	{
		380	return (__m256)__builtin_ia32_rcpps256((__v8sf)__a);
		381	}
		382
		383	/// Rounds the values in a 256-bit vector of [4 x double] as specified
		384	/// by the byte operand. The source values are rounded to integer values and
		385	/// returned as 64-bit double-precision floating-point values.
		386	///
		387	/// \headerfile <x86intrin.h>
		388	///
		389	/// \code
		390	/// __m256d _mm256_round_pd(__m256d V, const int M);
		391	/// \endcode
		392	///
		393	/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
		394	///
		395	/// \param V
		396	/// A 256-bit vector of [4 x double].
		397	/// \param M
		398	/// An integer value that specifies the rounding operation. \n
		399	/// Bits [7:4] are reserved. \n
		400	/// Bit [3] is a precision exception value: \n
		401	/// 0: A normal PE exception is used. \n
		402	/// 1: The PE field is not updated. \n
		403	/// Bit [2] is the rounding control source: \n
		404	/// 0: Use bits [1:0] of \a M. \n
		405	/// 1: Use the current MXCSR setting. \n
		406	/// Bits [1:0] contain the rounding control definition: \n
		407	/// 00: Nearest. \n
		408	/// 01: Downward (toward negative infinity). \n
		409	/// 10: Upward (toward positive infinity). \n
		410	/// 11: Truncated.
		411	/// \returns A 256-bit vector of [4 x double] containing the rounded values.
		412	#define _mm256_round_pd(V, M) \
		413	((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M)))
		414
		415	/// Rounds the values stored in a 256-bit vector of [8 x float] as
		416	/// specified by the byte operand. The source values are rounded to integer
		417	/// values and returned as floating-point values.
		418	///
		419	/// \headerfile <x86intrin.h>
		420	///
		421	/// \code
		422	/// __m256 _mm256_round_ps(__m256 V, const int M);
		423	/// \endcode
		424	///
		425	/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
		426	///
		427	/// \param V
		428	/// A 256-bit vector of [8 x float].
		429	/// \param M
		430	/// An integer value that specifies the rounding operation. \n
		431	/// Bits [7:4] are reserved. \n
		432	/// Bit [3] is a precision exception value: \n
		433	/// 0: A normal PE exception is used. \n
		434	/// 1: The PE field is not updated. \n
		435	/// Bit [2] is the rounding control source: \n
		436	/// 0: Use bits [1:0] of \a M. \n
		437	/// 1: Use the current MXCSR setting. \n
		438	/// Bits [1:0] contain the rounding control definition: \n
		439	/// 00: Nearest. \n
		440	/// 01: Downward (toward negative infinity). \n
		441	/// 10: Upward (toward positive infinity). \n
		442	/// 11: Truncated.
		443	/// \returns A 256-bit vector of [8 x float] containing the rounded values.
		444	#define _mm256_round_ps(V, M) \
		445	((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M)))
		446
		447	/// Rounds up the values stored in a 256-bit vector of [4 x double]. The
		448	/// source values are rounded up to integer values and returned as 64-bit
		449	/// double-precision floating-point values.
		450	///
		451	/// \headerfile <x86intrin.h>
		452	///
		453	/// \code
		454	/// __m256d _mm256_ceil_pd(__m256d V);
		455	/// \endcode
		456	///
		457	/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
		458	///
		459	/// \param V
		460	/// A 256-bit vector of [4 x double].
		461	/// \returns A 256-bit vector of [4 x double] containing the rounded up values.
		462	#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL)
		463
		464	/// Rounds down the values stored in a 256-bit vector of [4 x double].
		465	/// The source values are rounded down to integer values and returned as
		466	/// 64-bit double-precision floating-point values.
		467	///
		468	/// \headerfile <x86intrin.h>
		469	///
		470	/// \code
		471	/// __m256d _mm256_floor_pd(__m256d V);
		472	/// \endcode
		473	///
		474	/// This intrinsic corresponds to the <c> VROUNDPD </c> instruction.
		475	///
		476	/// \param V
		477	/// A 256-bit vector of [4 x double].
		478	/// \returns A 256-bit vector of [4 x double] containing the rounded down
		479	/// values.
		480	#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR)
		481
		482	/// Rounds up the values stored in a 256-bit vector of [8 x float]. The
		483	/// source values are rounded up to integer values and returned as
		484	/// floating-point values.
		485	///
		486	/// \headerfile <x86intrin.h>
		487	///
		488	/// \code
		489	/// __m256 _mm256_ceil_ps(__m256 V);
		490	/// \endcode
		491	///
		492	/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
		493	///
		494	/// \param V
		495	/// A 256-bit vector of [8 x float].
		496	/// \returns A 256-bit vector of [8 x float] containing the rounded up values.
		497	#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL)
		498
		499	/// Rounds down the values stored in a 256-bit vector of [8 x float]. The
		500	/// source values are rounded down to integer values and returned as
		501	/// floating-point values.
		502	///
		503	/// \headerfile <x86intrin.h>
		504	///
		505	/// \code
		506	/// __m256 _mm256_floor_ps(__m256 V);
		507	/// \endcode
		508	///
		509	/// This intrinsic corresponds to the <c> VROUNDPS </c> instruction.
		510	///
		511	/// \param V
		512	/// A 256-bit vector of [8 x float].
		513	/// \returns A 256-bit vector of [8 x float] containing the rounded down values.
		514	#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR)
		515
		516	/* Logical */
		517	/// Performs a bitwise AND of two 256-bit vectors of [4 x double].
		518	///
		519	/// \headerfile <x86intrin.h>
		520	///
		521	/// This intrinsic corresponds to the <c> VANDPD </c> instruction.
		522	///
		523	/// \param __a
		524	/// A 256-bit vector of [4 x double] containing one of the source operands.
		525	/// \param __b
		526	/// A 256-bit vector of [4 x double] containing one of the source operands.
		527	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
		528	/// values between both operands.
		529	static __inline __m256d __DEFAULT_FN_ATTRS
		530	_mm256_and_pd(__m256d __a, __m256d __b)
		531	{
		532	return (__m256d)((__v4du)__a & (__v4du)__b);
		533	}
		534
		535	/// Performs a bitwise AND of two 256-bit vectors of [8 x float].
		536	///
		537	/// \headerfile <x86intrin.h>
		538	///
		539	/// This intrinsic corresponds to the <c> VANDPS </c> instruction.
		540	///
		541	/// \param __a
		542	/// A 256-bit vector of [8 x float] containing one of the source operands.
		543	/// \param __b
		544	/// A 256-bit vector of [8 x float] containing one of the source operands.
		545	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
		546	/// values between both operands.
		547	static __inline __m256 __DEFAULT_FN_ATTRS
		548	_mm256_and_ps(__m256 __a, __m256 __b)
		549	{
		550	return (__m256)((__v8su)__a & (__v8su)__b);
		551	}
		552
		553	/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using
		554	/// the one's complement of the values contained in the first source operand.
		555	///
		556	/// \headerfile <x86intrin.h>
		557	///
		558	/// This intrinsic corresponds to the <c> VANDNPD </c> instruction.
		559	///
		560	/// \param __a
		561	/// A 256-bit vector of [4 x double] containing the left source operand. The
		562	/// one's complement of this value is used in the bitwise AND.
		563	/// \param __b
		564	/// A 256-bit vector of [4 x double] containing the right source operand.
		565	/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the
		566	/// values of the second operand and the one's complement of the first
		567	/// operand.
		568	static __inline __m256d __DEFAULT_FN_ATTRS
		569	_mm256_andnot_pd(__m256d __a, __m256d __b)
		570	{
		571	return (__m256d)(~(__v4du)__a & (__v4du)__b);
		572	}
		573
		574	/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using
		575	/// the one's complement of the values contained in the first source operand.
		576	///
		577	/// \headerfile <x86intrin.h>
		578	///
		579	/// This intrinsic corresponds to the <c> VANDNPS </c> instruction.
		580	///
		581	/// \param __a
		582	/// A 256-bit vector of [8 x float] containing the left source operand. The
		583	/// one's complement of this value is used in the bitwise AND.
		584	/// \param __b
		585	/// A 256-bit vector of [8 x float] containing the right source operand.
		586	/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the
		587	/// values of the second operand and the one's complement of the first
		588	/// operand.
		589	static __inline __m256 __DEFAULT_FN_ATTRS
		590	_mm256_andnot_ps(__m256 __a, __m256 __b)
		591	{
		592	return (__m256)(~(__v8su)__a & (__v8su)__b);
		593	}
		594
		595	/// Performs a bitwise OR of two 256-bit vectors of [4 x double].
		596	///
		597	/// \headerfile <x86intrin.h>
		598	///
		599	/// This intrinsic corresponds to the <c> VORPD </c> instruction.
		600	///
		601	/// \param __a
		602	/// A 256-bit vector of [4 x double] containing one of the source operands.
		603	/// \param __b
		604	/// A 256-bit vector of [4 x double] containing one of the source operands.
		605	/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the
		606	/// values between both operands.
		607	static __inline __m256d __DEFAULT_FN_ATTRS
		608	_mm256_or_pd(__m256d __a, __m256d __b)
		609	{
		610	return (__m256d)((__v4du)__a | (__v4du)__b);
		611	}
		612
		613	/// Performs a bitwise OR of two 256-bit vectors of [8 x float].
		614	///
		615	/// \headerfile <x86intrin.h>
		616	///
		617	/// This intrinsic corresponds to the <c> VORPS </c> instruction.
		618	///
		619	/// \param __a
		620	/// A 256-bit vector of [8 x float] containing one of the source operands.
		621	/// \param __b
		622	/// A 256-bit vector of [8 x float] containing one of the source operands.
		623	/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the
		624	/// values between both operands.
		625	static __inline __m256 __DEFAULT_FN_ATTRS
		626	_mm256_or_ps(__m256 __a, __m256 __b)
		627	{
		628	return (__m256)((__v8su)__a | (__v8su)__b);
		629	}
		630
		631	/// Performs a bitwise XOR of two 256-bit vectors of [4 x double].
		632	///
		633	/// \headerfile <x86intrin.h>
		634	///
		635	/// This intrinsic corresponds to the <c> VXORPD </c> instruction.
		636	///
		637	/// \param __a
		638	/// A 256-bit vector of [4 x double] containing one of the source operands.
		639	/// \param __b
		640	/// A 256-bit vector of [4 x double] containing one of the source operands.
		641	/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the
		642	/// values between both operands.
		643	static __inline __m256d __DEFAULT_FN_ATTRS
		644	_mm256_xor_pd(__m256d __a, __m256d __b)
		645	{
		646	return (__m256d)((__v4du)__a ^ (__v4du)__b);
		647	}
		648
		649	/// Performs a bitwise XOR of two 256-bit vectors of [8 x float].
		650	///
		651	/// \headerfile <x86intrin.h>
		652	///
		653	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
		654	///
		655	/// \param __a
		656	/// A 256-bit vector of [8 x float] containing one of the source operands.
		657	/// \param __b
		658	/// A 256-bit vector of [8 x float] containing one of the source operands.
		659	/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the
		660	/// values between both operands.
		661	static __inline __m256 __DEFAULT_FN_ATTRS
		662	_mm256_xor_ps(__m256 __a, __m256 __b)
		663	{
		664	return (__m256)((__v8su)__a ^ (__v8su)__b);
		665	}
		666
		667	/* Horizontal arithmetic */
		668	/// Horizontally adds the adjacent pairs of values contained in two
		669	/// 256-bit vectors of [4 x double].
		670	///
		671	/// \headerfile <x86intrin.h>
		672	///
		673	/// This intrinsic corresponds to the <c> VHADDPD </c> instruction.
		674	///
		675	/// \param __a
		676	/// A 256-bit vector of [4 x double] containing one of the source operands.
		677	/// The horizontal sums of the values are returned in the even-indexed
		678	/// elements of a vector of [4 x double].
		679	/// \param __b
		680	/// A 256-bit vector of [4 x double] containing one of the source operands.
		681	/// The horizontal sums of the values are returned in the odd-indexed
		682	/// elements of a vector of [4 x double].
		683	/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of
		684	/// both operands.
		685	static __inline __m256d __DEFAULT_FN_ATTRS
		686	_mm256_hadd_pd(__m256d __a, __m256d __b)
		687	{
		688	return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b);
		689	}
		690
		691	/// Horizontally adds the adjacent pairs of values contained in two
		692	/// 256-bit vectors of [8 x float].
		693	///
		694	/// \headerfile <x86intrin.h>
		695	///
		696	/// This intrinsic corresponds to the <c> VHADDPS </c> instruction.
		697	///
		698	/// \param __a
		699	/// A 256-bit vector of [8 x float] containing one of the source operands.
		700	/// The horizontal sums of the values are returned in the elements with
		701	/// index 0, 1, 4, 5 of a vector of [8 x float].
		702	/// \param __b
		703	/// A 256-bit vector of [8 x float] containing one of the source operands.
		704	/// The horizontal sums of the values are returned in the elements with
		705	/// index 2, 3, 6, 7 of a vector of [8 x float].
		706	/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of
		707	/// both operands.
		708	static __inline __m256 __DEFAULT_FN_ATTRS
		709	_mm256_hadd_ps(__m256 __a, __m256 __b)
		710	{
		711	return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b);
		712	}
		713
		714	/// Horizontally subtracts the adjacent pairs of values contained in two
		715	/// 256-bit vectors of [4 x double].
		716	///
		717	/// \headerfile <x86intrin.h>
		718	///
		719	/// This intrinsic corresponds to the <c> VHSUBPD </c> instruction.
		720	///
		721	/// \param __a
		722	/// A 256-bit vector of [4 x double] containing one of the source operands.
		723	/// The horizontal differences between the values are returned in the
		724	/// even-indexed elements of a vector of [4 x double].
		725	/// \param __b
		726	/// A 256-bit vector of [4 x double] containing one of the source operands.
		727	/// The horizontal differences between the values are returned in the
		728	/// odd-indexed elements of a vector of [4 x double].
		729	/// \returns A 256-bit vector of [4 x double] containing the horizontal
		730	/// differences of both operands.
		731	static __inline __m256d __DEFAULT_FN_ATTRS
		732	_mm256_hsub_pd(__m256d __a, __m256d __b)
		733	{
		734	return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b);
		735	}
		736
		737	/// Horizontally subtracts the adjacent pairs of values contained in two
		738	/// 256-bit vectors of [8 x float].
		739	///
		740	/// \headerfile <x86intrin.h>
		741	///
		742	/// This intrinsic corresponds to the <c> VHSUBPS </c> instruction.
		743	///
		744	/// \param __a
		745	/// A 256-bit vector of [8 x float] containing one of the source operands.
		746	/// The horizontal differences between the values are returned in the
		747	/// elements with index 0, 1, 4, 5 of a vector of [8 x float].
		748	/// \param __b
		749	/// A 256-bit vector of [8 x float] containing one of the source operands.
		750	/// The horizontal differences between the values are returned in the
		751	/// elements with index 2, 3, 6, 7 of a vector of [8 x float].
		752	/// \returns A 256-bit vector of [8 x float] containing the horizontal
		753	/// differences of both operands.
		754	static __inline __m256 __DEFAULT_FN_ATTRS
		755	_mm256_hsub_ps(__m256 __a, __m256 __b)
		756	{
		757	return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b);
		758	}
		759
		760	/* Vector permutations */
		761	/// Copies the values in a 128-bit vector of [2 x double] as specified
		762	/// by the 128-bit integer vector operand.
		763	///
		764	/// \headerfile <x86intrin.h>
		765	///
		766	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
		767	///
		768	/// \param __a
		769	/// A 128-bit vector of [2 x double].
		770	/// \param __c
		771	/// A 128-bit integer vector operand specifying how the values are to be
		772	/// copied. \n
		773	/// Bit [1]: \n
		774	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
		775	/// vector. \n
		776	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
		777	/// returned vector. \n
		778	/// Bit [65]: \n
		779	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
		780	/// returned vector. \n
		781	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
		782	/// returned vector.
		783	/// \returns A 128-bit vector of [2 x double] containing the copied values.
		784	static __inline __m128d __DEFAULT_FN_ATTRS128
		785	_mm_permutevar_pd(__m128d __a, __m128i __c)
		786	{
			/* __a is passed to the builtin as __v2df and the control vector __c as
			 * __v2di; the result is cast back to __m128d. */
		787	return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c);
		788	}
		789
		790	/// Copies the values in a 256-bit vector of [4 x double] as specified
		791	/// by the 256-bit integer vector operand.
		792	///
		793	/// \headerfile <x86intrin.h>
		794	///
		795	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
		796	///
		797	/// \param __a
		798	/// A 256-bit vector of [4 x double].
		799	/// \param __c
		800	/// A 256-bit integer vector operand specifying how the values are to be
		801	/// copied. \n
		802	/// Bit [1]: \n
		803	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
		804	/// vector. \n
		805	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
		806	/// returned vector. \n
		807	/// Bit [65]: \n
		808	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
		809	/// returned vector. \n
		810	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
		811	/// returned vector. \n
		812	/// Bit [129]: \n
		813	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
		814	/// returned vector. \n
		815	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
		816	/// returned vector. \n
		817	/// Bit [193]: \n
		818	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
		819	/// returned vector. \n
		820	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
		821	/// returned vector.
		822	/// \returns A 256-bit vector of [4 x double] containing the copied values.
		823	static __inline __m256d __DEFAULT_FN_ATTRS
		824	_mm256_permutevar_pd(__m256d __a, __m256i __c)
		825	{
			/* __a is passed to the builtin as __v4df and the control vector __c as
			 * __v4di (the long long vector typedef declared at the top of this
			 * header); the result is cast back to __m256d. */
		826	return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c);
		827	}
		828
		829	/// Copies the values stored in a 128-bit vector of [4 x float] as
		830	/// specified by the 128-bit integer vector operand.
			///
		831	/// \headerfile <x86intrin.h>
		832	///
		833	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
		834	///
		835	/// \param __a
		836	/// A 128-bit vector of [4 x float].
		837	/// \param __c
		838	/// A 128-bit integer vector operand specifying how the values are to be
		839	/// copied. \n
		840	/// Bits [1:0]: \n
		841	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
		842	/// returned vector. \n
		843	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
		844	/// returned vector. \n
		845	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
		846	/// returned vector. \n
		847	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
		848	/// returned vector. \n
		849	/// Bits [33:32]: \n
		850	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
		851	/// returned vector. \n
		852	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
		853	/// returned vector. \n
		854	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
		855	/// returned vector. \n
		856	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
		857	/// returned vector. \n
		858	/// Bits [65:64]: \n
		859	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
		860	/// returned vector. \n
		861	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
		862	/// returned vector. \n
		863	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
		864	/// returned vector. \n
		865	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
		866	/// returned vector. \n
		867	/// Bits [97:96]: \n
		868	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
		869	/// returned vector. \n
		870	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
		871	/// returned vector. \n
		872	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
		873	/// returned vector. \n
		874	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
		875	/// returned vector.
		876	/// \returns A 128-bit vector of [4 x float] containing the copied values.
		877	static __inline __m128 __DEFAULT_FN_ATTRS128
		878	_mm_permutevar_ps(__m128 __a, __m128i __c)
		879	{
			/* __a is passed to the builtin as __v4sf and the control vector __c as
			 * __v4si; the result is cast back to __m128. */
		880	return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c);
		881	}
		882
		883	/// Copies the values stored in a 256-bit vector of [8 x float] as
		884	/// specified by the 256-bit integer vector operand.
		885	///
		886	/// \headerfile <x86intrin.h>
		887	///
		888	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
		889	///
		890	/// \param __a
		891	/// A 256-bit vector of [8 x float].
		892	/// \param __c
		893	/// A 256-bit integer vector operand specifying how the values are to be
		894	/// copied. \n
		895	/// Bits [1:0]: \n
		896	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
		897	/// returned vector. \n
		898	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
		899	/// returned vector. \n
		900	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
		901	/// returned vector. \n
		902	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
		903	/// returned vector. \n
		904	/// Bits [33:32]: \n
		905	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
		906	/// returned vector. \n
		907	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
		908	/// returned vector. \n
		909	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
		910	/// returned vector. \n
		911	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
		912	/// returned vector. \n
		913	/// Bits [65:64]: \n
		914	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
		915	/// returned vector. \n
		916	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
		917	/// returned vector. \n
		918	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
		919	/// returned vector. \n
		920	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
		921	/// returned vector. \n
		922	/// Bits [97:96]: \n
		923	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
		924	/// returned vector. \n
		925	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
		926	/// returned vector. \n
		927	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
		928	/// returned vector. \n
		929	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
		930	/// returned vector. \n
		931	/// Bits [129:128]: \n
		932	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
		933	/// returned vector. \n
		934	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
		935	/// returned vector. \n
		936	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
		937	/// returned vector. \n
		938	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
		939	/// returned vector. \n
		940	/// Bits [161:160]: \n
		941	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
		942	/// returned vector. \n
		943	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
		944	/// returned vector. \n
		945	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
		946	/// returned vector. \n
		947	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
		948	/// returned vector. \n
		949	/// Bits [193:192]: \n
		950	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
		951	/// returned vector. \n
		952	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
		953	/// returned vector. \n
		954	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
		955	/// returned vector. \n
		956	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
		957	/// returned vector. \n
		958	/// Bits [225:224]: \n
		959	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
		960	/// returned vector. \n
		961	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
		962	/// returned vector. \n
		963	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
		964	/// returned vector. \n
		965	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
		966	/// returned vector.
		967	/// \returns A 256-bit vector of [8 x float] containing the copied values.
		968	static __inline __m256 __DEFAULT_FN_ATTRS
		969	_mm256_permutevar_ps(__m256 __a, __m256i __c)
		970	{
			/* __a is passed to the builtin as __v8sf and the control vector __c as
			 * __v8si (the int vector typedef declared at the top of this header);
			 * the result is cast back to __m256. */
		971	return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c);
		972	}
		973
		974	/// Copies the values in a 128-bit vector of [2 x double] as specified
		975	/// by the immediate integer operand.
		976	///
		977	/// \headerfile <x86intrin.h>
		978	///
		979	/// \code
		980	/// __m128d _mm_permute_pd(__m128d A, const int C);
		981	/// \endcode
		982	///
		983	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
		984	///
		985	/// \param A
		986	/// A 128-bit vector of [2 x double].
		987	/// \param C
		988	/// An immediate integer operand specifying how the values are to be
		989	/// copied. \n
		990	/// Bit [0]: \n
		991	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
		992	/// vector. \n
		993	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
		994	/// returned vector. \n
		995	/// Bit [1]: \n
		996	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
		997	/// returned vector. \n
		998	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
		999	/// returned vector.
		1000	/// \returns A 128-bit vector of [2 x double] containing the copied values.
		1001	#define _mm_permute_pd(A, C) /* passes A as __v2df and C as int to the builtin */ \
		1002	((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C)))
		1003
		1004	/// Copies the values in a 256-bit vector of [4 x double] as specified by
		1005	/// the immediate integer operand.
		1006	///
		1007	/// \headerfile <x86intrin.h>
		1008	///
		1009	/// \code
		1010	/// __m256d _mm256_permute_pd(__m256d A, const int C);
		1011	/// \endcode
		1012	///
		1013	/// This intrinsic corresponds to the <c> VPERMILPD </c> instruction.
		1014	///
		1015	/// \param A
		1016	/// A 256-bit vector of [4 x double].
		1017	/// \param C
		1018	/// An immediate integer operand specifying how the values are to be
		1019	/// copied. \n
		1020	/// Bit [0]: \n
		1021	/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned
		1022	/// vector. \n
		1023	/// 1: Bits [127:64] of the source are copied to bits [63:0] of the
		1024	/// returned vector. \n
		1025	/// Bit [1]: \n
		1026	/// 0: Bits [63:0] of the source are copied to bits [127:64] of the
		1027	/// returned vector. \n
		1028	/// 1: Bits [127:64] of the source are copied to bits [127:64] of the
		1029	/// returned vector. \n
		1030	/// Bit [2]: \n
		1031	/// 0: Bits [191:128] of the source are copied to bits [191:128] of the
		1032	/// returned vector. \n
		1033	/// 1: Bits [255:192] of the source are copied to bits [191:128] of the
		1034	/// returned vector. \n
		1035	/// Bit [3]: \n
		1036	/// 0: Bits [191:128] of the source are copied to bits [255:192] of the
		1037	/// returned vector. \n
		1038	/// 1: Bits [255:192] of the source are copied to bits [255:192] of the
		1039	/// returned vector.
		1040	/// \returns A 256-bit vector of [4 x double] containing the copied values.
		1041	#define _mm256_permute_pd(A, C) /* passes A as __v4df and C as int to the builtin */ \
		1042	((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C)))
		1043
		1044	/// Copies the values in a 128-bit vector of [4 x float] as specified by
		1045	/// the immediate integer operand.
		1046	///
		1047	/// \headerfile <x86intrin.h>
		1048	///
		1049	/// \code
		1050	/// __m128 _mm_permute_ps(__m128 A, const int C);
		1051	/// \endcode
		1052	///
		1053	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
		1054	///
		1055	/// \param A
		1056	/// A 128-bit vector of [4 x float].
		1057	/// \param C
		1058	/// An immediate integer operand specifying how the values are to be
		1059	/// copied. \n
		1060	/// Bits [1:0]: \n
		1061	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
		1062	/// returned vector. \n
		1063	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
		1064	/// returned vector. \n
		1065	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
		1066	/// returned vector. \n
		1067	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
		1068	/// returned vector. \n
		1069	/// Bits [3:2]: \n
		1070	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
		1071	/// returned vector. \n
		1072	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
		1073	/// returned vector. \n
		1074	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
		1075	/// returned vector. \n
		1076	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
		1077	/// returned vector. \n
		1078	/// Bits [5:4]: \n
		1079	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
		1080	/// returned vector. \n
		1081	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
		1082	/// returned vector. \n
		1083	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
		1084	/// returned vector. \n
		1085	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
		1086	/// returned vector. \n
		1087	/// Bits [7:6]: \n
		1088	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
		1089	/// returned vector. \n
		1090	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
		1091	/// returned vector. \n
		1092	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
		1093	/// returned vector. \n
		1094	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
		1095	/// returned vector.
		1096	/// \returns A 128-bit vector of [4 x float] containing the copied values.
		1097	#define _mm_permute_ps(A, C) /* passes A as __v4sf and C as int to the builtin */ \
		1098	((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C)))
		1099
		1100	/// Copies the values in a 256-bit vector of [8 x float] as specified by
		1101	/// the immediate integer operand.
		1102	///
		1103	/// \headerfile <x86intrin.h>
		1104	///
		1105	/// \code
		1106	/// __m256 _mm256_permute_ps(__m256 A, const int C);
		1107	/// \endcode
		1108	///
		1109	/// This intrinsic corresponds to the <c> VPERMILPS </c> instruction.
		1110	///
		1111	/// \param A
		1112	/// A 256-bit vector of [8 x float].
		1113	/// \param C
		1114	/// An immediate integer operand specifying how the values are to be
		1115	/// copied. \n
		1116	/// Bits [1:0]: \n
		1117	/// 00: Bits [31:0] of the source are copied to bits [31:0] of the
		1118	/// returned vector. \n
		1119	/// 01: Bits [63:32] of the source are copied to bits [31:0] of the
		1120	/// returned vector. \n
		1121	/// 10: Bits [95:64] of the source are copied to bits [31:0] of the
		1122	/// returned vector. \n
		1123	/// 11: Bits [127:96] of the source are copied to bits [31:0] of the
		1124	/// returned vector. \n
		1125	/// Bits [3:2]: \n
		1126	/// 00: Bits [31:0] of the source are copied to bits [63:32] of the
		1127	/// returned vector. \n
		1128	/// 01: Bits [63:32] of the source are copied to bits [63:32] of the
		1129	/// returned vector. \n
		1130	/// 10: Bits [95:64] of the source are copied to bits [63:32] of the
		1131	/// returned vector. \n
		1132	/// 11: Bits [127:96] of the source are copied to bits [63:32] of the
		1133	/// returned vector. \n
		1134	/// Bits [5:4]: \n
		1135	/// 00: Bits [31:0] of the source are copied to bits [95:64] of the
		1136	/// returned vector. \n
		1137	/// 01: Bits [63:32] of the source are copied to bits [95:64] of the
		1138	/// returned vector. \n
		1139	/// 10: Bits [95:64] of the source are copied to bits [95:64] of the
		1140	/// returned vector. \n
		1141	/// 11: Bits [127:96] of the source are copied to bits [95:64] of the
		1142	/// returned vector. \n
		1143	/// Bits [7:6]: \n
		1144	/// 00: Bits [31:0] of the source are copied to bits [127:96] of the
		1145	/// returned vector. \n
		1146	/// 01: Bits [63:32] of the source are copied to bits [127:96] of the
		1147	/// returned vector. \n
		1148	/// 10: Bits [95:64] of the source are copied to bits [127:96] of the
		1149	/// returned vector. \n
		1150	/// 11: Bits [127:96] of the source are copied to bits [127:96] of the
		1151	/// returned vector. \n
		1152	/// Bits [1:0] (applied again, to the upper 128-bit half): \n
		1153	/// 00: Bits [159:128] of the source are copied to bits [159:128] of the
		1154	/// returned vector. \n
		1155	/// 01: Bits [191:160] of the source are copied to bits [159:128] of the
		1156	/// returned vector. \n
		1157	/// 10: Bits [223:192] of the source are copied to bits [159:128] of the
		1158	/// returned vector. \n
		1159	/// 11: Bits [255:224] of the source are copied to bits [159:128] of the
		1160	/// returned vector. \n
		1161	/// Bits [3:2] (applied again, to the upper 128-bit half): \n
		1162	/// 00: Bits [159:128] of the source are copied to bits [191:160] of the
		1163	/// returned vector. \n
		1164	/// 01: Bits [191:160] of the source are copied to bits [191:160] of the
		1165	/// returned vector. \n
		1166	/// 10: Bits [223:192] of the source are copied to bits [191:160] of the
		1167	/// returned vector. \n
		1168	/// 11: Bits [255:224] of the source are copied to bits [191:160] of the
		1169	/// returned vector. \n
		1170	/// Bits [5:4] (applied again, to the upper 128-bit half): \n
		1171	/// 00: Bits [159:128] of the source are copied to bits [223:192] of the
		1172	/// returned vector. \n
		1173	/// 01: Bits [191:160] of the source are copied to bits [223:192] of the
		1174	/// returned vector. \n
		1175	/// 10: Bits [223:192] of the source are copied to bits [223:192] of the
		1176	/// returned vector. \n
		1177	/// 11: Bits [255:224] of the source are copied to bits [223:192] of the
		1178	/// returned vector. \n
		1179	/// Bits [7:6] (applied again, to the upper 128-bit half): \n
		1180	/// 00: Bits [159:128] of the source are copied to bits [255:224] of the
		1181	/// returned vector. \n
		1182	/// 01: Bits [191:160] of the source are copied to bits [255:224] of the
		1183	/// returned vector. \n
		1184	/// 10: Bits [223:192] of the source are copied to bits [255:224] of the
		1185	/// returned vector. \n
		1186	/// 11: Bits [255:224] of the source are copied to bits [255:224] of the
		1187	/// returned vector.
		1188	/// \returns A 256-bit vector of [8 x float] containing the copied values.
		1189	#define _mm256_permute_ps(A, C) /* passes A as __v8sf and C as int to the builtin */ \
		1190	((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C)))
		1191
		1192	/// Permutes 128-bit data values stored in two 256-bit vectors of
		1193	/// [4 x double], as specified by the immediate integer operand.
		1194	///
		1195	/// \headerfile <x86intrin.h>
		1196	///
		1197	/// \code
		1198	/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M);
		1199	/// \endcode
		1200	///
		1201	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
		1202	///
		1203	/// \param V1
		1204	/// A 256-bit vector of [4 x double].
		1205	/// \param V2
		1206	/// A 256-bit vector of [4 x double].
		1207	/// \param M
		1208	/// An immediate integer operand specifying how the values are to be
		1209	/// permuted. \n
		1210	/// Bits [1:0]: \n
		1211	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
		1212	/// destination. \n
		1213	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
		1214	/// destination. \n
		1215	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
		1216	/// destination. \n
		1217	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
		1218	/// destination. \n
		1219	/// Bits [5:4]: \n
		1220	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
		1221	/// destination. \n
		1222	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
		1223	/// destination. \n
		1224	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
		1225	/// destination. \n
		1226	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
		1227	/// destination.
		1228	/// \returns A 256-bit vector of [4 x double] containing the copied values.
		1229	#define _mm256_permute2f128_pd(V1, V2, M) /* passes V1, V2 as __v4df and M as int */ \
		1230	((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \
		1231	(__v4df)(__m256d)(V2), (int)(M)))
		1232
		1233	/// Permutes 128-bit data values stored in two 256-bit vectors of
		1234	/// [8 x float], as specified by the immediate integer operand.
		1235	///
		1236	/// \headerfile <x86intrin.h>
		1237	///
		1238	/// \code
		1239	/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M);
		1240	/// \endcode
		1241	///
		1242	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
		1243	///
		1244	/// \param V1
		1245	/// A 256-bit vector of [8 x float].
		1246	/// \param V2
		1247	/// A 256-bit vector of [8 x float].
		1248	/// \param M
		1249	/// An immediate integer operand specifying how the values are to be
		1250	/// permuted. \n
		1251	/// Bits [1:0]: \n
		1252	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
		1253	/// destination. \n
		1254	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
		1255	/// destination. \n
		1256	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
		1257	/// destination. \n
		1258	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
		1259	/// destination. \n
		1260	/// Bits [5:4]: \n
		1261	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
		1262	/// destination. \n
		1263	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
		1264	/// destination. \n
		1265	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
		1266	/// destination. \n
		1267	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
		1268	/// destination.
		1269	/// \returns A 256-bit vector of [8 x float] containing the copied values.
		1270	#define _mm256_permute2f128_ps(V1, V2, M) /* passes V1, V2 as __v8sf and M as int */ \
		1271	((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \
		1272	(__v8sf)(__m256)(V2), (int)(M)))
		1273
		1274	/// Permutes 128-bit data values stored in two 256-bit integer vectors,
		1275	/// as specified by the immediate integer operand.
		1276	///
		1277	/// \headerfile <x86intrin.h>
		1278	///
		1279	/// \code
		1280	/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M);
		1281	/// \endcode
		1282	///
		1283	/// This intrinsic corresponds to the <c> VPERM2F128 </c> instruction.
		1284	///
		1285	/// \param V1
		1286	/// A 256-bit integer vector.
		1287	/// \param V2
		1288	/// A 256-bit integer vector.
		1289	/// \param M
		1290	/// An immediate integer operand specifying how the values are to be copied. \n
		1291	/// Bits [1:0]: \n
		1292	/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the
		1293	/// destination. \n
		1294	/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the
		1295	/// destination. \n
		1296	/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the
		1297	/// destination. \n
		1298	/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the
		1299	/// destination. \n
		1300	/// Bits [5:4]: \n
		1301	/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the
		1302	/// destination. \n
		1303	/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the
		1304	/// destination. \n
		1305	/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the
		1306	/// destination. \n
		1307	/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the
		1308	/// destination.
		1309	/// \returns A 256-bit integer vector containing the copied values.
		1310	#define _mm256_permute2f128_si256(V1, V2, M) /* passes V1, V2 as __v8si and M as int */ \
		1311	((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \
		1312	(__v8si)(__m256i)(V2), (int)(M)))
		1313
		1314	/* Vector Blend */
		1315	/// Merges 64-bit double-precision data values stored in either of the
		1316	/// two 256-bit vectors of [4 x double], as specified by the immediate
		1317	/// integer operand.
		1318	///
		1319	/// \headerfile <x86intrin.h>
		1320	///
		1321	/// \code
		1322	/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M);
		1323	/// \endcode
		1324	///
		1325	/// This intrinsic corresponds to the <c> VBLENDPD </c> instruction.
		1326	///
		1327	/// \param V1
		1328	/// A 256-bit vector of [4 x double].
		1329	/// \param V2
		1330	/// A 256-bit vector of [4 x double].
		1331	/// \param M
		1332	/// An immediate integer operand, with mask bits [3:0] specifying how the
		1333	/// values are to be copied. The position of the mask bit corresponds to the
		1334	/// index of a copied value. When a mask bit is 0, the corresponding 64-bit
		1335	/// element in operand \a V1 is copied to the same position in the
		1336	/// destination. When a mask bit is 1, the corresponding 64-bit element in
		1337	/// operand \a V2 is copied to the same position in the destination.
		1338	/// \returns A 256-bit vector of [4 x double] containing the copied values.
		1339	#define _mm256_blend_pd(V1, V2, M) /* passes V1, V2 as __v4df and M as int */ \
		1340	((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \
		1341	(__v4df)(__m256d)(V2), (int)(M)))
		1342
		1343	/// Merges 32-bit single-precision data values stored in either of the
		1344	/// two 256-bit vectors of [8 x float], as specified by the immediate
		1345	/// integer operand.
		1346	///
		1347	/// \headerfile <x86intrin.h>
		1348	///
		1349	/// \code
		1350	/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M);
		1351	/// \endcode
		1352	///
		1353	/// This intrinsic corresponds to the <c> VBLENDPS </c> instruction.
		1354	///
		1355	/// \param V1
		1356	/// A 256-bit vector of [8 x float].
		1357	/// \param V2
		1358	/// A 256-bit vector of [8 x float].
		1359	/// \param M
		1360	/// An immediate integer operand, with mask bits [7:0] specifying how the
		1361	/// values are to be copied. The position of the mask bit corresponds to the
		1362	/// index of a copied value. When a mask bit is 0, the corresponding 32-bit
		1363	/// element in operand \a V1 is copied to the same position in the
		1364	/// destination. When a mask bit is 1, the corresponding 32-bit element in
		1365	/// operand \a V2 is copied to the same position in the destination.
		1366	/// \returns A 256-bit vector of [8 x float] containing the copied values.
		1367	#define _mm256_blend_ps(V1, V2, M) /* passes V1, V2 as __v8sf and M as int */ \
		1368	((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \
		1369	(__v8sf)(__m256)(V2), (int)(M)))
		1370
		1371	/// Merges 64-bit double-precision data values stored in either of the
		1372	/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector
		1373	/// operand.
		1374	///
		1375	/// \headerfile <x86intrin.h>
		1376	///
		1377	/// This intrinsic corresponds to the <c> VBLENDVPD </c> instruction.
		1378	///
		1379	/// \param __a
		1380	/// A 256-bit vector of [4 x double].
		1381	/// \param __b
		1382	/// A 256-bit vector of [4 x double].
		1383	/// \param __c
		1384	/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying
		1385	/// how the values are to be copied. The position of the mask bit corresponds
		1386	/// to the most significant bit of a copied value. When a mask bit is 0, the
		1387	/// corresponding 64-bit element in operand \a __a is copied to the same
		1388	/// position in the destination. When a mask bit is 1, the corresponding
		1389	/// 64-bit element in operand \a __b is copied to the same position in the
		1390	/// destination.
		1391	/// \returns A 256-bit vector of [4 x double] containing the copied values.
		1392	static __inline __m256d __DEFAULT_FN_ATTRS
		1393	_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c)
		1394	{
			/* The mask __c is a floating-point vector here, so all three operands
			 * are passed to the builtin as __v4df; the result is cast back to
			 * __m256d. */
		1395	return (__m256d)__builtin_ia32_blendvpd256(
		1396	(__v4df)__a, (__v4df)__b, (__v4df)__c);
		1397	}
		1398
		1399	/// Merges 32-bit single-precision data values stored in either of the
		1400	/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector
		1401	/// operand.
		1402	///
		1403	/// \headerfile <x86intrin.h>
		1404	///
		1405	/// This intrinsic corresponds to the <c> VBLENDVPS </c> instruction.
		1406	///
		1407	/// \param __a
		1408	/// A 256-bit vector of [8 x float].
		1409	/// \param __b
		1410	/// A 256-bit vector of [8 x float].
		1411	/// \param __c
		1412	/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63,
		1413	/// and 31 specifying how the values are to be copied. The position of the
		1414	/// mask bit corresponds to the most significant bit of a copied value. When
		1415	/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is
		1416	/// copied to the same position in the destination. When a mask bit is 1, the
		1417	/// corresponding 32-bit element in operand \a __b is copied to the same
		1418	/// position in the destination.
		1419	/// \returns A 256-bit vector of [8 x float] containing the copied values.
		1420	static __inline __m256 __DEFAULT_FN_ATTRS
		1421	_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c)
		1422	{
			/* The mask __c is a floating-point vector here, so all three operands
			 * are passed to the builtin as __v8sf; the result is cast back to
			 * __m256. */
		1423	return (__m256)__builtin_ia32_blendvps256(
		1424	(__v8sf)__a, (__v8sf)__b, (__v8sf)__c);
		1425	}
		1426
		1427	/* Vector Dot Product */
		1428	/// Computes two dot products in parallel, using the lower and upper
		1429	/// halves of two [8 x float] vectors as input to the two computations, and
		1430	/// returning the two dot products in the lower and upper halves of the
		1431	/// [8 x float] result.
		1432	///
		1433	/// The immediate integer operand controls which input elements will
		1434	/// contribute to the dot product, and where the final results are returned.
		1435	/// In general, for each dot product, the four corresponding elements of the
		1436	/// input vectors are multiplied; the first two and second two products are
		1437	/// summed, then the two sums are added to form the final result.
		1438	///
		1439	/// \headerfile <x86intrin.h>
		1440	///
		1441	/// \code
		1442	/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M);
		1443	/// \endcode
		1444	///
		1445	/// This intrinsic corresponds to the <c> VDPPS </c> instruction.
		1446	///
		1447	/// \param V1
		1448	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
		1449	/// \param V2
		1450	/// A vector of [8 x float] values, treated as two [4 x float] vectors.
		1451	/// \param M
		1452	/// An immediate integer argument. Bits [7:4] determine which elements of
		1453	/// the input vectors are used, with bit [4] corresponding to the lowest
		1454	/// element and bit [7] corresponding to the highest element of each [4 x
		1455	/// float] subvector. If a bit is set, the corresponding elements from the
		1456	/// two input vectors are used as an input for dot product; otherwise that
		1457	/// input is treated as zero. Bits [3:0] determine which elements of the
		1458	/// result will receive a copy of the final dot product, with bit [0]
		1459	/// corresponding to the lowest element and bit [3] corresponding to the
		1460	/// highest element of each [4 x float] subvector. If a bit is set, the dot
		1461	/// product is returned in the corresponding element; otherwise that element
		1462	/// is set to zero. The bitmask is applied in the same way to each of the
		1463	/// two parallel dot product computations.
		1464	/// \returns A 256-bit vector of [8 x float] containing the two dot products.
		1465	#define _mm256_dp_ps(V1, V2, M) /* passes V1, V2 as __v8sf; M is forwarded without an (int) cast, unlike the permute/blend macros in this header -- NOTE(review): presumably intentional, confirm against the builtin's immediate type */ \
		1466	((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \
		1467	(__v8sf)(__m256)(V2), (M)))
		1468
		1469	/* Vector shuffle */
		1470	/// Selects 8 float values from the 256-bit operands of [8 x float], as
		1471	/// specified by the immediate value operand.
		1472	///
		1473	/// The four selected elements in each operand are copied to the destination
		1474	/// according to the bits specified in the immediate operand. The selected
		1475	/// elements from the first 256-bit operand are copied to bits [63:0] and
		1476	/// bits [191:128] of the destination, and the selected elements from the
		1477	/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of
		1478	/// the destination. For example, if bits [7:0] of the immediate operand
		1479	/// contain a value of 0xFF, the 256-bit destination vector would contain the
		1480	/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3].
		1481	///
		1482	/// \headerfile <x86intrin.h>
		1483	///
		1484	/// \code
		1485	/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask);
		1486	/// \endcode
		1487	///
		1488	/// This intrinsic corresponds to the <c> VSHUFPS </c> instruction.
		1489	///
		1490	/// \param a
		1491	/// A 256-bit vector of [8 x float]. The four selected elements in this
		1492	/// operand are copied to bits [63:0] and bits [191:128] in the destination,
		1493	/// according to the bits specified in the immediate operand.
		1494	/// \param b
		1495	/// A 256-bit vector of [8 x float]. The four selected elements in this
		1496	/// operand are copied to bits [127:64] and bits [255:192] in the
		1497	/// destination, according to the bits specified in the immediate operand.
		1498	/// \param mask
		1499	/// An immediate value containing an 8-bit value specifying which elements to
		1500	/// copy from \a a and \a b. \n
		1501	/// Bits [3:0] specify the values copied from operand \a a. \n
		1502	/// Bits [7:4] specify the values copied from operand \a b. \n
		1503	/// The destinations within the 256-bit destination are assigned values as
		1504	/// follows, according to the bit value assignments described below: \n
		1505	/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the
		1506	/// destination. \n
		1507	/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the
		1508	/// destination. \n
		1509	/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the
		1510	/// destination. \n
		1511	/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in
		1512	/// the destination. \n
		1513	/// Bit value assignments: \n
		1514	/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n
		1515	/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n
		1516	/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n
		1517	/// 11: Bits [127:96] and [255:224] are copied from the selected operand. \n
		1518	/// Note: To generate a mask, you can use the \c _MM_SHUFFLE macro.
		1519	/// <c>_MM_SHUFFLE(b6, b4, b2, b0)</c> can create an 8-bit mask of the form
		1520	/// <c>[b6, b4, b2, b0]</c>.
		1521	/// \returns A 256-bit vector of [8 x float] containing the shuffled values.
		1522	#define _mm256_shuffle_ps(a, b, mask) \
		1523	((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \
		1524	(__v8sf)(__m256)(b), (int)(mask)))
		1525
		1526	/// Selects four double-precision values from the 256-bit operands of
		1527	/// [4 x double], as specified by the immediate value operand.
		1528	///
		1529	/// The selected elements from the first 256-bit operand are copied to bits
		1530	/// [63:0] and bits [191:128] in the destination, and the selected elements
		1531	/// from the second 256-bit operand are copied to bits [127:64] and bits
		1532	/// [255:192] in the destination. For example, if bits [3:0] of the immediate
		1533	/// operand contain a value of 0xF, the 256-bit destination vector would
		1534	/// contain the following values: b[3], a[3], b[1], a[1].
		1535	///
		1536	/// \headerfile <x86intrin.h>
		1537	///
		1538	/// \code
		1539	/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask);
		1540	/// \endcode
		1541	///
		1542	/// This intrinsic corresponds to the <c> VSHUFPD </c> instruction.
		1543	///
		1544	/// \param a
		1545	/// A 256-bit vector of [4 x double].
		1546	/// \param b
		1547	/// A 256-bit vector of [4 x double].
		1548	/// \param mask
		1549	/// An immediate value whose bits [3:0] specify which elements to
		1550	/// copy from \a a and \a b: \n
		1551	/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the
		1552	/// destination. \n
		1553	/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the
		1554	/// destination. \n
		1555	/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the
		1556	/// destination. \n
		1557	/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the
		1558	/// destination. \n
		1559	/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the
		1560	/// destination. \n
		1561	/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the
		1562	/// destination. \n
		1563	/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the
		1564	/// destination. \n
		1565	/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the
		1566	/// destination.
		1567	/// \returns A 256-bit vector of [4 x double] containing the shuffled values.
		1568	#define _mm256_shuffle_pd(a, b, mask) \
		1569	((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \
		1570	(__v4df)(__m256d)(b), (int)(mask)))
		1571
		1572	/* Compare: predicate values for the immediate operand of the _mm*_cmp_* macros */
		1573	#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */
		1574	#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */
		1575	#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */
		1576	#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */
		1577	#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */
		1578	#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */
		1579	#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */
		1580	#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */
		1581	#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */
		1582	#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */
		1583	#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */
		1584	#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */
		1585	#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */
		1586	#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */
		1587	#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */
		1588	#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */
		1589	#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */
		1590	#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */
		1591	#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */
		1592	#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */
		1593	#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */
		1594	#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */
		1595	#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */
		1596	#define _CMP_ORD_S 0x17 /* Ordered (signaling) */
		1597	#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */
		1598	#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */
		1599	#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */
		1600	#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */
		1601	#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */
		1602	#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */
		1603	#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */
		1604	#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */
		1605
		1606	/// Compares each of the corresponding double-precision values of two
		1607	/// 128-bit vectors of [2 x double], using the operation specified by the
		1608	/// immediate integer operand.
		1609	///
		1610	/// Returns a [2 x double] vector consisting of two doubles corresponding to
		1611	/// the two comparison results: zero if the comparison is false, and all 1's
		1612	/// if the comparison is true.
		1613	///
		1614	/// \headerfile <x86intrin.h>
		1615	///
		1616	/// \code
		1617	/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c);
		1618	/// \endcode
		1619	///
		1620	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
		1621	///
		1622	/// \param a
		1623	/// A 128-bit vector of [2 x double].
		1624	/// \param b
		1625	/// A 128-bit vector of [2 x double].
		1626	/// \param c
		1627	/// An immediate integer operand, with bits [4:0] specifying which comparison
		1628	/// operation to use (the _CMP_* macros define these values): \n
		1629	/// 0x00: Equal (ordered, non-signaling) \n
		1630	/// 0x01: Less-than (ordered, signaling) \n
		1631	/// 0x02: Less-than-or-equal (ordered, signaling) \n
		1632	/// 0x03: Unordered (non-signaling) \n
		1633	/// 0x04: Not-equal (unordered, non-signaling) \n
		1634	/// 0x05: Not-less-than (unordered, signaling) \n
		1635	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
		1636	/// 0x07: Ordered (non-signaling) \n
		1637	/// 0x08: Equal (unordered, non-signaling) \n
		1638	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
		1639	/// 0x0A: Not-greater-than (unordered, signaling) \n
		1640	/// 0x0B: False (ordered, non-signaling) \n
		1641	/// 0x0C: Not-equal (ordered, non-signaling) \n
		1642	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
		1643	/// 0x0E: Greater-than (ordered, signaling) \n
		1644	/// 0x0F: True (unordered, non-signaling) \n
		1645	/// 0x10: Equal (ordered, signaling) \n
		1646	/// 0x11: Less-than (ordered, non-signaling) \n
		1647	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
		1648	/// 0x13: Unordered (signaling) \n
		1649	/// 0x14: Not-equal (unordered, signaling) \n
		1650	/// 0x15: Not-less-than (unordered, non-signaling) \n
		1651	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
		1652	/// 0x17: Ordered (signaling) \n
		1653	/// 0x18: Equal (unordered, signaling) \n
		1654	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
		1655	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
		1656	/// 0x1B: False (ordered, signaling) \n
		1657	/// 0x1C: Not-equal (ordered, signaling) \n
		1658	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
		1659	/// 0x1E: Greater-than (ordered, non-signaling) \n
		1660	/// 0x1F: True (unordered, signaling)
		1661	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
		1662	#define _mm_cmp_pd(a, b, c) \
		1663	((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \
		1664	(__v2df)(__m128d)(b), (c)))
		1665
		1666	/// Compares each of the corresponding values of two 128-bit vectors of
		1667	/// [4 x float], using the operation specified by the immediate integer
		1668	/// operand.
		1669	///
		1670	/// Returns a [4 x float] vector consisting of four floats corresponding to
		1671	/// the four comparison results: zero if the comparison is false, and all 1's
		1672	/// if the comparison is true.
		1673	///
		1674	/// \headerfile <x86intrin.h>
		1675	///
		1676	/// \code
		1677	/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c);
		1678	/// \endcode
		1679	///
		1680	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
		1681	///
		1682	/// \param a
		1683	/// A 128-bit vector of [4 x float].
		1684	/// \param b
		1685	/// A 128-bit vector of [4 x float].
		1686	/// \param c
		1687	/// An immediate integer operand, with bits [4:0] specifying which comparison
		1688	/// operation to use (the _CMP_* macros define these values): \n
		1689	/// 0x00: Equal (ordered, non-signaling) \n
		1690	/// 0x01: Less-than (ordered, signaling) \n
		1691	/// 0x02: Less-than-or-equal (ordered, signaling) \n
		1692	/// 0x03: Unordered (non-signaling) \n
		1693	/// 0x04: Not-equal (unordered, non-signaling) \n
		1694	/// 0x05: Not-less-than (unordered, signaling) \n
		1695	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
		1696	/// 0x07: Ordered (non-signaling) \n
		1697	/// 0x08: Equal (unordered, non-signaling) \n
		1698	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
		1699	/// 0x0A: Not-greater-than (unordered, signaling) \n
		1700	/// 0x0B: False (ordered, non-signaling) \n
		1701	/// 0x0C: Not-equal (ordered, non-signaling) \n
		1702	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
		1703	/// 0x0E: Greater-than (ordered, signaling) \n
		1704	/// 0x0F: True (unordered, non-signaling) \n
		1705	/// 0x10: Equal (ordered, signaling) \n
		1706	/// 0x11: Less-than (ordered, non-signaling) \n
		1707	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
		1708	/// 0x13: Unordered (signaling) \n
		1709	/// 0x14: Not-equal (unordered, signaling) \n
		1710	/// 0x15: Not-less-than (unordered, non-signaling) \n
		1711	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
		1712	/// 0x17: Ordered (signaling) \n
		1713	/// 0x18: Equal (unordered, signaling) \n
		1714	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
		1715	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
		1716	/// 0x1B: False (ordered, signaling) \n
		1717	/// 0x1C: Not-equal (ordered, signaling) \n
		1718	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
		1719	/// 0x1E: Greater-than (ordered, non-signaling) \n
		1720	/// 0x1F: True (unordered, signaling)
		1721	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		1722	#define _mm_cmp_ps(a, b, c) \
		1723	((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \
		1724	(__v4sf)(__m128)(b), (c)))
		1725
		1726	/// Compares each of the corresponding double-precision values of two
		1727	/// 256-bit vectors of [4 x double], using the operation specified by the
		1728	/// immediate integer operand.
		1729	///
		1730	/// Returns a [4 x double] vector consisting of four doubles corresponding to
		1731	/// the four comparison results: zero if the comparison is false, and all 1's
		1732	/// if the comparison is true.
		1733	///
		1734	/// \headerfile <x86intrin.h>
		1735	///
		1736	/// \code
		1737	/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c);
		1738	/// \endcode
		1739	///
		1740	/// This intrinsic corresponds to the <c> VCMPPD </c> instruction.
		1741	///
		1742	/// \param a
		1743	/// A 256-bit vector of [4 x double].
		1744	/// \param b
		1745	/// A 256-bit vector of [4 x double].
		1746	/// \param c
		1747	/// An immediate integer operand, with bits [4:0] specifying which comparison
		1748	/// operation to use (the _CMP_* macros define these values): \n
		1749	/// 0x00: Equal (ordered, non-signaling) \n
		1750	/// 0x01: Less-than (ordered, signaling) \n
		1751	/// 0x02: Less-than-or-equal (ordered, signaling) \n
		1752	/// 0x03: Unordered (non-signaling) \n
		1753	/// 0x04: Not-equal (unordered, non-signaling) \n
		1754	/// 0x05: Not-less-than (unordered, signaling) \n
		1755	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
		1756	/// 0x07: Ordered (non-signaling) \n
		1757	/// 0x08: Equal (unordered, non-signaling) \n
		1758	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
		1759	/// 0x0A: Not-greater-than (unordered, signaling) \n
		1760	/// 0x0B: False (ordered, non-signaling) \n
		1761	/// 0x0C: Not-equal (ordered, non-signaling) \n
		1762	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
		1763	/// 0x0E: Greater-than (ordered, signaling) \n
		1764	/// 0x0F: True (unordered, non-signaling) \n
		1765	/// 0x10: Equal (ordered, signaling) \n
		1766	/// 0x11: Less-than (ordered, non-signaling) \n
		1767	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
		1768	/// 0x13: Unordered (signaling) \n
		1769	/// 0x14: Not-equal (unordered, signaling) \n
		1770	/// 0x15: Not-less-than (unordered, non-signaling) \n
		1771	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
		1772	/// 0x17: Ordered (signaling) \n
		1773	/// 0x18: Equal (unordered, signaling) \n
		1774	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
		1775	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
		1776	/// 0x1B: False (ordered, signaling) \n
		1777	/// 0x1C: Not-equal (ordered, signaling) \n
		1778	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
		1779	/// 0x1E: Greater-than (ordered, non-signaling) \n
		1780	/// 0x1F: True (unordered, signaling)
		1781	/// \returns A 256-bit vector of [4 x double] containing the comparison results.
		1782	#define _mm256_cmp_pd(a, b, c) \
		1783	((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \
		1784	(__v4df)(__m256d)(b), (c)))
		1785
		1786	/// Compares each of the corresponding values of two 256-bit vectors of
		1787	/// [8 x float], using the operation specified by the immediate integer
		1788	/// operand.
		1789	///
		1790	/// Returns an [8 x float] vector consisting of eight floats corresponding to
		1791	/// the eight comparison results: zero if the comparison is false, and all
		1792	/// 1's if the comparison is true.
		1793	///
		1794	/// \headerfile <x86intrin.h>
		1795	///
		1796	/// \code
		1797	/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c);
		1798	/// \endcode
		1799	///
		1800	/// This intrinsic corresponds to the <c> VCMPPS </c> instruction.
		1801	///
		1802	/// \param a
		1803	/// A 256-bit vector of [8 x float].
		1804	/// \param b
		1805	/// A 256-bit vector of [8 x float].
		1806	/// \param c
		1807	/// An immediate integer operand, with bits [4:0] specifying which comparison
		1808	/// operation to use (the _CMP_* macros define these values): \n
		1809	/// 0x00: Equal (ordered, non-signaling) \n
		1810	/// 0x01: Less-than (ordered, signaling) \n
		1811	/// 0x02: Less-than-or-equal (ordered, signaling) \n
		1812	/// 0x03: Unordered (non-signaling) \n
		1813	/// 0x04: Not-equal (unordered, non-signaling) \n
		1814	/// 0x05: Not-less-than (unordered, signaling) \n
		1815	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
		1816	/// 0x07: Ordered (non-signaling) \n
		1817	/// 0x08: Equal (unordered, non-signaling) \n
		1818	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
		1819	/// 0x0A: Not-greater-than (unordered, signaling) \n
		1820	/// 0x0B: False (ordered, non-signaling) \n
		1821	/// 0x0C: Not-equal (ordered, non-signaling) \n
		1822	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
		1823	/// 0x0E: Greater-than (ordered, signaling) \n
		1824	/// 0x0F: True (unordered, non-signaling) \n
		1825	/// 0x10: Equal (ordered, signaling) \n
		1826	/// 0x11: Less-than (ordered, non-signaling) \n
		1827	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
		1828	/// 0x13: Unordered (signaling) \n
		1829	/// 0x14: Not-equal (unordered, signaling) \n
		1830	/// 0x15: Not-less-than (unordered, non-signaling) \n
		1831	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
		1832	/// 0x17: Ordered (signaling) \n
		1833	/// 0x18: Equal (unordered, signaling) \n
		1834	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
		1835	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
		1836	/// 0x1B: False (ordered, signaling) \n
		1837	/// 0x1C: Not-equal (ordered, signaling) \n
		1838	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
		1839	/// 0x1E: Greater-than (ordered, non-signaling) \n
		1840	/// 0x1F: True (unordered, signaling)
		1841	/// \returns A 256-bit vector of [8 x float] containing the comparison results.
		1842	#define _mm256_cmp_ps(a, b, c) \
		1843	((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \
		1844	(__v8sf)(__m256)(b), (c)))
		1845
		1846	/// Compares each of the corresponding scalar double-precision values of
		1847	/// two 128-bit vectors of [2 x double], using the operation specified by the
		1848	/// immediate integer operand.
		1849	///
		1850	/// If the comparison is true, all 64 bits of the result element are set;
		1851	/// otherwise they are cleared.
		1852	///
		1853	/// \headerfile <x86intrin.h>
		1854	///
		1855	/// \code
		1856	/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c);
		1857	/// \endcode
		1858	///
		1859	/// This intrinsic corresponds to the <c> VCMPSD </c> instruction.
		1860	///
		1861	/// \param a
		1862	/// A 128-bit vector of [2 x double].
		1863	/// \param b
		1864	/// A 128-bit vector of [2 x double].
		1865	/// \param c
		1866	/// An immediate integer operand, with bits [4:0] specifying which comparison
		1867	/// operation to use (the _CMP_* macros define these values): \n
		1868	/// 0x00: Equal (ordered, non-signaling) \n
		1869	/// 0x01: Less-than (ordered, signaling) \n
		1870	/// 0x02: Less-than-or-equal (ordered, signaling) \n
		1871	/// 0x03: Unordered (non-signaling) \n
		1872	/// 0x04: Not-equal (unordered, non-signaling) \n
		1873	/// 0x05: Not-less-than (unordered, signaling) \n
		1874	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
		1875	/// 0x07: Ordered (non-signaling) \n
		1876	/// 0x08: Equal (unordered, non-signaling) \n
		1877	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
		1878	/// 0x0A: Not-greater-than (unordered, signaling) \n
		1879	/// 0x0B: False (ordered, non-signaling) \n
		1880	/// 0x0C: Not-equal (ordered, non-signaling) \n
		1881	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
		1882	/// 0x0E: Greater-than (ordered, signaling) \n
		1883	/// 0x0F: True (unordered, non-signaling) \n
		1884	/// 0x10: Equal (ordered, signaling) \n
		1885	/// 0x11: Less-than (ordered, non-signaling) \n
		1886	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
		1887	/// 0x13: Unordered (signaling) \n
		1888	/// 0x14: Not-equal (unordered, signaling) \n
		1889	/// 0x15: Not-less-than (unordered, non-signaling) \n
		1890	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
		1891	/// 0x17: Ordered (signaling) \n
		1892	/// 0x18: Equal (unordered, signaling) \n
		1893	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
		1894	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
		1895	/// 0x1B: False (ordered, signaling) \n
		1896	/// 0x1C: Not-equal (ordered, signaling) \n
		1897	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
		1898	/// 0x1E: Greater-than (ordered, non-signaling) \n
		1899	/// 0x1F: True (unordered, signaling)
		1900	/// \returns A 128-bit vector of [2 x double] containing the comparison results.
		1901	#define _mm_cmp_sd(a, b, c) \
		1902	((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \
		1903	(__v2df)(__m128d)(b), (c)))
		1904
		1905	/// Compares each of the corresponding scalar values of two 128-bit
		1906	/// vectors of [4 x float], using the operation specified by the immediate
		1907	/// integer operand.
		1908	///
		1909	/// If the comparison is true, all 32 bits of the result element are set;
		1910	/// otherwise they are cleared.
		1911	///
		1912	/// \headerfile <x86intrin.h>
		1913	///
		1914	/// \code
		1915	/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c);
		1916	/// \endcode
		1917	///
		1918	/// This intrinsic corresponds to the <c> VCMPSS </c> instruction.
		1919	///
		1920	/// \param a
		1921	/// A 128-bit vector of [4 x float].
		1922	/// \param b
		1923	/// A 128-bit vector of [4 x float].
		1924	/// \param c
		1925	/// An immediate integer operand, with bits [4:0] specifying which comparison
		1926	/// operation to use (the _CMP_* macros define these values): \n
		1927	/// 0x00: Equal (ordered, non-signaling) \n
		1928	/// 0x01: Less-than (ordered, signaling) \n
		1929	/// 0x02: Less-than-or-equal (ordered, signaling) \n
		1930	/// 0x03: Unordered (non-signaling) \n
		1931	/// 0x04: Not-equal (unordered, non-signaling) \n
		1932	/// 0x05: Not-less-than (unordered, signaling) \n
		1933	/// 0x06: Not-less-than-or-equal (unordered, signaling) \n
		1934	/// 0x07: Ordered (non-signaling) \n
		1935	/// 0x08: Equal (unordered, non-signaling) \n
		1936	/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n
		1937	/// 0x0A: Not-greater-than (unordered, signaling) \n
		1938	/// 0x0B: False (ordered, non-signaling) \n
		1939	/// 0x0C: Not-equal (ordered, non-signaling) \n
		1940	/// 0x0D: Greater-than-or-equal (ordered, signaling) \n
		1941	/// 0x0E: Greater-than (ordered, signaling) \n
		1942	/// 0x0F: True (unordered, non-signaling) \n
		1943	/// 0x10: Equal (ordered, signaling) \n
		1944	/// 0x11: Less-than (ordered, non-signaling) \n
		1945	/// 0x12: Less-than-or-equal (ordered, non-signaling) \n
		1946	/// 0x13: Unordered (signaling) \n
		1947	/// 0x14: Not-equal (unordered, signaling) \n
		1948	/// 0x15: Not-less-than (unordered, non-signaling) \n
		1949	/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n
		1950	/// 0x17: Ordered (signaling) \n
		1951	/// 0x18: Equal (unordered, signaling) \n
		1952	/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n
		1953	/// 0x1A: Not-greater-than (unordered, non-signaling) \n
		1954	/// 0x1B: False (ordered, signaling) \n
		1955	/// 0x1C: Not-equal (ordered, signaling) \n
		1956	/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n
		1957	/// 0x1E: Greater-than (ordered, non-signaling) \n
		1958	/// 0x1F: True (unordered, signaling)
		1959	/// \returns A 128-bit vector of [4 x float] containing the comparison results.
		1960	#define _mm_cmp_ss(a, b, c) \
		1961	((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \
		1962	(__v4sf)(__m128)(b), (c)))
		1963
		1964	/// Takes an [8 x i32] vector and returns the vector element value
		1965	/// indexed by the immediate constant operand.
		1966	///
		1967	/// \headerfile <x86intrin.h>
		1968	///
		1969	/// \code
		1970	/// int _mm256_extract_epi32(__m256i X, const int N);
		1971	/// \endcode
		1972	///
		1973	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
		1974	/// instruction.
		1975	///
		1976	/// \param X
		1977	/// A 256-bit vector of [8 x i32].
		1978	/// \param N
		1979	/// An immediate integer operand with bits [2:0] determining which vector
		1980	/// element is extracted and returned.
		1981	/// \returns A 32-bit integer containing the 32-bit element extracted from
		1982	/// \a X.
		1983	#define _mm256_extract_epi32(X, N) \
		1984	((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N)))
		1985
		1986	/// Takes a [16 x i16] vector and returns the vector element value
		1987	/// indexed by the immediate constant operand.
		1988	///
		1989	/// \headerfile <x86intrin.h>
		1990	///
		1991	/// \code
		1992	/// int _mm256_extract_epi16(__m256i X, const int N);
		1993	/// \endcode
		1994	///
		1995	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
		1996	/// instruction.
		1997	///
		1998	/// \param X
		1999	/// A 256-bit integer vector of [16 x i16].
		2000	/// \param N
		2001	/// An immediate integer operand with bits [3:0] determining which vector
		2002	/// element is extracted and returned.
		2003	/// \returns A 32-bit integer containing the extracted 16-bit element,
		2004	/// zero-extended (the result is cast through unsigned short).
		2005	#define _mm256_extract_epi16(X, N) \
		2006	((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \
		2007	(int)(N)))
		2008
		2009	/// Takes a [32 x i8] vector and returns the vector element value
		2010	/// indexed by the immediate constant operand.
		2011	///
		2012	/// \headerfile <x86intrin.h>
		2013	///
		2014	/// \code
		2015	/// int _mm256_extract_epi8(__m256i X, const int N);
		2016	/// \endcode
		2017	///
		2018	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
		2019	/// instruction.
		2020	///
		2021	/// \param X
		2022	/// A 256-bit integer vector of [32 x i8].
		2023	/// \param N
		2024	/// An immediate integer operand with bits [4:0] determining which vector
		2025	/// element is extracted and returned.
		2026	/// \returns A 32-bit integer containing the extracted 8-bit element,
		2027	/// zero-extended (the result is cast through unsigned char).
		2028	#define _mm256_extract_epi8(X, N) \
		2029	((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \
		2030	(int)(N)))
		2031
		2032	#ifdef __x86_64__
		2033	/// Takes a [4 x i64] vector and returns the vector element value
		2034	/// indexed by the immediate constant operand (x86-64 targets only).
		2035	///
		2036	/// \headerfile <x86intrin.h>
		2037	///
		2038	/// \code
		2039	/// long long _mm256_extract_epi64(__m256i X, const int N);
		2040	/// \endcode
		2041	///
		2042	/// This intrinsic corresponds to the <c> VEXTRACTF128+COMPOSITE </c>
		2043	/// instruction.
		2044	///
		2045	/// \param X
		2046	/// A 256-bit integer vector of [4 x i64].
		2047	/// \param N
		2048	/// An immediate integer operand with bits [1:0] determining which vector
		2049	/// element is extracted and returned.
		2050	/// \returns A 64-bit integer containing the 64-bit element extracted from
		2051	/// \a X.
		2052	#define _mm256_extract_epi64(X, N) \
		2053	((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N)))
		2054	#endif
		2055
		2056	/// Takes an [8 x i32] vector and replaces the vector element value
		2057	/// indexed by the immediate constant operand with a new value. Returns the
		2058	/// modified vector.
		2059	///
		2060	/// \headerfile <x86intrin.h>
		2061	///
		2062	/// \code
		2063	/// __m256i _mm256_insert_epi32(__m256i X, int I, const int N);
		2064	/// \endcode
		2065	///
		2066	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
		2067	/// instruction.
		2068	///
		2069	/// \param X
		2070	/// A 256-bit vector of [8 x i32] to be used by the insert operation.
		2071	/// \param I
		2072	/// An integer value. The replacement value for the insert operation.
		2073	/// \param N
		2074	/// An immediate integer specifying the index of the vector element to be
		2075	/// replaced.
		2076	/// \returns A copy of vector \a X, after replacing its element indexed by
		2077	/// \a N with \a I.
		2078	#define _mm256_insert_epi32(X, I, N) \
		2079	((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \
		2080	(int)(I), (int)(N)))
		2081
		2082
		2083	/// Takes a [16 x i16] vector and replaces the vector element value
		2084	/// indexed by the immediate constant operand with a new value. Returns the
		2085	/// modified vector.
		2086	///
		2087	/// \headerfile <x86intrin.h>
		2088	///
		2089	/// \code
		2090	/// __m256i _mm256_insert_epi16(__m256i X, int I, const int N);
		2091	/// \endcode
		2092	///
		2093	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
		2094	/// instruction.
		2095	///
		2096	/// \param X
		2097	/// A 256-bit vector of [16 x i16] to be used by the insert operation.
		2098	/// \param I
		2099	/// An integer (passed as int) supplying the i16 replacement value.
		2100	/// \param N
		2101	/// An immediate integer specifying the index of the vector element to be
		2102	/// replaced.
		2103	/// \returns A copy of vector \a X, after replacing its element indexed by
		2104	/// \a N with \a I.
		2105	#define _mm256_insert_epi16(X, I, N) \
		2106	((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \
		2107	(int)(I), (int)(N)))
		2108
		2109	/// Takes a [32 x i8] vector and replaces the vector element value
		2110	/// indexed by the immediate constant operand with a new value. Returns the
		2111	/// modified vector.
		2112	///
		2113	/// \headerfile <x86intrin.h>
		2114	///
		2115	/// \code
		2116	/// __m256i _mm256_insert_epi8(__m256i X, int I, const int N);
		2117	/// \endcode
		2118	///
		2119	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
		2120	/// instruction.
		2121	///
		2122	/// \param X
		2123	/// A 256-bit vector of [32 x i8] to be used by the insert operation.
		2124	/// \param I
		2125	/// An integer (passed as int) supplying the i8 replacement value.
		2126	/// \param N
		2127	/// An immediate integer specifying the index of the vector element to be
		2128	/// replaced.
		2129	/// \returns A copy of vector \a X, after replacing its element indexed by
		2130	/// \a N with \a I.
		2131	#define _mm256_insert_epi8(X, I, N) \
		2132	((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \
		2133	(int)(I), (int)(N)))
		2134
		2135	#ifdef __x86_64__
		2136	/// Takes a [4 x i64] vector and replaces the vector element value
		2137	/// indexed by the immediate constant operand with a new value. Returns the
		2138	/// modified vector.
		2139	///
		2140	/// \headerfile <x86intrin.h>
		2141	///
		2142	/// \code
		2143	/// __m256i _mm256_insert_epi64(__m256i X, long long I, const int N);
		2144	/// \endcode
		2145	///
		2146	/// This intrinsic corresponds to the <c> VINSERTF128+COMPOSITE </c>
		2147	/// instruction.
		2148	///
		2149	/// \param X
		2150	/// A vector of [4 x i64] to be used by the insert operation.
		2151	/// \param I
		2152	/// A 64-bit integer value. The replacement value for the insert operation.
		2153	/// \param N
		2154	/// An immediate integer specifying the index of the vector element to be
		2155	/// replaced.
		2156	/// \returns A copy of vector \a X, after replacing its element indexed by
		2157	/// \a N with \a I.
		2158	#define _mm256_insert_epi64(X, I, N) \
		2159	((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \
		2160	(long long)(I), (int)(N))) /* Unlike the epi8/epi16 forms, I is cast to long long (not int) so the full 64-bit value is inserted; each argument expands exactly once. */
		2161	#endif
		2162
		2163	/* Conversion */
		2164	/// Converts a vector of [4 x i32] into a vector of [4 x double].
		2165	///
		2166	/// \headerfile <x86intrin.h>
		2167	///
		2168	/// This intrinsic corresponds to the <c> VCVTDQ2PD </c> instruction.
		2169	///
		2170	/// \param __a
		2171	/// A 128-bit integer vector of [4 x i32].
		2172	/// \returns A 256-bit vector of [4 x double] containing the converted values.
		2173	static __inline __m256d __DEFAULT_FN_ATTRS
		2174	_mm256_cvtepi32_pd(__m128i __a)
		2175	{ /* Element-wise conversion: __a is viewed as __v4si and each lane becomes one double of the __v4df result. */
		2176	return (__m256d)__builtin_convertvector((__v4si)__a, __v4df);
		2177	}
		2178
		2179	/// Converts a vector of [8 x i32] into a vector of [8 x float].
		2180	///
		2181	/// \headerfile <x86intrin.h>
		2182	///
		2183	/// This intrinsic corresponds to the <c> VCVTDQ2PS </c> instruction.
		2184	///
		2185	/// \param __a
		2186	/// A 256-bit integer vector of [8 x i32].
		2187	/// \returns A 256-bit vector of [8 x float] containing the converted values.
		2188	static __inline __m256 __DEFAULT_FN_ATTRS
		2189	_mm256_cvtepi32_ps(__m256i __a)
		2190	{ /* Element-wise conversion: __a is viewed as eight ints (__v8si) and each lane becomes one float of the __v8sf result. */
		2191	return (__m256)__builtin_convertvector((__v8si)__a, __v8sf);
		2192	}
		2193
		2194	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of
		2195	/// [4 x float].
		2196	///
		2197	/// \headerfile <x86intrin.h>
		2198	///
		2199	/// This intrinsic corresponds to the <c> VCVTPD2PS </c> instruction.
		2200	///
		2201	/// \param __a
		2202	/// A 256-bit vector of [4 x double].
		2203	/// \returns A 128-bit vector of [4 x float] containing the converted values.
		2204	static __inline __m128 __DEFAULT_FN_ATTRS
		2205	_mm256_cvtpd_ps(__m256d __a)
		2206	{ /* Narrowing conversion: the 256-bit input is passed as __v4df and the builtin's result is returned as a 128-bit __m128. */
		2207	return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a);
		2208	}
		2209
		2210	/// Converts a vector of [8 x float] into a vector of [8 x i32].
		2211	///
		2212	/// \headerfile <x86intrin.h>
		2213	///
		2214	/// This intrinsic corresponds to the <c> VCVTPS2DQ </c> instruction.
		2215	///
		2216	/// \param __a
		2217	/// A 256-bit vector of [8 x float].
		2218	/// \returns A 256-bit integer vector containing the converted values.
		2219	static __inline __m256i __DEFAULT_FN_ATTRS
		2220	_mm256_cvtps_epi32(__m256 __a)
		2221	{ /* NOTE(review): the doc comment does not state how inexact values are rounded; presumably MXCSR-controlled like _mm256_cvtpd_epi32 -- confirm. */
		2222	return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a);
		2223	}
		2224
		2225	/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4
		2226	/// x double].
		2227	///
		2228	/// \headerfile <x86intrin.h>
		2229	///
		2230	/// This intrinsic corresponds to the <c> VCVTPS2PD </c> instruction.
		2231	///
		2232	/// \param __a
		2233	/// A 128-bit vector of [4 x float].
		2234	/// \returns A 256-bit vector of [4 x double] containing the converted values.
		2235	static __inline __m256d __DEFAULT_FN_ATTRS
		2236	_mm256_cvtps_pd(__m128 __a)
		2237	{ /* Element-wise widening: __a is viewed as __v4sf and each lane becomes one double of the __v4df result. */
		2238	return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df);
		2239	}
		2240
		2241	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
		2242	/// x i32], truncating the result by rounding towards zero when it is
		2243	/// inexact.
		2244	///
		2245	/// \headerfile <x86intrin.h>
		2246	///
		2247	/// This intrinsic corresponds to the <c> VCVTTPD2DQ </c> instruction.
		2248	///
		2249	/// \param __a
		2250	/// A 256-bit vector of [4 x double].
		2251	/// \returns A 128-bit integer vector containing the converted values.
		2252	static __inline __m128i __DEFAULT_FN_ATTRS
		2253	_mm256_cvttpd_epi32(__m256d __a)
		2254	{ /* Truncating form (cvttpd2dq builtin); _mm256_cvtpd_epi32 is the counterpart documented as using the MXCSR rounding mode. */
		2255	return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a);
		2256	}
		2257
		2258	/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4
		2259	/// x i32]. When a conversion is inexact, the value returned is rounded
		2260	/// according to the rounding control bits in the MXCSR register.
		2261	///
		2262	/// \headerfile <x86intrin.h>
		2263	///
		2264	/// This intrinsic corresponds to the <c> VCVTPD2DQ </c> instruction.
		2265	///
		2266	/// \param __a
		2267	/// A 256-bit vector of [4 x double].
		2268	/// \returns A 128-bit integer vector containing the converted values.
		2269	static __inline __m128i __DEFAULT_FN_ATTRS
		2270	_mm256_cvtpd_epi32(__m256d __a)
		2271	{ /* Non-truncating form (cvtpd2dq builtin); _mm256_cvttpd_epi32 is the counterpart documented as rounding toward zero. */
		2272	return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a);
		2273	}
		2274
		2275	/// Converts a vector of [8 x float] into a vector of [8 x i32],
		2276	/// truncating the result by rounding towards zero when it is inexact.
		2277	///
		2278	/// \headerfile <x86intrin.h>
		2279	///
		2280	/// This intrinsic corresponds to the <c> VCVTTPS2DQ </c> instruction.
		2281	///
		2282	/// \param __a
		2283	/// A 256-bit vector of [8 x float].
		2284	/// \returns A 256-bit integer vector containing the converted values.
		2285	static __inline __m256i __DEFAULT_FN_ATTRS
		2286	_mm256_cvttps_epi32(__m256 __a)
		2287	{ /* Truncating form (cvttps2dq builtin); _mm256_cvtps_epi32 uses the non-truncating cvtps2dq builtin instead. */
		2288	return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a);
		2289	}
		2290
		2291	/// Returns the first element of the input vector of [4 x double].
		2292	///
		2293	/// \headerfile <x86intrin.h>
		2294	///
		2295	/// This intrinsic is a utility function and does not correspond to a specific
		2296	/// instruction.
		2297	///
		2298	/// \param __a
		2299	/// A 256-bit vector of [4 x double].
		2300	/// \returns A 64-bit double containing the first element of the input vector.
		2301	static __inline double __DEFAULT_FN_ATTRS
		2302	_mm256_cvtsd_f64(__m256d __a)
		2303	{ /* __m256d is itself a vector of double, so it can be subscripted directly; index 0 is the "first element". */
		2304	return __a[0];
		2305	}
		2306
		2307	/// Returns the first element of the input vector of [8 x i32].
		2308	///
		2309	/// \headerfile <x86intrin.h>
		2310	///
		2311	/// This intrinsic is a utility function and does not correspond to a specific
		2312	/// instruction.
		2313	///
		2314	/// \param __a
		2315	/// A 256-bit vector of [8 x i32].
		2316	/// \returns A 32-bit integer containing the first element of the input vector.
		2317	static __inline int __DEFAULT_FN_ATTRS
		2318	_mm256_cvtsi256_si32(__m256i __a)
		2319	{
		2320	__v8si __b = (__v8si)__a; /* __m256i is typedef'd as a vector of long long, so __a[0] would be a 64-bit element; view it as eight ints first. */
		2321	return __b[0];
		2322	}
		2323
		2324	/// Returns the first element of the input vector of [8 x float].
		2325	///
		2326	/// \headerfile <x86intrin.h>
		2327	///
		2328	/// This intrinsic is a utility function and does not correspond to a specific
		2329	/// instruction.
		2330	///
		2331	/// \param __a
		2332	/// A 256-bit vector of [8 x float].
		2333	/// \returns A 32-bit float containing the first element of the input vector.
		2334	static __inline float __DEFAULT_FN_ATTRS
		2335	_mm256_cvtss_f32(__m256 __a)
		2336	{ /* __m256 is itself a vector of float, so it can be subscripted directly; index 0 is the "first element". */
		2337	return __a[0];
		2338	}
		2339
		2340	/* Vector replicate */
		2341	/// Moves and duplicates odd-indexed values from a 256-bit vector of
		2342	/// [8 x float] to float values in a 256-bit vector of [8 x float].
		2343	///
		2344	/// \headerfile <x86intrin.h>
		2345	///
		2346	/// This intrinsic corresponds to the <c> VMOVSHDUP </c> instruction.
		2347	///
		2348	/// \param __a
		2349	/// A 256-bit vector of [8 x float]. \n
		2350	/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of
		2351	/// the return value. \n
		2352	/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of
		2353	/// the return value. \n
		2354	/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the
		2355	/// return value. \n
		2356	/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the
		2357	/// return value.
		2358	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
		2359	/// values.
		2360	static __inline __m256 __DEFAULT_FN_ATTRS
		2361	_mm256_movehdup_ps(__m256 __a)
		2362	{ /* Both shuffle operands are __a and every index is below 8, so only __a is read: each odd element (1,3,5,7) fills its own slot and the even slot below it. */
		2363	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7);
		2364	}
		2365
		2366	/// Moves and duplicates even-indexed values from a 256-bit vector of
		2367	/// [8 x float] to float values in a 256-bit vector of [8 x float].
		2368	///
		2369	/// \headerfile <x86intrin.h>
		2370	///
		2371	/// This intrinsic corresponds to the <c> VMOVSLDUP </c> instruction.
		2372	///
		2373	/// \param __a
		2374	/// A 256-bit vector of [8 x float]. \n
		2375	/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of
		2376	/// the return value. \n
		2377	/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of
		2378	/// the return value. \n
		2379	/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the
		2380	/// return value. \n
		2381	/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the
		2382	/// return value.
		2383	/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated
		2384	/// values.
		2385	static __inline __m256 __DEFAULT_FN_ATTRS
		2386	_mm256_moveldup_ps(__m256 __a)
		2387	{ /* Both shuffle operands are __a and every index is below 8, so only __a is read: each even element (0,2,4,6) fills its own slot and the odd slot above it. */
		2388	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6);
		2389	}
		2390
		2391	/// Moves and duplicates double-precision floating point values from a
		2392	/// 256-bit vector of [4 x double] to double-precision values in a 256-bit
		2393	/// vector of [4 x double].
		2394	///
		2395	/// \headerfile <x86intrin.h>
		2396	///
		2397	/// This intrinsic corresponds to the <c> VMOVDDUP </c> instruction.
		2398	///
		2399	/// \param __a
		2400	/// A 256-bit vector of [4 x double]. \n
		2401	/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the
		2402	/// return value. \n
		2403	/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of
		2404	/// the return value.
		2405	/// \returns A 256-bit vector of [4 x double] containing the moved and
		2406	/// duplicated values.
		2407	static __inline __m256d __DEFAULT_FN_ATTRS
		2408	_mm256_movedup_pd(__m256d __a)
		2409	{ /* Both shuffle operands are __a and every index is below 4, so only __a is read: elements 0 and 2 are each duplicated into the slot above them. */
		2410	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2);
		2411	}
		2412
		2413	/* Unpack and Interleave */
		2414	/// Unpacks the odd-indexed vector elements from two 256-bit vectors of
		2415	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
		2416	///
		2417	/// \headerfile <x86intrin.h>
		2418	///
		2419	/// This intrinsic corresponds to the <c> VUNPCKHPD </c> instruction.
		2420	///
		2421	/// \param __a
		2422	/// A 256-bit floating-point vector of [4 x double]. \n
		2423	/// Bits [127:64] are written to bits [63:0] of the return value. \n
		2424	/// Bits [255:192] are written to bits [191:128] of the return value.
		2425	/// \param __b
		2426	/// A 256-bit floating-point vector of [4 x double]. \n
		2427	/// Bits [127:64] are written to bits [127:64] of the return value. \n
		2428	/// Bits [255:192] are written to bits [255:192] of the return value.
		2429	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
		2430	static __inline __m256d __DEFAULT_FN_ATTRS
		2431	_mm256_unpackhi_pd(__m256d __a, __m256d __b)
		2432	{ /* Shuffle indices 0-3 address __a and 4-7 address __b, so the result is a1,b1,a3,b3; the "+2" terms are the same picks repeated for the upper 128-bit half. */
		2433	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2);
		2434	}
		2435
		2436	/// Unpacks the even-indexed vector elements from two 256-bit vectors of
		2437	/// [4 x double] and interleaves them into a 256-bit vector of [4 x double].
		2438	///
		2439	/// \headerfile <x86intrin.h>
		2440	///
		2441	/// This intrinsic corresponds to the <c> VUNPCKLPD </c> instruction.
		2442	///
		2443	/// \param __a
		2444	/// A 256-bit floating-point vector of [4 x double]. \n
		2445	/// Bits [63:0] are written to bits [63:0] of the return value. \n
		2446	/// Bits [191:128] are written to bits [191:128] of the return value.
		2447	/// \param __b
		2448	/// A 256-bit floating-point vector of [4 x double]. \n
		2449	/// Bits [63:0] are written to bits [127:64] of the return value. \n
		2450	/// Bits [191:128] are written to bits [255:192] of the return value.
		2451	/// \returns A 256-bit vector of [4 x double] containing the interleaved values.
		2452	static __inline __m256d __DEFAULT_FN_ATTRS
		2453	_mm256_unpacklo_pd(__m256d __a, __m256d __b)
		2454	{ /* Shuffle indices 0-3 address __a and 4-7 address __b, so the result is a0,b0,a2,b2; the "+2" terms are the same picks repeated for the upper 128-bit half. */
		2455	return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2);
		2456	}
		2457
		2458	/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the
		2459	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
		2460	/// vector of [8 x float].
		2461	///
		2462	/// \headerfile <x86intrin.h>
		2463	///
		2464	/// This intrinsic corresponds to the <c> VUNPCKHPS </c> instruction.
		2465	///
		2466	/// \param __a
		2467	/// A 256-bit vector of [8 x float]. \n
		2468	/// Bits [95:64] are written to bits [31:0] of the return value. \n
		2469	/// Bits [127:96] are written to bits [95:64] of the return value. \n
		2470	/// Bits [223:192] are written to bits [159:128] of the return value. \n
		2471	/// Bits [255:224] are written to bits [223:192] of the return value.
		2472	/// \param __b
		2473	/// A 256-bit vector of [8 x float]. \n
		2474	/// Bits [95:64] are written to bits [63:32] of the return value. \n
		2475	/// Bits [127:96] are written to bits [127:96] of the return value. \n
		2476	/// Bits [223:192] are written to bits [191:160] of the return value. \n
		2477	/// Bits [255:224] are written to bits [255:224] of the return value.
		2478	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
		2479	static __inline __m256 __DEFAULT_FN_ATTRS
		2480	_mm256_unpackhi_ps(__m256 __a, __m256 __b)
		2481	{ /* Shuffle indices 0-7 address __a and 8-15 address __b, so the result is a2,b2,a3,b3,a6,b6,a7,b7. */
		2482	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1);
		2483	}
		2484
		2485	/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the
		2486	/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit
		2487	/// vector of [8 x float].
		2488	///
		2489	/// \headerfile <x86intrin.h>
		2490	///
		2491	/// This intrinsic corresponds to the <c> VUNPCKLPS </c> instruction.
		2492	///
		2493	/// \param __a
		2494	/// A 256-bit vector of [8 x float]. \n
		2495	/// Bits [31:0] are written to bits [31:0] of the return value. \n
		2496	/// Bits [63:32] are written to bits [95:64] of the return value. \n
		2497	/// Bits [159:128] are written to bits [159:128] of the return value. \n
		2498	/// Bits [191:160] are written to bits [223:192] of the return value.
		2499	/// \param __b
		2500	/// A 256-bit vector of [8 x float]. \n
		2501	/// Bits [31:0] are written to bits [63:32] of the return value. \n
		2502	/// Bits [63:32] are written to bits [127:96] of the return value. \n
		2503	/// Bits [159:128] are written to bits [191:160] of the return value. \n
		2504	/// Bits [191:160] are written to bits [255:224] of the return value.
		2505	/// \returns A 256-bit vector of [8 x float] containing the interleaved values.
		2506	static __inline __m256 __DEFAULT_FN_ATTRS
		2507	_mm256_unpacklo_ps(__m256 __a, __m256 __b)
		2508	{ /* Shuffle indices 0-7 address __a and 8-15 address __b, so the result is a0,b0,a1,b1,a4,b4,a5,b5. */
		2509	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1);
		2510	}
		2511
		2512	/* Bit Test */
		2513	/// Given two 128-bit floating-point vectors of [2 x double], perform an
		2514	/// element-by-element comparison of the double-precision element in the
		2515	/// first source vector and the corresponding element in the second source
		2516	/// vector.
		2517	///
		2518	/// The EFLAGS register is updated as follows: \n
		2519	/// If there is at least one pair of double-precision elements where the
		2520	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2521	/// ZF flag is set to 1. \n
		2522	/// If there is at least one pair of double-precision elements where the
		2523	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2524	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2525	/// This intrinsic returns the value of the ZF flag.
		2526	///
		2527	/// \headerfile <x86intrin.h>
		2528	///
		2529	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
		2530	///
		2531	/// \param __a
		2532	/// A 128-bit vector of [2 x double].
		2533	/// \param __b
		2534	/// A 128-bit vector of [2 x double].
		2535	/// \returns the ZF flag in the EFLAGS register.
		2536	static __inline int __DEFAULT_FN_ATTRS128
		2537	_mm_testz_pd(__m128d __a, __m128d __b)
		2538	{ /* 128-bit "z" form: forwards both operands as __v2df and returns the builtin's int result (the ZF value per the doc comment). */
		2539	return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b);
		2540	}
		2541
		2542	/// Given two 128-bit floating-point vectors of [2 x double], perform an
		2543	/// element-by-element comparison of the double-precision element in the
		2544	/// first source vector and the corresponding element in the second source
		2545	/// vector.
		2546	///
		2547	/// The EFLAGS register is updated as follows: \n
		2548	/// If there is at least one pair of double-precision elements where the
		2549	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2550	/// ZF flag is set to 1. \n
		2551	/// If there is at least one pair of double-precision elements where the
		2552	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2553	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2554	/// This intrinsic returns the value of the CF flag.
		2555	///
		2556	/// \headerfile <x86intrin.h>
		2557	///
		2558	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
		2559	///
		2560	/// \param __a
		2561	/// A 128-bit vector of [2 x double].
		2562	/// \param __b
		2563	/// A 128-bit vector of [2 x double].
		2564	/// \returns the CF flag in the EFLAGS register.
		2565	static __inline int __DEFAULT_FN_ATTRS128
		2566	_mm_testc_pd(__m128d __a, __m128d __b)
		2567	{ /* 128-bit "c" form: forwards both operands as __v2df and returns the builtin's int result (the CF value per the doc comment). */
		2568	return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b);
		2569	}
		2570
		2571	/// Given two 128-bit floating-point vectors of [2 x double], perform an
		2572	/// element-by-element comparison of the double-precision element in the
		2573	/// first source vector and the corresponding element in the second source
		2574	/// vector.
		2575	///
		2576	/// The EFLAGS register is updated as follows: \n
		2577	/// If there is at least one pair of double-precision elements where the
		2578	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2579	/// ZF flag is set to 1. \n
		2580	/// If there is at least one pair of double-precision elements where the
		2581	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2582	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2583	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
		2584	/// otherwise it returns 0.
		2585	///
		2586	/// \headerfile <x86intrin.h>
		2587	///
		2588	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
		2589	///
		2590	/// \param __a
		2591	/// A 128-bit vector of [2 x double].
		2592	/// \param __b
		2593	/// A 128-bit vector of [2 x double].
		2594	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
		2595	static __inline int __DEFAULT_FN_ATTRS128
		2596	_mm_testnzc_pd(__m128d __a, __m128d __b)
		2597	{ /* 128-bit "nzc" form: forwards both operands as __v2df; per the doc comment the result is 1 only when ZF and CF are both 0. */
		2598	return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b);
		2599	}
		2600
		2601	/// Given two 128-bit floating-point vectors of [4 x float], perform an
		2602	/// element-by-element comparison of the single-precision element in the
		2603	/// first source vector and the corresponding element in the second source
		2604	/// vector.
		2605	///
		2606	/// The EFLAGS register is updated as follows: \n
		2607	/// If there is at least one pair of single-precision elements where the
		2608	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2609	/// ZF flag is set to 1. \n
		2610	/// If there is at least one pair of single-precision elements where the
		2611	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2612	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2613	/// This intrinsic returns the value of the ZF flag.
		2614	///
		2615	/// \headerfile <x86intrin.h>
		2616	///
		2617	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
		2618	///
		2619	/// \param __a
		2620	/// A 128-bit vector of [4 x float].
		2621	/// \param __b
		2622	/// A 128-bit vector of [4 x float].
		2623	/// \returns the ZF flag.
		2624	static __inline int __DEFAULT_FN_ATTRS128
		2625	_mm_testz_ps(__m128 __a, __m128 __b)
		2626	{ /* 128-bit "z" form: forwards both operands as __v4sf and returns the builtin's int result (the ZF value per the doc comment). */
		2627	return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b);
		2628	}
		2629
		2630	/// Given two 128-bit floating-point vectors of [4 x float], perform an
		2631	/// element-by-element comparison of the single-precision element in the
		2632	/// first source vector and the corresponding element in the second source
		2633	/// vector.
		2634	///
		2635	/// The EFLAGS register is updated as follows: \n
		2636	/// If there is at least one pair of single-precision elements where the
		2637	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2638	/// ZF flag is set to 1. \n
		2639	/// If there is at least one pair of single-precision elements where the
		2640	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2641	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2642	/// This intrinsic returns the value of the CF flag.
		2643	///
		2644	/// \headerfile <x86intrin.h>
		2645	///
		2646	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
		2647	///
		2648	/// \param __a
		2649	/// A 128-bit vector of [4 x float].
		2650	/// \param __b
		2651	/// A 128-bit vector of [4 x float].
		2652	/// \returns the CF flag.
		2653	static __inline int __DEFAULT_FN_ATTRS128
		2654	_mm_testc_ps(__m128 __a, __m128 __b)
		2655	{ /* 128-bit "c" form: forwards both operands as __v4sf and returns the builtin's int result (the CF value per the doc comment). */
		2656	return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b);
		2657	}
		2658
		2659	/// Given two 128-bit floating-point vectors of [4 x float], perform an
		2660	/// element-by-element comparison of the single-precision element in the
		2661	/// first source vector and the corresponding element in the second source
		2662	/// vector.
		2663	///
		2664	/// The EFLAGS register is updated as follows: \n
		2665	/// If there is at least one pair of single-precision elements where the
		2666	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2667	/// ZF flag is set to 1. \n
		2668	/// If there is at least one pair of single-precision elements where the
		2669	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2670	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2671	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
		2672	/// otherwise it returns 0.
		2673	///
		2674	/// \headerfile <x86intrin.h>
		2675	///
		2676	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
		2677	///
		2678	/// \param __a
		2679	/// A 128-bit vector of [4 x float].
		2680	/// \param __b
		2681	/// A 128-bit vector of [4 x float].
		2682	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
		2683	static __inline int __DEFAULT_FN_ATTRS128
		2684	_mm_testnzc_ps(__m128 __a, __m128 __b)
		2685	{ /* 128-bit "nzc" form: forwards both operands as __v4sf; per the doc comment the result is 1 only when ZF and CF are both 0. */
		2686	return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b);
		2687	}
		2688
		2689	/// Given two 256-bit floating-point vectors of [4 x double], perform an
		2690	/// element-by-element comparison of the double-precision elements in the
		2691	/// first source vector and the corresponding elements in the second source
		2692	/// vector.
		2693	///
		2694	/// The EFLAGS register is updated as follows: \n
		2695	/// If there is at least one pair of double-precision elements where the
		2696	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2697	/// ZF flag is set to 1. \n
		2698	/// If there is at least one pair of double-precision elements where the
		2699	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2700	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2701	/// This intrinsic returns the value of the ZF flag.
		2702	///
		2703	/// \headerfile <x86intrin.h>
		2704	///
		2705	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
		2706	///
		2707	/// \param __a
		2708	/// A 256-bit vector of [4 x double].
		2709	/// \param __b
		2710	/// A 256-bit vector of [4 x double].
		2711	/// \returns the ZF flag.
		2712	static __inline int __DEFAULT_FN_ATTRS
		2713	_mm256_testz_pd(__m256d __a, __m256d __b)
		2714	{ /* 256-bit "z" form: forwards both operands as __v4df and returns the builtin's int result (the ZF value per the doc comment). */
		2715	return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b);
		2716	}
		2717
		2718	/// Given two 256-bit floating-point vectors of [4 x double], perform an
		2719	/// element-by-element comparison of the double-precision elements in the
		2720	/// first source vector and the corresponding elements in the second source
		2721	/// vector.
		2722	///
		2723	/// The EFLAGS register is updated as follows: \n
		2724	/// If there is at least one pair of double-precision elements where the
		2725	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2726	/// ZF flag is set to 1. \n
		2727	/// If there is at least one pair of double-precision elements where the
		2728	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2729	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2730	/// This intrinsic returns the value of the CF flag.
		2731	///
		2732	/// \headerfile <x86intrin.h>
		2733	///
		2734	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
		2735	///
		2736	/// \param __a
		2737	/// A 256-bit vector of [4 x double].
		2738	/// \param __b
		2739	/// A 256-bit vector of [4 x double].
		2740	/// \returns the CF flag.
		2741	static __inline int __DEFAULT_FN_ATTRS
		2742	_mm256_testc_pd(__m256d __a, __m256d __b)
		2743	{ /* 256-bit "c" form: forwards both operands as __v4df and returns the builtin's int result (the CF value per the doc comment). */
		2744	return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b);
		2745	}
		2746
		2747	/// Given two 256-bit floating-point vectors of [4 x double], perform an
		2748	/// element-by-element comparison of the double-precision elements in the
		2749	/// first source vector and the corresponding elements in the second source
		2750	/// vector.
		2751	///
		2752	/// The EFLAGS register is updated as follows: \n
		2753	/// If there is at least one pair of double-precision elements where the
		2754	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2755	/// ZF flag is set to 1. \n
		2756	/// If there is at least one pair of double-precision elements where the
		2757	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2758	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2759	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
		2760	/// otherwise it returns 0.
		2761	///
		2762	/// \headerfile <x86intrin.h>
		2763	///
		2764	/// This intrinsic corresponds to the <c> VTESTPD </c> instruction.
		2765	///
		2766	/// \param __a
		2767	/// A 256-bit vector of [4 x double].
		2768	/// \param __b
		2769	/// A 256-bit vector of [4 x double].
		2770	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
		2771	static __inline int __DEFAULT_FN_ATTRS
		2772	_mm256_testnzc_pd(__m256d __a, __m256d __b)
		2773	{ /* 256-bit "nzc" form: forwards both operands as __v4df; per the doc comment the result is 1 only when ZF and CF are both 0. */
		2774	return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b);
		2775	}
		2776
		2777	/// Given two 256-bit floating-point vectors of [8 x float], perform an
		2778	/// element-by-element comparison of the single-precision element in the
		2779	/// first source vector and the corresponding element in the second source
		2780	/// vector.
		2781	///
		2782	/// The EFLAGS register is updated as follows: \n
		2783	/// If there is at least one pair of single-precision elements where the
		2784	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2785	/// ZF flag is set to 1. \n
		2786	/// If there is at least one pair of single-precision elements where the
		2787	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2788	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2789	/// This intrinsic returns the value of the ZF flag.
		2790	///
		2791	/// \headerfile <x86intrin.h>
		2792	///
		2793	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
		2794	///
		2795	/// \param __a
		2796	/// A 256-bit vector of [8 x float].
		2797	/// \param __b
		2798	/// A 256-bit vector of [8 x float].
		2799	/// \returns the ZF flag.
		2800	static __inline int __DEFAULT_FN_ATTRS
		2801	_mm256_testz_ps(__m256 __a, __m256 __b)
		2802	{ /* 256-bit "z" form: forwards both operands as __v8sf and returns the builtin's int result (the ZF value per the doc comment). */
		2803	return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b);
		2804	}
		2805
		2806	/// Given two 256-bit floating-point vectors of [8 x float], perform an
		2807	/// element-by-element comparison of the single-precision element in the
		2808	/// first source vector and the corresponding element in the second source
		2809	/// vector.
		2810	///
		2811	/// The EFLAGS register is updated as follows: \n
		2812	/// If there is at least one pair of single-precision elements where the
		2813	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2814	/// ZF flag is set to 1. \n
		2815	/// If there is at least one pair of single-precision elements where the
		2816	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2817	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2818	/// This intrinsic returns the value of the CF flag.
		2819	///
		2820	/// \headerfile <x86intrin.h>
		2821	///
		2822	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
		2823	///
		2824	/// \param __a
		2825	/// A 256-bit vector of [8 x float].
		2826	/// \param __b
		2827	/// A 256-bit vector of [8 x float].
		2828	/// \returns the CF flag.
		2829	static __inline int __DEFAULT_FN_ATTRS
		2830	_mm256_testc_ps(__m256 __a, __m256 __b)
		2831	{ /* 256-bit "c" form: forwards both operands as __v8sf and returns the builtin's int result (the CF value per the doc comment). */
		2832	return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b);
		2833	}
		2834
		2835	/// Given two 256-bit floating-point vectors of [8 x float], perform an
		2836	/// element-by-element comparison of the single-precision elements in the
		2837	/// first source vector and the corresponding elements in the second source
		2838	/// vector.
		2839	///
		2840	/// The EFLAGS register is updated as follows: \n
		2841	/// If there is at least one pair of single-precision elements where the
		2842	/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the
		2843	/// ZF flag is set to 1. \n
		2844	/// If there is at least one pair of single-precision elements where the
		2845	/// sign-bit of the first element is 0 and the sign-bit of the second element
		2846	/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n
		2847	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
		2848	/// otherwise it returns 0.
		2849	///
		2850	/// \headerfile <x86intrin.h>
		2851	///
		2852	/// This intrinsic corresponds to the <c> VTESTPS </c> instruction.
		2853	///
		2854	/// \param __a
		2855	/// A 256-bit vector of [8 x float].
		2856	/// \param __b
		2857	/// A 256-bit vector of [8 x float].
		2858	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
		2859	static __inline int __DEFAULT_FN_ATTRS
		2860	_mm256_testnzc_ps(__m256 __a, __m256 __b)
		2861	{ /* 256-bit "nzc" form: forwards both operands as __v8sf; per the doc comment the result is 1 only when ZF and CF are both 0. */
		2862	return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b);
		2863	}
		2864
		2865	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
		2866	/// of the two source vectors.
		2867	///
		2868	/// The EFLAGS register is updated as follows: \n
		2869	/// If there is at least one pair of bits where both bits are 1, the ZF flag
		2870	/// is set to 0. Otherwise the ZF flag is set to 1. \n
		2871	/// If there is at least one pair of bits where the bit from the first source
		2872	/// vector is 0 and the bit from the second source vector is 1, the CF flag
		2873	/// is set to 0. Otherwise the CF flag is set to 1. \n
		2874	/// This intrinsic returns the value of the ZF flag.
		2875	///
		2876	/// \headerfile <x86intrin.h>
		2877	///
		2878	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
		2879	///
		2880	/// \param __a
		2881	/// A 256-bit integer vector.
		2882	/// \param __b
		2883	/// A 256-bit integer vector.
		2884	/// \returns the ZF flag.
		2885	static __inline int __DEFAULT_FN_ATTRS
		2886	_mm256_testz_si256(__m256i __a, __m256i __b)
		2887	{
		2888	return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b);
		2889	}
		2890
		2891	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
		2892	/// of the two source vectors.
		2893	///
		2894	/// The EFLAGS register is updated as follows: \n
		2895	/// If there is at least one pair of bits where both bits are 1, the ZF flag
		2896	/// is set to 0. Otherwise the ZF flag is set to 1. \n
		2897	/// If there is at least one pair of bits where the bit from the first source
		2898	/// vector is 0 and the bit from the second source vector is 1, the CF flag
		2899	/// is set to 0. Otherwise the CF flag is set to 1. \n
		2900	/// This intrinsic returns the value of the CF flag.
		2901	///
		2902	/// \headerfile <x86intrin.h>
		2903	///
		2904	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
		2905	///
		2906	/// \param __a
		2907	/// A 256-bit integer vector.
		2908	/// \param __b
		2909	/// A 256-bit integer vector.
		2910	/// \returns the CF flag.
		2911	static __inline int __DEFAULT_FN_ATTRS
		2912	_mm256_testc_si256(__m256i __a, __m256i __b)
		2913	{
		2914	return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b);
		2915	}
		2916
		2917	/// Given two 256-bit integer vectors, perform a bit-by-bit comparison
		2918	/// of the two source vectors.
		2919	///
		2920	/// The EFLAGS register is updated as follows: \n
		2921	/// If there is at least one pair of bits where both bits are 1, the ZF flag
		2922	/// is set to 0. Otherwise the ZF flag is set to 1. \n
		2923	/// If there is at least one pair of bits where the bit from the first source
		2924	/// vector is 0 and the bit from the second source vector is 1, the CF flag
		2925	/// is set to 0. Otherwise the CF flag is set to 1. \n
		2926	/// This intrinsic returns 1 if both the ZF and CF flags are set to 0,
		2927	/// otherwise it returns 0.
		2928	///
		2929	/// \headerfile <x86intrin.h>
		2930	///
		2931	/// This intrinsic corresponds to the <c> VPTEST </c> instruction.
		2932	///
		2933	/// \param __a
		2934	/// A 256-bit integer vector.
		2935	/// \param __b
		2936	/// A 256-bit integer vector.
		2937	/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0.
		2938	static __inline int __DEFAULT_FN_ATTRS
		2939	_mm256_testnzc_si256(__m256i __a, __m256i __b)
		2940	{
		2941	return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b);
		2942	}
		2943
		2944	/* Vector extract sign mask */
		2945	/// Extracts the sign bits of double-precision floating point elements
		2946	/// in a 256-bit vector of [4 x double] and writes them to the lower order
		2947	/// bits of the return value.
		2948	///
		2949	/// \headerfile <x86intrin.h>
		2950	///
		2951	/// This intrinsic corresponds to the <c> VMOVMSKPD </c> instruction.
		2952	///
		2953	/// \param __a
		2954	/// A 256-bit vector of [4 x double] containing the double-precision
		2955	/// floating point values with sign bits to be extracted.
		2956	/// \returns The sign bits from the operand, written to bits [3:0].
		2957	static __inline int __DEFAULT_FN_ATTRS
		2958	_mm256_movemask_pd(__m256d __a)
		2959	{
		2960	return __builtin_ia32_movmskpd256((__v4df)__a);
		2961	}
		2962
		2963	/// Extracts the sign bits of single-precision floating point elements
		2964	/// in a 256-bit vector of [8 x float] and writes them to the lower order
		2965	/// bits of the return value.
		2966	///
		2967	/// \headerfile <x86intrin.h>
		2968	///
		2969	/// This intrinsic corresponds to the <c> VMOVMSKPS </c> instruction.
		2970	///
		2971	/// \param __a
		2972	/// A 256-bit vector of [8 x float] containing the single-precision floating
		2973	/// point values with sign bits to be extracted.
		2974	/// \returns The sign bits from the operand, written to bits [7:0].
		2975	static __inline int __DEFAULT_FN_ATTRS
		2976	_mm256_movemask_ps(__m256 __a)
		2977	{
		2978	return __builtin_ia32_movmskps256((__v8sf)__a);
		2979	}
		2980
		2981	/* Vector __zero */
		2982	/// Zeroes the contents of all XMM or YMM registers.
		2983	///
		2984	/// \headerfile <x86intrin.h>
		2985	///
		2986	/// This intrinsic corresponds to the <c> VZEROALL </c> instruction.
		2987	static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
		2988	_mm256_zeroall(void)
		2989	{
		2990	__builtin_ia32_vzeroall();
		2991	}
		2992
		2993	/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers.
		2994	///
		2995	/// \headerfile <x86intrin.h>
		2996	///
		2997	/// This intrinsic corresponds to the <c> VZEROUPPER </c> instruction.
		2998	static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx")))
		2999	_mm256_zeroupper(void)
		3000	{
		3001	__builtin_ia32_vzeroupper();
		3002	}
		3003
		3004	/* Vector load with broadcast */
		3005	/// Loads a scalar single-precision floating point value from the
		3006	/// specified address pointed to by \a __a and broadcasts it to the elements
		3007	/// of a [4 x float] vector.
		3008	///
		3009	/// \headerfile <x86intrin.h>
		3010	///
		3011	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
		3012	///
		3013	/// \param __a
		3014	/// The single-precision floating point value to be broadcast.
		3015	/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set
		3016	/// equal to the broadcast value.
		3017	static __inline __m128 __DEFAULT_FN_ATTRS128
		3018	_mm_broadcast_ss(float const *__a)
		3019	{
		3020	float __f = *__a;
		3021	return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f };
		3022	}
		3023
		3024	/// Loads a scalar double-precision floating point value from the
		3025	/// specified address pointed to by \a __a and broadcasts it to the elements
		3026	/// of a [4 x double] vector.
		3027	///
		3028	/// \headerfile <x86intrin.h>
		3029	///
		3030	/// This intrinsic corresponds to the <c> VBROADCASTSD </c> instruction.
		3031	///
		3032	/// \param __a
		3033	/// The double-precision floating point value to be broadcast.
		3034	/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set
		3035	/// equal to the broadcast value.
		3036	static __inline __m256d __DEFAULT_FN_ATTRS
		3037	_mm256_broadcast_sd(double const *__a)
		3038	{
		3039	double __d = *__a;
		3040	return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d };
		3041	}
		3042
		3043	/// Loads a scalar single-precision floating point value from the
		3044	/// specified address pointed to by \a __a and broadcasts it to the elements
		3045	/// of a [8 x float] vector.
		3046	///
		3047	/// \headerfile <x86intrin.h>
		3048	///
		3049	/// This intrinsic corresponds to the <c> VBROADCASTSS </c> instruction.
		3050	///
		3051	/// \param __a
		3052	/// The single-precision floating point value to be broadcast.
		3053	/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set
		3054	/// equal to the broadcast value.
		3055	static __inline __m256 __DEFAULT_FN_ATTRS
		3056	_mm256_broadcast_ss(float const *__a)
		3057	{
		3058	float __f = *__a;
		3059	return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f };
		3060	}
		3061
		3062	/// Loads the data from a 128-bit vector of [2 x double] from the
		3063	/// specified address pointed to by \a __a and broadcasts it to 128-bit
		3064	/// elements in a 256-bit vector of [4 x double].
		3065	///
		3066	/// \headerfile <x86intrin.h>
		3067	///
		3068	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
		3069	///
		3070	/// \param __a
		3071	/// The 128-bit vector of [2 x double] to be broadcast.
		3072	/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set
		3073	/// equal to the broadcast value.
		3074	static __inline __m256d __DEFAULT_FN_ATTRS
		3075	_mm256_broadcast_pd(__m128d const *__a)
		3076	{
		3077	__m128d __b = _mm_loadu_pd((const double *)__a);
		3078	return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b,
		3079	0, 1, 0, 1);
		3080	}
		3081
		3082	/// Loads the data from a 128-bit vector of [4 x float] from the
		3083	/// specified address pointed to by \a __a and broadcasts it to 128-bit
		3084	/// elements in a 256-bit vector of [8 x float].
		3085	///
		3086	/// \headerfile <x86intrin.h>
		3087	///
		3088	/// This intrinsic corresponds to the <c> VBROADCASTF128 </c> instruction.
		3089	///
		3090	/// \param __a
		3091	/// The 128-bit vector of [4 x float] to be broadcast.
		3092	/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set
		3093	/// equal to the broadcast value.
		3094	static __inline __m256 __DEFAULT_FN_ATTRS
		3095	_mm256_broadcast_ps(__m128 const *__a)
		3096	{
		3097	__m128 __b = _mm_loadu_ps((const float *)__a);
		3098	return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b,
		3099	0, 1, 2, 3, 0, 1, 2, 3);
		3100	}
		3101
		3102	/* SIMD load ops */
		3103	/// Loads 4 double-precision floating point values from a 32-byte aligned
		3104	/// memory location pointed to by \a __p into a vector of [4 x double].
		3105	///
		3106	/// \headerfile <x86intrin.h>
		3107	///
		3108	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
		3109	///
		3110	/// \param __p
		3111	/// A 32-byte aligned pointer to a memory location containing
		3112	/// double-precision floating point values.
		3113	/// \returns A 256-bit vector of [4 x double] containing the moved values.
		3114	static __inline __m256d __DEFAULT_FN_ATTRS
		3115	_mm256_load_pd(double const *__p)
		3116	{
		3117	return (const __m256d )__p;
		3118	}
		3119
		3120	/// Loads 8 single-precision floating point values from a 32-byte aligned
		3121	/// memory location pointed to by \a __p into a vector of [8 x float].
		3122	///
		3123	/// \headerfile <x86intrin.h>
		3124	///
		3125	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
		3126	///
		3127	/// \param __p
		3128	/// A 32-byte aligned pointer to a memory location containing float values.
		3129	/// \returns A 256-bit vector of [8 x float] containing the moved values.
		3130	static __inline __m256 __DEFAULT_FN_ATTRS
		3131	_mm256_load_ps(float const *__p)
		3132	{
		3133	return (const __m256 )__p;
		3134	}
		3135
		3136	/// Loads 4 double-precision floating point values from an unaligned
		3137	/// memory location pointed to by \a __p into a vector of [4 x double].
		3138	///
		3139	/// \headerfile <x86intrin.h>
		3140	///
		3141	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
		3142	///
		3143	/// \param __p
		3144	/// A pointer to a memory location containing double-precision floating
		3145	/// point values.
		3146	/// \returns A 256-bit vector of [4 x double] containing the moved values.
		3147	static __inline __m256d __DEFAULT_FN_ATTRS
		3148	_mm256_loadu_pd(double const *__p)
		3149	{
		3150	struct __loadu_pd {
		3151	__m256d_u __v;
		3152	} __attribute__((__packed__, __may_alias__));
		3153	return ((const struct __loadu_pd*)__p)->__v;
		3154	}
		3155
		3156	/// Loads 8 single-precision floating point values from an unaligned
		3157	/// memory location pointed to by \a __p into a vector of [8 x float].
		3158	///
		3159	/// \headerfile <x86intrin.h>
		3160	///
		3161	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
		3162	///
		3163	/// \param __p
		3164	/// A pointer to a memory location containing single-precision floating
		3165	/// point values.
		3166	/// \returns A 256-bit vector of [8 x float] containing the moved values.
		3167	static __inline __m256 __DEFAULT_FN_ATTRS
		3168	_mm256_loadu_ps(float const *__p)
		3169	{
		3170	struct __loadu_ps {
		3171	__m256_u __v;
		3172	} __attribute__((__packed__, __may_alias__));
		3173	return ((const struct __loadu_ps*)__p)->__v;
		3174	}
		3175
		3176	/// Loads 256 bits of integer data from a 32-byte aligned memory
		3177	/// location pointed to by \a __p into elements of a 256-bit integer vector.
		3178	///
		3179	/// \headerfile <x86intrin.h>
		3180	///
		3181	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
		3182	///
		3183	/// \param __p
		3184	/// A 32-byte aligned pointer to a 256-bit integer vector containing integer
		3185	/// values.
		3186	/// \returns A 256-bit integer vector containing the moved values.
		3187	static __inline __m256i __DEFAULT_FN_ATTRS
		3188	_mm256_load_si256(__m256i const *__p)
		3189	{
		3190	return *__p;
		3191	}
		3192
		3193	/// Loads 256 bits of integer data from an unaligned memory location
		3194	/// pointed to by \a __p into a 256-bit integer vector.
		3195	///
		3196	/// \headerfile <x86intrin.h>
		3197	///
		3198	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
		3199	///
		3200	/// \param __p
		3201	/// A pointer to a 256-bit integer vector containing integer values.
		3202	/// \returns A 256-bit integer vector containing the moved values.
		3203	static __inline __m256i __DEFAULT_FN_ATTRS
		3204	_mm256_loadu_si256(__m256i_u const *__p)
		3205	{
		3206	struct __loadu_si256 {
		3207	__m256i_u __v;
		3208	} __attribute__((__packed__, __may_alias__));
		3209	return ((const struct __loadu_si256*)__p)->__v;
		3210	}
		3211
		3212	/// Loads 256 bits of integer data from an unaligned memory location
		3213	/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may
		3214	/// perform better than \c _mm256_loadu_si256 when the data crosses a cache
		3215	/// line boundary.
		3216	///
		3217	/// \headerfile <x86intrin.h>
		3218	///
		3219	/// This intrinsic corresponds to the <c> VLDDQU </c> instruction.
		3220	///
		3221	/// \param __p
		3222	/// A pointer to a 256-bit integer vector containing integer values.
		3223	/// \returns A 256-bit integer vector containing the moved values.
		3224	static __inline __m256i __DEFAULT_FN_ATTRS
		3225	_mm256_lddqu_si256(__m256i_u const *__p)
		3226	{
		3227	return (__m256i)__builtin_ia32_lddqu256((char const *)__p);
		3228	}
		3229
		3230	/* SIMD store ops */
		3231	/// Stores double-precision floating point values from a 256-bit vector
		3232	/// of [4 x double] to a 32-byte aligned memory location pointed to by
		3233	/// \a __p.
		3234	///
		3235	/// \headerfile <x86intrin.h>
		3236	///
		3237	/// This intrinsic corresponds to the <c> VMOVAPD </c> instruction.
		3238	///
		3239	/// \param __p
		3240	/// A 32-byte aligned pointer to a memory location that will receive the
/// double-precision floating point values.
		3242	/// \param __a
		3243	/// A 256-bit vector of [4 x double] containing the values to be moved.
		3244	static __inline void __DEFAULT_FN_ATTRS
		3245	_mm256_store_pd(double *__p, __m256d __a)
		3246	{
		3247	(__m256d )__p = __a;
		3248	}
		3249
		3250	/// Stores single-precision floating point values from a 256-bit vector
		3251	/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p.
		3252	///
		3253	/// \headerfile <x86intrin.h>
		3254	///
		3255	/// This intrinsic corresponds to the <c> VMOVAPS </c> instruction.
		3256	///
		3257	/// \param __p
		3258	/// A 32-byte aligned pointer to a memory location that will receive the
		3259	/// float values.
		3260	/// \param __a
		3261	/// A 256-bit vector of [8 x float] containing the values to be moved.
		3262	static __inline void __DEFAULT_FN_ATTRS
		3263	_mm256_store_ps(float *__p, __m256 __a)
		3264	{
		3265	(__m256 )__p = __a;
		3266	}
		3267
		3268	/// Stores double-precision floating point values from a 256-bit vector
		3269	/// of [4 x double] to an unaligned memory location pointed to by \a __p.
		3270	///
		3271	/// \headerfile <x86intrin.h>
		3272	///
		3273	/// This intrinsic corresponds to the <c> VMOVUPD </c> instruction.
		3274	///
		3275	/// \param __p
		3276	/// A pointer to a memory location that will receive the double-precision
		3277	/// floating point values.
		3278	/// \param __a
		3279	/// A 256-bit vector of [4 x double] containing the values to be moved.
		3280	static __inline void __DEFAULT_FN_ATTRS
		3281	_mm256_storeu_pd(double *__p, __m256d __a)
		3282	{
		3283	struct __storeu_pd {
		3284	__m256d_u __v;
		3285	} __attribute__((__packed__, __may_alias__));
		3286	((struct __storeu_pd*)__p)->__v = __a;
		3287	}
		3288
		3289	/// Stores single-precision floating point values from a 256-bit vector
		3290	/// of [8 x float] to an unaligned memory location pointed to by \a __p.
		3291	///
		3292	/// \headerfile <x86intrin.h>
		3293	///
		3294	/// This intrinsic corresponds to the <c> VMOVUPS </c> instruction.
		3295	///
		3296	/// \param __p
		3297	/// A pointer to a memory location that will receive the float values.
		3298	/// \param __a
		3299	/// A 256-bit vector of [8 x float] containing the values to be moved.
		3300	static __inline void __DEFAULT_FN_ATTRS
		3301	_mm256_storeu_ps(float *__p, __m256 __a)
		3302	{
		3303	struct __storeu_ps {
		3304	__m256_u __v;
		3305	} __attribute__((__packed__, __may_alias__));
		3306	((struct __storeu_ps*)__p)->__v = __a;
		3307	}
		3308
		3309	/// Stores integer values from a 256-bit integer vector to a 32-byte
		3310	/// aligned memory location pointed to by \a __p.
		3311	///
		3312	/// \headerfile <x86intrin.h>
		3313	///
		3314	/// This intrinsic corresponds to the <c> VMOVDQA </c> instruction.
		3315	///
		3316	/// \param __p
		3317	/// A 32-byte aligned pointer to a memory location that will receive the
		3318	/// integer values.
		3319	/// \param __a
		3320	/// A 256-bit integer vector containing the values to be moved.
		3321	static __inline void __DEFAULT_FN_ATTRS
		3322	_mm256_store_si256(__m256i *__p, __m256i __a)
		3323	{
		3324	*__p = __a;
		3325	}
		3326
		3327	/// Stores integer values from a 256-bit integer vector to an unaligned
		3328	/// memory location pointed to by \a __p.
		3329	///
		3330	/// \headerfile <x86intrin.h>
		3331	///
		3332	/// This intrinsic corresponds to the <c> VMOVDQU </c> instruction.
		3333	///
		3334	/// \param __p
		3335	/// A pointer to a memory location that will receive the integer values.
		3336	/// \param __a
		3337	/// A 256-bit integer vector containing the values to be moved.
		3338	static __inline void __DEFAULT_FN_ATTRS
		3339	_mm256_storeu_si256(__m256i_u *__p, __m256i __a)
		3340	{
		3341	struct __storeu_si256 {
		3342	__m256i_u __v;
		3343	} __attribute__((__packed__, __may_alias__));
		3344	((struct __storeu_si256*)__p)->__v = __a;
		3345	}
		3346
		3347	/* Conditional load ops */
		3348	/// Conditionally loads double-precision floating point elements from a
		3349	/// memory location pointed to by \a __p into a 128-bit vector of
		3350	/// [2 x double], depending on the mask bits associated with each data
		3351	/// element.
		3352	///
		3353	/// \headerfile <x86intrin.h>
		3354	///
		3355	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
		3356	///
		3357	/// \param __p
		3358	/// A pointer to a memory location that contains the double-precision
		3359	/// floating point values.
		3360	/// \param __m
		3361	/// A 128-bit integer vector containing the mask. The most significant bit of
		3362	/// each data element represents the mask bits. If a mask bit is zero, the
		3363	/// corresponding value in the memory location is not loaded and the
		3364	/// corresponding field in the return value is set to zero.
		3365	/// \returns A 128-bit vector of [2 x double] containing the loaded values.
		3366	static __inline __m128d __DEFAULT_FN_ATTRS128
		3367	_mm_maskload_pd(double const *__p, __m128i __m)
		3368	{
		3369	return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m);
		3370	}
		3371
		3372	/// Conditionally loads double-precision floating point elements from a
		3373	/// memory location pointed to by \a __p into a 256-bit vector of
		3374	/// [4 x double], depending on the mask bits associated with each data
		3375	/// element.
		3376	///
		3377	/// \headerfile <x86intrin.h>
		3378	///
		3379	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
		3380	///
		3381	/// \param __p
		3382	/// A pointer to a memory location that contains the double-precision
		3383	/// floating point values.
		3384	/// \param __m
		3385	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
		3386	/// significant bit of each quadword element represents the mask bits. If a
		3387	/// mask bit is zero, the corresponding value in the memory location is not
		3388	/// loaded and the corresponding field in the return value is set to zero.
		3389	/// \returns A 256-bit vector of [4 x double] containing the loaded values.
		3390	static __inline __m256d __DEFAULT_FN_ATTRS
		3391	_mm256_maskload_pd(double const *__p, __m256i __m)
		3392	{
		3393	return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p,
		3394	(__v4di)__m);
		3395	}
		3396
		3397	/// Conditionally loads single-precision floating point elements from a
		3398	/// memory location pointed to by \a __p into a 128-bit vector of
		3399	/// [4 x float], depending on the mask bits associated with each data
		3400	/// element.
		3401	///
		3402	/// \headerfile <x86intrin.h>
		3403	///
		3404	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
		3405	///
		3406	/// \param __p
		3407	/// A pointer to a memory location that contains the single-precision
		3408	/// floating point values.
		3409	/// \param __m
		3410	/// A 128-bit integer vector containing the mask. The most significant bit of
		3411	/// each data element represents the mask bits. If a mask bit is zero, the
		3412	/// corresponding value in the memory location is not loaded and the
		3413	/// corresponding field in the return value is set to zero.
		3414	/// \returns A 128-bit vector of [4 x float] containing the loaded values.
		3415	static __inline __m128 __DEFAULT_FN_ATTRS128
		3416	_mm_maskload_ps(float const *__p, __m128i __m)
		3417	{
		3418	return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m);
		3419	}
		3420
		3421	/// Conditionally loads single-precision floating point elements from a
		3422	/// memory location pointed to by \a __p into a 256-bit vector of
		3423	/// [8 x float], depending on the mask bits associated with each data
		3424	/// element.
		3425	///
		3426	/// \headerfile <x86intrin.h>
		3427	///
		3428	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
		3429	///
		3430	/// \param __p
		3431	/// A pointer to a memory location that contains the single-precision
		3432	/// floating point values.
		3433	/// \param __m
		3434	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
		3435	/// significant bit of each dword element represents the mask bits. If a mask
		3436	/// bit is zero, the corresponding value in the memory location is not loaded
		3437	/// and the corresponding field in the return value is set to zero.
		3438	/// \returns A 256-bit vector of [8 x float] containing the loaded values.
		3439	static __inline __m256 __DEFAULT_FN_ATTRS
		3440	_mm256_maskload_ps(float const *__p, __m256i __m)
		3441	{
		3442	return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m);
		3443	}
		3444
		3445	/* Conditional store ops */
		3446	/// Moves single-precision floating point values from a 256-bit vector
		3447	/// of [8 x float] to a memory location pointed to by \a __p, according to
		3448	/// the specified mask.
		3449	///
		3450	/// \headerfile <x86intrin.h>
		3451	///
		3452	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
		3453	///
		3454	/// \param __p
		3455	/// A pointer to a memory location that will receive the float values.
		3456	/// \param __m
		3457	/// A 256-bit integer vector of [8 x dword] containing the mask. The most
		3458	/// significant bit of each dword element in the mask vector represents the
		3459	/// mask bits. If a mask bit is zero, the corresponding value from vector
		3460	/// \a __a is not stored and the corresponding field in the memory location
		3461	/// pointed to by \a __p is not changed.
		3462	/// \param __a
		3463	/// A 256-bit vector of [8 x float] containing the values to be stored.
		3464	static __inline void __DEFAULT_FN_ATTRS
		3465	_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a)
		3466	{
		3467	__builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a);
		3468	}
		3469
		3470	/// Moves double-precision values from a 128-bit vector of [2 x double]
		3471	/// to a memory location pointed to by \a __p, according to the specified
		3472	/// mask.
		3473	///
		3474	/// \headerfile <x86intrin.h>
		3475	///
		3476	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
		3477	///
		3478	/// \param __p
/// A pointer to a memory location that will receive the double-precision
/// floating point values.
		3480	/// \param __m
		3481	/// A 128-bit integer vector containing the mask. The most significant bit of
		3482	/// each field in the mask vector represents the mask bits. If a mask bit is
		3483	/// zero, the corresponding value from vector \a __a is not stored and the
		3484	/// corresponding field in the memory location pointed to by \a __p is not
		3485	/// changed.
		3486	/// \param __a
		3487	/// A 128-bit vector of [2 x double] containing the values to be stored.
		3488	static __inline void __DEFAULT_FN_ATTRS128
		3489	_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a)
		3490	{
		3491	__builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a);
		3492	}
		3493
		3494	/// Moves double-precision values from a 256-bit vector of [4 x double]
		3495	/// to a memory location pointed to by \a __p, according to the specified
		3496	/// mask.
		3497	///
		3498	/// \headerfile <x86intrin.h>
		3499	///
		3500	/// This intrinsic corresponds to the <c> VMASKMOVPD </c> instruction.
		3501	///
		3502	/// \param __p
/// A pointer to a memory location that will receive the double-precision
/// floating point values.
		3504	/// \param __m
		3505	/// A 256-bit integer vector of [4 x quadword] containing the mask. The most
		3506	/// significant bit of each quadword element in the mask vector represents
		3507	/// the mask bits. If a mask bit is zero, the corresponding value from vector
/// \a __a is not stored and the corresponding field in the memory location
		3509	/// pointed to by \a __p is not changed.
		3510	/// \param __a
		3511	/// A 256-bit vector of [4 x double] containing the values to be stored.
		3512	static __inline void __DEFAULT_FN_ATTRS
		3513	_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a)
		3514	{
		3515	__builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a);
		3516	}
		3517
		3518	/// Moves single-precision floating point values from a 128-bit vector
		3519	/// of [4 x float] to a memory location pointed to by \a __p, according to
		3520	/// the specified mask.
		3521	///
		3522	/// \headerfile <x86intrin.h>
		3523	///
		3524	/// This intrinsic corresponds to the <c> VMASKMOVPS </c> instruction.
		3525	///
		3526	/// \param __p
		3527	/// A pointer to a memory location that will receive the float values.
		3528	/// \param __m
		3529	/// A 128-bit integer vector containing the mask. The most significant bit of
		3530	/// each field in the mask vector represents the mask bits. If a mask bit is
/// zero, the corresponding value from vector \a __a is not stored and the
		3532	/// corresponding field in the memory location pointed to by \a __p is not
		3533	/// changed.
		3534	/// \param __a
		3535	/// A 128-bit vector of [4 x float] containing the values to be stored.
		3536	static __inline void __DEFAULT_FN_ATTRS128
		3537	_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
		3538	{
		3539	__builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a);
		3540	}
		3541
		3542	/* Cacheability support ops */
		3543	/// Moves integer data from a 256-bit integer vector to a 32-byte
		3544	/// aligned memory location. To minimize caching, the data is flagged as
		3545	/// non-temporal (unlikely to be used again soon).
		3546	///
		3547	/// \headerfile <x86intrin.h>
		3548	///
		3549	/// This intrinsic corresponds to the <c> VMOVNTDQ </c> instruction.
		3550	///
		3551	/// \param __a
		3552	/// A pointer to a 32-byte aligned memory location that will receive the
		3553	/// integer values.
		3554	/// \param __b
		3555	/// A 256-bit integer vector containing the values to be moved.
		3556	static __inline void __DEFAULT_FN_ATTRS
		3557	_mm256_stream_si256(__m256i *__a, __m256i __b)
		3558	{
		3559	typedef __v4di __v4di_aligned __attribute__((aligned(32)));
		3560	__builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
		3561	}
		3562
		3563	/// Moves double-precision values from a 256-bit vector of [4 x double]
		3564	/// to a 32-byte aligned memory location. To minimize caching, the data is
		3565	/// flagged as non-temporal (unlikely to be used again soon).
		3566	///
		3567	/// \headerfile <x86intrin.h>
		3568	///
		3569	/// This intrinsic corresponds to the <c> VMOVNTPD </c> instruction.
		3570	///
		3571	/// \param __a
		3572	/// A pointer to a 32-byte aligned memory location that will receive the
		3573	/// double-precision floating-point values.
		3574	/// \param __b
		3575	/// A 256-bit vector of [4 x double] containing the values to be moved.
		3576	static __inline void __DEFAULT_FN_ATTRS
		3577	_mm256_stream_pd(double *__a, __m256d __b)
		3578	{
		3579	typedef __v4df __v4df_aligned __attribute__((aligned(32)));
		3580	__builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
		3581	}
		3582
		3583	/// Moves single-precision floating point values from a 256-bit vector
		3584	/// of [8 x float] to a 32-byte aligned memory location. To minimize
		3585	/// caching, the data is flagged as non-temporal (unlikely to be used again
		3586	/// soon).
		3587	///
		3588	/// \headerfile <x86intrin.h>
		3589	///
		3590	/// This intrinsic corresponds to the <c> VMOVNTPS </c> instruction.
		3591	///
		3592	/// \param __p
		3593	/// A pointer to a 32-byte aligned memory location that will receive the
		3594	/// single-precision floating point values.
		3595	/// \param __a
		3596	/// A 256-bit vector of [8 x float] containing the values to be moved.
		3597	static __inline void __DEFAULT_FN_ATTRS
		3598	_mm256_stream_ps(float *__p, __m256 __a)
		3599	{
		3600	typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
		3601	__builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
		3602	}
		3603
		3604	/* Create vectors */
		3605	/// Create a 256-bit vector of [4 x double] with undefined values.
		3606	///
		3607	/// \headerfile <x86intrin.h>
		3608	///
		3609	/// This intrinsic has no corresponding instruction.
		3610	///
		3611	/// \returns A 256-bit vector of [4 x double] containing undefined values.
		3612	static __inline__ __m256d __DEFAULT_FN_ATTRS
		3613	_mm256_undefined_pd(void)
		3614	{
		3615	return (__m256d)__builtin_ia32_undef256();
		3616	}
		3617
		3618	/// Create a 256-bit vector of [8 x float] with undefined values.
		3619	///
		3620	/// \headerfile <x86intrin.h>
		3621	///
		3622	/// This intrinsic has no corresponding instruction.
		3623	///
		3624	/// \returns A 256-bit vector of [8 x float] containing undefined values.
		3625	static __inline__ __m256 __DEFAULT_FN_ATTRS
		3626	_mm256_undefined_ps(void)
		3627	{
		3628	return (__m256)__builtin_ia32_undef256();
		3629	}
		3630
		3631	/// Create a 256-bit integer vector with undefined values.
		3632	///
		3633	/// \headerfile <x86intrin.h>
		3634	///
		3635	/// This intrinsic has no corresponding instruction.
		3636	///
		3637	/// \returns A 256-bit integer vector containing undefined values.
		3638	static __inline__ __m256i __DEFAULT_FN_ATTRS
		3639	_mm256_undefined_si256(void)
		3640	{
		3641	return (__m256i)__builtin_ia32_undef256();
		3642	}
		3643
		3644	/// Constructs a 256-bit floating-point vector of [4 x double]
		3645	/// initialized with the specified double-precision floating-point values.
		3646	///
		3647	/// \headerfile <x86intrin.h>
		3648	///
		3649	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
		3650	/// instruction.
		3651	///
		3652	/// \param __a
		3653	/// A double-precision floating-point value used to initialize bits [255:192]
		3654	/// of the result.
		3655	/// \param __b
		3656	/// A double-precision floating-point value used to initialize bits [191:128]
		3657	/// of the result.
		3658	/// \param __c
		3659	/// A double-precision floating-point value used to initialize bits [127:64]
		3660	/// of the result.
		3661	/// \param __d
		3662	/// A double-precision floating-point value used to initialize bits [63:0]
		3663	/// of the result.
		3664	/// \returns An initialized 256-bit floating-point vector of [4 x double].
		3665	static __inline __m256d __DEFAULT_FN_ATTRS
		3666	_mm256_set_pd(double __a, double __b, double __c, double __d)
		3667	{
		3668	return __extension__ (__m256d){ __d, __c, __b, __a };
		3669	}
		3670
		3671	/// Constructs a 256-bit floating-point vector of [8 x float] initialized
		3672	/// with the specified single-precision floating-point values.
		3673	///
		3674	/// \headerfile <x86intrin.h>
		3675	///
		3676	/// This intrinsic is a utility function and does not correspond to a specific
		3677	/// instruction.
		3678	///
		3679	/// \param __a
		3680	/// A single-precision floating-point value used to initialize bits [255:224]
		3681	/// of the result.
		3682	/// \param __b
		3683	/// A single-precision floating-point value used to initialize bits [223:192]
		3684	/// of the result.
		3685	/// \param __c
		3686	/// A single-precision floating-point value used to initialize bits [191:160]
		3687	/// of the result.
		3688	/// \param __d
		3689	/// A single-precision floating-point value used to initialize bits [159:128]
		3690	/// of the result.
		3691	/// \param __e
		3692	/// A single-precision floating-point value used to initialize bits [127:96]
		3693	/// of the result.
		3694	/// \param __f
		3695	/// A single-precision floating-point value used to initialize bits [95:64]
		3696	/// of the result.
		3697	/// \param __g
		3698	/// A single-precision floating-point value used to initialize bits [63:32]
		3699	/// of the result.
		3700	/// \param __h
		3701	/// A single-precision floating-point value used to initialize bits [31:0]
		3702	/// of the result.
		3703	/// \returns An initialized 256-bit floating-point vector of [8 x float].
		3704	static __inline __m256 __DEFAULT_FN_ATTRS
		3705	_mm256_set_ps(float __a, float __b, float __c, float __d,
		3706	float __e, float __f, float __g, float __h)
		3707	{
		3708	return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a };
		3709	}
		3710
		3711	/// Constructs a 256-bit integer vector initialized with the specified
		3712	/// 32-bit integral values.
		3713	///
		3714	/// \headerfile <x86intrin.h>
		3715	///
		3716	/// This intrinsic is a utility function and does not correspond to a specific
		3717	/// instruction.
		3718	///
		3719	/// \param __i0
		3720	/// A 32-bit integral value used to initialize bits [255:224] of the result.
		3721	/// \param __i1
		3722	/// A 32-bit integral value used to initialize bits [223:192] of the result.
		3723	/// \param __i2
		3724	/// A 32-bit integral value used to initialize bits [191:160] of the result.
		3725	/// \param __i3
		3726	/// A 32-bit integral value used to initialize bits [159:128] of the result.
		3727	/// \param __i4
		3728	/// A 32-bit integral value used to initialize bits [127:96] of the result.
		3729	/// \param __i5
		3730	/// A 32-bit integral value used to initialize bits [95:64] of the result.
		3731	/// \param __i6
		3732	/// A 32-bit integral value used to initialize bits [63:32] of the result.
		3733	/// \param __i7
		3734	/// A 32-bit integral value used to initialize bits [31:0] of the result.
		3735	/// \returns An initialized 256-bit integer vector.
		3736	static __inline __m256i __DEFAULT_FN_ATTRS
		3737	_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3,
		3738	int __i4, int __i5, int __i6, int __i7)
		3739	{
		3740	return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 };
		3741	}
		3742
		3743	/// Constructs a 256-bit integer vector initialized with the specified
		3744	/// 16-bit integral values.
		3745	///
		3746	/// \headerfile <x86intrin.h>
		3747	///
		3748	/// This intrinsic is a utility function and does not correspond to a specific
		3749	/// instruction.
		3750	///
		3751	/// \param __w15
		3752	/// A 16-bit integral value used to initialize bits [255:240] of the result.
		3753	/// \param __w14
		3754	/// A 16-bit integral value used to initialize bits [239:224] of the result.
		3755	/// \param __w13
		3756	/// A 16-bit integral value used to initialize bits [223:208] of the result.
		3757	/// \param __w12
		3758	/// A 16-bit integral value used to initialize bits [207:192] of the result.
		3759	/// \param __w11
		3760	/// A 16-bit integral value used to initialize bits [191:176] of the result.
		3761	/// \param __w10
		3762	/// A 16-bit integral value used to initialize bits [175:160] of the result.
		3763	/// \param __w09
		3764	/// A 16-bit integral value used to initialize bits [159:144] of the result.
		3765	/// \param __w08
		3766	/// A 16-bit integral value used to initialize bits [143:128] of the result.
		3767	/// \param __w07
		3768	/// A 16-bit integral value used to initialize bits [127:112] of the result.
		3769	/// \param __w06
		3770	/// A 16-bit integral value used to initialize bits [111:96] of the result.
		3771	/// \param __w05
		3772	/// A 16-bit integral value used to initialize bits [95:80] of the result.
		3773	/// \param __w04
		3774	/// A 16-bit integral value used to initialize bits [79:64] of the result.
		3775	/// \param __w03
		3776	/// A 16-bit integral value used to initialize bits [63:48] of the result.
		3777	/// \param __w02
		3778	/// A 16-bit integral value used to initialize bits [47:32] of the result.
		3779	/// \param __w01
		3780	/// A 16-bit integral value used to initialize bits [31:16] of the result.
		3781	/// \param __w00
		3782	/// A 16-bit integral value used to initialize bits [15:0] of the result.
		3783	/// \returns An initialized 256-bit integer vector.
		3784	static __inline __m256i __DEFAULT_FN_ATTRS
		3785	_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12,
		3786	short __w11, short __w10, short __w09, short __w08,
		3787	short __w07, short __w06, short __w05, short __w04,
		3788	short __w03, short __w02, short __w01, short __w00)
		3789	{
		3790	return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06,
		3791	__w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 };
		3792	}
		3793
		3794	/// Constructs a 256-bit integer vector initialized with the specified
		3795	/// 8-bit integral values.
		3796	///
		3797	/// \headerfile <x86intrin.h>
		3798	///
		3799	/// This intrinsic is a utility function and does not correspond to a specific
		3800	/// instruction.
		3801	///
		3802	/// \param __b31
		3803	/// An 8-bit integral value used to initialize bits [255:248] of the result.
		3804	/// \param __b30
		3805	/// An 8-bit integral value used to initialize bits [247:240] of the result.
		3806	/// \param __b29
		3807	/// An 8-bit integral value used to initialize bits [239:232] of the result.
		3808	/// \param __b28
		3809	/// An 8-bit integral value used to initialize bits [231:224] of the result.
		3810	/// \param __b27
		3811	/// An 8-bit integral value used to initialize bits [223:216] of the result.
		3812	/// \param __b26
		3813	/// An 8-bit integral value used to initialize bits [215:208] of the result.
		3814	/// \param __b25
		3815	/// An 8-bit integral value used to initialize bits [207:200] of the result.
		3816	/// \param __b24
		3817	/// An 8-bit integral value used to initialize bits [199:192] of the result.
		3818	/// \param __b23
		3819	/// An 8-bit integral value used to initialize bits [191:184] of the result.
		3820	/// \param __b22
		3821	/// An 8-bit integral value used to initialize bits [183:176] of the result.
		3822	/// \param __b21
		3823	/// An 8-bit integral value used to initialize bits [175:168] of the result.
		3824	/// \param __b20
		3825	/// An 8-bit integral value used to initialize bits [167:160] of the result.
		3826	/// \param __b19
		3827	/// An 8-bit integral value used to initialize bits [159:152] of the result.
		3828	/// \param __b18
		3829	/// An 8-bit integral value used to initialize bits [151:144] of the result.
		3830	/// \param __b17
		3831	/// An 8-bit integral value used to initialize bits [143:136] of the result.
		3832	/// \param __b16
		3833	/// An 8-bit integral value used to initialize bits [135:128] of the result.
		3834	/// \param __b15
		3835	/// An 8-bit integral value used to initialize bits [127:120] of the result.
		3836	/// \param __b14
		3837	/// An 8-bit integral value used to initialize bits [119:112] of the result.
		3838	/// \param __b13
		3839	/// An 8-bit integral value used to initialize bits [111:104] of the result.
		3840	/// \param __b12
		3841	/// An 8-bit integral value used to initialize bits [103:96] of the result.
		3842	/// \param __b11
		3843	/// An 8-bit integral value used to initialize bits [95:88] of the result.
		3844	/// \param __b10
		3845	/// An 8-bit integral value used to initialize bits [87:80] of the result.
		3846	/// \param __b09
		3847	/// An 8-bit integral value used to initialize bits [79:72] of the result.
		3848	/// \param __b08
		3849	/// An 8-bit integral value used to initialize bits [71:64] of the result.
		3850	/// \param __b07
		3851	/// An 8-bit integral value used to initialize bits [63:56] of the result.
		3852	/// \param __b06
		3853	/// An 8-bit integral value used to initialize bits [55:48] of the result.
		3854	/// \param __b05
		3855	/// An 8-bit integral value used to initialize bits [47:40] of the result.
		3856	/// \param __b04
		3857	/// An 8-bit integral value used to initialize bits [39:32] of the result.
		3858	/// \param __b03
		3859	/// An 8-bit integral value used to initialize bits [31:24] of the result.
		3860	/// \param __b02
		3861	/// An 8-bit integral value used to initialize bits [23:16] of the result.
		3862	/// \param __b01
		3863	/// An 8-bit integral value used to initialize bits [15:8] of the result.
		3864	/// \param __b00
		3865	/// An 8-bit integral value used to initialize bits [7:0] of the result.
		3866	/// \returns An initialized 256-bit integer vector.
		3867	static __inline __m256i __DEFAULT_FN_ATTRS
		3868	_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28,
		3869	char __b27, char __b26, char __b25, char __b24,
		3870	char __b23, char __b22, char __b21, char __b20,
		3871	char __b19, char __b18, char __b17, char __b16,
		3872	char __b15, char __b14, char __b13, char __b12,
		3873	char __b11, char __b10, char __b09, char __b08,
		3874	char __b07, char __b06, char __b05, char __b04,
		3875	char __b03, char __b02, char __b01, char __b00)
		3876	{
		3877	return __extension__ (__m256i)(__v32qi){
		3878	__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
		3879	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
		3880	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
		3881	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31
		3882	};
		3883	}
		3884
		3885	/// Constructs a 256-bit integer vector initialized with the specified
		3886	/// 64-bit integral values.
		3887	///
		3888	/// \headerfile <x86intrin.h>
		3889	///
		3890	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
		3891	/// instruction.
		3892	///
		3893	/// \param __a
		3894	/// A 64-bit integral value used to initialize bits [255:192] of the result.
		3895	/// \param __b
		3896	/// A 64-bit integral value used to initialize bits [191:128] of the result.
		3897	/// \param __c
		3898	/// A 64-bit integral value used to initialize bits [127:64] of the result.
		3899	/// \param __d
		3900	/// A 64-bit integral value used to initialize bits [63:0] of the result.
		3901	/// \returns An initialized 256-bit integer vector.
		3902	static __inline __m256i __DEFAULT_FN_ATTRS
		3903	_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d)
		3904	{
		3905	return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a };
		3906	}
		3907
		3908	/* Create vectors with elements in reverse order */
		3909	/// Constructs a 256-bit floating-point vector of [4 x double],
		3910	/// initialized in reverse order with the specified double-precision
		3911	/// floating-point values.
		3912	///
		3913	/// \headerfile <x86intrin.h>
		3914	///
		3915	/// This intrinsic corresponds to the <c> VUNPCKLPD+VINSERTF128 </c>
		3916	/// instruction.
		3917	///
		3918	/// \param __a
		3919	/// A double-precision floating-point value used to initialize bits [63:0]
		3920	/// of the result.
		3921	/// \param __b
		3922	/// A double-precision floating-point value used to initialize bits [127:64]
		3923	/// of the result.
		3924	/// \param __c
		3925	/// A double-precision floating-point value used to initialize bits [191:128]
		3926	/// of the result.
		3927	/// \param __d
		3928	/// A double-precision floating-point value used to initialize bits [255:192]
		3929	/// of the result.
		3930	/// \returns An initialized 256-bit floating-point vector of [4 x double].
		3931	static __inline __m256d __DEFAULT_FN_ATTRS
		3932	_mm256_setr_pd(double __a, double __b, double __c, double __d)
		3933	{
		3934	return _mm256_set_pd(__d, __c, __b, __a);
		3935	}
		3936
		3937	/// Constructs a 256-bit floating-point vector of [8 x float],
		3938	/// initialized in reverse order with the specified single-precision
		3939	/// floating-point values.
		3940	///
		3941	/// \headerfile <x86intrin.h>
		3942	///
		3943	/// This intrinsic is a utility function and does not correspond to a specific
		3944	/// instruction.
		3945	///
		3946	/// \param __a
		3947	/// A single-precision floating-point value used to initialize bits [31:0]
		3948	/// of the result.
		3949	/// \param __b
		3950	/// A single-precision floating-point value used to initialize bits [63:32]
		3951	/// of the result.
		3952	/// \param __c
		3953	/// A single-precision floating-point value used to initialize bits [95:64]
		3954	/// of the result.
		3955	/// \param __d
		3956	/// A single-precision floating-point value used to initialize bits [127:96]
		3957	/// of the result.
		3958	/// \param __e
		3959	/// A single-precision floating-point value used to initialize bits [159:128]
		3960	/// of the result.
		3961	/// \param __f
		3962	/// A single-precision floating-point value used to initialize bits [191:160]
		3963	/// of the result.
		3964	/// \param __g
		3965	/// A single-precision floating-point value used to initialize bits [223:192]
		3966	/// of the result.
		3967	/// \param __h
		3968	/// A single-precision floating-point value used to initialize bits [255:224]
		3969	/// of the result.
		3970	/// \returns An initialized 256-bit floating-point vector of [8 x float].
		3971	static __inline __m256 __DEFAULT_FN_ATTRS
		3972	_mm256_setr_ps(float __a, float __b, float __c, float __d,
		3973	float __e, float __f, float __g, float __h)
		3974	{
		3975	return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a);
		3976	}
		3977
		3978	/// Constructs a 256-bit integer vector, initialized in reverse order
		3979	/// with the specified 32-bit integral values.
		3980	///
		3981	/// \headerfile <x86intrin.h>
		3982	///
		3983	/// This intrinsic is a utility function and does not correspond to a specific
		3984	/// instruction.
		3985	///
		3986	/// \param __i0
		3987	/// A 32-bit integral value used to initialize bits [31:0] of the result.
		3988	/// \param __i1
		3989	/// A 32-bit integral value used to initialize bits [63:32] of the result.
		3990	/// \param __i2
		3991	/// A 32-bit integral value used to initialize bits [95:64] of the result.
		3992	/// \param __i3
		3993	/// A 32-bit integral value used to initialize bits [127:96] of the result.
		3994	/// \param __i4
		3995	/// A 32-bit integral value used to initialize bits [159:128] of the result.
		3996	/// \param __i5
		3997	/// A 32-bit integral value used to initialize bits [191:160] of the result.
		3998	/// \param __i6
		3999	/// A 32-bit integral value used to initialize bits [223:192] of the result.
		4000	/// \param __i7
		4001	/// A 32-bit integral value used to initialize bits [255:224] of the result.
		4002	/// \returns An initialized 256-bit integer vector.
		4003	static __inline __m256i __DEFAULT_FN_ATTRS
		4004	_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3,
		4005	int __i4, int __i5, int __i6, int __i7)
		4006	{
		4007	return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0);
		4008	}
		4009
		4010	/// Constructs a 256-bit integer vector, initialized in reverse order
		4011	/// with the specified 16-bit integral values.
		4012	///
		4013	/// \headerfile <x86intrin.h>
		4014	///
		4015	/// This intrinsic is a utility function and does not correspond to a specific
		4016	/// instruction.
		4017	///
		4018	/// \param __w15
		4019	/// A 16-bit integral value used to initialize bits [15:0] of the result.
		4020	/// \param __w14
		4021	/// A 16-bit integral value used to initialize bits [31:16] of the result.
		4022	/// \param __w13
		4023	/// A 16-bit integral value used to initialize bits [47:32] of the result.
		4024	/// \param __w12
		4025	/// A 16-bit integral value used to initialize bits [63:48] of the result.
		4026	/// \param __w11
		4027	/// A 16-bit integral value used to initialize bits [79:64] of the result.
		4028	/// \param __w10
		4029	/// A 16-bit integral value used to initialize bits [95:80] of the result.
		4030	/// \param __w09
		4031	/// A 16-bit integral value used to initialize bits [111:96] of the result.
		4032	/// \param __w08
		4033	/// A 16-bit integral value used to initialize bits [127:112] of the result.
		4034	/// \param __w07
		4035	/// A 16-bit integral value used to initialize bits [143:128] of the result.
		4036	/// \param __w06
		4037	/// A 16-bit integral value used to initialize bits [159:144] of the result.
		4038	/// \param __w05
		4039	/// A 16-bit integral value used to initialize bits [175:160] of the result.
		4040	/// \param __w04
		4041	/// A 16-bit integral value used to initialize bits [191:176] of the result.
		4042	/// \param __w03
		4043	/// A 16-bit integral value used to initialize bits [207:192] of the result.
		4044	/// \param __w02
		4045	/// A 16-bit integral value used to initialize bits [223:208] of the result.
		4046	/// \param __w01
		4047	/// A 16-bit integral value used to initialize bits [239:224] of the result.
		4048	/// \param __w00
		4049	/// A 16-bit integral value used to initialize bits [255:240] of the result.
		4050	/// \returns An initialized 256-bit integer vector.
		4051	static __inline __m256i __DEFAULT_FN_ATTRS
		4052	_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12,
		4053	short __w11, short __w10, short __w09, short __w08,
		4054	short __w07, short __w06, short __w05, short __w04,
		4055	short __w03, short __w02, short __w01, short __w00)
		4056	{
		4057	return _mm256_set_epi16(__w00, __w01, __w02, __w03,
		4058	__w04, __w05, __w06, __w07,
		4059	__w08, __w09, __w10, __w11,
		4060	__w12, __w13, __w14, __w15);
		4061	}
		4062
		4063	/// Constructs a 256-bit integer vector, initialized in reverse order
		4064	/// with the specified 8-bit integral values.
		4065	///
		4066	/// \headerfile <x86intrin.h>
		4067	///
		4068	/// This intrinsic is a utility function and does not correspond to a specific
		4069	/// instruction.
		4070	///
		4071	/// \param __b31
		4072	/// An 8-bit integral value used to initialize bits [7:0] of the result.
		4073	/// \param __b30
		4074	/// An 8-bit integral value used to initialize bits [15:8] of the result.
		4075	/// \param __b29
		4076	/// An 8-bit integral value used to initialize bits [23:16] of the result.
		4077	/// \param __b28
		4078	/// An 8-bit integral value used to initialize bits [31:24] of the result.
		4079	/// \param __b27
		4080	/// An 8-bit integral value used to initialize bits [39:32] of the result.
		4081	/// \param __b26
		4082	/// An 8-bit integral value used to initialize bits [47:40] of the result.
		4083	/// \param __b25
		4084	/// An 8-bit integral value used to initialize bits [55:48] of the result.
		4085	/// \param __b24
		4086	/// An 8-bit integral value used to initialize bits [63:56] of the result.
		4087	/// \param __b23
		4088	/// An 8-bit integral value used to initialize bits [71:64] of the result.
		4089	/// \param __b22
		4090	/// An 8-bit integral value used to initialize bits [79:72] of the result.
		4091	/// \param __b21
		4092	/// An 8-bit integral value used to initialize bits [87:80] of the result.
		4093	/// \param __b20
		4094	/// An 8-bit integral value used to initialize bits [95:88] of the result.
		4095	/// \param __b19
		4096	/// An 8-bit integral value used to initialize bits [103:96] of the result.
		4097	/// \param __b18
		4098	/// An 8-bit integral value used to initialize bits [111:104] of the result.
		4099	/// \param __b17
		4100	/// An 8-bit integral value used to initialize bits [119:112] of the result.
		4101	/// \param __b16
		4102	/// An 8-bit integral value used to initialize bits [127:120] of the result.
		4103	/// \param __b15
		4104	/// An 8-bit integral value used to initialize bits [135:128] of the result.
		4105	/// \param __b14
		4106	/// An 8-bit integral value used to initialize bits [143:136] of the result.
		4107	/// \param __b13
		4108	/// An 8-bit integral value used to initialize bits [151:144] of the result.
		4109	/// \param __b12
		4110	/// An 8-bit integral value used to initialize bits [159:152] of the result.
		4111	/// \param __b11
		4112	/// An 8-bit integral value used to initialize bits [167:160] of the result.
		4113	/// \param __b10
		4114	/// An 8-bit integral value used to initialize bits [175:168] of the result.
		4115	/// \param __b09
		4116	/// An 8-bit integral value used to initialize bits [183:176] of the result.
		4117	/// \param __b08
		4118	/// An 8-bit integral value used to initialize bits [191:184] of the result.
		4119	/// \param __b07
		4120	/// An 8-bit integral value used to initialize bits [199:192] of the result.
		4121	/// \param __b06
		4122	/// An 8-bit integral value used to initialize bits [207:200] of the result.
		4123	/// \param __b05
		4124	/// An 8-bit integral value used to initialize bits [215:208] of the result.
		4125	/// \param __b04
		4126	/// An 8-bit integral value used to initialize bits [223:216] of the result.
		4127	/// \param __b03
		4128	/// An 8-bit integral value used to initialize bits [231:224] of the result.
		4129	/// \param __b02
		4130	/// An 8-bit integral value used to initialize bits [239:232] of the result.
		4131	/// \param __b01
		4132	/// An 8-bit integral value used to initialize bits [247:240] of the result.
		4133	/// \param __b00
		4134	/// An 8-bit integral value used to initialize bits [255:248] of the result.
		4135	/// \returns An initialized 256-bit integer vector.
		4136	static __inline __m256i __DEFAULT_FN_ATTRS
		4137	_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28,
		4138	char __b27, char __b26, char __b25, char __b24,
		4139	char __b23, char __b22, char __b21, char __b20,
		4140	char __b19, char __b18, char __b17, char __b16,
		4141	char __b15, char __b14, char __b13, char __b12,
		4142	char __b11, char __b10, char __b09, char __b08,
		4143	char __b07, char __b06, char __b05, char __b04,
		4144	char __b03, char __b02, char __b01, char __b00)
		4145	{
		4146	return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07,
		4147	__b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15,
		4148	__b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23,
		4149	__b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31);
		4150	}
		4151
		4152	/// Constructs a 256-bit integer vector, initialized in reverse order
		4153	/// with the specified 64-bit integral values.
		4154	///
		4155	/// \headerfile <x86intrin.h>
		4156	///
		4157	/// This intrinsic corresponds to the <c> VPUNPCKLQDQ+VINSERTF128 </c>
		4158	/// instruction.
		4159	///
		4160	/// \param __a
		4161	/// A 64-bit integral value used to initialize bits [63:0] of the result.
		4162	/// \param __b
		4163	/// A 64-bit integral value used to initialize bits [127:64] of the result.
		4164	/// \param __c
		4165	/// A 64-bit integral value used to initialize bits [191:128] of the result.
		4166	/// \param __d
		4167	/// A 64-bit integral value used to initialize bits [255:192] of the result.
		4168	/// \returns An initialized 256-bit integer vector.
		4169	static __inline __m256i __DEFAULT_FN_ATTRS
		4170	_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d)
		4171	{
		4172	return _mm256_set_epi64x(__d, __c, __b, __a);
		4173	}
		4174
		4175	/* Create vectors with repeated elements */
		4176	/// Constructs a 256-bit floating-point vector of [4 x double], with each
		4177	/// of the four double-precision floating-point vector elements set to the
		4178	/// specified double-precision floating-point value.
		4179	///
		4180	/// \headerfile <x86intrin.h>
		4181	///
		4182	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
		4183	///
		4184	/// \param __w
		4185	/// A double-precision floating-point value used to initialize each vector
		4186	/// element of the result.
		4187	/// \returns An initialized 256-bit floating-point vector of [4 x double].
		4188	static __inline __m256d __DEFAULT_FN_ATTRS
		4189	_mm256_set1_pd(double __w)
		4190	{
		4191	return _mm256_set_pd(__w, __w, __w, __w);
		4192	}
		4193
		4194	/// Constructs a 256-bit floating-point vector of [8 x float], with each
		4195	/// of the eight single-precision floating-point vector elements set to the
		4196	/// specified single-precision floating-point value.
		4197	///
		4198	/// \headerfile <x86intrin.h>
		4199	///
		4200	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
		4201	/// instruction.
		4202	///
		4203	/// \param __w
		4204	/// A single-precision floating-point value used to initialize each vector
		4205	/// element of the result.
		4206	/// \returns An initialized 256-bit floating-point vector of [8 x float].
		4207	static __inline __m256 __DEFAULT_FN_ATTRS
		4208	_mm256_set1_ps(float __w)
		4209	{
		4210	return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w);
		4211	}
		4212
		4213	/// Constructs a 256-bit integer vector of [8 x i32], with each of the
		4214	/// 32-bit integral vector elements set to the specified 32-bit integral
		4215	/// value.
		4216	///
		4217	/// \headerfile <x86intrin.h>
		4218	///
		4219	/// This intrinsic corresponds to the <c> VPERMILPS+VINSERTF128 </c>
		4220	/// instruction.
		4221	///
		4222	/// \param __i
		4223	/// A 32-bit integral value used to initialize each vector element of the
		4224	/// result.
		4225	/// \returns An initialized 256-bit integer vector of [8 x i32].
		4226	static __inline __m256i __DEFAULT_FN_ATTRS
		4227	_mm256_set1_epi32(int __i)
		4228	{
		4229	return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i);
		4230	}
		4231
		4232	/// Constructs a 256-bit integer vector of [16 x i16], with each of the
		4233	/// 16-bit integral vector elements set to the specified 16-bit integral
		4234	/// value.
		4235	///
		4236	/// \headerfile <x86intrin.h>
		4237	///
		4238	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
		4239	///
		4240	/// \param __w
		4241	/// A 16-bit integral value used to initialize each vector element of the
		4242	/// result.
		4243	/// \returns An initialized 256-bit integer vector of [16 x i16].
		4244	static __inline __m256i __DEFAULT_FN_ATTRS
		4245	_mm256_set1_epi16(short __w)
		4246	{
		4247	return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w,
		4248	__w, __w, __w, __w, __w, __w, __w, __w);
		4249	}
		4250
		4251	/// Constructs a 256-bit integer vector of [32 x i8], with each of the
		4252	/// 8-bit integral vector elements set to the specified 8-bit integral value.
		4253	///
		4254	/// \headerfile <x86intrin.h>
		4255	///
		4256	/// This intrinsic corresponds to the <c> VPSHUFB+VINSERTF128 </c> instruction.
		4257	///
		4258	/// \param __b
		4259	/// An 8-bit integral value used to initialize each vector element of the
		4260	/// result.
		4261	/// \returns An initialized 256-bit integer vector of [32 x i8].
		4262	static __inline __m256i __DEFAULT_FN_ATTRS
		4263	_mm256_set1_epi8(char __b)
		4264	{
		4265	return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b,
		4266	__b, __b, __b, __b, __b, __b, __b, __b,
		4267	__b, __b, __b, __b, __b, __b, __b, __b,
		4268	__b, __b, __b, __b, __b, __b, __b, __b);
		4269	}
		4270
		4271	/// Constructs a 256-bit integer vector of [4 x i64], with each of the
		4272	/// 64-bit integral vector elements set to the specified 64-bit integral
		4273	/// value.
		4274	///
		4275	/// \headerfile <x86intrin.h>
		4276	///
		4277	/// This intrinsic corresponds to the <c> VMOVDDUP+VINSERTF128 </c> instruction.
		4278	///
		4279	/// \param __q
		4280	/// A 64-bit integral value used to initialize each vector element of the
		4281	/// result.
		4282	/// \returns An initialized 256-bit integer vector of [4 x i64].
		4283	static __inline __m256i __DEFAULT_FN_ATTRS
		4284	_mm256_set1_epi64x(long long __q)
		4285	{
		4286	return _mm256_set_epi64x(__q, __q, __q, __q);
		4287	}
		4288
		4289	/* Create zeroed vectors */
		4290	/// Constructs a 256-bit floating-point vector of [4 x double] with all
		4291	/// vector elements initialized to zero.
		4292	///
		4293	/// \headerfile <x86intrin.h>
		4294	///
		4295	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
		4296	///
		4297	/// \returns A 256-bit vector of [4 x double] with all elements set to zero.
		4298	static __inline __m256d __DEFAULT_FN_ATTRS
		4299	_mm256_setzero_pd(void)
		4300	{
		4301	return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
		4302	}
		4303
		4304	/// Constructs a 256-bit floating-point vector of [8 x float] with all
		4305	/// vector elements initialized to zero.
		4306	///
		4307	/// \headerfile <x86intrin.h>
		4308	///
		4309	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
		4310	///
		4311	/// \returns A 256-bit vector of [8 x float] with all elements set to zero.
		4312	static __inline __m256 __DEFAULT_FN_ATTRS
		4313	_mm256_setzero_ps(void)
		4314	{
		4315	return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
		4316	}
		4317
		4318	/// Constructs a 256-bit integer vector initialized to zero.
		4319	///
		4320	/// \headerfile <x86intrin.h>
		4321	///
		4322	/// This intrinsic corresponds to the <c> VXORPS </c> instruction.
		4323	///
		4324	/// \returns A 256-bit integer vector initialized to zero.
		4325	static __inline __m256i __DEFAULT_FN_ATTRS
		4326	_mm256_setzero_si256(void)
		4327	{
		4328	return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 };
		4329	}
		4330
		4331	/* Cast between vector types */
		4332	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
		4333	/// floating-point vector of [8 x float].
		4334	///
		4335	/// \headerfile <x86intrin.h>
		4336	///
		4337	/// This intrinsic has no corresponding instruction.
		4338	///
		4339	/// \param __a
		4340	/// A 256-bit floating-point vector of [4 x double].
		4341	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
		4342	/// bitwise pattern as the parameter.
		4343	static __inline __m256 __DEFAULT_FN_ATTRS
		4344	_mm256_castpd_ps(__m256d __a)
		4345	{
		4346	return (__m256)__a;
		4347	}
		4348
		4349	/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit
		4350	/// integer vector.
		4351	///
		4352	/// \headerfile <x86intrin.h>
		4353	///
		4354	/// This intrinsic has no corresponding instruction.
		4355	///
		4356	/// \param __a
		4357	/// A 256-bit floating-point vector of [4 x double].
		4358	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
		4359	/// parameter.
		4360	static __inline __m256i __DEFAULT_FN_ATTRS
		4361	_mm256_castpd_si256(__m256d __a)
		4362	{
		4363	return (__m256i)__a;
		4364	}
		4365
		4366	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
		4367	/// floating-point vector of [4 x double].
		4368	///
		4369	/// \headerfile <x86intrin.h>
		4370	///
		4371	/// This intrinsic has no corresponding instruction.
		4372	///
		4373	/// \param __a
		4374	/// A 256-bit floating-point vector of [8 x float].
		4375	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
		4376	/// bitwise pattern as the parameter.
		4377	static __inline __m256d __DEFAULT_FN_ATTRS
		4378	_mm256_castps_pd(__m256 __a)
		4379	{
		4380	return (__m256d)__a;
		4381	}
		4382
		4383	/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit
		4384	/// integer vector.
		4385	///
		4386	/// \headerfile <x86intrin.h>
		4387	///
		4388	/// This intrinsic has no corresponding instruction.
		4389	///
		4390	/// \param __a
		4391	/// A 256-bit floating-point vector of [8 x float].
		4392	/// \returns A 256-bit integer vector containing the same bitwise pattern as the
		4393	/// parameter.
		4394	static __inline __m256i __DEFAULT_FN_ATTRS
		4395	_mm256_castps_si256(__m256 __a)
		4396	{
		4397	return (__m256i)__a;
		4398	}
		4399
		4400	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
		4401	/// of [8 x float].
		4402	///
		4403	/// \headerfile <x86intrin.h>
		4404	///
		4405	/// This intrinsic has no corresponding instruction.
		4406	///
		4407	/// \param __a
		4408	/// A 256-bit integer vector.
		4409	/// \returns A 256-bit floating-point vector of [8 x float] containing the same
		4410	/// bitwise pattern as the parameter.
		4411	static __inline __m256 __DEFAULT_FN_ATTRS
		4412	_mm256_castsi256_ps(__m256i __a)
		4413	{
		4414	return (__m256)__a;
		4415	}
		4416
		4417	/// Casts a 256-bit integer vector into a 256-bit floating-point vector
		4418	/// of [4 x double].
		4419	///
		4420	/// \headerfile <x86intrin.h>
		4421	///
		4422	/// This intrinsic has no corresponding instruction.
		4423	///
		4424	/// \param __a
		4425	/// A 256-bit integer vector.
		4426	/// \returns A 256-bit floating-point vector of [4 x double] containing the same
		4427	/// bitwise pattern as the parameter.
		4428	static __inline __m256d __DEFAULT_FN_ATTRS
		4429	_mm256_castsi256_pd(__m256i __a)
		4430	{
		4431	return (__m256d)__a;
		4432	}
		4433
		4434	/// Returns the lower 128 bits of a 256-bit floating-point vector of
		4435	/// [4 x double] as a 128-bit floating-point vector of [2 x double].
		4436	///
		4437	/// \headerfile <x86intrin.h>
		4438	///
		4439	/// This intrinsic has no corresponding instruction.
		4440	///
		4441	/// \param __a
		4442	/// A 256-bit floating-point vector of [4 x double].
		4443	/// \returns A 128-bit floating-point vector of [2 x double] containing the
		4444	/// lower 128 bits of the parameter.
		4445	static __inline __m128d __DEFAULT_FN_ATTRS
		4446	_mm256_castpd256_pd128(__m256d __a)
		4447	{
		4448	return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1);
		4449	}
		4450
		4451	/// Returns the lower 128 bits of a 256-bit floating-point vector of
		4452	/// [8 x float] as a 128-bit floating-point vector of [4 x float].
		4453	///
		4454	/// \headerfile <x86intrin.h>
		4455	///
		4456	/// This intrinsic has no corresponding instruction.
		4457	///
		4458	/// \param __a
		4459	/// A 256-bit floating-point vector of [8 x float].
		4460	/// \returns A 128-bit floating-point vector of [4 x float] containing the
		4461	/// lower 128 bits of the parameter.
		4462	static __inline __m128 __DEFAULT_FN_ATTRS
		4463	_mm256_castps256_ps128(__m256 __a)
		4464	{
		4465	return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3);
		4466	}
		4467
		4468	/// Truncates a 256-bit integer vector into a 128-bit integer vector.
		4469	///
		4470	/// \headerfile <x86intrin.h>
		4471	///
		4472	/// This intrinsic has no corresponding instruction.
		4473	///
		4474	/// \param __a
		4475	/// A 256-bit integer vector.
		4476	/// \returns A 128-bit integer vector containing the lower 128 bits of the
		4477	/// parameter.
		4478	static __inline __m128i __DEFAULT_FN_ATTRS
		4479	_mm256_castsi256_si128(__m256i __a)
		4480	{
		4481	return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1);
		4482	}
		4483
		4484	/// Constructs a 256-bit floating-point vector of [4 x double] from a
		4485	/// 128-bit floating-point vector of [2 x double].
		4486	///
		4487	/// The lower 128 bits contain the value of the source vector. The contents
		4488	/// of the upper 128 bits are undefined.
		4489	///
		4490	/// \headerfile <x86intrin.h>
		4491	///
		4492	/// This intrinsic has no corresponding instruction.
		4493	///
		4494	/// \param __a
		4495	/// A 128-bit vector of [2 x double].
		4496	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
		4497	/// contain the value of the parameter. The contents of the upper 128 bits
		4498	/// are undefined.
		4499	static __inline __m256d __DEFAULT_FN_ATTRS
		4500	_mm256_castpd128_pd256(__m128d __a)
		4501	{
		4502	return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1);
		4503	}
		4504
		4505	/// Constructs a 256-bit floating-point vector of [8 x float] from a
		4506	/// 128-bit floating-point vector of [4 x float].
		4507	///
		4508	/// The lower 128 bits contain the value of the source vector. The contents
		4509	/// of the upper 128 bits are undefined.
		4510	///
		4511	/// \headerfile <x86intrin.h>
		4512	///
		4513	/// This intrinsic has no corresponding instruction.
		4514	///
		4515	/// \param __a
		4516	/// A 128-bit vector of [4 x float].
		4517	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
		4518	/// contain the value of the parameter. The contents of the upper 128 bits
		4519	/// are undefined.
		4520	static __inline __m256 __DEFAULT_FN_ATTRS
		4521	_mm256_castps128_ps256(__m128 __a)
		4522	{
		4523	return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1);
		4524	}
		4525
		4526	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
		4527	///
		4528	/// The lower 128 bits contain the value of the source vector. The contents
		4529	/// of the upper 128 bits are undefined.
		4530	///
		4531	/// \headerfile <x86intrin.h>
		4532	///
		4533	/// This intrinsic has no corresponding instruction.
		4534	///
		4535	/// \param __a
		4536	/// A 128-bit integer vector.
		4537	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
		4538	/// the parameter. The contents of the upper 128 bits are undefined.
		4539	static __inline __m256i __DEFAULT_FN_ATTRS
		4540	_mm256_castsi128_si256(__m128i __a)
		4541	{
		4542	return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1);
		4543	}
		4544
		4545	/// Constructs a 256-bit floating-point vector of [4 x double] from a
		4546	/// 128-bit floating-point vector of [2 x double]. The lower 128 bits
		4547	/// contain the value of the source vector. The upper 128 bits are set
		4548	/// to zero.
		4549	///
		4550	/// \headerfile <x86intrin.h>
		4551	///
		4552	/// This intrinsic has no corresponding instruction.
		4553	///
		4554	/// \param __a
		4555	/// A 128-bit vector of [2 x double].
		4556	/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits
		4557	/// contain the value of the parameter. The upper 128 bits are set to zero.
		4558	static __inline __m256d __DEFAULT_FN_ATTRS
		4559	_mm256_zextpd128_pd256(__m128d __a)
		4560	{
		4561	return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3);
		4562	}
		4563
		4564	/// Constructs a 256-bit floating-point vector of [8 x float] from a
		4565	/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain
		4566	/// the value of the source vector. The upper 128 bits are set to zero.
		4567	///
		4568	/// \headerfile <x86intrin.h>
		4569	///
		4570	/// This intrinsic has no corresponding instruction.
		4571	///
		4572	/// \param __a
		4573	/// A 128-bit vector of [4 x float].
		4574	/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits
		4575	/// contain the value of the parameter. The upper 128 bits are set to zero.
		4576	static __inline __m256 __DEFAULT_FN_ATTRS
		4577	_mm256_zextps128_ps256(__m128 __a)
		4578	{
		4579	return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7);
		4580	}
		4581
		4582	/// Constructs a 256-bit integer vector from a 128-bit integer vector.
		4583	/// The lower 128 bits contain the value of the source vector. The upper
		4584	/// 128 bits are set to zero.
		4585	///
		4586	/// \headerfile <x86intrin.h>
		4587	///
		4588	/// This intrinsic has no corresponding instruction.
		4589	///
		4590	/// \param __a
		4591	/// A 128-bit integer vector.
		4592	/// \returns A 256-bit integer vector. The lower 128 bits contain the value of
		4593	/// the parameter. The upper 128 bits are set to zero.
		4594	static __inline __m256i __DEFAULT_FN_ATTRS
		4595	_mm256_zextsi128_si256(__m128i __a)
		4596	{
		4597	return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3);
		4598	}
		4599
		4600	/*
		4601	Vector insert.
		4602	We use macros rather than inlines because we only want to accept
		4603	invocations where the immediate M is a constant expression.
		4604	*/
		/// Builds a 256-bit vector of [8 x float] by copying the 256-bit vector of
		///    [8 x float] given as the first parameter and then overwriting one
		///    128-bit half of the copy with the 128-bit vector of [4 x float] given
		///    as the second parameter.
		///
		///    The immediate integer parameter selects which half (upper or lower)
		///    is overwritten.
		///
		/// \headerfile <x86intrin.h>
		///
		/// \code
		/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M);
		/// \endcode
		///
		/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
		///
		/// \param V1
		///    A 256-bit vector of [8 x float]. It supplies the half of the result
		///    that is not replaced by \a V2.
		/// \param V2
		///    A 128-bit vector of [4 x float]. It is written to the upper or the
		///    lower 128 bits of the result, as selected by \a M.
		/// \param M
		///    An immediate integer. Its least significant bit selects the half
		///    that receives \a V2: \n
		///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the
		///    result, and bits [255:128] of \a V1 are copied to bits [255:128] of
		///    the result. \n
		///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
		///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
		///    result.
		/// \returns A 256-bit vector of [8 x float] containing the combined values.
		#define _mm256_insertf128_ps(V1, V2, M) \
		  ((__m256)__builtin_ia32_vinsertf128_ps256( \
		      (__v8sf)(__m256)(V1), \
		      (__v4sf)(__m128)(V2), \
		      (int)(M)))
		4642
		/// Builds a 256-bit vector of [4 x double] by copying the 256-bit vector of
		///    [4 x double] given as the first parameter and then overwriting one
		///    128-bit half of the copy with the 128-bit vector of [2 x double]
		///    given as the second parameter.
		///
		///    The immediate integer parameter selects which half (upper or lower)
		///    is overwritten.
		///
		/// \headerfile <x86intrin.h>
		///
		/// \code
		/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M);
		/// \endcode
		///
		/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
		///
		/// \param V1
		///    A 256-bit vector of [4 x double]. It supplies the half of the result
		///    that is not replaced by \a V2.
		/// \param V2
		///    A 128-bit vector of [2 x double]. It is written to the upper or the
		///    lower 128 bits of the result, as selected by \a M.
		/// \param M
		///    An immediate integer. Its least significant bit selects the half
		///    that receives \a V2: \n
		///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the
		///    result, and bits [255:128] of \a V1 are copied to bits [255:128] of
		///    the result. \n
		///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
		///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
		///    result.
		/// \returns A 256-bit vector of [4 x double] containing the combined values.
		#define _mm256_insertf128_pd(V1, V2, M) \
		  ((__m256d)__builtin_ia32_vinsertf128_pd256( \
		      (__v4df)(__m256d)(V1), \
		      (__v2df)(__m128d)(V2), \
		      (int)(M)))
		4680
		/// Builds a 256-bit integer vector by copying the 256-bit integer vector
		///    given as the first parameter and then overwriting one 128-bit half of
		///    the copy with the 128-bit integer vector given as the second
		///    parameter.
		///
		///    The immediate integer parameter selects which half (upper or lower)
		///    is overwritten.
		///
		/// \headerfile <x86intrin.h>
		///
		/// \code
		/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M);
		/// \endcode
		///
		/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
		///
		/// \param V1
		///    A 256-bit integer vector. It supplies the half of the result that is
		///    not replaced by \a V2.
		/// \param V2
		///    A 128-bit integer vector. It is written to the upper or the lower 128
		///    bits of the result, as selected by \a M.
		/// \param M
		///    An immediate integer. Its least significant bit selects the half
		///    that receives \a V2: \n
		///    If bit [0] of \a M is 0, \a V2 is copied to bits [127:0] of the
		///    result, and bits [255:128] of \a V1 are copied to bits [255:128] of
		///    the result. \n
		///    If bit [0] of \a M is 1, \a V2 is copied to bits [255:128] of the
		///    result, and bits [127:0] of \a V1 are copied to bits [127:0] of the
		///    result.
		/// \returns A 256-bit integer vector containing the combined values.
		#define _mm256_insertf128_si256(V1, V2, M) \
		  ((__m256i)__builtin_ia32_vinsertf128_si256( \
		      (__v8si)(__m256i)(V1), \
		      (__v4si)(__m128i)(V2), \
		      (int)(M)))
		4718
		4719	/*
		4720	Vector extract.
		4721	We use macros rather than inlines because we only want to accept
		4722	invocations where the immediate M is a constant expression.
		4723	*/
		/// Returns one 128-bit half of a 256-bit vector of [8 x float] as a 128-bit
		///    vector of [4 x float]. The immediate integer parameter selects the
		///    upper or the lower half.
		///
		/// \headerfile <x86intrin.h>
		///
		/// \code
		/// __m128 _mm256_extractf128_ps(__m256 V, const int M);
		/// \endcode
		///
		/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
		///
		/// \param V
		///    The 256-bit vector of [8 x float] to extract from.
		/// \param M
		///    An immediate integer. Its least significant bit selects the half of
		///    \a V that is returned: \n
		///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
		///    result. \n
		///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
		///    result.
		/// \returns A 128-bit vector of [4 x float] containing the selected bits.
		#define _mm256_extractf128_ps(V, M) \
		  ((__m128)__builtin_ia32_vextractf128_ps256( \
		      (__v8sf)(__m256)(V), \
		      (int)(M)))
		4747
		/// Returns one 128-bit half of a 256-bit vector of [4 x double] as a
		///    128-bit vector of [2 x double]. The immediate integer parameter
		///    selects the upper or the lower half.
		///
		/// \headerfile <x86intrin.h>
		///
		/// \code
		/// __m128d _mm256_extractf128_pd(__m256d V, const int M);
		/// \endcode
		///
		/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
		///
		/// \param V
		///    The 256-bit vector of [4 x double] to extract from.
		/// \param M
		///    An immediate integer. Its least significant bit selects the half of
		///    \a V that is returned: \n
		///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
		///    result. \n
		///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
		///    result.
		/// \returns A 128-bit vector of [2 x double] containing the selected bits.
		#define _mm256_extractf128_pd(V, M) \
		  ((__m128d)__builtin_ia32_vextractf128_pd256( \
		      (__v4df)(__m256d)(V), \
		      (int)(M)))
		4771
		/// Returns one 128-bit half of a 256-bit integer vector as a 128-bit
		///    integer vector. The immediate integer parameter selects the upper or
		///    the lower half.
		///
		/// \headerfile <x86intrin.h>
		///
		/// \code
		/// __m128i _mm256_extractf128_si256(__m256i V, const int M);
		/// \endcode
		///
		/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction.
		///
		/// \param V
		///    The 256-bit integer vector to extract from.
		/// \param M
		///    An immediate integer. Its least significant bit selects the half of
		///    \a V that is returned: \n
		///    If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the
		///    result. \n
		///    If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the
		///    result.
		/// \returns A 128-bit integer vector containing the selected bits.
		#define _mm256_extractf128_si256(V, M) \
		  ((__m128i)__builtin_ia32_vextractf128_si256( \
		      (__v8si)(__m256i)(V), \
		      (int)(M)))
		4795
		4796	/// Constructs a 256-bit floating-point vector of [8 x float] by
		4797	/// concatenating two 128-bit floating-point vectors of [4 x float].
		4798	///
		4799	/// \headerfile <x86intrin.h>
		4800	///
		4801	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
		4802	///
		4803	/// \param __hi
		4804	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
		4805	/// 128 bits of the result.
		4806	/// \param __lo
		4807	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
		4808	/// 128 bits of the result.
		4809	/// \returns A 256-bit floating-point vector of [8 x float] containing the
		4810	/// concatenated result.
		4811	static __inline __m256 __DEFAULT_FN_ATTRS
		4812	_mm256_set_m128 (__m128 __hi, __m128 __lo)
		4813	{
		4814	return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7);
		4815	}
		4816
		4817	/// Constructs a 256-bit floating-point vector of [4 x double] by
		4818	/// concatenating two 128-bit floating-point vectors of [2 x double].
		4819	///
		4820	/// \headerfile <x86intrin.h>
		4821	///
		4822	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
		4823	///
		4824	/// \param __hi
		4825	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
		4826	/// 128 bits of the result.
		4827	/// \param __lo
		4828	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
		4829	/// 128 bits of the result.
		4830	/// \returns A 256-bit floating-point vector of [4 x double] containing the
		4831	/// concatenated result.
		4832	static __inline __m256d __DEFAULT_FN_ATTRS
		4833	_mm256_set_m128d (__m128d __hi, __m128d __lo)
		4834	{
		4835	return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3);
		4836	}
		4837
		4838	/// Constructs a 256-bit integer vector by concatenating two 128-bit
		4839	/// integer vectors.
		4840	///
		4841	/// \headerfile <x86intrin.h>
		4842	///
		4843	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
		4844	///
		4845	/// \param __hi
		4846	/// A 128-bit integer vector to be copied to the upper 128 bits of the
		4847	/// result.
		4848	/// \param __lo
		4849	/// A 128-bit integer vector to be copied to the lower 128 bits of the
		4850	/// result.
		4851	/// \returns A 256-bit integer vector containing the concatenated result.
		4852	static __inline __m256i __DEFAULT_FN_ATTRS
		4853	_mm256_set_m128i (__m128i __hi, __m128i __lo)
		4854	{
		4855	return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3);
		4856	}
		4857
		4858	/// Constructs a 256-bit floating-point vector of [8 x float] by
		4859	/// concatenating two 128-bit floating-point vectors of [4 x float]. This is
		4860	/// similar to _mm256_set_m128, but the order of the input parameters is
		4861	/// swapped.
		4862	///
		4863	/// \headerfile <x86intrin.h>
		4864	///
		4865	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
		4866	///
		4867	/// \param __lo
		4868	/// A 128-bit floating-point vector of [4 x float] to be copied to the lower
		4869	/// 128 bits of the result.
		4870	/// \param __hi
		4871	/// A 128-bit floating-point vector of [4 x float] to be copied to the upper
		4872	/// 128 bits of the result.
		4873	/// \returns A 256-bit floating-point vector of [8 x float] containing the
		4874	/// concatenated result.
		4875	static __inline __m256 __DEFAULT_FN_ATTRS
		4876	_mm256_setr_m128 (__m128 __lo, __m128 __hi)
		4877	{
		4878	return _mm256_set_m128(__hi, __lo);
		4879	}
		4880
		4881	/// Constructs a 256-bit floating-point vector of [4 x double] by
		4882	/// concatenating two 128-bit floating-point vectors of [2 x double]. This is
		4883	/// similar to _mm256_set_m128d, but the order of the input parameters is
		4884	/// swapped.
		4885	///
		4886	/// \headerfile <x86intrin.h>
		4887	///
		4888	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
		4889	///
		4890	/// \param __lo
		4891	/// A 128-bit floating-point vector of [2 x double] to be copied to the lower
		4892	/// 128 bits of the result.
		4893	/// \param __hi
		4894	/// A 128-bit floating-point vector of [2 x double] to be copied to the upper
		4895	/// 128 bits of the result.
		4896	/// \returns A 256-bit floating-point vector of [4 x double] containing the
		4897	/// concatenated result.
		4898	static __inline __m256d __DEFAULT_FN_ATTRS
		4899	_mm256_setr_m128d (__m128d __lo, __m128d __hi)
		4900	{
		4901	return (__m256d)_mm256_set_m128d(__hi, __lo);
		4902	}
		4903
		4904	/// Constructs a 256-bit integer vector by concatenating two 128-bit
		4905	/// integer vectors. This is similar to _mm256_set_m128i, but the order of
		4906	/// the input parameters is swapped.
		4907	///
		4908	/// \headerfile <x86intrin.h>
		4909	///
		4910	/// This intrinsic corresponds to the <c> VINSERTF128 </c> instruction.
		4911	///
		4912	/// \param __lo
		4913	/// A 128-bit integer vector to be copied to the lower 128 bits of the
		4914	/// result.
		4915	/// \param __hi
		4916	/// A 128-bit integer vector to be copied to the upper 128 bits of the
		4917	/// result.
		4918	/// \returns A 256-bit integer vector containing the concatenated result.
		4919	static __inline __m256i __DEFAULT_FN_ATTRS
		4920	_mm256_setr_m128i (__m128i __lo, __m128i __hi)
		4921	{
		4922	return (__m256i)_mm256_set_m128i(__hi, __lo);
		4923	}
		4924
		4925	/* SIMD load ops (unaligned) */
		4926	/// Loads two 128-bit floating-point vectors of [4 x float] from
		4927	/// unaligned memory locations and constructs a 256-bit floating-point vector
		4928	/// of [8 x float] by concatenating the two 128-bit vectors.
		4929	///
		4930	/// \headerfile <x86intrin.h>
		4931	///
		4932	/// This intrinsic corresponds to load instructions followed by the
		4933	/// <c> VINSERTF128 </c> instruction.
		4934	///
		4935	/// \param __addr_hi
		4936	/// A pointer to a 128-bit memory location containing 4 consecutive
		4937	/// single-precision floating-point values. These values are to be copied to
		4938	/// bits[255:128] of the result. The address of the memory location does not
		4939	/// have to be aligned.
		4940	/// \param __addr_lo
		4941	/// A pointer to a 128-bit memory location containing 4 consecutive
		4942	/// single-precision floating-point values. These values are to be copied to
		4943	/// bits[127:0] of the result. The address of the memory location does not
		4944	/// have to be aligned.
		4945	/// \returns A 256-bit floating-point vector of [8 x float] containing the
		4946	/// concatenated result.
		4947	static __inline __m256 __DEFAULT_FN_ATTRS
		4948	_mm256_loadu2_m128(float const __addr_hi, float const __addr_lo)
		4949	{
		4950	return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo));
		4951	}
		4952
		4953	/// Loads two 128-bit floating-point vectors of [2 x double] from
		4954	/// unaligned memory locations and constructs a 256-bit floating-point vector
		4955	/// of [4 x double] by concatenating the two 128-bit vectors.
		4956	///
		4957	/// \headerfile <x86intrin.h>
		4958	///
		4959	/// This intrinsic corresponds to load instructions followed by the
		4960	/// <c> VINSERTF128 </c> instruction.
		4961	///
		4962	/// \param __addr_hi
		4963	/// A pointer to a 128-bit memory location containing two consecutive
		4964	/// double-precision floating-point values. These values are to be copied to
		4965	/// bits[255:128] of the result. The address of the memory location does not
		4966	/// have to be aligned.
		4967	/// \param __addr_lo
		4968	/// A pointer to a 128-bit memory location containing two consecutive
		4969	/// double-precision floating-point values. These values are to be copied to
		4970	/// bits[127:0] of the result. The address of the memory location does not
		4971	/// have to be aligned.
		4972	/// \returns A 256-bit floating-point vector of [4 x double] containing the
		4973	/// concatenated result.
		4974	static __inline __m256d __DEFAULT_FN_ATTRS
		4975	_mm256_loadu2_m128d(double const __addr_hi, double const __addr_lo)
		4976	{
		4977	return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo));
		4978	}
		4979
		4980	/// Loads two 128-bit integer vectors from unaligned memory locations and
		4981	/// constructs a 256-bit integer vector by concatenating the two 128-bit
		4982	/// vectors.
		4983	///
		4984	/// \headerfile <x86intrin.h>
		4985	///
		4986	/// This intrinsic corresponds to load instructions followed by the
		4987	/// <c> VINSERTF128 </c> instruction.
		4988	///
		4989	/// \param __addr_hi
		4990	/// A pointer to a 128-bit memory location containing a 128-bit integer
		4991	/// vector. This vector is to be copied to bits[255:128] of the result. The
		4992	/// address of the memory location does not have to be aligned.
		4993	/// \param __addr_lo
		4994	/// A pointer to a 128-bit memory location containing a 128-bit integer
		4995	/// vector. This vector is to be copied to bits[127:0] of the result. The
		4996	/// address of the memory location does not have to be aligned.
		4997	/// \returns A 256-bit integer vector containing the concatenated result.
		4998	static __inline __m256i __DEFAULT_FN_ATTRS
		4999	_mm256_loadu2_m128i(__m128i_u const __addr_hi, __m128i_u const __addr_lo)
		5000	{
		5001	return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo));
		5002	}
		5003
		5004	/* SIMD store ops (unaligned) */
		5005	/// Stores the upper and lower 128 bits of a 256-bit floating-point
		5006	/// vector of [8 x float] into two different unaligned memory locations.
		5007	///
		5008	/// \headerfile <x86intrin.h>
		5009	///
		5010	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
		5011	/// store instructions.
		5012	///
		5013	/// \param __addr_hi
		5014	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
		5015	/// copied to this memory location. The address of this memory location does
		5016	/// not have to be aligned.
		5017	/// \param __addr_lo
		5018	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
		5019	/// copied to this memory location. The address of this memory location does
		5020	/// not have to be aligned.
		5021	/// \param __a
		5022	/// A 256-bit floating-point vector of [8 x float].
		5023	static __inline void __DEFAULT_FN_ATTRS
		5024	_mm256_storeu2_m128(float __addr_hi, float __addr_lo, __m256 __a)
		5025	{
		5026	__m128 __v128;
		5027
		5028	__v128 = _mm256_castps256_ps128(__a);
		5029	_mm_storeu_ps(__addr_lo, __v128);
		5030	__v128 = _mm256_extractf128_ps(__a, 1);
		5031	_mm_storeu_ps(__addr_hi, __v128);
		5032	}
		5033
		5034	/// Stores the upper and lower 128 bits of a 256-bit floating-point
		5035	/// vector of [4 x double] into two different unaligned memory locations.
		5036	///
		5037	/// \headerfile <x86intrin.h>
		5038	///
		5039	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
		5040	/// store instructions.
		5041	///
		5042	/// \param __addr_hi
		5043	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
		5044	/// copied to this memory location. The address of this memory location does
		5045	/// not have to be aligned.
		5046	/// \param __addr_lo
		5047	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
		5048	/// copied to this memory location. The address of this memory location does
		5049	/// not have to be aligned.
		5050	/// \param __a
		5051	/// A 256-bit floating-point vector of [4 x double].
		5052	static __inline void __DEFAULT_FN_ATTRS
		5053	_mm256_storeu2_m128d(double __addr_hi, double __addr_lo, __m256d __a)
		5054	{
		5055	__m128d __v128;
		5056
		5057	__v128 = _mm256_castpd256_pd128(__a);
		5058	_mm_storeu_pd(__addr_lo, __v128);
		5059	__v128 = _mm256_extractf128_pd(__a, 1);
		5060	_mm_storeu_pd(__addr_hi, __v128);
		5061	}
		5062
		5063	/// Stores the upper and lower 128 bits of a 256-bit integer vector into
		5064	/// two different unaligned memory locations.
		5065	///
		5066	/// \headerfile <x86intrin.h>
		5067	///
		5068	/// This intrinsic corresponds to the <c> VEXTRACTF128 </c> instruction and the
		5069	/// store instructions.
		5070	///
		5071	/// \param __addr_hi
		5072	/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be
		5073	/// copied to this memory location. The address of this memory location does
		5074	/// not have to be aligned.
		5075	/// \param __addr_lo
		5076	/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be
		5077	/// copied to this memory location. The address of this memory location does
		5078	/// not have to be aligned.
		5079	/// \param __a
		5080	/// A 256-bit integer vector.
		5081	static __inline void __DEFAULT_FN_ATTRS
		5082	_mm256_storeu2_m128i(__m128i_u __addr_hi, __m128i_u __addr_lo, __m256i __a)
		5083	{
		5084	__m128i __v128;
		5085
		5086	__v128 = _mm256_castsi256_si128(__a);
		5087	_mm_storeu_si128(__addr_lo, __v128);
		5088	__v128 = _mm256_extractf128_si256(__a, 1);
		5089	_mm_storeu_si128(__addr_hi, __v128);
		5090	}
		5091
		/* Undefine the function-attribute helper macros at the end of the header. */
		#undef __DEFAULT_FN_ATTRS128
		#undef __DEFAULT_FN_ATTRS
		5094
		5095	#endif /* __AVXINTRIN_H */

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite//llvm-build/x86_64/lib/clang/16/include/avxintrin.h – Rev 14