WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – /llvm-build/x86_64/lib/clang/16/include/avx512vlvnniintrin.h

Rev	Author	Line No.	Line
14	pmbaty	1	/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
		2	*
		3	*
		4	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		5	* See https://llvm.org/LICENSE.txt for license information.
		6	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		7	*
		8	*===-----------------------------------------------------------------------===
		9	*/
		10	#ifndef __IMMINTRIN_H
		11	#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
		12	#endif
		13
		14	#ifndef __AVX512VLVNNIINTRIN_H
		15	#define __AVX512VLVNNIINTRIN_H
		16
		17	/* Default attributes for the inline functions in this file; the 128/256 suffix matches the __min_vector_width__ value. */
		18	#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
		19	#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
		20
		21	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
		22	/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
		23	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
		24	/// in \a S, and store the packed 32-bit results in DST.
		25	///
		26	/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
		27	///
		28	/// \code{.operation}
		29	/// FOR j := 0 to 7
		30	///    tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
		31	///    tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
		32	///    tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
		33	///    tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
		34	///    DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
		35	/// ENDFOR
		36	/// DST[MAX:256] := 0
		37	/// \endcode
		38	#define _mm256_dpbusd_epi32(S, A, B) \
		39	((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
		40
		41	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
		42	/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
		43	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
		44	/// in \a S using signed saturation, and store the packed 32-bit results in DST.
		45	///
		46	/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
		47	///
		48	/// \code{.operation}
		49	/// FOR j := 0 to 7
		50	///    tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
		51	///    tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
		52	///    tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
		53	///    tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
		54	///    DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
		55	/// ENDFOR
		56	/// DST[MAX:256] := 0
		57	/// \endcode
		58	#define _mm256_dpbusds_epi32(S, A, B) \
		59	((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
		60
		61	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
		62	/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
		63	/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
		64	/// and store the packed 32-bit results in DST.
		65	///
		66	/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
		67	///
		68	/// \code{.operation}
		69	/// FOR j := 0 to 7
		70	///    tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
		71	///    tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
		72	///    DST.dword[j] := S.dword[j] + tmp1 + tmp2
		73	/// ENDFOR
		74	/// DST[MAX:256] := 0
		75	/// \endcode
		76	#define _mm256_dpwssd_epi32(S, A, B) \
		77	((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
		78
		79	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
		80	/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
		81	/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
		82	/// using signed saturation, and store the packed 32-bit results in DST.
		83	///
		84	/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
		85	///
		86	/// \code{.operation}
		87	/// FOR j := 0 to 7
		88	///    tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
		89	///    tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
		90	///    DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
		91	/// ENDFOR
		92	/// DST[MAX:256] := 0
		93	/// \endcode
		94	#define _mm256_dpwssds_epi32(S, A, B) \
		95	((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B)))
		96
		97	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
		98	/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
		99	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
		100	/// in \a S, and store the packed 32-bit results in DST.
		101	///
		102	/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
		103	///
		104	/// \code{.operation}
		105	/// FOR j := 0 to 3
		106	///    tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
		107	///    tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
		108	///    tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
		109	///    tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
		110	///    DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
		111	/// ENDFOR
		112	/// DST[MAX:128] := 0
		113	/// \endcode
		114	#define _mm_dpbusd_epi32(S, A, B) \
		115	((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
		116
		117	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
		118	/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
		119	/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
		120	/// in \a S using signed saturation, and store the packed 32-bit results in DST.
		121	///
		122	/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
		123	///
		124	/// \code{.operation}
		125	/// FOR j := 0 to 3
		126	///    tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
		127	///    tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
		128	///    tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
		129	///    tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
		130	///    DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
		131	/// ENDFOR
		132	/// DST[MAX:128] := 0
		133	/// \endcode
		134	#define _mm_dpbusds_epi32(S, A, B) \
		135	((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
		136
		137	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
		138	/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
		139	/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
		140	/// and store the packed 32-bit results in DST.
		141	///
		142	/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
		143	///
		144	/// \code{.operation}
		145	/// FOR j := 0 to 3
		146	///    tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
		147	///    tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
		148	///    DST.dword[j] := S.dword[j] + tmp1 + tmp2
		149	/// ENDFOR
		150	/// DST[MAX:128] := 0
		151	/// \endcode
		152	#define _mm_dpwssd_epi32(S, A, B) \
		153	((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
		154
		155	/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
		156	/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
		157	/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
		158	/// using signed saturation, and store the packed 32-bit results in DST.
		159	///
		160	/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
		161	///
		162	/// \code{.operation}
		163	/// FOR j := 0 to 3
		164	///    tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
		165	///    tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
		166	///    DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
		167	/// ENDFOR
		168	/// DST[MAX:128] := 0
		169	/// \endcode
		170	#define _mm_dpwssds_epi32(S, A, B) \
		171	((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B)))
		172
		173	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		174	_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
		175	{
		176	  __v8si __dot = (__v8si)_mm256_dpbusd_epi32(__S, __A, __B); /* unmasked result */
		177	  __v8si __src = (__v8si)__S; /* original __S, the select's other input */
		178	  return (__m256i)__builtin_ia32_selectd_256(__U, __dot, __src);
		179	}
		180
		181	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		182	_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
		183	{
		184	  __v8si __dot = (__v8si)_mm256_dpbusd_epi32(__S, __A, __B); /* unmasked result */
		185	  __v8si __zero = (__v8si)_mm256_setzero_si256(); /* zero vector, the select's other input */
		186	  return (__m256i)__builtin_ia32_selectd_256(__U, __dot, __zero);
		187	}
		188
		189	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		190	_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
		191	{
		192	  __v8si __dot = (__v8si)_mm256_dpbusds_epi32(__S, __A, __B); /* unmasked result */
		193	  __v8si __src = (__v8si)__S; /* original __S, the select's other input */
		194	  return (__m256i)__builtin_ia32_selectd_256(__U, __dot, __src);
		195	}
		196
		197	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		198	_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
		199	{
		200	  __v8si __dot = (__v8si)_mm256_dpbusds_epi32(__S, __A, __B); /* unmasked result */
		201	  __v8si __zero = (__v8si)_mm256_setzero_si256(); /* zero vector, the select's other input */
		202	  return (__m256i)__builtin_ia32_selectd_256(__U, __dot, __zero);
		203	}
		204
		205	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		206	_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
		207	{
		208	  __v8si __dot = (__v8si)_mm256_dpwssd_epi32(__S, __A, __B); /* unmasked result */
		209	  __v8si __src = (__v8si)__S; /* original __S, the select's other input */
		210	  return (__m256i)__builtin_ia32_selectd_256(__U, __dot, __src);
		211	}
		212
		213	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		214	_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
		215	{
		216	  __v8si __dot = (__v8si)_mm256_dpwssd_epi32(__S, __A, __B); /* unmasked result */
		217	  __v8si __zero = (__v8si)_mm256_setzero_si256(); /* zero vector, the select's other input */
		218	  return (__m256i)__builtin_ia32_selectd_256(__U, __dot, __zero);
		219	}
		220
		221	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		222	_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
		223	{
		224	  __v8si __dot = (__v8si)_mm256_dpwssds_epi32(__S, __A, __B); /* unmasked result */
		225	  __v8si __src = (__v8si)__S; /* original __S, the select's other input */
		226	  return (__m256i)__builtin_ia32_selectd_256(__U, __dot, __src);
		227	}
		228
		229	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		230	_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
		231	{
		232	  __v8si __dot = (__v8si)_mm256_dpwssds_epi32(__S, __A, __B); /* unmasked result */
		233	  __v8si __zero = (__v8si)_mm256_setzero_si256(); /* zero vector, the select's other input */
		234	  return (__m256i)__builtin_ia32_selectd_256(__U, __dot, __zero);
		235	}
		236
		237	static __inline__ __m128i __DEFAULT_FN_ATTRS128
		238	_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
		239	{
		240	  __v4si __dot = (__v4si)_mm_dpbusd_epi32(__S, __A, __B); /* unmasked result */
		241	  __v4si __src = (__v4si)__S; /* original __S, the select's other input */
		242	  return (__m128i)__builtin_ia32_selectd_128(__U, __dot, __src);
		243	}
		244
		245	static __inline__ __m128i __DEFAULT_FN_ATTRS128
		246	_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
		247	{
		248	  __v4si __dot = (__v4si)_mm_dpbusd_epi32(__S, __A, __B); /* unmasked result */
		249	  __v4si __zero = (__v4si)_mm_setzero_si128(); /* zero vector, the select's other input */
		250	  return (__m128i)__builtin_ia32_selectd_128(__U, __dot, __zero);
		251	}
		252
		253	static __inline__ __m128i __DEFAULT_FN_ATTRS128
		254	_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
		255	{
		256	  __v4si __dot = (__v4si)_mm_dpbusds_epi32(__S, __A, __B); /* unmasked result */
		257	  __v4si __src = (__v4si)__S; /* original __S, the select's other input */
		258	  return (__m128i)__builtin_ia32_selectd_128(__U, __dot, __src);
		259	}
		260
		261	static __inline__ __m128i __DEFAULT_FN_ATTRS128
		262	_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
		263	{
		264	  __v4si __dot = (__v4si)_mm_dpbusds_epi32(__S, __A, __B); /* unmasked result */
		265	  __v4si __zero = (__v4si)_mm_setzero_si128(); /* zero vector, the select's other input */
		266	  return (__m128i)__builtin_ia32_selectd_128(__U, __dot, __zero);
		267	}
		268
		269	static __inline__ __m128i __DEFAULT_FN_ATTRS128
		270	_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
		271	{
		272	  __v4si __dot = (__v4si)_mm_dpwssd_epi32(__S, __A, __B); /* unmasked result */
		273	  __v4si __src = (__v4si)__S; /* original __S, the select's other input */
		274	  return (__m128i)__builtin_ia32_selectd_128(__U, __dot, __src);
		275	}
		276
		277	static __inline__ __m128i __DEFAULT_FN_ATTRS128
		278	_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
		279	{
		280	  __v4si __dot = (__v4si)_mm_dpwssd_epi32(__S, __A, __B); /* unmasked result */
		281	  __v4si __zero = (__v4si)_mm_setzero_si128(); /* zero vector, the select's other input */
		282	  return (__m128i)__builtin_ia32_selectd_128(__U, __dot, __zero);
		283	}
		284
		285	static __inline__ __m128i __DEFAULT_FN_ATTRS128
		286	_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
		287	{
		288	  __v4si __dot = (__v4si)_mm_dpwssds_epi32(__S, __A, __B); /* unmasked result */
		289	  __v4si __src = (__v4si)__S; /* original __S, the select's other input */
		290	  return (__m128i)__builtin_ia32_selectd_128(__U, __dot, __src);
		291	}
		292
		293	static __inline__ __m128i __DEFAULT_FN_ATTRS128
		294	_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
		295	{
		296	  __v4si __dot = (__v4si)_mm_dpwssds_epi32(__S, __A, __B); /* unmasked result */
		297	  __v4si __zero = (__v4si)_mm_setzero_si128(); /* zero vector, the select's other input */
		298	  return (__m128i)__builtin_ia32_selectd_128(__U, __dot, __zero);
		299	}
		300
		301	#undef __DEFAULT_FN_ATTRS128
		302	#undef __DEFAULT_FN_ATTRS256
		303
		304	#endif /* __AVX512VLVNNIINTRIN_H */

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite/llvm-build/x86_64/lib/clang/16/include/avx512vlvnniintrin.h – Rev 14