WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – //llvm-build/x86_64/lib/clang/16/include/avxvnniint8intrin.h

Rev	Author	Line No.	Line
14	pmbaty	1	/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
		2	*
		3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		4	* See https://llvm.org/LICENSE.txt for license information.
		5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		6	*
		7	*===-----------------------------------------------------------------------===
		8	*/
		9	#ifndef __IMMINTRIN_H
		10	#error \
		11	"Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
		12	#endif
		13
		14	#ifndef __AVXVNNIINT8INTRIN_H
		15	#define __AVXVNNIINT8INTRIN_H
		16
		17	/* Default attributes for the intrinsics in this file: always inlined, no debug info, the "avxvnniint8" target feature, and a 256- or 128-bit minimum vector width. */
		18	#define __DEFAULT_FN_ATTRS256 \
		19	__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
		20	__min_vector_width__(256)))
		21	#define __DEFAULT_FN_ATTRS128 \
		22	__attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
		23	__min_vector_width__(128)))
		24
		25	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
		26	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
		27	/// signed 16-bit results. Sum these 4 results with the corresponding
		28	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
		29	///
		30	/// \headerfile <x86intrin.h>
		31	///
		32	/// \code
		33	/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
		34	/// \endcode
		35	///
		36	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
		37	///
		38	/// \param __A
		39	/// A 128-bit vector of [16 x char].
		40	/// \param __B
		41	/// A 128-bit vector of [16 x char].
		42	/// \returns
		43	/// A 128-bit vector of [4 x int].
		44	///
		45	/// \code{.operation}
		46	/// FOR j := 0 to 3
		47	/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
		48	/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
		49	/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
		50	/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
		51	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
		52	/// ENDFOR
		53	/// dst[MAX:128] := 0
		54	/// \endcode
		55	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
		56	__m128i __A,
		57	__m128i __B) {
		58	return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
		59	(__v4si)__B);
		60	}
		61
		62	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
		63	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
		64	/// signed 16-bit results. Sum these 4 results with the corresponding
		65	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
		66	///
		67	/// \headerfile <x86intrin.h>
		68	///
		69	/// \code
		70	/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
		71	/// \endcode
		72	///
		73	/// This intrinsic corresponds to the \c VPDPBSSD instruction.
		74	///
		75	/// \param __A
		76	/// A 256-bit vector of [32 x char].
		77	/// \param __B
		78	/// A 256-bit vector of [32 x char].
		79	/// \returns
		80	/// A 256-bit vector of [8 x int].
		81	///
		82	/// \code{.operation}
		83	/// FOR j := 0 to 7
		84	/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
		85	/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
		86	/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
		87	/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
		88	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
		89	/// ENDFOR
		90	/// dst[MAX:256] := 0
		91	/// \endcode
		92	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		93	_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
		94	return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
		95	(__v8si)__B);
		96	}
		97
		98	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
		99	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
		100	/// signed 16-bit results. Sum these 4 results with the corresponding
		101	/// 32-bit integer in \a __W with signed saturation, and store the packed
		102	/// 32-bit results in \a dst.
		103	///
		104	/// \headerfile <x86intrin.h>
		105	///
		106	/// \code
		107	/// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
		108	/// \endcode
		109	///
		110	/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
		111	///
		112	/// \param __A
		113	/// A 128-bit vector of [16 x char].
		114	/// \param __B
		115	/// A 128-bit vector of [16 x char].
		116	/// \returns
		117	/// A 128-bit vector of [4 x int].
		118	///
		119	/// \code{.operation}
		120	/// FOR j := 0 to 3
		121	/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
		122	/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
		123	/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
		124	/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
		125	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
		126	/// ENDFOR
		127	/// dst[MAX:128] := 0
		128	/// \endcode
		129	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
		130	__m128i __A,
		131	__m128i __B) {
		132	return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
		133	(__v4si)__B);
		134	}
		135
		136	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
		137	/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
		138	/// signed 16-bit results. Sum these 4 results with the corresponding
		139	/// 32-bit integer in \a __W with signed saturation, and store the packed
		140	/// 32-bit results in \a dst.
		141	///
		142	/// \headerfile <x86intrin.h>
		143	///
		144	/// \code
		145	/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
		146	/// \endcode
		147	///
		148	/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
		149	///
		150	/// \param __A
		151	/// A 256-bit vector of [32 x char].
		152	/// \param __B
		153	/// A 256-bit vector of [32 x char].
		154	/// \returns
		155	/// A 256-bit vector of [8 x int].
		156	///
		157	/// \code{.operation}
		158	/// FOR j := 0 to 7
		159	/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
		160	/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
		161	/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
		162	/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
		163	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
		164	/// ENDFOR
		165	/// dst[MAX:256] := 0
		166	/// \endcode
		167	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		168	_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
		169	return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
		170	(__v8si)__B);
		171	}
		172
		173	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
		174	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
		175	/// signed 16-bit results. Sum these 4 results with the corresponding
		176	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
		177	///
		178	/// \headerfile <x86intrin.h>
		179	///
		180	/// \code
		181	/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
		182	/// \endcode
		183	///
		184	/// This intrinsic corresponds to the \c VPDPBSUD instruction.
		185	///
		186	/// \param __A
		187	/// A 128-bit vector of [16 x char].
		188	/// \param __B
		189	/// A 128-bit vector of [16 x unsigned char].
		190	/// \returns
		191	/// A 128-bit vector of [4 x int].
		192	///
		193	/// \code{.operation}
		194	/// FOR j := 0 to 3
		195	/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
		196	/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
		197	/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
		198	/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
		199	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
		200	/// ENDFOR
		201	/// dst[MAX:128] := 0
		202	/// \endcode
		203	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
		204	__m128i __A,
		205	__m128i __B) {
		206	return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
		207	(__v4si)__B);
		208	}
		209
		210	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
		211	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
		212	/// signed 16-bit results. Sum these 4 results with the corresponding
		213	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
		214	///
		215	/// \headerfile <x86intrin.h>
		216	///
		217	/// \code
		218	/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
		219	/// \endcode
		220	///
		221	/// This intrinsic corresponds to the \c VPDPBSUD instruction.
		222	///
		223	/// \param __A
		224	/// A 256-bit vector of [32 x char].
		225	/// \param __B
		226	/// A 256-bit vector of [32 x unsigned char].
		227	/// \returns
		228	/// A 256-bit vector of [8 x int].
		229	///
		230	/// \code{.operation}
		231	/// FOR j := 0 to 7
		232	/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
		233	/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
		234	/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
		235	/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
		236	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
		237	/// ENDFOR
		238	/// dst[MAX:256] := 0
		239	/// \endcode
		240	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		241	_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
		242	return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
		243	(__v8si)__B);
		244	}
		245
		246	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
		247	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
		248	/// signed 16-bit results. Sum these 4 results with the corresponding
		249	/// 32-bit integer in \a __W with signed saturation, and store the packed
		250	/// 32-bit results in \a dst.
		251	///
		252	/// \headerfile <x86intrin.h>
		253	///
		254	/// \code
		255	/// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
		256	/// \endcode
		257	///
		258	/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
		259	///
		260	/// \param __A
		261	/// A 128-bit vector of [16 x char].
		262	/// \param __B
		263	/// A 128-bit vector of [16 x unsigned char].
		264	/// \returns
		265	/// A 128-bit vector of [4 x int].
		266	///
		267	/// \code{.operation}
		268	/// FOR j := 0 to 3
		269	/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
		270	/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
		271	/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
		272	/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
		273	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
		274	/// ENDFOR
		275	/// dst[MAX:128] := 0
		276	/// \endcode
		277	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
		278	__m128i __A,
		279	__m128i __B) {
		280	return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
		281	(__v4si)__B);
		282	}
		283
		284	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
		285	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
		286	/// signed 16-bit results. Sum these 4 results with the corresponding
		287	/// 32-bit integer in \a __W with signed saturation, and store the packed
		288	/// 32-bit results in \a dst.
		289	///
		290	/// \headerfile <x86intrin.h>
		291	///
		292	/// \code
		293	/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
		294	/// \endcode
		295	///
		296	/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
		297	///
		298	/// \param __A
		299	/// A 256-bit vector of [32 x char].
		300	/// \param __B
		301	/// A 256-bit vector of [32 x unsigned char].
		302	/// \returns
		303	/// A 256-bit vector of [8 x int].
		304	///
		305	/// \code{.operation}
		306	/// FOR j := 0 to 7
		307	/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
		308	/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
		309	/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
		310	/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
		311	/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
		312	/// ENDFOR
		313	/// dst[MAX:256] := 0
		314	/// \endcode
		315	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		316	_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
		317	return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
		318	(__v8si)__B);
		319	}
		320
		321	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
		322	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
		323	/// unsigned 16-bit results. Sum these 4 results with the corresponding
		324	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
		325	///
		326	/// \headerfile <x86intrin.h>
		327	///
		328	/// \code
		329	/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
		330	/// \endcode
		331	///
		332	/// This intrinsic corresponds to the \c VPDPBUUD instruction.
		333	///
		334	/// \param __A
		335	/// A 128-bit vector of [16 x unsigned char].
		336	/// \param __B
		337	/// A 128-bit vector of [16 x unsigned char].
		338	/// \returns
		339	/// A 128-bit vector of [4 x int].
		340	///
		341	/// \code{.operation}
		342	/// FOR j := 0 to 3
		343	/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
		344	/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
		345	/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
		346	/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
		347	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
		348	/// ENDFOR
		349	/// dst[MAX:128] := 0
		350	/// \endcode
		351	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
		352	__m128i __A,
		353	__m128i __B) {
		354	return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
		355	(__v4si)__B);
		356	}
		357
		358	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
		359	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
		360	/// unsigned 16-bit results. Sum these 4 results with the corresponding
		361	/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
		362	///
		363	/// \headerfile <x86intrin.h>
		364	///
		365	/// \code
		366	/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
		367	/// \endcode
		368	///
		369	/// This intrinsic corresponds to the \c VPDPBUUD instruction.
		370	///
		371	/// \param __A
		372	/// A 256-bit vector of [32 x unsigned char].
		373	/// \param __B
		374	/// A 256-bit vector of [32 x unsigned char].
		375	/// \returns
		376	/// A 256-bit vector of [8 x int].
		377	///
		378	/// \code{.operation}
		379	/// FOR j := 0 to 7
		380	/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
		381	/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
		382	/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
		383	/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
		384	/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
		385	/// ENDFOR
		386	/// dst[MAX:256] := 0
		387	/// \endcode
		388	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		389	_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
		390	return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
		391	(__v8si)__B);
		392	}
		393
		394	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
		395	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
		396	/// unsigned 16-bit results. Sum these 4 results with the corresponding
		397	/// 32-bit integer in \a __W with unsigned saturation, and store the packed
		398	/// 32-bit results in \a dst.
		399	///
		400	/// \headerfile <x86intrin.h>
		401	///
		402	/// \code
		403	/// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
		404	/// \endcode
		405	///
		406	/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
		407	///
		408	/// \param __A
		409	/// A 128-bit vector of [16 x unsigned char].
		410	/// \param __B
		411	/// A 128-bit vector of [16 x unsigned char].
		412	/// \returns
		413	/// A 128-bit vector of [4 x int].
		414	///
		415	/// \code{.operation}
		416	/// FOR j := 0 to 3
		417	/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
		418	/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
		419	/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
		420	/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
		421	/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
		422	/// ENDFOR
		423	/// dst[MAX:128] := 0
		424	/// \endcode
		425	static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
		426	__m128i __A,
		427	__m128i __B) {
		428	return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
		429	(__v4si)__B);
		430	}
		431
		432	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
		433	/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
		434	/// unsigned 16-bit results. Sum these 4 results with the corresponding
		435	/// 32-bit integer in \a __W with unsigned saturation, and store the packed
		436	/// 32-bit results in \a dst.
		437	///
		438	/// \headerfile <x86intrin.h>
		439	///
		440	/// \code
		441	/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
		442	/// \endcode
		443	///
		444	/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
		445	///
		446	/// \param __A
		447	/// A 256-bit vector of [32 x unsigned char].
		448	/// \param __B
		449	/// A 256-bit vector of [32 x unsigned char].
		450	/// \returns
		451	/// A 256-bit vector of [8 x int].
		452	///
		453	/// \code{.operation}
		454	/// FOR j := 0 to 7
		455	/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
		456	/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
		457	/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
		458	/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
		459	/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
		460	/// ENDFOR
		461	/// dst[MAX:256] := 0
		462	/// \endcode
		463	static __inline__ __m256i __DEFAULT_FN_ATTRS256
		464	_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
		465	return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
		466	(__v8si)__B);
		467	}
		468	#undef __DEFAULT_FN_ATTRS128
		469	#undef __DEFAULT_FN_ATTRS256
		470
		471	#endif // __AVXVNNIINT8INTRIN_H

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite//llvm-build/x86_64/lib/clang/16/include/avxvnniint8intrin.h – Rev 14