WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – //llvm-build/x86_64/lib/clang/16/include/avx512bf16intrin.h

Rev	Author	Line No.	Line
14	pmbaty	1	/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------===
		2	*
		3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		4	* See https://llvm.org/LICENSE.txt for license information.
		5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		6	*
		7	*===-----------------------------------------------------------------------===
		8	*/
		9	#ifndef __IMMINTRIN_H
		10	#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
		11	#endif
		12
		13	#ifdef __SSE2__
		14
		15	#ifndef __AVX512BF16INTRIN_H
		16	#define __AVX512BF16INTRIN_H
		17
		18	typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
		19	typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
		20	typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
		21
		22	#define __DEFAULT_FN_ATTRS512 \
		23	__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
		24	__min_vector_width__(512)))
		25	#define __DEFAULT_FN_ATTRS \
		26	__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16")))
		27
		28	/// Convert One BF16 Data to One Single Float Data.
		29	///
		30	/// \headerfile <x86intrin.h>
		31	///
		32	/// This intrinsic does not correspond to a specific instruction.
		33	///
		34	/// \param __A
		35	/// A bfloat data.
		36	/// \returns A float data whose sign field and exponent field keep unchanged,
		37	/// and fraction field is extended to 23 bits.
		38	static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
		39	return __builtin_ia32_cvtsbf162ss_32(__A);
		40	}
		41
		42	/// Convert Two Packed Single Data to One Packed BF16 Data.
		43	///
		44	/// \headerfile <x86intrin.h>
		45	///
		46	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
		47	///
		48	/// \param __A
		49	/// A 512-bit vector of [16 x float].
		50	/// \param __B
		51	/// A 512-bit vector of [16 x float].
		52	/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
		53	/// conversion of __B, and higher 256 bits come from conversion of __A.
		54	static __inline__ __m512bh __DEFAULT_FN_ATTRS512
		55	_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
		56	return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A,
		57	(__v16sf) __B);
		58	}
		59
		60	/// Convert Two Packed Single Data to One Packed BF16 Data.
		61	///
		62	/// \headerfile <x86intrin.h>
		63	///
		64	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
		65	///
		66	/// \param __A
		67	/// A 512-bit vector of [16 x float].
		68	/// \param __B
		69	/// A 512-bit vector of [16 x float].
		70	/// \param __W
		71	/// A 512-bit vector of [32 x bfloat].
		72	/// \param __U
		73	/// A 32-bit mask value specifying what is chosen for each element.
		74	/// A 1 means conversion of __A or __B. A 0 means element from __W.
		75	/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
		76	/// conversion of __B, and higher 256 bits come from conversion of __A.
		77	static __inline__ __m512bh __DEFAULT_FN_ATTRS512
		78	_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
		79	return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
		80	(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
		81	(__v32bf)__W);
		82	}
		83
		84	/// Convert Two Packed Single Data to One Packed BF16 Data.
		85	///
		86	/// \headerfile <x86intrin.h>
		87	///
		88	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
		89	///
		90	/// \param __A
		91	/// A 512-bit vector of [16 x float].
		92	/// \param __B
		93	/// A 512-bit vector of [16 x float].
		94	/// \param __U
		95	/// A 32-bit mask value specifying what is chosen for each element.
		96	/// A 1 means conversion of __A or __B. A 0 means element is zero.
		97	/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from
		98	/// conversion of __B, and higher 256 bits come from conversion of __A.
		99	static __inline__ __m512bh __DEFAULT_FN_ATTRS512
		100	_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
		101	return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
		102	(__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
		103	(__v32bf)_mm512_setzero_si512());
		104	}
		105
		106	/// Convert Packed Single Data to Packed BF16 Data.
		107	///
		108	/// \headerfile <x86intrin.h>
		109	///
		110	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
		111	///
		112	/// \param __A
		113	/// A 512-bit vector of [16 x float].
		114	/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
		115	static __inline__ __m256bh __DEFAULT_FN_ATTRS512
		116	_mm512_cvtneps_pbh(__m512 __A) {
		117	return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
		118	(__v16bf)_mm256_undefined_si256(),
		119	(__mmask16)-1);
		120	}
		121
		122	/// Convert Packed Single Data to Packed BF16 Data.
		123	///
		124	/// \headerfile <x86intrin.h>
		125	///
		126	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
		127	///
		128	/// \param __A
		129	/// A 512-bit vector of [16 x float].
		130	/// \param __W
		131	/// A 256-bit vector of [16 x bfloat].
		132	/// \param __U
		133	/// A 16-bit mask value specifying what is chosen for each element.
		134	/// A 1 means conversion of __A. A 0 means element from __W.
		135	/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
		136	static __inline__ __m256bh __DEFAULT_FN_ATTRS512
		137	_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
		138	return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
		139	(__v16bf)__W,
		140	(__mmask16)__U);
		141	}
		142
		143	/// Convert Packed Single Data to Packed BF16 Data.
		144	///
		145	/// \headerfile <x86intrin.h>
		146	///
		147	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
		148	///
		149	/// \param __A
		150	/// A 512-bit vector of [16 x float].
		151	/// \param __U
		152	/// A 16-bit mask value specifying what is chosen for each element.
		153	/// A 1 means conversion of __A. A 0 means element is zero.
		154	/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A.
		155	static __inline__ __m256bh __DEFAULT_FN_ATTRS512
		156	_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
		157	return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
		158	(__v16bf)_mm256_setzero_si256(),
		159	(__mmask16)__U);
		160	}
		161
		162	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
		163	///
		164	/// \headerfile <x86intrin.h>
		165	///
		166	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
		167	///
		168	/// \param __A
		169	/// A 512-bit vector of [32 x bfloat].
		170	/// \param __B
		171	/// A 512-bit vector of [32 x bfloat].
		172	/// \param __D
		173	/// A 512-bit vector of [16 x float].
		174	/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
		175	/// __A, __B and __D
		176	static __inline__ __m512 __DEFAULT_FN_ATTRS512
		177	_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
		178	return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
		179	(__v32bf) __A,
		180	(__v32bf) __B);
		181	}
		182
		183	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
		184	///
		185	/// \headerfile <x86intrin.h>
		186	///
		187	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
		188	///
		189	/// \param __A
		190	/// A 512-bit vector of [32 x bfloat].
		191	/// \param __B
		192	/// A 512-bit vector of [32 x bfloat].
		193	/// \param __D
		194	/// A 512-bit vector of [16 x float].
		195	/// \param __U
		196	/// A 16-bit mask value specifying what is chosen for each element.
		197	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
		198	/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
		199	/// __A, __B and __D
		200	static __inline__ __m512 __DEFAULT_FN_ATTRS512
		201	_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) {
		202	return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
		203	(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
		204	(__v16sf)__D);
		205	}
		206
		207	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
		208	///
		209	/// \headerfile <x86intrin.h>
		210	///
		211	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
		212	///
		213	/// \param __A
		214	/// A 512-bit vector of [32 x bfloat].
		215	/// \param __B
		216	/// A 512-bit vector of [32 x bfloat].
		217	/// \param __D
		218	/// A 512-bit vector of [16 x float].
		219	/// \param __U
		220	/// A 16-bit mask value specifying what is chosen for each element.
		221	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
		222	/// \returns A 512-bit vector of [16 x float] comes from Dot Product of
		223	/// __A, __B and __D
		224	static __inline__ __m512 __DEFAULT_FN_ATTRS512
		225	_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) {
		226	return (__m512)__builtin_ia32_selectps_512((__mmask16)__U,
		227	(__v16sf)_mm512_dpbf16_ps(__D, __A, __B),
		228	(__v16sf)_mm512_setzero_si512());
		229	}
		230
		231	/// Convert Packed BF16 Data to Packed float Data.
		232	///
		233	/// \headerfile <x86intrin.h>
		234	///
		235	/// \param __A
		236	/// A 256-bit vector of [16 x bfloat].
		237	/// \returns A 512-bit vector of [16 x float] come from conversion of __A
		238	static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) {
		239	return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
		240	(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
		241	}
		242
		243	/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
		244	///
		245	/// \headerfile <x86intrin.h>
		246	///
		247	/// \param __U
		248	/// A 16-bit mask. Elements are zeroed out when the corresponding mask
		249	/// bit is not set.
		250	/// \param __A
		251	/// A 256-bit vector of [16 x bfloat].
		252	/// \returns A 512-bit vector of [16 x float] come from conversion of __A
		253	static __inline__ __m512 __DEFAULT_FN_ATTRS512
		254	_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) {
		255	return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32(
		256	(__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16));
		257	}
		258
		259	/// Convert Packed BF16 Data to Packed float Data using merging mask.
		260	///
		261	/// \headerfile <x86intrin.h>
		262	///
		263	/// \param __S
		264	/// A 512-bit vector of [16 x float]. Elements are copied from __S when
		265	/// the corresponding mask bit is not set.
		266	/// \param __U
		267	/// A 16-bit mask.
		268	/// \param __A
		269	/// A 256-bit vector of [16 x bfloat].
		270	/// \returns A 512-bit vector of [16 x float] come from conversion of __A
		271	static __inline__ __m512 __DEFAULT_FN_ATTRS512
		272	_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
		273	return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32(
		274	(__m512i)__S, (__mmask16)__U,
		275	(__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16));
		276	}
		277
		278	#undef __DEFAULT_FN_ATTRS
		279	#undef __DEFAULT_FN_ATTRS512
		280
		281	#endif
		282	#endif

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite//llvm-build/x86_64/lib/clang/16/include/avx512bf16intrin.h – Rev 14