WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – /llvm-build/x86_64/lib/clang/16/include/avx512vlbf16intrin.h

Rev	Author	Line No.	Line
14	pmbaty	1	/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
		2	*
		3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		4	* See https://llvm.org/LICENSE.txt for license information.
		5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		6	*
		7	*===-----------------------------------------------------------------------===
		8	*/
		9	#ifndef __IMMINTRIN_H
		10	#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
		11	#endif
		12
		13	#ifdef __SSE2__
		14
		15	#ifndef __AVX512VLBF16INTRIN_H
		16	#define __AVX512VLBF16INTRIN_H
		17	/* Default attributes for the 128-bit and 256-bit intrinsics in this header: __always_inline__, __nodebug__, __target__("avx512vl, avx512bf16") and the matching __min_vector_width__. */
		18	#define __DEFAULT_FN_ATTRS128 \
		19	__attribute__((__always_inline__, __nodebug__, \
		20	__target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
		21	#define __DEFAULT_FN_ATTRS256 \
		22	__attribute__((__always_inline__, __nodebug__, \
		23	__target__("avx512vl, avx512bf16"), __min_vector_width__(256)))
		24
		25	/// Convert Two Packed Single Data to One Packed BF16 Data.
		26	///
		27	/// \headerfile <x86intrin.h>
		28	///
		29	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
		30	///
		31	/// \param __A
		32	/// A 128-bit vector of [4 x float].
		33	/// \param __B
		34	/// A 128-bit vector of [4 x float].
		35	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from the
		36	/// conversion of __B, and whose upper 64 bits come from the conversion of __A.
		37	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
		38	_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
		39	return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
		40	(__v4sf) __B);
		41	}
		42
		43	/// Convert Two Packed Single Data to One Packed BF16 Data, using a merging mask.
		44	///
		45	/// \headerfile <x86intrin.h>
		46	///
		47	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
		48	///
		49	/// \param __A
		50	/// A 128-bit vector of [4 x float].
		51	/// \param __B
		52	/// A 128-bit vector of [4 x float].
		53	/// \param __W
		54	/// A 128-bit vector of [8 x bfloat].
		55	/// \param __U
		56	/// An 8-bit mask value specifying what is chosen for each element.
		57	/// A 1 selects the converted element of __A or __B. A 0 selects the element from __W.
		58	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from the
		59	/// conversion of __B, and whose upper 64 bits come from the conversion of __A.
		60	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
		61	_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
		62	return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
		63	(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
		64	(__v8bf)__W);
		65	}
		66
		67	/// Convert Two Packed Single Data to One Packed BF16 Data, using a zeroing mask.
		68	///
		69	/// \headerfile <x86intrin.h>
		70	///
		71	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
		72	///
		73	/// \param __A
		74	/// A 128-bit vector of [4 x float].
		75	/// \param __B
		76	/// A 128-bit vector of [4 x float].
		77	/// \param __U
		78	/// An 8-bit mask value specifying what is chosen for each element.
		79	/// A 1 selects the converted element of __A or __B. A 0 sets the element to zero.
		80	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from the
		81	/// conversion of __B, and whose upper 64 bits come from the conversion of __A.
		82	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
		83	_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
		84	return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
		85	(__v8bf)_mm_cvtne2ps_pbh(__A, __B),
		86	(__v8bf)_mm_setzero_si128());
		87	}
		88
		89	/// Convert Two Packed Single Data to One Packed BF16 Data.
		90	///
		91	/// \headerfile <x86intrin.h>
		92	///
		93	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
		94	///
		95	/// \param __A
		96	/// A 256-bit vector of [8 x float].
		97	/// \param __B
		98	/// A 256-bit vector of [8 x float].
		99	/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from the
		100	/// conversion of __B, and whose upper 128 bits come from the conversion of __A.
		101	static __inline__ __m256bh __DEFAULT_FN_ATTRS256
		102	_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
		103	return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
		104	(__v8sf) __B);
		105	}
		106
		107	/// Convert Two Packed Single Data to One Packed BF16 Data, using a merging mask.
		108	///
		109	/// \headerfile <x86intrin.h>
		110	///
		111	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
		112	///
		113	/// \param __A
		114	/// A 256-bit vector of [8 x float].
		115	/// \param __B
		116	/// A 256-bit vector of [8 x float].
		117	/// \param __W
		118	/// A 256-bit vector of [16 x bfloat].
		119	/// \param __U
		120	/// A 16-bit mask value specifying what is chosen for each element.
		121	/// A 1 selects the converted element of __A or __B. A 0 selects the element from __W.
		122	/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from the
		123	/// conversion of __B, and whose upper 128 bits come from the conversion of __A.
		124	static __inline__ __m256bh __DEFAULT_FN_ATTRS256
		125	_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
		126	return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
		127	(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
		128	(__v16bf)__W);
		129	}
		130
		131	/// Convert Two Packed Single Data to One Packed BF16 Data, using a zeroing mask.
		132	///
		133	/// \headerfile <x86intrin.h>
		134	///
		135	/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
		136	///
		137	/// \param __A
		138	/// A 256-bit vector of [8 x float].
		139	/// \param __B
		140	/// A 256-bit vector of [8 x float].
		141	/// \param __U
		142	/// A 16-bit mask value specifying what is chosen for each element.
		143	/// A 1 selects the converted element of __A or __B. A 0 sets the element to zero.
		144	/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from the
		145	/// conversion of __B, and whose upper 128 bits come from the conversion of __A.
		146	static __inline__ __m256bh __DEFAULT_FN_ATTRS256
		147	_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
		148	return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
		149	(__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
		150	(__v16bf)_mm256_setzero_si256());
		151	}
		152
		153	/// Convert Packed Single Data to Packed BF16 Data.
		154	///
		155	/// \headerfile <x86intrin.h>
		156	///
		157	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
		158	///
		159	/// \param A
		160	/// A 128-bit vector of [4 x float].
		161	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from the
		162	/// conversion of A, and whose upper 64 bits are 0.
		163	#define _mm_cvtneps_pbh(A) \
		164	((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
		165
		166	/// Convert Packed Single Data to Packed BF16 Data, using a merging mask.
		167	///
		168	/// \headerfile <x86intrin.h>
		169	///
		170	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
		171	///
		172	/// \param __A
		173	/// A 128-bit vector of [4 x float].
		174	/// \param __W
		175	/// A 128-bit vector of [8 x bfloat].
		176	/// \param __U
		177	/// A 4-bit mask value specifying what is chosen for each element.
		178	/// A 1 selects the converted element of __A. A 0 selects the element from __W.
		179	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from the
		180	/// conversion of __A, and whose upper 64 bits are 0.
		181	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
		182	_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
		183	return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
		184	(__v8bf)__W,
		185	(__mmask8)__U);
		186	}
		187
		188	/// Convert Packed Single Data to Packed BF16 Data, using a zeroing mask.
		189	///
		190	/// \headerfile <x86intrin.h>
		191	///
		192	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
		193	///
		194	/// \param __A
		195	/// A 128-bit vector of [4 x float].
		196	/// \param __U
		197	/// A 4-bit mask value specifying what is chosen for each element.
		198	/// A 1 selects the converted element of __A. A 0 sets the element to zero.
		199	/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from the
		200	/// conversion of __A, and whose upper 64 bits are 0.
		201	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
		202	_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
		203	return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
		204	(__v8bf)_mm_setzero_si128(),
		205	(__mmask8)__U);
		206	}
		207
		208	/// Convert Packed Single Data to Packed BF16 Data.
		209	///
		210	/// \headerfile <x86intrin.h>
		211	///
		212	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
		213	///
		214	/// \param A
		215	/// A 256-bit vector of [8 x float].
		216	/// \returns A 128-bit vector of [8 x bfloat] resulting from the conversion of A.
		217	#define _mm256_cvtneps_pbh(A) \
		218	((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
		219
		220	/// Convert Packed Single Data to Packed BF16 Data, using a merging mask.
		221	///
		222	/// \headerfile <x86intrin.h>
		223	///
		224	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
		225	///
		226	/// \param __A
		227	/// A 256-bit vector of [8 x float].
		228	/// \param __W
		229	/// A 128-bit vector of [8 x bfloat].
		230	/// \param __U
		231	/// An 8-bit mask value specifying what is chosen for each element.
		232	/// A 1 selects the converted element of __A. A 0 selects the element from __W.
		233	/// \returns A 128-bit vector of [8 x bfloat] resulting from the conversion of __A.
		234	static __inline__ __m128bh __DEFAULT_FN_ATTRS256
		235	_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
		236	return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
		237	(__v8bf)__W,
		238	(__mmask8)__U);
		239	}
		240
		241	/// Convert Packed Single Data to Packed BF16 Data, using a zeroing mask.
		242	///
		243	/// \headerfile <x86intrin.h>
		244	///
		245	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
		246	///
		247	/// \param __A
		248	/// A 256-bit vector of [8 x float].
		249	/// \param __U
		250	/// An 8-bit mask value specifying what is chosen for each element.
		251	/// A 1 selects the converted element of __A. A 0 sets the element to zero.
		252	/// \returns A 128-bit vector of [8 x bfloat] resulting from the conversion of __A.
		253	static __inline__ __m128bh __DEFAULT_FN_ATTRS256
		254	_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
		255	return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
		256	(__v8bf)_mm_setzero_si128(),
		257	(__mmask8)__U);
		258	}
		259
		260	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
		261	///
		262	/// \headerfile <x86intrin.h>
		263	///
		264	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
		265	///
		266	/// \param __A
		267	/// A 128-bit vector of [8 x bfloat].
		268	/// \param __B
		269	/// A 128-bit vector of [8 x bfloat].
		270	/// \param __D
		271	/// A 128-bit vector of [4 x float] used as the accumulator.
		272	/// \returns A 128-bit vector of [4 x float] resulting from the dot product of
		273	/// __A and __B accumulated with __D.
		274	static __inline__ __m128 __DEFAULT_FN_ATTRS128
		275	_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
		276	return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
		277	(__v8bf)__A,
		278	(__v8bf)__B);
		279	}
		280
		281	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision, using a merging mask.
		282	///
		283	/// \headerfile <x86intrin.h>
		284	///
		285	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
		286	///
		287	/// \param __A
		288	/// A 128-bit vector of [8 x bfloat].
		289	/// \param __B
		290	/// A 128-bit vector of [8 x bfloat].
		291	/// \param __D
		292	/// A 128-bit vector of [4 x float].
		293	/// \param __U
		294	/// An 8-bit mask value specifying what is chosen for each element.
		295	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
		296	/// \returns A 128-bit vector of [4 x float] resulting from the dot product of
		297	/// __A and __B accumulated with __D.
		298	static __inline__ __m128 __DEFAULT_FN_ATTRS128
		299	_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
		300	return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
		301	(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
		302	(__v4sf)__D);
		303	}
		304
		305	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision, using a zeroing mask.
		306	///
		307	/// \headerfile <x86intrin.h>
		308	///
		309	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
		310	///
		311	/// \param __A
		312	/// A 128-bit vector of [8 x bfloat].
		313	/// \param __B
		314	/// A 128-bit vector of [8 x bfloat].
		315	/// \param __D
		316	/// A 128-bit vector of [4 x float].
		317	/// \param __U
		318	/// An 8-bit mask value specifying what is chosen for each element.
		319	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
		320	/// \returns A 128-bit vector of [4 x float] resulting from the dot product of
		321	/// __A and __B accumulated with __D.
		322	static __inline__ __m128 __DEFAULT_FN_ATTRS128
		323	_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
		324	return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
		325	(__v4sf)_mm_dpbf16_ps(__D, __A, __B),
		326	(__v4sf)_mm_setzero_si128());
		327	}
		328
		329	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
		330	///
		331	/// \headerfile <x86intrin.h>
		332	///
		333	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
		334	///
		335	/// \param __A
		336	/// A 256-bit vector of [16 x bfloat].
		337	/// \param __B
		338	/// A 256-bit vector of [16 x bfloat].
		339	/// \param __D
		340	/// A 256-bit vector of [8 x float] used as the accumulator.
		341	/// \returns A 256-bit vector of [8 x float] resulting from the dot product of
		342	/// __A and __B accumulated with __D.
		343	static __inline__ __m256 __DEFAULT_FN_ATTRS256
		344	_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
		345	return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
		346	(__v16bf)__A,
		347	(__v16bf)__B);
		348	}
		349
		350	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision, using a merging mask.
		351	///
		352	/// \headerfile <x86intrin.h>
		353	///
		354	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
		355	///
		356	/// \param __A
		357	/// A 256-bit vector of [16 x bfloat].
		358	/// \param __B
		359	/// A 256-bit vector of [16 x bfloat].
		360	/// \param __D
		361	/// A 256-bit vector of [8 x float].
		362	/// \param __U
		363	/// An 8-bit mask value specifying what is chosen for each element.
		364	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
		365	/// \returns A 256-bit vector of [8 x float] resulting from the dot product of
		366	/// __A and __B accumulated with __D.
		367	static __inline__ __m256 __DEFAULT_FN_ATTRS256
		368	_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
		369	return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
		370	(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
		371	(__v8sf)__D);
		372	}
		373
		374	/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision, using a zeroing mask.
		375	///
		376	/// \headerfile <x86intrin.h>
		377	///
		378	/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
		379	///
		380	/// \param __A
		381	/// A 256-bit vector of [16 x bfloat].
		382	/// \param __B
		383	/// A 256-bit vector of [16 x bfloat].
		384	/// \param __D
		385	/// A 256-bit vector of [8 x float].
		386	/// \param __U
		387	/// An 8-bit mask value specifying what is chosen for each element.
		388	/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
		389	/// \returns A 256-bit vector of [8 x float] resulting from the dot product of
		390	/// __A and __B accumulated with __D.
		391	static __inline__ __m256 __DEFAULT_FN_ATTRS256
		392	_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
		393	return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
		394	(__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
		395	(__v8sf)_mm256_setzero_si256());
		396	}
		397
		398	/// Convert One Single float Data to One BF16 Data.
		399	///
		400	/// \headerfile <x86intrin.h>
		401	///
		402	/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
		403	///
		404	/// \param __A
		405	/// A float data. It is placed in element 0 of a [4 x float] vector whose other elements are 0.
		406	/// \returns Element 0 of the converted vector, as a bf16 value with a 7-bit fraction field.
		407	/// NOTE(review): previously described as plain truncation that keeps sign and exponent; the "NE" in VCVTNEPS2BF16 suggests round-to-nearest-even - confirm against the Intel SDM.
		408	static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
		409	__v4sf __V = {__A, 0, 0, 0};
		410	__v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
		411	(__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
		412	return (__bf16)__R[0];
		413	}
		414
		415	/// Convert Packed BF16 Data to Packed float Data.
		416	///
		417	/// \headerfile <x86intrin.h>
		418	///
		419	/// \param __A
		420	/// A 128-bit vector of [4 x bfloat].
		421	/// \returns A 128-bit vector of [4 x float] resulting from the conversion of __A.
		422	static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
		423	return _mm_castsi128_ps(
		424	(__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
		425	}
		426
		427	/// Convert Packed BF16 Data to Packed float Data.
		428	///
		429	/// \headerfile <x86intrin.h>
		430	///
		431	/// \param __A
		432	/// A 128-bit vector of [8 x bfloat].
		433	/// \returns A 256-bit vector of [8 x float] resulting from the conversion of __A.
		434	static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
		435	return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
		436	(__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
		437	}
		438
		439	/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
		440	///
		441	/// \headerfile <x86intrin.h>
		442	///
		443	/// \param __U
		444	/// A 4-bit mask. Elements are zeroed out when the corresponding mask
		445	/// bit is not set.
		446	/// \param __A
		447	/// A 128-bit vector of [4 x bfloat].
		448	/// \returns A 128-bit vector of [4 x float] resulting from the conversion of __A.
		449	static __inline__ __m128 __DEFAULT_FN_ATTRS128
		450	_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
		451	return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
		452	(__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
		453	}
		454
		455	/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
		456	///
		457	/// \headerfile <x86intrin.h>
		458	///
		459	/// \param __U
		460	/// An 8-bit mask. Elements are zeroed out when the corresponding mask
		461	/// bit is not set.
		462	/// \param __A
		463	/// A 128-bit vector of [8 x bfloat].
		464	/// \returns A 256-bit vector of [8 x float] resulting from the conversion of __A.
		465	static __inline__ __m256 __DEFAULT_FN_ATTRS256
		466	_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
		467	return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
		468	(__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
		469	}
		470
		471	/// Convert Packed BF16 Data to Packed float Data using merging mask.
		472	///
		473	/// \headerfile <x86intrin.h>
		474	///
		475	/// \param __S
		476	/// A 128-bit vector of [4 x float]. Elements are copied from __S when
		477	/// the corresponding mask bit is not set.
		478	/// \param __U
		479	/// A 4-bit mask. Elements are taken from __S (not zeroed) when the
		480	/// corresponding mask bit is not set.
		481	/// \param __A
		482	/// A 128-bit vector of [4 x bfloat].
		483	/// \returns A 128-bit vector of [4 x float] resulting from the conversion of __A.
		484	static __inline__ __m128 __DEFAULT_FN_ATTRS128
		485	_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
		486	return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
		487	(__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
		488	16));
		489	}
		490
		491	/// Convert Packed BF16 Data to Packed float Data using merging mask.
		492	///
		493	/// \headerfile <x86intrin.h>
		494	///
		495	/// \param __S
		496	/// A 256-bit vector of [8 x float]. Elements are copied from __S when
		497	/// the corresponding mask bit is not set.
		498	/// \param __U
		499	/// An 8-bit mask. Elements are taken from __S (not zeroed) when the
		500	/// corresponding mask bit is not set.
		501	/// \param __A
		502	/// A 128-bit vector of [8 x bfloat].
		503	/// \returns A 256-bit vector of [8 x float] resulting from the conversion of __A.
		504	static __inline__ __m256 __DEFAULT_FN_ATTRS256
		505	_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
		506	return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
		507	(__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
		508	16));
		509	}
		510
		511	#undef __DEFAULT_FN_ATTRS128
		512	#undef __DEFAULT_FN_ATTRS256
		513
		514	#endif
		515	#endif

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite/llvm-build/x86_64/lib/clang/16/include/avx512vlbf16intrin.h – Rev 14