WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – //llvm-build/x86_64/lib/clang/16/include/avxneconvertintrin.h

Rev	Author	Line No.	Line
14	pmbaty	1	/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
		2	*
		3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		4	* See https://llvm.org/LICENSE.txt for license information.
		5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		6	*
		7	*===-----------------------------------------------------------------------===
		8	*/
		9
		10	#ifndef __IMMINTRIN_H
		11	#error \
		12	"Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
		13	#endif // __IMMINTRIN_H
		14
		15	#ifdef __SSE2__
		16
		17	#ifndef __AVXNECONVERTINTRIN_H
		18	#define __AVXNECONVERTINTRIN_H
		19
		20	/* Default attributes (always-inline, nodebug, "avxneconvert" target) for the 128-/256-bit intrinsics below. */
		21	#define __DEFAULT_FN_ATTRS128 \
		22	__attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
		23	__min_vector_width__(128)))
		24	#define __DEFAULT_FN_ATTRS256 \
		25	__attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
		26	__min_vector_width__(256)))
		27
		28	/// Convert the scalar BF16 (16-bit) floating-point element stored at the
		29	/// memory location \a __A to a single-precision (32-bit) floating-point
		30	/// value, broadcast it to all packed single-precision (32-bit)
		31	/// floating-point elements, and return the result (\c dst in the pseudocode
		32	/// below).
		33	///
		34	/// \headerfile <immintrin.h>
		35	///
		36	/// \code
		37	/// __m128 _mm_bcstnebf16_ps(const void *__A);
		38	/// \endcode
		39	///
		40	/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
		41	///
		42	/// \param __A
		43	///    A pointer to a 16-bit memory location. The address of the memory
		44	///    location does not have to be aligned.
		45	/// \returns
		46	///    A 128-bit vector of [4 x float].
		47	///
		48	/// \code{.operation}
		49	/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
		50	/// FOR j := 0 to 3
		51	///   m := j*32
		52	///   dst[m+31:m] := b
		53	/// ENDFOR
		54	/// dst[MAX:128] := 0
		55	/// \endcode
		56	static __inline__ __m128 __DEFAULT_FN_ATTRS128
		57	_mm_bcstnebf16_ps(const void *__A) {
		58	return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
		59	}
		60
		61	/// Convert the scalar BF16 (16-bit) floating-point element stored at the
		62	/// memory location \a __A to a single-precision (32-bit) floating-point
		63	/// value, broadcast it to all packed single-precision (32-bit)
		64	/// floating-point elements, and return the result (\c dst in the pseudocode
		65	/// below).
		66	///
		67	/// \headerfile <immintrin.h>
		68	///
		69	/// \code
		70	/// __m256 _mm256_bcstnebf16_ps(const void *__A);
		71	/// \endcode
		72	///
		73	/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
		74	///
		75	/// \param __A
		76	///    A pointer to a 16-bit memory location. The address of the memory
		77	///    location does not have to be aligned.
		78	/// \returns
		79	///    A 256-bit vector of [8 x float].
		80	///
		81	/// \code{.operation}
		82	/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
		83	/// FOR j := 0 to 7
		84	///   m := j*32
		85	///   dst[m+31:m] := b
		86	/// ENDFOR
		87	/// dst[MAX:256] := 0
		88	/// \endcode
		89	static __inline__ __m256 __DEFAULT_FN_ATTRS256
		90	_mm256_bcstnebf16_ps(const void *__A) {
		91	return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
		92	}
		93
		94	/// Convert the scalar half-precision (16-bit) floating-point element stored
		95	/// at the memory location \a __A to a single-precision (32-bit)
		96	/// floating-point value, broadcast it to all packed single-precision (32-bit)
		97	/// floating-point elements, and return the result (\c dst in the pseudocode
		98	/// below).
		99	///
		100	/// \headerfile <immintrin.h>
		101	///
		102	/// \code
		103	/// __m128 _mm_bcstnesh_ps(const void *__A);
		104	/// \endcode
		105	///
		106	/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
		107	///
		108	/// \param __A
		109	///    A pointer to a 16-bit memory location. The address of the memory
		110	///    location does not have to be aligned.
		111	/// \returns
		112	///    A 128-bit vector of [4 x float].
		113	///
		114	/// \code{.operation}
		115	/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
		116	/// FOR j := 0 to 3
		117	///   m := j*32
		118	///   dst[m+31:m] := b
		119	/// ENDFOR
		120	/// dst[MAX:128] := 0
		121	/// \endcode
		122	static __inline__ __m128 __DEFAULT_FN_ATTRS128
		123	_mm_bcstnesh_ps(const void *__A) {
		124	return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
		125	}
		126
		127	/// Convert the scalar half-precision (16-bit) floating-point element stored
		128	/// at the memory location \a __A to a single-precision (32-bit)
		129	/// floating-point value, broadcast it to all packed single-precision (32-bit)
		130	/// floating-point elements, and return the result (\c dst in the pseudocode
		131	/// below).
		132	///
		133	/// \headerfile <immintrin.h>
		134	///
		135	/// \code
		136	/// __m256 _mm256_bcstnesh_ps(const void *__A);
		137	/// \endcode
		138	///
		139	/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
		140	///
		141	/// \param __A
		142	///    A pointer to a 16-bit memory location. The address of the memory
		143	///    location does not have to be aligned.
		144	/// \returns
		145	///    A 256-bit vector of [8 x float].
		146	///
		147	/// \code{.operation}
		148	/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
		149	/// FOR j := 0 to 7
		150	///   m := j*32
		151	///   dst[m+31:m] := b
		152	/// ENDFOR
		153	/// dst[MAX:256] := 0
		154	/// \endcode
		155	static __inline__ __m256 __DEFAULT_FN_ATTRS256
		156	_mm256_bcstnesh_ps(const void *__A) {
		157	return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
		158	}
		159
		160	/// Convert the even-indexed BF16 (16-bit) floating-point elements stored in
		161	/// memory starting at location \a __A to packed single-precision (32-bit)
		162	/// floating-point elements, and return the results (\c dst in the
		163	/// pseudocode below).
		164	///
		165	/// \headerfile <immintrin.h>
		166	///
		167	/// \code
		168	/// __m128 _mm_cvtneebf16_ps(const __m128bh *__A);
		169	/// \endcode
		170	///
		171	/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
		172	///
		173	/// \param __A
		174	///    A pointer to a 128-bit memory location containing 8 consecutive
		175	///    BF16 (16-bit) floating-point values.
		176	/// \returns
		177	///    A 128-bit vector of [4 x float].
		178	///
		179	/// \code{.operation}
		180	/// FOR j := 0 to 3
		181	///   k := j*2
		182	///   i := k*16
		183	///   m := j*32
		184	///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
		185	/// ENDFOR
		186	/// dst[MAX:128] := 0
		187	/// \endcode
		188	static __inline__ __m128 __DEFAULT_FN_ATTRS128
		189	_mm_cvtneebf16_ps(const __m128bh *__A) {
		190	return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
		191	}
		192
		193	/// Convert the even-indexed BF16 (16-bit) floating-point elements stored in
		194	/// memory starting at location \a __A to packed single-precision (32-bit)
		195	/// floating-point elements, and return the results (\c dst in the
		196	/// pseudocode below).
		197	///
		198	/// \headerfile <immintrin.h>
		199	///
		200	/// \code
		201	/// __m256 _mm256_cvtneebf16_ps(const __m256bh *__A);
		202	/// \endcode
		203	///
		204	/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
		205	///
		206	/// \param __A
		207	///    A pointer to a 256-bit memory location containing 16 consecutive
		208	///    BF16 (16-bit) floating-point values.
		209	/// \returns
		210	///    A 256-bit vector of [8 x float].
		211	///
		212	/// \code{.operation}
		213	/// FOR j := 0 to 7
		214	///   k := j*2
		215	///   i := k*16
		216	///   m := j*32
		217	///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
		218	/// ENDFOR
		219	/// dst[MAX:256] := 0
		220	/// \endcode
		221	static __inline__ __m256 __DEFAULT_FN_ATTRS256
		222	_mm256_cvtneebf16_ps(const __m256bh *__A) {
		223	return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
		224	}
		225
		226	/// Convert the even-indexed half-precision (16-bit) floating-point elements
		227	/// stored in memory starting at location \a __A to packed single-precision
		228	/// (32-bit) floating-point elements, and return the results (\c dst in the
		229	/// pseudocode below).
		230	///
		231	/// \headerfile <immintrin.h>
		232	///
		233	/// \code
		234	/// __m128 _mm_cvtneeph_ps(const __m128h *__A);
		235	/// \endcode
		236	///
		237	/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
		238	///
		239	/// \param __A
		240	///    A pointer to a 128-bit memory location containing 8 consecutive
		241	///    half-precision (16-bit) floating-point values.
		242	/// \returns
		243	///    A 128-bit vector of [4 x float].
		244	///
		245	/// \code{.operation}
		246	/// FOR j := 0 to 3
		247	///   k := j*2
		248	///   i := k*16
		249	///   m := j*32
		250	///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
		251	/// ENDFOR
		252	/// dst[MAX:128] := 0
		253	/// \endcode
		254	static __inline__ __m128 __DEFAULT_FN_ATTRS128
		255	_mm_cvtneeph_ps(const __m128h *__A) {
		256	return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
		257	}
		258
		259	/// Convert the even-indexed half-precision (16-bit) floating-point elements
		260	/// stored in memory starting at location \a __A to packed single-precision
		261	/// (32-bit) floating-point elements, and return the results (\c dst in the
		262	/// pseudocode below).
		263	///
		264	/// \headerfile <immintrin.h>
		265	///
		266	/// \code
		267	/// __m256 _mm256_cvtneeph_ps(const __m256h *__A);
		268	/// \endcode
		269	///
		270	/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
		271	///
		272	/// \param __A
		273	///    A pointer to a 256-bit memory location containing 16 consecutive
		274	///    half-precision (16-bit) floating-point values.
		275	/// \returns
		276	///    A 256-bit vector of [8 x float].
		277	///
		278	/// \code{.operation}
		279	/// FOR j := 0 to 7
		280	///   k := j*2
		281	///   i := k*16
		282	///   m := j*32
		283	///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
		284	/// ENDFOR
		285	/// dst[MAX:256] := 0
		286	/// \endcode
		287	static __inline__ __m256 __DEFAULT_FN_ATTRS256
		288	_mm256_cvtneeph_ps(const __m256h *__A) {
		289	return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
		290	}
		291
		292	/// Convert the odd-indexed BF16 (16-bit) floating-point elements stored in
		293	/// memory starting at location \a __A to packed single-precision (32-bit)
		294	/// floating-point elements, and return the results (\c dst in the
		295	/// pseudocode below).
		296	///
		297	/// \headerfile <immintrin.h>
		298	///
		299	/// \code
		300	/// __m128 _mm_cvtneobf16_ps(const __m128bh *__A);
		301	/// \endcode
		302	///
		303	/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
		304	///
		305	/// \param __A
		306	///    A pointer to a 128-bit memory location containing 8 consecutive
		307	///    BF16 (16-bit) floating-point values.
		308	/// \returns
		309	///    A 128-bit vector of [4 x float].
		310	///
		311	/// \code{.operation}
		312	/// FOR j := 0 to 3
		313	///   k := j*2+1
		314	///   i := k*16
		315	///   m := j*32
		316	///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
		317	/// ENDFOR
		318	/// dst[MAX:128] := 0
		319	/// \endcode
		320	static __inline__ __m128 __DEFAULT_FN_ATTRS128
		321	_mm_cvtneobf16_ps(const __m128bh *__A) {
		322	return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
		323	}
		324
		325	/// Convert the odd-indexed BF16 (16-bit) floating-point elements stored in
		326	/// memory starting at location \a __A to packed single-precision (32-bit)
		327	/// floating-point elements, and return the results (\c dst in the
		328	/// pseudocode below).
		329	///
		330	/// \headerfile <immintrin.h>
		331	///
		332	/// \code
		333	/// __m256 _mm256_cvtneobf16_ps(const __m256bh *__A);
		334	/// \endcode
		335	///
		336	/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
		337	///
		338	/// \param __A
		339	///    A pointer to a 256-bit memory location containing 16 consecutive
		340	///    BF16 (16-bit) floating-point values.
		341	/// \returns
		342	///    A 256-bit vector of [8 x float].
		343	///
		344	/// \code{.operation}
		345	/// FOR j := 0 to 7
		346	///   k := j*2+1
		347	///   i := k*16
		348	///   m := j*32
		349	///   dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
		350	/// ENDFOR
		351	/// dst[MAX:256] := 0
		352	/// \endcode
		353	static __inline__ __m256 __DEFAULT_FN_ATTRS256
		354	_mm256_cvtneobf16_ps(const __m256bh *__A) {
		355	return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
		356	}
		357
		358	/// Convert the odd-indexed half-precision (16-bit) floating-point elements
		359	/// stored in memory starting at location \a __A to packed single-precision
		360	/// (32-bit) floating-point elements, and return the results (\c dst in the
		361	/// pseudocode below).
		362	///
		363	/// \headerfile <immintrin.h>
		364	///
		365	/// \code
		366	/// __m128 _mm_cvtneoph_ps(const __m128h *__A);
		367	/// \endcode
		368	///
		369	/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
		370	///
		371	/// \param __A
		372	///    A pointer to a 128-bit memory location containing 8 consecutive
		373	///    half-precision (16-bit) floating-point values.
		374	/// \returns
		375	///    A 128-bit vector of [4 x float].
		376	///
		377	/// \code{.operation}
		378	/// FOR j := 0 to 3
		379	///   k := j*2+1
		380	///   i := k*16
		381	///   m := j*32
		382	///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
		383	/// ENDFOR
		384	/// dst[MAX:128] := 0
		385	/// \endcode
		386	static __inline__ __m128 __DEFAULT_FN_ATTRS128
		387	_mm_cvtneoph_ps(const __m128h *__A) {
		388	return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
		389	}
		390
		391	/// Convert the odd-indexed half-precision (16-bit) floating-point elements
		392	/// stored in memory starting at location \a __A to packed single-precision
		393	/// (32-bit) floating-point elements, and return the results (\c dst in the
		394	/// pseudocode below).
		395	///
		396	/// \headerfile <immintrin.h>
		397	///
		398	/// \code
		399	/// __m256 _mm256_cvtneoph_ps(const __m256h *__A);
		400	/// \endcode
		401	///
		402	/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
		403	///
		404	/// \param __A
		405	///    A pointer to a 256-bit memory location containing 16 consecutive
		406	///    half-precision (16-bit) floating-point values.
		407	/// \returns
		408	///    A 256-bit vector of [8 x float].
		409	///
		410	/// \code{.operation}
		411	/// FOR j := 0 to 7
		412	///   k := j*2+1
		413	///   i := k*16
		414	///   m := j*32
		415	///   dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
		416	/// ENDFOR
		417	/// dst[MAX:256] := 0
		418	/// \endcode
		419	static __inline__ __m256 __DEFAULT_FN_ATTRS256
		420	_mm256_cvtneoph_ps(const __m256h *__A) {
		421	return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
		422	}
		423
		424	/// Convert packed single-precision (32-bit) floating-point elements in \a __A
		425	/// to packed BF16 (16-bit) floating-point elements, and return the results
		426	/// (\c dst in the pseudocode below).
		427	///
		428	/// \headerfile <immintrin.h>
		429	///
		430	/// \code
		431	/// __m128bh _mm_cvtneps_avx_pbh(__m128 __A);
		432	/// \endcode
		433	///
		434	/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
		435	///
		436	/// \param __A
		437	///    A 128-bit vector of [4 x float].
		438	/// \returns
		439	///    A 128-bit vector of [8 x bfloat]; elements 0..3 hold the converted values.
		440	///
		441	/// \code{.operation}
		442	/// FOR j := 0 to 3
		443	///   dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
		444	/// ENDFOR
		445	/// dst[MAX:128] := 0
		446	/// \endcode
		447	static __inline__ __m128bh __DEFAULT_FN_ATTRS128
		448	_mm_cvtneps_avx_pbh(__m128 __A) {
		449	return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
		450	}
		451
		452	/// Convert packed single-precision (32-bit) floating-point elements in \a __A
		453	/// to packed BF16 (16-bit) floating-point elements, and return the results
		454	/// (\c dst in the pseudocode below).
		455	///
		456	/// \headerfile <immintrin.h>
		457	///
		458	/// \code
		459	/// __m128bh _mm256_cvtneps_avx_pbh(__m256 __A);
		460	/// \endcode
		461	///
		462	/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
		463	///
		464	/// \param __A
		465	///    A 256-bit vector of [8 x float].
		466	/// \returns
		467	///    A 128-bit vector of [8 x bfloat].
		468	///
		469	/// \code{.operation}
		470	/// FOR j := 0 to 7
		471	///   dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
		472	/// ENDFOR
		473	/// dst[MAX:128] := 0
		474	/// \endcode
		475	static __inline__ __m128bh __DEFAULT_FN_ATTRS256
		476	_mm256_cvtneps_avx_pbh(__m256 __A) {
		477	return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
		478	}
		479
		480	#undef __DEFAULT_FN_ATTRS128
		481	#undef __DEFAULT_FN_ATTRS256
		482
		483	#endif // __AVXNECONVERTINTRIN_H
		484	#endif // __SSE2__

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite//llvm-build/x86_64/lib/clang/16/include/avxneconvertintrin.h – Rev 14