WebSVN – QNX 8.QNX8 LLVM/Clang compiler suite – Blame – //llvm-build/x86_64/lib/clang/16/include/amxintrin.h

Rev	Author	Line No.	Line
14	pmbaty	1	/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
		2	*
		3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
		4	* See https://llvm.org/LICENSE.txt for license information.
		5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
		6	*
		7	*===------------------------------------------------------------------------===
		8	*/
		9
		10	#ifndef __IMMINTRIN_H
		11	#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
		12	#endif /* __IMMINTRIN_H */
		13
		14	#ifndef __AMXINTRIN_H
		15	#define __AMXINTRIN_H
		16	#ifdef __x86_64__
		17
/* Define the default attributes for the functions in this file.
 *
 * Each macro expands to one __attribute__ list carrying __always_inline__,
 * __nodebug__ and a __target__ string that names a single AMX feature
 * ("amx-tile", "amx-int8", "amx-bf16" or "amx-fp16"). All four macros are
 * #undef'd again at the end of this header.
 */
#define __DEFAULT_FN_ATTRS_TILE \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_INT8 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
#define __DEFAULT_FN_ATTRS_FP16 \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
		27
		28	/// Load tile configuration from a 64-byte memory location specified by
		29	/// "mem_addr". The tile configuration includes the tile type palette, the
		30	/// number of bytes per row, and the number of rows. If the specified
		31	/// palette_id is zero, that signifies the init state for both the tile
		32	/// config and the tile data, and the tiles are zeroed. Any invalid
		33	/// configurations will result in #GP fault.
		34	///
		35	/// \headerfile <immintrin.h>
		36	///
		37	/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
		38	///
		39	/// \param __config
		40	/// A pointer to 512-bits configuration
		41	static __inline__ void __DEFAULT_FN_ATTRS_TILE
		42	_tile_loadconfig(const void *__config) {
		43	__builtin_ia32_tile_loadconfig(__config);
		44	}
		45
		46	/// Stores the current tile configuration to a 64-byte memory location
		47	/// specified by "mem_addr". The tile configuration includes the tile type
		48	/// palette, the number of bytes per row, and the number of rows. If tiles
		49	/// are not configured, all zeroes will be stored to memory.
		50	///
		51	/// \headerfile <immintrin.h>
		52	///
		53	/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
		54	///
		55	/// \param __config
		56	/// A pointer to 512-bits configuration
		57	static __inline__ void __DEFAULT_FN_ATTRS_TILE
		58	_tile_storeconfig(void *__config) {
		59	__builtin_ia32_tile_storeconfig(__config);
		60	}
		61
		62	/// Release the tile configuration to return to the init state, which
		63	/// releases all storage it currently holds.
		64	///
		65	/// \headerfile <immintrin.h>
		66	///
		67	/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
		68	static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
		69	__builtin_ia32_tilerelease();
		70	}
		71
		72	/// Load tile rows from memory specifieid by "base" address and "stride" into
		73	/// destination tile "dst" using the tile configuration previously configured
		74	/// via "_tile_loadconfig".
		75	///
		76	/// \headerfile <immintrin.h>
		77	///
		78	/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
		79	///
		80	/// \param dst
		81	/// A destination tile. Max size is 1024 Bytes.
		82	/// \param base
		83	/// A pointer to base address.
		84	/// \param stride
		85	/// The stride between the rows' data to be loaded in memory.
		86	#define _tile_loadd(dst, base, stride) \
		87	__builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
		88	(__SIZE_TYPE__)(stride))
		89
		90	/// Load tile rows from memory specifieid by "base" address and "stride" into
		91	/// destination tile "dst" using the tile configuration previously configured
		92	/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
		93	/// that the data will likely not be reused in the near future and the data
		94	/// caching can be optimized accordingly.
		95	///
		96	/// \headerfile <immintrin.h>
		97	///
		98	/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
		99	///
		100	/// \param dst
		101	/// A destination tile. Max size is 1024 Bytes.
		102	/// \param base
		103	/// A pointer to base address.
		104	/// \param stride
		105	/// The stride between the rows' data to be loaded in memory.
		106	#define _tile_stream_loadd(dst, base, stride) \
		107	__builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
		108	(__SIZE_TYPE__)(stride))
		109
		110	/// Store the tile specified by "src" to memory specifieid by "base" address and
		111	/// "stride" using the tile configuration previously configured via
		112	/// "_tile_loadconfig".
		113	///
		114	/// \headerfile <immintrin.h>
		115	///
		116	/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
		117	///
		118	/// \param dst
		119	/// A destination tile. Max size is 1024 Bytes.
		120	/// \param base
		121	/// A pointer to base address.
		122	/// \param stride
		123	/// The stride between the rows' data to be stored in memory.
		124	#define _tile_stored(dst, base, stride) \
		125	__builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
		126
		127	/// Zero the tile specified by "tdest".
		128	///
		129	/// \headerfile <immintrin.h>
		130	///
		131	/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
		132	///
		133	/// \param tile
		134	/// The destination tile to be zero. Max size is 1024 Bytes.
		135	#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
		136
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// The macro expands to one call of __builtin_ia32_tdpbssd and passes its
/// three arguments through unchanged, each wrapped in parentheses.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1) \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))
		155
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// The macro expands to one call of __builtin_ia32_tdpbsud and passes its
/// three arguments through unchanged, each wrapped in parentheses.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1) \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))
		174
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// The macro expands to one call of __builtin_ia32_tdpbusd and passes its
/// three arguments through unchanged, each wrapped in parentheses.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1) \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))
		193
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// The macro expands to one call of __builtin_ia32_tdpbuud and passes its
/// three arguments through unchanged, each wrapped in parentheses.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1) \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))
		212
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// The macro expands to one call of __builtin_ia32_tdpbf16ps and passes its
/// three arguments through unchanged, each wrapped in parentheses.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1) \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
		230
/// AMX tile register size can be configured; the maximum size is 16x64=1024
/// bytes. Since there is no 2D type in LLVM IR, a vector type is used to
/// represent a 2D tile, and its fixed size is the maximum AMX tile register
/// size. The typedef declares a 1024-byte vector of int with 64-byte
/// alignment.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
		235
		236	/// This is internal intrinsic. C/C++ user should avoid calling it directly.
		237	static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
		238	_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
		239	__SIZE_TYPE__ stride) {
		240	return __builtin_ia32_tileloadd64_internal(m, n, base,
		241	(__SIZE_TYPE__)(stride));
		242	}
		243
		244	/// This is internal intrinsic. C/C++ user should avoid calling it directly.
		245	static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
		246	_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
		247	__SIZE_TYPE__ stride) {
		248	return __builtin_ia32_tileloaddt164_internal(m, n, base,
		249	(__SIZE_TYPE__)(stride));
		250	}
		251
		252	/// This is internal intrinsic. C/C++ user should avoid calling it directly.
		253	static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
		254	_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
		255	_tile1024i dst, _tile1024i src1, _tile1024i src2) {
		256	return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
		257	}
		258
		259	/// This is internal intrinsic. C/C++ user should avoid calling it directly.
		260	static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
		261	_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
		262	_tile1024i dst, _tile1024i src1, _tile1024i src2) {
		263	return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
		264	}
		265
		266	/// This is internal intrinsic. C/C++ user should avoid calling it directly.
		267	static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
		268	_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
		269	_tile1024i dst, _tile1024i src1, _tile1024i src2) {
		270	return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
		271	}
		272
		273	/// This is internal intrinsic. C/C++ user should avoid calling it directly.
		274	static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
		275	_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
		276	_tile1024i dst, _tile1024i src1, _tile1024i src2) {
		277	return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
		278	}
		279
		280	/// This is internal intrinsic. C/C++ user should avoid calling it directly.
		281	static __inline__ void __DEFAULT_FN_ATTRS_INT8
		282	_tile_stored_internal(unsigned short m, unsigned short n, void *base,
		283	__SIZE_TYPE__ stride, _tile1024i tile) {
		284	return __builtin_ia32_tilestored64_internal(m, n, base,
		285	(__SIZE_TYPE__)(stride), tile);
		286	}
		287
		288	/// This is internal intrinsic. C/C++ user should avoid calling it directly.
		289	static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
		290	_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
		291	_tile1024i dst, _tile1024i src1, _tile1024i src2) {
		292	return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
		293	}
		294
		295	/// This is internal intrinsic. C/C++ user should avoid calling it directly.
		296	static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
		297	_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
		298	_tile1024i dst, _tile1024i src1, _tile1024i src2) {
		299	return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
		300	}
		301
/// This struct packs the shape and the tile data together for the user. We
/// suggest initializing the struct as early as possible, because the compiler
/// depends on the shape information to do the tile configuration. Constant
/// shape values are preferred, as they help the compiler optimize.
typedef struct __tile1024i_str {
  /// Row component of the tile shape; const, so it is fixed at initialization.
  const unsigned short row;
  /// Column component of the tile shape; const, so it is fixed at
  /// initialization.
  const unsigned short col;
  /// The tile data, held in the fixed-size 1024-byte vector type.
  _tile1024i tile;
} __tile1024i;
		311
		312	/// Load tile rows from memory specifieid by "base" address and "stride" into
		313	/// destination tile "dst".
		314	///
		315	/// \headerfile <immintrin.h>
		316	///
		317	/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
		318	///
		319	/// \param dst
		320	/// A destination tile. Max size is 1024 Bytes.
		321	/// \param base
		322	/// A pointer to base address.
		323	/// \param stride
		324	/// The stride between the rows' data to be loaded in memory.
		325	__DEFAULT_FN_ATTRS_TILE
		326	static __inline__ void __tile_loadd(__tile1024i dst, const void base,
		327	__SIZE_TYPE__ stride) {
		328	dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
		329	}
		330
		331	/// Load tile rows from memory specifieid by "base" address and "stride" into
		332	/// destination tile "dst". This intrinsic provides a hint to the implementation
		333	/// that the data will likely not be reused in the near future and the data
		334	/// caching can be optimized accordingly.
		335	///
		336	/// \headerfile <immintrin.h>
		337	///
		338	/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
		339	///
		340	/// \param dst
		341	/// A destination tile. Max size is 1024 Bytes.
		342	/// \param base
		343	/// A pointer to base address.
		344	/// \param stride
		345	/// The stride between the rows' data to be loaded in memory.
		346	__DEFAULT_FN_ATTRS_TILE
		347	static __inline__ void __tile_stream_loadd(__tile1024i dst, const void base,
		348	__SIZE_TYPE__ stride) {
		349	dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
		350	}
		351
		352	/// Compute dot-product of bytes in tiles with a source/destination accumulator.
		353	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
		354	/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
		355	/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
		356	/// and store the 32-bit result back to tile "dst".
		357	///
		358	/// \headerfile <immintrin.h>
		359	///
		360	/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
		361	///
		362	/// \param dst
		363	/// The destination tile. Max size is 1024 Bytes.
		364	/// \param src0
		365	/// The 1st source tile. Max size is 1024 Bytes.
		366	/// \param src1
		367	/// The 2nd source tile. Max size is 1024 Bytes.
		368	__DEFAULT_FN_ATTRS_INT8
		369	static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
		370	__tile1024i src1) {
		371	dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
		372	src0.tile, src1.tile);
		373	}
		374
		375	/// Compute dot-product of bytes in tiles with a source/destination accumulator.
		376	/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
		377	/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
		378	/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
		379	/// in "dst", and store the 32-bit result back to tile "dst".
		380	///
		381	/// \headerfile <immintrin.h>
		382	///
		383	/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
		384	///
		385	/// \param dst
		386	/// The destination tile. Max size is 1024 Bytes.
		387	/// \param src0
		388	/// The 1st source tile. Max size is 1024 Bytes.
		389	/// \param src1
		390	/// The 2nd source tile. Max size is 1024 Bytes.
		391	__DEFAULT_FN_ATTRS_INT8
		392	static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
		393	__tile1024i src1) {
		394	dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
		395	src0.tile, src1.tile);
		396	}
		397
		398	/// Compute dot-product of bytes in tiles with a source/destination accumulator.
		399	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
		400	/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
		401	/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
		402	/// and store the 32-bit result back to tile "dst".
		403	///
		404	/// \headerfile <immintrin.h>
		405	///
		406	/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
		407	///
		408	/// \param dst
		409	/// The destination tile. Max size is 1024 Bytes.
		410	/// \param src0
		411	/// The 1st source tile. Max size is 1024 Bytes.
		412	/// \param src1
		413	/// The 2nd source tile. Max size is 1024 Bytes.
		414	__DEFAULT_FN_ATTRS_INT8
		415	static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
		416	__tile1024i src1) {
		417	dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
		418	src0.tile, src1.tile);
		419	}
		420
		421	/// Compute dot-product of bytes in tiles with a source/destination accumulator.
		422	/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
		423	/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
		424	/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
		425	/// "dst", and store the 32-bit result back to tile "dst".
		426	///
		427	/// \headerfile <immintrin.h>
		428	///
		429	/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
		430	///
		431	/// \param dst
		432	/// The destination tile. Max size is 1024 Bytes.
		433	/// \param src0
		434	/// The 1st source tile. Max size is 1024 Bytes.
		435	/// \param src1
		436	/// The 2nd source tile. Max size is 1024 Bytes.
		437	__DEFAULT_FN_ATTRS_INT8
		438	static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
		439	__tile1024i src1) {
		440	dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
		441	src0.tile, src1.tile);
		442	}
		443
		444	/// Store the tile specified by "src" to memory specifieid by "base" address and
		445	/// "stride".
		446	///
		447	/// \headerfile <immintrin.h>
		448	///
		449	/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
		450	///
		451	/// \param base
		452	/// A pointer to base address.
		453	/// \param stride
		454	/// The stride between the rows' data to be stored in memory.
		455	__DEFAULT_FN_ATTRS_TILE
		456	static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
		457	__tile1024i src) {
		458	_tile_stored_internal(src.row, src.col, base, stride, src.tile);
		459	}
		460
		461	/// Zero the tile specified by "dst".
		462	///
		463	/// \headerfile <immintrin.h>
		464	///
		465	/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
		466	///
		467	/// \param dst
		468	/// The destination tile to be zero. Max size is 1024 Bytes.
		469	__DEFAULT_FN_ATTRS_TILE
		470	static __inline__ void __tile_zero(__tile1024i *dst) {
		471	dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
		472	}
		473
		474	/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
		475	/// src1, accumulating the intermediate single-precision (32-bit) floating-point
		476	/// elements with elements in "dst", and store the 32-bit result back to tile
		477	/// "dst".
		478	///
		479	/// \headerfile <immintrin.h>
		480	///
		481	/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
		482	///
		483	/// \param dst
		484	/// The destination tile. Max size is 1024 Bytes.
		485	/// \param src0
		486	/// The 1st source tile. Max size is 1024 Bytes.
		487	/// \param src1
		488	/// The 2nd source tile. Max size is 1024 Bytes.
		489	__DEFAULT_FN_ATTRS_BF16
		490	static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
		491	__tile1024i src1) {
		492	dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
		493	src0.tile, src1.tile);
		494	}
		495
		496	/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
		497	/// src1, accumulating the intermediate single-precision (32-bit) floating-point
		498	/// elements with elements in "dst", and store the 32-bit result back to tile
		499	/// "dst".
		500	///
		501	/// \headerfile <immintrin.h>
		502	///
		503	/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
		504	///
		505	/// \param dst
		506	/// The destination tile. Max size is 1024 Bytes.
		507	/// \param src0
		508	/// The 1st source tile. Max size is 1024 Bytes.
		509	/// \param src1
		510	/// The 2nd source tile. Max size is 1024 Bytes.
		511	__DEFAULT_FN_ATTRS_FP16
		512	static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
		513	__tile1024i src1) {
		514	dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
		515	src0.tile, src1.tile);
		516	}
		517
/* Remove the four attribute helper macros defined near the top of this
 * header so they do not stay defined after it has been included. */
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
#undef __DEFAULT_FN_ATTRS_FP16
		522
		523	#endif /* __x86_64__ */
		524	#endif /* __AMXINTRIN_H */

Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

QNX 8.QNX8 LLVM/Clang compiler suite//llvm-build/x86_64/lib/clang/16/include/amxintrin.h – Rev 14