Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===------------------------------------------------------------------------===
  8.  */
  9.  
  10. #ifndef __IMMINTRIN_H
  11. #error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
  12. #endif /* __IMMINTRIN_H */
  13.  
  14. #ifndef __AMXINTRIN_H
  15. #define __AMXINTRIN_H
  16. #ifdef __x86_64__
  17.  
  18. /* Define the default attributes for the functions in this file. */
  19. #define __DEFAULT_FN_ATTRS_TILE                                                \
  20.   __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
  21. #define __DEFAULT_FN_ATTRS_INT8                                                \
  22.   __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
  23. #define __DEFAULT_FN_ATTRS_BF16                                                \
  24.   __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
  25. #define __DEFAULT_FN_ATTRS_FP16                                                \
  26.   __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
  27.  
  28. /// Load tile configuration from a 64-byte memory location specified by
  29. /// "mem_addr". The tile configuration includes the tile type palette, the
  30. /// number of bytes per row, and the number of rows. If the specified
  31. /// palette_id is zero, that signifies the init state for both the tile
  32. /// config and the tile data, and the tiles are zeroed. Any invalid
  33. /// configurations will result in #GP fault.
  34. ///
  35. /// \headerfile <immintrin.h>
  36. ///
  37. /// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
  38. ///
  39. /// \param __config
  40. ///    A pointer to 512-bits configuration
  41. static __inline__ void __DEFAULT_FN_ATTRS_TILE
  42. _tile_loadconfig(const void *__config) {
  43.   __builtin_ia32_tile_loadconfig(__config);
  44. }
  45.  
  46. /// Stores the current tile configuration to a 64-byte memory location
  47. /// specified by "mem_addr". The tile configuration includes the tile type
  48. /// palette, the number of bytes per row, and the number of rows. If tiles
  49. /// are not configured, all zeroes will be stored to memory.
  50. ///
  51. /// \headerfile <immintrin.h>
  52. ///
  53. /// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
  54. ///
  55. /// \param __config
  56. ///    A pointer to 512-bits configuration
  57. static __inline__ void __DEFAULT_FN_ATTRS_TILE
  58. _tile_storeconfig(void *__config) {
  59.   __builtin_ia32_tile_storeconfig(__config);
  60. }
  61.  
  62. /// Release the tile configuration to return to the init state, which
  63. /// releases all storage it currently holds.
  64. ///
  65. /// \headerfile <immintrin.h>
  66. ///
  67. /// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
  68. static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
  69.   __builtin_ia32_tilerelease();
  70. }
  71.  
  72. /// Load tile rows from memory specifieid by "base" address and "stride" into
  73. /// destination tile "dst" using the tile configuration previously configured
  74. /// via "_tile_loadconfig".
  75. ///
  76. /// \headerfile <immintrin.h>
  77. ///
  78. /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
  79. ///
  80. /// \param dst
  81. ///    A destination tile. Max size is 1024 Bytes.
  82. /// \param base
  83. ///    A pointer to base address.
  84. /// \param stride
  85. ///    The stride between the rows' data to be loaded in memory.
  86. #define _tile_loadd(dst, base, stride)                                         \
  87.   __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
  88.                              (__SIZE_TYPE__)(stride))
  89.  
  90. /// Load tile rows from memory specifieid by "base" address and "stride" into
  91. /// destination tile "dst" using the tile configuration previously configured
  92. /// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
  93. /// that the data will likely not be reused in the near future and the data
  94. /// caching can be optimized accordingly.
  95. ///
  96. /// \headerfile <immintrin.h>
  97. ///
  98. /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
  99. ///
  100. /// \param dst
  101. ///    A destination tile. Max size is 1024 Bytes.
  102. /// \param base
  103. ///    A pointer to base address.
  104. /// \param stride
  105. ///    The stride between the rows' data to be loaded in memory.
  106. #define _tile_stream_loadd(dst, base, stride)                                  \
  107.   __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
  108.                                (__SIZE_TYPE__)(stride))
  109.  
  110. /// Store the tile specified by "src" to memory specifieid by "base" address and
  111. /// "stride" using the tile configuration previously configured via
  112. /// "_tile_loadconfig".
  113. ///
  114. /// \headerfile <immintrin.h>
  115. ///
  116. /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
  117. ///
  118. /// \param dst
  119. ///    A destination tile. Max size is 1024 Bytes.
  120. /// \param base
  121. ///    A pointer to base address.
  122. /// \param stride
  123. ///    The stride between the rows' data to be stored in memory.
  124. #define _tile_stored(dst, base, stride)                                        \
  125.   __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
  126.  
  127. /// Zero the tile specified by "tdest".
  128. ///
  129. /// \headerfile <immintrin.h>
  130. ///
  131. /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
  132. ///
  133. /// \param tile
  134. ///    The destination tile to be zero. Max size is 1024 Bytes.
  135. #define _tile_zero(tile) __builtin_ia32_tilezero((tile))
  136.  
  137. /// Compute dot-product of bytes in tiles with a source/destination accumulator.
  138. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
  139. /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
  140. /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
  141. /// and store the 32-bit result back to tile "dst".
  142. ///
  143. /// \headerfile <immintrin.h>
  144. ///
  145. /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
  146. ///
  147. /// \param dst
  148. ///    The destination tile. Max size is 1024 Bytes.
  149. /// \param src0
  150. ///    The 1st source tile. Max size is 1024 Bytes.
  151. /// \param src1
  152. ///    The 2nd source tile. Max size is 1024 Bytes.
  153. #define _tile_dpbssd(dst, src0, src1)                                          \
  154.   __builtin_ia32_tdpbssd((dst), (src0), (src1))
  155.  
  156. /// Compute dot-product of bytes in tiles with a source/destination accumulator.
  157. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
  158. /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
  159. /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
  160. /// in "dst", and store the 32-bit result back to tile "dst".
  161. ///
  162. /// \headerfile <immintrin.h>
  163. ///
  164. /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
  165. ///
  166. /// \param dst
  167. ///    The destination tile. Max size is 1024 Bytes.
  168. /// \param src0
  169. ///    The 1st source tile. Max size is 1024 Bytes.
  170. /// \param src1
  171. ///    The 2nd source tile. Max size is 1024 Bytes.
  172. #define _tile_dpbsud(dst, src0, src1)                                          \
  173.   __builtin_ia32_tdpbsud((dst), (src0), (src1))
  174.  
  175. /// Compute dot-product of bytes in tiles with a source/destination accumulator.
  176. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
  177. /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
  178. /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
  179. /// and store the 32-bit result back to tile "dst".
  180. ///
  181. /// \headerfile <immintrin.h>
  182. ///
  183. /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
  184. ///
  185. /// \param dst
  186. ///    The destination tile. Max size is 1024 Bytes.
  187. /// \param src0
  188. ///    The 1st source tile. Max size is 1024 Bytes.
  189. /// \param src1
  190. ///    The 2nd source tile. Max size is 1024 Bytes.
  191. #define _tile_dpbusd(dst, src0, src1)                                          \
  192.   __builtin_ia32_tdpbusd((dst), (src0), (src1))
  193.  
  194. /// Compute dot-product of bytes in tiles with a source/destination accumulator.
  195. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
  196. /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
  197. /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
  198. /// "dst", and store the 32-bit result back to tile "dst".
  199. ///
  200. /// \headerfile <immintrin.h>
  201. ///
  202. /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
  203. ///
  204. /// \param dst
  205. ///    The destination tile. Max size is 1024 Bytes.
  206. /// \param src0
  207. ///    The 1st source tile. Max size is 1024 Bytes.
  208. /// \param src1
  209. ///    The 2nd source tile. Max size is 1024 Bytes.
  210. #define _tile_dpbuud(dst, src0, src1)                                          \
  211.   __builtin_ia32_tdpbuud((dst), (src0), (src1))
  212.  
  213. /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
  214. /// src1, accumulating the intermediate single-precision (32-bit) floating-point
  215. /// elements with elements in "dst", and store the 32-bit result back to tile
  216. /// "dst".
  217. ///
  218. /// \headerfile <immintrin.h>
  219. ///
  220. /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
  221. ///
  222. /// \param dst
  223. ///    The destination tile. Max size is 1024 Bytes.
  224. /// \param src0
  225. ///    The 1st source tile. Max size is 1024 Bytes.
  226. /// \param src1
  227. ///    The 2nd source tile. Max size is 1024 Bytes.
  228. #define _tile_dpbf16ps(dst, src0, src1)                                        \
  229.   __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
  230.  
  231. /// AMX tile register size can be configured, the maximum size is 16x64=1024
  232. /// bytes. Since there is no 2D type in llvm IR, we use vector type to
  233. /// represent 2D tile and the fixed size is maximum amx tile register size.
  234. typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
  235.  
  236. /// This is internal intrinsic. C/C++ user should avoid calling it directly.
  237. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
  238. _tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
  239.                      __SIZE_TYPE__ stride) {
  240.   return __builtin_ia32_tileloadd64_internal(m, n, base,
  241.                                              (__SIZE_TYPE__)(stride));
  242. }
  243.  
  244. /// This is internal intrinsic. C/C++ user should avoid calling it directly.
  245. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
  246. _tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
  247.                        __SIZE_TYPE__ stride) {
  248.   return __builtin_ia32_tileloaddt164_internal(m, n, base,
  249.                                                (__SIZE_TYPE__)(stride));
  250. }
  251.  
  252. /// This is internal intrinsic. C/C++ user should avoid calling it directly.
  253. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
  254. _tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
  255.                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  256.   return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
  257. }
  258.  
  259. /// This is internal intrinsic. C/C++ user should avoid calling it directly.
  260. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
  261. _tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
  262.                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  263.   return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
  264. }
  265.  
  266. /// This is internal intrinsic. C/C++ user should avoid calling it directly.
  267. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
  268. _tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
  269.                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  270.   return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
  271. }
  272.  
  273. /// This is internal intrinsic. C/C++ user should avoid calling it directly.
  274. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
  275. _tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
  276.                       _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  277.   return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
  278. }
  279.  
  280. /// This is internal intrinsic. C/C++ user should avoid calling it directly.
  281. static __inline__ void __DEFAULT_FN_ATTRS_INT8
  282. _tile_stored_internal(unsigned short m, unsigned short n, void *base,
  283.                       __SIZE_TYPE__ stride, _tile1024i tile) {
  284.   return __builtin_ia32_tilestored64_internal(m, n, base,
  285.                                               (__SIZE_TYPE__)(stride), tile);
  286. }
  287.  
  288. /// This is internal intrinsic. C/C++ user should avoid calling it directly.
  289. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
  290. _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
  291.                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  292.   return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
  293. }
  294.  
  295. /// This is internal intrinsic. C/C++ user should avoid calling it directly.
  296. static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
  297. _tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
  298.                         _tile1024i dst, _tile1024i src1, _tile1024i src2) {
  299.   return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
  300. }
  301.  
  302. /// This struct pack the shape and tile data together for user. We suggest
  303. /// initializing the struct as early as possible, because compiler depends
  304. /// on the shape information to do configure. The constant value is preferred
  305. /// for optimization by compiler.
  306. typedef struct __tile1024i_str {
  307.   const unsigned short row;
  308.   const unsigned short col;
  309.   _tile1024i tile;
  310. } __tile1024i;
  311.  
  312. /// Load tile rows from memory specifieid by "base" address and "stride" into
  313. /// destination tile "dst".
  314. ///
  315. /// \headerfile <immintrin.h>
  316. ///
  317. /// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
  318. ///
  319. /// \param dst
  320. ///    A destination tile. Max size is 1024 Bytes.
  321. /// \param base
  322. ///    A pointer to base address.
  323. /// \param stride
  324. ///    The stride between the rows' data to be loaded in memory.
  325. __DEFAULT_FN_ATTRS_TILE
  326. static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
  327.                                     __SIZE_TYPE__ stride) {
  328.   dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
  329. }
  330.  
  331. /// Load tile rows from memory specifieid by "base" address and "stride" into
  332. /// destination tile "dst". This intrinsic provides a hint to the implementation
  333. /// that the data will likely not be reused in the near future and the data
  334. /// caching can be optimized accordingly.
  335. ///
  336. /// \headerfile <immintrin.h>
  337. ///
  338. /// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
  339. ///
  340. /// \param dst
  341. ///    A destination tile. Max size is 1024 Bytes.
  342. /// \param base
  343. ///    A pointer to base address.
  344. /// \param stride
  345. ///    The stride between the rows' data to be loaded in memory.
  346. __DEFAULT_FN_ATTRS_TILE
  347. static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
  348.                                            __SIZE_TYPE__ stride) {
  349.   dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
  350. }
  351.  
  352. /// Compute dot-product of bytes in tiles with a source/destination accumulator.
  353. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
  354. /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
  355. /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
  356. /// and store the 32-bit result back to tile "dst".
  357. ///
  358. /// \headerfile <immintrin.h>
  359. ///
  360. /// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
  361. ///
  362. /// \param dst
  363. ///    The destination tile. Max size is 1024 Bytes.
  364. /// \param src0
  365. ///    The 1st source tile. Max size is 1024 Bytes.
  366. /// \param src1
  367. ///    The 2nd source tile. Max size is 1024 Bytes.
  368. __DEFAULT_FN_ATTRS_INT8
  369. static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
  370.                                      __tile1024i src1) {
  371.   dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
  372.                                     src0.tile, src1.tile);
  373. }
  374.  
  375. /// Compute dot-product of bytes in tiles with a source/destination accumulator.
  376. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
  377. /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
  378. /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
  379. /// in "dst", and store the 32-bit result back to tile "dst".
  380. ///
  381. /// \headerfile <immintrin.h>
  382. ///
  383. /// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
  384. ///
  385. /// \param dst
  386. ///    The destination tile. Max size is 1024 Bytes.
  387. /// \param src0
  388. ///    The 1st source tile. Max size is 1024 Bytes.
  389. /// \param src1
  390. ///    The 2nd source tile. Max size is 1024 Bytes.
  391. __DEFAULT_FN_ATTRS_INT8
  392. static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
  393.                                      __tile1024i src1) {
  394.   dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
  395.                                     src0.tile, src1.tile);
  396. }
  397.  
  398. /// Compute dot-product of bytes in tiles with a source/destination accumulator.
  399. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
  400. /// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
  401. /// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
  402. /// and store the 32-bit result back to tile "dst".
  403. ///
  404. /// \headerfile <immintrin.h>
  405. ///
  406. /// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
  407. ///
  408. /// \param dst
  409. ///    The destination tile. Max size is 1024 Bytes.
  410. /// \param src0
  411. ///    The 1st source tile. Max size is 1024 Bytes.
  412. /// \param src1
  413. ///    The 2nd source tile. Max size is 1024 Bytes.
  414. __DEFAULT_FN_ATTRS_INT8
  415. static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
  416.                                      __tile1024i src1) {
  417.   dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
  418.                                     src0.tile, src1.tile);
  419. }
  420.  
  421. /// Compute dot-product of bytes in tiles with a source/destination accumulator.
  422. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
  423. /// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
  424. /// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
  425. /// "dst", and store the 32-bit result back to tile "dst".
  426. ///
  427. /// \headerfile <immintrin.h>
  428. ///
  429. /// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
  430. ///
  431. /// \param dst
  432. ///    The destination tile. Max size is 1024 Bytes.
  433. /// \param src0
  434. ///    The 1st source tile. Max size is 1024 Bytes.
  435. /// \param src1
  436. ///    The 2nd source tile. Max size is 1024 Bytes.
  437. __DEFAULT_FN_ATTRS_INT8
  438. static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
  439.                                      __tile1024i src1) {
  440.   dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
  441.                                     src0.tile, src1.tile);
  442. }
  443.  
  444. /// Store the tile specified by "src" to memory specifieid by "base" address and
  445. /// "stride".
  446. ///
  447. /// \headerfile <immintrin.h>
  448. ///
  449. /// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
  450. ///
  451. /// \param base
  452. ///    A pointer to base address.
  453. /// \param stride
  454. ///    The stride between the rows' data to be stored in memory.
  455. __DEFAULT_FN_ATTRS_TILE
  456. static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
  457.                                      __tile1024i src) {
  458.   _tile_stored_internal(src.row, src.col, base, stride, src.tile);
  459. }
  460.  
  461. /// Zero the tile specified by "dst".
  462. ///
  463. /// \headerfile <immintrin.h>
  464. ///
  465. /// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
  466. ///
  467. /// \param dst
  468. ///    The destination tile to be zero. Max size is 1024 Bytes.
  469. __DEFAULT_FN_ATTRS_TILE
  470. static __inline__ void __tile_zero(__tile1024i *dst) {
  471.   dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
  472. }
  473.  
  474. /// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
  475. /// src1, accumulating the intermediate single-precision (32-bit) floating-point
  476. /// elements with elements in "dst", and store the 32-bit result back to tile
  477. /// "dst".
  478. ///
  479. /// \headerfile <immintrin.h>
  480. ///
  481. /// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
  482. ///
  483. /// \param dst
  484. ///    The destination tile. Max size is 1024 Bytes.
  485. /// \param src0
  486. ///    The 1st source tile. Max size is 1024 Bytes.
  487. /// \param src1
  488. ///    The 2nd source tile. Max size is 1024 Bytes.
  489. __DEFAULT_FN_ATTRS_BF16
  490. static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
  491.                                        __tile1024i src1) {
  492.   dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
  493.                                       src0.tile, src1.tile);
  494. }
  495.  
  496. /// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
  497. /// src1, accumulating the intermediate single-precision (32-bit) floating-point
  498. /// elements with elements in "dst", and store the 32-bit result back to tile
  499. /// "dst".
  500. ///
  501. /// \headerfile <immintrin.h>
  502. ///
  503. /// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
  504. ///
  505. /// \param dst
  506. ///    The destination tile. Max size is 1024 Bytes.
  507. /// \param src0
  508. ///    The 1st source tile. Max size is 1024 Bytes.
  509. /// \param src1
  510. ///    The 2nd source tile. Max size is 1024 Bytes.
  511. __DEFAULT_FN_ATTRS_FP16
  512. static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
  513.                                        __tile1024i src1) {
  514.   dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
  515.                                       src0.tile, src1.tile);
  516. }
  517.  
  518. #undef __DEFAULT_FN_ATTRS_TILE
  519. #undef __DEFAULT_FN_ATTRS_INT8
  520. #undef __DEFAULT_FN_ATTRS_BF16
  521. #undef __DEFAULT_FN_ATTRS_FP16
  522.  
  523. #endif /* __x86_64__ */
  524. #endif /* __AMXINTRIN_H */
  525.