Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
14 pmbaty 1
/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===------------------------------------------------------------------------===
8
 */
9
 
10
/* This header is an implementation detail of <immintrin.h>; reject direct
 * inclusion. */
#ifndef __IMMINTRIN_H
#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
#endif /* __IMMINTRIN_H */
13
 
14
#ifndef __AMXINTRIN_H
15
#define __AMXINTRIN_H
16
#ifdef __x86_64__
17
 
18
/* Define the default attributes for the functions in this file. Every variant
 * requests always-inline and no-debug, and differs only in the AMX target
 * feature it enables for the function it decorates. */
#define __DEFAULT_FN_ATTRS_TILE                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
#define __DEFAULT_FN_ATTRS_INT8                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
#define __DEFAULT_FN_ATTRS_FP16                                                \
  __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
27
 
28
/// Load tile configuration from a 64-byte memory location specified by
29
/// "mem_addr". The tile configuration includes the tile type palette, the
30
/// number of bytes per row, and the number of rows. If the specified
31
/// palette_id is zero, that signifies the init state for both the tile
32
/// config and the tile data, and the tiles are zeroed. Any invalid
33
/// configurations will result in #GP fault.
34
///
35
/// \headerfile <immintrin.h>
36
///
37
/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
38
///
39
/// \param __config
40
///    A pointer to 512-bits configuration
41
static __inline__ void __DEFAULT_FN_ATTRS_TILE
42
_tile_loadconfig(const void *__config) {
43
  __builtin_ia32_tile_loadconfig(__config);
44
}
45
 
46
/// Stores the current tile configuration to a 64-byte memory location
47
/// specified by "mem_addr". The tile configuration includes the tile type
48
/// palette, the number of bytes per row, and the number of rows. If tiles
49
/// are not configured, all zeroes will be stored to memory.
50
///
51
/// \headerfile <immintrin.h>
52
///
53
/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
54
///
55
/// \param __config
56
///    A pointer to 512-bits configuration
57
static __inline__ void __DEFAULT_FN_ATTRS_TILE
58
_tile_storeconfig(void *__config) {
59
  __builtin_ia32_tile_storeconfig(__config);
60
}
61
 
62
/// Release the tile configuration to return to the init state, which
63
/// releases all storage it currently holds.
64
///
65
/// \headerfile <immintrin.h>
66
///
67
/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
68
static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
69
  __builtin_ia32_tilerelease();
70
}
71
 
72
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address. It is converted to "const void *".
/// \param stride
///    The stride between the rows' data to be loaded in memory. It is
///    converted to "__SIZE_TYPE__".
#define _tile_loadd(dst, base, stride)                                         \
  __builtin_ia32_tileloadd64((dst), ((const void *)(base)),                    \
                             (__SIZE_TYPE__)(stride))
89
 
90
/// Load tile rows from memory specified by "base" address and "stride" into
/// destination tile "dst" using the tile configuration previously configured
/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
/// that the data will likely not be reused in the near future and the data
/// caching can be optimized accordingly.
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
///
/// \param dst
///    A destination tile. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address. It is converted to "const void *".
/// \param stride
///    The stride between the rows' data to be loaded in memory. It is
///    converted to "__SIZE_TYPE__".
#define _tile_stream_loadd(dst, base, stride)                                  \
  __builtin_ia32_tileloaddt164((dst), ((const void *)(base)),                  \
                               (__SIZE_TYPE__)(stride))
109
 
110
/// Store the tile specified by "src" to memory specified by "base" address and
/// "stride" using the tile configuration previously configured via
/// "_tile_loadconfig".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
///
/// \param src
///    The source tile to store. Max size is 1024 Bytes.
/// \param base
///    A pointer to base address. It is converted to "void *".
/// \param stride
///    The stride between the rows' data to be stored in memory. It is
///    converted to "__SIZE_TYPE__".
#define _tile_stored(src, base, stride)                                        \
  __builtin_ia32_tilestored64((src), ((void *)(base)), (__SIZE_TYPE__)(stride))
126
 
127
/// Zero the tile specified by "tile".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
///
/// \param tile
///    The destination tile to be zeroed. Max size is 1024 Bytes.
#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
136
 
137
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbssd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbssd((dst), (src0), (src1))
155
 
156
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbsud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbsud((dst), (src0), (src1))
174
 
175
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
/// and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbusd(dst, src0, src1)                                          \
  __builtin_ia32_tdpbusd((dst), (src0), (src1))
193
 
194
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
/// "dst", and store the 32-bit result back to tile "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbuud(dst, src0, src1)                                          \
  __builtin_ia32_tdpbuud((dst), (src0), (src1))
212
 
213
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
/// elements with elements in "dst", and store the 32-bit result back to tile
/// "dst".
///
/// \headerfile <immintrin.h>
///
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
///
/// \param dst
///    The destination tile. Max size is 1024 Bytes.
/// \param src0
///    The 1st source tile. Max size is 1024 Bytes.
/// \param src1
///    The 2nd source tile. Max size is 1024 Bytes.
#define _tile_dpbf16ps(dst, src0, src1)                                        \
  __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
230
 
231
/// The AMX tile register size is configurable; the maximum size is
/// 16x64=1024 bytes. Since there is no 2D type in LLVM IR, a vector type is
/// used to represent a 2D tile, and its fixed size is the maximum AMX tile
/// register size.
typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
235
 
236
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
237
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
238
_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
239
                     __SIZE_TYPE__ stride) {
240
  return __builtin_ia32_tileloadd64_internal(m, n, base,
241
                                             (__SIZE_TYPE__)(stride));
242
}
243
 
244
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
245
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
246
_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
247
                       __SIZE_TYPE__ stride) {
248
  return __builtin_ia32_tileloaddt164_internal(m, n, base,
249
                                               (__SIZE_TYPE__)(stride));
250
}
251
 
252
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
253
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
254
_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
255
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
256
  return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
257
}
258
 
259
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
260
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
261
_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
262
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
263
  return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
264
}
265
 
266
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
267
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
268
_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
269
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
270
  return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
271
}
272
 
273
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
274
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
275
_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
276
                      _tile1024i dst, _tile1024i src1, _tile1024i src2) {
277
  return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
278
}
279
 
280
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
281
static __inline__ void __DEFAULT_FN_ATTRS_INT8
282
_tile_stored_internal(unsigned short m, unsigned short n, void *base,
283
                      __SIZE_TYPE__ stride, _tile1024i tile) {
284
  return __builtin_ia32_tilestored64_internal(m, n, base,
285
                                              (__SIZE_TYPE__)(stride), tile);
286
}
287
 
288
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
289
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
290
_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
291
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
292
  return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
293
}
294
 
295
/// This is internal intrinsic. C/C++ user should avoid calling it directly.
296
static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
297
_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
298
                        _tile1024i dst, _tile1024i src1, _tile1024i src2) {
299
  return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
300
}
301
 
302
/// This struct pack the shape and tile data together for user. We suggest
303
/// initializing the struct as early as possible, because compiler depends
304
/// on the shape information to do configure. The constant value is preferred
305
/// for optimization by compiler.
306
typedef struct __tile1024i_str {
307
  const unsigned short row;
308
  const unsigned short col;
309
  _tile1024i tile;
310
} __tile1024i;
311
 
312
/// Load tile rows from memory specifieid by "base" address and "stride" into
313
/// destination tile "dst".
314
///
315
/// \headerfile <immintrin.h>
316
///
317
/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
318
///
319
/// \param dst
320
///    A destination tile. Max size is 1024 Bytes.
321
/// \param base
322
///    A pointer to base address.
323
/// \param stride
324
///    The stride between the rows' data to be loaded in memory.
325
__DEFAULT_FN_ATTRS_TILE
326
static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
327
                                    __SIZE_TYPE__ stride) {
328
  dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
329
}
330
 
331
/// Load tile rows from memory specifieid by "base" address and "stride" into
332
/// destination tile "dst". This intrinsic provides a hint to the implementation
333
/// that the data will likely not be reused in the near future and the data
334
/// caching can be optimized accordingly.
335
///
336
/// \headerfile <immintrin.h>
337
///
338
/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
339
///
340
/// \param dst
341
///    A destination tile. Max size is 1024 Bytes.
342
/// \param base
343
///    A pointer to base address.
344
/// \param stride
345
///    The stride between the rows' data to be loaded in memory.
346
__DEFAULT_FN_ATTRS_TILE
347
static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
348
                                           __SIZE_TYPE__ stride) {
349
  dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
350
}
351
 
352
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
353
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
354
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
355
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
356
/// and store the 32-bit result back to tile "dst".
357
///
358
/// \headerfile <immintrin.h>
359
///
360
/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
361
///
362
/// \param dst
363
///    The destination tile. Max size is 1024 Bytes.
364
/// \param src0
365
///    The 1st source tile. Max size is 1024 Bytes.
366
/// \param src1
367
///    The 2nd source tile. Max size is 1024 Bytes.
368
__DEFAULT_FN_ATTRS_INT8
369
static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
370
                                     __tile1024i src1) {
371
  dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
372
                                    src0.tile, src1.tile);
373
}
374
 
375
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
376
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
377
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
378
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
379
/// in "dst", and store the 32-bit result back to tile "dst".
380
///
381
/// \headerfile <immintrin.h>
382
///
383
/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
384
///
385
/// \param dst
386
///    The destination tile. Max size is 1024 Bytes.
387
/// \param src0
388
///    The 1st source tile. Max size is 1024 Bytes.
389
/// \param src1
390
///    The 2nd source tile. Max size is 1024 Bytes.
391
__DEFAULT_FN_ATTRS_INT8
392
static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
393
                                     __tile1024i src1) {
394
  dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
395
                                    src0.tile, src1.tile);
396
}
397
 
398
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
399
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
400
/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
401
/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
402
/// and store the 32-bit result back to tile "dst".
403
///
404
/// \headerfile <immintrin.h>
405
///
406
/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
407
///
408
/// \param dst
409
///    The destination tile. Max size is 1024 Bytes.
410
/// \param src0
411
///    The 1st source tile. Max size is 1024 Bytes.
412
/// \param src1
413
///    The 2nd source tile. Max size is 1024 Bytes.
414
__DEFAULT_FN_ATTRS_INT8
415
static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
416
                                     __tile1024i src1) {
417
  dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
418
                                    src0.tile, src1.tile);
419
}
420
 
421
/// Compute dot-product of bytes in tiles with a source/destination accumulator.
422
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
423
/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
424
/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
425
/// "dst", and store the 32-bit result back to tile "dst".
426
///
427
/// \headerfile <immintrin.h>
428
///
429
/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
430
///
431
/// \param dst
432
///    The destination tile. Max size is 1024 Bytes.
433
/// \param src0
434
///    The 1st source tile. Max size is 1024 Bytes.
435
/// \param src1
436
///    The 2nd source tile. Max size is 1024 Bytes.
437
__DEFAULT_FN_ATTRS_INT8
438
static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
439
                                     __tile1024i src1) {
440
  dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
441
                                    src0.tile, src1.tile);
442
}
443
 
444
/// Store the tile specified by "src" to memory specifieid by "base" address and
445
/// "stride".
446
///
447
/// \headerfile <immintrin.h>
448
///
449
/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
450
///
451
/// \param base
452
///    A pointer to base address.
453
/// \param stride
454
///    The stride between the rows' data to be stored in memory.
455
__DEFAULT_FN_ATTRS_TILE
456
static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
457
                                     __tile1024i src) {
458
  _tile_stored_internal(src.row, src.col, base, stride, src.tile);
459
}
460
 
461
/// Zero the tile specified by "dst".
462
///
463
/// \headerfile <immintrin.h>
464
///
465
/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
466
///
467
/// \param dst
468
///    The destination tile to be zero. Max size is 1024 Bytes.
469
__DEFAULT_FN_ATTRS_TILE
470
static __inline__ void __tile_zero(__tile1024i *dst) {
471
  dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
472
}
473
 
474
/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
475
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
476
/// elements with elements in "dst", and store the 32-bit result back to tile
477
/// "dst".
478
///
479
/// \headerfile <immintrin.h>
480
///
481
/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
482
///
483
/// \param dst
484
///    The destination tile. Max size is 1024 Bytes.
485
/// \param src0
486
///    The 1st source tile. Max size is 1024 Bytes.
487
/// \param src1
488
///    The 2nd source tile. Max size is 1024 Bytes.
489
__DEFAULT_FN_ATTRS_BF16
490
static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
491
                                       __tile1024i src1) {
492
  dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
493
                                      src0.tile, src1.tile);
494
}
495
 
496
/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
497
/// src1, accumulating the intermediate single-precision (32-bit) floating-point
498
/// elements with elements in "dst", and store the 32-bit result back to tile
499
/// "dst".
500
///
501
/// \headerfile <immintrin.h>
502
///
503
/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
504
///
505
/// \param dst
506
///    The destination tile. Max size is 1024 Bytes.
507
/// \param src0
508
///    The 1st source tile. Max size is 1024 Bytes.
509
/// \param src1
510
///    The 2nd source tile. Max size is 1024 Bytes.
511
__DEFAULT_FN_ATTRS_FP16
512
static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
513
                                       __tile1024i src1) {
514
  dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
515
                                      src0.tile, src1.tile);
516
}
517
 
518
/* The attribute helper macros are private to this header; remove them so they
 * do not leak into the including translation unit. */
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
#undef __DEFAULT_FN_ATTRS_FP16
522
 
523
#endif /* __x86_64__ */
524
#endif /* __AMXINTRIN_H */