Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
14 pmbaty 1
/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9
 
10
#ifndef __IMMINTRIN_H
11
#error                                                                         \
12
    "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
13
#endif // __IMMINTRIN_H
14
 
15
#ifdef __SSE2__
16
 
17
#ifndef __AVXNECONVERTINTRIN_H
18
#define __AVXNECONVERTINTRIN_H
19
 
20
/* Attribute sets attached to every intrinsic defined in this header:
 * __always_inline__, __nodebug__, the "avxneconvert" target feature, and a
 * __min_vector_width__ of 128 or 256 depending on the variant. */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avxneconvert"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avxneconvert"), __min_vector_width__(256)))
27
 
28
/// Convert scalar BF16 (16-bit) floating-point element
29
/// stored at memory locations starting at location \a __A to a
30
/// single-precision (32-bit) floating-point, broadcast it to packed
31
/// single-precision (32-bit) floating-point elements, and store the results in
32
/// \a dst.
33
///
34
/// \headerfile <x86intrin.h>
35
///
36
/// \code
37
/// _mm_bcstnebf16_ps(const void *__A);
38
/// \endcode
39
///
40
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
41
///
42
/// \param __A
43
///    A pointer to a 16-bit memory location. The address of the memory
44
///    location does not have to be aligned.
45
/// \returns
46
///    A 128-bit vector of [4 x float].
47
///
48
/// \code{.operation}
49
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
50
/// FOR j := 0 to 3
51
///   m := j*32
52
///   dst[m+31:m] := b
53
/// ENDFOR
54
/// dst[MAX:128] := 0
55
/// \endcode
56
static __inline__ __m128 __DEFAULT_FN_ATTRS128
57
_mm_bcstnebf16_ps(const void *__A) {
58
  return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
59
}
60
 
61
/// Convert scalar BF16 (16-bit) floating-point element
62
/// stored at memory locations starting at location \a __A to a
63
/// single-precision (32-bit) floating-point, broadcast it to packed
64
/// single-precision (32-bit) floating-point elements, and store the results in
65
/// \a dst.
66
///
67
/// \headerfile <x86intrin.h>
68
///
69
/// \code
70
/// _mm256_bcstnebf16_ps(const void *__A);
71
/// \endcode
72
///
73
/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
74
///
75
/// \param __A
76
///    A pointer to a 16-bit memory location. The address of the memory
77
///    location does not have to be aligned.
78
/// \returns
79
///    A 256-bit vector of [8 x float].
80
///
81
/// \code{.operation}
82
/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
83
/// FOR j := 0 to 7
84
///   m := j*32
85
///   dst[m+31:m] := b
86
/// ENDFOR
87
/// dst[MAX:256] := 0
88
/// \endcode
89
static __inline__ __m256 __DEFAULT_FN_ATTRS256
90
_mm256_bcstnebf16_ps(const void *__A) {
91
  return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
92
}
93
 
94
/// Convert scalar half-precision (16-bit) floating-point element
95
/// stored at memory locations starting at location \a __A to a
96
/// single-precision (32-bit) floating-point, broadcast it to packed
97
/// single-precision (32-bit) floating-point elements, and store the results in
98
/// \a dst.
99
///
100
/// \headerfile <x86intrin.h>
101
///
102
/// \code
103
/// _mm_bcstnesh_ps(const void *__A);
104
/// \endcode
105
///
106
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
107
///
108
/// \param __A
109
///    A pointer to a 16-bit memory location. The address of the memory
110
///    location does not have to be aligned.
111
/// \returns
112
///    A 128-bit vector of [4 x float].
113
///
114
/// \code{.operation}
115
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
116
/// FOR j := 0 to 3
117
///   m := j*32
118
///   dst[m+31:m] := b
119
/// ENDFOR
120
/// dst[MAX:128] := 0
121
/// \endcode
122
static __inline__ __m128 __DEFAULT_FN_ATTRS128
123
_mm_bcstnesh_ps(const void *__A) {
124
  return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
125
}
126
 
127
/// Convert scalar half-precision (16-bit) floating-point element
128
/// stored at memory locations starting at location \a __A to a
129
/// single-precision (32-bit) floating-point, broadcast it to packed
130
/// single-precision (32-bit) floating-point elements, and store the results in
131
/// \a dst.
132
///
133
/// \headerfile <x86intrin.h>
134
///
135
/// \code
136
/// _mm256_bcstnesh_ps(const void *__A);
137
/// \endcode
138
///
139
/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
140
///
141
/// \param __A
142
///    A pointer to a 16-bit memory location. The address of the memory
143
///    location does not have to be aligned.
144
/// \returns
145
///    A 256-bit vector of [8 x float].
146
///
147
/// \code{.operation}
148
/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
149
/// FOR j := 0 to 7
150
///   m := j*32
151
///   dst[m+31:m] := b
152
/// ENDFOR
153
/// dst[MAX:256] := 0
154
/// \endcode
155
static __inline__ __m256 __DEFAULT_FN_ATTRS256
156
_mm256_bcstnesh_ps(const void *__A) {
157
  return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
158
}
159
 
160
/// Convert packed BF16 (16-bit) floating-point even-indexed elements
161
/// stored at memory locations starting at location \a __A to packed
162
/// single-precision (32-bit) floating-point elements, and store the results in
163
/// \a dst.
164
///
165
/// \headerfile <x86intrin.h>
166
///
167
/// \code
168
/// _mm_cvtneebf16_ps(const __m128bh *__A);
169
/// \endcode
170
///
171
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
172
///
173
/// \param __A
174
///    A pointer to a 128-bit memory location containing 8 consecutive
175
///    BF16 (16-bit) floating-point values.
176
/// \returns
177
///    A 128-bit vector of [4 x float].
178
///
179
/// \code{.operation}
180
/// FOR j := 0 to 3
181
///     k := j*2
182
///     i := k*16
183
///     m := j*32
184
///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
185
/// ENDFOR
186
/// dst[MAX:128] := 0
187
/// \endcode
188
static __inline__ __m128 __DEFAULT_FN_ATTRS128
189
_mm_cvtneebf16_ps(const __m128bh *__A) {
190
  return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
191
}
192
 
193
/// Convert packed BF16 (16-bit) floating-point even-indexed elements
194
/// stored at memory locations starting at location \a __A to packed
195
/// single-precision (32-bit) floating-point elements, and store the results in
196
/// \a dst.
197
///
198
/// \headerfile <x86intrin.h>
199
///
200
/// \code
201
/// _mm256_cvtneebf16_ps(const __m256bh *__A);
202
/// \endcode
203
///
204
/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
205
///
206
/// \param __A
207
///    A pointer to a 256-bit memory location containing 16 consecutive
208
///    BF16 (16-bit) floating-point values.
209
/// \returns
210
///    A 256-bit vector of [8 x float].
211
///
212
/// \code{.operation}
213
/// FOR j := 0 to 7
214
///     k := j*2
215
///     i := k*16
216
///     m := j*32
217
///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
218
/// ENDFOR
219
/// dst[MAX:256] := 0
220
/// \endcode
221
static __inline__ __m256 __DEFAULT_FN_ATTRS256
222
_mm256_cvtneebf16_ps(const __m256bh *__A) {
223
  return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
224
}
225
 
226
/// Convert packed half-precision (16-bit) floating-point even-indexed elements
227
/// stored at memory locations starting at location \a __A to packed
228
/// single-precision (32-bit) floating-point elements, and store the results in
229
/// \a dst.
230
///
231
/// \headerfile <x86intrin.h>
232
///
233
/// \code
234
/// _mm_cvtneeph_ps(const __m128h *__A);
235
/// \endcode
236
///
237
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
238
///
239
/// \param __A
240
///    A pointer to a 128-bit memory location containing 8 consecutive
241
///    half-precision (16-bit) floating-point values.
242
/// \returns
243
///    A 128-bit vector of [4 x float].
244
///
245
/// \code{.operation}
246
/// FOR j := 0 to 3
247
///     k := j*2
248
///     i := k*16
249
///     m := j*32
250
///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
251
/// ENDFOR
252
/// dst[MAX:128] := 0
253
/// \endcode
254
static __inline__ __m128 __DEFAULT_FN_ATTRS128
255
_mm_cvtneeph_ps(const __m128h *__A) {
256
  return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
257
}
258
 
259
/// Convert packed half-precision (16-bit) floating-point even-indexed elements
260
/// stored at memory locations starting at location \a __A to packed
261
/// single-precision (32-bit) floating-point elements, and store the results in
262
/// \a dst.
263
///
264
/// \headerfile <x86intrin.h>
265
///
266
/// \code
267
/// _mm256_cvtneeph_ps(const __m256h *__A);
268
/// \endcode
269
///
270
/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
271
///
272
/// \param __A
273
///    A pointer to a 256-bit memory location containing 16 consecutive
274
///    half-precision (16-bit) floating-point values.
275
/// \returns
276
///    A 256-bit vector of [8 x float].
277
///
278
/// \code{.operation}
279
/// FOR j := 0 to 7
280
///     k := j*2
281
///     i := k*16
282
///     m := j*32
283
///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
284
/// ENDFOR
285
/// dst[MAX:256] := 0
286
/// \endcode
287
static __inline__ __m256 __DEFAULT_FN_ATTRS256
288
_mm256_cvtneeph_ps(const __m256h *__A) {
289
  return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
290
}
291
 
292
/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
293
/// stored at memory locations starting at location \a __A to packed
294
/// single-precision (32-bit) floating-point elements, and store the results in
295
/// \a dst.
296
///
297
/// \headerfile <x86intrin.h>
298
///
299
/// \code
300
/// _mm_cvtneobf16_ps(const __m128bh *__A);
301
/// \endcode
302
///
303
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
304
///
305
/// \param __A
306
///    A pointer to a 128-bit memory location containing 8 consecutive
307
///    BF16 (16-bit) floating-point values.
308
/// \returns
309
///    A 128-bit vector of [4 x float].
310
///
311
/// \code{.operation}
312
/// FOR j := 0 to 3
313
///     k := j*2+1
314
///     i := k*16
315
///     m := j*32
316
///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
317
/// ENDFOR
318
/// dst[MAX:128] := 0
319
/// \endcode
320
static __inline__ __m128 __DEFAULT_FN_ATTRS128
321
_mm_cvtneobf16_ps(const __m128bh *__A) {
322
  return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
323
}
324
 
325
/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
326
/// stored at memory locations starting at location \a __A to packed
327
/// single-precision (32-bit) floating-point elements, and store the results in
328
/// \a dst.
329
///
330
/// \headerfile <x86intrin.h>
331
///
332
/// \code
333
/// _mm256_cvtneobf16_ps(const __m256bh *__A);
334
/// \endcode
335
///
336
/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
337
///
338
/// \param __A
339
///    A pointer to a 256-bit memory location containing 16 consecutive
340
///    BF16 (16-bit) floating-point values.
341
/// \returns
342
///    A 256-bit vector of [8 x float].
343
///
344
/// \code{.operation}
345
/// FOR j := 0 to 7
346
///     k := j*2+1
347
///     i := k*16
348
///     m := j*32
349
///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
350
/// ENDFOR
351
/// dst[MAX:256] := 0
352
/// \endcode
353
static __inline__ __m256 __DEFAULT_FN_ATTRS256
354
_mm256_cvtneobf16_ps(const __m256bh *__A) {
355
  return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
356
}
357
 
358
/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
359
/// stored at memory locations starting at location \a __A to packed
360
/// single-precision (32-bit) floating-point elements, and store the results in
361
/// \a dst.
362
///
363
/// \headerfile <x86intrin.h>
364
///
365
/// \code
366
/// _mm_cvtneoph_ps(const __m128h *__A);
367
/// \endcode
368
///
369
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
370
///
371
/// \param __A
372
///    A pointer to a 128-bit memory location containing 8 consecutive
373
///    half-precision (16-bit) floating-point values.
374
/// \returns
375
///    A 128-bit vector of [4 x float].
376
///
377
/// \code{.operation}
378
/// FOR j := 0 to 3
379
///     k := j*2+1
380
///     i := k*16
381
///     m := j*32
382
///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
383
/// ENDFOR
384
/// dst[MAX:128] := 0
385
/// \endcode
386
static __inline__ __m128 __DEFAULT_FN_ATTRS128
387
_mm_cvtneoph_ps(const __m128h *__A) {
388
  return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
389
}
390
 
391
/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
392
/// stored at memory locations starting at location \a __A to packed
393
/// single-precision (32-bit) floating-point elements, and store the results in
394
/// \a dst.
395
///
396
/// \headerfile <x86intrin.h>
397
///
398
/// \code
399
/// _mm256_cvtneoph_ps(const __m256h *__A);
400
/// \endcode
401
///
402
/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
403
///
404
/// \param __A
405
///    A pointer to a 256-bit memory location containing 16 consecutive
406
///    half-precision (16-bit) floating-point values.
407
/// \returns
408
///    A 256-bit vector of [8 x float].
409
///
410
/// \code{.operation}
411
/// FOR j := 0 to 7
412
///     k := j*2+1
413
///     i := k*16
414
///     m := j*32
415
///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
416
/// ENDFOR
417
/// dst[MAX:256] := 0
418
/// \endcode
419
static __inline__ __m256 __DEFAULT_FN_ATTRS256
420
_mm256_cvtneoph_ps(const __m256h *__A) {
421
  return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
422
}
423
 
424
/// Convert packed single-precision (32-bit) floating-point elements in \a __A
425
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
426
/// dst.
427
///
428
/// \headerfile <x86intrin.h>
429
///
430
/// \code
431
/// _mm_cvtneps_avx_pbh(__m128 __A);
432
/// \endcode
433
///
434
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
435
///
436
/// \param __A
437
///    A 128-bit vector of [4 x float].
438
/// \returns
439
///    A 128-bit vector of [8 x bfloat].
440
///
441
/// \code{.operation}
442
/// FOR j := 0 to 3
443
///     dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
444
/// ENDFOR
445
/// dst[MAX:128] := 0
446
/// \endcode
447
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
448
_mm_cvtneps_avx_pbh(__m128 __A) {
449
  return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
450
}
451
 
452
/// Convert packed single-precision (32-bit) floating-point elements in \a __A
453
/// to packed BF16 (16-bit) floating-point elements, and store the results in \a
454
/// dst.
455
///
456
/// \headerfile <x86intrin.h>
457
///
458
/// \code
459
/// _mm256_cvtneps_avx_pbh(__m256 __A);
460
/// \endcode
461
///
462
/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
463
///
464
/// \param __A
465
///    A 256-bit vector of [8 x float].
466
/// \returns
467
///    A 128-bit vector of [8 x bfloat].
468
///
469
/// \code{.operation}
470
/// FOR j := 0 to 7
471
///     dst.word[j] := Convert_FP32_To_BF16(a.fp32[j])
472
/// ENDFOR
473
/// dst[MAX:128] := 0
474
/// \endcode
475
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
476
_mm256_cvtneps_avx_pbh(__m256 __A) {
477
  return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
478
}
479
 
480
/* Remove the attribute helper macros defined earlier in this header so they
 * are no longer defined past this point. */
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
482
 
483
#endif // __AVXNECONVERTINTRIN_H
484
#endif // __SSE2__