Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
14 pmbaty 1
/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9
#ifndef __IMMINTRIN_H
10
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
11
#endif
12
 
13
#ifdef __SSE2__
14
 
15
#ifndef __AVX512VLBF16INTRIN_H
16
#define __AVX512VLBF16INTRIN_H
17
 
18
/// Attribute sets applied to the intrinsics defined in this header. Both mark
/// the function always_inline and nodebug and enable the avx512vl and
/// avx512bf16 target features; they differ only in the minimum vector width
/// they declare (128 or 256 bits).
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl, avx512bf16"),                           \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512vl, avx512bf16"),                           \
                 __min_vector_width__(256)))
24
 
25
/// Convert Two Packed Single Data to One Packed BF16 Data.
26
///
27
/// \headerfile <x86intrin.h>
28
///
29
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
30
///
31
/// \param __A
32
///    A 128-bit vector of [4 x float].
33
/// \param __B
34
///    A 128-bit vector of [4 x float].
35
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
36
///    conversion of __B, and higher 64 bits come from conversion of __A.
37
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
38
_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
39
  return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
40
                                                    (__v4sf) __B);
41
}
42
 
43
/// Convert Two Packed Single Data to One Packed BF16 Data.
44
///
45
/// \headerfile <x86intrin.h>
46
///
47
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
48
///
49
/// \param __A
50
///    A 128-bit vector of [4 x float].
51
/// \param __B
52
///    A 128-bit vector of [4 x float].
53
/// \param __W
54
///    A 128-bit vector of [8 x bfloat].
55
/// \param __U
56
///    A 8-bit mask value specifying what is chosen for each element.
57
///    A 1 means conversion of __A or __B. A 0 means element from __W.
58
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
59
///    conversion of __B, and higher 64 bits come from conversion of __A.
60
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
61
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
62
  return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
63
                                             (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
64
                                             (__v8bf)__W);
65
}
66
 
67
/// Convert Two Packed Single Data to One Packed BF16 Data.
68
///
69
/// \headerfile <x86intrin.h>
70
///
71
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
72
///
73
/// \param __A
74
///    A 128-bit vector of [4 x float].
75
/// \param __B
76
///    A 128-bit vector of [4 x float].
77
/// \param __U
78
///    A 8-bit mask value specifying what is chosen for each element.
79
///    A 1 means conversion of __A or __B. A 0 means element is zero.
80
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
81
///    conversion of __B, and higher 64 bits come from conversion of __A.
82
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
83
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
84
  return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
85
                                             (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
86
                                             (__v8bf)_mm_setzero_si128());
87
}
88
 
89
/// Convert Two Packed Single Data to One Packed BF16 Data.
90
///
91
/// \headerfile <x86intrin.h>
92
///
93
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
94
///
95
/// \param __A
96
///    A 256-bit vector of [8 x float].
97
/// \param __B
98
///    A 256-bit vector of [8 x float].
99
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
100
///    conversion of __B, and higher 128 bits come from conversion of __A.
101
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
102
_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
103
  return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
104
                                                    (__v8sf) __B);
105
}
106
 
107
/// Convert Two Packed Single Data to One Packed BF16 Data.
108
///
109
/// \headerfile <x86intrin.h>
110
///
111
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
112
///
113
/// \param __A
114
///    A 256-bit vector of [8 x float].
115
/// \param __B
116
///    A 256-bit vector of [8 x float].
117
/// \param __W
118
///    A 256-bit vector of [16 x bfloat].
119
/// \param __U
120
///    A 16-bit mask value specifying what is chosen for each element.
121
///    A 1 means conversion of __A or __B. A 0 means element from __W.
122
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
123
///    conversion of __B, and higher 128 bits come from conversion of __A.
124
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
125
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
126
  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
127
                                         (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
128
                                         (__v16bf)__W);
129
}
130
 
131
/// Convert Two Packed Single Data to One Packed BF16 Data.
132
///
133
/// \headerfile <x86intrin.h>
134
///
135
/// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
136
///
137
/// \param __A
138
///    A 256-bit vector of [8 x float].
139
/// \param __B
140
///    A 256-bit vector of [8 x float].
141
/// \param __U
142
///    A 16-bit mask value specifying what is chosen for each element.
143
///    A 1 means conversion of __A or __B. A 0 means element is zero.
144
/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
145
///    conversion of __B, and higher 128 bits come from conversion of __A.
146
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
147
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
148
  return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
149
                                         (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
150
                                         (__v16bf)_mm256_setzero_si256());
151
}
152
 
153
/// Converts a vector of packed single-precision values into packed BF16
/// values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param A
///    A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
///    the conversion of \a A and whose upper 64 bits are 0.
#define _mm_cvtneps_pbh(A)                                                     \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
165
 
166
/// Convert Packed Single Data to Packed BF16 Data.
167
///
168
/// \headerfile <x86intrin.h>
169
///
170
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
171
///
172
/// \param __A
173
///    A 128-bit vector of [4 x float].
174
/// \param __W
175
///    A 128-bit vector of [8 x bfloat].
176
/// \param __U
177
///    A 4-bit mask value specifying what is chosen for each element.
178
///    A 1 means conversion of __A. A 0 means element from __W.
179
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
180
///    conversion of __A, and higher 64 bits are 0.
181
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
182
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
183
  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
184
                                                        (__v8bf)__W,
185
                                                        (__mmask8)__U);
186
}
187
 
188
/// Convert Packed Single Data to Packed BF16 Data.
189
///
190
/// \headerfile <x86intrin.h>
191
///
192
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
193
///
194
/// \param __A
195
///    A 128-bit vector of [4 x float].
196
/// \param __U
197
///    A 4-bit mask value specifying what is chosen for each element.
198
///    A 1 means conversion of __A. A 0 means element is zero.
199
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
200
///    conversion of __A, and higher 64 bits are 0.
201
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
202
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
203
  return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
204
                                                    (__v8bf)_mm_setzero_si128(),
205
                                                    (__mmask8)__U);
206
}
207
 
208
/// Converts a vector of packed single-precision values into packed BF16
/// values.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
///
/// \param A
///    A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] that comes from the conversion
///    of \a A.
#define _mm256_cvtneps_pbh(A)                                                  \
  ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
219
 
220
/// Convert Packed Single Data to Packed BF16 Data.
221
///
222
/// \headerfile <x86intrin.h>
223
///
224
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
225
///
226
/// \param __A
227
///    A 256-bit vector of [8 x float].
228
/// \param __W
229
///    A 256-bit vector of [8 x bfloat].
230
/// \param __U
231
///    A 8-bit mask value specifying what is chosen for each element.
232
///    A 1 means conversion of __A. A 0 means element from __W.
233
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
234
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
235
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
236
  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
237
                                                        (__v8bf)__W,
238
                                                        (__mmask8)__U);
239
}
240
 
241
/// Convert Packed Single Data to Packed BF16 Data.
242
///
243
/// \headerfile <x86intrin.h>
244
///
245
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
246
///
247
/// \param __A
248
///    A 256-bit vector of [8 x float].
249
/// \param __U
250
///    A 8-bit mask value specifying what is chosen for each element.
251
///    A 1 means conversion of __A. A 0 means element is zero.
252
/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
253
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
254
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
255
  return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
256
                                                    (__v8bf)_mm_setzero_si128(),
257
                                                    (__mmask8)__U);
258
}
259
 
260
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
261
///
262
/// \headerfile <x86intrin.h>
263
///
264
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
265
///
266
/// \param __A
267
///    A 128-bit vector of [8 x bfloat].
268
/// \param __B
269
///    A 128-bit vector of [8 x bfloat].
270
/// \param __D
271
///    A 128-bit vector of [4 x float].
272
/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
273
///  __A, __B and __D
274
static __inline__ __m128 __DEFAULT_FN_ATTRS128
275
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
276
  return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
277
                                             (__v8bf)__A,
278
                                             (__v8bf)__B);
279
}
280
 
281
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
282
///
283
/// \headerfile <x86intrin.h>
284
///
285
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
286
///
287
/// \param __A
288
///    A 128-bit vector of [8 x bfloat].
289
/// \param __B
290
///    A 128-bit vector of [8 x bfloat].
291
/// \param __D
292
///    A 128-bit vector of [4 x float].
293
/// \param __U
294
///    A 8-bit mask value specifying what is chosen for each element.
295
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
296
/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
297
///  __A, __B and __D
298
static __inline__ __m128 __DEFAULT_FN_ATTRS128
299
_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
300
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
301
                                           (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
302
                                           (__v4sf)__D);
303
}
304
 
305
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
306
///
307
/// \headerfile <x86intrin.h>
308
///
309
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
310
///
311
/// \param __A
312
///    A 128-bit vector of [8 x bfloat].
313
/// \param __B
314
///    A 128-bit vector of [8 x bfloat].
315
/// \param __D
316
///    A 128-bit vector of [4 x float].
317
/// \param __U
318
///    A 8-bit mask value specifying what is chosen for each element.
319
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
320
/// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
321
///  __A, __B and __D
322
static __inline__ __m128 __DEFAULT_FN_ATTRS128
323
_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
324
  return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
325
                                           (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
326
                                           (__v4sf)_mm_setzero_si128());
327
}
328
 
329
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
330
///
331
/// \headerfile <x86intrin.h>
332
///
333
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
334
///
335
/// \param __A
336
///    A 256-bit vector of [16 x bfloat].
337
/// \param __B
338
///    A 256-bit vector of [16 x bfloat].
339
/// \param __D
340
///    A 256-bit vector of [8 x float].
341
/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
342
///  __A, __B and __D
343
static __inline__ __m256 __DEFAULT_FN_ATTRS256
344
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
345
  return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
346
                                             (__v16bf)__A,
347
                                             (__v16bf)__B);
348
}
349
 
350
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
351
///
352
/// \headerfile <x86intrin.h>
353
///
354
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
355
///
356
/// \param __A
357
///    A 256-bit vector of [16 x bfloat].
358
/// \param __B
359
///    A 256-bit vector of [16 x bfloat].
360
/// \param __D
361
///    A 256-bit vector of [8 x float].
362
/// \param __U
363
///    A 16-bit mask value specifying what is chosen for each element.
364
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
365
/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
366
///  __A, __B and __D
367
static __inline__ __m256 __DEFAULT_FN_ATTRS256
368
_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
369
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
370
                                        (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
371
                                        (__v8sf)__D);
372
}
373
 
374
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
375
///
376
/// \headerfile <x86intrin.h>
377
///
378
/// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
379
///
380
/// \param __A
381
///    A 256-bit vector of [16 x bfloat].
382
/// \param __B
383
///    A 256-bit vector of [16 x bfloat].
384
/// \param __D
385
///    A 256-bit vector of [8 x float].
386
/// \param __U
387
///    A 8-bit mask value specifying what is chosen for each element.
388
///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
389
/// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
390
///  __A, __B and __D
391
static __inline__ __m256 __DEFAULT_FN_ATTRS256
392
_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
393
  return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
394
                                        (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
395
                                        (__v8sf)_mm256_setzero_si256());
396
}
397
 
398
/// Convert One Single float Data to One BF16 Data.
399
///
400
/// \headerfile <x86intrin.h>
401
///
402
/// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
403
///
404
/// \param __A
405
///    A float data.
406
/// \returns A bf16 data whose sign field and exponent field keep unchanged,
407
///    and fraction field is truncated to 7 bits.
408
static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
409
  __v4sf __V = {__A, 0, 0, 0};
410
  __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
411
      (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
412
  return (__bf16)__R[0];
413
}
414
 
415
/// Convert Packed BF16 Data to Packed float Data.
416
///
417
/// \headerfile <x86intrin.h>
418
///
419
/// \param __A
420
///    A 128-bit vector of [4 x bfloat].
421
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
422
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
423
  return _mm_castsi128_ps(
424
      (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
425
}
426
 
427
/// Convert Packed BF16 Data to Packed float Data.
428
///
429
/// \headerfile <x86intrin.h>
430
///
431
/// \param __A
432
///    A 128-bit vector of [8 x bfloat].
433
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
434
static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
435
  return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
436
      (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
437
}
438
 
439
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
440
///
441
/// \headerfile <x86intrin.h>
442
///
443
/// \param __U
444
///    A 4-bit mask. Elements are zeroed out when the corresponding mask
445
///    bit is not set.
446
/// \param __A
447
///    A 128-bit vector of [4 x bfloat].
448
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
449
static __inline__ __m128 __DEFAULT_FN_ATTRS128
450
_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
451
  return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
452
      (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
453
}
454
 
455
/// Convert Packed BF16 Data to Packed float Data using zeroing mask.
456
///
457
/// \headerfile <x86intrin.h>
458
///
459
/// \param __U
460
///    A 8-bit mask. Elements are zeroed out when the corresponding mask
461
///    bit is not set.
462
/// \param __A
463
///    A 128-bit vector of [8 x bfloat].
464
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
465
static __inline__ __m256 __DEFAULT_FN_ATTRS256
466
_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
467
  return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
468
      (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
469
}
470
 
471
/// Convert Packed BF16 Data to Packed float Data using merging mask.
472
///
473
/// \headerfile <x86intrin.h>
474
///
475
/// \param __S
476
///    A 128-bit vector of [4 x float]. Elements are copied from __S when
477
///     the corresponding mask bit is not set.
478
/// \param __U
479
///    A 4-bit mask. Elements are zeroed out when the corresponding mask
480
///    bit is not set.
481
/// \param __A
482
///    A 128-bit vector of [4 x bfloat].
483
/// \returns A 128-bit vector of [4 x float] come from conversion of __A
484
static __inline__ __m128 __DEFAULT_FN_ATTRS128
485
_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
486
  return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
487
      (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
488
      16));
489
}
490
 
491
/// Convert Packed BF16 Data to Packed float Data using merging mask.
492
///
493
/// \headerfile <x86intrin.h>
494
///
495
/// \param __S
496
///    A 256-bit vector of [8 x float]. Elements are copied from __S when
497
///     the corresponding mask bit is not set.
498
/// \param __U
499
///    A 8-bit mask. Elements are zeroed out when the corresponding mask
500
///    bit is not set.
501
/// \param __A
502
///    A 128-bit vector of [8 x bfloat].
503
/// \returns A 256-bit vector of [8 x float] come from conversion of __A
504
static __inline__ __m256 __DEFAULT_FN_ATTRS256
505
_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
506
  return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
507
      (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
508
      16));
509
}
510
 
511
#undef __DEFAULT_FN_ATTRS128
512
#undef __DEFAULT_FN_ATTRS256
513
 
514
#endif
515
#endif