Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===-----------------------------------------------------------------------===
  8.  */
  9. #ifndef __IMMINTRIN_H
  10. #error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
  11. #endif
  12.  
  13. #ifdef __SSE2__
  14.  
  15. #ifndef __AVX512VLBF16INTRIN_H
  16. #define __AVX512VLBF16INTRIN_H
  17.  
  /* Attribute set applied to the 128-bit intrinsics defined in this header:
   * always inline, no debug info, the "avx512vl, avx512bf16" target features,
   * and a minimum vector width of 128 bits. */
  #define __DEFAULT_FN_ATTRS128 \
    __attribute__((__always_inline__, \
                   __nodebug__, \
                   __target__("avx512vl, avx512bf16"), \
                   __min_vector_width__(128)))
  /* Attribute set applied to the 256-bit intrinsics defined in this header:
   * always inline, no debug info, the "avx512vl, avx512bf16" target features,
   * and a minimum vector width of 256 bits. */
  #define __DEFAULT_FN_ATTRS256 \
    __attribute__((__always_inline__, \
                   __nodebug__, \
                   __target__("avx512vl, avx512bf16"), \
                   __min_vector_width__(256)))
  24.  
  25. /// Convert Two Packed Single Data to One Packed BF16 Data.
  26. ///
  27. /// \headerfile <x86intrin.h>
  28. ///
  29. /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  30. ///
  31. /// \param __A
  32. ///    A 128-bit vector of [4 x float].
  33. /// \param __B
  34. ///    A 128-bit vector of [4 x float].
  35. /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
  36. ///    conversion of __B, and higher 64 bits come from conversion of __A.
  37. static __inline__ __m128bh __DEFAULT_FN_ATTRS128
  38. _mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
  39.   return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A,
  40.                                                     (__v4sf) __B);
  41. }
  42.  
  43. /// Convert Two Packed Single Data to One Packed BF16 Data.
  44. ///
  45. /// \headerfile <x86intrin.h>
  46. ///
  47. /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  48. ///
  49. /// \param __A
  50. ///    A 128-bit vector of [4 x float].
  51. /// \param __B
  52. ///    A 128-bit vector of [4 x float].
  53. /// \param __W
  54. ///    A 128-bit vector of [8 x bfloat].
  55. /// \param __U
  56. ///    A 8-bit mask value specifying what is chosen for each element.
  57. ///    A 1 means conversion of __A or __B. A 0 means element from __W.
  58. /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
  59. ///    conversion of __B, and higher 64 bits come from conversion of __A.
  60. static __inline__ __m128bh __DEFAULT_FN_ATTRS128
  61. _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
  62.   return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
  63.                                              (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
  64.                                              (__v8bf)__W);
  65. }
  66.  
  67. /// Convert Two Packed Single Data to One Packed BF16 Data.
  68. ///
  69. /// \headerfile <x86intrin.h>
  70. ///
  71. /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  72. ///
  73. /// \param __A
  74. ///    A 128-bit vector of [4 x float].
  75. /// \param __B
  76. ///    A 128-bit vector of [4 x float].
  77. /// \param __U
  78. ///    A 8-bit mask value specifying what is chosen for each element.
  79. ///    A 1 means conversion of __A or __B. A 0 means element is zero.
  80. /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
  81. ///    conversion of __B, and higher 64 bits come from conversion of __A.
  82. static __inline__ __m128bh __DEFAULT_FN_ATTRS128
  83. _mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
  84.   return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
  85.                                              (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
  86.                                              (__v8bf)_mm_setzero_si128());
  87. }
  88.  
  89. /// Convert Two Packed Single Data to One Packed BF16 Data.
  90. ///
  91. /// \headerfile <x86intrin.h>
  92. ///
  93. /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  94. ///
  95. /// \param __A
  96. ///    A 256-bit vector of [8 x float].
  97. /// \param __B
  98. ///    A 256-bit vector of [8 x float].
  99. /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
  100. ///    conversion of __B, and higher 128 bits come from conversion of __A.
  101. static __inline__ __m256bh __DEFAULT_FN_ATTRS256
  102. _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
  103.   return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A,
  104.                                                     (__v8sf) __B);
  105. }
  106.  
  107. /// Convert Two Packed Single Data to One Packed BF16 Data.
  108. ///
  109. /// \headerfile <x86intrin.h>
  110. ///
  111. /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  112. ///
  113. /// \param __A
  114. ///    A 256-bit vector of [8 x float].
  115. /// \param __B
  116. ///    A 256-bit vector of [8 x float].
  117. /// \param __W
  118. ///    A 256-bit vector of [16 x bfloat].
  119. /// \param __U
  120. ///    A 16-bit mask value specifying what is chosen for each element.
  121. ///    A 1 means conversion of __A or __B. A 0 means element from __W.
  122. /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
  123. ///    conversion of __B, and higher 128 bits come from conversion of __A.
  124. static __inline__ __m256bh __DEFAULT_FN_ATTRS256
  125. _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
  126.   return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
  127.                                          (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
  128.                                          (__v16bf)__W);
  129. }
  130.  
  131. /// Convert Two Packed Single Data to One Packed BF16 Data.
  132. ///
  133. /// \headerfile <x86intrin.h>
  134. ///
  135. /// This intrinsic corresponds to the <c> VCVTNE2PS2BF16 </c> instructions.
  136. ///
  137. /// \param __A
  138. ///    A 256-bit vector of [8 x float].
  139. /// \param __B
  140. ///    A 256-bit vector of [8 x float].
  141. /// \param __U
  142. ///    A 16-bit mask value specifying what is chosen for each element.
  143. ///    A 1 means conversion of __A or __B. A 0 means element is zero.
  144. /// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from
  145. ///    conversion of __B, and higher 128 bits come from conversion of __A.
  146. static __inline__ __m256bh __DEFAULT_FN_ATTRS256
  147. _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
  148.   return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
  149.                                          (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
  150.                                          (__v16bf)_mm256_setzero_si256());
  151. }
  152.  
  /// Convert Packed Single Data to Packed BF16 Data.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
  ///
  /// Implemented as a macro; the argument is evaluated exactly once.
  ///
  /// \param A
  ///    A 128-bit vector of [4 x float].
  /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
  ///    conversion of A, and higher 64 bits are 0.
  #define _mm_cvtneps_pbh(A) \
    ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
  165.  
  166. /// Convert Packed Single Data to Packed BF16 Data.
  167. ///
  168. /// \headerfile <x86intrin.h>
  169. ///
  170. /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
  171. ///
  172. /// \param __A
  173. ///    A 128-bit vector of [4 x float].
  174. /// \param __W
  175. ///    A 128-bit vector of [8 x bfloat].
  176. /// \param __U
  177. ///    A 4-bit mask value specifying what is chosen for each element.
  178. ///    A 1 means conversion of __A. A 0 means element from __W.
  179. /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
  180. ///    conversion of __A, and higher 64 bits are 0.
  181. static __inline__ __m128bh __DEFAULT_FN_ATTRS128
  182. _mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
  183.   return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
  184.                                                         (__v8bf)__W,
  185.                                                         (__mmask8)__U);
  186. }
  187.  
  188. /// Convert Packed Single Data to Packed BF16 Data.
  189. ///
  190. /// \headerfile <x86intrin.h>
  191. ///
  192. /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
  193. ///
  194. /// \param __A
  195. ///    A 128-bit vector of [4 x float].
  196. /// \param __U
  197. ///    A 4-bit mask value specifying what is chosen for each element.
  198. ///    A 1 means conversion of __A. A 0 means element is zero.
  199. /// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
  200. ///    conversion of __A, and higher 64 bits are 0.
  201. static __inline__ __m128bh __DEFAULT_FN_ATTRS128
  202. _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
  203.   return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
  204.                                                     (__v8bf)_mm_setzero_si128(),
  205.                                                     (__mmask8)__U);
  206. }
  207.  
  /// Convert Packed Single Data to Packed BF16 Data.
  ///
  /// \headerfile <x86intrin.h>
  ///
  /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
  ///
  /// Implemented as a macro; the argument is evaluated exactly once.
  ///
  /// \param A
  ///    A 256-bit vector of [8 x float].
  /// \returns A 128-bit vector of [8 x bfloat] that comes from conversion of A.
  #define _mm256_cvtneps_pbh(A) \
    ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
  219.  
  220. /// Convert Packed Single Data to Packed BF16 Data.
  221. ///
  222. /// \headerfile <x86intrin.h>
  223. ///
  224. /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
  225. ///
  226. /// \param __A
  227. ///    A 256-bit vector of [8 x float].
  228. /// \param __W
  229. ///    A 256-bit vector of [8 x bfloat].
  230. /// \param __U
  231. ///    A 8-bit mask value specifying what is chosen for each element.
  232. ///    A 1 means conversion of __A. A 0 means element from __W.
  233. /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
  234. static __inline__ __m128bh __DEFAULT_FN_ATTRS256
  235. _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
  236.   return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
  237.                                                         (__v8bf)__W,
  238.                                                         (__mmask8)__U);
  239. }
  240.  
  241. /// Convert Packed Single Data to Packed BF16 Data.
  242. ///
  243. /// \headerfile <x86intrin.h>
  244. ///
  245. /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
  246. ///
  247. /// \param __A
  248. ///    A 256-bit vector of [8 x float].
  249. /// \param __U
  250. ///    A 8-bit mask value specifying what is chosen for each element.
  251. ///    A 1 means conversion of __A. A 0 means element is zero.
  252. /// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A.
  253. static __inline__ __m128bh __DEFAULT_FN_ATTRS256
  254. _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
  255.   return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
  256.                                                     (__v8bf)_mm_setzero_si128(),
  257.                                                     (__mmask8)__U);
  258. }
  259.  
  260. /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
  261. ///
  262. /// \headerfile <x86intrin.h>
  263. ///
  264. /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
  265. ///
  266. /// \param __A
  267. ///    A 128-bit vector of [8 x bfloat].
  268. /// \param __B
  269. ///    A 128-bit vector of [8 x bfloat].
  270. /// \param __D
  271. ///    A 128-bit vector of [4 x float].
  272. /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
  273. ///  __A, __B and __D
  274. static __inline__ __m128 __DEFAULT_FN_ATTRS128
  275. _mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
  276.   return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
  277.                                              (__v8bf)__A,
  278.                                              (__v8bf)__B);
  279. }
  280.  
  281. /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
  282. ///
  283. /// \headerfile <x86intrin.h>
  284. ///
  285. /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
  286. ///
  287. /// \param __A
  288. ///    A 128-bit vector of [8 x bfloat].
  289. /// \param __B
  290. ///    A 128-bit vector of [8 x bfloat].
  291. /// \param __D
  292. ///    A 128-bit vector of [4 x float].
  293. /// \param __U
  294. ///    A 8-bit mask value specifying what is chosen for each element.
  295. ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
  296. /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
  297. ///  __A, __B and __D
  298. static __inline__ __m128 __DEFAULT_FN_ATTRS128
  299. _mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) {
  300.   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
  301.                                            (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
  302.                                            (__v4sf)__D);
  303. }
  304.  
  305. /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
  306. ///
  307. /// \headerfile <x86intrin.h>
  308. ///
  309. /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
  310. ///
  311. /// \param __A
  312. ///    A 128-bit vector of [8 x bfloat].
  313. /// \param __B
  314. ///    A 128-bit vector of [8 x bfloat].
  315. /// \param __D
  316. ///    A 128-bit vector of [4 x float].
  317. /// \param __U
  318. ///    A 8-bit mask value specifying what is chosen for each element.
  319. ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
  320. /// \returns A 128-bit vector of [4 x float] comes from  Dot Product of
  321. ///  __A, __B and __D
  322. static __inline__ __m128 __DEFAULT_FN_ATTRS128
  323. _mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
  324.   return (__m128)__builtin_ia32_selectps_128((__mmask8)__U,
  325.                                            (__v4sf)_mm_dpbf16_ps(__D, __A, __B),
  326.                                            (__v4sf)_mm_setzero_si128());
  327. }
  328.  
  329. /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
  330. ///
  331. /// \headerfile <x86intrin.h>
  332. ///
  333. /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
  334. ///
  335. /// \param __A
  336. ///    A 256-bit vector of [16 x bfloat].
  337. /// \param __B
  338. ///    A 256-bit vector of [16 x bfloat].
  339. /// \param __D
  340. ///    A 256-bit vector of [8 x float].
  341. /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
  342. ///  __A, __B and __D
  343. static __inline__ __m256 __DEFAULT_FN_ATTRS256
  344. _mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
  345.   return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
  346.                                              (__v16bf)__A,
  347.                                              (__v16bf)__B);
  348. }
  349.  
  350. /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
  351. ///
  352. /// \headerfile <x86intrin.h>
  353. ///
  354. /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
  355. ///
  356. /// \param __A
  357. ///    A 256-bit vector of [16 x bfloat].
  358. /// \param __B
  359. ///    A 256-bit vector of [16 x bfloat].
  360. /// \param __D
  361. ///    A 256-bit vector of [8 x float].
  362. /// \param __U
  363. ///    A 16-bit mask value specifying what is chosen for each element.
  364. ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means __D.
  365. /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
  366. ///  __A, __B and __D
  367. static __inline__ __m256 __DEFAULT_FN_ATTRS256
  368. _mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) {
  369.   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
  370.                                         (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
  371.                                         (__v8sf)__D);
  372. }
  373.  
  374. /// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
  375. ///
  376. /// \headerfile <x86intrin.h>
  377. ///
  378. /// This intrinsic corresponds to the <c> VDPBF16PS </c> instructions.
  379. ///
  380. /// \param __A
  381. ///    A 256-bit vector of [16 x bfloat].
  382. /// \param __B
  383. ///    A 256-bit vector of [16 x bfloat].
  384. /// \param __D
  385. ///    A 256-bit vector of [8 x float].
  386. /// \param __U
  387. ///    A 8-bit mask value specifying what is chosen for each element.
  388. ///    A 1 means __A and __B's dot product accumulated with __D. A 0 means 0.
  389. /// \returns A 256-bit vector of [8 x float] comes from  Dot Product of
  390. ///  __A, __B and __D
  391. static __inline__ __m256 __DEFAULT_FN_ATTRS256
  392. _mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
  393.   return (__m256)__builtin_ia32_selectps_256((__mmask8)__U,
  394.                                         (__v8sf)_mm256_dpbf16_ps(__D, __A, __B),
  395.                                         (__v8sf)_mm256_setzero_si256());
  396. }
  397.  
  398. /// Convert One Single float Data to One BF16 Data.
  399. ///
  400. /// \headerfile <x86intrin.h>
  401. ///
  402. /// This intrinsic corresponds to the <c> VCVTNEPS2BF16 </c> instructions.
  403. ///
  404. /// \param __A
  405. ///    A float data.
  406. /// \returns A bf16 data whose sign field and exponent field keep unchanged,
  407. ///    and fraction field is truncated to 7 bits.
  408. static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
  409.   __v4sf __V = {__A, 0, 0, 0};
  410.   __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
  411.       (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
  412.   return (__bf16)__R[0];
  413. }
  414.  
  415. /// Convert Packed BF16 Data to Packed float Data.
  416. ///
  417. /// \headerfile <x86intrin.h>
  418. ///
  419. /// \param __A
  420. ///    A 128-bit vector of [4 x bfloat].
  421. /// \returns A 128-bit vector of [4 x float] come from conversion of __A
  422. static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) {
  423.   return _mm_castsi128_ps(
  424.       (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16));
  425. }
  426.  
  427. /// Convert Packed BF16 Data to Packed float Data.
  428. ///
  429. /// \headerfile <x86intrin.h>
  430. ///
  431. /// \param __A
  432. ///    A 128-bit vector of [8 x bfloat].
  433. /// \returns A 256-bit vector of [8 x float] come from conversion of __A
  434. static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) {
  435.   return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
  436.       (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16));
  437. }
  438.  
  439. /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
  440. ///
  441. /// \headerfile <x86intrin.h>
  442. ///
  443. /// \param __U
  444. ///    A 4-bit mask. Elements are zeroed out when the corresponding mask
  445. ///    bit is not set.
  446. /// \param __A
  447. ///    A 128-bit vector of [4 x bfloat].
  448. /// \returns A 128-bit vector of [4 x float] come from conversion of __A
  449. static __inline__ __m128 __DEFAULT_FN_ATTRS128
  450. _mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
  451.   return _mm_castsi128_ps((__m128i)_mm_slli_epi32(
  452.       (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
  453. }
  454.  
  455. /// Convert Packed BF16 Data to Packed float Data using zeroing mask.
  456. ///
  457. /// \headerfile <x86intrin.h>
  458. ///
  459. /// \param __U
  460. ///    A 8-bit mask. Elements are zeroed out when the corresponding mask
  461. ///    bit is not set.
  462. /// \param __A
  463. ///    A 128-bit vector of [8 x bfloat].
  464. /// \returns A 256-bit vector of [8 x float] come from conversion of __A
  465. static __inline__ __m256 __DEFAULT_FN_ATTRS256
  466. _mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) {
  467.   return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32(
  468.       (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16));
  469. }
  470.  
  471. /// Convert Packed BF16 Data to Packed float Data using merging mask.
  472. ///
  473. /// \headerfile <x86intrin.h>
  474. ///
  475. /// \param __S
  476. ///    A 128-bit vector of [4 x float]. Elements are copied from __S when
  477. ///     the corresponding mask bit is not set.
  478. /// \param __U
  479. ///    A 4-bit mask. Elements are zeroed out when the corresponding mask
  480. ///    bit is not set.
  481. /// \param __A
  482. ///    A 128-bit vector of [4 x bfloat].
  483. /// \returns A 128-bit vector of [4 x float] come from conversion of __A
  484. static __inline__ __m128 __DEFAULT_FN_ATTRS128
  485. _mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) {
  486.   return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32(
  487.       (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A),
  488.       16));
  489. }
  490.  
  491. /// Convert Packed BF16 Data to Packed float Data using merging mask.
  492. ///
  493. /// \headerfile <x86intrin.h>
  494. ///
  495. /// \param __S
  496. ///    A 256-bit vector of [8 x float]. Elements are copied from __S when
  497. ///     the corresponding mask bit is not set.
  498. /// \param __U
  499. ///    A 8-bit mask. Elements are zeroed out when the corresponding mask
  500. ///    bit is not set.
  501. /// \param __A
  502. ///    A 128-bit vector of [8 x bfloat].
  503. /// \returns A 256-bit vector of [8 x float] come from conversion of __A
  504. static __inline__ __m256 __DEFAULT_FN_ATTRS256
  505. _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
  506.   return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32(
  507.       (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A),
  508.       16));
  509. }
  510.  
  511. #undef __DEFAULT_FN_ATTRS128
  512. #undef __DEFAULT_FN_ATTRS256
  513.  
  514. #endif
  515. #endif
  516.