Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===-----------------------------------------------------------------------===
  8.  */
  9.  
  10. #ifndef __IMMINTRIN_H
  11. #error                                                                         \
  12.     "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
  13. #endif // __IMMINTRIN_H
  14.  
  15. #ifdef __SSE2__
  16.  
  17. #ifndef __AVXNECONVERTINTRIN_H
  18. #define __AVXNECONVERTINTRIN_H
  19.  
  20. /* Default function attributes; the two macros differ only in vector width. */
  21. #define __DEFAULT_FN_ATTRS128                                                  \
  22.   __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
  23.                  __min_vector_width__(128)))
  24. #define __DEFAULT_FN_ATTRS256                                                  \
  25.   __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"),   \
  26.                  __min_vector_width__(256)))
  27.  
  28. /// Convert the scalar BF16 (16-bit) floating-point element stored at memory
  29. /// location \a __A to a single-precision (32-bit) floating-point element,
  30. /// broadcast that element to all four packed single-precision (32-bit)
  31. /// floating-point elements of the result, and store the results in
  32. /// \a dst.
  33. ///
  34. /// \headerfile <immintrin.h>
  35. ///
  36. /// \code
  37. /// __m128 _mm_bcstnebf16_ps(const void *__A);
  38. /// \endcode
  39. ///
  40. /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
  41. ///
  42. /// \param __A
  43. ///    A pointer to a 16-bit memory location that is read as one BF16 value.
  44. ///    The address of the memory location does not have to be aligned.
  45. /// \returns
  46. ///    A 128-bit vector of [4 x float]; every element is the converted value.
  47. ///
  48. /// \code{.operation}
  49. /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
  50. /// FOR j := 0 to 3
  51. ///   m := j*32
  52. ///   dst[m+31:m] := b
  53. /// ENDFOR
  54. /// dst[MAX:128] := 0
  55. /// \endcode
  56. static __inline__ __m128 __DEFAULT_FN_ATTRS128
  57. _mm_bcstnebf16_ps(const void *__A) {
  58.   return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
  59. }
  60.  
  61. /// Convert the scalar BF16 (16-bit) floating-point element stored at memory
  62. /// location \a __A to a single-precision (32-bit) floating-point element,
  63. /// broadcast that element to all eight packed single-precision (32-bit)
  64. /// floating-point elements of the result, and store the results in
  65. /// \a dst.
  66. ///
  67. /// \headerfile <immintrin.h>
  68. ///
  69. /// \code
  70. /// __m256 _mm256_bcstnebf16_ps(const void *__A);
  71. /// \endcode
  72. ///
  73. /// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
  74. ///
  75. /// \param __A
  76. ///    A pointer to a 16-bit memory location that is read as one BF16 value.
  77. ///    The address of the memory location does not have to be aligned.
  78. /// \returns
  79. ///    A 256-bit vector of [8 x float]; every element is the converted value.
  80. ///
  81. /// \code{.operation}
  82. /// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
  83. /// FOR j := 0 to 7
  84. ///   m := j*32
  85. ///   dst[m+31:m] := b
  86. /// ENDFOR
  87. /// dst[MAX:256] := 0
  88. /// \endcode
  89. static __inline__ __m256 __DEFAULT_FN_ATTRS256
  90. _mm256_bcstnebf16_ps(const void *__A) {
  91.   return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
  92. }
  93.  
  94. /// Convert the scalar half-precision (16-bit) floating-point element stored
  95. /// at memory location \a __A to a single-precision (32-bit) floating-point
  96. /// element, broadcast that element to all four packed single-precision
  97. /// (32-bit) floating-point elements of the result, and store the results in
  98. /// \a dst.
  99. ///
  100. /// \headerfile <immintrin.h>
  101. ///
  102. /// \code
  103. /// __m128 _mm_bcstnesh_ps(const void *__A);
  104. /// \endcode
  105. ///
  106. /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
  107. ///
  108. /// \param __A
  109. ///    A pointer to a 16-bit memory location that is read as one half-precision
  110. ///    value. The address of the memory location does not have to be aligned.
  111. /// \returns
  112. ///    A 128-bit vector of [4 x float]; every element is the converted value.
  113. ///
  114. /// \code{.operation}
  115. /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
  116. /// FOR j := 0 to 3
  117. ///   m := j*32
  118. ///   dst[m+31:m] := b
  119. /// ENDFOR
  120. /// dst[MAX:128] := 0
  121. /// \endcode
  122. static __inline__ __m128 __DEFAULT_FN_ATTRS128
  123. _mm_bcstnesh_ps(const void *__A) {
  124.   return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
  125. }
  126.  
  127. /// Convert the scalar half-precision (16-bit) floating-point element stored
  128. /// at memory location \a __A to a single-precision (32-bit) floating-point
  129. /// element, broadcast that element to all eight packed single-precision
  130. /// (32-bit) floating-point elements of the result, and store the results in
  131. /// \a dst.
  132. ///
  133. /// \headerfile <immintrin.h>
  134. ///
  135. /// \code
  136. /// __m256 _mm256_bcstnesh_ps(const void *__A);
  137. /// \endcode
  138. ///
  139. /// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
  140. ///
  141. /// \param __A
  142. ///    A pointer to a 16-bit memory location that is read as one half-precision
  143. ///    value. The address of the memory location does not have to be aligned.
  144. /// \returns
  145. ///    A 256-bit vector of [8 x float]; every element is the converted value.
  146. ///
  147. /// \code{.operation}
  148. /// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
  149. /// FOR j := 0 to 7
  150. ///   m := j*32
  151. ///   dst[m+31:m] := b
  152. /// ENDFOR
  153. /// dst[MAX:256] := 0
  154. /// \endcode
  155. static __inline__ __m256 __DEFAULT_FN_ATTRS256
  156. _mm256_bcstnesh_ps(const void *__A) {
  157.   return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
  158. }
  159.  
  160. /// Convert the even-indexed packed BF16 (16-bit) floating-point elements
  161. /// stored at memory location \a __A to packed single-precision (32-bit)
  162. /// floating-point elements, and store the results in
  163. /// \a dst.
  164. ///
  165. /// \headerfile <immintrin.h>
  166. ///
  167. /// \code
  168. /// __m128 _mm_cvtneebf16_ps(const __m128bh *__A);
  169. /// \endcode
  170. ///
  171. /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
  172. ///
  173. /// \param __A
  174. ///    A pointer to a 128-bit memory location containing 8 consecutive BF16
  175. ///    (16-bit) floating-point values; elements 0, 2, 4 and 6 are converted.
  176. /// \returns
  177. ///    A 128-bit vector of [4 x float].
  178. ///
  179. /// \code{.operation}
  180. /// FOR j := 0 to 3
  181. ///     k := j*2
  182. ///     i := k*16
  183. ///     m := j*32
  184. ///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
  185. /// ENDFOR
  186. /// dst[MAX:128] := 0
  187. /// \endcode
  188. static __inline__ __m128 __DEFAULT_FN_ATTRS128
  189. _mm_cvtneebf16_ps(const __m128bh *__A) {
  190.   return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
  191. }
  192.  
  193. /// Convert the even-indexed packed BF16 (16-bit) floating-point elements
  194. /// stored at memory location \a __A to packed single-precision (32-bit)
  195. /// floating-point elements, and store the results in
  196. /// \a dst.
  197. ///
  198. /// \headerfile <immintrin.h>
  199. ///
  200. /// \code
  201. /// __m256 _mm256_cvtneebf16_ps(const __m256bh *__A);
  202. /// \endcode
  203. ///
  204. /// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
  205. ///
  206. /// \param __A
  207. ///    A pointer to a 256-bit memory location containing 16 consecutive BF16
  208. ///    (16-bit) floating-point values; elements 0, 2, ..., 14 are converted.
  209. /// \returns
  210. ///    A 256-bit vector of [8 x float].
  211. ///
  212. /// \code{.operation}
  213. /// FOR j := 0 to 7
  214. ///     k := j*2
  215. ///     i := k*16
  216. ///     m := j*32
  217. ///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
  218. /// ENDFOR
  219. /// dst[MAX:256] := 0
  220. /// \endcode
  221. static __inline__ __m256 __DEFAULT_FN_ATTRS256
  222. _mm256_cvtneebf16_ps(const __m256bh *__A) {
  223.   return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
  224. }
  225.  
  226. /// Convert the even-indexed packed half-precision (16-bit) floating-point
  227. /// elements stored at memory location \a __A to packed single-precision
  228. /// (32-bit) floating-point elements, and store the results in
  229. /// \a dst.
  230. ///
  231. /// \headerfile <immintrin.h>
  232. ///
  233. /// \code
  234. /// __m128 _mm_cvtneeph_ps(const __m128h *__A);
  235. /// \endcode
  236. ///
  237. /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
  238. ///
  239. /// \param __A
  240. ///    A pointer to a 128-bit memory location containing 8 consecutive
  241. ///    half-precision (16-bit) values; elements 0, 2, 4 and 6 are converted.
  242. /// \returns
  243. ///    A 128-bit vector of [4 x float].
  244. ///
  245. /// \code{.operation}
  246. /// FOR j := 0 to 3
  247. ///     k := j*2
  248. ///     i := k*16
  249. ///     m := j*32
  250. ///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
  251. /// ENDFOR
  252. /// dst[MAX:128] := 0
  253. /// \endcode
  254. static __inline__ __m128 __DEFAULT_FN_ATTRS128
  255. _mm_cvtneeph_ps(const __m128h *__A) {
  256.   return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
  257. }
  258.  
  259. /// Convert the even-indexed packed half-precision (16-bit) floating-point
  260. /// elements stored at memory location \a __A to packed single-precision
  261. /// (32-bit) floating-point elements, and store the results in
  262. /// \a dst.
  263. ///
  264. /// \headerfile <immintrin.h>
  265. ///
  266. /// \code
  267. /// __m256 _mm256_cvtneeph_ps(const __m256h *__A);
  268. /// \endcode
  269. ///
  270. /// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
  271. ///
  272. /// \param __A
  273. ///    A pointer to a 256-bit memory location containing 16 consecutive
  274. ///    half-precision (16-bit) values; elements 0, 2, ..., 14 are converted.
  275. /// \returns
  276. ///    A 256-bit vector of [8 x float].
  277. ///
  278. /// \code{.operation}
  279. /// FOR j := 0 to 7
  280. ///     k := j*2
  281. ///     i := k*16
  282. ///     m := j*32
  283. ///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
  284. /// ENDFOR
  285. /// dst[MAX:256] := 0
  286. /// \endcode
  287. static __inline__ __m256 __DEFAULT_FN_ATTRS256
  288. _mm256_cvtneeph_ps(const __m256h *__A) {
  289.   return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
  290. }
  291.  
  292. /// Convert the odd-indexed packed BF16 (16-bit) floating-point elements
  293. /// stored at memory location \a __A to packed single-precision (32-bit)
  294. /// floating-point elements, and store the results in
  295. /// \a dst.
  296. ///
  297. /// \headerfile <immintrin.h>
  298. ///
  299. /// \code
  300. /// __m128 _mm_cvtneobf16_ps(const __m128bh *__A);
  301. /// \endcode
  302. ///
  303. /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
  304. ///
  305. /// \param __A
  306. ///    A pointer to a 128-bit memory location containing 8 consecutive BF16
  307. ///    (16-bit) floating-point values; elements 1, 3, 5 and 7 are converted.
  308. /// \returns
  309. ///    A 128-bit vector of [4 x float].
  310. ///
  311. /// \code{.operation}
  312. /// FOR j := 0 to 3
  313. ///     k := j*2+1
  314. ///     i := k*16
  315. ///     m := j*32
  316. ///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
  317. /// ENDFOR
  318. /// dst[MAX:128] := 0
  319. /// \endcode
  320. static __inline__ __m128 __DEFAULT_FN_ATTRS128
  321. _mm_cvtneobf16_ps(const __m128bh *__A) {
  322.   return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
  323. }
  324.  
  325. /// Convert the odd-indexed packed BF16 (16-bit) floating-point elements
  326. /// stored at memory location \a __A to packed single-precision (32-bit)
  327. /// floating-point elements, and store the results in
  328. /// \a dst.
  329. ///
  330. /// \headerfile <immintrin.h>
  331. ///
  332. /// \code
  333. /// __m256 _mm256_cvtneobf16_ps(const __m256bh *__A);
  334. /// \endcode
  335. ///
  336. /// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
  337. ///
  338. /// \param __A
  339. ///    A pointer to a 256-bit memory location containing 16 consecutive BF16
  340. ///    (16-bit) floating-point values; elements 1, 3, ..., 15 are converted.
  341. /// \returns
  342. ///    A 256-bit vector of [8 x float].
  343. ///
  344. /// \code{.operation}
  345. /// FOR j := 0 to 7
  346. ///     k := j*2+1
  347. ///     i := k*16
  348. ///     m := j*32
  349. ///     dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
  350. /// ENDFOR
  351. /// dst[MAX:256] := 0
  352. /// \endcode
  353. static __inline__ __m256 __DEFAULT_FN_ATTRS256
  354. _mm256_cvtneobf16_ps(const __m256bh *__A) {
  355.   return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
  356. }
  357.  
  358. /// Convert the odd-indexed packed half-precision (16-bit) floating-point
  359. /// elements stored at memory location \a __A to packed single-precision
  360. /// (32-bit) floating-point elements, and store the results in
  361. /// \a dst.
  362. ///
  363. /// \headerfile <immintrin.h>
  364. ///
  365. /// \code
  366. /// __m128 _mm_cvtneoph_ps(const __m128h *__A);
  367. /// \endcode
  368. ///
  369. /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
  370. ///
  371. /// \param __A
  372. ///    A pointer to a 128-bit memory location containing 8 consecutive
  373. ///    half-precision (16-bit) values; elements 1, 3, 5 and 7 are converted.
  374. /// \returns
  375. ///    A 128-bit vector of [4 x float].
  376. ///
  377. /// \code{.operation}
  378. /// FOR j := 0 to 3
  379. ///     k := j*2+1
  380. ///     i := k*16
  381. ///     m := j*32
  382. ///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
  383. /// ENDFOR
  384. /// dst[MAX:128] := 0
  385. /// \endcode
  386. static __inline__ __m128 __DEFAULT_FN_ATTRS128
  387. _mm_cvtneoph_ps(const __m128h *__A) {
  388.   return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
  389. }
  390.  
  391. /// Convert the odd-indexed packed half-precision (16-bit) floating-point
  392. /// elements stored at memory location \a __A to packed single-precision
  393. /// (32-bit) floating-point elements, and store the results in
  394. /// \a dst.
  395. ///
  396. /// \headerfile <immintrin.h>
  397. ///
  398. /// \code
  399. /// __m256 _mm256_cvtneoph_ps(const __m256h *__A);
  400. /// \endcode
  401. ///
  402. /// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
  403. ///
  404. /// \param __A
  405. ///    A pointer to a 256-bit memory location containing 16 consecutive
  406. ///    half-precision (16-bit) values; elements 1, 3, ..., 15 are converted.
  407. /// \returns
  408. ///    A 256-bit vector of [8 x float].
  409. ///
  410. /// \code{.operation}
  411. /// FOR j := 0 to 7
  412. ///     k := j*2+1
  413. ///     i := k*16
  414. ///     m := j*32
  415. ///     dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
  416. /// ENDFOR
  417. /// dst[MAX:256] := 0
  418. /// \endcode
  419. static __inline__ __m256 __DEFAULT_FN_ATTRS256
  420. _mm256_cvtneoph_ps(const __m256h *__A) {
  421.   return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
  422. }
  423.  
  424. /// Convert the four packed single-precision (32-bit) floating-point elements
  425. /// in \a __A to packed BF16 (16-bit) floating-point elements, and store the
  426. /// results in \a dst.
  427. ///
  428. /// \headerfile <immintrin.h>
  429. ///
  430. /// \code
  431. /// __m128bh _mm_cvtneps_avx_pbh(__m128 __A);
  432. /// \endcode
  433. ///
  434. /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
  435. ///
  436. /// \param __A
  437. ///    A 128-bit vector of [4 x float].
  438. /// \returns
  439. ///    A 128-bit vector of [8 x bfloat]; elements 0 to 3 hold the results.
  440. ///
  441. /// \code{.operation}
  442. /// FOR j := 0 to 3
  443. ///     dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
  444. /// ENDFOR
  445. /// dst[MAX:128] := 0
  446. /// \endcode
  447. static __inline__ __m128bh __DEFAULT_FN_ATTRS128
  448. _mm_cvtneps_avx_pbh(__m128 __A) {
  449.   return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
  450. }
  451.  
  452. /// Convert the eight packed single-precision (32-bit) floating-point elements
  453. /// in \a __A to packed BF16 (16-bit) floating-point elements, and store the
  454. /// results in \a dst.
  455. ///
  456. /// \headerfile <immintrin.h>
  457. ///
  458. /// \code
  459. /// __m128bh _mm256_cvtneps_avx_pbh(__m256 __A);
  460. /// \endcode
  461. ///
  462. /// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
  463. ///
  464. /// \param __A
  465. ///    A 256-bit vector of [8 x float].
  466. /// \returns
  467. ///    A 128-bit vector of [8 x bfloat] holding the eight converted values.
  468. ///
  469. /// \code{.operation}
  470. /// FOR j := 0 to 7
  471. ///     dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
  472. /// ENDFOR
  473. /// dst[MAX:128] := 0
  474. /// \endcode
  475. static __inline__ __m128bh __DEFAULT_FN_ATTRS256
  476. _mm256_cvtneps_avx_pbh(__m256 __A) {
  477.   return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
  478. }
  479.  
  480. #undef __DEFAULT_FN_ATTRS128
  481. #undef __DEFAULT_FN_ATTRS256
  482.  
  483. #endif // __AVXNECONVERTINTRIN_H
  484. #endif // __SSE2__
  485.