Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Blame | Last modification | View Log | Download | RSS feed

  1. /*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
  2.  *
  3.  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  4.  * See https://llvm.org/LICENSE.txt for license information.
  5.  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
  6.  *
  7.  *===-----------------------------------------------------------------------===
  8.  */
  9. #ifndef __IMMINTRIN_H
  10. #error                                                                         \
  11.     "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
  12. #endif
  13.  
  14. #ifndef __AVXVNNIINT8INTRIN_H
  15. #define __AVXVNNIINT8INTRIN_H
  16.  
  17. /* Default attributes for every intrinsic below; the two macros differ only in __min_vector_width__ (256 vs. 128) and are #undef'd at the end of this file. */
  18. #define __DEFAULT_FN_ATTRS256                                                  \
  19.   __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
  20.                  __min_vector_width__(256)))
  21. #define __DEFAULT_FN_ATTRS128                                                  \
  22.   __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
  23.                  __min_vector_width__(128)))
  24.  
  25. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  26. ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
  27. ///    signed 16-bit results. Sum these 4 results with the corresponding
  28. ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  29. ///
  30. /// \headerfile <x86intrin.h>
  31. ///
  32. /// \code
  33. /// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
  34. /// \endcode
  35. ///
  36. /// This intrinsic corresponds to the \c VPDPBSSD instruction.
  37. ///
  38. /// \param __W A 128-bit vector of [4 x int] that the products are added to.
  39. /// \param __A A 128-bit vector of [16 x char].
  40. /// \param __B A 128-bit vector of [16 x char].
  41. ///
  42. /// \returns
  43. ///    A 128-bit vector of [4 x int].
  44. ///
  45. /// \code{.operation}
  46. /// FOR j := 0 to 3
  47. ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
  48. ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
  49. ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
  50. ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
  51. ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
  52. /// ENDFOR
  53. /// dst[MAX:128] := 0
  54. /// \endcode
  55. static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
  56.                                                                  __m128i __A,
  57.                                                                  __m128i __B) {
  58.   return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
  59.                                              (__v4si)__B);
  60. }
  61.  
  62. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  63. ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
  64. ///    signed 16-bit results. Sum these 4 results with the corresponding
  65. ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  66. ///
  67. /// \headerfile <x86intrin.h>
  68. ///
  69. /// \code
  70. /// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
  71. /// \endcode
  72. ///
  73. /// This intrinsic corresponds to the \c VPDPBSSD instruction.
  74. ///
  75. /// \param __W A 256-bit vector of [8 x int] that the products are added to.
  76. /// \param __A A 256-bit vector of [32 x char].
  77. /// \param __B A 256-bit vector of [32 x char].
  78. ///
  79. /// \returns
  80. ///    A 256-bit vector of [8 x int].
  81. ///
  82. /// \code{.operation}
  83. /// FOR j := 0 to 7
  84. ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
  85. ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
  86. ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
  87. ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
  88. ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
  89. /// ENDFOR
  90. /// dst[MAX:256] := 0
  91. /// \endcode
  92. static __inline__ __m256i __DEFAULT_FN_ATTRS256
  93. _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
  94.   return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
  95.                                              (__v8si)__B);
  96. }
  97.  
  98. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  99. ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
  100. ///    signed 16-bit results. Sum these 4 results with the corresponding
  101. ///    32-bit integer in \a __W with signed saturation, and store the packed
  102. ///    32-bit results in \a dst.
  103. ///
  104. /// \headerfile <x86intrin.h>
  105. ///
  106. /// \code
  107. /// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
  108. /// \endcode
  109. ///
  110. /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
  111. ///
  112. /// \param __W A 128-bit vector of [4 x int] that the products are added to.
  113. /// \param __A A 128-bit vector of [16 x char].
  114. /// \param __B A 128-bit vector of [16 x char].
  115. ///
  116. /// \returns
  117. ///    A 128-bit vector of [4 x int].
  118. ///
  119. /// \code{.operation}
  120. /// FOR j := 0 to 3
  121. ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
  122. ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
  123. ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
  124. ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
  125. ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
  126. /// ENDFOR
  127. /// dst[MAX:128] := 0
  128. /// \endcode
  129. static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
  130.                                                                   __m128i __A,
  131.                                                                   __m128i __B) {
  132.   return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
  133.                                               (__v4si)__B);
  134. }
  135.  
  136. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  137. ///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
  138. ///    signed 16-bit results. Sum these 4 results with the corresponding
  139. ///    32-bit integer in \a __W with signed saturation, and store the packed
  140. ///    32-bit results in \a dst.
  141. ///
  142. /// \headerfile <x86intrin.h>
  143. ///
  144. /// \code
  145. /// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
  146. /// \endcode
  147. ///
  148. /// This intrinsic corresponds to the \c VPDPBSSDS instruction.
  149. ///
  150. /// \param __W A 256-bit vector of [8 x int] that the products are added to.
  151. /// \param __A A 256-bit vector of [32 x char].
  152. /// \param __B A 256-bit vector of [32 x char].
  153. ///
  154. /// \returns
  155. ///    A 256-bit vector of [8 x int].
  156. ///
  157. /// \code{.operation}
  158. /// FOR j := 0 to 7
  159. ///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
  160. ///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
  161. ///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
  162. ///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
  163. ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
  164. /// ENDFOR
  165. /// dst[MAX:256] := 0
  166. /// \endcode
  167. static __inline__ __m256i __DEFAULT_FN_ATTRS256
  168. _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  169.   return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
  170.                                               (__v8si)__B);
  171. }
  172.  
  173. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  174. ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
  175. ///    signed 16-bit results. Sum these 4 results with the corresponding
  176. ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  177. ///
  178. /// \headerfile <x86intrin.h>
  179. ///
  180. /// \code
  181. /// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
  182. /// \endcode
  183. ///
  184. /// This intrinsic corresponds to the \c VPDPBSUD instruction.
  185. ///
  186. /// \param __W A 128-bit vector of [4 x int] that the products are added to.
  187. /// \param __A A 128-bit vector of [16 x char].
  188. /// \param __B A 128-bit vector of [16 x unsigned char].
  189. ///
  190. /// \returns
  191. ///    A 128-bit vector of [4 x int].
  192. ///
  193. /// \code{.operation}
  194. /// FOR j := 0 to 3
  195. ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
  196. ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
  197. ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
  198. ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
  199. ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
  200. /// ENDFOR
  201. /// dst[MAX:128] := 0
  202. /// \endcode
  203. static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
  204.                                                                  __m128i __A,
  205.                                                                  __m128i __B) {
  206.   return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
  207.                                              (__v4si)__B);
  208. }
  209.  
  210. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  211. ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
  212. ///    signed 16-bit results. Sum these 4 results with the corresponding
  213. ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  214. ///
  215. /// \headerfile <x86intrin.h>
  216. ///
  217. /// \code
  218. /// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
  219. /// \endcode
  220. ///
  221. /// This intrinsic corresponds to the \c VPDPBSUD instruction.
  222. ///
  223. /// \param __W A 256-bit vector of [8 x int] that the products are added to.
  224. /// \param __A A 256-bit vector of [32 x char].
  225. /// \param __B A 256-bit vector of [32 x unsigned char].
  226. ///
  227. /// \returns
  228. ///    A 256-bit vector of [8 x int].
  229. ///
  230. /// \code{.operation}
  231. /// FOR j := 0 to 7
  232. ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
  233. ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
  234. ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
  235. ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
  236. ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
  237. /// ENDFOR
  238. /// dst[MAX:256] := 0
  239. /// \endcode
  240. static __inline__ __m256i __DEFAULT_FN_ATTRS256
  241. _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  242.   return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
  243.                                              (__v8si)__B);
  244. }
  245.  
  246. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  247. ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
  248. ///    signed 16-bit results. Sum these 4 results with the corresponding
  249. ///    32-bit integer in \a __W with signed saturation, and store the packed
  250. ///    32-bit results in \a dst.
  251. ///
  252. /// \headerfile <x86intrin.h>
  253. ///
  254. /// \code
  255. /// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
  256. /// \endcode
  257. ///
  258. /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
  259. ///
  260. /// \param __W A 128-bit vector of [4 x int] that the products are added to.
  261. /// \param __A A 128-bit vector of [16 x char].
  262. /// \param __B A 128-bit vector of [16 x unsigned char].
  263. ///
  264. /// \returns
  265. ///    A 128-bit vector of [4 x int].
  266. ///
  267. /// \code{.operation}
  268. /// FOR j := 0 to 3
  269. ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
  270. ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
  271. ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
  272. ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
  273. ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
  274. /// ENDFOR
  275. /// dst[MAX:128] := 0
  276. /// \endcode
  277. static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
  278.                                                                   __m128i __A,
  279.                                                                   __m128i __B) {
  280.   return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
  281.                                               (__v4si)__B);
  282. }
  283.  
  284. /// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
  285. ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
  286. ///    signed 16-bit results. Sum these 4 results with the corresponding
  287. ///    32-bit integer in \a __W with signed saturation, and store the packed
  288. ///    32-bit results in \a dst.
  289. ///
  290. /// \headerfile <x86intrin.h>
  291. ///
  292. /// \code
  293. /// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
  294. /// \endcode
  295. ///
  296. /// This intrinsic corresponds to the \c VPDPBSUDS instruction.
  297. ///
  298. /// \param __W A 256-bit vector of [8 x int] that the products are added to.
  299. /// \param __A A 256-bit vector of [32 x char].
  300. /// \param __B A 256-bit vector of [32 x unsigned char].
  301. ///
  302. /// \returns
  303. ///    A 256-bit vector of [8 x int].
  304. ///
  305. /// \code{.operation}
  306. /// FOR j := 0 to 7
  307. ///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
  308. ///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
  309. ///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
  310. ///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
  311. ///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
  312. /// ENDFOR
  313. /// dst[MAX:256] := 0
  314. /// \endcode
  315. static __inline__ __m256i __DEFAULT_FN_ATTRS256
  316. _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  317.   return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
  318.                                               (__v8si)__B);
  319. }
  320.  
  321. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
  322. ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
  323. ///    unsigned 16-bit results. Sum these 4 results with the corresponding
  324. ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  325. ///
  326. /// \headerfile <x86intrin.h>
  327. ///
  328. /// \code
  329. /// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
  330. /// \endcode
  331. ///
  332. /// This intrinsic corresponds to the \c VPDPBUUD instruction.
  333. ///
  334. /// \param __W A 128-bit vector of [4 x int] that the products are added to.
  335. /// \param __A A 128-bit vector of [16 x unsigned char].
  336. /// \param __B A 128-bit vector of [16 x unsigned char].
  337. ///
  338. /// \returns
  339. ///    A 128-bit vector of [4 x int].
  340. ///
  341. /// \code{.operation}
  342. /// FOR j := 0 to 3
  343. ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
  344. ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
  345. ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
  346. ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
  347. ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
  348. /// ENDFOR
  349. /// dst[MAX:128] := 0
  350. /// \endcode
  351. static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
  352.                                                                  __m128i __A,
  353.                                                                  __m128i __B) {
  354.   return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
  355.                                              (__v4si)__B);
  356. }
  357.  
  358. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
  359. ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
  360. ///    unsigned 16-bit results. Sum these 4 results with the corresponding
  361. ///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
  362. ///
  363. /// \headerfile <x86intrin.h>
  364. ///
  365. /// \code
  366. /// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
  367. /// \endcode
  368. ///
  369. /// This intrinsic corresponds to the \c VPDPBUUD instruction.
  370. ///
  371. /// \param __W A 256-bit vector of [8 x int] that the products are added to.
  372. /// \param __A A 256-bit vector of [32 x unsigned char].
  373. /// \param __B A 256-bit vector of [32 x unsigned char].
  374. ///
  375. /// \returns
  376. ///    A 256-bit vector of [8 x int].
  377. ///
  378. /// \code{.operation}
  379. /// FOR j := 0 to 7
  380. ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
  381. ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
  382. ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
  383. ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
  384. ///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
  385. /// ENDFOR
  386. /// dst[MAX:256] := 0
  387. /// \endcode
  388. static __inline__ __m256i __DEFAULT_FN_ATTRS256
  389. _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
  390.   return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
  391.                                              (__v8si)__B);
  392. }
  393.  
  394. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
  395. ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
  396. ///    unsigned 16-bit results. Sum these 4 results with the corresponding
  397. ///    32-bit integer in \a __W with unsigned saturation, and store the packed
  398. ///    32-bit results in \a dst.
  399. ///
  400. /// \headerfile <x86intrin.h>
  401. ///
  402. /// \code
  403. /// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
  404. /// \endcode
  405. ///
  406. /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
  407. ///
  408. /// \param __W A 128-bit vector of [4 x int] that the products are added to.
  409. /// \param __A A 128-bit vector of [16 x unsigned char].
  410. /// \param __B A 128-bit vector of [16 x unsigned char].
  411. ///
  412. /// \returns
  413. ///    A 128-bit vector of [4 x int].
  414. ///
  415. /// \code{.operation}
  416. /// FOR j := 0 to 3
  417. ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
  418. ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
  419. ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
  420. ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
  421. ///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
  422. /// ENDFOR
  423. /// dst[MAX:128] := 0
  424. /// \endcode
  425. static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
  426.                                                                   __m128i __A,
  427.                                                                   __m128i __B) {
  428.   return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
  429.                                               (__v4si)__B);
  430. }
  431.  
  432. /// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
  433. ///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
  434. ///    unsigned 16-bit results. Sum these 4 results with the corresponding
  435. ///    32-bit integer in \a __W with unsigned saturation, and store the packed
  436. ///    32-bit results in \a dst.
  437. ///
  438. /// \headerfile <x86intrin.h>
  439. ///
  440. /// \code
  441. /// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
  442. /// \endcode
  443. ///
  444. /// This intrinsic corresponds to the \c VPDPBUUDS instruction.
  445. ///
  446. /// \param __W A 256-bit vector of [8 x int] that the products are added to.
  447. /// \param __A A 256-bit vector of [32 x unsigned char].
  448. /// \param __B A 256-bit vector of [32 x unsigned char].
  449. ///
  450. /// \returns
  451. ///    A 256-bit vector of [8 x int].
  452. ///
  453. /// \code{.operation}
  454. /// FOR j := 0 to 7
  455. ///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
  456. ///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
  457. ///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
  458. ///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
  459. ///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
  460. /// ENDFOR
  461. /// dst[MAX:256] := 0
  462. /// \endcode
  463. static __inline__ __m256i __DEFAULT_FN_ATTRS256
  464. _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
  465.   return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
  466.                                               (__v8si)__B);
  467. }
  468. #undef __DEFAULT_FN_ATTRS128
  469. #undef __DEFAULT_FN_ATTRS256
  470.  
  471. #endif // __AVXVNNIINT8INTRIN_H
  472.