Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
14 pmbaty 1
/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9
#ifndef __IMMINTRIN_H
10
#error                                                                         \
11
    "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
12
#endif
13
 
14
#ifndef __AVXVNNIINT8INTRIN_H
15
#define __AVXVNNIINT8INTRIN_H
16
 
17
/* Default attribute sets for the intrinsics defined in this file. Both apply
 * __always_inline__, __nodebug__ and __target__("avxvnniint8"); they differ
 * only in the __min_vector_width__ value (128 or 256). */
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"),    \
                 __min_vector_width__(256)))
24
 
25
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
26
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
27
///    signed 16-bit results. Sum these 4 results with the corresponding
28
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
29
///
30
/// \headerfile <x86intrin.h>
31
///
32
/// \code
33
/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
34
/// \endcode
35
///
36
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
37
///
38
/// \param __A
39
///    A 128-bit vector of [16 x char].
40
/// \param __B
41
///    A 128-bit vector of [16 x char].
42
/// \returns
43
///    A 128-bit vector of [4 x int].
44
///
45
/// \code{.operation}
46
/// FOR j := 0 to 3
47
///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
48
///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
49
///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
50
///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
51
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
52
/// ENDFOR
53
/// dst[MAX:128] := 0
54
/// \endcode
55
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B) {
  /* The builtin is called with all three operands viewed as __v4si. */
  __v4si __acc = (__v4si)__W;
  __v4si __src_a = (__v4si)__A;
  __v4si __src_b = (__v4si)__B;
  return (__m128i)__builtin_ia32_vpdpbssd128(__acc, __src_a, __src_b);
}
61
 
62
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
63
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
64
///    signed 16-bit results. Sum these 4 results with the corresponding
65
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
66
///
67
/// \headerfile <x86intrin.h>
68
///
69
/// \code
70
/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
71
/// \endcode
72
///
73
/// This intrinsic corresponds to the \c VPDPBSSD instruction.
74
///
75
/// \param __A
76
///    A 256-bit vector of [32 x char].
77
/// \param __B
78
///    A 256-bit vector of [32 x char].
79
/// \returns
80
///    A 256-bit vector of [8 x int].
81
///
82
/// \code{.operation}
83
/// FOR j := 0 to 7
84
///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
85
///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
86
///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
87
///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
88
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
89
/// ENDFOR
90
/// dst[MAX:256] := 0
91
/// \endcode
92
static __inline__ __m256i __DEFAULT_FN_ATTRS256
93
_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
94
  return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
95
                                             (__v8si)__B);
96
}
97
 
98
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
99
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
100
///    signed 16-bit results. Sum these 4 results with the corresponding
101
///    32-bit integer in \a __W with signed saturation, and store the packed
102
///    32-bit results in \a dst.
103
///
104
/// \headerfile <x86intrin.h>
105
///
106
/// \code
107
/// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
108
/// \endcode
109
///
110
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
111
///
112
/// \param __A
113
///    A 128-bit vector of [16 x char].
114
/// \param __B
115
///    A 128-bit vector of [16 x char].
116
/// \returns
117
///    A 128-bit vector of [4 x int].
118
///
119
/// \code{.operation}
120
/// FOR j := 0 to 3
121
///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
122
///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
123
///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
124
///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
125
///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
126
/// ENDFOR
127
/// dst[MAX:128] := 0
128
/// \endcode
129
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B) {
  /* The builtin is called with all three operands viewed as __v4si. */
  __v4si __acc = (__v4si)__W;
  __v4si __src_a = (__v4si)__A;
  __v4si __src_b = (__v4si)__B;
  return (__m128i)__builtin_ia32_vpdpbssds128(__acc, __src_a, __src_b);
}
135
 
136
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
137
///    corresponding signed 8-bit integers in \a __B, producing 4 intermediate
138
///    signed 16-bit results. Sum these 4 results with the corresponding
139
///    32-bit integer in \a __W with signed saturation, and store the packed
140
///    32-bit results in \a dst.
141
///
142
/// \headerfile <x86intrin.h>
143
///
144
/// \code
145
/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
146
/// \endcode
147
///
148
/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
149
///
150
/// \param __A
151
///    A 256-bit vector of [32 x char].
152
/// \param __B
153
///    A 256-bit vector of [32 x char].
154
/// \returns
155
///    A 256-bit vector of [8 x int].
156
///
157
/// \code{.operation}
158
/// FOR j := 0 to 7
159
///     tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
160
///     tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
161
///     tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
162
///     tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
163
///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
164
/// ENDFOR
165
/// dst[MAX:256] := 0
166
/// \endcode
167
static __inline__ __m256i __DEFAULT_FN_ATTRS256
168
_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
169
  return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
170
                                              (__v8si)__B);
171
}
172
 
173
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
174
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
175
///    signed 16-bit results. Sum these 4 results with the corresponding
176
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
177
///
178
/// \headerfile <x86intrin.h>
179
///
180
/// \code
181
/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
182
/// \endcode
183
///
184
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
185
///
186
/// \param __A
187
///    A 128-bit vector of [16 x char].
188
/// \param __B
189
///    A 128-bit vector of [16 x unsigned char].
190
/// \returns
191
///    A 128-bit vector of [4 x int].
192
///
193
/// \code{.operation}
194
/// FOR j := 0 to 3
195
///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
196
///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
197
///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
198
///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
199
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
200
/// ENDFOR
201
/// dst[MAX:128] := 0
202
/// \endcode
203
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B) {
  /* The builtin is called with all three operands viewed as __v4si. */
  __v4si __acc = (__v4si)__W;
  __v4si __src_a = (__v4si)__A;
  __v4si __src_b = (__v4si)__B;
  return (__m128i)__builtin_ia32_vpdpbsud128(__acc, __src_a, __src_b);
}
209
 
210
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
211
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
212
///    signed 16-bit results. Sum these 4 results with the corresponding
213
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
214
///
215
/// \headerfile <x86intrin.h>
216
///
217
/// \code
218
/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
219
/// \endcode
220
///
221
/// This intrinsic corresponds to the \c VPDPBSUD instruction.
222
///
223
/// \param __A
224
///    A 256-bit vector of [32 x char].
225
/// \param __B
226
///    A 256-bit vector of [32 x unsigned char].
227
/// \returns
228
///    A 256-bit vector of [8 x int].
229
///
230
/// \code{.operation}
231
/// FOR j := 0 to 7
232
///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
233
///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
234
///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
235
///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
236
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
237
/// ENDFOR
238
/// dst[MAX:256] := 0
239
/// \endcode
240
static __inline__ __m256i __DEFAULT_FN_ATTRS256
241
_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
242
  return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
243
                                             (__v8si)__B);
244
}
245
 
246
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
247
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
248
///    signed 16-bit results. Sum these 4 results with the corresponding
249
///    32-bit integer in \a __W with signed saturation, and store the packed
250
///    32-bit results in \a dst.
251
///
252
/// \headerfile <x86intrin.h>
253
///
254
/// \code
255
/// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
256
/// \endcode
257
///
258
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
259
///
260
/// \param __A
261
///    A 128-bit vector of [16 x char].
262
/// \param __B
263
///    A 128-bit vector of [16 x unsigned char].
264
/// \returns
265
///    A 128-bit vector of [4 x int].
266
///
267
/// \code{.operation}
268
/// FOR j := 0 to 3
269
///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
270
///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
271
///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
272
///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
273
///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
274
/// ENDFOR
275
/// dst[MAX:128] := 0
276
/// \endcode
277
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B) {
  /* The builtin is called with all three operands viewed as __v4si. */
  __v4si __acc = (__v4si)__W;
  __v4si __src_a = (__v4si)__A;
  __v4si __src_b = (__v4si)__B;
  return (__m128i)__builtin_ia32_vpdpbsuds128(__acc, __src_a, __src_b);
}
283
 
284
/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
285
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
286
///    signed 16-bit results. Sum these 4 results with the corresponding
287
///    32-bit integer in \a __W with signed saturation, and store the packed
288
///    32-bit results in \a dst.
289
///
290
/// \headerfile <x86intrin.h>
291
///
292
/// \code
293
/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
294
/// \endcode
295
///
296
/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
297
///
298
/// \param __A
299
///    A 256-bit vector of [32 x char].
300
/// \param __B
301
///    A 256-bit vector of [32 x unsigned char].
302
/// \returns
303
///    A 256-bit vector of [8 x int].
304
///
305
/// \code{.operation}
306
/// FOR j := 0 to 7
307
///     tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
308
///     tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
309
///     tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
310
///     tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
311
///     dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
312
/// ENDFOR
313
/// dst[MAX:256] := 0
314
/// \endcode
315
static __inline__ __m256i __DEFAULT_FN_ATTRS256
316
_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
317
  return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
318
                                              (__v8si)__B);
319
}
320
 
321
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
322
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
323
///    unsigned 16-bit results. Sum these 4 results with the corresponding
324
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
325
///
326
/// \headerfile <x86intrin.h>
327
///
328
/// \code
329
/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
330
/// \endcode
331
///
332
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
333
///
334
/// \param __A
335
///    A 128-bit vector of [16 x unsigned char].
336
/// \param __B
337
///    A 128-bit vector of [16 x unsigned char].
338
/// \returns
339
///    A 128-bit vector of [4 x int].
340
///
341
/// \code{.operation}
342
/// FOR j := 0 to 3
343
///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
344
///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
345
///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
346
///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
347
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
348
/// ENDFOR
349
/// dst[MAX:128] := 0
350
/// \endcode
351
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B) {
  /* The builtin is called with all three operands viewed as __v4si. */
  __v4si __acc = (__v4si)__W;
  __v4si __src_a = (__v4si)__A;
  __v4si __src_b = (__v4si)__B;
  return (__m128i)__builtin_ia32_vpdpbuud128(__acc, __src_a, __src_b);
}
357
 
358
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
359
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
360
///    unsigned 16-bit results. Sum these 4 results with the corresponding
361
///    32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
362
///
363
/// \headerfile <x86intrin.h>
364
///
365
/// \code
366
/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
367
/// \endcode
368
///
369
/// This intrinsic corresponds to the \c VPDPBUUD instruction.
370
///
371
/// \param __A
372
///    A 256-bit vector of [32 x unsigned char].
373
/// \param __B
374
///    A 256-bit vector of [32 x unsigned char].
375
/// \returns
376
///    A 256-bit vector of [8 x int].
377
///
378
/// \code{.operation}
379
/// FOR j := 0 to 7
380
///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
381
///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
382
///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
383
///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
384
///     dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
385
/// ENDFOR
386
/// dst[MAX:256] := 0
387
/// \endcode
388
static __inline__ __m256i __DEFAULT_FN_ATTRS256
389
_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
390
  return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
391
                                             (__v8si)__B);
392
}
393
 
394
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
395
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
396
///    unsigned 16-bit results. Sum these 4 results with the corresponding
397
///    32-bit integer in \a __W with unsigned saturation, and store the packed
398
///    32-bit results in \a dst.
399
///
400
/// \headerfile <x86intrin.h>
401
///
402
/// \code
403
/// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
404
/// \endcode
405
///
406
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
407
///
408
/// \param __A
409
///    A 128-bit vector of [16 x unsigned char].
410
/// \param __B
411
///    A 128-bit vector of [16 x unsigned char].
412
/// \returns
413
///    A 128-bit vector of [4 x int].
414
///
415
/// \code{.operation}
416
/// FOR j := 0 to 3
417
///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
418
///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
419
///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
420
///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
421
///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
422
/// ENDFOR
423
/// dst[MAX:128] := 0
424
/// \endcode
425
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B) {
  /* The builtin is called with all three operands viewed as __v4si. */
  __v4si __acc = (__v4si)__W;
  __v4si __src_a = (__v4si)__A;
  __v4si __src_b = (__v4si)__B;
  return (__m128i)__builtin_ia32_vpdpbuuds128(__acc, __src_a, __src_b);
}
431
 
432
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
433
///    corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
434
///    unsigned 16-bit results. Sum these 4 results with the corresponding
435
///    32-bit integer in \a __W with unsigned saturation, and store the packed
436
///    32-bit results in \a dst.
437
///
438
/// \headerfile <x86intrin.h>
439
///
440
/// \code
441
/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
442
/// \endcode
443
///
444
/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
445
///
446
/// \param __A
447
///    A 256-bit vector of [32 x unsigned char].
448
/// \param __B
449
///    A 256-bit vector of [32 x unsigned char].
450
/// \returns
451
///    A 256-bit vector of [8 x int].
452
///
453
/// \code{.operation}
454
/// FOR j := 0 to 7
455
///     tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
456
///     tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
457
///     tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
458
///     tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
459
///     dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
460
/// ENDFOR
461
/// dst[MAX:256] := 0
462
/// \endcode
463
static __inline__ __m256i __DEFAULT_FN_ATTRS256
464
_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
465
  return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
466
                                              (__v8si)__B);
467
}
468
#undef __DEFAULT_FN_ATTRS128
469
#undef __DEFAULT_FN_ATTRS256
470
 
471
#endif // __AVXVNNIINT8INTRIN_H