Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
14 pmbaty 1
/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------===
2
 *
3
 *
4
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5
 * See https://llvm.org/LICENSE.txt for license information.
6
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7
 *
8
 *===-----------------------------------------------------------------------===
9
 */
10
/* Refuse to compile unless __IMMINTRIN_H is already defined, i.e. (per the
 * diagnostic below) this header was reached through <immintrin.h>. */
#if !defined(__IMMINTRIN_H)
#error "Never use <avx512vlvnniintrin.h> directly; include <immintrin.h> instead."
#endif
13
 
14
#ifndef __AVX512VLVNNIINTRIN_H
15
#define __AVX512VLVNNIINTRIN_H
16
 
17
/* Attribute sets applied to the inline functions defined in this file. Both
 * request always-inline, no-debug code for the "avx512vl,avx512vnni" target;
 * they differ only in the minimum vector width (128 vs. 256). */
#define __DEFAULT_FN_ATTRS128 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512vnni"), \
                 __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 \
  __attribute__((__always_inline__, __nodebug__, \
                 __target__("avx512vl,avx512vnni"), \
                 __min_vector_width__(256)))
20
 
21
/// For each of the eight 32-bit lanes, multiplies four unsigned 8-bit
/// integers from \a A by the four corresponding signed 8-bit integers from
/// \a B, forming four signed 16-bit intermediates, then adds those
/// intermediates to the matching 32-bit integer of \a S. The packed 32-bit
/// sums are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S
///    Vector of 32-bit accumulators, one per result lane.
/// \param A
///    Vector whose bytes are zero-extended (treated as unsigned).
/// \param B
///    Vector whose bytes are sign-extended (treated as signed).
/// \returns The packed 32-bit results, as an __m256i.
#define _mm256_dpbusd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), \
                                       (__v8si)(A), \
                                       (__v8si)(B)))
40
 
41
/// For each of the eight 32-bit lanes, multiplies four unsigned 8-bit
/// integers from \a A by the four corresponding signed 8-bit integers from
/// \a B, forming four signed 16-bit intermediates, then adds those
/// intermediates to the matching 32-bit integer of \a S with signed
/// saturation. The packed 32-bit results are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S
///    Vector of 32-bit accumulators, one per result lane.
/// \param A
///    Vector whose bytes are zero-extended (treated as unsigned).
/// \param B
///    Vector whose bytes are sign-extended (treated as signed).
/// \returns The packed, saturated 32-bit results, as an __m256i.
#define _mm256_dpbusds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), \
                                        (__v8si)(A), \
                                        (__v8si)(B)))
60
 
61
/// For each of the eight 32-bit lanes, multiplies two signed 16-bit integers
/// from \a A by the two corresponding signed 16-bit integers from \a B,
/// forming two signed 32-bit intermediates, then adds those intermediates to
/// the matching 32-bit integer of \a S. The packed 32-bit sums are stored in
/// DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S
///    Vector of 32-bit accumulators, one per result lane.
/// \param A
///    Vector of 16-bit words, each sign-extended before multiplying.
/// \param B
///    Vector of 16-bit words, each sign-extended before multiplying.
/// \returns The packed 32-bit results, as an __m256i.
#define _mm256_dpwssd_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), \
                                       (__v8si)(A), \
                                       (__v8si)(B)))
78
 
79
/// For each of the eight 32-bit lanes, multiplies two signed 16-bit integers
/// from \a A by the two corresponding signed 16-bit integers from \a B,
/// forming two signed 32-bit intermediates, then adds those intermediates to
/// the matching 32-bit integer of \a S with signed saturation. The packed
/// 32-bit results are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 7
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
///    ENDFOR
///    DST[MAX:256] := 0
/// \endcode
///
/// \param S
///    Vector of 32-bit accumulators, one per result lane.
/// \param A
///    Vector of 16-bit words, each sign-extended before multiplying.
/// \param B
///    Vector of 16-bit words, each sign-extended before multiplying.
/// \returns The packed, saturated 32-bit results, as an __m256i.
#define _mm256_dpwssds_epi32(S, A, B) \
  ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), \
                                        (__v8si)(A), \
                                        (__v8si)(B)))
96
 
97
/// For each of the four 32-bit lanes, multiplies four unsigned 8-bit integers
/// from \a A by the four corresponding signed 8-bit integers from \a B,
/// forming four signed 16-bit intermediates, then adds those intermediates to
/// the matching 32-bit integer of \a S. The packed 32-bit sums are stored in
/// DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S
///    Vector of 32-bit accumulators, one per result lane.
/// \param A
///    Vector whose bytes are zero-extended (treated as unsigned).
/// \param B
///    Vector whose bytes are sign-extended (treated as signed).
/// \returns The packed 32-bit results, as an __m128i.
#define _mm_dpbusd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), \
                                       (__v4si)(A), \
                                       (__v4si)(B)))
116
 
117
/// For each of the four 32-bit lanes, multiplies four unsigned 8-bit integers
/// from \a A by the four corresponding signed 8-bit integers from \a B,
/// forming four signed 16-bit intermediates, then adds those intermediates to
/// the matching 32-bit integer of \a S with signed saturation. The packed
/// 32-bit results are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
///      tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
///      tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
///      tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S
///    Vector of 32-bit accumulators, one per result lane.
/// \param A
///    Vector whose bytes are zero-extended (treated as unsigned).
/// \param B
///    Vector whose bytes are sign-extended (treated as signed).
/// \returns The packed, saturated 32-bit results, as an __m128i.
#define _mm_dpbusds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), \
                                        (__v4si)(A), \
                                        (__v4si)(B)))
136
 
137
/// For each of the four 32-bit lanes, multiplies two signed 16-bit integers
/// from \a A by the two corresponding signed 16-bit integers from \a B,
/// forming two signed 32-bit intermediates, then adds those intermediates to
/// the matching 32-bit integer of \a S. The packed 32-bit sums are stored in
/// DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := S.dword[j] + tmp1 + tmp2
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S
///    Vector of 32-bit accumulators, one per result lane.
/// \param A
///    Vector of 16-bit words, each sign-extended before multiplying.
/// \param B
///    Vector of 16-bit words, each sign-extended before multiplying.
/// \returns The packed 32-bit results, as an __m128i.
#define _mm_dpwssd_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), \
                                       (__v4si)(A), \
                                       (__v4si)(B)))
154
 
155
/// For each of the four 32-bit lanes, multiplies two signed 16-bit integers
/// from \a A by the two corresponding signed 16-bit integers from \a B,
/// forming two signed 32-bit intermediates, then adds those intermediates to
/// the matching 32-bit integer of \a S with signed saturation. The packed
/// 32-bit results are stored in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \code{.operation}
///    FOR j := 0 to 3
///      tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
///      tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
///      DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
///    ENDFOR
///    DST[MAX:128] := 0
/// \endcode
///
/// \param S
///    Vector of 32-bit accumulators, one per result lane.
/// \param A
///    Vector of 16-bit words, each sign-extended before multiplying.
/// \param B
///    Vector of 16-bit words, each sign-extended before multiplying.
/// \returns The packed, saturated 32-bit results, as an __m128i.
#define _mm_dpwssds_epi32(S, A, B) \
  ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), \
                                        (__v4si)(A), \
                                        (__v4si)(B)))
172
 
173
static __inline__ __m256i __DEFAULT_FN_ATTRS256
174
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
175
{
176
  return (__m256i)__builtin_ia32_selectd_256(__U,
177
                                     (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
178
                                     (__v8si)__S);
179
}
180
 
181
static __inline__ __m256i __DEFAULT_FN_ATTRS256
182
_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
183
{
184
  return (__m256i)__builtin_ia32_selectd_256(__U,
185
                                     (__v8si)_mm256_dpbusd_epi32(__S, __A, __B),
186
                                     (__v8si)_mm256_setzero_si256());
187
}
188
 
189
static __inline__ __m256i __DEFAULT_FN_ATTRS256
190
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
191
{
192
  return (__m256i)__builtin_ia32_selectd_256(__U,
193
                                    (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
194
                                    (__v8si)__S);
195
}
196
 
197
static __inline__ __m256i __DEFAULT_FN_ATTRS256
198
_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
199
{
200
  return (__m256i)__builtin_ia32_selectd_256(__U,
201
                                     (__v8si)_mm256_dpbusds_epi32(__S, __A, __B),
202
                                     (__v8si)_mm256_setzero_si256());
203
}
204
 
205
static __inline__ __m256i __DEFAULT_FN_ATTRS256
206
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
207
{
208
  return (__m256i)__builtin_ia32_selectd_256(__U,
209
                                     (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
210
                                     (__v8si)__S);
211
}
212
 
213
static __inline__ __m256i __DEFAULT_FN_ATTRS256
214
_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
215
{
216
  return (__m256i)__builtin_ia32_selectd_256(__U,
217
                                     (__v8si)_mm256_dpwssd_epi32(__S, __A, __B),
218
                                     (__v8si)_mm256_setzero_si256());
219
}
220
 
221
static __inline__ __m256i __DEFAULT_FN_ATTRS256
222
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
223
{
224
  return (__m256i)__builtin_ia32_selectd_256(__U,
225
                                    (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
226
                                    (__v8si)__S);
227
}
228
 
229
static __inline__ __m256i __DEFAULT_FN_ATTRS256
230
_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
231
{
232
  return (__m256i)__builtin_ia32_selectd_256(__U,
233
                                    (__v8si)_mm256_dpwssds_epi32(__S, __A, __B),
234
                                    (__v8si)_mm256_setzero_si256());
235
}
236
 
237
static __inline__ __m128i __DEFAULT_FN_ATTRS128
238
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
239
{
240
  return (__m128i)__builtin_ia32_selectd_128(__U,
241
                                        (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
242
                                        (__v4si)__S);
243
}
244
 
245
static __inline__ __m128i __DEFAULT_FN_ATTRS128
246
_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
247
{
248
  return (__m128i)__builtin_ia32_selectd_128(__U,
249
                                        (__v4si)_mm_dpbusd_epi32(__S, __A, __B),
250
                                        (__v4si)_mm_setzero_si128());
251
}
252
 
253
static __inline__ __m128i __DEFAULT_FN_ATTRS128
254
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
255
{
256
  return (__m128i)__builtin_ia32_selectd_128(__U,
257
                                       (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
258
                                       (__v4si)__S);
259
}
260
 
261
static __inline__ __m128i __DEFAULT_FN_ATTRS128
262
_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
263
{
264
  return (__m128i)__builtin_ia32_selectd_128(__U,
265
                                       (__v4si)_mm_dpbusds_epi32(__S, __A, __B),
266
                                       (__v4si)_mm_setzero_si128());
267
}
268
 
269
static __inline__ __m128i __DEFAULT_FN_ATTRS128
270
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
271
{
272
  return (__m128i)__builtin_ia32_selectd_128(__U,
273
                                        (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
274
                                        (__v4si)__S);
275
}
276
 
277
static __inline__ __m128i __DEFAULT_FN_ATTRS128
278
_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
279
{
280
  return (__m128i)__builtin_ia32_selectd_128(__U,
281
                                        (__v4si)_mm_dpwssd_epi32(__S, __A, __B),
282
                                        (__v4si)_mm_setzero_si128());
283
}
284
 
285
static __inline__ __m128i __DEFAULT_FN_ATTRS128
286
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
287
{
288
  return (__m128i)__builtin_ia32_selectd_128(__U,
289
                                       (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
290
                                       (__v4si)__S);
291
}
292
 
293
static __inline__ __m128i __DEFAULT_FN_ATTRS128
294
_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
295
{
296
  return (__m128i)__builtin_ia32_selectd_128(__U,
297
                                       (__v4si)_mm_dpwssds_epi32(__S, __A, __B),
298
                                       (__v4si)_mm_setzero_si128());
299
}
300
 
301
/* The attribute helper macros are only needed by the definitions above;
 * remove them so they are not visible after this header. */
#undef __DEFAULT_FN_ATTRS256
#undef __DEFAULT_FN_ATTRS128
303
 
304
#endif