Subversion Repositories QNX 8.QNX8 LLVM/Clang compiler suite

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
14 pmbaty 1
/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
 *
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */
/* Reject direct inclusion: this header only proceeds when __IMMINTRIN_H is
 * already defined, i.e. when it is pulled in through <immintrin.h>. */
#ifndef __IMMINTRIN_H
#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
#endif

/* Include guard; the matching #endif is at the end of the file. */
#ifndef __AVXVNNIINTRIN_H
#define __AVXVNNIINTRIN_H

/* The intrinsics below, defined in avx512vlvnniintrin.h, can also be used
 * for AVXVNNI. */
/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)

/* Intrinsics with the _avx_ infix exist for compatibility with MSVC. */
/* Define the default attributes for the functions in this file.
 *
 * Both macros request always-inline, no debug info, and the "avxvnni" target
 * feature; they differ only in the __min_vector_width__ argument (256 for the
 * __m256i intrinsics, 128 for the __m128i intrinsics). Both are #undef'd at
 * the end of the header. */
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))

/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
47
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
48
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
49
/// in \a __S, and store the packed 32-bit results in DST.
50
///
51
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
52
///
53
/// \code{.operation}
54
///    FOR j := 0 to 7
55
///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
56
///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
57
///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
58
///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
59
///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
60
///    ENDFOR
61
///    DST[MAX:256] := 0
62
/// \endcode
63
static __inline__ __m256i __DEFAULT_FN_ATTRS256
64
_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
65
{
66
  return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
67
}
68
 
69
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
70
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
71
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
72
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
73
///
74
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
75
///
76
/// \code{.operation}
77
///    FOR j := 0 to 7
78
///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
79
///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
80
///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
81
///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
82
///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
83
///    ENDFOR
84
///    DST[MAX:256] := 0
85
/// \endcode
86
static __inline__ __m256i __DEFAULT_FN_ATTRS256
87
_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
88
{
89
  return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
90
}
91
 
92
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
93
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
94
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
95
///  and store the packed 32-bit results in DST.
96
///
97
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
98
///
99
/// \code{.operation}
100
///    FOR j := 0 to 7
101
///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
102
///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
103
///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
104
///    ENDFOR
105
///    DST[MAX:256] := 0
106
/// \endcode
107
static __inline__ __m256i __DEFAULT_FN_ATTRS256
108
_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
109
{
110
  return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
111
}
112
 
113
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
114
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
115
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
116
/// using signed saturation, and store the packed 32-bit results in DST.
117
///
118
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
119
///
120
/// \code{.operation}
121
///    FOR j := 0 to 7
122
///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
123
///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
124
///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
125
///    ENDFOR
126
///    DST[MAX:256] := 0
127
/// \endcode
128
static __inline__ __m256i __DEFAULT_FN_ATTRS256
129
_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
130
{
131
  return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
132
}
133
 
134
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
135
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
136
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
137
/// in \a __S, and store the packed 32-bit results in DST.
138
///
139
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
140
///
141
/// \code{.operation}
142
///    FOR j := 0 to 3
143
///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
144
///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
145
///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
146
///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
147
///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
148
///    ENDFOR
149
///    DST[MAX:128] := 0
150
/// \endcode
151
static __inline__ __m128i __DEFAULT_FN_ATTRS128
152
_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
153
{
154
  return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
155
}
156
 
157
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
158
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
159
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
160
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
161
///
162
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
163
///
164
/// \code{.operation}
165
///    FOR j := 0 to 3
166
///      tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
167
///      tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
168
///      tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
169
///      tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
170
///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
171
///    ENDFOR
172
///    DST[MAX:128] := 0
173
/// \endcode
174
static __inline__ __m128i __DEFAULT_FN_ATTRS128
175
_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
176
{
177
  return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
178
}
179
 
180
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
181
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
182
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
183
/// and store the packed 32-bit results in DST.
184
///
185
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
186
///
187
/// \code{.operation}
188
///    FOR j := 0 to 3
189
///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
190
///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
191
///      DST.dword[j] := __S.dword[j] + tmp1 + tmp2
192
///    ENDFOR
193
///    DST[MAX:128] := 0
194
/// \endcode
195
static __inline__ __m128i __DEFAULT_FN_ATTRS128
196
_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
197
{
198
  return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
199
}
200
 
201
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
202
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
203
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
204
/// using signed saturation, and store the packed 32-bit results in DST.
205
///
206
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
207
///
208
/// \code{.operation}
209
///    FOR j := 0 to 3
210
///      tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
211
///      tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
212
///      DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
213
///    ENDFOR
214
///    DST[MAX:128] := 0
215
/// \endcode
216
static __inline__ __m128i __DEFAULT_FN_ATTRS128
217
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
218
{
219
  return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
220
}
221
 
222
/* The attribute helper macros are only meant for the definitions in this
 * header; remove them so they do not remain defined for the includer. */
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256

#endif // __AVXVNNIINTRIN_H