Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
14 | pmbaty | 1 | /*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------=== |
2 | * |
||
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
4 | * See https://llvm.org/LICENSE.txt for license information. |
||
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
6 | * |
||
7 | *===-----------------------------------------------------------------------=== |
||
8 | */ |
||
9 | #ifndef __IMMINTRIN_H |
||
10 | #error \ |
||
11 | "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead." |
||
12 | #endif |
||
13 | |||
14 | #ifdef __SSE2__ |
||
15 | |||
16 | #ifndef __AVX512VLFP16INTRIN_H |
||
17 | #define __AVX512VLFP16INTRIN_H |
||
18 | |||
/* Define the default attributes for the functions in this file.
   Both AVX512-FP16 and AVX512-VL must be enabled for the 128/256-bit
   forms of these intrinsics.  NOTE: the target feature string must be
   comma-separated with NO spaces — clang does not trim whitespace in
   the feature list, so "avx512fp16, avx512vl" would make " avx512vl"
   an unrecognized feature.  (The scraped copy had a stray space.) */
#define __DEFAULT_FN_ATTRS256                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,avx512vl"),                            \
                 __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128                                                  \
  __attribute__((__always_inline__, __nodebug__,                               \
                 __target__("avx512fp16,avx512vl"),                            \
                 __min_vector_width__(128)))
||
28 | |||
/* Extract the lowest (lane 0) half-precision element from a vector. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) {
  return __a[0];
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) {
  return __a[0];
}
||
36 | |||
/* Scalar set: __h in lane 0, all upper lanes zeroed. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) {
  return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0};
}

/* Broadcast one half-precision value into every lane. */
static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) {
  return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h};
}

static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) {
  return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h,
                            __h, __h, __h, __h, __h, __h, __h, __h};
}

/* set_ph takes its arguments high-lane-first, so the initializer lists
   them in reverse (lane 0 receives the LAST argument). */
static __inline __m128h __DEFAULT_FN_ATTRS128
_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
           _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) {
  return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1};
}

/* Broadcast a half-precision complex pair by bit-casting the 32-bit
   (real, imag) pair to a float and splatting it. */
static __inline __m256h __DEFAULT_FN_ATTRS256
_mm256_set1_pch(_Float16 _Complex h) {
  return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h));
}

static __inline __m128h __DEFAULT_FN_ATTRS128
_mm_set1_pch(_Float16 _Complex h) {
  return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h));
}

static __inline __m256h __DEFAULT_FN_ATTRS256
_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4,
              _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8,
              _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12,
              _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) {
  return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11,
                            __h10, __h9,  __h8,  __h7,  __h6,  __h5,
                            __h4,  __h3,  __h2,  __h1};
}
||
75 | |||
/* setr ("reversed") variants: arguments are given low-lane-first, so
   they simply forward to set_ph with the argument order flipped. */
#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8)                            \
  _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1))

#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \
                       h14, h15, h16)                                          \
  _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8),  \
                (h7), (h6), (h5), (h4), (h3), (h2), (h1))
||
83 | |||
/* Element-wise half-precision addition.  The mask_ variant takes
   unselected lanes from __W; the maskz_ variant zeroes them. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)((__v16hf)__A + (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

/* 128-bit (8-lane) forms. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)((__v8hf)__A + (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
||
120 | |||
/* Element-wise half-precision subtraction (__A - __B), with merge-mask
   (mask_) and zero-mask (maskz_) variants. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)((__v16hf)__A - (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)((__v8hf)__A - (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
||
157 | |||
/* Element-wise half-precision multiplication, with merge-mask (mask_)
   and zero-mask (maskz_) variants. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)((__v16hf)__A * (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)((__v8hf)__A * (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
||
194 | |||
/* Element-wise half-precision division (__A / __B), with merge-mask
   (mask_) and zero-mask (maskz_) variants. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)((__v16hf)__A / (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)((__v8hf)__A / (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
                                              (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B),
                                              (__v8hf)_mm_setzero_ph());
}
||
231 | |||
/* Element-wise half-precision minimum (vminph), with merge-mask and
   zero-mask variants. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)_mm_setzero_ph());
}
||
274 | |||
/* Element-wise half-precision maximum (vmaxph), with merge-mask and
   zero-mask variants. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A,
                                                              __m256h __B) {
  return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A,
                                                           __m128h __B) {
  return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A,
                                                                __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U,
                                                                 __m128h __A,
                                                                 __m128h __B) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B),
      (__v8hf)_mm_setzero_ph());
}
||
317 | |||
/* Absolute value: clear the sign bit of each 16-bit half by ANDing every
   32-bit lane with 0x7FFF7FFF (two halves per 32-bit element). */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) {
  return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) {
  return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A);
}
||
325 | |||
/* Complex conjugate of packed half-precision complex pairs: XOR with
   -0.0f flips bit 31 of each 32-bit pair, i.e. the sign of the
   imaginary (high) half.  Masking is per 32-bit complex pair, hence the
   selectps/__v8sf forms. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) {
  return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f));
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectps_256(
      (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) {
  return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f));
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A) {
  return (__m128h)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_selectps_128(
      (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps());
}
||
358 | |||
/* Compare packed halves under predicate p (an immediate _CMP_* code),
   producing a bitmask; the mask_ forms AND the result with mask m. */
#define _mm256_cmp_ph_mask(a, b, p)                                            \
  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1))

#define _mm256_mask_cmp_ph_mask(m, a, b, p)                                    \
  ((__mmask16)__builtin_ia32_cmpph256_mask(                                    \
      (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m)))

#define _mm_cmp_ph_mask(a, b, p)                                               \
  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1))

#define _mm_mask_cmp_ph_mask(m, a, b, p)                                       \
  ((__mmask8)__builtin_ia32_cmpph128_mask(                                     \
      (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m)))
||
374 | |||
/* Approximate reciprocal (vrcpph).  The unmasked form passes an
   undefined passthrough vector with an all-ones mask. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W,
                                               (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rcpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W,
                                                                __mmask8 __U,
                                                                __m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W,
                                               (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U,
                                                                 __m128h __A) {
  return (__m128h)__builtin_ia32_rcpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
||
409 | |||
/* Approximate reciprocal square root (vrsqrtph), with merge-mask and
   zero-mask variants. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W,
                                                 (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_rsqrtph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask(
      (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W,
                                                 (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_rsqrtph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
||
444 | |||
/* Extract the exponent of each half as a half-precision value
   (vgetexpph), with merge-mask and zero-mask variants. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W,
                                                  (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) {
  return (__m128h)__builtin_ia32_getexpph128_mask(
      (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W,
                                                  (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_getexpph256_mask(
      (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}
||
478 | |||
/* Extract the mantissa (vgetmantph).  The two immediates are packed
   into one imm8 as ((C) << 2) | (B) before reaching the builtin. */
#define _mm_getmant_ph(A, B, C)                                                \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)-1))

#define _mm_mask_getmant_ph(W, U, A, B, C)                                     \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W),     \
      (__mmask8)(U)))

#define _mm_maskz_getmant_ph(U, A, B, C)                                       \
  ((__m128h)__builtin_ia32_getmantph128_mask(                                  \
      (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \
      (__mmask8)(U)))

#define _mm256_getmant_ph(A, B, C)                                             \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)-1))

#define _mm256_mask_getmant_ph(W, U, A, B, C)                                  \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W),   \
      (__mmask16)(U)))

#define _mm256_maskz_getmant_ph(U, A, B, C)                                    \
  ((__m256h)__builtin_ia32_getmantph256_mask(                                  \
      (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)),                          \
      (__v16hf)_mm256_setzero_ph(), (__mmask16)(U)))
||
508 | |||
/* vscalefph: scale __A by 2^floor(__B) per element, with merge-mask and
   zero-mask variants.  NOTE(review): the exact rounding of the exponent
   operand follows the ISA definition — see the instruction reference. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B,
                                                  (__v8hf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_scalefph128_mask(
      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A,
                                                                 __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask(
      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B,
                                                  (__v16hf)__W, (__mmask16)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_scalefph256_mask(
      (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U);
}
||
544 | |||
/* vrndscaleph: round each half to a given number of fraction bits; imm
   encodes the rounding control and scale. */
#define _mm_roundscale_ph(A, imm)                                              \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)-1))

#define _mm_mask_roundscale_ph(W, U, A, imm)                                   \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_roundscale_ph(U, A, imm)                                     \
  ((__m128h)__builtin_ia32_rndscaleph_128_mask(                                \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(),              \
      (__mmask8)(U)))

#define _mm256_roundscale_ph(A, imm)                                           \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)-1))

#define _mm256_mask_roundscale_ph(W, U, A, imm)                                \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W),                \
      (__mmask16)(U)))

#define _mm256_maskz_roundscale_ph(U, A, imm)                                  \
  ((__m256h)__builtin_ia32_rndscaleph_256_mask(                                \
      (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(),         \
      (__mmask16)(U)))
||
573 | |||
/* vreduceph: per-element reduction transform controlled by imm. */
#define _mm_reduce_ph(A, imm)                                                  \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
                                            (__v8hf)_mm_setzero_ph(),          \
                                            (__mmask8)-1))

#define _mm_mask_reduce_ph(W, U, A, imm)                                       \
  ((__m128h)__builtin_ia32_reduceph128_mask(                                   \
      (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U)))

#define _mm_maskz_reduce_ph(U, A, imm)                                         \
  ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm),  \
                                            (__v8hf)_mm_setzero_ph(),          \
                                            (__mmask8)(U)))

#define _mm256_reduce_ph(A, imm)                                               \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)_mm256_setzero_ph(),      \
                                            (__mmask16)-1))

#define _mm256_mask_reduce_ph(W, U, A, imm)                                    \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)(__m256h)(W),             \
                                            (__mmask16)(U)))

#define _mm256_maskz_reduce_ph(U, A, imm)                                      \
  ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \
                                            (__v16hf)_mm256_setzero_ph(),      \
                                            (__mmask16)(U)))
||
602 | |||
/* Element-wise half-precision square root (vsqrtph); masking is done
   with a select around the unmasked builtin. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) {
  return __builtin_ia32_sqrtph((__v8hf)__a);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W,
                                                                 __mmask8 __U,
                                                                 __m128h __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U,
                                                                  __m128h __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) {
  return (__m256h)__builtin_ia32_sqrtph256((__v16hf)__a);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_sqrt_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
||
636 | |||
/* vfpclassph: test each half against the class categories selected by
   imm, producing a bitmask. */
#define _mm_mask_fpclass_ph_mask(U, A, imm)                                    \
  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
                                              (int)(imm), (__mmask8)(U)))

#define _mm_fpclass_ph_mask(A, imm)                                            \
  ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A),            \
                                              (int)(imm), (__mmask8)-1))

#define _mm256_mask_fpclass_ph_mask(U, A, imm)                                 \
  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm), (__mmask16)(U)))

#define _mm256_fpclass_ph_mask(A, imm)                                         \
  ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A),          \
                                               (int)(imm), (__mmask16)-1))
||
652 | |||
/* Convert packed doubles to packed halves (vcvtpd2ph).  Both source
   widths produce a __m128h result (2 or 4 valid lanes). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
      (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph128_mask(
      (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
      (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) {
  return (__m128h)__builtin_ia32_vcvtpd2ph256_mask(
      (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
||
687 | |||
/* Convert the low packed halves to packed doubles (vcvtph2pd). */
static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
      (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m128d)__builtin_ia32_vcvtph2pd128_mask(
      (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
      (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256d __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) {
  return (__m256d)__builtin_ia32_vcvtph2pd256_mask(
      (__v8hf)__A, (__v4df)_mm256_setzero_pd(), (__mmask8)__U);
}
||
722 | |||
/* VCVTPH2W: convert packed FP16 to packed signed 16-bit integers using the
 * current rounding mode (MXCSR). Unmasked / merge-masked (_mask, passthrough
 * __W) / zero-masked (_maskz) variants for 128- and 256-bit vectors. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W,
                                                  (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W,
                                                  (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
}
||
757 | |||
/* VCVTTPH2W: convert packed FP16 to packed signed 16-bit integers with
 * truncation (round toward zero), regardless of MXCSR. Unmasked, merge-masked
 * (_mask) and zero-masked (_maskz) variants for 128- and 256-bit vectors. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2w128_mask(
      (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W,
                                                   (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2w256_mask(
      (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U);
}
||
792 | |||
/* Signed 16-bit integers -> FP16 (VCVTW2PH). The conversion itself is a
 * vector element-wise cast (__builtin_convertvector); masking is applied
 * afterwards with a select: _mask keeps __W where the __U bit is 0, _maskz
 * substitutes zero there. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) {
  return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_cvtepi16_ph(__m256i __A) {
  return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_cvtepi16_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
||
826 | |||
/* VCVTPH2UW: convert packed FP16 to packed unsigned 16-bit integers using the
 * current rounding mode. Unmasked / merge-masked (_mask, passthrough __W) /
 * zero-masked (_maskz) variants for 128- and 256-bit vectors. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W,
                                                   (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
}
||
861 | |||
/* VCVTTPH2UW: convert packed FP16 to packed unsigned 16-bit integers with
 * truncation (round toward zero). Unmasked, merge-masked (_mask) and
 * zero-masked (_maskz) variants for 128- and 256-bit vectors. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uw128_mask(
      (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu16(__m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W,
                                                    (__mmask16)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uw256_mask(
      (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U);
}
||
896 | |||
/* Unsigned 16-bit integers -> FP16 (VCVTUW2PH). Element-wise cast via
 * __builtin_convertvector, with masking applied afterwards by a select:
 * _mask keeps __W where the __U bit is 0, _maskz substitutes zero. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) {
  return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_cvtepu16_ph(__m256i __A) {
  return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U,
                                              (__v16hf)_mm256_cvtepu16_ph(__A),
                                              (__v16hf)_mm256_setzero_ph());
}
||
930 | |||
/* VCVTPH2DQ: convert the low FP16 lanes of __A to packed signed 32-bit
 * integers (4 lanes for 128-bit, 8 for 256-bit; note the 256-bit form still
 * takes a __m128h source). Unmasked / _mask (merge from __W) / _maskz (zero)
 * variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
}
||
965 | |||
/* VCVTPH2UDQ: convert the low FP16 lanes of __A to packed unsigned 32-bit
 * integers. Same masking scheme as the signed cvtph_epi32 family above. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
}
||
1000 | |||
/* VCVTDQ2PH: signed 32-bit integers -> FP16. The 128-bit form uses the
 * masked builtin directly; the 256-bit form converts via
 * __builtin_convertvector (8 x i32 -> 8 x FP16 in a __m128h) and applies
 * masking with a select afterwards. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
      (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtdq2ph128_mask(
      (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepi32_ph(__m256i __A) {
  return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph());
}
||
1034 | |||
/* VCVTUDQ2PH: unsigned 32-bit integers -> FP16. Mirrors the signed
 * cvtepi32_ph family: masked builtin for 128-bit, __builtin_convertvector
 * plus a select for the 256-bit source. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
      (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtudq2ph128_mask(
      (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepu32_ph(__m256i __A) {
  return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph());
}
||
1068 | |||
/* VCVTTPH2DQ: FP16 -> packed signed 32-bit integers with truncation
 * (round toward zero). Unmasked / _mask (merge from __W) / _maskz (zero)
 * variants; the 256-bit result form takes a __m128h source. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2dq128_mask(
      (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2dq256_mask(
      (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U);
}
||
1103 | |||
/* VCVTTPH2UDQ: FP16 -> packed unsigned 32-bit integers with truncation.
 * Same masking scheme as the signed cvttph_epi32 family above. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2udq128_mask(
      (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu32(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2udq256_mask(
      (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U);
}
||
1138 | |||
/* VCVTQQ2PH: signed 64-bit integers -> FP16. Results land in the low lanes
 * of a __m128h (2 lanes from a 128-bit source, 4 from a 256-bit source).
 * Unmasked / _mask (merge from __W) / _maskz (zero) variants. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
      (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph128_mask(
      (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepi64_ph(__m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
      (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtqq2ph256_mask(
      (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
||
1173 | |||
/* VCVTPH2QQ: low FP16 lanes of __A -> packed signed 64-bit integers
 * (2 lanes for 128-bit, 4 for 256-bit results). Unmasked / _mask (merge from
 * __W) / _maskz (zero) variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epi64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}
||
1208 | |||
/* VCVTUQQ2PH: unsigned 64-bit integers -> FP16 in the low lanes of a
 * __m128h. Same masking scheme as the signed cvtepi64_ph family above. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
      (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask(
      (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_cvtepu64_ph(__m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
      (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) {
  return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask(
      (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
||
1243 | |||
/* VCVTPH2UQQ: low FP16 lanes of __A -> packed unsigned 64-bit integers.
 * Same masking scheme as the signed cvtph_epi64 family above. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvtph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvtph_epu64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvtph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
}
||
1278 | |||
/* VCVTTPH2QQ: FP16 -> packed signed 64-bit integers with truncation
 * (round toward zero). Unmasked / _mask / _maskz variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2qq128_mask(
      (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epi64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2qq256_mask(
      (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U);
}
||
1313 | |||
/* VCVTTPH2UQQ: FP16 -> packed unsigned 64-bit integers with truncation.
 * Unmasked / _mask / _maskz variants. */
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m128i)__builtin_ia32_vcvttph2uqq128_mask(
      (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_cvttph_epu64(__m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W,
                                                     (__mmask8)__U);
}

static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) {
  return (__m256i)__builtin_ia32_vcvttph2uqq256_mask(
      (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U);
}
||
1348 | |||
/* VCVTPH2PSX: widen the low FP16 lanes of __A to packed single-precision
 * floats (4 lanes for 128-bit, 8 for 256-bit). Unmasked / _mask (merge from
 * __W) / _maskz (zero) variants. */
static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
      (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
  return (__m128)__builtin_ia32_vcvtph2psx128_mask(
      (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
      (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W,
                                                   (__mmask8)__U);
}

static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) {
  return (__m256)__builtin_ia32_vcvtph2psx256_mask(
      (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U);
}
||
1383 | |||
/* VCVTPS2PHX: narrow packed single-precision floats to FP16 in the low lanes
 * of a __m128h (4 lanes from a 128-bit source, 8 from a 256-bit source).
 * Unmasked / _mask (merge from __W) / _maskz (zero) variants. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
      (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W,
                                                                   __mmask8 __U,
                                                                   __m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx128_mask(
      (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
      (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W,
                                                    (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) {
  return (__m128h)__builtin_ia32_vcvtps2phx256_mask(
      (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
}
||
1418 | |||
/* Fused multiply-add on packed FP16, 128-bit forms: result = A*B + C.
   The fmsub/fnmadd/fnmsub variants are expressed through the same FMA
   builtin by negating the appropriate operand(s).
   Masking conventions:
     mask  -> lanes with a 0 mask bit keep the first multiplicand __A;
     mask3 -> lanes with a 0 mask bit keep the addend __C;
     maskz -> lanes with a 0 mask bit are zeroed. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A,
                                                             __m128h __B,
                                                             __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
                                          (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A,
                                                                  __mmask8 __U,
                                                                  __m128h __B,
                                                                  __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

/* fmsub: A*B - C (addend negated). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A,
                                                             __m128h __B,
                                                             __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B,
                                          -(__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A,
                                                                  __mmask8 __U,
                                                                  __m128h __B,
                                                                  __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

/* fnmadd: -(A*B) + C (first multiplicand negated). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

/* fnmsub: -(A*B) - C (both multiplicand and addend negated). */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
||
1498 | |||
/* Fused multiply-add on packed FP16, 256-bit forms: result = A*B + C.
   Same operand-negation and masking conventions as the 128-bit family:
   mask keeps __A, mask3 keeps __C, maskz zeroes masked-off lanes. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A,
                                                                __m256h __B,
                                                                __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
                                             (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

/* fmsub: A*B - C. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A,
                                                                __m256h __B,
                                                                __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B,
                                             -(__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

/* fnmadd: -(A*B) + C. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

/* fnmsub: -(A*B) - C. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
||
1576 | |||
/* Fused multiply with alternating add/subtract on packed FP16, 128-bit:
   fmaddsub subtracts C in even lanes and adds it in odd lanes;
   fmsubadd is the opposite, expressed here by negating C before the
   same vfmaddsubph builtin.  Masking: mask keeps __A, mask3 keeps __C,
   maskz zeroes masked-off lanes. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A,
                                                                __m128h __B,
                                                                __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
                                             (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A,
                                                                __m128h __B,
                                                                __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B,
                                             -(__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)_mm_setzero_ph());
}
||
1630 | |||
/* 256-bit alternating add/subtract FMA on packed FP16; same conventions
   as the 128-bit fmaddsub/fmsubadd family above (fmsubadd negates C). */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
                                                (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B,
                                                -(__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)_mm256_setzero_ph());
}
||
1682 | |||
/* mask3 variants of fmsub/fmsubadd: masked-off lanes keep the addend
   operand __C.  128-bit and 256-bit forms. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}
||
1714 | |||
/* fnmadd (-(A*B) + C) and fnmsub (-(A*B) - C) on packed FP16.  Note these
   forms negate __B rather than __A (mathematically equivalent for the
   product).  Masking: mask keeps __A, mask3 keeps __C. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
                                          (__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A,
                                                                 __m256h __B,
                                                                 __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
                                             (__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B,
                                          -(__v8hf)__C);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_selectph_128(
      (__mmask8)__U,
      __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C),
      (__v8hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A,
                                                                 __m256h __B,
                                                                 __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B,
                                             -(__v16hf)__C);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) {
  return (__m256h)__builtin_ia32_selectph_256(
      (__mmask16)__U,
      __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C),
      (__v16hf)__C);
}
||
1790 | |||
/* Complex FP16 multiply with conjugation (vfcmulcph builtin), 128-bit.
   Each 32-bit lane holds one complex value as a (real, imag) FP16 pair,
   hence the __v4sf casts and the 4-bit-per-complex mask usage. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A,
                                                              __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfcmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
}
||
1808 | |||
1809 | static __inline__ __m256h __DEFAULT_FN_ATTRS128 _mm256_fcmul_pch(__m256h __A, |
||
1810 | __m256h __B) { |
||
1811 | return (__m256h)__builtin_ia32_vfcmulcph256_mask( |
||
1812 | (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1); |
||
1813 | } |
||
1814 | |||
/* Masked 256-bit conjugating complex multiply: mask selects per complex
   element (32-bit lane); merge form keeps __W, zero form writes 0. */
static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__W, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfcmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
}
||
1826 | |||
/* Complex FP16 fused multiply-add with conjugation (vfcmaddcph builtin):
   result = A*conj?(B) + C per the builtin's semantics, one complex value
   per 32-bit lane.  The mask variant uses selectps (32-bit granularity)
   to keep __A in masked-off complex lanes; mask3 semantics (keep __C)
   come from the builtin's own masking. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A,
                                                               __m128h __B,
                                                               __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B,
                                        (__v4sf)__C, (__mmask8)__U),
      (__v4sf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfcmaddcph128_maskz(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A,
                                                                  __m256h __B,
                                                                  __m256h __C) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectps_256(
      __U,
      __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
                                        (__mmask8)__U),
      (__v8sf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfcmaddcph256_maskz(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U);
}
||
1882 | |||
/* Complex FP16 multiply, non-conjugating form (vfmulcph builtin); one
   complex value per 32-bit lane.  mask keeps __W; maskz writes zero. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A,
                                                             __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W,
                                                                  __mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                  (__v4sf)__W, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) {
  return (__m128h)__builtin_ia32_vfmulcph128_mask(
      (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A,
                                                                __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                  (__v8sf)__W, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) {
  return (__m256h)__builtin_ia32_vfmulcph256_mask(
      (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U);
}
||
1920 | |||
/* Complex FP16 fused multiply-add, non-conjugating form (vfmaddcph
   builtin); one complex value per 32-bit lane.  The mask variant uses
   selectps to keep __A in masked-off complex lanes; mask3 (keep __C)
   relies on the builtin's own masking; maskz uses the _maskz builtin. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A,
                                                              __m128h __B,
                                                              __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__C, (__mmask8)-1);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_selectps_128(
      __U,
      __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C,
                                       (__mmask8)__U),
      (__v4sf)__A);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) {
  return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B,
                                                   (__v4sf)__C, (__mmask8)__U);
}

static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) {
  return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B,
                                                    (__v4sf)__C, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A,
                                                                 __m256h __B,
                                                                 __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__C, (__mmask8)-1);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_selectps_256(
      __U,
      __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C,
                                       (__mmask8)__U),
      (__v8sf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) {
  return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B,
                                                   (__v8sf)__C, (__mmask8)__U);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) {
  return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B,
                                                    (__v8sf)__C, (__mmask8)__U);
}
||
1976 | |||
/* Element select and permute utilities on packed FP16.  blend picks __W
   where the mask bit is 1 and __A where it is 0.  The permute forms
   reuse the 16-bit-integer permute builtins, since FP16 lanes are
   bit-identical to 16-bit integer lanes for data movement. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U,
                                                                  __m128h __A,
                                                                  __m128h __W) {
  return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W,
                                              (__v8hf)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) {
  return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W,
                                              (__v16hf)__A);
}

/* Two-source permute: __I selects, per lane, an element from the
   concatenation of __A and __B. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) {
  return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I,
                                                 (__v8hi)__B);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) {
  return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I,
                                                 (__v16hi)__B);
}

/* Single-source permute of __B using the indices in __A. */
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_permutexvar_ph(__m128i __A, __m128h __B) {
  return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A);
}

static __inline__ __m256h __DEFAULT_FN_ATTRS256
_mm256_permutexvar_ph(__m256i __A, __m256h __B) {
  return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A);
}
||
2011 | |||
/* Horizontal reductions over all FP16 lanes, returning a scalar.
   The add reductions seed the fold with -0.0f16, the identity of
   floating-point addition (x + -0.0 == x for every x, including +0.0);
   the mul reductions seed with 1.0f16. */
static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_add_ph(__m256h __W) {
  return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_mul_ph(__m256h __W) {
  return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_max_ph(__m256h __V) {
  return __builtin_ia32_reduce_fmax_ph256(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS256
_mm256_reduce_min_ph(__m256h __V) {
  return __builtin_ia32_reduce_fmin_ph256(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_add_ph(__m128h __W) {
  return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_mul_ph(__m128h __W) {
  return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_max_ph(__m128h __V) {
  return __builtin_ia32_reduce_fmax_ph128(__V);
}

static __inline__ _Float16 __DEFAULT_FN_ATTRS128
_mm_reduce_min_ph(__m128h __V) {
  return __builtin_ia32_reduce_fmin_ph128(__V);
}
||
2051 | |||
// The _mm*_mul_pch and _mm*_cmul_pch intrinsics are pure aliases for the
// fmul_pch / fcmul_pch complex-multiply intrinsics defined above.
#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B)
#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B)
#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B)
#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B)
#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B)
#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B)

#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B)
#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B)
#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B)
#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B)
#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B)
#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B)
||
2066 | |||
2067 | #undef __DEFAULT_FN_ATTRS128 |
||
2068 | #undef __DEFAULT_FN_ATTRS256 |
||
2069 | |||
2070 | #endif |
||
2071 | #endif |