/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.

   NOTE: This is NOT a complete implementation of the SSE4 intrinsics! */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64/powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
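
/* For illustration only: a file ported from x86_64 can usually keep its
   existing #include <smmintrin.h> and be rebuilt for powerpc64le with
   something like

       clang -mcpu=power8 -DNO_WARN_X86_INTRINSICS -c foo.c

   (the CPU flag and file name here are placeholders).  Defining
   NO_WARN_X86_INTRINSICS only silences the error above; it does not change
   what these wrappers do. */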

#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_

#if defined(__powerpc64__) && \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
#include <tmmintrin.h>

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_ZERO 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_NEG_INF 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04

#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)

#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NO_EXC 0x08
54 | |||
55 | extern __inline __m128d |
||
56 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
57 | _mm_round_pd(__m128d __A, int __rounding) { |
||
58 | __v2df __r; |
||
59 | union { |
||
60 | double __fr; |
||
61 | long long __fpscr; |
||
62 | } __enables_save, __fpscr_save; |
||
63 | |||
64 | if (__rounding & _MM_FROUND_NO_EXC) { |
||
65 | /* Save enabled exceptions, disable all exceptions, |
||
66 | and preserve the rounding mode. */ |
||
67 | #ifdef _ARCH_PWR9 |
||
68 | __asm__("mffsce %0" : "=f"(__fpscr_save.__fr)); |
||
69 | __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; |
||
70 | #else |
||
71 | __fpscr_save.__fr = __builtin_mffs(); |
||
72 | __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8; |
||
73 | __fpscr_save.__fpscr &= ~0xf8; |
||
74 | __builtin_mtfsf(0b00000011, __fpscr_save.__fr); |
||
75 | #endif |
||
76 | /* Insert an artificial "read/write" reference to the variable |
||
77 | read below, to ensure the compiler does not schedule |
||
78 | a read/use of the variable before the FPSCR is modified, above. |
||
79 | This can be removed if and when GCC PR102783 is fixed. |
||
80 | */ |
||
81 | __asm__("" : "+wa"(__A)); |
||
82 | } |
||
83 | |||
84 | switch (__rounding) { |
||
85 | case _MM_FROUND_TO_NEAREST_INT: |
||
86 | __fpscr_save.__fr = __builtin_mffsl(); |
||
87 | __attribute__((fallthrough)); |
||
88 | case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC: |
||
89 | __builtin_set_fpscr_rn(0b00); |
||
90 | /* Insert an artificial "read/write" reference to the variable |
||
91 | read below, to ensure the compiler does not schedule |
||
92 | a read/use of the variable before the FPSCR is modified, above. |
||
93 | This can be removed if and when GCC PR102783 is fixed. |
||
94 | */ |
||
95 | __asm__("" : "+wa"(__A)); |
||
96 | |||
97 | __r = vec_rint((__v2df)__A); |
||
98 | |||
99 | /* Insert an artificial "read" reference to the variable written |
||
100 | above, to ensure the compiler does not schedule the computation |
||
101 | of the value after the manipulation of the FPSCR, below. |
||
102 | This can be removed if and when GCC PR102783 is fixed. |
||
103 | */ |
||
104 | __asm__("" : : "wa"(__r)); |
||
105 | __builtin_set_fpscr_rn(__fpscr_save.__fpscr); |
||
106 | break; |
||
107 | case _MM_FROUND_TO_NEG_INF: |
||
108 | case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC: |
||
109 | __r = vec_floor((__v2df)__A); |
||
110 | break; |
||
111 | case _MM_FROUND_TO_POS_INF: |
||
112 | case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC: |
||
113 | __r = vec_ceil((__v2df)__A); |
||
114 | break; |
||
115 | case _MM_FROUND_TO_ZERO: |
||
116 | case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC: |
||
117 | __r = vec_trunc((__v2df)__A); |
||
118 | break; |
||
119 | case _MM_FROUND_CUR_DIRECTION: |
||
120 | __r = vec_rint((__v2df)__A); |
||
121 | break; |
||
122 | } |
||
123 | if (__rounding & _MM_FROUND_NO_EXC) { |
||
124 | /* Insert an artificial "read" reference to the variable written |
||
125 | above, to ensure the compiler does not schedule the computation |
||
126 | of the value after the manipulation of the FPSCR, below. |
||
127 | This can be removed if and when GCC PR102783 is fixed. |
||
128 | */ |
||
129 | __asm__("" : : "wa"(__r)); |
||
130 | /* Restore enabled exceptions. */ |
||
131 | __fpscr_save.__fr = __builtin_mffsl(); |
||
132 | __fpscr_save.__fpscr |= __enables_save.__fpscr; |
||
133 | __builtin_mtfsf(0b00000011, __fpscr_save.__fr); |
||
134 | } |
||
135 | return (__m128d)__r; |
||
136 | } |

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_sd(__m128d __A, __m128d __B, int __rounding) {
  __B = _mm_round_pd(__B, __rounding);
  __v2df __r = {((__v2df)__B)[0], ((__v2df)__A)[1]};
  return (__m128d)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ps(__m128 __A, int __rounding) {
  __v4sf __r;
  union {
    double __fr;
    long long __fpscr;
  } __enables_save, __fpscr_save;

  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Save enabled exceptions, disable all exceptions,
       and preserve the rounding mode. */
#ifdef _ARCH_PWR9
    __asm__("mffsce %0" : "=f"(__fpscr_save.__fr));
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
#else
    __fpscr_save.__fr = __builtin_mffs();
    __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
    __fpscr_save.__fpscr &= ~0xf8;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
#endif
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));
  }

  switch (__rounding) {
  case _MM_FROUND_TO_NEAREST_INT:
    __fpscr_save.__fr = __builtin_mffsl();
    __attribute__((fallthrough));
  case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
    __builtin_set_fpscr_rn(0b00);
    /* Insert an artificial "read/write" reference to the variable
       read below, to ensure the compiler does not schedule
       a read/use of the variable before the FPSCR is modified, above.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : "+wa"(__A));

    __r = vec_rint((__v4sf)__A);

    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    __builtin_set_fpscr_rn(__fpscr_save.__fpscr);
    break;
  case _MM_FROUND_TO_NEG_INF:
  case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
    __r = vec_floor((__v4sf)__A);
    break;
  case _MM_FROUND_TO_POS_INF:
  case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
    __r = vec_ceil((__v4sf)__A);
    break;
  case _MM_FROUND_TO_ZERO:
  case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
    __r = vec_trunc((__v4sf)__A);
    break;
  case _MM_FROUND_CUR_DIRECTION:
    __r = vec_rint((__v4sf)__A);
    break;
  }
  if (__rounding & _MM_FROUND_NO_EXC) {
    /* Insert an artificial "read" reference to the variable written
       above, to ensure the compiler does not schedule the computation
       of the value after the manipulation of the FPSCR, below.
       This can be removed if and when GCC PR102783 is fixed.
     */
    __asm__("" : : "wa"(__r));
    /* Restore enabled exceptions. */
    __fpscr_save.__fr = __builtin_mffsl();
    __fpscr_save.__fpscr |= __enables_save.__fpscr;
    __builtin_mtfsf(0b00000011, __fpscr_save.__fr);
  }
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_round_ss(__m128 __A, __m128 __B, int __rounding) {
  __B = _mm_round_ps(__B, __rounding);
  __v4sf __r = (__v4sf)__A;
  __r[0] = ((__v4sf)__B)[0];
  return (__m128)__r;
}

#define _mm_ceil_pd(V) _mm_round_pd((V), _MM_FROUND_CEIL)
#define _mm_ceil_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
#define _mm_floor_sd(D, V) _mm_round_sd((D), (V), _MM_FROUND_FLOOR)

#define _mm_ceil_ps(V) _mm_round_ps((V), _MM_FROUND_CEIL)
#define _mm_ceil_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_CEIL)

#define _mm_floor_ps(V) _mm_round_ps((V), _MM_FROUND_FLOOR)
#define _mm_floor_ss(D, V) _mm_round_ss((D), (V), _MM_FROUND_FLOOR)
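
/* Usage note: each macro above just forwards a fixed rounding mode, e.g.
   _mm_ceil_ps (__A) is _mm_round_ps (__A, _MM_FROUND_CEIL), which reduces to
   vec_ceil.  The *_sd/*_ss forms round only element 0 of their second
   argument and copy the remaining element(s) from the first argument. */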

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
  __v16qi __result = (__v16qi)__A;

  __result[__N & 0xf] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
  __v4si __result = (__v4si)__A;

  __result[__N & 3] = __D;

  return (__m128i)__result;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
  __v2di __result = (__v2di)__A;

  __result[__N & 1] = __D;

  return (__m128i)__result;
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi8(__m128i __X, const int __N) {
  return (unsigned char)((__v16qi)__X)[__N & 15];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi32(__m128i __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_epi64(__m128i __X, const int __N) {
  return ((__v2di)__X)[__N & 1];
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_extract_ps(__m128 __X, const int __N) {
  return ((__v4si)__X)[__N & 3];
}
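
/* Note: as with the x86 intrinsic, _mm_extract_ps returns the raw bit
   pattern of the selected float reinterpreted as an int, not a converted
   integer value; e.g. a lane holding 1.0f is returned as 0x3f800000. */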

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
  __v16qi __charmask = vec_splats((signed char)__imm8);
  __charmask = vec_gb(__charmask);
  __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
#ifdef __BIG_ENDIAN__
  __shortmask = vec_reve(__shortmask);
#endif
  return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
}
#endif
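
/* How the mask above is built (sketch): vec_splats replicates __imm8 into
   every byte, vec_gb spreads each selector bit of that byte into a full
   0x00/0xff byte, and vec_unpackh widens those bytes into halfword
   selectors for vec_sel.  For example, __imm8 == 0x0f is expected to take
   halfwords 0-3 from __B and halfwords 4-7 from __A, matching the x86
   _mm_blend_epi16 behavior. */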

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
#ifdef _ARCH_PWR10
  return (__m128i)vec_blendv((__v16qi)__A, (__v16qi)__B, (__v16qu)__mask);
#else
  const __v16qu __seven = vec_splats((unsigned char)0x07);
  __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
  return (__m128i)vec_sel((__v16qi)__A, (__v16qi)__B, __lmask);
#endif
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_ps(__m128 __A, __m128 __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31},
  };
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128)__r;
}

extern __inline __m128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_ps(__m128 __A, __m128 __B, __m128 __mask) {
#ifdef _ARCH_PWR10
  return (__m128)vec_blendv((__v4sf)__A, (__v4sf)__B, (__v4su)__mask);
#else
  const __v4si __zero = {0};
  const __vector __bool int __boolmask = vec_cmplt((__v4si)__mask, __zero);
  return (__m128)vec_sel((__v4su)__A, (__v4su)__B, (__v4su)__boolmask);
#endif
}

extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blend_pd(__m128d __A, __m128d __B, const int __imm8) {
  __v16qu __pcv[] = {
      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
      {16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15},
      {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31},
      {16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31}};
  __v16qu __r = vec_perm((__v16qu)__A, (__v16qu)__B, __pcv[__imm8]);
  return (__m128d)__r;
}

#ifdef _ARCH_PWR8
extern __inline __m128d
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_blendv_pd(__m128d __A, __m128d __B, __m128d __mask) {
#ifdef _ARCH_PWR10
  return (__m128d)vec_blendv((__v2df)__A, (__v2df)__B, (__v2du)__mask);
#else
  const __v2di __zero = {0};
  const __vector __bool long long __boolmask =
      vec_cmplt((__v2di)__mask, __zero);
  return (__m128d)vec_sel((__v2du)__A, (__v2du)__B, (__v2du)__boolmask);
#endif
}
#endif

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testz_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  return vec_all_eq(vec_and((__v16qu)__A, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  const __v16qu __zero = {0};
  const __v16qu __notA = vec_nor((__v16qu)__A, (__v16qu)__A);
  return vec_all_eq(vec_and((__v16qu)__notA, (__v16qu)__B), __zero);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_testnzc_si128(__m128i __A, __m128i __B) {
  /* Note: This implementation does NOT set "zero" or "carry" flags. */
  return _mm_testz_si128(__A, __B) == 0 && _mm_testc_si128(__A, __B) == 0;
}

#define _mm_test_all_zeros(M, V) _mm_testz_si128((M), (V))

#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))

#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
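
/* Example: _mm_test_all_ones (V) builds an all-ones reference with
   _mm_cmpeq_epi32 (V, V) and then checks that V covers it via
   _mm_testc_si128.  Unlike the x86 versions, these helpers only return an
   int result and never touch condition flags (see the notes above). */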

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpeq((__v2di)__X, (__v2di)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_min_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_min((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi8(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v16qi)__X, (__v16qi)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu16(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v8hu)__X, (__v8hu)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4si)__X, (__v4si)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_max_epu32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_max((__v4su)__X, (__v4su)__Y);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mul((__v4su)__X, (__v4su)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mul_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_mule((__v4si)__X, (__v4si)__Y);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi16(__m128i __A) {
  return (__m128i)vec_unpackh((__v16qi)__A);
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi32(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi8_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v16qi)__A);
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi32(__m128i __A) {
  return (__m128i)vec_unpackh((__v8hi)__A);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi16_epi64(__m128i __A) {
  __A = (__m128i)vec_unpackh((__v8hi)__A);
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepi32_epi64(__m128i __A) {
  return (__m128i)vec_unpackh((__v4si)__A);
}
#endif

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi16(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi32(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu8_epi64(__m128i __A) {
  const __v16qu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v16qu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v8hu)__A, (__v8hu)__zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v16qu)__A);
  __A = (__m128i)vec_mergeh((__v8hu)__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi32(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu16_epi64(__m128i __A) {
  const __v8hu __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v8hu)__A, __zero);
  __A = (__m128i)vec_mergeh((__v4su)__A, (__v4su)__zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v8hu)__A);
  __A = (__m128i)vec_mergeh((__v4su)__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtepu32_epi64(__m128i __A) {
  const __v4su __zero = {0};
#ifdef __LITTLE_ENDIAN__
  __A = (__m128i)vec_mergeh((__v4su)__A, __zero);
#else /* __BIG_ENDIAN__. */
  __A = (__m128i)vec_mergeh(__zero, (__v4su)__A);
#endif /* __BIG_ENDIAN__. */
  return __A;
}

/* Return horizontal packed word minimum and its index in bits [15:0]
   and bits [18:16] respectively. */
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_minpos_epu16(__m128i __A) {
  union __u {
    __m128i __m;
    __v8hu __uh;
  };
  union __u __u = {.__m = __A}, __r = {.__m = {0}};
  unsigned short __ridx = 0;
  unsigned short __rmin = __u.__uh[__ridx];
  unsigned long __i;
  for (__i = 1; __i < 8; __i++) {
    if (__u.__uh[__i] < __rmin) {
      __rmin = __u.__uh[__i];
      __ridx = __i;
    }
  }
  __r.__uh[0] = __rmin;
  __r.__uh[1] = __ridx;
  return __r.__m;
}
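
/* Example (illustrative): for the input halfwords {7, 3, 9, 3, 8, 5, 6, 4}
   the result holds the minimum 3 in bits [15:0] and the index of its first
   occurrence, 1, in bits [18:16]; all remaining bits are zero. */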

extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packus_epi32(__m128i __X, __m128i __Y) {
  return (__m128i)vec_packsu((__v4si)__X, (__v4si)__Y);
}

#ifdef _ARCH_PWR8
extern __inline __m128i
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_epi64(__m128i __X, __m128i __Y) {
  return (__m128i)vec_cmpgt((__v2di)__X, (__v2di)__Y);
}
#endif

#else
#include_next <smmintrin.h>
#endif /* defined(__powerpc64__) && \
        * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */

#endif /* SMMINTRIN_H_ */