Line No. | Line |
---|---|
1 | /*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------=== |
2 | * |
||
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
||
4 | * See https://llvm.org/LICENSE.txt for license information. |
||
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
||
6 | * |
||
7 | *===-----------------------------------------------------------------------=== |
||
8 | */ |
||
9 | |||
10 | /* Implemented from the specification included in the Intel C++ Compiler |
||
11 | User Guide and Reference, version 9.0. */ |
||
12 | |||
13 | #ifndef NO_WARN_X86_INTRINSICS |
||
14 | /* This header file is intended to help port code that explicitly uses |
||
15 | Intel intrinsics from x86_64 to powerpc64/powerpc64le. |
||
16 | |||
17 | Since X86 SSE intrinsics mainly handle the __m128 type, the PowerPC |
||
18 | VMX/VSX ISA is a good match for vector float SIMD operations. |
||
19 | However, scalar float operations in vector (XMM) registers require |
||
20 | the POWER8 VSX ISA (2.07) level. There are differences in the data |
||
21 | format and placement of float scalars in the vector register, which |
||
22 | require extra steps to match SSE scalar float semantics on POWER. |
||
23 | |||
24 | It should be noted that there are significant differences between X86_64's |
||
25 | MXCSR and PowerISA's FPSCR/VSCR registers. It's recommended to use the |
||
26 | portable <fenv.h> interface instead of accessing the MXCSR directly. |
||
27 | |||
28 | Most SSE scalar float intrinsic operations can be performed more |
||
29 | efficiently as C language float scalar operations or optimized to |
||
30 | use vector SIMD operations. We recommend this for new applications. */ |
||
31 | #error \ |
||
32 | "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error." |
||
33 | #endif |
||
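The comment above steers users away from the MXCSR toward <fenv.h>; a minimal sketch of what that portable replacement looks like (plain ISO C, independent of this header; the values are illustrative):

```c
/* Portable rounding-mode control through <fenv.h>, as recommended above
   instead of touching the MXCSR. Illustrative sketch only; strictly
   conforming code would also use "#pragma STDC FENV_ACCESS ON". */
#include <fenv.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  int saved = fegetround();    /* remember the current rounding mode */
  fesetround(FE_TOWARDZERO);   /* what x86 code would do via MXCSR bits */
  printf("%.1f\n", rint(2.7)); /* prints 2.0 while rounding toward zero */
  fesetround(saved);           /* restore the previous mode */
  return 0;
}
```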
34 | |||
35 | #ifndef XMMINTRIN_H_ |
||
36 | #define XMMINTRIN_H_ |
||
37 | |||
38 | #if defined(__powerpc64__) && \ |
||
39 | (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) |
||
40 | |||
41 | /* Define four value permute mask */ |
||
42 | #define _MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z)) |
||
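For reference, _MM_SHUFFLE packs four 2-bit element selectors, w into bits 7:6 down to z into bits 1:0; a small self-contained check (using a local copy of the macro so the snippet stands alone) illustrates the encoding:

```c
/* Standalone illustration of the _MM_SHUFFLE encoding; the local macro
   mirrors the definition above. */
#include <assert.h>

#define MY_MM_SHUFFLE(w, x, y, z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))

int main(void) {
  assert(MY_MM_SHUFFLE(3, 2, 1, 0) == 0xE4); /* identity selector */
  assert(MY_MM_SHUFFLE(0, 1, 2, 3) == 0x1B); /* reverses the elements */
  assert(MY_MM_SHUFFLE(0, 0, 0, 0) == 0x00); /* broadcasts element 0 */
  return 0;
}
```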
43 | |||
44 | #include <altivec.h> |
||
45 | |||
46 | /* Avoid collisions between altivec.h and strict adherence to C++ and |
||
47 | C11 standards. This should eventually be done inside altivec.h itself, |
||
48 | but only after testing a full distro build. */ |
||
49 | #if defined(__STRICT_ANSI__) && \ |
||
50 | (defined(__cplusplus) || \ |
||
51 | (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)) |
||
52 | #undef vector |
||
53 | #undef pixel |
||
54 | #undef bool |
||
55 | #endif |
||
56 | |||
57 | /* We need type definitions from the MMX header file. */ |
||
58 | #include <mmintrin.h> |
||
59 | |||
60 | /* Get _mm_malloc () and _mm_free (). */ |
||
61 | #if __STDC_HOSTED__ |
||
62 | #include <mm_malloc.h> |
||
63 | #endif |
||
64 | |||
65 | /* The Intel API is flexible enough that we must allow aliasing with other |
||
66 | vector types, and their scalar components. */ |
||
67 | typedef vector float __m128 __attribute__((__may_alias__)); |
||
68 | |||
69 | /* Unaligned version of the same type. */ |
||
70 | typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1))); |
||
71 | |||
72 | /* Internal data types for implementing the intrinsics. */ |
||
73 | typedef vector float __v4sf; |
||
74 | |||
75 | /* Create an undefined vector. */ |
||
76 | extern __inline __m128 |
||
77 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
78 | _mm_undefined_ps(void) { |
||
79 | __m128 __Y = __Y; |
||
80 | return __Y; |
||
81 | } |
||
82 | |||
83 | /* Create a vector of zeros. */ |
||
84 | extern __inline __m128 |
||
85 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
86 | _mm_setzero_ps(void) { |
||
87 | return __extension__(__m128){0.0f, 0.0f, 0.0f, 0.0f}; |
||
88 | } |
||
89 | |||
90 | /* Load four SPFP values from P. The address must be 16-byte aligned. */ |
||
91 | extern __inline __m128 |
||
92 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
93 | _mm_load_ps(float const *__P) { |
||
94 | return ((__m128)vec_ld(0, (__v4sf *)__P)); |
||
95 | } |
||
96 | |||
97 | /* Load four SPFP values from P. The address need not be 16-byte aligned. */ |
||
98 | extern __inline __m128 |
||
99 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
100 | _mm_loadu_ps(float const *__P) { |
||
101 | return (vec_vsx_ld(0, __P)); |
||
102 | } |
||
103 | |||
104 | /* Load four SPFP values in reverse order. The address must be aligned. */ |
||
105 | extern __inline __m128 |
||
106 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
107 | _mm_loadr_ps(float const *__P) { |
||
108 | __v4sf __tmp; |
||
109 | __m128 __result; |
||
110 | static const __vector unsigned char __permute_vector = { |
||
111 | 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, |
||
112 | 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13}; |
||
113 | |||
114 | __tmp = vec_ld(0, (__v4sf *)__P); |
||
115 | __result = (__m128)vec_perm(__tmp, __tmp, __permute_vector); |
||
116 | return __result; |
||
117 | } |
||
118 | |||
119 | /* Create a vector with all four elements equal to F. */ |
||
120 | extern __inline __m128 |
||
121 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
122 | _mm_set1_ps(float __F) { |
||
123 | return __extension__(__m128)(__v4sf){__F, __F, __F, __F}; |
||
124 | } |
||
125 | |||
126 | extern __inline __m128 |
||
127 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
128 | _mm_set_ps1(float __F) { |
||
129 | return _mm_set1_ps(__F); |
||
130 | } |
||
131 | |||
132 | /* Create the vector [Z Y X W]. */ |
||
133 | extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, |
||
134 | __artificial__)) |
||
135 | _mm_set_ps(const float __Z, const float __Y, const float __X, const float __W) { |
||
136 | return __extension__(__m128)(__v4sf){__W, __X, __Y, __Z}; |
||
137 | } |
||
138 | |||
139 | /* Create the vector [W X Y Z]. */ |
||
140 | extern __inline __m128 |
||
141 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
142 | _mm_setr_ps(float __Z, float __Y, float __X, float __W) { |
||
143 | return __extension__(__m128)(__v4sf){__Z, __Y, __X, __W}; |
||
144 | } |
||
145 | |||
146 | /* Store four SPFP values. The address must be 16-byte aligned. */ |
||
147 | extern __inline void |
||
148 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
149 | _mm_store_ps(float *__P, __m128 __A) { |
||
150 | vec_st((__v4sf)__A, 0, (__v4sf *)__P); |
||
151 | } |
||
152 | |||
153 | /* Store four SPFP values. The address need not be 16-byte aligned. */ |
||
154 | extern __inline void |
||
155 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
156 | _mm_storeu_ps(float *__P, __m128 __A) { |
||
157 | *(__m128_u *)__P = __A; |
||
158 | } |
||
159 | |||
160 | /* Store four SPFP values in reverse order. The address must be aligned. */ |
||
161 | extern __inline void |
||
162 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
163 | _mm_storer_ps(float *__P, __m128 __A) { |
||
164 | __v4sf __tmp; |
||
165 | static const __vector unsigned char __permute_vector = { |
||
166 | 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, |
||
167 | 0x14, 0x15, 0x16, 0x17, 0x10, 0x11, 0x12, 0x13}; |
||
168 | |||
169 | __tmp = (__m128)vec_perm(__A, __A, __permute_vector); |
||
170 | |||
171 | _mm_store_ps(__P, __tmp); |
||
172 | } |
||
173 | |||
174 | /* Store the lower SPFP value across four words. */ |
||
175 | extern __inline void |
||
176 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
177 | _mm_store1_ps(float *__P, __m128 __A) { |
||
178 | __v4sf __va = vec_splat((__v4sf)__A, 0); |
||
179 | _mm_store_ps(__P, __va); |
||
180 | } |
||
181 | |||
182 | extern __inline void |
||
183 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
184 | _mm_store_ps1(float *__P, __m128 __A) { |
||
185 | _mm_store1_ps(__P, __A); |
||
186 | } |
||
187 | |||
188 | /* Create a vector with element 0 as F and the rest zero. */ |
||
189 | extern __inline __m128 |
||
190 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
191 | _mm_set_ss(float __F) { |
||
192 | return __extension__(__m128)(__v4sf){__F, 0.0f, 0.0f, 0.0f}; |
||
193 | } |
||
194 | |||
195 | /* Sets the low SPFP value of A from the low value of B. */ |
||
196 | extern __inline __m128 |
||
197 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
198 | _mm_move_ss(__m128 __A, __m128 __B) { |
||
199 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
200 | |||
201 | return (vec_sel((__v4sf)__A, (__v4sf)__B, __mask)); |
||
202 | } |
||
203 | |||
204 | /* Create a vector with element 0 as *P and the rest zero. */ |
||
205 | extern __inline __m128 |
||
206 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
207 | _mm_load_ss(float const *__P) { |
||
208 | return _mm_set_ss(*__P); |
||
209 | } |
||
210 | |||
211 | /* Stores the lower SPFP value. */ |
||
212 | extern __inline void |
||
213 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
214 | _mm_store_ss(float *__P, __m128 __A) { |
||
215 | *__P = ((__v4sf)__A)[0]; |
||
216 | } |
||
217 | |||
218 | /* Perform the respective operation on the lower SPFP (single-precision |
||
219 | floating-point) values of A and B; the upper three SPFP values are |
||
220 | passed through from A. */ |
||
221 | |||
222 | extern __inline __m128 |
||
223 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
224 | _mm_add_ss(__m128 __A, __m128 __B) { |
||
225 | #ifdef _ARCH_PWR7 |
||
226 | __m128 __a, __b, __c; |
||
227 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
228 | /* PowerISA VSX does not allow partial (for just lower float) |
||
229 | results. So to ensure we don't generate spurious exceptions |
||
230 | (from the upper float values) we splat the lower float |
||
231 | before we do the operation. */ |
||
232 | __a = vec_splat(__A, 0); |
||
233 | __b = vec_splat(__B, 0); |
||
234 | __c = __a + __b; |
||
235 | /* Then we merge the lower float result with the original upper |
||
236 | float elements from __A. */ |
||
237 | return (vec_sel(__A, __c, __mask)); |
||
238 | #else |
||
239 | __A[0] = __A[0] + __B[0]; |
||
240 | return (__A); |
||
241 | #endif |
||
242 | } |
||
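The splat-and-select sequence above exists solely so the unused upper lanes cannot raise spurious exceptions; what it preserves is the ordinary scalar `*_ss` semantics, sketched below as an illustrative reference model (the `f4` and `add_ss_ref` names are hypothetical, not part of the header):

```c
/* Reference model of the *_ss lane semantics the code above preserves:
   element 0 is combined, elements 1..3 pass through from the first
   operand. Illustrative only. */
struct f4 { float v[4]; };

static struct f4 add_ss_ref(struct f4 a, struct f4 b) {
  a.v[0] = a.v[0] + b.v[0]; /* only element 0 participates */
  return a;                 /* elements 1..3 come from a unchanged */
}
```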
243 | |||
244 | extern __inline __m128 |
||
245 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
246 | _mm_sub_ss(__m128 __A, __m128 __B) { |
||
247 | #ifdef _ARCH_PWR7 |
||
248 | __m128 __a, __b, __c; |
||
249 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
250 | /* PowerISA VSX does not allow partial (for just lower float) |
||
251 | results. So to ensure we don't generate spurious exceptions |
||
252 | (from the upper float values) we splat the lower float |
||
253 | before we do the operation. */ |
||
254 | __a = vec_splat(__A, 0); |
||
255 | __b = vec_splat(__B, 0); |
||
256 | __c = __a - __b; |
||
257 | /* Then we merge the lower float result with the original upper |
||
258 | float elements from __A. */ |
||
259 | return (vec_sel(__A, __c, __mask)); |
||
260 | #else |
||
261 | __A[0] = __A[0] - __B[0]; |
||
262 | return (__A); |
||
263 | #endif |
||
264 | } |
||
265 | |||
266 | extern __inline __m128 |
||
267 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
268 | _mm_mul_ss(__m128 __A, __m128 __B) { |
||
269 | #ifdef _ARCH_PWR7 |
||
270 | __m128 __a, __b, __c; |
||
271 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
272 | /* PowerISA VSX does not allow partial (for just lower float) |
||
273 | results. So to ensure we don't generate spurious exceptions |
||
274 | (from the upper float values) we splat the lower float |
||
275 | before we do the operation. */ |
||
276 | __a = vec_splat(__A, 0); |
||
277 | __b = vec_splat(__B, 0); |
||
278 | __c = __a * __b; |
||
279 | /* Then we merge the lower float result with the original upper |
||
280 | float elements from __A. */ |
||
281 | return (vec_sel(__A, __c, __mask)); |
||
282 | #else |
||
283 | __A[0] = __A[0] * __B[0]; |
||
284 | return (__A); |
||
285 | #endif |
||
286 | } |
||
287 | |||
288 | extern __inline __m128 |
||
289 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
290 | _mm_div_ss(__m128 __A, __m128 __B) { |
||
291 | #ifdef _ARCH_PWR7 |
||
292 | __m128 __a, __b, __c; |
||
293 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
294 | /* PowerISA VSX does not allow partial (for just lower float) |
||
295 | results. So to ensure we don't generate spurious exceptions |
||
296 | (from the upper float values) we splat the lower float |
||
297 | before we do the operation. */ |
||
298 | __a = vec_splat(__A, 0); |
||
299 | __b = vec_splat(__B, 0); |
||
300 | __c = __a / __b; |
||
301 | /* Then we merge the lower float result with the original upper |
||
302 | float elements from __A. */ |
||
303 | return (vec_sel(__A, __c, __mask)); |
||
304 | #else |
||
305 | __A[0] = __A[0] / __B[0]; |
||
306 | return (__A); |
||
307 | #endif |
||
308 | } |
||
309 | |||
310 | extern __inline __m128 |
||
311 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
312 | _mm_sqrt_ss(__m128 __A) { |
||
313 | __m128 __a, __c; |
||
314 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
315 | /* PowerISA VSX does not allow partial (for just lower float) |
||
316 | * results. So to ensure we don't generate spurious exceptions |
||
317 | * (from the upper float values) we splat the lower float |
||
318 | * before we do the operation. */ |
||
319 | __a = vec_splat(__A, 0); |
||
320 | __c = vec_sqrt(__a); |
||
321 | /* Then we merge the lower float result with the original upper |
||
322 | * float elements from __A. */ |
||
323 | return (vec_sel(__A, __c, __mask)); |
||
324 | } |
||
325 | |||
326 | /* Perform the respective operation on the four SPFP values in A and B. */ |
||
327 | extern __inline __m128 |
||
328 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
329 | _mm_add_ps(__m128 __A, __m128 __B) { |
||
330 | return (__m128)((__v4sf)__A + (__v4sf)__B); |
||
331 | } |
||
332 | |||
333 | extern __inline __m128 |
||
334 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
335 | _mm_sub_ps(__m128 __A, __m128 __B) { |
||
336 | return (__m128)((__v4sf)__A - (__v4sf)__B); |
||
337 | } |
||
338 | |||
339 | extern __inline __m128 |
||
340 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
341 | _mm_mul_ps(__m128 __A, __m128 __B) { |
||
342 | return (__m128)((__v4sf)__A * (__v4sf)__B); |
||
343 | } |
||
344 | |||
345 | extern __inline __m128 |
||
346 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
347 | _mm_div_ps(__m128 __A, __m128 __B) { |
||
348 | return (__m128)((__v4sf)__A / (__v4sf)__B); |
||
349 | } |
||
350 | |||
351 | extern __inline __m128 |
||
352 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
353 | _mm_sqrt_ps(__m128 __A) { |
||
354 | return (vec_sqrt((__v4sf)__A)); |
||
355 | } |
||
356 | |||
357 | extern __inline __m128 |
||
358 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
359 | _mm_rcp_ps(__m128 __A) { |
||
360 | return (vec_re((__v4sf)__A)); |
||
361 | } |
||
362 | |||
363 | extern __inline __m128 |
||
364 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
365 | _mm_rsqrt_ps(__m128 __A) { |
||
366 | return (vec_rsqrte(__A)); |
||
367 | } |
||
368 | |||
369 | extern __inline __m128 |
||
370 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
371 | _mm_rcp_ss(__m128 __A) { |
||
372 | __m128 __a, __c; |
||
373 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
374 | /* PowerISA VSX does not allow partial (for just lower float) |
||
375 | * results. So to ensure we don't generate spurious exceptions |
||
376 | * (from the upper float values) we splat the lower float |
||
377 | * before we do the operation. */ |
||
378 | __a = vec_splat(__A, 0); |
||
379 | __c = _mm_rcp_ps(__a); |
||
380 | /* Then we merge the lower float result with the original upper |
||
381 | * float elements from __A. */ |
||
382 | return (vec_sel(__A, __c, __mask)); |
||
383 | } |
||
384 | |||
385 | extern __inline __m128 |
||
386 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
387 | _mm_rsqrt_ss(__m128 __A) { |
||
388 | __m128 __a, __c; |
||
389 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
390 | /* PowerISA VSX does not allow partial (for just lower float) |
||
391 | * results. So to ensure we don't generate spurious exceptions |
||
392 | * (from the upper float values) we splat the lower float |
||
393 | * before we do the operation. */ |
||
394 | __a = vec_splat(__A, 0); |
||
395 | __c = vec_rsqrte(__a); |
||
396 | /* Then we merge the lower float result with the original upper |
||
397 | * float elements from __A. */ |
||
398 | return (vec_sel(__A, __c, __mask)); |
||
399 | } |
||
400 | |||
401 | extern __inline __m128 |
||
402 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
403 | _mm_min_ss(__m128 __A, __m128 __B) { |
||
404 | __v4sf __a, __b, __c; |
||
405 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
406 | /* PowerISA VSX does not allow partial (for just lower float) |
||
407 | * results. So to ensure we don't generate spurious exceptions |
||
408 | * (from the upper float values) we splat the lower float |
||
409 | * before we do the operation. */ |
||
410 | __a = vec_splat((__v4sf)__A, 0); |
||
411 | __b = vec_splat((__v4sf)__B, 0); |
||
412 | __c = vec_min(__a, __b); |
||
413 | /* Then we merge the lower float result with the original upper |
||
414 | * float elements from __A. */ |
||
415 | return (vec_sel((__v4sf)__A, __c, __mask)); |
||
416 | } |
||
417 | |||
418 | extern __inline __m128 |
||
419 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
420 | _mm_max_ss(__m128 __A, __m128 __B) { |
||
421 | __v4sf __a, __b, __c; |
||
422 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
423 | /* PowerISA VSX does not allow partial (for just lower float) |
||
424 | * results. So to ensure we don't generate spurious exceptions |
||
425 | * (from the upper float values) we splat the lower float |
||
426 | * before we do the operation. */ |
||
427 | __a = vec_splat(__A, 0); |
||
428 | __b = vec_splat(__B, 0); |
||
429 | __c = vec_max(__a, __b); |
||
430 | /* Then we merge the lower float result with the original upper |
||
431 | * float elements from __A. */ |
||
432 | return (vec_sel((__v4sf)__A, __c, __mask)); |
||
433 | } |
||
434 | |||
435 | extern __inline __m128 |
||
436 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
437 | _mm_min_ps(__m128 __A, __m128 __B) { |
||
438 | __vector __bool int __m = vec_cmpgt((__v4sf)__B, (__v4sf)__A); |
||
439 | return vec_sel(__B, __A, __m); |
||
440 | } |
||
441 | |||
442 | extern __inline __m128 |
||
443 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
444 | _mm_max_ps(__m128 __A, __m128 __B) { |
||
445 | __vector __bool int __m = vec_cmpgt((__v4sf)__A, (__v4sf)__B); |
||
446 | return vec_sel(__B, __A, __m); |
||
447 | } |
||
448 | |||
449 | /* Perform logical bit-wise operations on 128-bit values. */ |
||
450 | extern __inline __m128 |
||
451 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
452 | _mm_and_ps(__m128 __A, __m128 __B) { |
||
453 | return ((__m128)vec_and((__v4sf)__A, (__v4sf)__B)); |
||
454 | // return __builtin_ia32_andps (__A, __B); |
||
455 | } |
||
456 | |||
457 | extern __inline __m128 |
||
458 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
459 | _mm_andnot_ps(__m128 __A, __m128 __B) { |
||
460 | return ((__m128)vec_andc((__v4sf)__B, (__v4sf)__A)); |
||
461 | } |
||
462 | |||
463 | extern __inline __m128 |
||
464 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
465 | _mm_or_ps(__m128 __A, __m128 __B) { |
||
466 | return ((__m128)vec_or((__v4sf)__A, (__v4sf)__B)); |
||
467 | } |
||
468 | |||
469 | extern __inline __m128 |
||
470 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
471 | _mm_xor_ps(__m128 __A, __m128 __B) { |
||
472 | return ((__m128)vec_xor((__v4sf)__A, (__v4sf)__B)); |
||
473 | } |
||
474 | |||
475 | /* Perform a comparison on the four SPFP values of A and B. For each |
||
476 | element, if the comparison is true, place a mask of all ones in the |
||
477 | result, otherwise a mask of zeros. */ |
||
478 | extern __inline __m128 |
||
479 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
480 | _mm_cmpeq_ps(__m128 __A, __m128 __B) { |
||
481 | return ((__m128)vec_cmpeq((__v4sf)__A, (__v4sf)__B)); |
||
482 | } |
||
483 | |||
484 | extern __inline __m128 |
||
485 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
486 | _mm_cmplt_ps(__m128 __A, __m128 __B) { |
||
487 | return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B)); |
||
488 | } |
||
489 | |||
490 | extern __inline __m128 |
||
491 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
492 | _mm_cmple_ps(__m128 __A, __m128 __B) { |
||
493 | return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B)); |
||
494 | } |
||
495 | |||
496 | extern __inline __m128 |
||
497 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
498 | _mm_cmpgt_ps(__m128 __A, __m128 __B) { |
||
499 | return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B)); |
||
500 | } |
||
501 | |||
502 | extern __inline __m128 |
||
503 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
504 | _mm_cmpge_ps(__m128 __A, __m128 __B) { |
||
505 | return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B)); |
||
506 | } |
||
507 | |||
508 | extern __inline __m128 |
||
509 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
510 | _mm_cmpneq_ps(__m128 __A, __m128 __B) { |
||
511 | __v4sf __temp = (__v4sf)vec_cmpeq((__v4sf)__A, (__v4sf)__B); |
||
512 | return ((__m128)vec_nor(__temp, __temp)); |
||
513 | } |
||
514 | |||
515 | extern __inline __m128 |
||
516 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
517 | _mm_cmpnlt_ps(__m128 __A, __m128 __B) { |
||
518 | return ((__m128)vec_cmpge((__v4sf)__A, (__v4sf)__B)); |
||
519 | } |
||
520 | |||
521 | extern __inline __m128 |
||
522 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
523 | _mm_cmpnle_ps(__m128 __A, __m128 __B) { |
||
524 | return ((__m128)vec_cmpgt((__v4sf)__A, (__v4sf)__B)); |
||
525 | } |
||
526 | |||
527 | extern __inline __m128 |
||
528 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
529 | _mm_cmpngt_ps(__m128 __A, __m128 __B) { |
||
530 | return ((__m128)vec_cmple((__v4sf)__A, (__v4sf)__B)); |
||
531 | } |
||
532 | |||
533 | extern __inline __m128 |
||
534 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
535 | _mm_cmpnge_ps(__m128 __A, __m128 __B) { |
||
536 | return ((__m128)vec_cmplt((__v4sf)__A, (__v4sf)__B)); |
||
537 | } |
||
538 | |||
539 | extern __inline __m128 |
||
540 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
541 | _mm_cmpord_ps(__m128 __A, __m128 __B) { |
||
542 | __vector unsigned int __a, __b; |
||
543 | __vector unsigned int __c, __d; |
||
544 | static const __vector unsigned int __float_exp_mask = { |
||
545 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
||
546 | |||
547 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
||
548 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
||
549 | __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a); |
||
550 | __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b); |
||
551 | return ((__m128)vec_and(__c, __d)); |
||
552 | } |
||
553 | |||
554 | extern __inline __m128 |
||
555 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
556 | _mm_cmpunord_ps(__m128 __A, __m128 __B) { |
||
557 | __vector unsigned int __a, __b; |
||
558 | __vector unsigned int __c, __d; |
||
559 | static const __vector unsigned int __float_exp_mask = { |
||
560 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
||
561 | |||
562 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
||
563 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
||
564 | __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask); |
||
565 | __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask); |
||
566 | return ((__m128)vec_or(__c, __d)); |
||
567 | } |
||
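Both _mm_cmpord_ps and _mm_cmpunord_ps above rely on the same bit trick: a float is a NaN exactly when its magnitude bits exceed the exponent mask 0x7f800000. A minimal scalar sketch of that test (the is_nan_bits helper is hypothetical, not part of the header):

```c
/* Scalar illustration of the exponent-mask test: NaN means the biased
   exponent field is all ones and the mantissa is nonzero, i.e. the
   magnitude bits exceed 0x7f800000. Illustrative sketch only. */
#include <stdint.h>
#include <string.h>

static int is_nan_bits(float x) {
  uint32_t bits;
  memcpy(&bits, &x, sizeof bits);         /* well-defined type pun */
  return (bits & 0x7fffffffu) > 0x7f800000u;
}
```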
568 | |||
569 | /* Perform a comparison on the lower SPFP values of A and B. If the |
||
570 | comparison is true, place a mask of all ones in the result, otherwise a |
||
571 | mask of zeros. The upper three SPFP values are passed through from A. */ |
||
572 | extern __inline __m128 |
||
573 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
574 | _mm_cmpeq_ss(__m128 __A, __m128 __B) { |
||
575 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
576 | __v4sf __a, __b, __c; |
||
577 | /* PowerISA VMX does not allow partial (for just element 0) |
||
578 | * results. So to ensure we don't generate spurious exceptions |
||
579 | * (from the upper elements) we splat the lower float |
||
580 | * before we do the operation. */ |
||
581 | __a = vec_splat((__v4sf)__A, 0); |
||
582 | __b = vec_splat((__v4sf)__B, 0); |
||
583 | __c = (__v4sf)vec_cmpeq(__a, __b); |
||
584 | /* Then we merge the lower float result with the original upper |
||
585 | * float elements from __A. */ |
||
586 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
||
587 | } |
||
588 | |||
589 | extern __inline __m128 |
||
590 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
591 | _mm_cmplt_ss(__m128 __A, __m128 __B) { |
||
592 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
593 | __v4sf __a, __b, __c; |
||
594 | /* PowerISA VMX does not allow partial (for just element 0) |
||
595 | * results. So to ensure we don't generate spurious exceptions |
||
596 | * (from the upper elements) we splat the lower float |
||
597 | * before we do the operation. */ |
||
598 | __a = vec_splat((__v4sf)__A, 0); |
||
599 | __b = vec_splat((__v4sf)__B, 0); |
||
600 | __c = (__v4sf)vec_cmplt(__a, __b); |
||
601 | /* Then we merge the lower float result with the original upper |
||
602 | * float elements from __A. */ |
||
603 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
||
604 | } |
||
605 | |||
606 | extern __inline __m128 |
||
607 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
608 | _mm_cmple_ss(__m128 __A, __m128 __B) { |
||
609 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
610 | __v4sf __a, __b, __c; |
||
611 | /* PowerISA VMX does not allow partial (for just element 0) |
||
612 | * results. So to ensure we don't generate spurious exceptions |
||
613 | * (from the upper elements) we splat the lower float |
||
614 | * before we do the operation. */ |
||
615 | __a = vec_splat((__v4sf)__A, 0); |
||
616 | __b = vec_splat((__v4sf)__B, 0); |
||
617 | __c = (__v4sf)vec_cmple(__a, __b); |
||
618 | /* Then we merge the lower float result with the original upper |
||
619 | * float elements from __A. */ |
||
620 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
||
621 | } |
||
622 | |||
623 | extern __inline __m128 |
||
624 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
625 | _mm_cmpgt_ss(__m128 __A, __m128 __B) { |
||
626 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
627 | __v4sf __a, __b, __c; |
||
628 | /* PowerISA VMX does not allow partial (for just element 0) |
||
629 | * results. So to ensure we don't generate spurious exceptions |
||
630 | * (from the upper elements) we splat the lower float |
||
631 | * before we do the operation. */ |
||
632 | __a = vec_splat((__v4sf)__A, 0); |
||
633 | __b = vec_splat((__v4sf)__B, 0); |
||
634 | __c = (__v4sf)vec_cmpgt(__a, __b); |
||
635 | /* Then we merge the lower float result with the original upper |
||
636 | * float elements from __A. */ |
||
637 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
||
638 | } |
||
639 | |||
640 | extern __inline __m128 |
||
641 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
642 | _mm_cmpge_ss(__m128 __A, __m128 __B) { |
||
643 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
644 | __v4sf __a, __b, __c; |
||
645 | /* PowerISA VMX does not allow partial (for just element 0) |
||
646 | * results. So to ensure we don't generate spurious exceptions |
||
647 | * (from the upper elements) we splat the lower float |
||
648 | * before we do the operation. */ |
||
649 | __a = vec_splat((__v4sf)__A, 0); |
||
650 | __b = vec_splat((__v4sf)__B, 0); |
||
651 | __c = (__v4sf)vec_cmpge(__a, __b); |
||
652 | /* Then we merge the lower float result with the original upper |
||
653 | * float elements from __A. */ |
||
654 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
||
655 | } |
||
656 | |||
657 | extern __inline __m128 |
||
658 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
659 | _mm_cmpneq_ss(__m128 __A, __m128 __B) { |
||
660 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
661 | __v4sf __a, __b, __c; |
||
662 | /* PowerISA VMX does not allow partial (for just element 0) |
||
663 | * results. So to ensure we don't generate spurious exceptions |
||
664 | * (from the upper elements) we splat the lower float |
||
665 | * before we do the operation. */ |
||
666 | __a = vec_splat((__v4sf)__A, 0); |
||
667 | __b = vec_splat((__v4sf)__B, 0); |
||
668 | __c = (__v4sf)vec_cmpeq(__a, __b); |
||
669 | __c = vec_nor(__c, __c); |
||
670 | /* Then we merge the lower float result with the original upper |
||
671 | * float elements from __A. */ |
||
672 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
||
673 | } |
||
674 | |||
675 | extern __inline __m128 |
||
676 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
677 | _mm_cmpnlt_ss(__m128 __A, __m128 __B) { |
||
678 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
679 | __v4sf __a, __b, __c; |
||
680 | /* PowerISA VMX does not allow partial (for just element 0) |
||
681 | * results. So to ensure we don't generate spurious exceptions |
||
682 | * (from the upper elements) we splat the lower float |
||
683 | * before we do the operation. */ |
||
684 | __a = vec_splat((__v4sf)__A, 0); |
||
685 | __b = vec_splat((__v4sf)__B, 0); |
||
686 | __c = (__v4sf)vec_cmpge(__a, __b); |
||
687 | /* Then we merge the lower float result with the original upper |
||
688 | * float elements from __A. */ |
||
689 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
||
690 | } |
||
691 | |||
692 | extern __inline __m128 |
||
693 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
694 | _mm_cmpnle_ss(__m128 __A, __m128 __B) { |
||
695 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
696 | __v4sf __a, __b, __c; |
||
697 | /* PowerISA VMX does not allow partial (for just element 0) |
||
698 | * results. So to ensure we don't generate spurious exceptions |
||
699 | * (from the upper elements) we splat the lower float |
||
700 | * before we do the operation. */ |
||
701 | __a = vec_splat((__v4sf)__A, 0); |
||
702 | __b = vec_splat((__v4sf)__B, 0); |
||
703 | __c = (__v4sf)vec_cmpgt(__a, __b); |
||
704 | /* Then we merge the lower float result with the original upper |
||
705 | * float elements from __A. */ |
||
706 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
||
707 | } |
||
708 | |||
709 | extern __inline __m128 |
||
710 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
711 | _mm_cmpngt_ss(__m128 __A, __m128 __B) { |
||
712 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
713 | __v4sf __a, __b, __c; |
||
714 | /* PowerISA VMX does not allow partial (for just element 0) |
||
715 | * results. So to ensure we don't generate spurious exceptions |
||
716 | * (from the upper elements) we splat the lower float |
||
717 | * before we do the operation. */ |
||
718 | __a = vec_splat((__v4sf)__A, 0); |
||
719 | __b = vec_splat((__v4sf)__B, 0); |
||
720 | __c = (__v4sf)vec_cmple(__a, __b); |
||
721 | /* Then we merge the lower float result with the original upper |
||
722 | * float elements from __A. */ |
||
723 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
||
724 | } |
||
725 | |||
726 | extern __inline __m128 |
||
727 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
728 | _mm_cmpnge_ss(__m128 __A, __m128 __B) { |
||
729 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
730 | __v4sf __a, __b, __c; |
||
731 | /* PowerISA VMX does not allow partial (for just element 0) |
||
732 | * results. So to ensure we don't generate spurious exceptions |
||
733 | * (from the upper elements) we splat the lower float |
||
734 | * before we do the operation. */ |
||
735 | __a = vec_splat((__v4sf)__A, 0); |
||
736 | __b = vec_splat((__v4sf)__B, 0); |
||
737 | __c = (__v4sf)vec_cmplt(__a, __b); |
||
738 | /* Then we merge the lower float result with the original upper |
||
739 | * float elements from __A. */ |
||
740 | return ((__m128)vec_sel((__v4sf)__A, __c, __mask)); |
||
741 | } |
||
742 | |||
743 | extern __inline __m128 |
||
744 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
745 | _mm_cmpord_ss(__m128 __A, __m128 __B) { |
||
746 | __vector unsigned int __a, __b; |
||
747 | __vector unsigned int __c, __d; |
||
748 | static const __vector unsigned int __float_exp_mask = { |
||
749 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
||
750 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
751 | |||
752 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
||
753 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
||
754 | __c = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __a); |
||
755 | __d = (__vector unsigned int)vec_cmpgt(__float_exp_mask, __b); |
||
756 | __c = vec_and(__c, __d); |
||
757 | /* Then we merge the lower float result with the original upper |
||
758 | * float elements from __A. */ |
||
759 | return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask)); |
||
760 | } |
||
761 | |||
762 | extern __inline __m128 |
||
763 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
764 | _mm_cmpunord_ss(__m128 __A, __m128 __B) { |
||
765 | __vector unsigned int __a, __b; |
||
766 | __vector unsigned int __c, __d; |
||
767 | static const __vector unsigned int __float_exp_mask = { |
||
768 | 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000}; |
||
769 | static const __vector unsigned int __mask = {0xffffffff, 0, 0, 0}; |
||
770 | |||
771 | __a = (__vector unsigned int)vec_abs((__v4sf)__A); |
||
772 | __b = (__vector unsigned int)vec_abs((__v4sf)__B); |
||
773 | __c = (__vector unsigned int)vec_cmpgt(__a, __float_exp_mask); |
||
774 | __d = (__vector unsigned int)vec_cmpgt(__b, __float_exp_mask); |
||
775 | __c = vec_or(__c, __d); |
||
776 | /* Then we merge the lower float result with the original upper |
||
777 | * float elements from __A. */ |
||
778 | return ((__m128)vec_sel((__v4sf)__A, (__v4sf)__c, __mask)); |
||
779 | } |
||
780 | |||
781 | /* Compare the lower SPFP values of A and B and return 1 if true |
||
782 | and 0 if false. */ |
||
783 | extern __inline int |
||
784 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
785 | _mm_comieq_ss(__m128 __A, __m128 __B) { |
||
786 | return (__A[0] == __B[0]); |
||
787 | } |
||
788 | |||
789 | extern __inline int |
||
790 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
791 | _mm_comilt_ss(__m128 __A, __m128 __B) { |
||
792 | return (__A[0] < __B[0]); |
||
793 | } |
||
794 | |||
795 | extern __inline int |
||
796 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
797 | _mm_comile_ss(__m128 __A, __m128 __B) { |
||
798 | return (__A[0] <= __B[0]); |
||
799 | } |
||
800 | |||
801 | extern __inline int |
||
802 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
803 | _mm_comigt_ss(__m128 __A, __m128 __B) { |
||
804 | return (__A[0] > __B[0]); |
||
805 | } |
||
806 | |||
807 | extern __inline int |
||
808 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
809 | _mm_comige_ss(__m128 __A, __m128 __B) { |
||
810 | return (__A[0] >= __B[0]); |
||
811 | } |
||
812 | |||
813 | extern __inline int |
||
814 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
815 | _mm_comineq_ss(__m128 __A, __m128 __B) { |
||
816 | return (__A[0] != __B[0]); |
||
817 | } |
||
818 | |||
819 | /* FIXME |
||
820 | * The _mm_ucomi??_ss implementations below are exactly the same as |
||
821 | * _mm_comi??_ss because GCC for PowerPC only generates unordered |
||
822 | * compares (scalar and vector). |
||
823 | * Technically _mm_comieq_ss et al. should be using the ordered |
||
824 | * compare and signal for QNaNs. |
||
825 | * The _mm_ucomieq_ss et al. should be OK, as is. |
||
826 | */ |
||
827 | extern __inline int |
||
828 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
829 | _mm_ucomieq_ss(__m128 __A, __m128 __B) { |
||
830 | return (__A[0] == __B[0]); |
||
831 | } |
||
832 | |||
833 | extern __inline int |
||
834 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
835 | _mm_ucomilt_ss(__m128 __A, __m128 __B) { |
||
836 | return (__A[0] < __B[0]); |
||
837 | } |
||
838 | |||
839 | extern __inline int |
||
840 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
841 | _mm_ucomile_ss(__m128 __A, __m128 __B) { |
||
842 | return (__A[0] <= __B[0]); |
||
843 | } |
||
844 | |||
845 | extern __inline int |
||
846 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
847 | _mm_ucomigt_ss(__m128 __A, __m128 __B) { |
||
848 | return (__A[0] > __B[0]); |
||
849 | } |
||
850 | |||
851 | extern __inline int |
||
852 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
853 | _mm_ucomige_ss(__m128 __A, __m128 __B) { |
||
854 | return (__A[0] >= __B[0]); |
||
855 | } |
||
856 | |||
857 | extern __inline int |
||
858 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
859 | _mm_ucomineq_ss(__m128 __A, __m128 __B) { |
||
860 | return (__A[0] != __B[0]); |
||
861 | } |
||
862 | |||
863 | extern __inline float |
||
864 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
865 | _mm_cvtss_f32(__m128 __A) { |
||
866 | return ((__v4sf)__A)[0]; |
||
867 | } |
||
868 | |||
869 | /* Convert the lower SPFP value to a 32-bit integer according to the current |
||
870 | rounding mode. */ |
||
871 | extern __inline int |
||
872 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
873 | _mm_cvtss_si32(__m128 __A) { |
||
874 | int __res; |
||
875 | #ifdef _ARCH_PWR8 |
||
876 | double __dtmp; |
||
877 | __asm__( |
||
878 | #ifdef __LITTLE_ENDIAN__ |
||
879 | "xxsldwi %x0,%x0,%x0,3;\n" |
||
880 | #endif |
||
881 | "xscvspdp %x2,%x0;\n" |
||
882 | "fctiw %2,%2;\n" |
||
883 | "mfvsrd %1,%x2;\n" |
||
884 | : "+wa"(__A), "=r"(__res), "=f"(__dtmp) |
||
885 | :); |
||
886 | #else |
||
887 | __res = __builtin_rint(__A[0]); |
||
888 | #endif |
||
889 | return __res; |
||
890 | } |
||
891 | |||
892 | extern __inline int |
||
893 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
894 | _mm_cvt_ss2si(__m128 __A) { |
||
895 | return _mm_cvtss_si32(__A); |
||
896 | } |
||
897 | |||
898 | /* Convert the lower SPFP value to a 64-bit integer according to the |
||
899 | current rounding mode. */ |
||
900 | |||
901 | /* Intel intrinsic. */ |
||
902 | extern __inline long long |
||
903 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
904 | _mm_cvtss_si64(__m128 __A) { |
||
905 | long long __res; |
||
906 | #if defined(_ARCH_PWR8) && defined(__powerpc64__) |
||
907 | double __dtmp; |
||
908 | __asm__( |
||
909 | #ifdef __LITTLE_ENDIAN__ |
||
910 | "xxsldwi %x0,%x0,%x0,3;\n" |
||
911 | #endif |
||
912 | "xscvspdp %x2,%x0;\n" |
||
913 | "fctid %2,%2;\n" |
||
914 | "mfvsrd %1,%x2;\n" |
||
915 | : "+wa"(__A), "=r"(__res), "=f"(__dtmp) |
||
916 | :); |
||
917 | #else |
||
918 | __res = __builtin_llrint(__A[0]); |
||
919 | #endif |
||
920 | return __res; |
||
921 | } |
||
922 | |||
923 | /* Microsoft intrinsic. */ |
||
924 | extern __inline long long |
||
925 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
926 | _mm_cvtss_si64x(__m128 __A) { |
||
927 | return _mm_cvtss_si64((__v4sf)__A); |
||
928 | } |
||
929 | |||
930 | /* Constants for use with _mm_prefetch. */ |
||
931 | enum _mm_hint { |
||
932 | /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */ |
||
933 | _MM_HINT_ET0 = 7, |
||
934 | _MM_HINT_ET1 = 6, |
||
935 | _MM_HINT_T0 = 3, |
||
936 | _MM_HINT_T1 = 2, |
||
937 | _MM_HINT_T2 = 1, |
||
938 | _MM_HINT_NTA = 0 |
||
939 | }; |
||
940 | |||
941 | /* Loads one cache line from address P to a location "closer" to the |
||
942 | processor. The selector I specifies the type of prefetch operation. */ |
||
943 | extern __inline void |
||
944 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
945 | _mm_prefetch(const void *__P, enum _mm_hint __I) { |
||
946 | /* Current PowerPC implementations ignore the hint parameter. */ |
||
947 | __builtin_prefetch(__P); |
||
948 | } |
||
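Although the hint is ignored here, _mm_prefetch still matters for source compatibility with x86 code; a sketch of a typical pattern (the sum_with_prefetch helper and the 64-element look-ahead distance are illustrative choices, and the example assumes the header is reachable as <xmmintrin.h> on a supported PowerPC target):

```c
/* Sketch of a typical prefetch pattern; the hint is accepted for API
   compatibility even though current PowerPC ignores it. */
#define NO_WARN_X86_INTRINSICS 1
#include <xmmintrin.h>

static float sum_with_prefetch(const float *data, int n) {
  float total = 0.0f;
  for (int i = 0; i < n; i++) {
    if (i + 64 < n)
      _mm_prefetch(&data[i + 64], _MM_HINT_T0); /* hint ahead of use */
    total += data[i];
  }
  return total;
}
```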
949 | |||
950 | /* Convert the two lower SPFP values to 32-bit integers according to the |
||
951 | current rounding mode. Return the integers in packed form. */ |
||
952 | extern __inline __m64 |
||
953 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
954 | _mm_cvtps_pi32(__m128 __A) { |
||
956 | __v4sf __temp, __rounded; |
||
957 | __vector unsigned long long __result; |
||
958 | |||
959 | /* Splat two lower SPFP values to both halves. */ |
||
960 | __temp = (__v4sf)vec_splat((__vector long long)__A, 0); |
||
961 | __rounded = vec_rint(__temp); |
||
962 | __result = (__vector unsigned long long)vec_cts(__rounded, 0); |
||
963 | |||
964 | return (__m64)((__vector long long)__result)[0]; |
||
965 | } |
||
966 | |||
967 | extern __inline __m64 |
||
968 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
969 | _mm_cvt_ps2pi(__m128 __A) { |
||
970 | return _mm_cvtps_pi32(__A); |
||
971 | } |
||
972 | |||
973 | /* Truncate the lower SPFP value to a 32-bit integer. */ |
||
974 | extern __inline int |
||
975 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
976 | _mm_cvttss_si32(__m128 __A) { |
||
977 | /* Extract the lower float element. */ |
||
978 | float __temp = __A[0]; |
||
979 | /* truncate to 32-bit integer and return. */ |
||
980 | return __temp; |
||
981 | } |
||
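A short usage sketch contrasting _mm_cvtss_si32 (current rounding mode) with _mm_cvttss_si32 (truncation), assuming the header is available as <xmmintrin.h> on the target and the default round-to-nearest mode is in effect:

```c
/* Usage sketch: rounding-mode conversion vs. truncation. Assumes the
   default round-to-nearest mode. */
#define NO_WARN_X86_INTRINSICS 1
#include <assert.h>
#include <xmmintrin.h>

int main(void) {
  __m128 x = _mm_set_ss(2.7f);
  assert(_mm_cvtss_si32(x) == 3);  /* rounded per the current mode */
  assert(_mm_cvttss_si32(x) == 2); /* truncated toward zero */
  return 0;
}
```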
982 | |||
983 | extern __inline int |
||
984 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
985 | _mm_cvtt_ss2si(__m128 __A) { |
||
986 | return _mm_cvttss_si32(__A); |
||
987 | } |
||
988 | |||
989 | /* Intel intrinsic. */ |
||
990 | extern __inline long long |
||
991 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
992 | _mm_cvttss_si64(__m128 __A) { |
||
993 | /* Extract the lower float element. */ |
||
994 | float __temp = __A[0]; |
||
995 | /* truncate to 64-bit integer and return. */ |
||
996 | return __temp; |
||
997 | } |
||
998 | |||
999 | /* Microsoft intrinsic. */ |
||
1000 | extern __inline long long |
||
1001 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1002 | _mm_cvttss_si64x(__m128 __A) { |
||
1003 | /* Extract the lower float element. */ |
||
1004 | float __temp = __A[0]; |
||
1005 | /* truncate to 32-bit integer and return. */ |
||
1006 | return __temp; |
||
1007 | } |
||
1008 | |||
1009 | /* Truncate the two lower SPFP values to 32-bit integers. Return the |
||
1010 | integers in packed form. */ |
||
1011 | extern __inline __m64 |
||
1012 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1013 | _mm_cvttps_pi32(__m128 __A) { |
||
1014 | __v4sf __temp; |
||
1015 | __vector unsigned long long __result; |
||
1016 | |||
1017 | /* Splat two lower SPFP values to both halves. */ |
||
1018 | __temp = (__v4sf)vec_splat((__vector long long)__A, 0); |
||
1019 | __result = (__vector unsigned long long)vec_cts(__temp, 0); |
||
1020 | |||
1021 | return (__m64)((__vector long long)__result)[0]; |
||
1022 | } |
||
1023 | |||
1024 | extern __inline __m64 |
||
1025 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1026 | _mm_cvtt_ps2pi(__m128 __A) { |
||
1027 | return _mm_cvttps_pi32(__A); |
||
1028 | } |
||
1029 | |||
1030 | /* Convert B to a SPFP value and insert it as element zero in A. */ |
||
1031 | extern __inline __m128 |
||
1032 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1033 | _mm_cvtsi32_ss(__m128 __A, int __B) { |
||
1034 | float __temp = __B; |
||
1035 | __A[0] = __temp; |
||
1036 | |||
1037 | return __A; |
||
1038 | } |
||
1039 | |||
1040 | extern __inline __m128 |
||
1041 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1042 | _mm_cvt_si2ss(__m128 __A, int __B) { |
||
1043 | return _mm_cvtsi32_ss(__A, __B); |
||
1044 | } |
||
1045 | |||
1046 | /* Convert B to a SPFP value and insert it as element zero in A. */ |
||
1047 | /* Intel intrinsic. */ |
||
1048 | extern __inline __m128 |
||
1049 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1050 | _mm_cvtsi64_ss(__m128 __A, long long __B) { |
||
1051 | float __temp = __B; |
||
1052 | __A[0] = __temp; |
||
1053 | |||
1054 | return __A; |
||
1055 | } |
||
1056 | |||
1057 | /* Microsoft intrinsic. */ |
||
1058 | extern __inline __m128 |
||
1059 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1060 | _mm_cvtsi64x_ss(__m128 __A, long long __B) { |
||
1061 | return _mm_cvtsi64_ss(__A, __B); |
||
1062 | } |
||
1063 | |||
1064 | /* Convert the two 32-bit values in B to SPFP form and insert them |
||
1065 | as the two lower elements in A. */ |
||
1066 | extern __inline __m128 |
||
1067 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1068 | _mm_cvtpi32_ps(__m128 __A, __m64 __B) { |
||
1069 | __vector signed int __vm1; |
||
1070 | __vector float __vf1; |
||
1071 | |||
1072 | __vm1 = (__vector signed int)(__vector unsigned long long){__B, __B}; |
||
1073 | __vf1 = (__vector float)vec_ctf(__vm1, 0); |
||
1074 | |||
1075 | return ((__m128)(__vector unsigned long long){ |
||
1076 | ((__vector unsigned long long)__vf1)[0], |
||
1077 | ((__vector unsigned long long)__A)[1]}); |
||
1078 | } |
||
1079 | |||
1080 | extern __inline __m128 |
||
1081 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1082 | _mm_cvt_pi2ps(__m128 __A, __m64 __B) { |
||
1083 | return _mm_cvtpi32_ps(__A, __B); |
||
1084 | } |
||
1085 | |||
1086 | /* Convert the four signed 16-bit values in A to SPFP form. */ |
||
1087 | extern __inline __m128 |
||
1088 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1089 | _mm_cvtpi16_ps(__m64 __A) { |
||
1090 | __vector signed short __vs8; |
||
1091 | __vector signed int __vi4; |
||
1092 | __vector float __vf1; |
||
1093 | |||
1094 | __vs8 = (__vector signed short)(__vector unsigned long long){__A, __A}; |
||
1095 | __vi4 = vec_vupklsh(__vs8); |
||
1096 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
||
1097 | |||
1098 | return (__m128)__vf1; |
||
1099 | } |
||
1100 | |||
1101 | /* Convert the four unsigned 16-bit values in A to SPFP form. */ |
||
1102 | extern __inline __m128 |
||
1103 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1104 | _mm_cvtpu16_ps(__m64 __A) { |
||
1105 | const __vector unsigned short __zero = {0, 0, 0, 0, 0, 0, 0, 0}; |
||
1106 | __vector unsigned short __vs8; |
||
1107 | __vector unsigned int __vi4; |
||
1108 | __vector float __vf1; |
||
1109 | |||
1110 | __vs8 = (__vector unsigned short)(__vector unsigned long long){__A, __A}; |
||
1111 | __vi4 = (__vector unsigned int)vec_mergel |
||
1112 | #ifdef __LITTLE_ENDIAN__ |
||
1113 | (__vs8, __zero); |
||
1114 | #else |
||
1115 | (__zero, __vs8); |
||
1116 | #endif |
||
1117 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
||
1118 | |||
1119 | return (__m128)__vf1; |
||
1120 | } |
||
1121 | |||
1122 | /* Convert the low four signed 8-bit values in A to SPFP form. */ |
||
1123 | extern __inline __m128 |
||
1124 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1125 | _mm_cvtpi8_ps(__m64 __A) { |
||
1126 | __vector signed char __vc16; |
||
1127 | __vector signed short __vs8; |
||
1128 | __vector signed int __vi4; |
||
1129 | __vector float __vf1; |
||
1130 | |||
1131 | __vc16 = (__vector signed char)(__vector unsigned long long){__A, __A}; |
||
1132 | __vs8 = vec_vupkhsb(__vc16); |
||
1133 | __vi4 = vec_vupkhsh(__vs8); |
||
1134 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
||
1135 | |||
1136 | return (__m128)__vf1; |
||
1137 | } |
||
1138 | |||
1139 | /* Convert the low four unsigned 8-bit values in A to SPFP form. */ |
||
1140 | extern __inline __m128 |
||
1141 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1143 | _mm_cvtpu8_ps(__m64 __A) { |
||
1144 | const __vector unsigned char __zero = {0, 0, 0, 0, 0, 0, 0, 0}; |
||
1145 | __vector unsigned char __vc16; |
||
1146 | __vector unsigned short __vs8; |
||
1147 | __vector unsigned int __vi4; |
||
1148 | __vector float __vf1; |
||
1149 | |||
1150 | __vc16 = (__vector unsigned char)(__vector unsigned long long){__A, __A}; |
||
1151 | #ifdef __LITTLE_ENDIAN__ |
||
1152 | __vs8 = (__vector unsigned short)vec_mergel(__vc16, __zero); |
||
1153 | __vi4 = |
||
1154 | (__vector unsigned int)vec_mergeh(__vs8, (__vector unsigned short)__zero); |
||
1155 | #else |
||
1156 | __vs8 = (__vector unsigned short)vec_mergel(__zero, __vc16); |
||
1157 | __vi4 = |
||
1158 | (__vector unsigned int)vec_mergeh((__vector unsigned short)__zero, __vs8); |
||
1159 | #endif |
||
1160 | __vf1 = (__vector float)vec_ctf(__vi4, 0); |
||
1161 | |||
1162 | return (__m128)__vf1; |
||
1163 | } |
||
1164 | |||
1165 | /* Convert the four signed 32-bit values in A and B to SPFP form. */ |
||
1166 | extern __inline __m128 |
||
1167 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1168 | _mm_cvtpi32x2_ps(__m64 __A, __m64 __B) { |
||
1169 | __vector signed int __vi4; |
||
1170 | __vector float __vf4; |
||
1171 | |||
1172 | __vi4 = (__vector signed int)(__vector unsigned long long){__A, __B}; |
||
1173 | __vf4 = (__vector float)vec_ctf(__vi4, 0); |
||
1174 | return (__m128)__vf4; |
||
1175 | } |
||
1176 | |||
1177 | /* Convert the four SPFP values in A to four signed 16-bit integers. */ |
||
1178 | extern __inline __m64 |
||
1179 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1180 | _mm_cvtps_pi16(__m128 __A) { |
||
1181 | __v4sf __rounded; |
||
1182 | __vector signed int __temp; |
||
1183 | __vector unsigned long long __result; |
||
1184 | |||
1185 | __rounded = vec_rint(__A); |
||
1186 | __temp = vec_cts(__rounded, 0); |
||
1187 | __result = (__vector unsigned long long)vec_pack(__temp, __temp); |
||
1188 | |||
1189 | return (__m64)((__vector long long)__result)[0]; |
||
1190 | } |
||
1191 | |||
1192 | /* Convert the four SPFP values in A to four signed 8-bit integers. */ |
||
1193 | extern __inline __m64 |
||
1194 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1195 | _mm_cvtps_pi8(__m128 __A) { |
||
1196 | __v4sf __rounded; |
||
1197 | __vector signed int __tmp_i; |
||
1198 | static const __vector signed int __zero = {0, 0, 0, 0}; |
||
1199 | __vector signed short __tmp_s; |
||
1200 | __vector signed char __res_v; |
||
1201 | |||
1202 | __rounded = vec_rint(__A); |
||
1203 | __tmp_i = vec_cts(__rounded, 0); |
||
1204 | __tmp_s = vec_pack(__tmp_i, __zero); |
||
1205 | __res_v = vec_pack(__tmp_s, __tmp_s); |
||
1206 | return (__m64)((__vector long long)__res_v)[0]; |
||
1207 | } |
||
1208 | |||
1209 | /* Selects four specific SPFP values from A and B based on MASK. */ |
||
1210 | extern __inline __m128 |
||
1211 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1213 | _mm_shuffle_ps(__m128 __A, __m128 __B, int const __mask) { |
||
1214 | unsigned long __element_selector_10 = __mask & 0x03; |
||
1215 | unsigned long __element_selector_32 = (__mask >> 2) & 0x03; |
||
1216 | unsigned long __element_selector_54 = (__mask >> 4) & 0x03; |
||
1217 | unsigned long __element_selector_76 = (__mask >> 6) & 0x03; |
||
1218 | static const unsigned int __permute_selectors[4] = { |
||
1219 | #ifdef __LITTLE_ENDIAN__ |
||
1220 | 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C |
||
1221 | #else |
||
1222 | 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F |
||
1223 | #endif |
||
1224 | }; |
||
1225 | __vector unsigned int __t; |
||
1226 | |||
1227 | __t[0] = __permute_selectors[__element_selector_10]; |
||
1228 | __t[1] = __permute_selectors[__element_selector_32]; |
||
1229 | __t[2] = __permute_selectors[__element_selector_54] + 0x10101010; |
||
1230 | __t[3] = __permute_selectors[__element_selector_76] + 0x10101010; |
||
1231 | return vec_perm((__v4sf)__A, (__v4sf)__B, (__vector unsigned char)__t); |
||
1232 | } |
||
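As a usage sketch of the selector logic above: result elements 0-1 are picked from __A and elements 2-3 from __B, each by one 2-bit field of the mask (the function name and values below are illustrative, not part of the header):

```c
/* Usage sketch: lanes 0-1 of the result are selected from a, lanes 2-3
   from b, each by one 2-bit field of the mask. Illustrative only. */
#define NO_WARN_X86_INTRINSICS 1
#include <xmmintrin.h>

static __m128 reverse_a_low_b_high(__m128 a, __m128 b) {
  /* Result = { a[3], a[2], b[1], b[0] } in element order 0..3. */
  return _mm_shuffle_ps(a, b, _MM_SHUFFLE(0, 1, 2, 3));
}
```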
1233 | |||
1234 | /* Selects and interleaves the upper two SPFP values from A and B. */ |
||
1235 | extern __inline __m128 |
||
1236 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1237 | _mm_unpackhi_ps(__m128 __A, __m128 __B) { |
||
1238 | return (__m128)vec_vmrglw((__v4sf)__A, (__v4sf)__B); |
||
1239 | } |
||
1240 | |||
1241 | /* Selects and interleaves the lower two SPFP values from A and B. */ |
||
1242 | extern __inline __m128 |
||
1243 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1244 | _mm_unpacklo_ps(__m128 __A, __m128 __B) { |
||
1245 | return (__m128)vec_vmrghw((__v4sf)__A, (__v4sf)__B); |
||
1246 | } |
||
1247 | |||
1248 | /* Sets the upper two SPFP values with 64-bits of data loaded from P; |
||
1249 | the lower two values are passed through from A. */ |
||
1250 | extern __inline __m128 |
||
1251 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1252 | _mm_loadh_pi(__m128 __A, __m64 const *__P) { |
||
1253 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
||
1254 | __vector unsigned long long __p = vec_splats(*__P); |
||
1255 | __a[1] = __p[1]; |
||
1256 | |||
1257 | return (__m128)__a; |
||
1258 | } |
||
1259 | |||
1260 | /* Stores the upper two SPFP values of A into P. */ |
||
1261 | extern __inline void |
||
1262 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1263 | _mm_storeh_pi(__m64 *__P, __m128 __A) { |
||
1264 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
||
1265 | |||
1266 | *__P = __a[1]; |
||
1267 | } |
||
1268 | |||
1269 | /* Moves the upper two values of B into the lower two values of A. */ |
||
1270 | extern __inline __m128 |
||
1271 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1272 | _mm_movehl_ps(__m128 __A, __m128 __B) { |
||
1273 | return (__m128)vec_mergel((__vector unsigned long long)__B, |
||
1274 | (__vector unsigned long long)__A); |
||
1275 | } |
||
1276 | |||
1277 | /* Moves the lower two values of B into the upper two values of A. */ |
||
1278 | extern __inline __m128 |
||
1279 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1280 | _mm_movelh_ps(__m128 __A, __m128 __B) { |
||
1281 | return (__m128)vec_mergeh((__vector unsigned long long)__A, |
||
1282 | (__vector unsigned long long)__B); |
||
1283 | } |
||
1284 | |||
1285 | /* Sets the lower two SPFP values with 64-bits of data loaded from P; |
||
1286 | the upper two values are passed through from A. */ |
||
1287 | extern __inline __m128 |
||
1288 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1289 | _mm_loadl_pi(__m128 __A, __m64 const *__P) { |
||
1290 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
||
1291 | __vector unsigned long long __p = vec_splats(*__P); |
||
1292 | __a[0] = __p[0]; |
||
1293 | |||
1294 | return (__m128)__a; |
||
1295 | } |
||
1296 | |||
1297 | /* Stores the lower two SPFP values of A into P. */ |
||
1298 | extern __inline void |
||
1299 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1300 | _mm_storel_pi(__m64 *__P, __m128 __A) { |
||
1301 | __vector unsigned long long __a = (__vector unsigned long long)__A; |
||
1302 | |||
1303 | *__P = __a[0]; |
||
1304 | } |
||
1305 | |||
1306 | #ifdef _ARCH_PWR8 |
||
1307 | /* Intrinsic functions that require PowerISA 2.07 minimum. */ |
||
1308 | |||
1309 | /* Creates a 4-bit mask from the most significant bits of the SPFP values. */ |
||
1310 | extern __inline int |
||
1311 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1312 | _mm_movemask_ps(__m128 __A) { |
||
1313 | #ifdef _ARCH_PWR10 |
||
1314 | return vec_extractm((__vector unsigned int)__A); |
||
1315 | #else |
||
1316 | __vector unsigned long long __result; |
||
1317 | static const __vector unsigned int __perm_mask = { |
||
1318 | #ifdef __LITTLE_ENDIAN__ |
||
1319 | 0x00204060, 0x80808080, 0x80808080, 0x80808080 |
||
1320 | #else |
||
1321 | 0x80808080, 0x80808080, 0x80808080, 0x00204060 |
||
1322 | #endif |
||
1323 | }; |
||
1324 | |||
1325 | __result = ((__vector unsigned long long)vec_vbpermq( |
||
1326 | (__vector unsigned char)__A, (__vector unsigned char)__perm_mask)); |
||
1327 | |||
1328 | #ifdef __LITTLE_ENDIAN__ |
||
1329 | return __result[1]; |
||
1330 | #else |
||
1331 | return __result[0]; |
||
1332 | #endif |
||
1333 | #endif /* !_ARCH_PWR10 */ |
||
1334 | } |
||
1335 | #endif /* _ARCH_PWR8 */ |
||
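#ifdef _ARCH_PWR8
/* Added usage sketch (illustrative only; example_any_negative is not part of
   this header): _mm_movemask_ps packs the four sign bits into the low four
   bits of the result, bit i taken from lane i. */
static __inline int example_any_negative(__m128 v) {
  /* Non-zero iff at least one lane of v has its sign bit set. */
  return _mm_movemask_ps(v) != 0;
}
#endif /* _ARCH_PWR8 */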
1336 | |||
1337 | /* Create a vector with all four elements equal to *P. */ |
||
1338 | extern __inline __m128 |
||
1339 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1340 | _mm_load1_ps(float const *__P) { |
||
1341 | return _mm_set1_ps(*__P); |
||
1342 | } |
||
1343 | |||
1344 | extern __inline __m128 |
||
1345 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1346 | _mm_load_ps1(float const *__P) { |
||
1347 | return _mm_load1_ps(__P); |
||
1348 | } |
||
1349 | |||
1350 | /* Extracts one of the four words of A. The selector N must be immediate. */ |
||
1351 | extern __inline int |
||
1352 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1353 | _mm_extract_pi16(__m64 const __A, int const __N) { |
||
1354 | unsigned int __shiftr = __N & 3; |
||
1355 | #ifdef __BIG_ENDIAN__ |
||
1356 | __shiftr = 3 - __shiftr; |
||
1357 | #endif |
||
1358 | |||
1359 | return ((__A >> (__shiftr * 16)) & 0xffff); |
||
1360 | } |
||
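/* Added usage sketch (illustrative only; example_extract is not part of this
   header): the selector is masked to 0..3 and, on big-endian targets,
   mirrored by the adjustment above.  On little-endian, field 0 is the low
   16 bits of the __m64. */
static __inline int example_extract(__m64 v) {
  return _mm_extract_pi16(v, 2); /* third 16-bit field, zero-extended */
}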
1361 | |||
1362 | extern __inline int |
||
1363 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1364 | _m_pextrw(__m64 const __A, int const __N) { |
||
1365 | return _mm_extract_pi16(__A, __N); |
||
1366 | } |
||
1367 | |||
1368 | /* Inserts word D into one of four words of A. The selector N must be |
||
1369 | immediate. */ |
||
1370 | extern __inline __m64 |
||
1371 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1372 | _mm_insert_pi16(__m64 const __A, int const __D, int const __N) { |
||
1373 | const int __shiftl = (__N & 3) * 16; |
||
1374 | const __m64 __shiftD = (const __m64)__D << __shiftl; |
||
1375 | const __m64 __mask = 0xffffUL << __shiftl; |
||
1376 | __m64 __result = (__A & (~__mask)) | (__shiftD & __mask); |
||
1377 | |||
1378 | return __result; |
||
1379 | } |
||
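/* Added usage sketch (illustrative only; example_insert is not part of this
   header): replace one 16-bit field while preserving the other three. */
static __inline __m64 example_insert(__m64 v) {
  return _mm_insert_pi16(v, 0x7fff, 1); /* overwrite field 1 with 0x7fff */
}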
1380 | |||
1381 | extern __inline __m64 |
||
1382 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1383 | _m_pinsrw(__m64 const __A, int const __D, int const __N) { |
||
1384 | return _mm_insert_pi16(__A, __D, __N); |
||
1385 | } |
||
1386 | |||
1387 | /* Compute the element-wise maximum of signed 16-bit values. */ |
||
1388 | extern __inline __m64 |
||
1389 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1390 | |||
1391 | _mm_max_pi16(__m64 __A, __m64 __B) { |
||
1392 | #if _ARCH_PWR8 |
||
1393 | __vector signed short __a, __b, __r; |
||
1394 | __vector __bool short __c; |
||
1395 | |||
1396 | __a = (__vector signed short)vec_splats(__A); |
||
1397 | __b = (__vector signed short)vec_splats(__B); |
||
1398 | __c = (__vector __bool short)vec_cmpgt(__a, __b); |
||
1399 | __r = vec_sel(__b, __a, __c); |
||
1400 | return (__m64)((__vector long long)__r)[0]; |
||
1401 | #else |
||
1402 | __m64_union __m1, __m2, __res; |
||
1403 | |||
1404 | __m1.as_m64 = __A; |
||
1405 | __m2.as_m64 = __B; |
||
1406 | |||
1407 | __res.as_short[0] = (__m1.as_short[0] > __m2.as_short[0]) ? __m1.as_short[0] |
||
1408 | : __m2.as_short[0]; |
||
1409 | __res.as_short[1] = (__m1.as_short[1] > __m2.as_short[1]) ? __m1.as_short[1] |
||
1410 | : __m2.as_short[1]; |
||
1411 | __res.as_short[2] = (__m1.as_short[2] > __m2.as_short[2]) ? __m1.as_short[2] |
||
1412 | : __m2.as_short[2]; |
||
1413 | __res.as_short[3] = (__m1.as_short[3] > __m2.as_short[3]) ? __m1.as_short[3] |
||
1414 | : __m2.as_short[3]; |
||
1415 | |||
1416 | return (__m64)__res.as_m64; |
||
1417 | #endif |
||
1418 | } |
||
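/* Added sketch (illustrative only; example_vmax is not part of this header)
   of the selection idiom used by the _ARCH_PWR8 path above: vec_sel(b, a, c)
   takes bits from a where the mask c is set and from b elsewhere, so a
   greater-than compare mask yields an element-wise maximum. */
static __inline __vector signed short
example_vmax(__vector signed short a, __vector signed short b) {
  __vector __bool short gt = vec_cmpgt(a, b); /* all-ones where a > b */
  return vec_sel(b, a, gt);                   /* a where a > b, else b */
}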
1419 | |||
1420 | extern __inline __m64 |
||
1421 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1422 | _m_pmaxsw(__m64 __A, __m64 __B) { |
||
1423 | return _mm_max_pi16(__A, __B); |
||
1424 | } |
||
1425 | |||
1426 | /* Compute the element-wise maximum of unsigned 8-bit values. */ |
||
1427 | extern __inline __m64 |
||
1428 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1429 | _mm_max_pu8(__m64 __A, __m64 __B) { |
||
1430 | #if _ARCH_PWR8 |
||
1431 | __vector unsigned char __a, __b, __r; |
||
1432 | __vector __bool char __c; |
||
1433 | |||
1434 | __a = (__vector unsigned char)vec_splats(__A); |
||
1435 | __b = (__vector unsigned char)vec_splats(__B); |
||
1436 | __c = (__vector __bool char)vec_cmpgt(__a, __b); |
||
1437 | __r = vec_sel(__b, __a, __c); |
||
1438 | return (__m64)((__vector long long)__r)[0]; |
||
1439 | #else |
||
1440 | __m64_union __m1, __m2, __res; |
||
1441 | long __i; |
||
1442 | |||
1443 | __m1.as_m64 = __A; |
||
1444 | __m2.as_m64 = __B; |
||
1445 | |||
1446 | for (__i = 0; __i < 8; __i++) |
||
1447 | __res.as_char[__i] = |
||
1448 | ((unsigned char)__m1.as_char[__i] > (unsigned char)__m2.as_char[__i]) |
||
1449 | ? __m1.as_char[__i] |
||
1450 | : __m2.as_char[__i]; |
||
1451 | |||
1452 | return (__m64)__res.as_m64; |
||
1453 | #endif |
||
1454 | } |
||
1455 | |||
1456 | extern __inline __m64 |
||
1457 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1458 | _m_pmaxub(__m64 __A, __m64 __B) { |
||
1459 | return _mm_max_pu8(__A, __B); |
||
1460 | } |
||
1461 | |||
1462 | /* Compute the element-wise minimum of signed 16-bit values. */ |
||
1463 | extern __inline __m64 |
||
1464 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1465 | _mm_min_pi16(__m64 __A, __m64 __B) { |
||
1466 | #if _ARCH_PWR8 |
||
1467 | __vector signed short __a, __b, __r; |
||
1468 | __vector __bool short __c; |
||
1469 | |||
1470 | __a = (__vector signed short)vec_splats(__A); |
||
1471 | __b = (__vector signed short)vec_splats(__B); |
||
1472 | __c = (__vector __bool short)vec_cmplt(__a, __b); |
||
1473 | __r = vec_sel(__b, __a, __c); |
||
1474 | return (__m64)((__vector long long)__r)[0]; |
||
1475 | #else |
||
1476 | __m64_union __m1, __m2, __res; |
||
1477 | |||
1478 | __m1.as_m64 = __A; |
||
1479 | __m2.as_m64 = __B; |
||
1480 | |||
1481 | __res.as_short[0] = (__m1.as_short[0] < __m2.as_short[0]) ? __m1.as_short[0] |
||
1482 | : __m2.as_short[0]; |
||
1483 | __res.as_short[1] = (__m1.as_short[1] < __m2.as_short[1]) ? __m1.as_short[1] |
||
1484 | : __m2.as_short[1]; |
||
1485 | __res.as_short[2] = (__m1.as_short[2] < __m2.as_short[2]) ? __m1.as_short[2] |
||
1486 | : __m2.as_short[2]; |
||
1487 | __res.as_short[3] = (__m1.as_short[3] < __m2.as_short[3]) ? __m1.as_short[3] |
||
1488 | : __m2.as_short[3]; |
||
1489 | |||
1490 | return (__m64)__res.as_m64; |
||
1491 | #endif |
||
1492 | } |
||
1493 | |||
1494 | extern __inline __m64 |
||
1495 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1496 | _m_pminsw(__m64 __A, __m64 __B) { |
||
1497 | return _mm_min_pi16(__A, __B); |
||
1498 | } |
||
1499 | |||
1500 | /* Compute the element-wise minimum of unsigned 8-bit values. */ |
||
1501 | extern __inline __m64 |
||
1502 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1503 | _mm_min_pu8(__m64 __A, __m64 __B) { |
||
1504 | #if _ARCH_PWR8 |
||
1505 | __vector unsigned char __a, __b, __r; |
||
1506 | __vector __bool char __c; |
||
1507 | |||
1508 | __a = (__vector unsigned char)vec_splats(__A); |
||
1509 | __b = (__vector unsigned char)vec_splats(__B); |
||
1510 | __c = (__vector __bool char)vec_cmplt(__a, __b); |
||
1511 | __r = vec_sel(__b, __a, __c); |
||
1512 | return (__m64)((__vector long long)__r)[0]; |
||
1513 | #else |
||
1514 | __m64_union __m1, __m2, __res; |
||
1515 | long __i; |
||
1516 | |||
1517 | __m1.as_m64 = __A; |
||
1518 | __m2.as_m64 = __B; |
||
1519 | |||
1520 | for (__i = 0; __i < 8; __i++) |
||
1521 | __res.as_char[__i] = |
||
1522 | ((unsigned char)__m1.as_char[__i] < (unsigned char)__m2.as_char[__i]) |
||
1523 | ? __m1.as_char[__i] |
||
1524 | : __m2.as_char[__i]; |
||
1525 | |||
1526 | return (__m64)__res.as_m64; |
||
1527 | #endif |
||
1528 | } |
||
1529 | |||
1530 | extern __inline __m64 |
||
1531 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1532 | _m_pminub(__m64 __A, __m64 __B) { |
||
1533 | return _mm_min_pu8(__A, __B); |
||
1534 | } |
||
1535 | |||
1536 | /* Create an 8-bit mask of the signs of 8-bit values. */ |
||
1537 | extern __inline int |
||
1538 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1539 | _mm_movemask_pi8(__m64 __A) { |
||
1540 | #ifdef __powerpc64__ |
||
1541 | unsigned long long __p = |
||
1542 | #ifdef __LITTLE_ENDIAN__ |
||
1543 | 0x0008101820283038UL; // permute control for sign bits |
||
1544 | #else |
||
1545 | 0x3830282018100800UL; // permute control for sign bits |
||
1546 | #endif |
||
1547 | return __builtin_bpermd(__p, __A); |
||
1548 | #else |
||
1549 | #ifdef __LITTLE_ENDIAN__ |
||
1550 | unsigned int __mask = 0x20283038UL; |
||
1551 | unsigned int __r1 = __builtin_bpermd(__mask, __A) & 0xf; |
||
1552 | unsigned int __r2 = __builtin_bpermd(__mask, __A >> 32) & 0xf; |
||
1553 | #else |
||
1554 | unsigned int __mask = 0x38302820UL; |
||
1555 | unsigned int __r1 = __builtin_bpermd(__mask, __A >> 32) & 0xf; |
||
1556 | unsigned int __r2 = __builtin_bpermd(__mask, __A) & 0xf; |
||
1557 | #endif |
||
1558 | return (__r2 << 4) | __r1; |
||
1559 | #endif |
||
1560 | } |
||
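/* Added usage sketch (illustrative only; example_all_signs_set is not part of
   this header): bit i of the result is the most significant bit of byte i of
   the packed argument. */
static __inline int example_all_signs_set(__m64 v) {
  return _mm_movemask_pi8(v) == 0xff;
}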
1561 | |||
1562 | extern __inline int |
||
1563 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1564 | _m_pmovmskb(__m64 __A) { |
||
1565 | return _mm_movemask_pi8(__A); |
||
1566 | } |
||
1567 | |||
1568 | /* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values |
||
1569 | in B and produce the high 16 bits of the 32-bit results. */ |
||
1570 | extern __inline __m64 |
||
1571 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1572 | _mm_mulhi_pu16(__m64 __A, __m64 __B) { |
||
1573 | __vector unsigned short __a, __b; |
||
1574 | __vector unsigned short __c; |
||
1575 | __vector unsigned int __w0, __w1; |
||
1576 | __vector unsigned char __xform1 = { |
||
1577 | #ifdef __LITTLE_ENDIAN__ |
||
1578 | 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A, |
||
1579 | 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F |
||
1580 | #else |
||
1581 | 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00, |
||
1582 | 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15 |
||
1583 | #endif |
||
1584 | }; |
||
1585 | |||
1586 | __a = (__vector unsigned short)vec_splats(__A); |
||
1587 | __b = (__vector unsigned short)vec_splats(__B); |
||
1588 | |||
1589 | __w0 = vec_vmuleuh(__a, __b); |
||
1590 | __w1 = vec_vmulouh(__a, __b); |
||
1591 | __c = (__vector unsigned short)vec_perm(__w0, __w1, __xform1); |
||
1592 | |||
1593 | return (__m64)((__vector long long)__c)[0]; |
||
1594 | } |
||
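/* Added sketch (illustrative only; example_mulhi_u16 is not part of this
   header): scalar model of what the even/odd multiplies plus the permute
   above compute for each 16-bit lane. */
static __inline unsigned short example_mulhi_u16(unsigned short a,
                                                 unsigned short b) {
  return (unsigned short)(((unsigned int)a * (unsigned int)b) >> 16);
}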
1595 | |||
1596 | extern __inline __m64 |
||
1597 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1598 | _m_pmulhuw(__m64 __A, __m64 __B) { |
||
1599 | return _mm_mulhi_pu16(__A, __B); |
||
1600 | } |
||
1601 | |||
1602 | /* Return a combination of the four 16-bit values in A. The selector |
||
1603 | must be an immediate. */ |
||
1604 | extern __inline __m64 |
||
1605 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1606 | _mm_shuffle_pi16(__m64 __A, int const __N) { |
||
1607 | unsigned long __element_selector_10 = __N & 0x03; |
||
1608 | unsigned long __element_selector_32 = (__N >> 2) & 0x03; |
||
1609 | unsigned long __element_selector_54 = (__N >> 4) & 0x03; |
||
1610 | unsigned long __element_selector_76 = (__N >> 6) & 0x03; |
||
1611 | static const unsigned short __permute_selectors[4] = { |
||
1612 | #ifdef __LITTLE_ENDIAN__ |
||
1613 | 0x0908, 0x0B0A, 0x0D0C, 0x0F0E |
||
1614 | #else |
||
1615 | 0x0607, 0x0405, 0x0203, 0x0001 |
||
1616 | #endif |
||
1617 | }; |
||
1618 | __m64_union __t; |
||
1619 | __vector unsigned long long __a, __p, __r; |
||
1620 | |||
1621 | #ifdef __LITTLE_ENDIAN__ |
||
1622 | __t.as_short[0] = __permute_selectors[__element_selector_10]; |
||
1623 | __t.as_short[1] = __permute_selectors[__element_selector_32]; |
||
1624 | __t.as_short[2] = __permute_selectors[__element_selector_54]; |
||
1625 | __t.as_short[3] = __permute_selectors[__element_selector_76]; |
||
1626 | #else |
||
1627 | __t.as_short[3] = __permute_selectors[__element_selector_10]; |
||
1628 | __t.as_short[2] = __permute_selectors[__element_selector_32]; |
||
1629 | __t.as_short[1] = __permute_selectors[__element_selector_54]; |
||
1630 | __t.as_short[0] = __permute_selectors[__element_selector_76]; |
||
1631 | #endif |
||
1632 | __p = vec_splats(__t.as_m64); |
||
1633 | __a = vec_splats(__A); |
||
1634 | __r = vec_perm(__a, __a, (__vector unsigned char)__p); |
||
1635 | return (__m64)((__vector long long)__r)[0]; |
||
1636 | } |
||
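/* Added usage sketch (illustrative only; example_reverse16 is not part of
   this header): reversing the four 16-bit fields with the same _MM_SHUFFLE
   encoding used by _mm_shuffle_ps. */
static __inline __m64 example_reverse16(__m64 v) {
  return _mm_shuffle_pi16(v, _MM_SHUFFLE(0, 1, 2, 3));
}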
1637 | |||
1638 | extern __inline __m64 |
||
1639 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1640 | _m_pshufw(__m64 __A, int const __N) { |
||
1641 | return _mm_shuffle_pi16(__A, __N); |
||
1642 | } |
||
1643 | |||
1644 | /* Conditionally store byte elements of A into P. The high bit of each |
||
1645 | byte in the selector N determines whether the corresponding byte from |
||
1646 | A is stored. */ |
||
1647 | extern __inline void |
||
1648 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1649 | _mm_maskmove_si64(__m64 __A, __m64 __N, char *__P) { |
||
1650 | __m64 __hibit = 0x8080808080808080UL; |
||
1651 | __m64 __mask, __tmp; |
||
1652 | __m64 *__p = (__m64 *)__P; |
||
1653 | |||
1654 | __tmp = *__p; |
||
1655 | __mask = _mm_cmpeq_pi8((__N & __hibit), __hibit); |
||
1656 | __tmp = (__tmp & (~__mask)) | (__A & __mask); |
||
1657 | *__p = __tmp; |
||
1658 | } |
||
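/* Added note and usage sketch (illustrative only; example_masked_store is not
   part of this header): unlike the x86 instruction, the wrapper above performs
   a full 8-byte read-modify-write of *__P. */
static __inline void example_masked_store(char *dst, __m64 data, __m64 mask) {
  /* Bytes of data whose mask byte has its high bit set are written to dst;
     the remaining destination bytes are preserved. */
  _mm_maskmove_si64(data, mask, dst);
}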
1659 | |||
1660 | extern __inline void |
||
1661 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1662 | _m_maskmovq(__m64 __A, __m64 __N, char *__P) { |
||
1663 | _mm_maskmove_si64(__A, __N, __P); |
||
1664 | } |
||
1665 | |||
1666 | /* Compute the rounded averages of the unsigned 8-bit values in A and B. */ |
||
1667 | extern __inline __m64 |
||
1668 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1669 | _mm_avg_pu8(__m64 __A, __m64 __B) { |
||
1670 | __vector unsigned char __a, __b, __c; |
||
1671 | |||
1672 | __a = (__vector unsigned char)vec_splats(__A); |
||
1673 | __b = (__vector unsigned char)vec_splats(__B); |
||
1674 | __c = vec_avg(__a, __b); |
||
1675 | return (__m64)((__vector long long)__c)[0]; |
||
1676 | } |
||
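/* Added sketch (illustrative only; example_avg_u8 is not part of this header):
   scalar model of the rounded average vec_avg computes per unsigned element,
   i.e. (a + b + 1) >> 1 without intermediate overflow. */
static __inline unsigned char example_avg_u8(unsigned char a, unsigned char b) {
  return (unsigned char)(((unsigned int)a + (unsigned int)b + 1) >> 1);
}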
1677 | |||
1678 | extern __inline __m64 |
||
1679 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1680 | _m_pavgb(__m64 __A, __m64 __B) { |
||
1681 | return _mm_avg_pu8(__A, __B); |
||
1682 | } |
||
1683 | |||
1684 | /* Compute the rounded averages of the unsigned 16-bit values in A and B. */ |
||
1685 | extern __inline __m64 |
||
1686 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1687 | _mm_avg_pu16(__m64 __A, __m64 __B) { |
||
1688 | __vector unsigned short __a, __b, __c; |
||
1689 | |||
1690 | __a = (__vector unsigned short)vec_splats(__A); |
||
1691 | __b = (__vector unsigned short)vec_splats(__B); |
||
1692 | __c = vec_avg(__a, __b); |
||
1693 | return (__m64)((__vector long long)__c)[0]; |
||
1694 | } |
||
1695 | |||
1696 | extern __inline __m64 |
||
1697 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1698 | _m_pavgw(__m64 __A, __m64 __B) { |
||
1699 | return _mm_avg_pu16(__A, __B); |
||
1700 | } |
||
1701 | |||
1702 | /* Compute the sum of the absolute differences of the unsigned 8-bit |
||
1703 | values in A and B. Return the value in the lower 16-bit word; the |
||
1704 | upper words are cleared. */ |
||
1705 | extern __inline __m64 |
||
1706 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1707 | _mm_sad_pu8(__m64 __A, __m64 __B) { |
||
1708 | __vector unsigned char __a, __b; |
||
1709 | __vector unsigned char __vmin, __vmax, __vabsdiff; |
||
1710 | __vector signed int __vsum; |
||
1711 | const __vector unsigned int __zero = {0, 0, 0, 0}; |
||
1712 | __m64_union __result = {0}; |
||
1713 | |||
1714 | __a = (__vector unsigned char)(__vector unsigned long long){0UL, __A}; |
||
1715 | __b = (__vector unsigned char)(__vector unsigned long long){0UL, __B}; |
||
1716 | __vmin = vec_min(__a, __b); |
||
1717 | __vmax = vec_max(__a, __b); |
||
1718 | __vabsdiff = vec_sub(__vmax, __vmin); |
||
1719 | /* Sum four groups of bytes into integers. */ |
||
1720 | __vsum = (__vector signed int)vec_sum4s(__vabsdiff, __zero); |
||
1721 | /* Sum across four integers with integer result. */ |
||
1722 | __vsum = vec_sums(__vsum, (__vector signed int)__zero); |
||
1723 | /* The sum is in the rightmost 32 bits of the vector result. |
||
1724 | Transfer to a GPR and truncate to 16 bits. */ |
||
1725 | __result.as_short[0] = __vsum[3]; |
||
1726 | return __result.as_m64; |
||
1727 | } |
||
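/* Added sketch (illustrative only; example_sad_u8 is not part of this header):
   scalar model of the sum of absolute differences computed above; the result
   occupies only the low 16 bits of the returned __m64. */
static __inline unsigned int example_sad_u8(const unsigned char a[8],
                                            const unsigned char b[8]) {
  unsigned int sum = 0;
  int i;
  for (i = 0; i < 8; i++)
    sum += (a[i] > b[i]) ? (unsigned int)(a[i] - b[i])
                         : (unsigned int)(b[i] - a[i]);
  return sum;
}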
1728 | |||
1729 | extern __inline __m64 |
||
1730 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1731 | _m_psadbw(__m64 __A, __m64 __B) { |
||
1732 | return _mm_sad_pu8(__A, __B); |
||
1733 | } |
||
1734 | |||
1735 | /* Stores the data in A to the address P without polluting the caches. */ |
||
1736 | extern __inline void |
||
1737 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1738 | _mm_stream_pi(__m64 *__P, __m64 __A) { |
||
1739 | /* Use the data cache block touch for store transient. */ |
||
1740 | __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory"); |
||
1741 | *__P = __A; |
||
1742 | } |
||
1743 | |||
1744 | /* Likewise. The address must be 16-byte aligned. */ |
||
1745 | extern __inline void |
||
1746 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1747 | _mm_stream_ps(float *__P, __m128 __A) { |
||
1748 | /* Use the data cache block touch for store transient. */ |
||
1749 | __asm__(" dcbtstt 0,%0" : : "b"(__P) : "memory"); |
||
1750 | _mm_store_ps(__P, __A); |
||
1751 | } |
||
1752 | |||
1753 | /* Guarantees that every preceding store is globally visible before |
||
1754 | any subsequent store. */ |
||
1755 | extern __inline void |
||
1756 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1757 | _mm_sfence(void) { |
||
1758 | /* Generate a lightweight sync. */ |
||
1759 | __atomic_thread_fence(__ATOMIC_RELEASE); |
||
1760 | } |
||
1761 | |||
1762 | /* The execution of the next instruction is delayed by an implementation |
||
1763 | specific amount of time. The instruction does not modify the |
||
1764 | architectural state. (In the x86 header this intrinsic follows the |
||
1765 | pop_options pragma because it does not require SSE support in the |
||
1766 | processor; the encoding is a nop on processors that do not support it.) */ |
||
1767 | extern __inline void |
||
1768 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1769 | _mm_pause(void) { |
||
1770 | /* There is no exact match with this construct, but the following is |
||
1771 | close to the desired effect. */ |
||
1772 | #if _ARCH_PWR8 |
||
1773 | /* On power8 and later processors we can depend on Program Priority |
||
1774 | (PRI) and the associated "very low" PRI setting. Since we don't know |
||
1775 | what PRI this thread is running at, we: 1) save the current PRI |
||
1776 | from the PPR SPR into a local GPR, 2) set the PRI to "very low" |
||
1777 | via the special or 31,31,31 encoding, and 3) issue an "isync" to |
||
1778 | ensure the PRI change takes effect before we execute any more |
||
1779 | instructions. |
||
1780 | Now we can execute a lwsync (release barrier) while we execute |
||
1781 | this thread at "very low" PRI. Finally we restore the original |
||
1782 | PRI and continue execution. */ |
||
1783 | unsigned long __PPR; |
||
1784 | |||
1785 | __asm__ volatile(" mfppr %0;" |
||
1786 | " or 31,31,31;" |
||
1787 | " isync;" |
||
1788 | " lwsync;" |
||
1789 | " isync;" |
||
1790 | " mtppr %0;" |
||
1791 | : "=r"(__PPR) |
||
1792 | : |
||
1793 | : "memory"); |
||
1794 | #else |
||
1795 | /* For older processor where we may not even have Program Priority |
||
1796 | controls we can only depend on Heavy Weight Sync. */ |
||
1797 | __atomic_thread_fence(__ATOMIC_SEQ_CST); |
||
1798 | #endif |
||
1799 | } |
||
1800 | |||
1801 | /* Transpose the 4x4 matrix composed of row[0-3]. */ |
||
1802 | #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ |
||
1803 | do { \ |
||
1804 | __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \ |
||
1805 | __v4sf __t0 = vec_vmrghw(__r0, __r1); \ |
||
1806 | __v4sf __t1 = vec_vmrghw(__r2, __r3); \ |
||
1807 | __v4sf __t2 = vec_vmrglw(__r0, __r1); \ |
||
1808 | __v4sf __t3 = vec_vmrglw(__r2, __r3); \ |
||
1809 | (row0) = (__v4sf)vec_mergeh((__vector long long)__t0, \ |
||
1810 | (__vector long long)__t1); \ |
||
1811 | (row1) = (__v4sf)vec_mergel((__vector long long)__t0, \ |
||
1812 | (__vector long long)__t1); \ |
||
1813 | (row2) = (__v4sf)vec_mergeh((__vector long long)__t2, \ |
||
1814 | (__vector long long)__t3); \ |
||
1815 | (row3) = (__v4sf)vec_mergel((__vector long long)__t2, \ |
||
1816 | (__vector long long)__t3); \ |
||
1817 | } while (0) |
||
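/* Added usage sketch (illustrative only; example_transpose is not part of this
   header): transposing four rows in place with the macro above. */
static __inline void example_transpose(__m128 *r0, __m128 *r1, __m128 *r2,
                                       __m128 *r3) {
  __m128 a = *r0, b = *r1, c = *r2, d = *r3;
  _MM_TRANSPOSE4_PS(a, b, c, d); /* columns of the input become the rows */
  *r0 = a;
  *r1 = b;
  *r2 = c;
  *r3 = d;
}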
1818 | |||
1819 | /* For backward source compatibility. */ |
||
1820 | //# include <emmintrin.h> |
||
1821 | |||
1822 | #else |
||
1823 | #include_next <xmmintrin.h> |
||
1824 | #endif /* defined(__powerpc64__) && \ |
||
1825 | * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ |
||
1826 | |||
1827 | #endif /* XMMINTRIN_H_ */ |