/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is to help porting code using Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector type, we
   typedef __m64 to 64-bit unsigned long long in MMX intrinsics, which
   works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it's better to transfer __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register move
   instructions, which help make the implementation more efficient.

   It's the user's responsibility to determine whether the results of such
   a port are acceptable or further changes are needed. Please note that
   much code using Intel intrinsics CAN BE REWRITTEN in more portable and
   efficient standard C or GNU C extensions, with 64-bit scalar operations
   or 128-bit SSE/Altivec operations, which are recommended instead. */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
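
/* Illustrative rewrite sketch (not part of this header): a paired 16-bit
   add written against this header as
     __m64 __c = _mm_add_pi16(__a, __b);
   can often be expressed directly with GNU C vector extensions instead.
   The __v4hi typedef below is hypothetical and assumes GCC/Clang
   vector-extension support:
     typedef short __v4hi __attribute__((__vector_size__(8)));
     __v4hi __va, __vb, __vc;
     __vc = __va + __vb;  // element-wise 16-bit add, no MMX emulation
*/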

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__powerpc64__) &&                                                  \
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;
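
/* Layout sketch (illustrative only): the union members alias the same
   eight bytes, so on powerpc64le as_short[0] is the least-significant
   16 bits of as_m64:
     __m64_union __u;
     __u.as_m64 = 0x0004000300020001ULL;
     // little-endian: __u.as_short[0] == 1, __u.as_short[3] == 4
*/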

/* Empty the multimedia state. */
extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_empty(void) {
  /* nothing to do on PowerPC. */
}

extern __inline void
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_empty(void) {
  /* nothing to do on PowerPC. */
}

/* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi32_si64(int __i) {
  return (__m64)(unsigned int)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int(int __i) {
  return _mm_cvtsi32_si64(__i);
}

/* Convert the lower 32 bits of the __m64 object into an integer. */
extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si32(__m64 __i) {
  return ((int)__i);
}

extern __inline int
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int(__m64 __i) {
  return _mm_cvtsi64_si32(__i);
}

/* Convert I to a __m64 object. */

/* Intel intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_from_int64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_m64(long long __i) {
  return (__m64)__i;
}

/* Microsoft intrinsic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64x_si64(long long __i) {
  return (__m64)__i;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi64x(long long __i) {
  return (__m64)__i;
}

/* Convert the __m64 object to a 64-bit integer. */

/* Intel intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_to_int64(__m64 __i) {
  return (long long)__i;
}

extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtm64_si64(__m64 __i) {
  return (long long)__i;
}

/* Microsoft intrinsic. */
extern __inline long long
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cvtsi64_si64x(__m64 __i) {
  return (long long)__i;
}
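
/* Conversion sketch (illustrative only): the 32-bit source is
   zero-extended, so the sign bit does not propagate upward:
     _mm_cvtsi32_si64(-1)                     == (__m64)0x00000000ffffffffULL
     _mm_cvtsi64_si32(_mm_cvtsi64_m64(-1LL))  == -1
*/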

#ifdef _ARCH_PWR8
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __vm1;
  __vector signed char __vresult;

  __vm1 = (__vector signed short)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packsswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi16(__m1, __m2);
}

/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
  __vector signed int __vm1;
  __vector signed short __vresult;

  __vm1 = (__vector signed int)(__vector unsigned long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  __vresult = vec_packs(__vm1, __vm1);
  return (__m64)((__vector long long)__vresult)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packssdw(__m64 __m1, __m64 __m2) {
  return _mm_packs_pi32(__m1, __m2);
}

/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
   the result, and the four 16-bit values from M2 into the upper four 8-bit
   values of the result, all with unsigned saturation. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned char __r;
  __vector signed short __vm1 = (__vector signed short)(__vector long long)
#ifdef __LITTLE_ENDIAN__
      {__m1, __m2};
#else
      {__m2, __m1};
#endif
  const __vector signed short __zero = {0};
  __vector __bool short __select = vec_cmplt(__vm1, __zero);
  __r =
      vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
  __vector __bool char __packsel = vec_pack(__select, __select);
  __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
  return (__m64)((__vector long long)__r)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_packuswb(__m64 __m1, __m64 __m2) {
  return _mm_packs_pu16(__m1, __m2);
}
#endif /* end ARCH_PWR8 */
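
/* Saturation sketch (illustrative only): each 16-bit lane is clamped to
   the 8-bit range before packing, e.g.
     _mm_packs_pi16(_mm_set_pi16(300, -300, 5, -5), _mm_setzero_si64())
   puts -5, 5, -128, 127 (least-significant byte first) in the low four
   bytes of the result.
*/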

/* Interleave the four 8-bit values from the high half of M1 with the four
   8-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[1];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[4];
  __res.as_char[1] = __mu2.as_char[4];
  __res.as_char[2] = __mu1.as_char[5];
  __res.as_char[3] = __mu2.as_char[5];
  __res.as_char[4] = __mu1.as_char[6];
  __res.as_char[5] = __mu2.as_char[6];
  __res.as_char[6] = __mu1.as_char[7];
  __res.as_char[7] = __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the high half of M1 with the two
   16-bit values from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[2];
  __res.as_short[1] = __mu2.as_short[2];
  __res.as_short[2] = __mu1.as_short[3];
  __res.as_short[3] = __mu2.as_short[3];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the high half of M1 with the 32-bit
   value from the high half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[1];
  __res.as_int[1] = __mu2.as_int[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
  return _mm_unpackhi_pi32(__m1, __m2);
}

/* Interleave the four 8-bit values from the low half of M1 with the four
   8-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_mergel(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0];
  __res.as_char[1] = __mu2.as_char[0];
  __res.as_char[2] = __mu1.as_char[1];
  __res.as_char[3] = __mu2.as_char[1];
  __res.as_char[4] = __mu1.as_char[2];
  __res.as_char[5] = __mu2.as_char[2];
  __res.as_char[6] = __mu1.as_char[3];
  __res.as_char[7] = __mu2.as_char[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi8(__m1, __m2);
}

/* Interleave the two 16-bit values from the low half of M1 with the two
   16-bit values from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0];
  __res.as_short[1] = __mu2.as_short[0];
  __res.as_short[2] = __mu1.as_short[1];
  __res.as_short[3] = __mu2.as_short[1];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi16(__m1, __m2);
}

/* Interleave the 32-bit value from the low half of M1 with the 32-bit
   value from the low half of M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0];
  __res.as_int[1] = __mu2.as_int[0];

  return (__m64)__res.as_m64;
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_punpckldq(__m64 __m1, __m64 __m2) {
  return _mm_unpacklo_pi32(__m1, __m2);
}
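
/* Interleave sketch (illustrative only): numbering bytes from the least
   significant end,
     _mm_unpacklo_pi8(m1, m2) -> m1[0], m2[0], m1[1], m2[1], ..., m1[3], m2[3]
     _mm_unpackhi_pi8(m1, m2) -> m1[4], m2[4], m1[5], m2[5], ..., m1[7], m2[7]
*/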

/* Add the 8-bit values in M1 to the 8-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddb(__m64 __m1, __m64 __m2) {
  return _mm_add_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddw(__m64 __m1, __m64 __m2) {
  return _mm_add_pi16(__m1, __m2);
}

/* Add the 32-bit values in M1 to the 32-bit values in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_add(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddd(__m64 __m1, __m64 __m2) {
  return _mm_add_pi32(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubb(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubw(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi16(__m1, __m2);
}

/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = vec_sub(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubd(__m64 __m1, __m64 __m2) {
  return _mm_sub_pi32(__m1, __m2);
}
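
/* These element-wise adds and subtracts wrap on overflow (illustrative
   only):
     _mm_add_pi8(_mm_set1_pi8(127), _mm_set1_pi8(1))
   yields -128 in every byte lane; see the saturating _mm_adds_* and
   _mm_subs_* forms below when clamping is wanted.
*/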

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_add_si64(__m64 __m1, __m64 __m2) {
  return (__m1 + __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
  return (__m1 - __m2);
}

/* Shift the 64-bit value in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_si64(__m64 __m, __m64 __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllq(__m64 __m, __m64 __count) {
  return _mm_sll_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_si64(__m64 __m, const int __count) {
  return (__m << __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllqi(__m64 __m, const int __count) {
  return _mm_slli_si64(__m, __count);
}

/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_si64(__m64 __m, __m64 __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlq(__m64 __m, __m64 __count) {
  return _mm_srl_si64(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_si64(__m64 __m, const int __count) {
  return (__m >> __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlqi(__m64 __m, const int __count) {
  return _mm_srli_si64(__m, __count);
}
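
/* Because __m64 is a 64-bit unsigned long long here, these compile to
   plain C shifts (illustrative only):
     _mm_slli_si64((__m64)1, 8)     == (__m64)0x100
     _mm_srli_si64((__m64)0x100, 8) == (__m64)1
*/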

/* Bit-wise AND the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_and_si64(__m64 __m1, __m64 __m2) {
  return (__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pand(__m64 __m1, __m64 __m2) {
  return _mm_and_si64(__m1, __m2);
}

/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
   64-bit value in M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
  return (~__m1 & __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pandn(__m64 __m1, __m64 __m2) {
  return _mm_andnot_si64(__m1, __m2);
}

/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_or_si64(__m64 __m1, __m64 __m2) {
  return (__m1 | __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_por(__m64 __m1, __m64 __m2) {
  return _mm_or_si64(__m1, __m2);
}

/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
  return (__m1 ^ __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pxor(__m64 __m1, __m64 __m2) {
  return _mm_xor_si64(__m1, __m2);
}
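
/* Note the operand order of _mm_andnot_si64: the FIRST operand is
   complemented (illustrative only):
     _mm_andnot_si64(__mask, __x) == (~__mask & __x)
*/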

/* Creates a 64-bit zero. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setzero_si64(void) {
  return (__m64)0;
}

/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
   test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
  __m64 __res;
  __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
  return (__res);
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi8(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = (__vector signed char)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi8(__m1, __m2);
}

/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi16(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR8
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = (__vector signed short)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi16(__m1, __m2);
}

/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
   the test is true and zero if false. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpeq(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
  return _mm_cmpeq_pi32(__m1, __m2);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
#if _ARCH_PWR9
  __vector signed int __a, __b, __c;

  __a = (__vector signed int)vec_splats(__m1);
  __b = (__vector signed int)vec_splats(__m2);
  __c = (__vector signed int)vec_cmpgt(__a, __b);
  return (__m64)((__vector long long)__c)[0];
#else
  __m64_union __mu1, __mu2, __res;

  __mu1.as_m64 = __m1;
  __mu2.as_m64 = __m2;

  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;

  return (__m64)__res.as_m64;
#endif
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
  return _mm_cmpgt_pi32(__m1, __m2);
}
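
/* The comparisons return all-ones lanes where the test is true, so the
   result can serve directly as a select mask; a per-lane signed max
   sketch (illustrative only):
     __m64 __gt  = _mm_cmpgt_pi16(__a, __b);
     __m64 __max = _mm_or_si64(_mm_and_si64(__gt, __a),
                               _mm_andnot_si64(__gt, __b));
*/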

#if _ARCH_PWR8
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddsw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pi16(__m1, __m2);
}

/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusb(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu8(__m1, __m2);
}

/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
   saturated arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_adds(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_paddusw(__m64 __m1, __m64 __m2) {
  return _mm_adds_pu16(__m1, __m2);
}
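
/* Saturating adds clamp instead of wrapping (illustrative only):
     _mm_adds_pu8(_mm_set1_pi8((signed char)250), _mm_set1_pi8(10))
   yields 255 in every byte lane, where _mm_add_pi8 would wrap to 4.
*/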

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
   saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
  __vector signed char __a, __b, __c;

  __a = (__vector signed char)vec_splats(__m1);
  __b = (__vector signed char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   signed saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubsw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pi16(__m1, __m2);
}

/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
  __vector unsigned char __a, __b, __c;

  __a = (__vector unsigned char)vec_splats(__m1);
  __b = (__vector unsigned char)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusb(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu8(__m1, __m2);
}

/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
   unsigned saturating arithmetic. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
  __vector unsigned short __a, __b, __c;

  __a = (__vector unsigned short)vec_splats(__m1);
  __b = (__vector unsigned short)vec_splats(__m2);
  __c = vec_subs(__a, __b);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psubusw(__m64 __m1, __m64 __m2) {
  return _mm_subs_pu16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
   four 32-bit intermediate results, which are then summed by pairs to
   produce two 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed int __c;
  __vector signed int __zero = {0, 0, 0, 0};

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = vec_vmsumshm(__a, __b, __zero);
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
  return _mm_madd_pi16(__m1, __m2);
}
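
/* _mm_madd_pi16 arithmetic sketch (illustrative only): with 16-bit lanes
   a0..a3 and b0..b3, least significant first, the result holds two 32-bit
   sums:
     low 32 bits  = a0*b0 + a1*b1
     high 32 bits = a2*b2 + a3*b3
*/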

/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
   M2 and produce the high 16 bits of the 32-bit results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b;
  __vector signed short __c;
  __vector signed int __w0, __w1;
  __vector unsigned char __xform1 = {
#ifdef __LITTLE_ENDIAN__
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
#else
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
#endif
  };

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);

  __w0 = vec_vmulesh(__a, __b);
  __w1 = vec_vmulosh(__a, __b);
  __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);

  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmulhw(__m64 __m1, __m64 __m2) {
  return _mm_mulhi_pi16(__m1, __m2);
}

/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
   the low 16 bits of the results. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
  __vector signed short __a, __b, __c;

  __a = (__vector signed short)vec_splats(__m1);
  __b = (__vector signed short)vec_splats(__m2);
  __c = __a * __b;
  return (__m64)((__vector long long)__c)[0];
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pmullw(__m64 __m1, __m64 __m2) {
  return _mm_mullo_pi16(__m1, __m2);
}

/* Shift four 16-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sl(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllw(__m64 __m, __m64 __count) {
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sll_pi16. */
  return _mm_sll_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psllwi(__m64 __m, int __count) {
  return _mm_slli_pi16(__m, __count);
}

/* Shift two 32-bit values in M left by COUNT. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sll_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] << __count;
  __res.as_int[1] = __res.as_int[1] << __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslld(__m64 __m, __m64 __count) {
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_slli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sll_pi32. */
  return _mm_sll_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_pslldi(__m64 __m, int __count) {
  return _mm_slli_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi16(__m64 __m, __m64 __count) {
  __vector signed short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector signed short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sra(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psraw(__m64 __m, __m64 __count) {
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sra_pi16. */
  return _mm_sra_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrawi(__m64 __m, int __count) {
  return _mm_srai_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_sra_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = __res.as_int[0] >> __count;
  __res.as_int[1] = __res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrad(__m64 __m, __m64 __count) {
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srai_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_sra_pi32. */
  return _mm_sra_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psradi(__m64 __m, int __count) {
  return _mm_srai_pi32(__m, __count);
}

/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi16(__m64 __m, __m64 __count) {
  __vector unsigned short __r;
  __vector unsigned short __c;

  if (__count <= 15) {
    __r = (__vector unsigned short)vec_splats(__m);
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
    __r = vec_sr(__r, (__vector unsigned short)__c);
    return (__m64)((__vector long long)__r)[0];
  } else
    return (0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlw(__m64 __m, __m64 __count) {
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_srl_pi16. */
  return _mm_srl_pi16(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrlwi(__m64 __m, int __count) {
  return _mm_srli_pi16(__m, __count);
}

/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srl_pi32(__m64 __m, __m64 __count) {
  __m64_union __res;

  __res.as_m64 = __m;

  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrld(__m64 __m, __m64 __count) {
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_srli_pi32(__m64 __m, int __count) {
  /* Promote int to long then invoke _mm_srl_pi32. */
  return _mm_srl_pi32(__m, __count);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _m_psrldi(__m64 __m, int __count) {
  return _mm_srli_pi32(__m, __count);
}
#endif /* _ARCH_PWR8 */
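
/* MMX defines a shift by a count larger than the element width to yield
   zero; the 16-bit forms above implement that with an explicit guard
   (illustrative only; the 32-bit forms rely on plain C shifts instead):
     _mm_slli_pi16(_mm_set1_pi16(1), 16) == _mm_setzero_si64()
*/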

/* Creates a vector of two 32-bit values; I0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi32(int __i1, int __i0) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

/* Creates a vector of four 16-bit values; W0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
  __m64_union __res;

  __res.as_short[0] = __w0;
  __res.as_short[1] = __w1;
  __res.as_short[2] = __w2;
  __res.as_short[3] = __w3;
  return (__res.as_m64);
}

/* Creates a vector of eight 8-bit values; B0 is least significant. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
                char __b2, char __b1, char __b0) {
  __m64_union __res;

  __res.as_char[0] = __b0;
  __res.as_char[1] = __b1;
  __res.as_char[2] = __b2;
  __res.as_char[3] = __b3;
  __res.as_char[4] = __b4;
  __res.as_char[5] = __b5;
  __res.as_char[6] = __b6;
  __res.as_char[7] = __b7;
  return (__res.as_m64);
}

/* Similar, but with the arguments in reverse order. */
extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi32(int __i0, int __i1) {
  __m64_union __res;

  __res.as_int[0] = __i0;
  __res.as_int[1] = __i1;
  return (__res.as_m64);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
}

extern __inline __m64
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
                 char __b5, char __b6, char __b7) {
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
}
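
/* Argument-order sketch (illustrative only): _mm_set_* takes the most
   significant element first and _mm_setr_* the least significant first,
   so
     _mm_set_pi32(0x22222222, 0x11111111) ==
         _mm_setr_pi32(0x11111111, 0x22222222)
   and both leave 0x11111111 in the low 32 bits.
*/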
1392 | |||
1393 | /* Creates a vector of two 32-bit values, both elements containing I. */ |
||
1394 | extern __inline __m64 |
||
1395 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1396 | _mm_set1_pi32(int __i) { |
||
1397 | __m64_union __res; |
||
1398 | |||
1399 | __res.as_int[0] = __i; |
||
1400 | __res.as_int[1] = __i; |
||
1401 | return (__res.as_m64); |
||
1402 | } |
||
1403 | |||
1404 | /* Creates a vector of four 16-bit values, all elements containing W. */ |
||
1405 | extern __inline __m64 |
||
1406 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1407 | _mm_set1_pi16(short __w) { |
||
1408 | #if _ARCH_PWR9 |
||
1409 | __vector signed short w; |
||
1410 | |||
1411 | w = (__vector signed short)vec_splats(__w); |
||
1412 | return (__m64)((__vector long long)w)[0]; |
||
1413 | #else |
||
1414 | __m64_union __res; |
||
1415 | |||
1416 | __res.as_short[0] = __w; |
||
1417 | __res.as_short[1] = __w; |
||
1418 | __res.as_short[2] = __w; |
||
1419 | __res.as_short[3] = __w; |
||
1420 | return (__res.as_m64); |
||
1421 | #endif |
||
1422 | } |
||
1423 | |||
1424 | /* Creates a vector of eight 8-bit values, all elements containing B. */ |
||
1425 | extern __inline __m64 |
||
1426 | __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
||
1427 | _mm_set1_pi8(signed char __b) { |
||
1428 | #if _ARCH_PWR8 |
||
1429 | __vector signed char __res; |
||
1430 | |||
1431 | __res = (__vector signed char)vec_splats(__b); |
||
1432 | return (__m64)((__vector long long)__res)[0]; |
||
1433 | #else |
||
1434 | __m64_union __res; |
||
1435 | |||
1436 | __res.as_char[0] = __b; |
||
1437 | __res.as_char[1] = __b; |
||
1438 | __res.as_char[2] = __b; |
||
1439 | __res.as_char[3] = __b; |
||
1440 | __res.as_char[4] = __b; |
||
1441 | __res.as_char[5] = __b; |
||
1442 | __res.as_char[6] = __b; |
||
1443 | __res.as_char[7] = __b; |
||
1444 | return (__res.as_m64); |
||
1445 | #endif |
||
1446 | } |
||
1447 | |||
1448 | #else |
||
1449 | #include_next <mmintrin.h> |
||
1450 | #endif /* defined(__powerpc64__) && \ |
||
1451 | * (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */ |
||
1452 | |||
1453 | #endif /* _MMINTRIN_H_INCLUDED */ |