/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
2
 *
3
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4
 * See https://llvm.org/LICENSE.txt for license information.
5
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6
 *
7
 *===-----------------------------------------------------------------------===
8
 */
9
 
10
/* Implemented from the specification included in the Intel C++ Compiler
11
   User Guide and Reference, version 9.0.  */
12
 
13
#ifndef NO_WARN_X86_INTRINSICS
14
/* This header helps port code that uses Intel intrinsics explicitly from
   x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target does not support a native 64-bit vector type, we
   typedef __m64 to 64-bit unsigned long long in the MMX intrinsics, which
   works well for the _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it is better to transfer the __m64 into a
   128-bit PowerPC vector first. Power8 introduced direct register move
   instructions, which help make the implementation more efficient.

   It is the user's responsibility to determine whether the results of such a
   port are acceptable or whether further changes are needed. Please note that
   much code using Intel intrinsics CAN BE REWRITTEN in more portable and
   efficient standard C or GNU C extensions with 64-bit scalar operations, or
   with 128-bit SSE/Altivec operations, which is the recommended approach. */
31
#error                                                                         \
32
    "Please read comment above.  Use -DNO_WARN_X86_INTRINSICS to disable this error."
33
#endif
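/* Usage note (illustrative): a port can acknowledge the warning above either
   on the compiler command line (e.g. -DNO_WARN_X86_INTRINSICS) or in the
   source before the include, as sketched below.

     #define NO_WARN_X86_INTRINSICS 1
     #include <mmintrin.h>
*/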
34
 
35
#ifndef _MMINTRIN_H_INCLUDED
36
#define _MMINTRIN_H_INCLUDED
37
 
38
#if defined(__powerpc64__) &&                                                  \
39
    (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
40
 
41
#include <altivec.h>
42
/* The Intel API is flexible enough that we must allow aliasing with other
43
   vector types, and their scalar components.  */
44
typedef __attribute__((__aligned__(8))) unsigned long long __m64;
45
 
46
typedef __attribute__((__aligned__(8))) union {
47
  __m64 as_m64;
48
  char as_char[8];
49
  signed char as_signed_char[8];
50
  short as_short[4];
51
  int as_int[2];
52
  long long as_long_long;
53
  float as_float[2];
54
  double as_double;
55
} __m64_union;
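/* Usage sketch (illustrative, not part of the API): because __m64 is a plain
   64-bit scalar here, __m64_union is how the fallback paths below view its
   lanes.  For example, with some_m64_value standing for an arbitrary __m64:

     __m64_union __u;
     __u.as_m64 = some_m64_value;   // reinterpret the scalar
     __u.as_short[0] += 1;          // operate on one 16-bit lane
     some_m64_value = __u.as_m64;   // and convert back
*/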
56
 
57
/* Empty the multimedia state.  */
58
extern __inline void
59
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
60
    _mm_empty(void) {
61
  /* nothing to do on PowerPC.  */
62
}
63
 
64
extern __inline void
65
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
66
    _m_empty(void) {
67
  /* nothing to do on PowerPC.  */
68
}
69
 
70
/* Convert I to a __m64 object.  The integer is zero-extended to 64 bits.  */
71
extern __inline __m64
72
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
73
    _mm_cvtsi32_si64(int __i) {
74
  return (__m64)(unsigned int)__i;
75
}
76
 
77
extern __inline __m64
78
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
79
    _m_from_int(int __i) {
80
  return _mm_cvtsi32_si64(__i);
81
}
82
 
83
/* Convert the lower 32 bits of the __m64 object into an integer.  */
84
extern __inline int
85
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
86
    _mm_cvtsi64_si32(__m64 __i) {
87
  return ((int)__i);
88
}
89
 
90
extern __inline int
91
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
92
    _m_to_int(__m64 __i) {
93
  return _mm_cvtsi64_si32(__i);
94
}
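/* Usage sketch (illustrative), assuming a translation unit that includes this
   header with NO_WARN_X86_INTRINSICS defined (round_trip_demo is a
   hypothetical helper):

     static inline int round_trip_demo(void) {
       __m64 __v = _mm_cvtsi32_si64(-1);  // zero-extended: 0x00000000FFFFFFFF
       return _mm_cvtsi64_si32(__v);      // yields -1 again
     }
*/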
95
 
96
/* Convert I to a __m64 object.  */
97
 
98
/* Intel intrinsic.  */
99
extern __inline __m64
100
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
101
    _m_from_int64(long long __i) {
102
  return (__m64)__i;
103
}
104
 
105
extern __inline __m64
106
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
107
    _mm_cvtsi64_m64(long long __i) {
108
  return (__m64)__i;
109
}
110
 
111
/* Microsoft intrinsic.  */
112
extern __inline __m64
113
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
114
    _mm_cvtsi64x_si64(long long __i) {
115
  return (__m64)__i;
116
}
117
 
118
extern __inline __m64
119
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
120
    _mm_set_pi64x(long long __i) {
121
  return (__m64)__i;
122
}
123
 
124
/* Convert the __m64 object to a 64-bit integer.  */
125
 
126
/* Intel intrinsic.  */
127
extern __inline long long
128
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
129
    _m_to_int64(__m64 __i) {
130
  return (long long)__i;
131
}
132
 
133
extern __inline long long
134
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
135
    _mm_cvtm64_si64(__m64 __i) {
136
  return (long long)__i;
137
}
138
 
139
/* Microsoft intrinsic.  */
140
extern __inline long long
141
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
142
    _mm_cvtsi64_si64x(__m64 __i) {
143
  return (long long)__i;
144
}
145
 
146
#ifdef _ARCH_PWR8
147
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
148
   the result, and the four 16-bit values from M2 into the upper four 8-bit
149
   values of the result, all with signed saturation.  */
150
extern __inline __m64
151
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
152
    _mm_packs_pi16(__m64 __m1, __m64 __m2) {
153
  __vector signed short __vm1;
154
  __vector signed char __vresult;
155
 
156
  __vm1 = (__vector signed short)(__vector unsigned long long)
157
#ifdef __LITTLE_ENDIAN__
158
      {__m1, __m2};
159
#else
160
      {__m2, __m1};
161
#endif
162
  __vresult = vec_packs(__vm1, __vm1);
163
  return (__m64)((__vector long long)__vresult)[0];
164
}
165
 
166
extern __inline __m64
167
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168
    _m_packsswb(__m64 __m1, __m64 __m2) {
169
  return _mm_packs_pi16(__m1, __m2);
170
}
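/* Usage sketch (illustrative): values outside the signed 8-bit range saturate,
   e.g.

     __m64 __r = _mm_packs_pi16(_mm_set1_pi16(300), _mm_set1_pi16(-300));
     // the four low result bytes clamp to 127, the four high bytes to -128
*/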
171
 
172
/* Pack the two 32-bit values from M1 in to the lower two 16-bit values of
173
   the result, and the two 32-bit values from M2 into the upper two 16-bit
174
   values of the result, all with signed saturation.  */
175
extern __inline __m64
176
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
177
    _mm_packs_pi32(__m64 __m1, __m64 __m2) {
178
  __vector signed int __vm1;
179
  __vector signed short __vresult;
180
 
181
  __vm1 = (__vector signed int)(__vector unsigned long long)
182
#ifdef __LITTLE_ENDIAN__
183
      {__m1, __m2};
184
#else
185
      {__m2, __m1};
186
#endif
187
  __vresult = vec_packs(__vm1, __vm1);
188
  return (__m64)((__vector long long)__vresult)[0];
189
}
190
 
191
extern __inline __m64
192
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
193
    _m_packssdw(__m64 __m1, __m64 __m2) {
194
  return _mm_packs_pi32(__m1, __m2);
195
}
196
 
197
/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
198
   the result, and the four 16-bit values from M2 into the upper four 8-bit
199
   values of the result, all with unsigned saturation.  */
200
extern __inline __m64
201
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
202
    _mm_packs_pu16(__m64 __m1, __m64 __m2) {
203
  __vector unsigned char __r;
204
  __vector signed short __vm1 = (__vector signed short)(__vector long long)
205
#ifdef __LITTLE_ENDIAN__
206
      {__m1, __m2};
207
#else
208
      {__m2, __m1};
209
#endif
210
  const __vector signed short __zero = {0};
211
  __vector __bool short __select = vec_cmplt(__vm1, __zero);
212
  __r =
213
      vec_packs((__vector unsigned short)__vm1, (__vector unsigned short)__vm1);
214
  __vector __bool char __packsel = vec_pack(__select, __select);
215
  __r = vec_sel(__r, (const __vector unsigned char)__zero, __packsel);
216
  return (__m64)((__vector long long)__r)[0];
217
}
218
 
219
extern __inline __m64
220
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221
    _m_packuswb(__m64 __m1, __m64 __m2) {
222
  return _mm_packs_pu16(__m1, __m2);
223
}
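/* Usage sketch (illustrative): with unsigned saturation, negative inputs clamp
   to 0 and values above 255 clamp to 255, e.g.

     __m64 __r = _mm_packs_pu16(_mm_set1_pi16(-5), _mm_set1_pi16(300));
     // the four low result bytes become 0, the four high bytes become 255
*/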
224
#endif /* end ARCH_PWR8 */
225
 
226
/* Interleave the four 8-bit values from the high half of M1 with the four
227
   8-bit values from the high half of M2.  */
228
extern __inline __m64
229
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
230
    _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
231
#if _ARCH_PWR8
232
  __vector unsigned char __a, __b, __c;
233
 
234
  __a = (__vector unsigned char)vec_splats(__m1);
235
  __b = (__vector unsigned char)vec_splats(__m2);
236
  __c = vec_mergel(__a, __b);
237
  return (__m64)((__vector long long)__c)[1];
238
#else
239
  __m64_union __mu1, __mu2, __res;
240
 
241
  __mu1.as_m64 = __m1;
242
  __mu2.as_m64 = __m2;
243
 
244
  __res.as_char[0] = __mu1.as_char[4];
245
  __res.as_char[1] = __mu2.as_char[4];
246
  __res.as_char[2] = __mu1.as_char[5];
247
  __res.as_char[3] = __mu2.as_char[5];
248
  __res.as_char[4] = __mu1.as_char[6];
249
  __res.as_char[5] = __mu2.as_char[6];
250
  __res.as_char[6] = __mu1.as_char[7];
251
  __res.as_char[7] = __mu2.as_char[7];
252
 
253
  return (__m64)__res.as_m64;
254
#endif
255
}
256
 
257
extern __inline __m64
258
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
259
    _m_punpckhbw(__m64 __m1, __m64 __m2) {
260
  return _mm_unpackhi_pi8(__m1, __m2);
261
}
262
 
263
/* Interleave the two 16-bit values from the high half of M1 with the two
264
   16-bit values from the high half of M2.  */
265
extern __inline __m64
266
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
267
    _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
268
  __m64_union __mu1, __mu2, __res;
269
 
270
  __mu1.as_m64 = __m1;
271
  __mu2.as_m64 = __m2;
272
 
273
  __res.as_short[0] = __mu1.as_short[2];
274
  __res.as_short[1] = __mu2.as_short[2];
275
  __res.as_short[2] = __mu1.as_short[3];
276
  __res.as_short[3] = __mu2.as_short[3];
277
 
278
  return (__m64)__res.as_m64;
279
}
280
 
281
extern __inline __m64
282
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
283
    _m_punpckhwd(__m64 __m1, __m64 __m2) {
284
  return _mm_unpackhi_pi16(__m1, __m2);
285
}
286
/* Interleave the 32-bit value from the high half of M1 with the 32-bit
287
   value from the high half of M2.  */
288
extern __inline __m64
289
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
290
    _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
291
  __m64_union __mu1, __mu2, __res;
292
 
293
  __mu1.as_m64 = __m1;
294
  __mu2.as_m64 = __m2;
295
 
296
  __res.as_int[0] = __mu1.as_int[1];
297
  __res.as_int[1] = __mu2.as_int[1];
298
 
299
  return (__m64)__res.as_m64;
300
}
301
 
302
extern __inline __m64
303
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
304
    _m_punpckhdq(__m64 __m1, __m64 __m2) {
305
  return _mm_unpackhi_pi32(__m1, __m2);
306
}
307
/* Interleave the four 8-bit values from the low half of M1 with the four
308
   8-bit values from the low half of M2.  */
309
extern __inline __m64
310
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311
    _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
312
#if _ARCH_PWR8
313
  __vector unsigned char __a, __b, __c;
314
 
315
  __a = (__vector unsigned char)vec_splats(__m1);
316
  __b = (__vector unsigned char)vec_splats(__m2);
317
  __c = vec_mergel(__a, __b);
318
  return (__m64)((__vector long long)__c)[0];
319
#else
320
  __m64_union __mu1, __mu2, __res;
321
 
322
  __mu1.as_m64 = __m1;
323
  __mu2.as_m64 = __m2;
324
 
325
  __res.as_char[0] = __mu1.as_char[0];
326
  __res.as_char[1] = __mu2.as_char[0];
327
  __res.as_char[2] = __mu1.as_char[1];
328
  __res.as_char[3] = __mu2.as_char[1];
329
  __res.as_char[4] = __mu1.as_char[2];
330
  __res.as_char[5] = __mu2.as_char[2];
331
  __res.as_char[6] = __mu1.as_char[3];
332
  __res.as_char[7] = __mu2.as_char[3];
333
 
334
  return (__m64)__res.as_m64;
335
#endif
336
}
337
 
338
extern __inline __m64
339
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
340
    _m_punpcklbw(__m64 __m1, __m64 __m2) {
341
  return _mm_unpacklo_pi8(__m1, __m2);
342
}
343
/* Interleave the two 16-bit values from the low half of M1 with the two
344
   16-bit values from the low half of M2.  */
345
extern __inline __m64
346
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
347
    _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
348
  __m64_union __mu1, __mu2, __res;
349
 
350
  __mu1.as_m64 = __m1;
351
  __mu2.as_m64 = __m2;
352
 
353
  __res.as_short[0] = __mu1.as_short[0];
354
  __res.as_short[1] = __mu2.as_short[0];
355
  __res.as_short[2] = __mu1.as_short[1];
356
  __res.as_short[3] = __mu2.as_short[1];
357
 
358
  return (__m64)__res.as_m64;
359
}
360
 
361
extern __inline __m64
362
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
363
    _m_punpcklwd(__m64 __m1, __m64 __m2) {
364
  return _mm_unpacklo_pi16(__m1, __m2);
365
}
366
 
367
/* Interleave the 32-bit value from the low half of M1 with the 32-bit
368
   value from the low half of M2.  */
369
extern __inline __m64
370
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
371
    _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
372
  __m64_union __mu1, __mu2, __res;
373
 
374
  __mu1.as_m64 = __m1;
375
  __mu2.as_m64 = __m2;
376
 
377
  __res.as_int[0] = __mu1.as_int[0];
378
  __res.as_int[1] = __mu2.as_int[0];
379
 
380
  return (__m64)__res.as_m64;
381
}
382
 
383
extern __inline __m64
384
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
385
    _m_punpckldq(__m64 __m1, __m64 __m2) {
386
  return _mm_unpacklo_pi32(__m1, __m2);
387
}
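/* Usage sketch (illustrative): the unpack operations interleave lanes from the
   low (or high) halves of their operands, e.g.

     __m64 __a = _mm_set_pi8(7, 6, 5, 4, 3, 2, 1, 0);
     __m64 __b = _mm_set_pi8(77, 66, 55, 44, 33, 22, 11, 10);
     __m64 __lo = _mm_unpacklo_pi8(__a, __b);
     // byte lanes of __lo, least significant first: 0, 10, 1, 11, 2, 22, 3, 33
*/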
388
 
389
/* Add the 8-bit values in M1 to the 8-bit values in M2.  */
390
extern __inline __m64
391
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392
    _mm_add_pi8(__m64 __m1, __m64 __m2) {
393
#if _ARCH_PWR8
394
  __vector signed char __a, __b, __c;
395
 
396
  __a = (__vector signed char)vec_splats(__m1);
397
  __b = (__vector signed char)vec_splats(__m2);
398
  __c = vec_add(__a, __b);
399
  return (__m64)((__vector long long)__c)[0];
400
#else
401
  __m64_union __mu1, __mu2, __res;
402
 
403
  __mu1.as_m64 = __m1;
404
  __mu2.as_m64 = __m2;
405
 
406
  __res.as_char[0] = __mu1.as_char[0] + __mu2.as_char[0];
407
  __res.as_char[1] = __mu1.as_char[1] + __mu2.as_char[1];
408
  __res.as_char[2] = __mu1.as_char[2] + __mu2.as_char[2];
409
  __res.as_char[3] = __mu1.as_char[3] + __mu2.as_char[3];
410
  __res.as_char[4] = __mu1.as_char[4] + __mu2.as_char[4];
411
  __res.as_char[5] = __mu1.as_char[5] + __mu2.as_char[5];
412
  __res.as_char[6] = __mu1.as_char[6] + __mu2.as_char[6];
413
  __res.as_char[7] = __mu1.as_char[7] + __mu2.as_char[7];
414
 
415
  return (__m64)__res.as_m64;
416
#endif
417
}
418
 
419
extern __inline __m64
420
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
421
    _m_paddb(__m64 __m1, __m64 __m2) {
422
  return _mm_add_pi8(__m1, __m2);
423
}
424
 
425
/* Add the 16-bit values in M1 to the 16-bit values in M2.  */
426
extern __inline __m64
427
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
428
    _mm_add_pi16(__m64 __m1, __m64 __m2) {
429
#if _ARCH_PWR8
430
  __vector signed short __a, __b, __c;
431
 
432
  __a = (__vector signed short)vec_splats(__m1);
433
  __b = (__vector signed short)vec_splats(__m2);
434
  __c = vec_add(__a, __b);
435
  return (__m64)((__vector long long)__c)[0];
436
#else
437
  __m64_union __mu1, __mu2, __res;
438
 
439
  __mu1.as_m64 = __m1;
440
  __mu2.as_m64 = __m2;
441
 
442
  __res.as_short[0] = __mu1.as_short[0] + __mu2.as_short[0];
443
  __res.as_short[1] = __mu1.as_short[1] + __mu2.as_short[1];
444
  __res.as_short[2] = __mu1.as_short[2] + __mu2.as_short[2];
445
  __res.as_short[3] = __mu1.as_short[3] + __mu2.as_short[3];
446
 
447
  return (__m64)__res.as_m64;
448
#endif
449
}
450
 
451
extern __inline __m64
452
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
453
    _m_paddw(__m64 __m1, __m64 __m2) {
454
  return _mm_add_pi16(__m1, __m2);
455
}
456
 
457
/* Add the 32-bit values in M1 to the 32-bit values in M2.  */
458
extern __inline __m64
459
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
460
    _mm_add_pi32(__m64 __m1, __m64 __m2) {
461
#if _ARCH_PWR9
462
  __vector signed int __a, __b, __c;
463
 
464
  __a = (__vector signed int)vec_splats(__m1);
465
  __b = (__vector signed int)vec_splats(__m2);
466
  __c = vec_add(__a, __b);
467
  return (__m64)((__vector long long)__c)[0];
468
#else
469
  __m64_union __mu1, __mu2, __res;
470
 
471
  __mu1.as_m64 = __m1;
472
  __mu2.as_m64 = __m2;
473
 
474
  __res.as_int[0] = __mu1.as_int[0] + __mu2.as_int[0];
475
  __res.as_int[1] = __mu1.as_int[1] + __mu2.as_int[1];
476
 
477
  return (__m64)__res.as_m64;
478
#endif
479
}
480
 
481
extern __inline __m64
482
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483
    _m_paddd(__m64 __m1, __m64 __m2) {
484
  return _mm_add_pi32(__m1, __m2);
485
}
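/* Usage sketch (illustrative): these additions wrap on overflow (the
   saturating forms are the _mm_adds_* intrinsics further below), e.g.

     __m64 __r = _mm_add_pi8(_mm_set1_pi8(100), _mm_set1_pi8(100));
     // each byte lane holds 200 modulo 256, i.e. -56 as a signed char
*/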
486
 
487
/* Subtract the 8-bit values in M2 from the 8-bit values in M1.  */
488
extern __inline __m64
489
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
490
    _mm_sub_pi8(__m64 __m1, __m64 __m2) {
491
#if _ARCH_PWR8
492
  __vector signed char __a, __b, __c;
493
 
494
  __a = (__vector signed char)vec_splats(__m1);
495
  __b = (__vector signed char)vec_splats(__m2);
496
  __c = vec_sub(__a, __b);
497
  return (__m64)((__vector long long)__c)[0];
498
#else
499
  __m64_union __mu1, __mu2, __res;
500
 
501
  __mu1.as_m64 = __m1;
502
  __mu2.as_m64 = __m2;
503
 
504
  __res.as_char[0] = __mu1.as_char[0] - __mu2.as_char[0];
505
  __res.as_char[1] = __mu1.as_char[1] - __mu2.as_char[1];
506
  __res.as_char[2] = __mu1.as_char[2] - __mu2.as_char[2];
507
  __res.as_char[3] = __mu1.as_char[3] - __mu2.as_char[3];
508
  __res.as_char[4] = __mu1.as_char[4] - __mu2.as_char[4];
509
  __res.as_char[5] = __mu1.as_char[5] - __mu2.as_char[5];
510
  __res.as_char[6] = __mu1.as_char[6] - __mu2.as_char[6];
511
  __res.as_char[7] = __mu1.as_char[7] - __mu2.as_char[7];
512
 
513
  return (__m64)__res.as_m64;
514
#endif
515
}
516
 
517
extern __inline __m64
518
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
519
    _m_psubb(__m64 __m1, __m64 __m2) {
520
  return _mm_sub_pi8(__m1, __m2);
521
}
522
 
523
/* Subtract the 16-bit values in M2 from the 16-bit values in M1.  */
524
extern __inline __m64
525
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
526
    _mm_sub_pi16(__m64 __m1, __m64 __m2) {
527
#if _ARCH_PWR8
528
  __vector signed short __a, __b, __c;
529
 
530
  __a = (__vector signed short)vec_splats(__m1);
531
  __b = (__vector signed short)vec_splats(__m2);
532
  __c = vec_sub(__a, __b);
533
  return (__m64)((__vector long long)__c)[0];
534
#else
535
  __m64_union __mu1, __mu2, __res;
536
 
537
  __mu1.as_m64 = __m1;
538
  __mu2.as_m64 = __m2;
539
 
540
  __res.as_short[0] = __mu1.as_short[0] - __mu2.as_short[0];
541
  __res.as_short[1] = __mu1.as_short[1] - __mu2.as_short[1];
542
  __res.as_short[2] = __mu1.as_short[2] - __mu2.as_short[2];
543
  __res.as_short[3] = __mu1.as_short[3] - __mu2.as_short[3];
544
 
545
  return (__m64)__res.as_m64;
546
#endif
547
}
548
 
549
extern __inline __m64
550
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
551
    _m_psubw(__m64 __m1, __m64 __m2) {
552
  return _mm_sub_pi16(__m1, __m2);
553
}
554
 
555
/* Subtract the 32-bit values in M2 from the 32-bit values in M1.  */
556
extern __inline __m64
557
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
558
    _mm_sub_pi32(__m64 __m1, __m64 __m2) {
559
#if _ARCH_PWR9
560
  __vector signed int __a, __b, __c;
561
 
562
  __a = (__vector signed int)vec_splats(__m1);
563
  __b = (__vector signed int)vec_splats(__m2);
564
  __c = vec_sub(__a, __b);
565
  return (__m64)((__vector long long)__c)[0];
566
#else
567
  __m64_union __mu1, __mu2, __res;
568
 
569
  __mu1.as_m64 = __m1;
570
  __mu2.as_m64 = __m2;
571
 
572
  __res.as_int[0] = __mu1.as_int[0] - __mu2.as_int[0];
573
  __res.as_int[1] = __mu1.as_int[1] - __mu2.as_int[1];
574
 
575
  return (__m64)__res.as_m64;
576
#endif
577
}
578
 
579
extern __inline __m64
580
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
581
    _m_psubd(__m64 __m1, __m64 __m2) {
582
  return _mm_sub_pi32(__m1, __m2);
583
}
584
 
585
extern __inline __m64
586
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
587
    _mm_add_si64(__m64 __m1, __m64 __m2) {
588
  return (__m1 + __m2);
589
}
590
 
591
extern __inline __m64
592
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
593
    _mm_sub_si64(__m64 __m1, __m64 __m2) {
594
  return (__m1 - __m2);
595
}
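/* Usage sketch (illustrative): since __m64 is a 64-bit scalar here, the _si64
   operations are ordinary integer arithmetic, e.g.

     __m64 __r = _mm_add_si64(_mm_cvtsi64_m64(40), _mm_cvtsi64_m64(2));
     long long __x = _mm_cvtm64_si64(__r);   // __x == 42
*/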
596
 
597
/* Shift the 64-bit value in M left by COUNT.  */
598
extern __inline __m64
599
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
600
    _mm_sll_si64(__m64 __m, __m64 __count) {
601
  return (__m << __count);
602
}
603
 
604
extern __inline __m64
605
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
606
    _m_psllq(__m64 __m, __m64 __count) {
607
  return _mm_sll_si64(__m, __count);
608
}
609
 
610
extern __inline __m64
611
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
612
    _mm_slli_si64(__m64 __m, const int __count) {
613
  return (__m << __count);
614
}
615
 
616
extern __inline __m64
617
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
618
    _m_psllqi(__m64 __m, const int __count) {
619
  return _mm_slli_si64(__m, __count);
620
}
621
 
622
/* Shift the 64-bit value in M right by COUNT; shift in zeros.  */
623
extern __inline __m64
624
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625
    _mm_srl_si64(__m64 __m, __m64 __count) {
626
  return (__m >> __count);
627
}
628
 
629
extern __inline __m64
630
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631
    _m_psrlq(__m64 __m, __m64 __count) {
632
  return _mm_srl_si64(__m, __count);
633
}
634
 
635
extern __inline __m64
636
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
637
    _mm_srli_si64(__m64 __m, const int __count) {
638
  return (__m >> __count);
639
}
640
 
641
extern __inline __m64
642
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643
    _m_psrlqi(__m64 __m, const int __count) {
644
  return _mm_srli_si64(__m, __count);
645
}
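/* Usage sketch (illustrative): the _si64 shifts act on the whole 64-bit
   scalar, e.g.

     __m64 __r = _mm_slli_si64(_mm_cvtsi64_m64(1), 40);   // holds 1ULL << 40
*/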
646
 
647
/* Bit-wise AND the 64-bit values in M1 and M2.  */
648
extern __inline __m64
649
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
650
    _mm_and_si64(__m64 __m1, __m64 __m2) {
651
  return (__m1 & __m2);
652
}
653
 
654
extern __inline __m64
655
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
656
    _m_pand(__m64 __m1, __m64 __m2) {
657
  return _mm_and_si64(__m1, __m2);
658
}
659
 
660
/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
661
   64-bit value in M2.  */
662
extern __inline __m64
663
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
664
    _mm_andnot_si64(__m64 __m1, __m64 __m2) {
665
  return (~__m1 & __m2);
666
}
667
 
668
extern __inline __m64
669
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
670
    _m_pandn(__m64 __m1, __m64 __m2) {
671
  return _mm_andnot_si64(__m1, __m2);
672
}
673
 
674
/* Bit-wise inclusive OR the 64-bit values in M1 and M2.  */
675
extern __inline __m64
676
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
677
    _mm_or_si64(__m64 __m1, __m64 __m2) {
678
  return (__m1 | __m2);
679
}
680
 
681
extern __inline __m64
682
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
683
    _m_por(__m64 __m1, __m64 __m2) {
684
  return _mm_or_si64(__m1, __m2);
685
}
686
 
687
/* Bit-wise exclusive OR the 64-bit values in M1 and M2.  */
688
extern __inline __m64
689
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
690
    _mm_xor_si64(__m64 __m1, __m64 __m2) {
691
  return (__m1 ^ __m2);
692
}
693
 
694
extern __inline __m64
695
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
696
    _m_pxor(__m64 __m1, __m64 __m2) {
697
  return _mm_xor_si64(__m1, __m2);
698
}
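/* Usage sketch (illustrative): note the operand order of _mm_andnot_si64; the
   first operand is the one that gets complemented, e.g.

     __m64 __mask = _mm_cvtsi64_m64(0x00FF00FF00FF00FFLL);
     __m64 __r = _mm_andnot_si64(__mask, _mm_cvtsi64_m64(-1LL));
     // __r == 0xFF00FF00FF00FF00
*/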
699
 
700
/* Creates a 64-bit zero.  */
701
extern __inline __m64
702
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
703
    _mm_setzero_si64(void) {
704
  return (__m64)0;
705
}
706
 
707
/* Compare eight 8-bit values.  The result of the comparison is 0xFF if the
708
   test is true and zero if false.  */
709
extern __inline __m64
710
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
711
    _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
712
#if defined(_ARCH_PWR6) && defined(__powerpc64__)
713
  __m64 __res;
714
  __asm__("cmpb %0,%1,%2;\n" : "=r"(__res) : "r"(__m1), "r"(__m2) :);
715
  return (__res);
716
#else
717
  __m64_union __mu1, __mu2, __res;
718
 
719
  __mu1.as_m64 = __m1;
720
  __mu2.as_m64 = __m2;
721
 
722
  __res.as_char[0] = (__mu1.as_char[0] == __mu2.as_char[0]) ? -1 : 0;
723
  __res.as_char[1] = (__mu1.as_char[1] == __mu2.as_char[1]) ? -1 : 0;
724
  __res.as_char[2] = (__mu1.as_char[2] == __mu2.as_char[2]) ? -1 : 0;
725
  __res.as_char[3] = (__mu1.as_char[3] == __mu2.as_char[3]) ? -1 : 0;
726
  __res.as_char[4] = (__mu1.as_char[4] == __mu2.as_char[4]) ? -1 : 0;
727
  __res.as_char[5] = (__mu1.as_char[5] == __mu2.as_char[5]) ? -1 : 0;
728
  __res.as_char[6] = (__mu1.as_char[6] == __mu2.as_char[6]) ? -1 : 0;
729
  __res.as_char[7] = (__mu1.as_char[7] == __mu2.as_char[7]) ? -1 : 0;
730
 
731
  return (__m64)__res.as_m64;
732
#endif
733
}
734
 
735
extern __inline __m64
736
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
737
    _m_pcmpeqb(__m64 __m1, __m64 __m2) {
738
  return _mm_cmpeq_pi8(__m1, __m2);
739
}
740
 
741
extern __inline __m64
742
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
743
    _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
744
#if _ARCH_PWR8
745
  __vector signed char __a, __b, __c;
746
 
747
  __a = (__vector signed char)vec_splats(__m1);
748
  __b = (__vector signed char)vec_splats(__m2);
749
  __c = (__vector signed char)vec_cmpgt(__a, __b);
750
  return (__m64)((__vector long long)__c)[0];
751
#else
752
  __m64_union __mu1, __mu2, __res;
753
 
754
  __mu1.as_m64 = __m1;
755
  __mu2.as_m64 = __m2;
756
 
757
  __res.as_char[0] = (__mu1.as_char[0] > __mu2.as_char[0]) ? -1 : 0;
758
  __res.as_char[1] = (__mu1.as_char[1] > __mu2.as_char[1]) ? -1 : 0;
759
  __res.as_char[2] = (__mu1.as_char[2] > __mu2.as_char[2]) ? -1 : 0;
760
  __res.as_char[3] = (__mu1.as_char[3] > __mu2.as_char[3]) ? -1 : 0;
761
  __res.as_char[4] = (__mu1.as_char[4] > __mu2.as_char[4]) ? -1 : 0;
762
  __res.as_char[5] = (__mu1.as_char[5] > __mu2.as_char[5]) ? -1 : 0;
763
  __res.as_char[6] = (__mu1.as_char[6] > __mu2.as_char[6]) ? -1 : 0;
764
  __res.as_char[7] = (__mu1.as_char[7] > __mu2.as_char[7]) ? -1 : 0;
765
 
766
  return (__m64)__res.as_m64;
767
#endif
768
}
769
 
770
extern __inline __m64
771
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772
    _m_pcmpgtb(__m64 __m1, __m64 __m2) {
773
  return _mm_cmpgt_pi8(__m1, __m2);
774
}
775
 
776
/* Compare four 16-bit values.  The result of the comparison is 0xFFFF if
777
   the test is true and zero if false.  */
778
extern __inline __m64
779
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
780
    _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
781
#if _ARCH_PWR8
782
  __vector signed short __a, __b, __c;
783
 
784
  __a = (__vector signed short)vec_splats(__m1);
785
  __b = (__vector signed short)vec_splats(__m2);
786
  __c = (__vector signed short)vec_cmpeq(__a, __b);
787
  return (__m64)((__vector long long)__c)[0];
788
#else
789
  __m64_union __mu1, __mu2, __res;
790
 
791
  __mu1.as_m64 = __m1;
792
  __mu2.as_m64 = __m2;
793
 
794
  __res.as_short[0] = (__mu1.as_short[0] == __mu2.as_short[0]) ? -1 : 0;
795
  __res.as_short[1] = (__mu1.as_short[1] == __mu2.as_short[1]) ? -1 : 0;
796
  __res.as_short[2] = (__mu1.as_short[2] == __mu2.as_short[2]) ? -1 : 0;
797
  __res.as_short[3] = (__mu1.as_short[3] == __mu2.as_short[3]) ? -1 : 0;
798
 
799
  return (__m64)__res.as_m64;
800
#endif
801
}
802
 
803
extern __inline __m64
804
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
805
    _m_pcmpeqw(__m64 __m1, __m64 __m2) {
806
  return _mm_cmpeq_pi16(__m1, __m2);
807
}
808
 
809
extern __inline __m64
810
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
811
    _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
812
#if _ARCH_PWR8
813
  __vector signed short __a, __b, __c;
814
 
815
  __a = (__vector signed short)vec_splats(__m1);
816
  __b = (__vector signed short)vec_splats(__m2);
817
  __c = (__vector signed short)vec_cmpgt(__a, __b);
818
  return (__m64)((__vector long long)__c)[0];
819
#else
820
  __m64_union __mu1, __mu2, __res;
821
 
822
  __mu1.as_m64 = __m1;
823
  __mu2.as_m64 = __m2;
824
 
825
  __res.as_short[0] = (__mu1.as_short[0] > __mu2.as_short[0]) ? -1 : 0;
826
  __res.as_short[1] = (__mu1.as_short[1] > __mu2.as_short[1]) ? -1 : 0;
827
  __res.as_short[2] = (__mu1.as_short[2] > __mu2.as_short[2]) ? -1 : 0;
828
  __res.as_short[3] = (__mu1.as_short[3] > __mu2.as_short[3]) ? -1 : 0;
829
 
830
  return (__m64)__res.as_m64;
831
#endif
832
}
833
 
834
extern __inline __m64
835
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
836
    _m_pcmpgtw(__m64 __m1, __m64 __m2) {
837
  return _mm_cmpgt_pi16(__m1, __m2);
838
}
839
 
840
/* Compare two 32-bit values.  The result of the comparison is 0xFFFFFFFF if
841
   the test is true and zero if false.  */
842
extern __inline __m64
843
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
844
    _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
845
#if _ARCH_PWR9
846
  __vector signed int __a, __b, __c;
847
 
848
  __a = (__vector signed int)vec_splats(__m1);
849
  __b = (__vector signed int)vec_splats(__m2);
850
  __c = (__vector signed int)vec_cmpeq(__a, __b);
851
  return (__m64)((__vector long long)__c)[0];
852
#else
853
  __m64_union __mu1, __mu2, __res;
854
 
855
  __mu1.as_m64 = __m1;
856
  __mu2.as_m64 = __m2;
857
 
858
  __res.as_int[0] = (__mu1.as_int[0] == __mu2.as_int[0]) ? -1 : 0;
859
  __res.as_int[1] = (__mu1.as_int[1] == __mu2.as_int[1]) ? -1 : 0;
860
 
861
  return (__m64)__res.as_m64;
862
#endif
863
}
864
 
865
extern __inline __m64
866
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
867
    _m_pcmpeqd(__m64 __m1, __m64 __m2) {
868
  return _mm_cmpeq_pi32(__m1, __m2);
869
}
870
 
871
extern __inline __m64
872
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
873
    _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
874
#if _ARCH_PWR9
875
  __vector signed int __a, __b, __c;
876
 
877
  __a = (__vector signed int)vec_splats(__m1);
878
  __b = (__vector signed int)vec_splats(__m2);
879
  __c = (__vector signed int)vec_cmpgt(__a, __b);
880
  return (__m64)((__vector long long)__c)[0];
881
#else
882
  __m64_union __mu1, __mu2, __res;
883
 
884
  __mu1.as_m64 = __m1;
885
  __mu2.as_m64 = __m2;
886
 
887
  __res.as_int[0] = (__mu1.as_int[0] > __mu2.as_int[0]) ? -1 : 0;
888
  __res.as_int[1] = (__mu1.as_int[1] > __mu2.as_int[1]) ? -1 : 0;
889
 
890
  return (__m64)__res.as_m64;
891
#endif
892
}
893
 
894
extern __inline __m64
895
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
896
    _m_pcmpgtd(__m64 __m1, __m64 __m2) {
897
  return _mm_cmpgt_pi32(__m1, __m2);
898
}
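/* Usage sketch (illustrative): the comparison results are per-lane masks that
   combine with the logical operations above; here __a and __b stand for
   arbitrary __m64 values holding 16-bit lanes:

     __m64 __gt = _mm_cmpgt_pi16(__a, __b);          // 0xFFFF where __a > __b
     __m64 __max = _mm_or_si64(_mm_and_si64(__gt, __a),
                               _mm_andnot_si64(__gt, __b));  // per-lane maximum
*/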
899
 
900
#if _ARCH_PWR8
901
/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
902
   saturated arithmetic.  */
903
extern __inline __m64
904
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
905
    _mm_adds_pi8(__m64 __m1, __m64 __m2) {
906
  __vector signed char __a, __b, __c;
907
 
908
  __a = (__vector signed char)vec_splats(__m1);
909
  __b = (__vector signed char)vec_splats(__m2);
910
  __c = vec_adds(__a, __b);
911
  return (__m64)((__vector long long)__c)[0];
912
}
913
 
914
extern __inline __m64
915
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
916
    _m_paddsb(__m64 __m1, __m64 __m2) {
917
  return _mm_adds_pi8(__m1, __m2);
918
}
919
/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
920
   saturated arithmetic.  */
921
extern __inline __m64
922
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
923
    _mm_adds_pi16(__m64 __m1, __m64 __m2) {
924
  __vector signed short __a, __b, __c;
925
 
926
  __a = (__vector signed short)vec_splats(__m1);
927
  __b = (__vector signed short)vec_splats(__m2);
928
  __c = vec_adds(__a, __b);
929
  return (__m64)((__vector long long)__c)[0];
930
}
931
 
932
extern __inline __m64
933
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
934
    _m_paddsw(__m64 __m1, __m64 __m2) {
935
  return _mm_adds_pi16(__m1, __m2);
936
}
937
/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
938
   saturated arithmetic.  */
939
extern __inline __m64
940
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
941
    _mm_adds_pu8(__m64 __m1, __m64 __m2) {
942
  __vector unsigned char __a, __b, __c;
943
 
944
  __a = (__vector unsigned char)vec_splats(__m1);
945
  __b = (__vector unsigned char)vec_splats(__m2);
946
  __c = vec_adds(__a, __b);
947
  return (__m64)((__vector long long)__c)[0];
948
}
949
 
950
extern __inline __m64
951
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
952
    _m_paddusb(__m64 __m1, __m64 __m2) {
953
  return _mm_adds_pu8(__m1, __m2);
954
}
955
 
956
/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
957
   saturated arithmetic.  */
958
extern __inline __m64
959
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
960
    _mm_adds_pu16(__m64 __m1, __m64 __m2) {
961
  __vector unsigned short __a, __b, __c;
962
 
963
  __a = (__vector unsigned short)vec_splats(__m1);
964
  __b = (__vector unsigned short)vec_splats(__m2);
965
  __c = vec_adds(__a, __b);
966
  return (__m64)((__vector long long)__c)[0];
967
}
968
 
969
extern __inline __m64
970
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
971
    _m_paddusw(__m64 __m1, __m64 __m2) {
972
  return _mm_adds_pu16(__m1, __m2);
973
}
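/* Usage sketch (illustrative): unlike _mm_add_pi8, the saturating forms clamp
   instead of wrapping, e.g.

     __m64 __r = _mm_adds_pi8(_mm_set1_pi8(120), _mm_set1_pi8(120));
     // each signed byte lane clamps to 127 instead of wrapping to -16
*/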
974
 
975
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
976
   saturating arithmetic.  */
977
extern __inline __m64
978
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
979
    _mm_subs_pi8(__m64 __m1, __m64 __m2) {
980
  __vector signed char __a, __b, __c;
981
 
982
  __a = (__vector signed char)vec_splats(__m1);
983
  __b = (__vector signed char)vec_splats(__m2);
984
  __c = vec_subs(__a, __b);
985
  return (__m64)((__vector long long)__c)[0];
986
}
987
 
988
extern __inline __m64
989
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990
    _m_psubsb(__m64 __m1, __m64 __m2) {
991
  return _mm_subs_pi8(__m1, __m2);
992
}
993
 
994
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
995
   signed saturating arithmetic.  */
996
extern __inline __m64
997
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
998
    _mm_subs_pi16(__m64 __m1, __m64 __m2) {
999
  __vector signed short __a, __b, __c;
1000
 
1001
  __a = (__vector signed short)vec_splats(__m1);
1002
  __b = (__vector signed short)vec_splats(__m2);
1003
  __c = vec_subs(__a, __b);
1004
  return (__m64)((__vector long long)__c)[0];
1005
}
1006
 
1007
extern __inline __m64
1008
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1009
    _m_psubsw(__m64 __m1, __m64 __m2) {
1010
  return _mm_subs_pi16(__m1, __m2);
1011
}
1012
 
1013
/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1014
   unsigned saturating arithmetic.  */
1015
extern __inline __m64
1016
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1017
    _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1018
  __vector unsigned char __a, __b, __c;
1019
 
1020
  __a = (__vector unsigned char)vec_splats(__m1);
1021
  __b = (__vector unsigned char)vec_splats(__m2);
1022
  __c = vec_subs(__a, __b);
1023
  return (__m64)((__vector long long)__c)[0];
1024
}
1025
 
1026
extern __inline __m64
1027
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1028
    _m_psubusb(__m64 __m1, __m64 __m2) {
1029
  return _mm_subs_pu8(__m1, __m2);
1030
}
1031
 
1032
/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1033
   unsigned saturating arithmetic.  */
1034
extern __inline __m64
1035
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1036
    _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1037
  __vector unsigned short __a, __b, __c;
1038
 
1039
  __a = (__vector unsigned short)vec_splats(__m1);
1040
  __b = (__vector unsigned short)vec_splats(__m2);
1041
  __c = vec_subs(__a, __b);
1042
  return (__m64)((__vector long long)__c)[0];
1043
}
1044
 
1045
extern __inline __m64
1046
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1047
    _m_psubusw(__m64 __m1, __m64 __m2) {
1048
  return _mm_subs_pu16(__m1, __m2);
1049
}
1050
 
1051
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1052
   four 32-bit intermediate results, which are then summed by pairs to
1053
   produce two 32-bit results.  */
1054
extern __inline __m64
1055
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056
    _mm_madd_pi16(__m64 __m1, __m64 __m2) {
1057
  __vector signed short __a, __b;
1058
  __vector signed int __c;
1059
  __vector signed int __zero = {0, 0, 0, 0};
1060
 
1061
  __a = (__vector signed short)vec_splats(__m1);
1062
  __b = (__vector signed short)vec_splats(__m2);
1063
  __c = vec_vmsumshm(__a, __b, __zero);
1064
  return (__m64)((__vector long long)__c)[0];
1065
}
1066
 
1067
extern __inline __m64
1068
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1069
    _m_pmaddwd(__m64 __m1, __m64 __m2) {
1070
  return _mm_madd_pi16(__m1, __m2);
1071
}
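/* Usage sketch (illustrative): _mm_madd_pi16 is a pairwise dot product, e.g.

     __m64 __r = _mm_madd_pi16(_mm_set_pi16(4, 3, 2, 1),
                               _mm_set_pi16(40, 30, 20, 10));
     // low 32-bit lane:  1*10 + 2*20 ==  50
     // high 32-bit lane: 3*30 + 4*40 == 250
*/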
1072
/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1073
   M2 and produce the high 16 bits of the 32-bit results.  */
1074
extern __inline __m64
1075
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1076
    _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
1077
  __vector signed short __a, __b;
1078
  __vector signed short __c;
1079
  __vector signed int __w0, __w1;
1080
  __vector unsigned char __xform1 = {
1081
#ifdef __LITTLE_ENDIAN__
1082
      0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1083
      0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1084
#else
1085
      0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1086
      0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1087
#endif
1088
  };
1089
 
1090
  __a = (__vector signed short)vec_splats(__m1);
1091
  __b = (__vector signed short)vec_splats(__m2);
1092
 
1093
  __w0 = vec_vmulesh(__a, __b);
1094
  __w1 = vec_vmulosh(__a, __b);
1095
  __c = (__vector signed short)vec_perm(__w0, __w1, __xform1);
1096
 
1097
  return (__m64)((__vector long long)__c)[0];
1098
}
1099
 
1100
extern __inline __m64
1101
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1102
    _m_pmulhw(__m64 __m1, __m64 __m2) {
1103
  return _mm_mulhi_pi16(__m1, __m2);
1104
}
1105
 
1106
/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1107
   the low 16 bits of the results.  */
1108
extern __inline __m64
1109
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1110
    _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1111
  __vector signed short __a, __b, __c;
1112
 
1113
  __a = (__vector signed short)vec_splats(__m1);
1114
  __b = (__vector signed short)vec_splats(__m2);
1115
  __c = __a * __b;
1116
  return (__m64)((__vector long long)__c)[0];
1117
}
1118
 
1119
extern __inline __m64
1120
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1121
    _m_pmullw(__m64 __m1, __m64 __m2) {
1122
  return _mm_mullo_pi16(__m1, __m2);
1123
}
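/* Usage sketch (illustrative): _mm_mulhi_pi16 and _mm_mullo_pi16 together give
   the full 32-bit products, e.g. for lanes holding 1000 and 2000:

     __m64 __hi = _mm_mulhi_pi16(_mm_set1_pi16(1000), _mm_set1_pi16(2000));
     __m64 __lo = _mm_mullo_pi16(_mm_set1_pi16(1000), _mm_set1_pi16(2000));
     // 1000 * 2000 == 2000000 == 0x001E8480, so each __hi lane is 0x001E and
     // each __lo lane is 0x8480 (-31616 as a signed short)
*/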
1124
 
1125
/* Shift four 16-bit values in M left by COUNT.  */
1126
extern __inline __m64
1127
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1128
    _mm_sll_pi16(__m64 __m, __m64 __count) {
1129
  __vector signed short __r;
1130
  __vector unsigned short __c;
1131
 
1132
  if (__count <= 15) {
1133
    __r = (__vector signed short)vec_splats(__m);
1134
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1135
    __r = vec_sl(__r, (__vector unsigned short)__c);
1136
    return (__m64)((__vector long long)__r)[0];
1137
  } else
1138
    return (0);
1139
}
1140
 
1141
extern __inline __m64
1142
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1143
    _m_psllw(__m64 __m, __m64 __count) {
1144
  return _mm_sll_pi16(__m, __count);
1145
}
1146
 
1147
extern __inline __m64
1148
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1149
    _mm_slli_pi16(__m64 __m, int __count) {
1150
  /* Promote int to long then invoke mm_sll_pi16.  */
1151
  return _mm_sll_pi16(__m, __count);
1152
}
1153
 
1154
extern __inline __m64
1155
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1156
    _m_psllwi(__m64 __m, int __count) {
1157
  return _mm_slli_pi16(__m, __count);
1158
}
1159
 
1160
/* Shift two 32-bit values in M left by COUNT.  */
1161
extern __inline __m64
1162
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1163
    _mm_sll_pi32(__m64 __m, __m64 __count) {
1164
  __m64_union __res;
1165
 
1166
  __res.as_m64 = __m;
1167
 
1168
  __res.as_int[0] = __res.as_int[0] << __count;
1169
  __res.as_int[1] = __res.as_int[1] << __count;
1170
  return (__res.as_m64);
1171
}
1172
 
1173
extern __inline __m64
1174
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1175
    _m_pslld(__m64 __m, __m64 __count) {
1176
  return _mm_sll_pi32(__m, __count);
1177
}
1178
 
1179
extern __inline __m64
1180
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1181
    _mm_slli_pi32(__m64 __m, int __count) {
1182
  /* Promote int to long then invoke mm_sll_pi32.  */
1183
  return _mm_sll_pi32(__m, __count);
1184
}
1185
 
1186
extern __inline __m64
1187
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1188
    _m_pslldi(__m64 __m, int __count) {
1189
  return _mm_slli_pi32(__m, __count);
1190
}
1191
 
1192
/* Shift four 16-bit values in M right by COUNT; shift in the sign bit.  */
1193
extern __inline __m64
1194
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1195
    _mm_sra_pi16(__m64 __m, __m64 __count) {
1196
  __vector signed short __r;
1197
  __vector unsigned short __c;
1198
 
1199
  if (__count <= 15) {
1200
    __r = (__vector signed short)vec_splats(__m);
1201
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1202
    __r = vec_sra(__r, (__vector unsigned short)__c);
1203
    return (__m64)((__vector long long)__r)[0];
1204
  } else
1205
    return (0);
1206
}
1207
 
1208
extern __inline __m64
1209
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1210
    _m_psraw(__m64 __m, __m64 __count) {
1211
  return _mm_sra_pi16(__m, __count);
1212
}
1213
 
1214
extern __inline __m64
1215
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1216
    _mm_srai_pi16(__m64 __m, int __count) {
1217
  /* Promote int to long then invoke mm_sra_pi16.  */
1218
  return _mm_sra_pi16(__m, __count);
1219
}
1220
 
1221
extern __inline __m64
1222
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1223
    _m_psrawi(__m64 __m, int __count) {
1224
  return _mm_srai_pi16(__m, __count);
1225
}
1226
 
1227
/* Shift two 32-bit values in M right by COUNT; shift in the sign bit.  */
1228
extern __inline __m64
1229
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1230
    _mm_sra_pi32(__m64 __m, __m64 __count) {
1231
  __m64_union __res;
1232
 
1233
  __res.as_m64 = __m;
1234
 
1235
  __res.as_int[0] = __res.as_int[0] >> __count;
1236
  __res.as_int[1] = __res.as_int[1] >> __count;
1237
  return (__res.as_m64);
1238
}
1239
 
1240
extern __inline __m64
1241
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1242
    _m_psrad(__m64 __m, __m64 __count) {
1243
  return _mm_sra_pi32(__m, __count);
1244
}
1245
 
1246
extern __inline __m64
1247
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1248
    _mm_srai_pi32(__m64 __m, int __count) {
1249
  /* Promote int to long then invoke mm_sra_pi32.  */
1250
  return _mm_sra_pi32(__m, __count);
1251
}
1252
 
1253
extern __inline __m64
1254
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1255
    _m_psradi(__m64 __m, int __count) {
1256
  return _mm_srai_pi32(__m, __count);
1257
}
1258
 
1259
/* Shift four 16-bit values in M right by COUNT; shift in zeros.  */
1260
extern __inline __m64
1261
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1262
    _mm_srl_pi16(__m64 __m, __m64 __count) {
1263
  __vector unsigned short __r;
1264
  __vector unsigned short __c;
1265
 
1266
  if (__count <= 15) {
1267
    __r = (__vector unsigned short)vec_splats(__m);
1268
    __c = (__vector unsigned short)vec_splats((unsigned short)__count);
1269
    __r = vec_sr(__r, (__vector unsigned short)__c);
1270
    return (__m64)((__vector long long)__r)[0];
1271
  } else
1272
    return (0);
1273
}
1274
 
1275
extern __inline __m64
1276
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1277
    _m_psrlw(__m64 __m, __m64 __count) {
1278
  return _mm_srl_pi16(__m, __count);
1279
}
1280
 
1281
extern __inline __m64
1282
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1283
    _mm_srli_pi16(__m64 __m, int __count) {
1284
  /* Promote int to long then invoke mm_srl_pi16.  */
1285
  return _mm_srl_pi16(__m, __count);
1286
}
1287
 
1288
extern __inline __m64
1289
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290
    _m_psrlwi(__m64 __m, int __count) {
1291
  return _mm_srli_pi16(__m, __count);
1292
}
1293
 
1294
/* Shift two 32-bit values in M right by COUNT; shift in zeros.  */
1295
extern __inline __m64
1296
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297
    _mm_srl_pi32(__m64 __m, __m64 __count) {
1298
  __m64_union __res;
1299
 
1300
  __res.as_m64 = __m;
1301
 
1302
  __res.as_int[0] = (unsigned int)__res.as_int[0] >> __count;
1303
  __res.as_int[1] = (unsigned int)__res.as_int[1] >> __count;
1304
  return (__res.as_m64);
1305
}
1306
 
1307
extern __inline __m64
1308
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309
    _m_psrld(__m64 __m, __m64 __count) {
1310
  return _mm_srl_pi32(__m, __count);
1311
}
1312
 
1313
extern __inline __m64
1314
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315
    _mm_srli_pi32(__m64 __m, int __count) {
1316
  /* Promote int to long then invoke mm_srl_pi32.  */
1317
  return _mm_srl_pi32(__m, __count);
1318
}
1319
 
1320
extern __inline __m64
1321
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1322
    _m_psrldi(__m64 __m, int __count) {
1323
  return _mm_srli_pi32(__m, __count);
1324
}
1325
#endif /* _ARCH_PWR8 */
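/* Usage sketch (illustrative): the shift-by-immediate forms simply forward to
   the shift-by-__m64 forms above, and per-lane counts above 15 yield 0 here,
   e.g.

     __m64 __r = _mm_slli_pi16(_mm_set1_pi16(3), 2);   // every lane becomes 12
     __m64 __z = _mm_slli_pi16(_mm_set1_pi16(3), 16);  // count > 15, all zeros
*/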
1326
 
1327
/* Creates a vector of two 32-bit values; I0 is least significant.  */
1328
extern __inline __m64
1329
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1330
    _mm_set_pi32(int __i1, int __i0) {
1331
  __m64_union __res;
1332
 
1333
  __res.as_int[0] = __i0;
1334
  __res.as_int[1] = __i1;
1335
  return (__res.as_m64);
1336
}
1337
 
1338
/* Creates a vector of four 16-bit values; W0 is least significant.  */
1339
extern __inline __m64
1340
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1341
    _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1342
  __m64_union __res;
1343
 
1344
  __res.as_short[0] = __w0;
1345
  __res.as_short[1] = __w1;
1346
  __res.as_short[2] = __w2;
1347
  __res.as_short[3] = __w3;
1348
  return (__res.as_m64);
1349
}
1350
 
1351
/* Creates a vector of eight 8-bit values; B0 is least significant.  */
1352
extern __inline __m64
1353
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1354
    _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1355
                char __b2, char __b1, char __b0) {
1356
  __m64_union __res;
1357
 
1358
  __res.as_char[0] = __b0;
1359
  __res.as_char[1] = __b1;
1360
  __res.as_char[2] = __b2;
1361
  __res.as_char[3] = __b3;
1362
  __res.as_char[4] = __b4;
1363
  __res.as_char[5] = __b5;
1364
  __res.as_char[6] = __b6;
1365
  __res.as_char[7] = __b7;
1366
  return (__res.as_m64);
1367
}
1368
 
1369
/* Similar, but with the arguments in reverse order.  */
1370
extern __inline __m64
1371
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1372
    _mm_setr_pi32(int __i0, int __i1) {
1373
  __m64_union __res;
1374
 
1375
  __res.as_int[0] = __i0;
1376
  __res.as_int[1] = __i1;
1377
  return (__res.as_m64);
1378
}
1379
 
1380
extern __inline __m64
1381
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1382
    _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
1383
  return _mm_set_pi16(__w3, __w2, __w1, __w0);
1384
}
1385
 
1386
extern __inline __m64
1387
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1388
    _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
1389
                 char __b5, char __b6, char __b7) {
1390
  return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1391
}
1392
 
1393
/* Creates a vector of two 32-bit values, both elements containing I.  */
1394
extern __inline __m64
1395
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1396
    _mm_set1_pi32(int __i) {
1397
  __m64_union __res;
1398
 
1399
  __res.as_int[0] = __i;
1400
  __res.as_int[1] = __i;
1401
  return (__res.as_m64);
1402
}
1403
 
1404
/* Creates a vector of four 16-bit values, all elements containing W.  */
1405
extern __inline __m64
1406
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1407
    _mm_set1_pi16(short __w) {
1408
#if _ARCH_PWR9
  __vector signed short __v;

  __v = (__vector signed short)vec_splats(__w);
  return (__m64)((__vector long long)__v)[0];
1413
#else
1414
  __m64_union __res;
1415
 
1416
  __res.as_short[0] = __w;
1417
  __res.as_short[1] = __w;
1418
  __res.as_short[2] = __w;
1419
  __res.as_short[3] = __w;
1420
  return (__res.as_m64);
1421
#endif
1422
}
1423
 
1424
/* Creates a vector of eight 8-bit values, all elements containing B.  */
1425
extern __inline __m64
1426
    __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427
    _mm_set1_pi8(signed char __b) {
1428
#if _ARCH_PWR8
1429
  __vector signed char __res;
1430
 
1431
  __res = (__vector signed char)vec_splats(__b);
1432
  return (__m64)((__vector long long)__res)[0];
1433
#else
1434
  __m64_union __res;
1435
 
1436
  __res.as_char[0] = __b;
1437
  __res.as_char[1] = __b;
1438
  __res.as_char[2] = __b;
1439
  __res.as_char[3] = __b;
1440
  __res.as_char[4] = __b;
1441
  __res.as_char[5] = __b;
1442
  __res.as_char[6] = __b;
1443
  __res.as_char[7] = __b;
1444
  return (__res.as_m64);
1445
#endif
1446
}
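/* Usage sketch (illustrative): _mm_set_* take their arguments most significant
   element first, while _mm_setr_* take the same arguments in reverse (element
   0 first) order, e.g.

     __m64 __a = _mm_set_pi32(2, 1);    // element 0 (least significant) == 1
     __m64 __b = _mm_setr_pi32(1, 2);   // same value as __a
*/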
1447
 
1448
#else
1449
#include_next <mmintrin.h>
1450
#endif /* defined(__powerpc64__) &&                                            \
1451
        *   (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
1452
 
1453
#endif /* _MMINTRIN_H_INCLUDED */